Skip to content

Commit

Permalink
adding R scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianlieske committed Jan 29, 2019
1 parent 4e90145 commit 3fe5c28
Show file tree
Hide file tree
Showing 23 changed files with 338 additions and 0 deletions.
18 changes: 18 additions & 0 deletions R/add.expression.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' Add Gene Expression
#' @description This function adds the gene expression values to each "case_id" of a gene in the data frame. It uses the expressionmatrix from organ_data.
#' @param cancer_data A large list created by multimodalR
#' @param nested_metadata A list of data frames made by >make.nested.metadata<
#' @param key Character or integer - Column name or index of unique identifier of nested_metadata, default = "case_id"
#' @importFrom magittr %>%
#' @examples
#' lungMetaExpression <- add.expression(lungXY, lungMetaExpression)
#' @return Returns the same list but with data franes containing the expression values in a new colummn.


add.expression <- function(cancer_data, nested_metadata, key = "case_id") { #nested_metadata = List of metadata for each gene
for (gene in names(nested_metadata)) {
geneExpression <- cancer_data$Expressionmatrix[gene, c(nested_metadata[[gene]][[key]])] %>% as.numeric #[genename, case_id from the matrix]
nested_metadata[[gene]]$expression <- geneExpression #add the expression values to the data.table
}
return(nested_metadata)
}
18 changes: 18 additions & 0 deletions R/add.stage.simple.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' Adds column with simplified tumor stages without subtypes
#' @description This function adds a new column "stage" to the meta data containing the simplified tumor stages (i.e. without subtypes). It uses the function stages.to.number.
#' As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x"
#' @param meta_data A data frame
#' @param tumor_stage Character or Integer - Column name or index of tumor stage
#' @param new_name Character - Name of new column
#' @examples
#' lungMeta <- add.stage.simple(lungmeta, "tumor_stage")
#' @return Returns the data frame with an additional column.

add.stage.simple <- function(meta_data, tumor_stage = "tumor_stage", new_name = "stage") { #add simple stage to metadata without a/b
stageVector <- as.character(meta_data[[tumor_stage]])
correctLevels <- mmRmeta::stage.to.numeral(stageVector)
stage <- sub(pattern = "[abc0]", "", correctLevels)
meta_data[[new_name]] <- stage
meta_data[[new_name]] <- factor(stage, exclude = "NA", ordered = TRUE)
return(meta_data)
}
27 changes: 27 additions & 0 deletions R/counts.per.group.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#' Group by and summarize counts
#' @description This functions groups by 2 variables/columns with the counts of the combination. It accepts a list of data frames or a single data frame. When passing a single data frame this function will simply return a data frame with the counts. When using a list as input one might wants to know how often a patient(case_id)
#' appears in a group. If youre using a single data frame you need to change the grouping variables. Using ".id" as first_grouping and lists the counts of groups for the genes.
#' @param list_df Either a list of data frames or a single data frame
#' @param first_grouping Character - first grouping variable | Default "case_id"
#' @param second_grouping Character - second grouping variable | Default "group""
#' @importFrom magittr %>%
#' @importFrom dplyr group_by_ summarize
#' @examples
#' Using a Data Frame
#' df <- data.frame()
#' df_counts <- counts.per.group(df, "firstColumn", "secondColumn")
#' @return Returns a data frame



counts.per.group <- function(list_df, first_grouping = "case_id", second_grouping = "group") {
if(class(list_df) == "list") {
flatList <- plyr::ldply(list_df, data.table::data.table) #flatten list to data.table
countFrame <- flatList %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n()) #data.table is grouped by the 2 variables and a third column containing the frequency is made
outputFrame <- tidyr::spread(countFrame, second_grouping, n , fill = 0 ) #spread the data frame by the second_grouping
} else {
countFrame <- list_df %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n())
outputFrame <- tidyr::spread(countFrame, second_grouping, n, fill = 0)
}
return(outputFrame)
}
14 changes: 14 additions & 0 deletions R/drop.unused.levels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#' Drop unused factor levels
#' @description This function drops all unused levels from all factors in a data frame.
#' @param df Data frame
#' @examples
#' @return Returns the altered data frame.

drop.unused.levels<- function(df) {
for(column in names(df)) {
if(is.factor(df[[column]])) {
df[[column]] <- factor(df[[column]])
}
}
return(df)
}
16 changes: 16 additions & 0 deletions R/filter.columns.as.na.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#' Delete Colums with only NA
#' @description Sometimes actual NA values are indicated as other strings (e.g. "not reported"). This function converts these strings to NA and ultimately deletes any column consisting only of NA values.
#' @param df Data frame
#' @param value A value or character that should be converted to NA.
#' @importFrom magrittr %>%
#' @examples
#' df <- data.frame(num1 = c(1,2,3,4), num2= c(5,6,7, "not reported"))
#' df <- filter.column.as.na(df, "no value")
#' @return Returns the filtered data frame


filter.columns.as.na <- function(df, value = "not reported") {
df <- dplyr::na_if(df, word) #changes all occurences of "word" to NA, Default is "not reported"
df <- df %>% dplyr::select_if(~!all(is.na(.))) #deletes columns with only NA values
return(df)
}
35 changes: 35 additions & 0 deletions R/make.nested.metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#' Make nested data frames
#' @description Create a data frame of metadata for every gene. The case_id are taken from cancer_data and are matched against cancer_metadata. An additional
#' column "group" is appended to the data table. The column "group" represents the modality of the gene expression.
#' @param cancer_metadata Meta data provided by TCGA
#' @param cancer_data A large list created prior by multimodalR
#' @param key Character or integer - Column name or index of unique identifier, default = "case_id"
#' @examples
#' lungMetaExpression <- nested.metadata(lungMetaData, lungXY)
#' @return Returns a list of data frames for every gene

make.nested.metadata <- function(cancer_metadata, cancer_data, key = "case_id") {
cancer_metadata <- data.table::data.table(cancer_metadata)
outputList <- list()
for (gene in 1:length(cancer_data$Output$Genes)) {
groupList <- list()
for (group in 1:length(cancer_data$Output$Genes[[gene]]$groups)) { #for every group
short <- cancer_data$Output$Genes[[gene]]$groups #shorten the variable
groupID <- unlist(short[group]) #unlist the case_ids
groupID <- c(gsub("X","",groupID)) #remove the X
rowNumbers <- numeric()

for(i in groupID) { #match rows to every case_id and store it
match <- grep(i, cancer_metadata[[key]])
rowNumbers <- c(rowNumbers, match)
}
geneDataFrame <- cancer_metadata[c(rowNumbers),] #subset via the stored matches
geneDataFrame$group <- factor(x = group) #add new column group
groupList[[group]] <- geneDataFrame #add data.table to list
}
geneDataFrameMerged <- plyr::ldply(groupList, data.frame) #combine the 2 to 3 dataframes together
outputList[[gene]] <- geneDataFrameMerged
}
names(outputList) <- names(cancer_data$Output$Genes)
return(outputList)
}
15 changes: 15 additions & 0 deletions R/remove.x.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#' Remove 'X' infront of caseID
#' @description This function removes the 'X' in the column names (case_id) of cancer_data$Expressionmatrix that occures in R, when the start of the string is a number.
#' It's necessary to remove them so downstream functions work propperly.
#' @param cancer_data A large list created prior by multimodalR.
#' @examples
#' lungXY <- remove.x(lungXY)
#' @return Returns the same large list with changed column names of the expressionmatrix.

remove.x <- function (cancer_data) {
columnNames <- names(cancer_data$Expressionmatrix)
columnNames <- c(gsub("X","", columnNames))
columnNames <- c(gsub("\\.","-", columnNames)) #change "." to "-" just like in meta data
names(cancer_data$Expressionmatrix) <- columnNames #change columnames
return(cancer_data)
}
15 changes: 15 additions & 0 deletions R/rename.columns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#' Shorten column names
#' @description This function shortenes the column names using regex. When flattening a list from .json the resulting colums are named like "diagnoses.treatment.updated_datetime". Each period indicating a level of the list.
#' For better readability everything in front of the last period is deleted.
#' @param df A Data frame
#' @examples
#' df <- data.frame(a.long.column.name_1= c(1,2,3,4), another.long2.column.name_2= c(5,6,7, "not reported"))
#' df <- rename.columns(df)
#' @return Returns the same df with shortened column names.

rename.columns <- function(df) {
oldNames <- colnames(df)
newNames <- stringr::str_extract(pattern = "(?!\\.)\\w+$", oldNames) #negative lookahead of periods, match any word character at end of line/string
colnames(df) <- newNames
return(df)
}
21 changes: 21 additions & 0 deletions R/reorder.all.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#' Uses reorder.column on all eligible columns
#' @description This function removes factor levels with less counts than a set value of every column. Additionally, it reorders the factor levels depending on the counts.
#'
#' @param cancer_data A large list created prior by multimodalR
#' @param exception Character - One or more column names that should not be processed by this function
#' @param remove Integer - remove >= counts of the factor level
#' @examples
#' lungMeta <- reorder.column.all(lungMeta, c("tumor_stage", "race"), 10)
#' @return Returns the data frame with filtered/ordered columns.


reorder.column.all <- function(cancer_metadata, exception = NULL, remove = 0) {
columnNames <- names(cancer_metadata)
if(!is.null(exception)) {
columnNames <- columnNames[!c(columnNames %in% exception)] #remove column names given in exception
}
for(names in columnNames) {
cancer_metadata <- mmRmeta::reorder.column(cancer_metadata, column = names, remove = remove) #use reorder.columns on given column names
}
return(cancer_metadata)
}
27 changes: 27 additions & 0 deletions R/reorder.column.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#' Remove small factor levels and reorder
#' @description This function removes factor levels with counts less than a set value. Additionally, it reorders the factor levels depending on the counts.
#' This may be necessary later on to get ordered labels when plotting.
#' @param cancer_data A large list created prior by multimodalR.
#' @param column Character - column name
#' @param remove Integer - threshold of counts; any factor level with equal or less counts are discarded
#' @examples
#' lungMeta <- reorder.colums(lungMeta, "primary_diagnosis", 10) #keeps factor levels with >= 10 counts.
#' @return Returns the same large list with changed column names of the expressionmatrix.


reorder.column <- function(cancer_metadata, column, remove = 0) {
if(is.factor(cancer_metadata[[column]])) {
arrangedCounts <- cancer_metadata %>% dplyr::group_by_(column) %>% dplyr::summarize(n = n())
arrangedCounts <- dplyr::arrange(arrangedCounts, desc(n)) #order by descending counts
cancer_metadata[[column]] <- factor(cancer_metadata[[column]], levels = arrangedCounts[[column]]) #reorder factor in data frame

factorCount <- plyr::count(cancer_metadata[[column]]) %>% dplyr::filter(freq >= remove) #removes counts less than the value of removes
factorCount <- factorCount[,1]
columnSymbol <- rlang::sym(column)
cancer_metadata <- dplyr::filter(cancer_metadata, !!(columnSymbol) %in% factorCount) # !! is the newer version of rlang::UQ() which unquotes the variable
cancer_metadata[[column]] <- factor(cancer_metadata[[column]])
} else {
warning(column, " is no factor, no changes applied")
}
return(cancer_metadata)
}
33 changes: 33 additions & 0 deletions R/stage.to.numeral.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#' Change roman to arabic numerals
#' @description This function changes roman numerals to arabic numerals and keeps the subtypes. As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x"
#' @param stage_values A vector of tumor stages with roman numerals and subtypes
#' @examples
#' stages <- stage.to.numeral(metadata$tumor_stage)
#' @return Returns a vetor of tumor stages with arabic numerals and subtypes.

stage.to.numeral <- function(stage_values) { #A vector of tumor stages
stageVector <- as.character(stage_values)
lvl1 <- gsub("stage ", "", stageVector)
lvl2 <- gsub("[0abc]", "", lvl1)
lvlNum <- as.numeric(as.roman(lvl2)) #change to roman

storage <- character()
for(i in 1:length(lvl1)) {

if(any(grep("a", lvl1[i]))) {
storage[i] <- "a"
}
else if (any(grep("b", lvl1[i]))) {
storage[i] <- "b"
}
else if (any(grep("c", lvl1[i]))) {
storage[i] <- "c"
}
else if (any(grep("0", lvl1[i]))) {
storage[i] <- "0"
}
else {storage[i] <- ""}
}
correctLevels <- paste0(lvlNum, storage)
return(correctLevels)
}
30 changes: 30 additions & 0 deletions R/subset.metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#' Filter Metadata for Primary Cancer Site (Organ)
#' @description This function subsets the metadata for an organ. It works for meta data directly downloaded from TCGA or already filtered meta data. The subset is done by matching the "key" with the column names (patient ID or case_id) of the expression matrix.
#' @param meta_data Meta data provided by TCGA.
#' @param cancer_data A large list created prior by multimodalR for a cancer site.
#' @param key character - Unique identifier, default = "case_id"
#' @param additional_key needs to be added
#' @param additional_value needs to be added
#' @importFrom magittr %>%
#' @examples
#' lungMetaData <- subset.metadata(metadata, lungXY)
#' @return Returns a data table of the metadata for a primary cancer site (organ).

subset.metadata <- function(meta_data, cancer_data, key = "case_id", additional_key = NULL, additional_value = NULL) {
if(!is.character(meta_data[[key]]) {
meta_data[[key]] <- as.character(meta_data[[key]]) #transform factor to character
}
correctRows <- numeric() #empty vector to be filled later
caseNames <- names(cancer_data$Expressionmatrix) #vector of case_id from expressionmatrix
for (cases in caseNames) { #match case_id of the vector with case_id of metadata
matchedRow <- grep(cases, meta_data[[key]]
correctRows <- c(correctRows, matchedRow)
}
meta_data <- meta_data[c(correctRows),] #subset by machted case_id
if(!is.null(additional_key)) {
additional_key <- rlang::sym(additional_key)

meta_data <- meta_data %>% filter_(., !!(additional_key) == additional_value)
}
return(meta_data)
}
Binary file added example/filteredDataLung.Rdata
Binary file not shown.
Binary file added example/imageFron2019_01_28.RData
Binary file not shown.
Binary file added example/lungFiltered.RDS
Binary file not shown.
24 changes: 24 additions & 0 deletions example/preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#Preprocessing Data
#### 1.1 MetaData
metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame

DataExplorer::plot_intro(metadata)
DataExplorer::plot_missing(metadata)
metadata <- filter.columns.as.na(metadata)
#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation.
metadata <- rename.columns(metadata)
#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names.
metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
#the last thing you have to do is to change your column with the patient/case id from a factor to characters.
metadataSelect$case_id <- as.character(metadataSelect$case_id)
####1.2 Organ data | Primary cancer Data | Expression data etc.





### Now you are set to work with your objects created by multimodalR
#2.1 Load your file (filteredOrgan.Rdata or .rds file)
#2.2 Match meta data case_id with filteredOrgan case_id
lungMeta <- filter.metadata.uni(fSelect, lungXY, key = "case_id")
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example/report_files/figure-html/plot_intro-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 45 additions & 0 deletions preprocessing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#Preprocessing Data
#### 1.1 MetaData
metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame

DataExplorer::plot_intro(metadata)
DataExplorer::plot_missing(metadata)
metadata <- filter.columns.as.na(metadata)
#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation.
metadata <- rename.columns(metadata)
#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names.
metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
#the last thing you have to do is to change your column with the patient/case id from a factor to characters.
metadataSelect$case_id <- as.character(metadataSelect$case_id)
####1.2 Organ data | Primary cancer Data | Expression data etc.
#1.2.1 Load your file (filteredOrgan.Rdata or .rds file)
lung <- readRDS("lungFiltered.RDS")
#1.2.2 filter
lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
lungX <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
lungXY <- multimodalR::filterForXChromosomeGenes(output = lungX$Output,expressionmatrix = lungX$Expressionmatrix)
lungXY <- remove.x(lungXY)

### Now you are set to work with your objects created by multimodalR - Process Data
#2.2 Match meta data case_id with filteredOrgan case_id
lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id")
lungMeta <- drop.unused.levels(lungMeta)
lungMeta <- add.stage.simple(meta_data = lungMeta, tumor_stage = "tumor_stage", new_name = "stage")
#optional: reorder columns / filter out small counts of factor levels
#lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20)
#2.3 Make data tables and add expression values to datatables
lungMetaExpression <- create.data.tables.new(lungMeta, lungXY)
lungMetaExpression <- add.expression.new(lungXY, lungMetaExpression, key = "case_id")
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15))
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15))
#get overview of data
DataExplorer::plot_bar(lungMetaExpression$SFTPB)
DataExplorer::plot_histogram(lungMeta)
#decide if you want to drop certain factor levels because they are so small

x <- DataExplorer::split_columns(lungMeta)
#how often gene is splitted into modality groups
y <- counts.per.group(lungMetaExpression)
ymeta <- merge(y, lungMeta, by = "case_id") #merge counts of groups
z <- counts.per.group(lungMetaExpression, ".id", "group")

0 comments on commit 3fe5c28

Please sign in to comment.