adding R scripts

loosolab · Jan 29, 2019 · 3fe5c28 · 3fe5c28
1 parent 4e90145
commit 3fe5c28
Show file tree

Hide file tree

Showing 23 changed files with 338 additions and 0 deletions.
diff --git a/R/add.expression.R b/R/add.expression.R
@@ -0,0 +1,18 @@
+#' Add Gene Expression
+#' @description This function adds the gene expression values to each "case_id" of a gene in the data frame. It uses the expressionmatrix from organ_data.
+#' @param cancer_data A large list created by multimodalR
+#' @param nested_metadata A list of data frames made by >make.nested.metadata<
+#' @param key Character or integer - Column name or index of unique identifier of nested_metadata, default = "case_id"
+#' @importFrom magittr %>%
+#' @examples
+#' lungMetaExpression <- add.expression(lungXY, lungMetaExpression)
+#' @return Returns the same list but with data franes containing the expression values in a new colummn.
+
+
+add.expression <- function(cancer_data, nested_metadata, key = "case_id") {                                #nested_metadata = List of metadata for each gene
+  for (gene in names(nested_metadata)) {
+    geneExpression <- cancer_data$Expressionmatrix[gene, c(nested_metadata[[gene]][[key]])] %>% as.numeric #[genename, case_id from the matrix]
+    nested_metadata[[gene]]$expression <- geneExpression                                                   #add the expression values to the data.table
+  }
+  return(nested_metadata)
+}
diff --git a/R/add.stage.simple.R b/R/add.stage.simple.R
@@ -0,0 +1,18 @@
+#' Adds column with simplified tumor stages without subtypes
+#' @description This function adds a new column "stage" to the meta data containing the simplified tumor stages (i.e. without subtypes). It uses the function stages.to.number.
+#' As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x"
+#' @param meta_data A data frame
+#' @param tumor_stage Character or Integer - Column name or index of tumor stage
+#' @param new_name Character - Name of new column
+#' @examples
+#' lungMeta <- add.stage.simple(lungmeta, "tumor_stage")
+#' @return Returns the data frame with an additional column.
+
+add.stage.simple <- function(meta_data, tumor_stage = "tumor_stage", new_name = "stage") { #add simple stage to metadata without a/b
+  stageVector <- as.character(meta_data[[tumor_stage]])
+  correctLevels <- mmRmeta::stage.to.numeral(stageVector)
+  stage <- sub(pattern = "[abc0]", "", correctLevels)
+  meta_data[[new_name]] <- stage
+  meta_data[[new_name]] <- factor(stage, exclude = "NA", ordered = TRUE)
+  return(meta_data)
+}
diff --git a/R/counts.per.group.R b/R/counts.per.group.R
@@ -0,0 +1,27 @@
+#' Group by and summarize counts
+#' @description This functions groups by 2 variables/columns with the counts of the combination. It accepts a list of data frames or a single data frame. When passing a single data frame this function will simply return a data frame with the counts. When using a list as input one might wants to know how often a patient(case_id)
+#' appears in a group. If youre using a single data frame you need to change the grouping variables. Using ".id" as first_grouping and lists the counts of groups for the genes.
+#' @param list_df Either a list of data frames or a single data frame
+#' @param first_grouping Character - first grouping variable | Default "case_id"
+#' @param second_grouping Character - second grouping variable | Default  "group""
+#' @importFrom magittr %>%
+#' @importFrom dplyr group_by_ summarize
+#' @examples
+#' Using a Data Frame
+#' df <- data.frame()
+#' df_counts <- counts.per.group(df, "firstColumn", "secondColumn")
+#' @return Returns a data frame
+
+
+
+counts.per.group <- function(list_df, first_grouping = "case_id", second_grouping = "group") {
+  if(class(list_df) == "list") {
+    flatList <- plyr::ldply(list_df, data.table::data.table)                                       #flatten list to data.table
+    countFrame <- flatList %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n())   #data.table is grouped by the 2 variables and a third column containing the frequency is made
+    outputFrame <- tidyr::spread(countFrame, second_grouping, n , fill = 0 )                       #spread the data frame by the second_grouping
+  } else {
+    countFrame <- list_df %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n())
+    outputFrame <- tidyr::spread(countFrame, second_grouping, n, fill = 0)
+  }
+  return(outputFrame)
+}
diff --git a/R/drop.unused.levels.R b/R/drop.unused.levels.R
@@ -0,0 +1,14 @@
+#' Drop unused factor levels
+#' @description This function drops all unused levels from all factors in a data frame.
+#' @param df Data frame
+#' @examples
+#' @return Returns the altered data frame.
+
+drop.unused.levels<- function(df) {
+  for(column in names(df)) {
+    if(is.factor(df[[column]])) {
+      df[[column]] <- factor(df[[column]])
+    }
+  }
+  return(df)
+}
diff --git a/R/filter.columns.as.na.R b/R/filter.columns.as.na.R
@@ -0,0 +1,16 @@
+#' Delete Colums with only NA
+#' @description Sometimes actual NA values are indicated as other strings (e.g. "not reported"). This function converts these strings to NA and ultimately deletes any column consisting only of NA values.
+#' @param df Data frame
+#' @param value A value or character that should be converted to NA.
+#' @importFrom magrittr %>%
+#' @examples
+#' df <-  data.frame(num1 = c(1,2,3,4), num2= c(5,6,7, "not reported"))
+#' df <- filter.column.as.na(df, "no value")
+#' @return Returns the filtered data frame
+
+
+filter.columns.as.na <- function(df, value = "not reported") {
+  df <- dplyr::na_if(df, word)                   #changes all occurences of "word" to NA, Default is "not reported"
+  df <- df %>% dplyr::select_if(~!all(is.na(.))) #deletes columns with only NA values
+  return(df)
+}
diff --git a/R/make.nested.metadata.R b/R/make.nested.metadata.R
@@ -0,0 +1,35 @@
+#' Make nested data frames
+#' @description Create a data frame of metadata for every gene. The case_id are taken from cancer_data and are matched against cancer_metadata. An additional
+#' column "group" is appended to the data table. The column "group" represents the modality of the gene expression.
+#' @param cancer_metadata Meta data provided by TCGA
+#' @param cancer_data A large list created prior by multimodalR
+#' @param key Character or integer - Column name or index of unique identifier, default = "case_id"
+#' @examples
+#' lungMetaExpression <- nested.metadata(lungMetaData, lungXY)
+#' @return Returns a list of data frames for every gene
+
+make.nested.metadata <- function(cancer_metadata, cancer_data, key = "case_id") {
+  cancer_metadata <- data.table::data.table(cancer_metadata)
+  outputList <- list()
+  for (gene in 1:length(cancer_data$Output$Genes)) {
+    groupList <- list()
+    for (group in 1:length(cancer_data$Output$Genes[[gene]]$groups)) { #for every group
+      short <- cancer_data$Output$Genes[[gene]]$groups #shorten the variable
+      groupID <- unlist(short[group]) #unlist the case_ids
+      groupID <- c(gsub("X","",groupID)) #remove the X
+      rowNumbers <- numeric()
+
+      for(i in groupID) {                #match rows to every case_id and store it
+        match <- grep(i, cancer_metadata[[key]])
+        rowNumbers <- c(rowNumbers, match)
+      }
+      geneDataFrame <- cancer_metadata[c(rowNumbers),] #subset via the stored matches
+      geneDataFrame$group <- factor(x = group) #add new column group
+      groupList[[group]] <- geneDataFrame #add data.table to list
+    }
+    geneDataFrameMerged <- plyr::ldply(groupList, data.frame) #combine the 2 to 3 dataframes together
+    outputList[[gene]] <- geneDataFrameMerged
+  }
+  names(outputList) <- names(cancer_data$Output$Genes)
+  return(outputList)
+}
diff --git a/R/remove.x.R b/R/remove.x.R
@@ -0,0 +1,15 @@
+#' Remove 'X' infront of caseID
+#' @description This function removes the 'X' in the column names (case_id) of cancer_data$Expressionmatrix that occures in R, when the start of the string is a number.
+#' It's necessary to remove them so downstream functions work propperly.
+#' @param cancer_data A large list created prior by multimodalR.
+#' @examples
+#' lungXY <- remove.x(lungXY)
+#' @return Returns the same large list with changed column names of the expressionmatrix.
+
+remove.x <- function (cancer_data) {
+  columnNames <- names(cancer_data$Expressionmatrix)
+  columnNames <- c(gsub("X","", columnNames))
+  columnNames <- c(gsub("\\.","-", columnNames))    #change "." to "-" just like in meta data
+  names(cancer_data$Expressionmatrix) <- columnNames #change columnames
+  return(cancer_data)
+}
diff --git a/R/rename.columns.R b/R/rename.columns.R
@@ -0,0 +1,15 @@
+#' Shorten column names
+#' @description This function shortenes the column names using regex. When flattening a list from .json the resulting colums are named like "diagnoses.treatment.updated_datetime". Each period indicating a level of the list.
+#' For better readability everything in front of the last period is deleted.
+#' @param df A Data frame
+#' @examples
+#' df <-  data.frame(a.long.column.name_1= c(1,2,3,4), another.long2.column.name_2= c(5,6,7, "not reported"))
+#' df <- rename.columns(df)
+#' @return Returns the same df with shortened column names.
+
+rename.columns <- function(df) {
+  oldNames <- colnames(df)
+  newNames <- stringr::str_extract(pattern = "(?!\\.)\\w+$", oldNames) #negative lookahead of periods, match any word character at end of line/string
+  colnames(df) <- newNames
+  return(df)
+}
diff --git a/R/reorder.all.R b/R/reorder.all.R
@@ -0,0 +1,21 @@
+#' Uses reorder.column on all eligible columns
+#' @description This function removes factor levels with less counts than a set value of every column. Additionally, it reorders the factor levels depending on the counts.
+#'
+#' @param cancer_data A large list created prior by multimodalR
+#' @param exception Character - One or more column names that should not be processed by this function
+#' @param remove Integer - remove >= counts of the factor level
+#' @examples
+#' lungMeta <- reorder.column.all(lungMeta, c("tumor_stage", "race"), 10)
+#' @return Returns the data frame with filtered/ordered columns.
+
+
+reorder.column.all <- function(cancer_metadata, exception = NULL, remove = 0) {
+  columnNames <- names(cancer_metadata)
+  if(!is.null(exception)) {
+    columnNames <- columnNames[!c(columnNames %in% exception)] #remove column names given in exception
+  }
+  for(names in columnNames) {
+    cancer_metadata <- mmRmeta::reorder.column(cancer_metadata, column = names, remove = remove) #use reorder.columns on given column names
+  }
+  return(cancer_metadata)
+}
diff --git a/R/reorder.column.R b/R/reorder.column.R
@@ -0,0 +1,27 @@
+#' Remove small factor levels  and reorder
+#' @description This function removes factor levels with counts less than a set value. Additionally, it reorders the factor levels depending on the counts.
+#' This may be necessary later on to get ordered labels when plotting.
+#' @param cancer_data A large list created prior by multimodalR.
+#' @param column Character - column name
+#' @param remove Integer - threshold of counts; any factor level with equal or less counts are discarded
+#' @examples
+#' lungMeta <- reorder.colums(lungMeta, "primary_diagnosis", 10) #keeps factor levels with >= 10 counts.
+#' @return Returns the same large list with changed column names of the expressionmatrix.
+
+
+reorder.column <- function(cancer_metadata, column, remove = 0) {
+  if(is.factor(cancer_metadata[[column]])) {
+  arrangedCounts <- cancer_metadata %>% dplyr::group_by_(column) %>% dplyr::summarize(n = n())
+  arrangedCounts <- dplyr::arrange(arrangedCounts, desc(n))                                              #order by descending counts
+  cancer_metadata[[column]] <- factor(cancer_metadata[[column]], levels = arrangedCounts[[column]])        #reorder factor in data frame
+
+  factorCount <- plyr::count(cancer_metadata[[column]]) %>% dplyr::filter(freq >= remove)                 #removes counts less than the value of removes
+  factorCount  <- factorCount[,1]
+  columnSymbol <- rlang::sym(column)
+  cancer_metadata <- dplyr::filter(cancer_metadata, !!(columnSymbol) %in% factorCount)                     # !! is the newer version of rlang::UQ() which unquotes the variable
+  cancer_metadata[[column]] <- factor(cancer_metadata[[column]])
+} else {
+  warning(column, " is no factor, no changes applied")
+}
+  return(cancer_metadata)
+}
diff --git a/R/stage.to.numeral.R b/R/stage.to.numeral.R
@@ -0,0 +1,33 @@
+#' Change roman to arabic numerals
+#' @description This function changes roman numerals to arabic numerals and keeps the subtypes. As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x"
+#' @param stage_values A vector of tumor stages with roman numerals and subtypes
+#' @examples
+#' stages <- stage.to.numeral(metadata$tumor_stage)
+#' @return Returns a vetor of tumor stages with arabic numerals and subtypes.
+
+stage.to.numeral <- function(stage_values) { #A vector of tumor stages
+  stageVector <- as.character(stage_values)
+  lvl1 <- gsub("stage ", "", stageVector)
+  lvl2 <- gsub("[0abc]", "", lvl1)
+  lvlNum <- as.numeric(as.roman(lvl2)) #change to roman
+
+  storage <- character()
+  for(i in 1:length(lvl1)) {
+
+    if(any(grep("a", lvl1[i]))) {
+      storage[i] <- "a"
+    }
+    else if (any(grep("b", lvl1[i]))) {
+      storage[i] <- "b"
+    }
+    else if (any(grep("c", lvl1[i]))) {
+      storage[i] <- "c"
+    }
+    else if (any(grep("0", lvl1[i]))) {
+      storage[i] <- "0"
+    }
+    else {storage[i] <- ""}
+  }
+  correctLevels <- paste0(lvlNum, storage)
+  return(correctLevels)
+}
diff --git a/R/subset.metadata.R b/R/subset.metadata.R
@@ -0,0 +1,30 @@
+#' Filter Metadata for Primary Cancer Site (Organ)
+#' @description This function subsets the metadata for an organ. It works for meta data directly downloaded from TCGA or already filtered meta data. The subset is done by matching the "key" with the column names (patient ID or case_id) of the expression matrix.
+#' @param meta_data Meta data provided by TCGA.
+#' @param cancer_data A large list created prior by multimodalR for a cancer site.
+#' @param key character - Unique identifier, default = "case_id"
+#' @param additional_key needs to be added
+#' @param additional_value needs to be added
+#' @importFrom magittr %>%
+#' @examples
+#' lungMetaData <- subset.metadata(metadata, lungXY)
+#' @return Returns a data table of the metadata for a primary cancer site (organ).
+
+subset.metadata <- function(meta_data, cancer_data, key = "case_id", additional_key = NULL, additional_value = NULL) {
+  if(!is.character(meta_data[[key]]) {
+    meta_data[[key]] <- as.character(meta_data[[key]]) #transform factor to character
+  }
+  correctRows <- numeric()                             #empty vector to be filled later
+  caseNames <- names(cancer_data$Expressionmatrix)      #vector of case_id from expressionmatrix
+  for (cases in caseNames) {                           #match case_id of the vector with case_id of metadata
+    matchedRow <- grep(cases, meta_data[[key]]
+    correctRows <- c(correctRows, matchedRow)
+  }
+  meta_data <- meta_data[c(correctRows),]              #subset by machted case_id
+  if(!is.null(additional_key)) {
+    additional_key <- rlang::sym(additional_key)
+
+    meta_data <- meta_data %>% filter_(., !!(additional_key) == additional_value)
+  }
+  return(meta_data)
+}
diff --git a/example/filteredDataLung.Rdata b/example/filteredDataLung.Rdata
diff --git a/example/imageFron2019_01_28.RData b/example/imageFron2019_01_28.RData
diff --git a/example/lungFiltered.RDS b/example/lungFiltered.RDS
diff --git a/example/preprocessing.R b/example/preprocessing.R
@@ -0,0 +1,24 @@
+#Preprocessing Data
+#### 1.1 MetaData
+metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
+metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame
+
+DataExplorer::plot_intro(metadata)
+DataExplorer::plot_missing(metadata)
+metadata <- filter.columns.as.na(metadata)
+#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation. 
+metadata <- rename.columns(metadata)
+#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names.
+metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+#the last thing you have to do is to change your column with the patient/case id from a factor to characters.
+metadataSelect$case_id <- as.character(metadataSelect$case_id)
+####1.2 Organ data | Primary cancer Data | Expression data etc.
+
+
+
+
+
+### Now you are set to work with your objects created by multimodalR
+#2.1 Load your file (filteredOrgan.Rdata or .rds file)
+#2.2 Match meta data case_id with filteredOrgan case_id
+lungMeta <- filter.metadata.uni(fSelect, lungXY, key = "case_id")
diff --git a/example/report_files/figure-html/correlation_analysis-1.png b/example/report_files/figure-html/correlation_analysis-1.png
diff --git a/example/report_files/figure-html/missing_profile-1.png b/example/report_files/figure-html/missing_profile-1.png
diff --git a/example/report_files/figure-html/plot_frequency_bar-1.png b/example/report_files/figure-html/plot_frequency_bar-1.png
diff --git a/example/report_files/figure-html/plot_histogram-1.png b/example/report_files/figure-html/plot_histogram-1.png
diff --git a/example/report_files/figure-html/plot_intro-1.png b/example/report_files/figure-html/plot_intro-1.png
diff --git a/example/report_files/figure-html/plot_normal_qq-1.png b/example/report_files/figure-html/plot_normal_qq-1.png
diff --git a/preprocessing.R b/preprocessing.R
@@ -0,0 +1,45 @@
+#Preprocessing Data
+#### 1.1 MetaData
+metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
+metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame
+
+DataExplorer::plot_intro(metadata)
+DataExplorer::plot_missing(metadata)
+metadata <- filter.columns.as.na(metadata)
+#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation. 
+metadata <- rename.columns(metadata)
+#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names.
+metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+#the last thing you have to do is to change your column with the patient/case id from a factor to characters.
+metadataSelect$case_id <- as.character(metadataSelect$case_id)
+####1.2 Organ data | Primary cancer Data | Expression data etc.
+#1.2.1 Load your file (filteredOrgan.Rdata or .rds file)
+lung <- readRDS("lungFiltered.RDS")
+#1.2.2 filter
+lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
+lungX <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
+lungXY <- multimodalR::filterForXChromosomeGenes(output = lungX$Output,expressionmatrix = lungX$Expressionmatrix)
+lungXY <- remove.x(lungXY)
+
+### Now you are set to work with your objects created by multimodalR - Process Data
+#2.2 Match meta data case_id with filteredOrgan case_id
+lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id")
+lungMeta <- drop.unused.levels(lungMeta)
+lungMeta <- add.stage.simple(meta_data = lungMeta, tumor_stage = "tumor_stage", new_name = "stage")
+#optional: reorder columns / filter out small counts of factor levels
+#lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20)
+#2.3 Make data tables and add expression values to datatables 
+lungMetaExpression <- create.data.tables.new(lungMeta, lungXY)
+lungMetaExpression <- add.expression.new(lungXY, lungMetaExpression, key = "case_id")
+#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15))
+#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15))
+#get overview of data
+DataExplorer::plot_bar(lungMetaExpression$SFTPB)
+DataExplorer::plot_histogram(lungMeta)
+#decide if you want to drop certain factor levels because they are so small
+
+x <- DataExplorer::split_columns(lungMeta)
+#how often gene is splitted into modality groups
+y <- counts.per.group(lungMetaExpression)
+ymeta <- merge(y, lungMeta, by = "case_id") #merge counts of groups
+z <- counts.per.group(lungMetaExpression, ".id", "group")