diff --git a/R/add.expression.R b/R/add.expression.R new file mode 100644 index 0000000..7a7234a --- /dev/null +++ b/R/add.expression.R @@ -0,0 +1,18 @@ +#' Add Gene Expression +#' @description This function adds the gene expression values to each "case_id" of a gene in the data frame. It uses the expressionmatrix from organ_data. +#' @param cancer_data A large list created by multimodalR +#' @param nested_metadata A list of data frames made by >make.nested.metadata< +#' @param key Character or integer - Column name or index of unique identifier of nested_metadata, default = "case_id" +#' @importFrom magittr %>% +#' @examples +#' lungMetaExpression <- add.expression(lungXY, lungMetaExpression) +#' @return Returns the same list but with data franes containing the expression values in a new colummn. + + +add.expression <- function(cancer_data, nested_metadata, key = "case_id") { #nested_metadata = List of metadata for each gene + for (gene in names(nested_metadata)) { + geneExpression <- cancer_data$Expressionmatrix[gene, c(nested_metadata[[gene]][[key]])] %>% as.numeric #[genename, case_id from the matrix] + nested_metadata[[gene]]$expression <- geneExpression #add the expression values to the data.table + } + return(nested_metadata) +} diff --git a/R/add.stage.simple.R b/R/add.stage.simple.R new file mode 100644 index 0000000..e5bd2c3 --- /dev/null +++ b/R/add.stage.simple.R @@ -0,0 +1,18 @@ +#' Adds column with simplified tumor stages without subtypes +#' @description This function adds a new column "stage" to the meta data containing the simplified tumor stages (i.e. without subtypes). It uses the function stages.to.number. +#' As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x" +#' @param meta_data A data frame +#' @param tumor_stage Character or Integer - Column name or index of tumor stage +#' @param new_name Character - Name of new column +#' @examples +#' lungMeta <- add.stage.simple(lungmeta, "tumor_stage") +#' @return Returns the data frame with an additional column. + +add.stage.simple <- function(meta_data, tumor_stage = "tumor_stage", new_name = "stage") { #add simple stage to metadata without a/b + stageVector <- as.character(meta_data[[tumor_stage]]) + correctLevels <- mmRmeta::stage.to.numeral(stageVector) + stage <- sub(pattern = "[abc0]", "", correctLevels) + meta_data[[new_name]] <- stage + meta_data[[new_name]] <- factor(stage, exclude = "NA", ordered = TRUE) + return(meta_data) +} diff --git a/R/counts.per.group.R b/R/counts.per.group.R new file mode 100644 index 0000000..67e60f2 --- /dev/null +++ b/R/counts.per.group.R @@ -0,0 +1,27 @@ +#' Group by and summarize counts +#' @description This functions groups by 2 variables/columns with the counts of the combination. It accepts a list of data frames or a single data frame. When passing a single data frame this function will simply return a data frame with the counts. When using a list as input one might wants to know how often a patient(case_id) +#' appears in a group. If youre using a single data frame you need to change the grouping variables. Using ".id" as first_grouping and lists the counts of groups for the genes. +#' @param list_df Either a list of data frames or a single data frame +#' @param first_grouping Character - first grouping variable | Default "case_id" +#' @param second_grouping Character - second grouping variable | Default "group"" +#' @importFrom magittr %>% +#' @importFrom dplyr group_by_ summarize +#' @examples +#' Using a Data Frame +#' df <- data.frame() +#' df_counts <- counts.per.group(df, "firstColumn", "secondColumn") +#' @return Returns a data frame + + + +counts.per.group <- function(list_df, first_grouping = "case_id", second_grouping = "group") { + if(class(list_df) == "list") { + flatList <- plyr::ldply(list_df, data.table::data.table) #flatten list to data.table + countFrame <- flatList %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n()) #data.table is grouped by the 2 variables and a third column containing the frequency is made + outputFrame <- tidyr::spread(countFrame, second_grouping, n , fill = 0 ) #spread the data frame by the second_grouping + } else { + countFrame <- list_df %>% group_by_(first_grouping, second_grouping) %>% summarize(n = n()) + outputFrame <- tidyr::spread(countFrame, second_grouping, n, fill = 0) + } + return(outputFrame) +} diff --git a/R/drop.unused.levels.R b/R/drop.unused.levels.R new file mode 100644 index 0000000..ca57af0 --- /dev/null +++ b/R/drop.unused.levels.R @@ -0,0 +1,14 @@ +#' Drop unused factor levels +#' @description This function drops all unused levels from all factors in a data frame. +#' @param df Data frame +#' @examples +#' @return Returns the altered data frame. + +drop.unused.levels<- function(df) { + for(column in names(df)) { + if(is.factor(df[[column]])) { + df[[column]] <- factor(df[[column]]) + } + } + return(df) +} diff --git a/R/filter.columns.as.na.R b/R/filter.columns.as.na.R new file mode 100644 index 0000000..0316b8a --- /dev/null +++ b/R/filter.columns.as.na.R @@ -0,0 +1,16 @@ +#' Delete Colums with only NA +#' @description Sometimes actual NA values are indicated as other strings (e.g. "not reported"). This function converts these strings to NA and ultimately deletes any column consisting only of NA values. +#' @param df Data frame +#' @param value A value or character that should be converted to NA. +#' @importFrom magrittr %>% +#' @examples +#' df <- data.frame(num1 = c(1,2,3,4), num2= c(5,6,7, "not reported")) +#' df <- filter.column.as.na(df, "no value") +#' @return Returns the filtered data frame + + +filter.columns.as.na <- function(df, value = "not reported") { + df <- dplyr::na_if(df, word) #changes all occurences of "word" to NA, Default is "not reported" + df <- df %>% dplyr::select_if(~!all(is.na(.))) #deletes columns with only NA values + return(df) +} diff --git a/R/make.nested.metadata.R b/R/make.nested.metadata.R new file mode 100644 index 0000000..ff84544 --- /dev/null +++ b/R/make.nested.metadata.R @@ -0,0 +1,35 @@ +#' Make nested data frames +#' @description Create a data frame of metadata for every gene. The case_id are taken from cancer_data and are matched against cancer_metadata. An additional +#' column "group" is appended to the data table. The column "group" represents the modality of the gene expression. +#' @param cancer_metadata Meta data provided by TCGA +#' @param cancer_data A large list created prior by multimodalR +#' @param key Character or integer - Column name or index of unique identifier, default = "case_id" +#' @examples +#' lungMetaExpression <- nested.metadata(lungMetaData, lungXY) +#' @return Returns a list of data frames for every gene + +make.nested.metadata <- function(cancer_metadata, cancer_data, key = "case_id") { + cancer_metadata <- data.table::data.table(cancer_metadata) + outputList <- list() + for (gene in 1:length(cancer_data$Output$Genes)) { + groupList <- list() + for (group in 1:length(cancer_data$Output$Genes[[gene]]$groups)) { #for every group + short <- cancer_data$Output$Genes[[gene]]$groups #shorten the variable + groupID <- unlist(short[group]) #unlist the case_ids + groupID <- c(gsub("X","",groupID)) #remove the X + rowNumbers <- numeric() + + for(i in groupID) { #match rows to every case_id and store it + match <- grep(i, cancer_metadata[[key]]) + rowNumbers <- c(rowNumbers, match) + } + geneDataFrame <- cancer_metadata[c(rowNumbers),] #subset via the stored matches + geneDataFrame$group <- factor(x = group) #add new column group + groupList[[group]] <- geneDataFrame #add data.table to list + } + geneDataFrameMerged <- plyr::ldply(groupList, data.frame) #combine the 2 to 3 dataframes together + outputList[[gene]] <- geneDataFrameMerged + } + names(outputList) <- names(cancer_data$Output$Genes) + return(outputList) +} diff --git a/R/remove.x.R b/R/remove.x.R new file mode 100644 index 0000000..12d387d --- /dev/null +++ b/R/remove.x.R @@ -0,0 +1,15 @@ +#' Remove 'X' infront of caseID +#' @description This function removes the 'X' in the column names (case_id) of cancer_data$Expressionmatrix that occures in R, when the start of the string is a number. +#' It's necessary to remove them so downstream functions work propperly. +#' @param cancer_data A large list created prior by multimodalR. +#' @examples +#' lungXY <- remove.x(lungXY) +#' @return Returns the same large list with changed column names of the expressionmatrix. + +remove.x <- function (cancer_data) { + columnNames <- names(cancer_data$Expressionmatrix) + columnNames <- c(gsub("X","", columnNames)) + columnNames <- c(gsub("\\.","-", columnNames)) #change "." to "-" just like in meta data + names(cancer_data$Expressionmatrix) <- columnNames #change columnames + return(cancer_data) +} diff --git a/R/rename.columns.R b/R/rename.columns.R new file mode 100644 index 0000000..b666779 --- /dev/null +++ b/R/rename.columns.R @@ -0,0 +1,15 @@ +#' Shorten column names +#' @description This function shortenes the column names using regex. When flattening a list from .json the resulting colums are named like "diagnoses.treatment.updated_datetime". Each period indicating a level of the list. +#' For better readability everything in front of the last period is deleted. +#' @param df A Data frame +#' @examples +#' df <- data.frame(a.long.column.name_1= c(1,2,3,4), another.long2.column.name_2= c(5,6,7, "not reported")) +#' df <- rename.columns(df) +#' @return Returns the same df with shortened column names. + +rename.columns <- function(df) { + oldNames <- colnames(df) + newNames <- stringr::str_extract(pattern = "(?!\\.)\\w+$", oldNames) #negative lookahead of periods, match any word character at end of line/string + colnames(df) <- newNames + return(df) +} diff --git a/R/reorder.all.R b/R/reorder.all.R new file mode 100644 index 0000000..6a59475 --- /dev/null +++ b/R/reorder.all.R @@ -0,0 +1,21 @@ +#' Uses reorder.column on all eligible columns +#' @description This function removes factor levels with less counts than a set value of every column. Additionally, it reorders the factor levels depending on the counts. +#' +#' @param cancer_data A large list created prior by multimodalR +#' @param exception Character - One or more column names that should not be processed by this function +#' @param remove Integer - remove >= counts of the factor level +#' @examples +#' lungMeta <- reorder.column.all(lungMeta, c("tumor_stage", "race"), 10) +#' @return Returns the data frame with filtered/ordered columns. + + +reorder.column.all <- function(cancer_metadata, exception = NULL, remove = 0) { + columnNames <- names(cancer_metadata) + if(!is.null(exception)) { + columnNames <- columnNames[!c(columnNames %in% exception)] #remove column names given in exception + } + for(names in columnNames) { + cancer_metadata <- mmRmeta::reorder.column(cancer_metadata, column = names, remove = remove) #use reorder.columns on given column names + } + return(cancer_metadata) +} diff --git a/R/reorder.column.R b/R/reorder.column.R new file mode 100644 index 0000000..880b56f --- /dev/null +++ b/R/reorder.column.R @@ -0,0 +1,27 @@ +#' Remove small factor levels and reorder +#' @description This function removes factor levels with counts less than a set value. Additionally, it reorders the factor levels depending on the counts. +#' This may be necessary later on to get ordered labels when plotting. +#' @param cancer_data A large list created prior by multimodalR. +#' @param column Character - column name +#' @param remove Integer - threshold of counts; any factor level with equal or less counts are discarded +#' @examples +#' lungMeta <- reorder.colums(lungMeta, "primary_diagnosis", 10) #keeps factor levels with >= 10 counts. +#' @return Returns the same large list with changed column names of the expressionmatrix. + + +reorder.column <- function(cancer_metadata, column, remove = 0) { + if(is.factor(cancer_metadata[[column]])) { + arrangedCounts <- cancer_metadata %>% dplyr::group_by_(column) %>% dplyr::summarize(n = n()) + arrangedCounts <- dplyr::arrange(arrangedCounts, desc(n)) #order by descending counts + cancer_metadata[[column]] <- factor(cancer_metadata[[column]], levels = arrangedCounts[[column]]) #reorder factor in data frame + + factorCount <- plyr::count(cancer_metadata[[column]]) %>% dplyr::filter(freq >= remove) #removes counts less than the value of removes + factorCount <- factorCount[,1] + columnSymbol <- rlang::sym(column) + cancer_metadata <- dplyr::filter(cancer_metadata, !!(columnSymbol) %in% factorCount) # !! is the newer version of rlang::UQ() which unquotes the variable + cancer_metadata[[column]] <- factor(cancer_metadata[[column]]) +} else { + warning(column, " is no factor, no changes applied") +} + return(cancer_metadata) +} diff --git a/R/stage.to.numeral.R b/R/stage.to.numeral.R new file mode 100644 index 0000000..4e245de --- /dev/null +++ b/R/stage.to.numeral.R @@ -0,0 +1,33 @@ +#' Change roman to arabic numerals +#' @description This function changes roman numerals to arabic numerals and keeps the subtypes. As for now 4 subtypes are accepted: a,b,c and 0 - it doesnt compute "stage i/ii NOS", "stage 0" and "stage x" +#' @param stage_values A vector of tumor stages with roman numerals and subtypes +#' @examples +#' stages <- stage.to.numeral(metadata$tumor_stage) +#' @return Returns a vetor of tumor stages with arabic numerals and subtypes. + +stage.to.numeral <- function(stage_values) { #A vector of tumor stages + stageVector <- as.character(stage_values) + lvl1 <- gsub("stage ", "", stageVector) + lvl2 <- gsub("[0abc]", "", lvl1) + lvlNum <- as.numeric(as.roman(lvl2)) #change to roman + + storage <- character() + for(i in 1:length(lvl1)) { + + if(any(grep("a", lvl1[i]))) { + storage[i] <- "a" + } + else if (any(grep("b", lvl1[i]))) { + storage[i] <- "b" + } + else if (any(grep("c", lvl1[i]))) { + storage[i] <- "c" + } + else if (any(grep("0", lvl1[i]))) { + storage[i] <- "0" + } + else {storage[i] <- ""} + } + correctLevels <- paste0(lvlNum, storage) + return(correctLevels) +} diff --git a/R/subset.metadata.R b/R/subset.metadata.R new file mode 100644 index 0000000..5669431 --- /dev/null +++ b/R/subset.metadata.R @@ -0,0 +1,30 @@ +#' Filter Metadata for Primary Cancer Site (Organ) +#' @description This function subsets the metadata for an organ. It works for meta data directly downloaded from TCGA or already filtered meta data. The subset is done by matching the "key" with the column names (patient ID or case_id) of the expression matrix. +#' @param meta_data Meta data provided by TCGA. +#' @param cancer_data A large list created prior by multimodalR for a cancer site. +#' @param key character - Unique identifier, default = "case_id" +#' @param additional_key needs to be added +#' @param additional_value needs to be added +#' @importFrom magittr %>% +#' @examples +#' lungMetaData <- subset.metadata(metadata, lungXY) +#' @return Returns a data table of the metadata for a primary cancer site (organ). + +subset.metadata <- function(meta_data, cancer_data, key = "case_id", additional_key = NULL, additional_value = NULL) { + if(!is.character(meta_data[[key]]) { + meta_data[[key]] <- as.character(meta_data[[key]]) #transform factor to character + } + correctRows <- numeric() #empty vector to be filled later + caseNames <- names(cancer_data$Expressionmatrix) #vector of case_id from expressionmatrix + for (cases in caseNames) { #match case_id of the vector with case_id of metadata + matchedRow <- grep(cases, meta_data[[key]] + correctRows <- c(correctRows, matchedRow) + } + meta_data <- meta_data[c(correctRows),] #subset by machted case_id + if(!is.null(additional_key)) { + additional_key <- rlang::sym(additional_key) + + meta_data <- meta_data %>% filter_(., !!(additional_key) == additional_value) + } + return(meta_data) +} diff --git a/example/filteredDataLung.Rdata b/example/filteredDataLung.Rdata new file mode 100644 index 0000000..8c42ee6 Binary files /dev/null and b/example/filteredDataLung.Rdata differ diff --git a/example/imageFron2019_01_28.RData b/example/imageFron2019_01_28.RData new file mode 100644 index 0000000..6d26d15 Binary files /dev/null and b/example/imageFron2019_01_28.RData differ diff --git a/example/lungFiltered.RDS b/example/lungFiltered.RDS new file mode 100644 index 0000000..2bf024e Binary files /dev/null and b/example/lungFiltered.RDS differ diff --git a/example/preprocessing.R b/example/preprocessing.R new file mode 100644 index 0000000..e7aff5b --- /dev/null +++ b/example/preprocessing.R @@ -0,0 +1,24 @@ +#Preprocessing Data +#### 1.1 MetaData +metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE) +metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame + +DataExplorer::plot_intro(metadata) +DataExplorer::plot_missing(metadata) +metadata <- filter.columns.as.na(metadata) +#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation. +metadata <- rename.columns(metadata) +#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names. +metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity)) +#the last thing you have to do is to change your column with the patient/case id from a factor to characters. +metadataSelect$case_id <- as.character(metadataSelect$case_id) +####1.2 Organ data | Primary cancer Data | Expression data etc. + + + + + +### Now you are set to work with your objects created by multimodalR +#2.1 Load your file (filteredOrgan.Rdata or .rds file) +#2.2 Match meta data case_id with filteredOrgan case_id +lungMeta <- filter.metadata.uni(fSelect, lungXY, key = "case_id") diff --git a/example/report_files/figure-html/correlation_analysis-1.png b/example/report_files/figure-html/correlation_analysis-1.png new file mode 100644 index 0000000..6bb3373 Binary files /dev/null and b/example/report_files/figure-html/correlation_analysis-1.png differ diff --git a/example/report_files/figure-html/missing_profile-1.png b/example/report_files/figure-html/missing_profile-1.png new file mode 100644 index 0000000..56bc7b9 Binary files /dev/null and b/example/report_files/figure-html/missing_profile-1.png differ diff --git a/example/report_files/figure-html/plot_frequency_bar-1.png b/example/report_files/figure-html/plot_frequency_bar-1.png new file mode 100644 index 0000000..b00ba21 Binary files /dev/null and b/example/report_files/figure-html/plot_frequency_bar-1.png differ diff --git a/example/report_files/figure-html/plot_histogram-1.png b/example/report_files/figure-html/plot_histogram-1.png new file mode 100644 index 0000000..15c5f86 Binary files /dev/null and b/example/report_files/figure-html/plot_histogram-1.png differ diff --git a/example/report_files/figure-html/plot_intro-1.png b/example/report_files/figure-html/plot_intro-1.png new file mode 100644 index 0000000..a7d4327 Binary files /dev/null and b/example/report_files/figure-html/plot_intro-1.png differ diff --git a/example/report_files/figure-html/plot_normal_qq-1.png b/example/report_files/figure-html/plot_normal_qq-1.png new file mode 100644 index 0000000..ad35759 Binary files /dev/null and b/example/report_files/figure-html/plot_normal_qq-1.png differ diff --git a/preprocessing.R b/preprocessing.R new file mode 100644 index 0000000..aa1fc80 --- /dev/null +++ b/preprocessing.R @@ -0,0 +1,45 @@ +#Preprocessing Data +#### 1.1 MetaData +metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE) +metadata <- plyr::ldply(metadata, data.frame) #flatten the list into a data frame + +DataExplorer::plot_intro(metadata) +DataExplorer::plot_missing(metadata) +metadata <- filter.columns.as.na(metadata) +#15 colums were dropped because of the NA values. From here I'd suggest to extract colums of interest because many arent necessary for the evaluation. +metadata <- rename.columns(metadata) +#Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names. +metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity)) +#the last thing you have to do is to change your column with the patient/case id from a factor to characters. +metadataSelect$case_id <- as.character(metadataSelect$case_id) +####1.2 Organ data | Primary cancer Data | Expression data etc. +#1.2.1 Load your file (filteredOrgan.Rdata or .rds file) +lung <- readRDS("lungFiltered.RDS") +#1.2.2 filter +lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix) +lungX <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix) +lungXY <- multimodalR::filterForXChromosomeGenes(output = lungX$Output,expressionmatrix = lungX$Expressionmatrix) +lungXY <- remove.x(lungXY) + +### Now you are set to work with your objects created by multimodalR - Process Data +#2.2 Match meta data case_id with filteredOrgan case_id +lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id") +lungMeta <- drop.unused.levels(lungMeta) +lungMeta <- add.stage.simple(meta_data = lungMeta, tumor_stage = "tumor_stage", new_name = "stage") +#optional: reorder columns / filter out small counts of factor levels +#lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20) +#2.3 Make data tables and add expression values to datatables +lungMetaExpression <- create.data.tables.new(lungMeta, lungXY) +lungMetaExpression <- add.expression.new(lungXY, lungMetaExpression, key = "case_id") +#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15)) +#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15)) +#get overview of data +DataExplorer::plot_bar(lungMetaExpression$SFTPB) +DataExplorer::plot_histogram(lungMeta) +#decide if you want to drop certain factor levels because they are so small + +x <- DataExplorer::split_columns(lungMeta) +#how often gene is splitted into modality groups +y <- counts.per.group(lungMetaExpression) +ymeta <- merge(y, lungMeta, by = "case_id") #merge counts of groups +z <- counts.per.group(lungMetaExpression, ".id", "group")