# example/preprocessing.R

# Preprocessing data
#### 1.1 Metadata
# Read the clinical JSON export. nullValue = NA maps JSON nulls to NA and
# simplify = FALSE keeps the parsed result as nested lists.
# NOTE(review): the path is absolute and machine-specific (drive P:); adjust it
# before running this example on another machine.
metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
metadata <- plyr::ldply(metadata, data.frame) # flatten the list into a data frame

# Visual overview of the raw metadata and of its missing values.
DataExplorer::plot_intro(metadata)
DataExplorer::plot_missing(metadata)
# filter.columns.as.na() is a helper defined outside this script.
metadata <- filter.columns.as.na(metadata)
# For this data set 15 columns were dropped because of their NA values.
# From here it is advisable to extract only the columns of interest, because
# many of the remaining ones are not needed for the evaluation.
# rename.columns() is a helper defined outside this script.
metadata <- rename.columns(metadata)
# Now select the columns of interest; this example keeps 10 columns.
# Note that the metadata may contain duplicated column names.
metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
# Final step: convert the patient/case id column from factor to character.
# NOTE(review): the conversion below is commented out, so it is NOT applied.
# Whether case_id is a factor at this point depends on how data.frame() was
# configured (stringsAsFactors) - confirm and re-enable if needed.
#metadataSelect$case_id <- as.character(metadataSelect$case_id)
#### 1.2 Organ data | Primary cancer data | Expression data etc.
# 1.2.1 Load your file (filteredOrgan.Rdata or .rds file).
# The relative path is resolved against the current working directory.
lung <- readRDS("lungFiltered.RDS")
# 1.2.2 Filter
# Each multimodalR step below receives the previous result's $Output and
# $Expressionmatrix elements and returns an object that is read the same way
# by the next step. In updateGeneNames() the expression matrix is passed
# positionally (second argument).
lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
# NOTE(review): lungX stores the result of filterForYChromosomeGenes() and
# lungXY the result of filterForXChromosomeGenes() applied on top of it.
# Presumably the names list which chromosomes' genes were filtered so far, but
# the X/Y order looks swapped - confirm the intended naming.
lungX <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
lungXY <- multimodalR::filterForXChromosomeGenes(output = lungX$Output,expressionmatrix = lungX$Expressionmatrix)
# remove.x() is a helper defined outside this script; its effect cannot be
# verified from here.
lungXY <- remove.x(lungXY)

### Now you are set to work with the objects created by multimodalR - process data
# 2.2 Match the metadata case_id with the filteredOrgan case_id.
# subset.metadata(), drop.unused.levels() and add.stage.simple() are helpers
# defined outside this script; each result overwrites lungMeta.
lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id")
lungMeta <- drop.unused.levels(lungMeta)
# Adds a column named "stage" based on the "tumor_stage" column
# (presumably a simplified staging - confirm in add.stage.simple()).
lungMeta <- add.stage.simple(meta_data = lungMeta, tumor_stage = "tumor_stage", new_name = "stage")
# Optional: reorder columns / filter out factor levels with small counts.
# NOTE(review): the commented example below uses "primary.diagnosis", while the
# rest of this script uses "primary_diagnosis" - check the name before enabling.
#lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20)
# 2.3 Make data tables and add the expression values to them.
# The result is used below as a list with one element per gene symbol
# (e.g. lungMetaExpression$SFTPB).
lungMetaExpression <- create.data.tables.new(lungMeta, lungXY)
lungMetaExpression <- add.expression.new(lungXY, lungMetaExpression, key = "case_id")
# Optional per-table reordering, applied to every element of the list:
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15))
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15))
# Get an overview of the data: bar charts for the SFTPB table and histograms
# for the matched metadata.
DataExplorer::plot_bar(lungMetaExpression$SFTPB)
DataExplorer::plot_histogram(lungMeta)
# Decide here whether to drop certain factor levels because they are too small.

# 3. Calculation
# All make.*(), proptest.*() and counts.per.group() functions used below are
# helpers defined outside this script.
# 3.1 Proportions
# One proportion frame per metadata column, each combined with "group".
lungDiagnosisProp <- make.prop.frame(lungMetaExpression, "group", "primary_diagnosis")
lungStageProp <- make.prop.frame(lungMetaExpression, "group", "stage")
lungSiteProp <- make.prop.frame(lungMetaExpression, "group", "site_of_resection_or_biopsy")

lungCalculatedMeta <- make.calculated.metadata.new(lungMetaExpression)
# NOTE(review): several calls below address columns by position (7, 12, 6, 13,
# 11, 4, 2) while others use names ("primary_diagnosis", "n.male"). The
# positions depend on the column order produced by the earlier steps -
# verify them against the actual tables, or switch to names if the helpers
# accept them everywhere.
lungAgeKruskal <- make.kruskal.frame(lungMetaExpression, 7, 12, "age.kruskal")
lungDeathInKruskal <- make.kruskal.frame(lungMetaExpression, 6, 12, "deathIn.kruskal")
# Proportion tests on the calculated metadata; p_adjust = FALSE is passed
# explicitly to both calls.
lungDeadProp <- proptest.template(lungCalculatedMeta, col_counts = 4 , col_total = 2, col_name = "dead.proportion", p_adjust = FALSE)
lungGenderProp <- proptest.template(lungCalculatedMeta, col_counts = "n.male" , col_total = 2, col_name = "gender.proportion", p_adjust = FALSE)
lungMaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "male")
lungFemaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "female")

# Kruskal frames relating expression to stage, site and diagnosis
# (labels "stageXexpression", "siteXexpression", "diagnosisXexpression").
lungStageExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, 11, "stageXexpression")
lungSiteExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, 4, "siteXexpression")
lungDiagnosisExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, "primary_diagnosis", "diagnosisXexpression")

# Pairwise Wilcoxon tests of SFTA1P expression between the stage levels.
# The result is not assigned to a variable.
pairwise.wilcox.test(lungMetaExpression$SFTA1P$expression, lungMetaExpression$SFTA1P$stage)


# How often a gene is split into modality groups.
# NOTE(review): `first_grouping = ` below has no value, so the argument is
# passed as missing. Unless counts.per.group() defines a default for it, this
# call fails - fill in the intended grouping column.
lungCountGroup<- counts.per.group(lungMetaExpression, first_grouping = )
lungCountGene<- counts.per.group(lungMetaExpression, ".id", "group")