# mmRmeta/example/preprocessing.R
# Preprocessing data ----

#### 1.1 Metadata ----
# Read the clinical metadata export. `nullValue = NA` replaces JSON nulls with
# NA and `simplify = FALSE` keeps the parsed result as nested lists instead of
# collapsing them into vectors.
metadata <- RJSONIO::fromJSON(
  "P:/TCGA/clinical.cases_selection.2019-01-18.json",
  nullValue = NA,
  simplify = FALSE
)
# Flatten the list into a data frame: data.frame() is applied to every list
# element and the results are combined.
metadata <- plyr::ldply(metadata, data.frame)

# Overview plots of the raw metadata and of its missing values.
DataExplorer::plot_intro(metadata)
DataExplorer::plot_missing(metadata)

# filter.columns.as.na() is defined elsewhere; presumably it drops columns
# based on their NA content -- TODO confirm against its definition.
metadata <- filter.columns.as.na(metadata)
# In this example 15 columns were dropped because of NA values. From here on it
# is advisable to extract only the columns of interest, because many are not
# necessary for the evaluation.

# rename.columns() is defined elsewhere; presumably it cleans up the column
# names produced by the flattening step -- TODO confirm.
metadata <- rename.columns(metadata)

# Now select your columns of interest. For this example 10 columns are
# selected. Note that you may have duplicated column names.
metadataSelect <- subset(
  metadata,
  select = c(
    case_id,
    tumor_stage,
    primary_diagnosis,
    site_of_resection_or_biopsy,
    vital_status,
    days_to_death,
    age_at_diagnosis,
    gender,
    race,
    ethnicity
  )
)

# Last step (currently disabled): convert the patient/case id column from a
# factor to character if it was read in as a factor.
# metadataSelect$case_id <- as.character(metadataSelect$case_id)
#### 1.2 Organ data | primary cancer data | expression data etc. ----

# 1.2.1 Load your file (filteredOrgan.Rdata or .rds file).
# The object is used below as a list with `$Output` and `$Expressionmatrix`.
lung <- readRDS("lungFiltered.RDS")

# 1.2.2 Filter
# Each multimodalR step is fed `$Output` and `$Expressionmatrix` and its result
# is again accessed through those two elements.
lung <- multimodalR::updateGeneNames(
  filteredOutput = lung$Output,
  lung$Expressionmatrix
)
# The Y-chromosome gene filter runs first; the X-chromosome gene filter is then
# applied to its result.
lungX <- multimodalR::filterForYChromosomeGenes(
  output = lung$Output,
  expressionmatrix = lung$Expressionmatrix
)
lungXY <- multimodalR::filterForXChromosomeGenes(
  output = lungX$Output,
  expressionmatrix = lungX$Expressionmatrix
)
# remove.x() is defined elsewhere; its exact effect is not visible here --
# TODO confirm what it strips from the filtered object.
lungXY <- remove.x(lungXY)
### Now you are set to work with the objects created by multimodalR ----
### Process data

# 2.2 Match metadata case_id with filteredOrgan case_id
# All helpers in this section are defined elsewhere in the project.
lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id")
lungMeta <- drop.unused.levels(lungMeta)
# Adds a column named "stage" that is built from the "tumor_stage" column
# (presumably a simplified staging -- TODO confirm in add.stage.simple()).
lungMeta <- add.stage.simple(
  meta_data = lungMeta,
  tumor_stage = "tumor_stage",
  new_name = "stage"
)

# Optional: reorder columns / filter out factor levels with small counts.
# lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20)

# 2.3 Make data tables and add expression values to the data tables.
# The result is used below as a list of per-gene tables (e.g. `$SFTPB`).
lungMetaExpression <- create.data.tables.new(lungMeta, lungXY)
lungMetaExpression <- add.expression.new(
  lungXY,
  lungMetaExpression,
  key = "case_id"
)
# lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15))
# lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15))

# Get an overview of the data: bar charts for one gene table and histograms
# for the matched metadata.
DataExplorer::plot_bar(lungMetaExpression$SFTPB)
DataExplorer::plot_histogram(lungMeta)
# Decide whether you want to drop certain factor levels because they are so
# small.

# 3. Calculation ----
# All make.*/proptest.*/counts.* helpers are defined elsewhere in the project.

# 3.1 Proportions
lungDiagnosisProp <- make.prop.frame(
  lungMetaExpression, "group", "primary_diagnosis"
)
lungStageProp <- make.prop.frame(lungMetaExpression, "group", "stage")
lungSiteProp <- make.prop.frame(
  lungMetaExpression, "group", "site_of_resection_or_biopsy"
)

lungCalculatedMeta <- make.calculated.metadata.new(lungMetaExpression)

# NOTE(review): the numeric arguments below look like column positions in the
# per-gene tables (judging by the result labels, e.g. 7 ~ age_at_diagnosis,
# 6 ~ days_to_death) -- TODO confirm against make.kruskal.frame(); positions
# break silently if the selected columns change.
lungAgeKruskal <- make.kruskal.frame(
  lungMetaExpression, 7, 12, "age.kruskal"
)
lungDeathInKruskal <- make.kruskal.frame(
  lungMetaExpression, 6, 12, "deathIn.kruskal"
)

# Proportion tests on the calculated metadata; p-value adjustment is disabled.
lungDeadProp <- proptest.template(
  lungCalculatedMeta,
  col_counts = 4,
  col_total = 2,
  col_name = "dead.proportion",
  p_adjust = FALSE
)
lungGenderProp <- proptest.template(
  lungCalculatedMeta,
  col_counts = "n.male",
  col_total = 2,
  col_name = "gender.proportion",
  p_adjust = FALSE
)
lungMaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "male")
lungFemaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "female")

lungStageExpressionKruskal <- make.kruskal.frame(
  lungMetaExpression, 13, 11, "stageXexpression"
)
lungSiteExpressionKruskal <- make.kruskal.frame(
  lungMetaExpression, 13, 4, "siteXexpression"
)
lungDiagnosisExpressionKruskal <- make.kruskal.frame(
  lungMetaExpression, 13, "primary_diagnosis", "diagnosisXexpression"
)

# Pairwise Wilcoxon tests of SFTA1P expression between the stage levels; the
# result is printed, not stored.
pairwise.wilcox.test(
  lungMetaExpression$SFTA1P$expression,
  lungMetaExpression$SFTA1P$stage
)

# How often a gene is split into modality groups.
# NOTE(review): `first_grouping` is passed empty, so R treats it as a missing
# argument; this only works if counts.per.group() defines a default for it.
# Confirm the intended grouping column and fill it in.
lungCountGroup <- counts.per.group(lungMetaExpression, first_grouping = )
lungCountGene <- counts.per.group(lungMetaExpression, ".id", "group")