Skip to content
Permalink
c7ebae4508
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
64 lines (56 sloc) 4.1 KB
# Preprocessing Data
#### 1.1 MetaData
# Read the clinical metadata export (JSON). The path is an absolute location on
# drive P: and has to be adapted on other machines. nullValue = NA maps JSON
# nulls to NA; simplify = FALSE keeps the parsed result as nested lists.
metadata <- RJSONIO::fromJSON("P:/TCGA/clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
metadata <- plyr::ldply(metadata, data.frame) # flatten the list into a data frame (data.frame is applied to every list element, results are combined)
# Visual overview of the raw metadata and of its missing values.
DataExplorer::plot_intro(metadata)
DataExplorer::plot_missing(metadata)
# filter.columns.as.na is a project helper defined elsewhere; presumably it
# drops columns that consist of NA values -- TODO confirm against its definition.
metadata <- filter.columns.as.na(metadata)
# In the original run 15 columns were dropped because of NA values. From here on
# it is advisable to extract only the columns of interest, because many of them
# are not necessary for the evaluation.
# rename.columns is a project helper defined elsewhere (not visible in this file).
metadata <- rename.columns(metadata)
# Now select the columns of interest. In this example the 10 columns listed in
# the select vector below are kept. Note that the data may contain duplicated
# column names.
metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
# Last step: convert the patient/case id column from a factor to character.
# NOTE(review): the conversion below is commented out, so it is not executed;
# re-enable it if case_id is still a factor at this point.
#metadataSelect$case_id <- as.character(metadataSelect$case_id)
#### 1.2 Organ data | Primary cancer data | Expression data etc.
# 1.2.1 Load your file (filteredOrgan.Rdata or .rds file).
# The path is relative, so it is resolved against the current working directory.
lung <- readRDS("lungFiltered.RDS")
# 1.2.2 Filter
# The loaded object is accessed through its $Output and $Expressionmatrix
# elements; the results of the multimodalR calls below are accessed the same way.
lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
# NOTE(review): lungX holds the result of the Y-chromosome filter and lungXY the
# result of additionally applying the X-chromosome filter. Whether these
# functions keep or remove the respective genes is not visible here -- confirm
# in the multimodalR documentation.
lungX <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
lungXY <- multimodalR::filterForXChromosomeGenes(output = lungX$Output,expressionmatrix = lungX$Expressionmatrix)
# remove.x is a project helper defined elsewhere; its effect is not visible here.
lungXY <- remove.x(lungXY)
### Now you are set to work with the objects created by multimodalR - Process Data
# 2.2 Match metadata case_id with filteredOrgan case_id
# All helpers called in this section (subset.metadata, drop.unused.levels,
# add.stage.simple, create.data.tables.new, add.expression.new) are project
# functions defined elsewhere; the notes below describe only what the calls show.
# Restrict the selected metadata using lungXY, matching on the "case_id" key.
lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id")
lungMeta <- drop.unused.levels(lungMeta)
# Presumably derives a simplified staging column named "stage" from the
# "tumor_stage" column -- TODO confirm against add.stage.simple.
lungMeta <- add.stage.simple(meta_data = lungMeta, tumor_stage = "tumor_stage", new_name = "stage")
# Optional: reorder columns / filter out small counts of factor levels.
# NOTE(review): the disabled call below uses "primary.diagnosis" (with a dot),
# whereas the column selected above is named primary_diagnosis.
#lungMeta <- reorder.column(lungMeta, "primary.diagnosis", 20)
# 2.3 Make data tables and add expression values to the data tables.
# The result is indexed by gene symbol further down (e.g. $SFTPB, $SFTA1P),
# i.e. it is a list with one element per gene.
lungMetaExpression <- create.data.tables.new(lungMeta, lungXY)
lungMetaExpression <- add.expression.new(lungXY, lungMetaExpression, key = "case_id")
# Optional per-gene variants of the reordering step (disabled).
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "primary_diagnosis", 15))
#lungMetaExpression <- lapply(lungMetaExpression, function(x) reorder.column(x, "site_of_resection_or_biopsy", 15))
# Get an overview of the data: bar plots for the SFTPB element and histograms
# for the matched metadata.
DataExplorer::plot_bar(lungMetaExpression$SFTPB)
DataExplorer::plot_histogram(lungMeta)
# Based on these plots, decide whether to drop certain factor levels because
# they contain too few observations.
# 3. Calculation
# make.prop.frame, make.calculated.metadata.new, make.kruskal.frame,
# proptest.template, proptest.dead.gender and counts.per.group are project
# helpers defined elsewhere; the notes below describe only what the calls show.
# 3.1 Proportions
# One frame per metadata variable, each combined with the "group" column.
lungDiagnosisProp <- make.prop.frame(lungMetaExpression, "group", "primary_diagnosis")
lungStageProp <- make.prop.frame(lungMetaExpression, "group", "stage")
lungSiteProp <- make.prop.frame(lungMetaExpression, "group", "site_of_resection_or_biopsy")
lungCalculatedMeta <- make.calculated.metadata.new(lungMetaExpression)
# NOTE(review): the numeric arguments below are positional column indices.
# Assuming the 10 selected metadata columns keep their order and stage, group
# and expression are appended as columns 11, 12 and 13, then
# 4 = site_of_resection_or_biopsy, 6 = days_to_death, 7 = age_at_diagnosis,
# 11 = stage, 12 = group, 13 = expression -- TODO confirm; column names would
# be more robust than positions.
lungAgeKruskal <- make.kruskal.frame(lungMetaExpression, 7, 12, "age.kruskal")
lungDeathInKruskal <- make.kruskal.frame(lungMetaExpression, 6, 12, "deathIn.kruskal")
# NOTE(review): col_counts is given once as an index (4) and once as a name
# ("n.male"); col_total = 2 is positional in both calls -- verify against the
# column layout of lungCalculatedMeta.
lungDeadProp <- proptest.template(lungCalculatedMeta, col_counts = 4 , col_total = 2, col_name = "dead.proportion", p_adjust = FALSE)
lungGenderProp <- proptest.template(lungCalculatedMeta, col_counts = "n.male" , col_total = 2, col_name = "gender.proportion", p_adjust = FALSE)
lungMaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "male")
lungFemaleDeadProp <- proptest.dead.gender(lungCalculatedMeta, "female")
# Column 13 against stage (11), site (4) and primary diagnosis (by name).
lungStageExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, 11, "stageXexpression")
lungSiteExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, 4, "siteXexpression")
lungDiagnosisExpressionKruskal <- make.kruskal.frame(lungMetaExpression, 13, "primary_diagnosis", "diagnosisXexpression")
# Pairwise Wilcoxon tests of the expression values between the stage levels for
# the single gene SFTA1P. The result is not assigned to a variable.
pairwise.wilcox.test(lungMetaExpression$SFTA1P$expression, lungMetaExpression$SFTA1P$stage)
# How often a gene is split into modality groups.
# NOTE(review): first_grouping is passed empty in the next call, so it is
# treated as a missing argument; this only works if counts.per.group supplies a
# default for it -- confirm the intended grouping column.
lungCountGroup<- counts.per.group(lungMetaExpression, first_grouping = )
lungCountGene<- counts.per.group(lungMetaExpression, ".id", "group")