From c7ebae450873ff56efc40843bd81407c39158a60 Mon Sep 17 00:00:00 2001 From: sebastianlieske Date: Wed, 30 Jan 2019 18:33:58 +0100 Subject: [PATCH] filter meta data, finished section 2.2 --- example/preprocessing.R | 2 +- lung_example.Rmd | 26 +++++++++++++++++++++++--- lung_example.html | 26 +++++++++++++++++++++++--- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/example/preprocessing.R b/example/preprocessing.R index b9d2efd..43152c1 100644 --- a/example/preprocessing.R +++ b/example/preprocessing.R @@ -11,7 +11,7 @@ metadata <- rename.columns(metadata) #Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names. metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity)) #the last thing you have to do is to change your column with the patient/case id from a factor to characters. -metadataSelect$case_id <- as.character(metadataSelect$case_id) +#metadataSelect$case_id <- as.character(metadataSelect$case_id) ####1.2 Organ data | Primary cancer Data | Expression data etc. #1.2.1 Load your file (filteredOrgan.Rdata or .rds file) lung <- readRDS("lungFiltered.RDS") diff --git a/lung_example.Rmd b/lung_example.Rmd index b11b34d..ce68867 100644 --- a/lung_example.Rmd +++ b/lung_example.Rmd @@ -23,8 +23,18 @@ The meta data, a JSON file, can be obtained using the TCGA database or by loadin metadata <- RJSONIO::fromJSON("clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE) lung <- readRDS("lungFiltered.RDS") ``` +#### 2.1 Filter Expression Data +The output of filtering done by multimodalR is a large list for a cancer type consisting of 2 elements: An "Output" containing information about the genes and the modality groups and an "Expressionmatrix" with the gene expression values for every gene and patient. 
+If not already done, you need to process this data a little bit further by using functions of multimodalR. + +```{r eval = FALSE} +lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix) +lungY <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix) +lungXY <- multimodalR::filterForXChromosomeGenes(output = lungY$Output,expressionmatrix = lungY$Expressionmatrix) +lungXY <- remove.x(lungXY) #remove the unnecessary "X" infront of case_id +``` -#### 2.1 Filter Metadata +#### 2.2 Filter Meta Data The metadata is a large list that needs to be flattened into a data table. Furthermore, we want to filter out any columns with NA values and select columns of interests. ```{r eval = FALSE} @@ -32,7 +42,17 @@ metadata <- plyr::ldply(metadata, data.frame) #flatten the list metadata <- filter.columns.as.na(metadata, "not reported") #filter any column consisting of NA metadata <- rename.columns(metadata) #shorten the column names ``` -By using filter.column.as.na and rename.columns any colums consisting of only NA values are dropped and the remaining column names are shortened. It may be possible that there are several duplicated names. +By using filter.column.as.na and rename.columns any colums consisting of only NA values are dropped and the remaining column names are shortened. It may be possible that there are several duplicated names. From here, you need to select the colums of interest you want to keep. Note that you may have duplicated column names. In this example eleven colums are selected. -Then we are going to match the meta data with the gene expression data by the key "case_id" which is a unique identifier for a patient. 
\ No newline at end of file +```{r eval = FALSE} +metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity)) +``` +Then we are going to match the meta data with the gene expression data by the key "case_id" which is a unique identifier for a patient to get meta data specific to the cancer type. After subsetting every factor from the original data frame is kept and they need to be removed. Additionally, the column "tumor_stage" contains values encoded as for example "iiia" representing roman numbers with a possible subtype. For better comparison a new column "stage" is created that contains roman numerals representing the tumor stage without the subtype. + +```{r eval = FALSE} +metadataSelect <- subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity)) +lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id") #match both objects +lungMeta <- drop.unused.levels(lungMeta) #drop unused factor levels in whole data frame +lungMeta <- add.stage.simple(lungMeta, tumor_stage = "tumor_stage", new_name = "stage") #adds a new column +``` \ No newline at end of file diff --git a/lung_example.html b/lung_example.html index 3c0d39b..659f823 100644 --- a/lung_example.html +++ b/lung_example.html @@ -197,12 +197,32 @@

1 Installing Packages

2 Loading and Preprocessing Data

-

You need to load two files: * Meta data * filtered expression data

+

You need to load two files: * meta data * filtered expression data

The meta data, a JSON file, can be obtained using the TCGA database or by loading the existing data. The filtered expression data is generated by multimodalR and may be saved as a .RData or .RDS file.

metadata <- RJSONIO::fromJSON("clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
 lung <- readRDS("lungFiltered.RDS")
-
-

2.1 Filter Metadata

+
+

2.1 Filter Expression Data

+

The output of filtering done by multimodalR is a large list for a cancer type consisting of 2 elements: An “Output” containing information about the genes and the modality groups and an “Expressionmatrix” with the gene expression values for every gene and patient. If not already done, you need to process this data a little bit further by using functions of multimodalR.

+
lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
+lungY <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
+lungXY <- multimodalR::filterForXChromosomeGenes(output = lungY$Output,expressionmatrix = lungY$Expressionmatrix)
+lungXY <- remove.x(lungXY) #remove the unnecessary "X" infront of case_id
+
+
+

2.2 Filter Meta Data

+

The metadata is a large list that needs to be flattened into a data table. Furthermore, we want to filter out any columns consisting only of NA values and select the columns of interest.

+
metadata <- plyr::ldply(metadata, data.frame)                 #flatten the list into a data frame
+metadata <- filter.columns.as.na(metadata, "not reported")    #filter any column consisting of NA
+metadata <- rename.columns(metadata)                          #shorten the column names
+

By using filter.columns.as.na and rename.columns, any columns consisting of only NA values are dropped and the remaining column names are shortened. It is possible that several column names are duplicated.
+From here, you need to select the columns of interest you want to keep. Note that you may have duplicated column names. In this example ten columns are selected.

+
metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+

Then we are going to match the meta data with the gene expression data by the key “case_id”, which is a unique identifier for a patient, to get meta data specific to the cancer type. After subsetting, every factor level from the original data frame is kept, so the unused levels need to be dropped. Additionally, the column “tumor_stage” contains values encoded as, for example, “iiia”, representing Roman numerals with a possible subtype. For better comparison, a new column “stage” is created that contains Roman numerals representing the tumor stage without the subtype.

+
metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id") #match both objects 
+lungMeta <- drop.unused.levels(lungMeta) #drop unused factor levels in whole data frame 
+lungMeta <- add.stage.simple(lungMeta, tumor_stage = "tumor_stage", new_name = "stage") #adds a new column