From c7ebae450873ff56efc40843bd81407c39158a60 Mon Sep 17 00:00:00 2001
From: sebastianlieske <sebastian.lieske@mpi-bn.mpg.de>
Date: Wed, 30 Jan 2019 18:33:58 +0100
Subject: [PATCH] filter meta data, finished section 2.2

---
 example/preprocessing.R |  2 +-
 lung_example.Rmd        | 26 +++++++++++++++++++++++---
 lung_example.html       | 26 +++++++++++++++++++++++---
 3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/example/preprocessing.R b/example/preprocessing.R
index b9d2efd..43152c1 100644
--- a/example/preprocessing.R
+++ b/example/preprocessing.R
@@ -11,7 +11,7 @@ metadata <- rename.columns(metadata)
 #Now you can select your colums of interest. For this example 11 colums are selected. Note that you may have duplicated column names.
 metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
 #the last thing you have to do is to change your column with the patient/case id from a factor to characters.
-metadataSelect$case_id <- as.character(metadataSelect$case_id)
+#metadataSelect$case_id <- as.character(metadataSelect$case_id)
 ####1.2 Organ data | Primary cancer Data | Expression data etc.
 #1.2.1 Load your file (filteredOrgan.Rdata or .rds file)
 lung <- readRDS("lungFiltered.RDS")
diff --git a/lung_example.Rmd b/lung_example.Rmd
index b11b34d..ce68867 100644
--- a/lung_example.Rmd
+++ b/lung_example.Rmd
@@ -23,8 +23,18 @@ The meta data, a JSON file, can be obtained using the TCGA database or by loadin
 metadata <- RJSONIO::fromJSON("clinical.cases_selection.2019-01-18.json", nullValue = NA, simplify = FALSE)
 lung <- readRDS("lungFiltered.RDS")
 ```
+#### 2.1 Filter Expression Data 
+The output of filtering done by multimodalR is a large list for a cancer type consisting of 2 elements: An "Output" containing information about the genes and the modality groups and an "Expressionmatrix" with the gene expression values for every gene and patient.
+If not already done, you need to process this data a little bit further by using functions of multimodalR.
+
+```{r eval = FALSE}
+lung <- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
+lungY <- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
+lungXY <- multimodalR::filterForXChromosomeGenes(output = lungY$Output,expressionmatrix = lungY$Expressionmatrix)
+lungXY <- remove.x(lungXY) #remove the unnecessary "X" in front of case_id
+```
   
-#### 2.1 Filter Metadata
+#### 2.2 Filter Meta Data
 The metadata is a large list that needs to be flattened into a data table. Furthermore, we want to filter out any columns with NA values and select columns of interests. 
 
 ```{r eval = FALSE}
@@ -32,7 +42,17 @@ metadata <- plyr::ldply(metadata, data.frame)                 #flatten the list
 metadata <- filter.columns.as.na(metadata, "not reported")    #filter any column consisting of NA
 metadata <- rename.columns(metadata)                          #shorten the column names
 ```
-By using filter.column.as.na and rename.columns any colums consisting of only NA values are dropped and the remaining column names are shortened. It may be possible that there are several duplicated names.
+By using filter.columns.as.na and rename.columns, any columns consisting of only NA values are dropped and the remaining column names are shortened. There may be several duplicated names.  
 From here, you need to select the colums of interest you want to keep. Note that you may have duplicated column names. In this example eleven colums are selected.
 
-Then we are going to match the meta data with the gene expression data by the key "case_id" which is a unique identifier for a patient.
\ No newline at end of file
+```{r eval = FALSE}
+metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+```
+Then we are going to match the meta data with the gene expression data by the key "case_id", which is a unique identifier for a patient, to get meta data specific to the cancer type. After subsetting, all factor levels from the original data frame are still kept, so the unused levels need to be dropped. Additionally, the column "tumor_stage" contains values such as "iiia", i.e. a Roman numeral with an optional subtype letter. For easier comparison, a new column "stage" is created that contains only the Roman numeral of the tumor stage, without the subtype.
+
+```{r eval = FALSE}
+metadataSelect <-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+lungMeta <- subset.metadata(metadataSelect, lungXY, key = "case_id") #match both objects 
+lungMeta <- drop.unused.levels(lungMeta) #drop unused factor levels in whole data frame 
+lungMeta <- add.stage.simple(lungMeta, tumor_stage = "tumor_stage", new_name = "stage") #adds a new column
+```
\ No newline at end of file
diff --git a/lung_example.html b/lung_example.html
index 3c0d39b..659f823 100644
--- a/lung_example.html
+++ b/lung_example.html
@@ -197,12 +197,32 @@ <h2>1 Installing Packages</h2>
 </div>
 <div id="loading-and-preprocessing-data" class="section level2">
 <h2>2 Loading and Preprocessing Data</h2>
-<p>You need to load two files: * Meta data * filtered expression data</p>
+<p>You need to load two files: * meta data * filtered expression data</p>
 <p>The meta data, a JSON file, can be obtained using the TCGA database or by loading the existing data. The filtered expression data is generated by multimodalR and may be saved as a .RData or .RDS file.</p>
 <pre class="r"><code>metadata &lt;- RJSONIO::fromJSON(&quot;clinical.cases_selection.2019-01-18.json&quot;, nullValue = NA, simplify = FALSE)
 lung &lt;- readRDS(&quot;lungFiltered.RDS&quot;)</code></pre>
-<div id="filter-metadata" class="section level4">
-<h4>2.1 Filter Metadata</h4>
+<div id="filter-expression-data" class="section level4">
+<h4>2.1 Filter Expression Data</h4>
+<p>The output of filtering done by multimodalR is a large list for a cancer type consisting of 2 elements: An “Output” containing information about the genes and the modality groups and an “Expressionmatrix” with the gene expression values for every gene and patient. If not already done, you need to process this data a little bit further by using functions of multimodalR.</p>
+<pre class="r"><code>lung &lt;- multimodalR::updateGeneNames(filteredOutput = lung$Output, lung$Expressionmatrix)
+lungY &lt;- multimodalR::filterForYChromosomeGenes(output = lung$Output,expressionmatrix = lung$Expressionmatrix)
+lungXY &lt;- multimodalR::filterForXChromosomeGenes(output = lungY$Output,expressionmatrix = lungY$Expressionmatrix)
+lungXY &lt;- remove.x(lungXY) #remove the unnecessary &quot;X&quot; in front of case_id</code></pre>
+</div>
+<div id="filter-meta-data" class="section level4">
+<h4>2.2 Filter Meta Data</h4>
+<p>The metadata is a large list that needs to be flattened into a data table. Furthermore, we want to filter out any columns with NA values and select columns of interests.</p>
+<pre class="r"><code>metadata &lt;- plyr::ldply(metadata, data.frame)                 #flatten the list into a data frame
+metadata &lt;- filter.columns.as.na(metadata, &quot;not reported&quot;)    #filter any column consisting of NA
+metadata &lt;- rename.columns(metadata)                          #shorten the column names</code></pre>
+<p>By using filter.columns.as.na and rename.columns, any columns consisting of only NA values are dropped and the remaining column names are shortened. There may be several duplicated names.<br />
+From here, you need to select the colums of interest you want to keep. Note that you may have duplicated column names. In this example eleven colums are selected.</p>
+<pre class="r"><code>metadataSelect &lt;-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))</code></pre>
+<p>Then we are going to match the meta data with the gene expression data by the key “case_id”, which is a unique identifier for a patient, to get meta data specific to the cancer type. After subsetting, all factor levels from the original data frame are still kept, so the unused levels need to be dropped. Additionally, the column “tumor_stage” contains values such as “iiia”, i.e. a Roman numeral with an optional subtype letter. For easier comparison, a new column “stage” is created that contains only the Roman numeral of the tumor stage, without the subtype.</p>
+<pre class="r"><code>metadataSelect &lt;-  subset(metadata, select = c(case_id, tumor_stage, primary_diagnosis, site_of_resection_or_biopsy, vital_status, days_to_death, age_at_diagnosis, gender, race, ethnicity))
+lungMeta &lt;- subset.metadata(metadataSelect, lungXY, key = &quot;case_id&quot;) #match both objects 
+lungMeta &lt;- drop.unused.levels(lungMeta) #drop unused factor levels in whole data frame 
+lungMeta &lt;- add.stage.simple(lungMeta, tumor_stage = &quot;tumor_stage&quot;, new_name = &quot;stage&quot;) #adds a new column</code></pre>
 </div>
 </div>