---
title: "DESeq2 Tutorial"
author: "Franziska Metge"
date: "June 13, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
This tutorial follows <http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html>
## Contents
* Preparing count matrices
* The DESeqDataSet object
* Exploratory analysis and visualization
* Differential expression analysis
* Plotting results
## Preparing count matrices
We will use an example dataset from the *airway* package. It contains RNA-seq data from four human [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) smooth muscle cell lines, each measured under two conditions (untreated control and dexamethasone treated).
The original study is described at <https://www.ncbi.nlm.nih.gov/pubmed/24926665>.
```{bash, eval = FALSE}
# to align raw reads stored in FASTQ files, you can use for example STAR
# to align them to the reference genome
STAR --genomeDir <STAR/index/directory> \
     --runThreadN 10 \
     --readFilesIn <reads_R1.fastq.gz> [<reads_R2.fastq.gz>] \
     --outFileNamePrefix <sample_name.> \
     --outSAMtype BAM SortedByCoordinate \
     --readFilesCommand zcat
# this call produces a binary alignment file in BAM format with reads sorted by coordinate
```
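If you align your own samples, the sorted BAM files can also be indexed from within R before counting. Below is a minimal sketch using Rsamtools; the file names are placeholders, not files shipped with this tutorial.
```{r, eval = FALSE}
# minimal sketch (not part of the airway workflow): index sorted BAM files so
# that genomic regions can be accessed efficiently later; the file names below
# are placeholders for your own samples
library("Rsamtools")
my_bams <- c("sample1.sorted.bam", "sample2.sorted.bam")
indexBam(my_bams)
```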
```{r, eval = TRUE, collapse = T}
# loading all libraries
library("airway")
library("Rsamtools")
library("GenomicFeatures")
library("GenomicAlignments")
library("BiocParallel")
library("magrittr")
library("DESeq2")
library("vsn")
library("dplyr")
library("ggplot2")
library("pheatmap")
library("RColorBrewer")
library("PoiClaClu")
library("ggbeeswarm")
library("apeglm")
library("genefilter")
library("AnnotationDbi")
library("org.Hs.eg.db")
library("ReportingTools")
library("Gviz")
library("sva")
library("RUVSeq")
library("fission")
```
We will use the data from the [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) package, which contains 8 samples with a reduced number of reads that have already been aligned.
```{r, collapse = TRUE, eval = TRUE}
# source("https://bioconductor.org/biocLite.R")
# biocLite("airway")
library(airway)
# get the package's directory
indir <- system.file("extdata", package="airway", mustWork=TRUE)
# look at the files in the directory
list.files(indir)
```
```{r, collapse = TRUE, eval = TRUE}
csvfile <- file.path(indir, "sample_table.csv")
sampleTable <- read.csv(csvfile, row.names = 1)
sampleTable
```
```{r, collapse = TRUE, eval = TRUE}
filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam"))
file.exists(filenames)
```
```{r, collapse = TRUE, eval = TRUE}
library('Rsamtools')
bamfiles <- BamFileList(filenames, yieldSize = 2000000)
seqinfo(bamfiles[1])
```
```{r, collapse = TRUE, eval = TRUE}
library("GenomicFeatures")
gtffile <- file.path(indir,"Homo_sapiens.GRCh37.75_subset.gtf")
txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character())
txdb
ebg <- exonsBy(txdb, by = 'gene')
ebg
```
```{r, collapse=TRUE, eval = TRUE}
library("GenomicAlignments")
library("BiocParallel")
register(SerialParam())
```
```{r, collapse=TRUE, eval=TRUE}
se <- summarizeOverlaps(features = ebg, reads = bamfiles,
                        mode = "Union",
                        singleEnd = FALSE,
                        ignore.strand = TRUE,
                        fragments = TRUE)
se
dim(se)
head(assay(se), 3)
colSums(assay(se))
```
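If you want to use the raw counts outside of R, the count matrix stored in the SummarizedExperiment can be exported; a minimal sketch, where the file name is a placeholder.
```{r, eval = FALSE}
# minimal sketch: export the raw count matrix to a CSV file;
# "airway_counts.csv" is a placeholder file name
write.csv(assay(se), file = "airway_counts.csv")
```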
```{r, collapse=TRUE, eval=TRUE}
colData(se) <- DataFrame(sampleTable)
colData(se)
```
```{r, collapse = TRUE, eval = TRUE}
# now with the full data
data("airway")
se <- airway
se$dex <- relevel(se$dex, "untrt")
library(DESeq2)
dds <- DESeqDataSet(se, design = ~ cell + dex)
nrow(dds)
# filter for expressed genes
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
```
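If you start from a plain count matrix instead of a SummarizedExperiment, the DESeqDataSet can also be constructed with `DESeqDataSetFromMatrix()`; a minimal sketch reusing the objects from above.
```{r, eval = FALSE}
# minimal sketch: build a DESeqDataSet directly from a count matrix and a
# sample table rather than from a SummarizedExperiment
ddsMat <- DESeqDataSetFromMatrix(countData = assay(se),
                                 colData = colData(se),
                                 design = ~ cell + dex)
```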
```{r, collapse = TRUE, eval = TRUE}
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
#install.packages('hexbin')
library('hexbin')
meanSdPlot(cts, ranks = FALSE)
```
```{r, collapse = TRUE, eval = TRUE}
log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)
# two possible transformations
# variance stabilizing transformation (vst) recommended for large datasets (n > 30)
vsd <- vst(dds, blind = FALSE)
head(assay(vsd), 3)
# regularized-logarithm transformation (rlog) recommended for smaller datasets (n < 30)
rld <- rlog(dds, blind = FALSE)
head(assay(rld), 3)
# with blind = TRUE, the treatment groups would not be considered in the transformation
```
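As a quick sanity check, the mean-sd plots of the transformed values should look flatter than the log2(x + 1) plot above; for example:
```{r, eval = FALSE}
# sketch: mean-sd plots for the two transformations, to compare with the
# log2(x + 1) plot above
meanSdPlot(assay(vsd), ranks = FALSE)
meanSdPlot(assay(rld), ranks = FALSE)
```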
```{r, collapse = TRUE, eval = FALSE}
# compare transformation graphically
library("dplyr")
library("ggplot2")
dds <- estimateSizeFactors(dds)
df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized = TRUE)[, 1:2] + 1)) %>%
    mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid(. ~ transformation)
```
```{r, collapse = TRUE, eval = TRUE}
sampleDists <- dist(t(assay(vsd)))
sampleDists
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)
```
```{r, collapse=TRUE, eval = TRUE}
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)
```
```{r, collapse = TRUE, eval = TRUE}
plotPCA(vsd, intgroup = c("dex", "cell"))
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed()
```
```{r, collapse=TRUE, eval = TRUE}
mds <- as.data.frame(colData(vsd)) %>%
  cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed()
```
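The same kind of MDS plot can be drawn from the Poisson distances computed on the raw counts; a short sketch reusing the objects from above.
```{r, eval = FALSE}
# sketch: MDS plot based on the Poisson distance matrix of the raw counts
mdsPois <- as.data.frame(colData(dds)) %>%
  cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed()
```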
## Differential expression analysis
```{r, collapse = TRUE, eval = TRUE}
colData(dds)
design(dds)
dds <- DESeq(dds)
res <- results(dds)
res
# or, if the comparison of interest is not the last level of the factor:
res <- results(dds, contrast=c("dex","trt","untrt"))
summary(res)
# significant results
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
# significant results with at least a 2-fold increase (lfcThreshold = 1)
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
```
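To keep the results for later use, the table can be ordered by adjusted p-value and written to disk; a minimal sketch, where the file name is a placeholder.
```{r, eval = FALSE}
# sketch: order the results by adjusted p-value and export them;
# the file name is a placeholder
resOrdered <- res[order(res$padj), ]
write.csv(as.data.frame(resOrdered), file = "dex_trt_vs_untrt_results.csv")
```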
## Different comparisons
```{r, collapse = TRUE, eval = TRUE}
results(dds, contrast = c("cell", "N061011", "N61311"))
```
```{r, collapse = TRUE, eval = TRUE}
# select genes significant at an FDR of 10%
sum(res$padj < 0.1, na.rm=TRUE)
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
```
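The table above is sorted so that the strongest down-regulated genes come first; sorting in decreasing order shows the strongest up-regulated genes.
```{r, eval = FALSE}
# sketch: significant genes with the strongest up-regulation
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
```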
## Plotting results
```{r, collapse=TRUE, eval=TRUE}
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))
```
```{r, collapse = TRUE, eval = TRUE}
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex", "cell"),
                         returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
  scale_y_log10() + geom_beeswarm(cex = 3)
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line()
```
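A heatmap of the most variable genes is another common way to visualize the results; a short sketch using genefilter and pheatmap, which are already loaded above.
```{r, eval = FALSE}
# sketch: heatmap of the 20 genes with the highest variance across samples,
# centered on each gene's mean
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
mat <- assay(vsd)[topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell", "dex")])
pheatmap(mat, annotation_col = anno)
```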
```{r, collapse = TRUE, eval = TRUE}
library("apeglm")
resultsNames(dds)
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
plotMA(res, ylim = c(-5, 5), xlim = c(0, 10000))
```
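Since AnnotationDbi and org.Hs.eg.db are already loaded, gene symbols can be attached to the results table (the row names are Ensembl gene IDs); a minimal sketch.
```{r, eval = FALSE}
# sketch: map Ensembl gene IDs to gene symbols and add them to the results
res$symbol <- mapIds(org.Hs.eg.db,
                     keys = row.names(res),
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
head(res)
```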
<!-- find an additional dataset and let them work through it by themselves, if they don't have their own data to play around with -->