---
title: "DESeq2 Tutorial"
author: "Franziska Metge"
date: "June 13, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
This tutorial follows <http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html>
## Contents
* Preparing count matrices
* The DESeqDataSet object
* Exploratory analysis and visualization
* Differential expression analysis
* Plotting results
## Preparing count matrices
We will use an example dataset from the *airway* package. It contains RNA-seq data from four human [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) smooth muscle cell lines, each measured under two conditions (untreated control and dexamethasone treated).
The original study is described at <https://www.ncbi.nlm.nih.gov/pubmed/24926665>.
```{bash, eval = FALSE}
# to align raw reads stored in FASTQ files, you can use for example STAR
# to align them to the reference genome
STAR --genomeDir <STAR/index/directory> \
     --runThreadN 10 \
     --readFilesIn <reads_R1.fastq.gz> [<reads_R2.fastq.gz>] \
     --outFileNamePrefix <sample_name.> \
     --outSAMtype BAM SortedByCoordinate \
     --readFilesCommand zcat
# this call produces a binary alignment file in BAM format with reads sorted by coordinate
```
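If you align your own samples, the sorted BAM files can also be indexed from within R before counting. Below is a minimal sketch using Rsamtools; the file names are placeholders, not files shipped with this tutorial.
```{r, eval = FALSE}
# minimal sketch (not part of the airway workflow): index sorted BAM files so
# that genomic regions can be accessed efficiently later; the file names below
# are placeholders for your own samples
library("Rsamtools")
my_bams <- c("sample1.sorted.bam", "sample2.sorted.bam")
indexBam(my_bams)
```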
```{r, eval = TRUE, collapse = T}
# loading all libraries
library("airway")
library("Rsamtools")
library("GenomicFeatures")
library("GenomicAlignments")
library("BiocParallel")
library("magrittr")
library("DESeq2")
library("vsn")
library("dplyr")
library("ggplot2")
library("pheatmap")
library("RColorBrewer")
library("PoiClaClu")
library("ggbeeswarm")
library("apeglm")
library("genefilter")
library("AnnotationDbi")
library("org.Hs.eg.db")
library("ReportingTools")
library("Gviz")
library("sva")
library("RUVSeq")
library("fission")
```
We will use the data from the [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) package, which contains 8 samples with a reduced number of reads that have already been aligned.
```{r, collapse = TRUE, eval = TRUE}
# source("https://bioconductor.org/biocLite.R")
# biocLite("airway")
library(airway)
# get the package's directory
indir <- system.file("extdata", package="airway", mustWork=TRUE)
# look at the files in the directory
list.files(indir)
```
```{r, collapse = TRUE, eval = TRUE}
csvfile <- file.path(indir, "sample_table.csv")
sampleTable <- read.csv(csvfile, row.names = 1)
sampleTable
```
```{r, collapse = TRUE, eval = TRUE}
filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam"))
file.exists(filenames)
```
```{r, collapse = TRUE, eval = TRUE}
library('Rsamtools')
bamfiles <- BamFileList(filenames, yieldSize = 2000000)
seqinfo(bamfiles[1])
```
```{r, collapse = TRUE, eval = TRUE}
library("GenomicFeatures")
gtffile <- file.path(indir,"Homo_sapiens.GRCh37.75_subset.gtf")
txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character())
txdb
ebg <- exonsBy(txdb, by = 'gene')
ebg
```
```{r, collapse=TRUE, eval = TRUE}
library("GenomicAlignments")
library("BiocParallel")
register(SerialParam())
```
```{r, collapse=TRUE, eval=TRUE}
se <- summarizeOverlaps(features = ebg, reads = bamfiles,
                        mode = "Union",
                        singleEnd = FALSE,
                        ignore.strand = TRUE,
                        fragments = TRUE)
se
dim(se)
head(assay(se), 3)
colSums(assay(se))
```
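If you want to use the raw counts outside of R, the count matrix stored in the SummarizedExperiment can be exported; a minimal sketch, where the file name is a placeholder.
```{r, eval = FALSE}
# minimal sketch: export the raw count matrix to a CSV file;
# "airway_counts.csv" is a placeholder file name
write.csv(assay(se), file = "airway_counts.csv")
```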
```{r, collapse=TRUE, eval=TRUE}
colData(se) <- DataFrame(sampleTable)
colData(se)
```
```{r, collapse = TRUE, eval = TRUE}
# now with the full data
data("airway")
se <- airway
se$dex <- relevel(se$dex, "untrt")
library(DESeq2)
dds <- DESeqDataSet(se, design = ~ cell + dex)
nrow(dds)
# filter for expressed genes
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
```
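If you start from a plain count matrix instead of a SummarizedExperiment, the DESeqDataSet can also be constructed with `DESeqDataSetFromMatrix()`; a minimal sketch reusing the objects from above.
```{r, eval = FALSE}
# minimal sketch: build a DESeqDataSet directly from a count matrix and a
# sample table rather than from a SummarizedExperiment
ddsMat <- DESeqDataSetFromMatrix(countData = assay(se),
                                 colData = colData(se),
                                 design = ~ cell + dex)
```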
```{r, collapse = TRUE, eval = TRUE}
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
#install.packages('hexbin')
library('hexbin')
meanSdPlot(cts, ranks = FALSE)
```
```{r, collapse = TRUE, eval = TRUE}
log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)
# two possible transformations
# variance stabilizing transformation (vst) recommended for large datasets (n > 30)
vsd <- vst(dds, blind = FALSE)
head(assay(vsd), 3)
# regularized-logarithm transformation (rlog) recommended for smaller datasets (n < 30)
rld <- rlog(dds, blind = FALSE)
head(assay(rld), 3)
# with blind = TRUE, the treatment groups would not be considered in the transformation
```
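As a quick sanity check, the mean-sd plots of the transformed values should look flatter than the log2(x + 1) plot above; for example:
```{r, eval = FALSE}
# sketch: mean-sd plots for the two transformations, to compare with the
# log2(x + 1) plot above
meanSdPlot(assay(vsd), ranks = FALSE)
meanSdPlot(assay(rld), ranks = FALSE)
```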
```{r, collapse = TRUE, eval = FALSE}
# compare transformation graphically
library("dplyr")
library("ggplot2")
dds <- estimateSizeFactors(dds)
df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized = TRUE)[, 1:2] + 1)) %>%
    mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid(. ~ transformation)
```
```{r, collapse = TRUE, eval = TRUE}
sampleDists <- dist(t(assay(vsd)))
sampleDists
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)
```
```{r, collapse=TRUE, eval = TRUE}
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)
```
```{r, collapse = TRUE, eval = TRUE}
plotPCA(vsd, intgroup = c("dex", "cell"))
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed()
```
```{r, collapse=TRUE, eval = TRUE}
mds <- as.data.frame(colData(vsd)) %>%
  cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed()
```
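The same kind of MDS plot can be drawn from the Poisson distances computed on the raw counts; a short sketch reusing the objects from above.
```{r, eval = FALSE}
# sketch: MDS plot based on the Poisson distance matrix of the raw counts
mdsPois <- as.data.frame(colData(dds)) %>%
  cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) + coord_fixed()
```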
## Differential expression analysis
```{r, collapse = TRUE, eval = TRUE}
colData(dds)
design(dds)
dds <- DESeq(dds)
res <- results(dds)
res
# or, if the comparison of interest is not the last level of the factor:
res <- results(dds, contrast=c("dex","trt","untrt"))
summary(res)
# significant results
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
# significant results with at least a 2-fold increase (lfcThreshold = 1)
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
```
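To keep the results for later use, the table can be ordered by adjusted p-value and written to disk; a minimal sketch, where the file name is a placeholder.
```{r, eval = FALSE}
# sketch: order the results by adjusted p-value and export them;
# the file name is a placeholder
resOrdered <- res[order(res$padj), ]
write.csv(as.data.frame(resOrdered), file = "dex_trt_vs_untrt_results.csv")
```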
## Different comparisons
```{r, collapse = TRUE, eval = TRUE}
results(dds, contrast = c("cell", "N061011", "N61311"))
```
```{r, collapse = TRUE, eval = TRUE}
# select genes significant at an FDR of 10%
sum(res$padj < 0.1, na.rm=TRUE)
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
```
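The table above is sorted so that the strongest down-regulated genes come first; sorting in decreasing order shows the strongest up-regulated genes.
```{r, eval = FALSE}
# sketch: significant genes with the strongest up-regulation
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
```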
## Plotting results
```{r, collapse=TRUE, eval=TRUE}
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))
```
```{r, collapse = TRUE, eval = TRUE}
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex", "cell"),
                         returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
  scale_y_log10() + geom_beeswarm(cex = 3)
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line()
```
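A heatmap of the most variable genes is another common way to visualize the results; a short sketch using genefilter and pheatmap, which are already loaded above.
```{r, eval = FALSE}
# sketch: heatmap of the 20 genes with the highest variance across samples,
# centered on each gene's mean
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
mat <- assay(vsd)[topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell", "dex")])
pheatmap(mat, annotation_col = anno)
```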
```{r, collapse = TRUE, eval = TRUE}
library("apeglm")
resultsNames(dds)
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
plotMA(res, ylim = c(-5, 5), xlim = c(0, 10000))
```
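Since AnnotationDbi and org.Hs.eg.db are already loaded, gene symbols can be attached to the results table (the row names are Ensembl gene IDs); a minimal sketch.
```{r, eval = FALSE}
# sketch: map Ensembl gene IDs to gene symbols and add them to the results
res$symbol <- mapIds(org.Hs.eg.db,
                     keys = row.names(res),
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
head(res)
```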
<!-- find an additional dataset and let them work through it by themselves, if they don't have their own data to play around with -->