Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
bit_R_workshop/deseq2_tutorial.Rmd
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
289 lines (242 sloc)
8.04 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "DESeq2 Tutorial" | |
author: "Franziska Metge" | |
date: "June 13, 2018" | |
output: html_document | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
``` | |
This tutorial follows <http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html> | |
## Contents | |
* Preparing count matrices | |
* The DESeqDataSet object | |
* Exploratory analysis and visualization | |
* Differential expression analysis | |
* Plotting results | |
## Preparing count matrices | |
We will use an example dataset from the *airway* package. It contains four human [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) smooth muscle cell lines with two conditions each (untreated control and dexamethasone treated). | |
<https://www.ncbi.nlm.nih.gov/pubmed/24926665> | |
```{bash, eval = FALSE} | |
# to align your raw reads stored in fastq files, you can use for example STAR to align your reads to the reference genome | |
STAR --genomeDir <STAR/index/directory> | |
--runThreadN 10 | |
--readFilesIn <reads_R1.fastq.gz> [<reads_R2.fastq.gz>] | |
--outFileNamePrefix <sample_name.> | |
--outSAMtype BAM SortedByCoordinate | |
--readFilesCommand zcat | |
# this call produces a binary alignment file in BAM format with reads sorted by coordinates | |
``` | |
```{r, eval = TRUE, collapse = T} | |
# loading all libraries | |
library("airway") | |
library("Rsamtools") | |
library("GenomicFeatures") | |
library("GenomicAlignments") | |
library("BiocParallel") | |
library("magrittr") | |
library("DESeq2") | |
library("vsn") | |
library("dplyr") | |
library("ggplot2") | |
library("pheatmap") | |
library("RColorBrewer") | |
library("PoiClaClu") | |
library("ggbeeswarm") | |
library("apeglm") | |
library("genefilter") | |
library("AnnotationDbi") | |
library("org.Hs.eg.db") | |
library("ReportingTools") | |
library("Gviz") | |
library("sva") | |
library("RUVSeq") | |
library("fission") | |
``` | |
We will use they data from the [airway](http://bioconductor.org/packages/release/data/experiment/html/airway.html) package, which contains 8 samples with a reduced number of reads and already aligned. | |
```{r, collapse = TRUE, eval = TRUE} | |
# source("https://bioconductor.org/biocLite.R") | |
# biocLite("airway") | |
library(airway) | |
# get the package's directory | |
indir <- system.file("extdata", package="airway", mustWork=TRUE) | |
# look at the files in the directory | |
list.files(indir) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
csvfile <- file.path(indir, "sample_table.csv") | |
sampleTable <- read.csv(csvfile, row.names = 1) | |
sampleTable | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam")) | |
file.exists(filenames) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
library('Rsamtools') | |
bamfiles <- BamFileList(filenames, yieldSize = 2000000) | |
seqinfo(bamfiles[1]) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
library("GenomicFeatures") | |
gtffile <- file.path(indir,"Homo_sapiens.GRCh37.75_subset.gtf") | |
txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character()) | |
txdb | |
ebg <- exonsBy(txdb, by = 'gene') | |
ebg | |
``` | |
```{r, collapse=TRUE, eval = TRUE} | |
library("GenomicAlignments") | |
library("BiocParallel") | |
register(SerialParam()) | |
``` | |
```{r, collapse=TRUE, eval=TRUE} | |
se <- summarizeOverlaps(features=ebg, reads=bamfiles, | |
mode="Union", | |
singleEnd=FALSE, | |
ignore.strand=TRUE, | |
fragments=TRUE ) | |
se | |
dim(se) | |
head(assay(se), 3) | |
colSums(assay(se)) | |
``` | |
```{r, collapse=TRUE, eval=TRUE} | |
colData(se) <- DataFrame(sampleTable) | |
colData(se) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
# now with the full data | |
data("airway") | |
se <- airway | |
se$dex <- relevel(se$dex, "untrt") | |
library(DESeq2) | |
dds <- DESeqDataSet(se, design = ~ cell + dex) | |
nrow(dds) | |
# filter for expressed genes | |
dds <- dds[ rowSums(counts(dds)) > 1, ] | |
nrow(dds) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
lambda <- 10^seq(from = -1, to = 2, length = 1000) | |
cts <- matrix(rpois(1000*100, lambda), ncol = 100) | |
library("vsn") | |
#install.packages('hexbin') | |
library('hexbin') | |
meanSdPlot(cts, ranks = FALSE) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
log.cts.one <- log2(cts + 1) | |
meanSdPlot(log.cts.one, ranks = FALSE) | |
# two possible transformation | |
# variance stabilizing transformation (vst) recommended for large datasets (n > 30) | |
vsd <- vst(dds, blind = FALSE) | |
head(assay(vsd), 3) | |
# regularized-logarithm transformation (rlog) reccommended for smaller datasets (n < 30) | |
rld <- rlog(dds, blind = FALSE) | |
head(assay(rld), 3) | |
# blind, treatment group is not considered | |
``` | |
```{r, collapse = TRUE, eval = FALSE} | |
# compare transformation graphically | |
library("dplyr") | |
library("ggplot2") | |
dds <- estimateSizeFactors(dds) | |
df <- bind_rows( | |
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% | |
mutate(transformation = "log2(x + 1)"), | |
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"), | |
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog")) | |
colnames(df)[1:2] <- c("x", "y") | |
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + | |
coord_fixed() + facet_grid( . ~ transformation) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
sampleDists <- dist(t(assay(vsd))) | |
sampleDists | |
library("pheatmap") | |
library("RColorBrewer") | |
sampleDistMatrix <- as.matrix( sampleDists ) | |
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " ) | |
colnames(sampleDistMatrix) <- NULL | |
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) | |
pheatmap(sampleDistMatrix, | |
clustering_distance_rows = sampleDists, | |
clustering_distance_cols = sampleDists, | |
col = colors) | |
``` | |
```{r, collapse=TRUE, eval = TRUE} | |
library("PoiClaClu") | |
poisd <- PoissonDistance(t(counts(dds))) | |
samplePoisDistMatrix <- as.matrix( poisd$dd ) | |
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " ) | |
colnames(samplePoisDistMatrix) <- NULL | |
pheatmap(samplePoisDistMatrix, | |
clustering_distance_rows = poisd$dd, | |
clustering_distance_cols = poisd$dd, | |
col = colors) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
plotPCA(vsd, intgroup = c("dex", "cell")) | |
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE) | |
pcaData | |
percentVar <- round(100 * attr(pcaData, "percentVar")) | |
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) + | |
geom_point(size =3) + | |
xlab(paste0("PC1: ", percentVar[1], "% variance")) + | |
ylab(paste0("PC2: ", percentVar[2], "% variance")) + | |
coord_fixed() | |
``` | |
```{r, collapse=TRUE, eval = TRUE} | |
mds <- as.data.frame(colData(vsd)) %>% | |
cbind(cmdscale(sampleDistMatrix)) | |
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) + | |
geom_point(size = 3) + coord_fixed() | |
``` | |
# Actual Differential Gene Expression Analysis | |
```{r, collapse = TRUE, eval = TRUE} | |
colData(dds) | |
design(dds) | |
dds <- DESeq(dds) | |
res <- results(dds) | |
res | |
# or if comparison of interest is not last level | |
res <- results(dds, contrast=c("dex","trt","untrt")) | |
summary(res) | |
# significant results | |
res.05 <- results(dds, alpha = 0.05) | |
table(res.05$padj < 0.05) | |
# significant restults with a 2fold increase | |
resLFC1 <- results(dds, lfcThreshold=1) | |
table(resLFC1$padj < 0.1) | |
``` | |
# Different Comparisons | |
```{r, collapse = TRUE, eval = TRUE} | |
results(dds, contrast = c("cell", "N061011", "N61311")) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
# filter genes to an FDR of 10% | |
sum(res$padj < 0.1, na.rm=TRUE) | |
resSig <- subset(res, padj < 0.1) | |
head(resSig[ order(resSig$log2FoldChange), ]) | |
``` | |
# Plotting results | |
```{r, collapse=TRUE, eval=TRUE} | |
topGene <- rownames(res)[which.min(res$padj)] | |
plotCounts(dds, gene = topGene, intgroup=c("dex")) | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"), | |
returnData = TRUE) | |
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) + | |
scale_y_log10() + geom_beeswarm(cex = 3) | |
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) + | |
scale_y_log10() + geom_point(size = 3) + geom_line() | |
``` | |
```{r, collapse = TRUE, eval = TRUE} | |
library("apeglm") | |
resultsNames(dds) | |
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm") | |
plotMA(res, ylim = c(-5, 5), xlim = c(0, 10000)) | |
``` | |
<!-- find additional dataset and let them go by theirselfs..if they dont have their own data to play around --> | |