Updated README and uploaded all analysis scripts and figures.

MPIBR-Bioinformatics · Jan 31, 2017 · 69d9bb9 · 69d9bb9
1 parent 5ae4cd8
commit 69d9bb9
Show file tree

Hide file tree

Showing 23 changed files with 524 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 .DS_Store
 *~
+bin
+data
diff --git a/README.md b/README.md
@@ -34,8 +34,9 @@ Directory structure:
   - bams: contains the aligned reads as BAM Files
   - logs: contains the log files of each alignment run
 - gene_counts: contains the read counts for each gene in the reference.
+- analysis: contains this repository (all the scripts used for analysis) as well as generated data, external programs, etc.
 
-## Steps
+## Preprocessing
 
 ### 1. MD5 checksum
 
@@ -152,3 +153,53 @@ Then we add a header to that file with labels and sample names:
 ```bash
 paste -d '\t' <(echo "GeneId") <(echo "Length") <(head -1 gene_counts_180117.txt.summary | cut -f2- | tr -s '\t' '\n' | awk 'OFS="\t"{split($1, a, "_"); print a[1]}' | paste -s -d '\t' -) | cat - gene_counts_180117_only_mapped.txt > gene_counts_180117_only_mapped_with_header.txt
 ```
+
+## Analysis
+
+### 1. Normalization
+
+Removed entries with RPKM values lower than 2 and applied deSeq normalisation:
+```
+analysis_rpkm_deSeq_normalization.m
+```
+
+### 2. Fold change
+
+Visualize fold change for each experiment time-range pair.
+```
+analysis_foldchange.m
+analysis_clustergram_foldchange.m
+```
+
+### 3. Binomial fit test
+Visualize binomial fit test for each experiment time-range pair.
+```
+analysis_binomial_test.m
+```
+
+### 4. Development of gene expression over time
+Analyze the changes in gene expression over time (for each experiment) for each gene. This classifies each gene into one of 27 possible combinations/conditions: ddu, uud, ndd, ... etc, where:
+- "d" = downregulated
+- "n" = no regulation (normal)
+- "u" = upregulated
+```
+analysis_expression_development.m
+```
+
+### 5. Gene ontology
+
+Run gene ontology annotation for each condition/combination file generated in the previous step. We use the perl script [GOAnalysis](https://software.scic.corp.brain.mpg.de/projects/MPIBR-Bioinformatics/GOAnalysis) by Georgi Tushev.
+
+```bash
+for file in target_genes_lists/*.txt; do file_name=$(basename "$file"); file_name="${file_name%.*}"; perl ../../bin/GOAnalysis/GOAnalysis.pl -obo annotations/go-basic.obo -ann annotations/gene_association.rgd.gz -background background_list.txt -target $file > "go_"$file_name".txt"; done
+```
+
+Get all unique GO terms available in the annotated files:
+```bash
+cat go_* | cut -f1 | sort -u > all_go_terms.txt
+```
+
+Create table with the P-value of each condition for each GO Term available:
+```
+analysis_gene_ontology.m
+```
diff --git a/analysis_binomial_test.m b/analysis_binomial_test.m
@@ -0,0 +1,27 @@
+clc
+clear variables;
+close all;
+
+% get records
+fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r');
+% read first line (header)
+header = fgetl(fRead);
+header = strsplit(header);
+header(2) = []; % remove Length from header, not necessary for output file
+
+% read counts
+txt = textscan(fRead,'%s %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n','delimiter','\t');
+fclose(fRead);
+symbols = txt{1}; % gene names
+length = txt{2}; % gene lengths
+counts = [txt{3:end}] + 1; % counts for each sample
+
+% remove entries with rpkm values lower than 2
+idx_threshold = calculate_rpkm(counts, length);
+counts(idx_threshold,:) = [];
+symbols(idx_threshold) = [];
+
+% create figures for each experiment time range (5min, 30min, 60min)
+stats05 = NBinFitTest(counts(:, [2,5]), counts(:, [1,4]), 'Constant', true);
+stats30 = NBinFitTest(counts(:, [7,10]), counts(:, [6,9]), 'Constant', true);
+stats60 = NBinFitTest(counts(:, [12,15]), counts(:, [11,14,16]), 'Constant', true);
diff --git a/analysis_clustergram_foldchange.m b/analysis_clustergram_foldchange.m
@@ -0,0 +1,49 @@
+clc
+clear variables;
+close all;
+
+% get records
+fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r');
+% read first line (header)
+header = fgetl(fRead);
+header = strsplit(header);
+header(1:2) = []; % remove Length from header, not necessary for output file
+
+% read counts
+txt = textscan(fRead,'%s %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n','delimiter','\t');
+fclose(fRead);
+symbols = txt{1}; % gene names
+length = txt{2}; % gene lengths
+counts = [txt{3:end}] + 1; % counts for each sample
+
+% remove entries with rpkm values lower than 2
+idx_threshold = calculate_rpkm(counts, length);
+counts(idx_threshold,:) = [];
+symbols(idx_threshold) = [];
+
+% create figures for each experiment time range (5min, 30min, 60min)
+stats05 = BinFoldChangeTest(counts(:, [2,5]), counts(:, [1,4]), 100, false);
+stats30 = BinFoldChangeTest(counts(:, [7,10]), counts(:, [6,9]), 100, false);
+stats60 = BinFoldChangeTest(counts(:, [12,15]), counts(:, [11,14,16]), 100, false);
+
+% idx_x contains genes meaninfully underexpressed
+% idx_y contains genes meaninfully overexpressed
+idx_common = stats05.idx_x | stats05.idx_y | stats30.idx_x | stats30.idx_y | stats60.idx_x | stats60.idx_y;
+M = [stats05.M, stats30.M, stats60.M];
+M = M(idx_common,:);
+
+R = tiedrank(M)./size(M,1);
+
+clustergram(R,'Standardize', 2,'cluster',1, 'ColumnLabels', {'05 min', '30 min', '60 min'});
+
+%{
+mtx = calculate_deSeq(counts);
+mtx = mtx(idx_common,:);
+idx_remove = [3,8,13];
+mtx(:,idx_remove) = [];
+header(idx_remove) = [];
+
+% disp(symbols(stats05.idx_y));
+R = tiedrank(mtx)./size(mtx,1);
+clustergram(R,'Standardize',2,'cluster',1)
+%}
diff --git a/analysis_expression_development.m b/analysis_expression_development.m
@@ -0,0 +1,78 @@
+clc
+clear variables;
+close all;
+
+% get records
+fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r');
+% read first line (header)
+header = fgetl(fRead);
+header = strsplit(header);
+header(1:2) = []; % remove Length from header, not necessary for output file
+
+% read counts
+txt = textscan(fRead,'%s %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n','delimiter','\t');
+fclose(fRead);
+symbols = txt{1}; % gene names
+length = txt{2}; % gene lengths
+counts = [txt{3:end}] + 1; % counts for each sample
+
+% remove entries with rpkm values lower than 2
+idx_threshold = calculate_rpkm(counts, length);
+counts(idx_threshold,:) = [];
+symbols(idx_threshold) = [];
+
+% create genes background list
+fileID = fopen('data/gene_ontology/background_list.txt','w');
+formatSpec = '%s\n';
+[nrows,ncols] = size(symbols);
+for row = 1:nrows
+    fprintf(fileID,formatSpec,symbols{row,:});
+end
+fclose(fileID);
+
+% create figures for each experiment time range (5min, 30min, 60min)
+stats05 = BinFoldChangeTest(counts(:, [2,5]), counts(:, [1,4]), 100, false);
+stats30 = BinFoldChangeTest(counts(:, [7,10]), counts(:, [6,9]), 100, false);
+stats60 = BinFoldChangeTest(counts(:, [12,15]), counts(:, [11,14,16]), 100, false);
+
+M = [stats05.M, stats30.M, stats60.M];
+
+% get all gene expression combinations over time
+% 'd' means significantly downregulated
+% 'n' means no significant change in expression (normal)
+% 'u' means significantly upregulated
+% i.e.: u,n,n means gene upregulated in sample01, normal in sample02, normal in sample03
+
+% http://stackoverflow.com/questions/4165859/generate-all-possible-combinations-of-the-elements-of-some-vectors-cartesian-pr#4169488
+x = ['d', 'n', 'u']; % Set of possible gene expression states
+K = 3; % Length of each permutation
+s = struct('d', 'idx_x', 'n', 'idx_none', 'u', 'idx_y'); % reference of each state to corresponding index from fold change stats
+
+% Create all possible permutations (with repetition) of states stored in x
+C = cell(K, 1);  % Preallocate a cell array
+[C{:}] = ndgrid(x); % Create K grids of values
+y = cellfun(@(x){x(:)}, C); % Convert grids to column vectors
+y = [y{:}]; % Obtain all permutations
+
+[nrows,ncols] = size(y);
+exp = [];
+exp_header = {};
+for row = 1:nrows
+    exp_idx = (stats05.(s.(y(row,1))) & stats30.(s.(y(row,2))) & stats60.(s.(y(row,3))));
+    exp_geneIds = symbols(exp_idx, :);
+    combination_label = y(row,:);
+    % write gene IDs to file
+    fileName = sprintf('data/gene_ontology/%s.txt', combination_label);
+    fileHandler = fopen(fileName,'w');
+    formatSpec = '%s\n';
+    fprintf(fileHandler,formatSpec,exp_geneIds{:});
+    fclose(fileHandler);
+    % append expression combination label to header
+    exp_header{end+1} = combination_label;
+    % write index for all genes
+    exp = [exp exp_idx];
+end
+% create table with row and column labels
+exp_table = array2table(exp, 'RowNames', symbols, 'VariableNames', exp_header);
+
+
diff --git a/analysis_foldchange.m b/analysis_foldchange.m
@@ -0,0 +1,27 @@
+clc
+clear variables;
+close all;
+
+% get records
+fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r');
+% read first line (header)
+header = fgetl(fRead);
+header = strsplit(header);
+header(1:2) = []; % remove Length from header, not necessary for output file
+
+% read counts
+txt = textscan(fRead,'%s %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n','delimiter','\t');
+fclose(fRead);
+symbols = txt{1}; % gene names
+length = txt{2}; % gene lengths
+counts = [txt{3:end}] + 1; % counts for each sample
+
+% remove entries with rpkm values lower than 2
+idx_threshold = calculate_rpkm(counts, length);
+counts(idx_threshold,:) = [];
+symbols(idx_threshold) = [];
+
+% create figures for each experiment time range (5min, 30min, 60min)
+stats05 = BinFoldChangeTest(counts(:, [2,5]), counts(:, [1,4]), 100, true);
+stats30 = BinFoldChangeTest(counts(:, [7,10]), counts(:, [6,9]), 100, true);
+stats60 = BinFoldChangeTest(counts(:, [12,15]), counts(:, [11,14,16]), 100, true);
diff --git a/analysis_gene_ontology.m b/analysis_gene_ontology.m
@@ -0,0 +1,31 @@
+clc
+clear variables;
+close all;
+
+% get all unique GO terms
+fRead = fopen('data/gene_ontology/all_go_terms.txt','r');
+ref_go_terms = textscan(fRead,'%s','delimiter','\t');
+ref_go_terms = ref_go_terms{1};
+fclose(fRead);
+
+% get go annotation files
+annot_files = dir('data/gene_ontology/go_*.txt');
+
+go_matrix = zeros(size(ref_go_terms, 1), size(annot_files, 1));
+for i = 1:size(annot_files, 1)
+    file = annot_files(i);
+    fRead = fopen(sprintf('data/gene_ontology/%s',file.name),'r');
+    content = textscan(fRead,'%s %s %s %n %n %n %n %n','delimiter','\t');
+    qry_go_terms = content{1};
+    p_values = content{end};
+    fclose(fRead);
+
+    [~, idx_ref, idx_qry] = intersect(ref_go_terms, qry_go_terms);
+
+    go_matrix(idx_ref, i) = p_values(idx_qry);
+
+end
+
+go_table = array2table(go_matrix, 'RowNames', ref_go_terms, 'VariableNames', cellfun(@(s) strtok(s, '.') ,{annot_files.name}, 'UniformOutput', false));
+
+clustergram(go_matrix,'Standardize', 2,'cluster',1, 'ColumnLabels', cellfun(@(s) strtok(s, '.') ,{annot_files.name}, 'UniformOutput', false));
diff --git a/analysis_rpkm_deSeq_normalization.m b/analysis_rpkm_deSeq_normalization.m
@@ -0,0 +1,37 @@
+clc
+clear variables;
+close all;
+
+% get records
+fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r');
+% read first line (header)
+header = fgetl(fRead);
+header = strsplit(header);
+header(2) = []; % remove Length from header, not necessary for output file
+
+% read counts
+txt = textscan(fRead,'%s %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n %n','delimiter','\t');
+fclose(fRead);
+symbols = txt{1}; % gene names
+length = txt{2}; % gene lengths
+counts = [txt{3:end}]; % counts for each sample
+
+% remove entries with rpkm values lower than 2
+idx_threshold = calculate_rpkm(counts, length);
+counts(idx_threshold,:) = [];
+symbols(idx_threshold) = [];
+
+% apply deSeq normalization to counts
+normalized_counts = calculate_deSeq(counts);
+
+% get normalized list of gene counts
+normalized_cell = [symbols num2cell(normalized_counts)];
+% print list to text file
+normalized_table = cell2table(normalized_cell, 'VariableNames', header);
+% create file
+writetable(normalized_table, 'rpkm_and_deSeq_normalized_data.txt', 'Delimiter','\t');
+
+% plot correlation matrix
+header(1) = []; % remove gene ids column from header
+plotCorrelationMatrix(normalized_counts, header, [16,16], 'correlation_matrix_rpkmBigger2_and_deSeq');
+
diff --git a/figures/binomial_fit_test_s12-s15_s11-s14-s16.fig b/figures/binomial_fit_test_s12-s15_s11-s14-s16.fig
diff --git a/figures/binomial_fit_test_s2-s5_s1-s4.fig b/figures/binomial_fit_test_s2-s5_s1-s4.fig
diff --git a/figures/binomial_fit_test_s7-s10_s6-s9.fig b/figures/binomial_fit_test_s7-s10_s6-s9.fig
diff --git a/figures/clustergram_fold_change.fig b/figures/clustergram_fold_change.fig
diff --git a/figures/correlation_matrix_rpkmBigger2.png b/figures/correlation_matrix_rpkmBigger2.png
diff --git a/figures/correlation_matrix_rpkmBigger2_and_deSeq.png b/figures/correlation_matrix_rpkmBigger2_and_deSeq.png
diff --git a/figures/fold_change_s12-s15_s11-s14-s16.fig b/figures/fold_change_s12-s15_s11-s14-s16.fig
diff --git a/figures/fold_change_s2-s5_s1-s4.fig b/figures/fold_change_s2-s5_s1-s4.fig
diff --git a/figures/fold_change_s7-s10_s6-s9.fig b/figures/fold_change_s7-s10_s6-s9.fig
diff --git a/figures/gene_ontology_clustergram.fig b/figures/gene_ontology_clustergram.fig
diff --git a/helpers/BinFoldChangeTest.m b/helpers/BinFoldChangeTest.m
@@ -0,0 +1,57 @@
+function [stats] = BinFoldChangeTest(X,Y,nbins,draw)
+
+    nrm = calculate_deSeq([X,Y]);
+    Xnrm = nrm(:,1:size(X,2));
+    Ynrm = nrm(:,(size(X,2)+1):end);
+    A = log10(mean(nrm, 2));
+    M = log2(mean(Ynrm,2)) - log2(mean(Xnrm,2));
+
+    % create bins
+    idx_bin = zeros(size(A,1),1);
+    idx_bin(1:nbins:end) = 1;
+    idx_bin = cumsum(idx_bin);
+
+    % resort
+    [~, idx_sort] = sort(A,'descend');
+    idx_rev(idx_sort) = (1:size(idx_sort,1))';
+    idx_res = false(size(A,1),1);
+    Z = zeros(size(A,1),1);
+    for k = 1 : max(idx_bin)
+        idx_now = idx_bin == k;
+        M_now =  M(idx_now(idx_rev));
+        Z_now = zscore(M_now);
+        idx_good = abs(Z_now) >= 1.65;
+        idx_res(idx_now(idx_rev)) = idx_good;
+        Z(idx_now(idx_rev)) = Z_now;
+    end
+
+    idx_x = idx_res & (M < median(M)) & (A >= 1);
+    idx_y = idx_res & (M >= median(M)) & (A >= 1);
+    idx_none = ~(idx_x | idx_y);
+
+    if draw
+
+        figure('Color','w');
+        hold(gca,'on');
+        plot(A(idx_none),M(idx_none), '.', 'color', [.65,.65,.65]);
+        plot(A(idx_x), M(idx_x), '.', 'color', [30,144,255]./255);
+        plot(A(idx_y), M(idx_y), '.', 'color', [148,0,211]./255);
+        plot([min(A),max(A)],[median(M),median(M)],'k');
+        hold(gca,'off');
+
+        set(gca, 'box','off',...
+                 'xlim',[min(A), max(A)],...
+                 'ylim',[-max(abs(M)),max(abs(M))]);
+
+    end
+
+    % pack in structure
+    stats.nrm = nrm;
+    stats.A = A;
+    stats.M = M;
+    stats.Z = Z;
+    stats.idx_x = idx_x;
+    stats.idx_y = idx_y;
+    stats.idx_none = idx_none;
+
+end