diff --git a/README.md b/README.md index 704f8c9..1d73bcb 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ ## Organisation -Files are located under: +**Raw and preprocessed files are located under:** `/storage/schu/ProjectGroups/RNA/Data/RNASequencing/Novogene/NHHW162910` @@ -34,10 +34,21 @@ Directory structure: - bams: contains the aligned reads as BAM Files - logs: contains the log files of each alignment run - gene_counts: contains the read counts for each gene in the reference. -- analysis: contains this repository (all the scripts used for analysis) as well as generated data, external programs, etc. + +**Analysis files (i.e., everything after gene counting) are located under:** + +`/storage/schu/ProjectGroups/RNA/Projects/LTP_transcriptomics/analysis` + +The folder `analysis` contains this repository (all the scripts used for analysis) as well as generated data, external programs, etc. + +The parent folder `LTP_transcriptomics` also contains a symlink `data` to the raw and preprocessed files mentioned above. + ## Preprocessing +Preprocessing steps are run under the folder: +`/storage/schu/ProjectGroups/RNA/Data/RNASequencing/Novogene/NHHW162910` + ### 1. MD5 checksum ```bash @@ -156,6 +167,9 @@ paste -d '\t' <(echo "GeneId") <(echo "Length") <(head -1 gene_counts_180117.txt ## Analysis +Analysis steps are run under the folder: +`/storage/schu/ProjectGroups/RNA/Projects/LTP_transcriptomics/analysis` + ### 1. Normalization Removed entries with RPKM values lower than 2 and applied deSeq normalisation: @@ -190,6 +204,12 @@ analysis_expression_development.m Run gene ontology annotation for each condition/combination file generated in the previous step. We use the perl script [GOAnalysis](https://software.scic.corp.brain.mpg.de/projects/MPIBR-Bioinformatics/GOAnalysis) by Georgi Tushev. 
+First change to the corresponding analysis data folder: +```bash +cd data/gene_ontology +``` + +Then run the annotation: ```bash for file in target_genes_lists/*.txt; do file_name=$(basename "$file"); file_name="${file_name%.*}"; perl ../../bin/GOAnalysis/GOAnalysis.pl -obo annotations/go-basic.obo -ann annotations/gene_association.rgd.gz -background background_list.txt -target $file > "go_"$file_name".txt"; done ``` diff --git a/analysis_binomial_test.m b/analysis_binomial_test.m index c66e548..1f8d520 100644 --- a/analysis_binomial_test.m +++ b/analysis_binomial_test.m @@ -2,8 +2,11 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get records -fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); +fRead = fopen('../data/gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); % read first line (header) header = fgetl(fRead); header = strsplit(header); @@ -25,3 +28,6 @@ stats05 = NBinFitTest(counts(:, [2,5]), counts(:, [1,4]), 'Constant', true); stats30 = NBinFitTest(counts(:, [7,10]), counts(:, [6,9]), 'Constant', true); stats60 = NBinFitTest(counts(:, [12,15]), counts(:, [11,14,16]), 'Constant', true); + +% remove helpers folder from path +rmpath('helpers'); \ No newline at end of file diff --git a/analysis_clustergram_foldchange.m b/analysis_clustergram_foldchange.m index f48523a..e8c2a12 100644 --- a/analysis_clustergram_foldchange.m +++ b/analysis_clustergram_foldchange.m @@ -2,8 +2,11 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get records -fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); +fRead = fopen('../data/gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); % read first line (header) header = fgetl(fRead); header = strsplit(header); @@ -36,14 +39,5 @@ clustergram(R,'Standardize', 2,'cluster',1, 'ColumnLabels', {'05 min', '30 min', '60 min'}); -%{ -mtx = 
calculate_deSeq(counts); -mtx = mtx(idx_common,:); -idx_remove = [3,8,13]; -mtx(:,idx_remove) = []; -header(idx_remove) = []; - -% disp(symbols(stats05.idx_y)); -R = tiedrank(mtx)./size(mtx,1); -clustergram(R,'Standardize',2,'cluster',1) -%} \ No newline at end of file +% remove helpers folder from path +rmpath('helpers'); \ No newline at end of file diff --git a/analysis_expression_development.m b/analysis_expression_development.m index b798b71..37ac09f 100644 --- a/analysis_expression_development.m +++ b/analysis_expression_development.m @@ -2,8 +2,11 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get records -fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); +fRead = fopen('../data/gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); % read first line (header) header = fgetl(fRead); header = strsplit(header); @@ -75,4 +78,5 @@ % create table with row and column labels exp_table = array2table(exp, 'RowNames', symbols, 'VariableNames', exp_header); - +% remove helpers folder from path +rmpath('helpers'); diff --git a/analysis_foldchange.m b/analysis_foldchange.m index b748054..4154086 100644 --- a/analysis_foldchange.m +++ b/analysis_foldchange.m @@ -2,8 +2,11 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get records -fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); +fRead = fopen('../data/gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); % read first line (header) header = fgetl(fRead); header = strsplit(header); @@ -21,7 +24,22 @@ counts(idx_threshold,:) = []; symbols(idx_threshold) = []; +counts = calculate_deSeq(counts); + % create figures for each experiment time range (5min, 30min, 60min) -stats05 = BinFoldChangeTest(counts(:, [2,5]), counts(:, [1,4]), 100, true); -stats30 = BinFoldChangeTest(counts(:, [7,10]), counts(:, [6,9]), 100, true); -stats60 = 
BinFoldChangeTest(counts(:, [12,15]), counts(:, [11,14,16]), 100, true); \ No newline at end of file +%stats05 = BinFoldChangeTest(counts(:, [2,5]), counts(:, [1,4]), 100, true); +%stats30 = BinFoldChangeTest(counts(:, [7,10]), counts(:, [6,9]), 100, true); +%stats60 = BinFoldChangeTest(counts(:, [12,15]), counts(:, [11,14,16]), 100, true); + +stats05C = BinFoldChangeTest(counts(:,2),counts(:,5),100,true); +stats05T = BinFoldChangeTest(counts(:,5),counts(:,4),100,true); +%stats60A = BinFoldChangeTest(counts(:,12),counts(:,11), 100, true); +%stats60B = BinFoldChangeTest(counts(:,15),counts(:,14),100,true); +idx = (stats05C.idx_x | stats05C.idx_y | stats05T.idx_x | stats05T.idx_y); + +stats05 = BinFoldChangeTest(counts(~idx, [2,5]), counts(~idx, [1,4]), 100, true); +stats05A = BinFoldChangeTest(counts(~idx,2),counts(~idx,1), 100, true); +stats05B = BinFoldChangeTest(counts(~idx,5),counts(~idx,4),100,true); + +% remove helpers folder from path +rmpath('helpers'); \ No newline at end of file diff --git a/analysis_gene_ontology.m b/analysis_gene_ontology.m index 2b15110..72262cf 100644 --- a/analysis_gene_ontology.m +++ b/analysis_gene_ontology.m @@ -2,6 +2,9 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get all unique GO terms fRead = fopen('data/gene_ontology/all_go_terms.txt','r'); ref_go_terms = textscan(fRead,'%s','delimiter','\t'); @@ -32,3 +35,6 @@ go_table = array2table(log_matrix, 'RowNames', ref_go_terms, 'VariableNames', cellfun(@(s) strtok(s, '.'), {annot_files.name}, 'UniformOutput', false)); clustergram(log_matrix,'Standardize', 2,'cluster',1, 'ColumnLabels', cellfun(@(s) strtok(s, '.'), {annot_files.name}, 'UniformOutput', false)); + +% remove helpers folder from path +rmpath('helpers'); \ No newline at end of file diff --git a/analysis_rpkm_deSeq_normalization.m b/analysis_rpkm_deSeq_normalization.m index 24c9b5a..d538bc6 100644 --- a/analysis_rpkm_deSeq_normalization.m +++ 
b/analysis_rpkm_deSeq_normalization.m @@ -2,8 +2,11 @@ clear variables; close all; +% add helpers folder to path +addpath(genpath('helpers')); + % get records -fRead = fopen('../gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); +fRead = fopen('../data/gene_counts/gene_counts_180117_only_mapped_with_header.txt','r'); % read first line (header) header = fgetl(fRead); header = strsplit(header); @@ -35,3 +38,5 @@ header(1) = []; % remove gene ids column from header plotCorrelationMatrix(normalized_counts, header, [16,16], 'correlation_matrix_rpkmBigger2_and_deSeq'); +% remove helpers folder from path +rmpath('helpers'); \ No newline at end of file