From 3228d4169c7769d5e19b1a33832c5e995a0d46c9 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 25 Apr 2016 15:27:36 +0200 Subject: [PATCH] further work on QC --- pipeline/check/quality.py | 39 +++++++++++++++++++++++++++++++++++++++ pipeline/transcriptome.py | 15 ++++++++------- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/pipeline/check/quality.py b/pipeline/check/quality.py index e69de29..9dfa1ff 100644 --- a/pipeline/check/quality.py +++ b/pipeline/check/quality.py @@ -0,0 +1,39 @@ +import sys + +__bad_fields = ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique'] + + +def htseq_count_quality(filename, cutoff): + values = [] + total_count = 0 + + with open(filename, "r") as fin: + for l in fin: + gene, count = l.strip().split('\t') + if all([gene not in bf for bf in __bad_fields]): + values.append(int(count)) + total_count += int(count) + + if total_count == 0: + print("N50 check for", filename, "failed. No reads found") + return False + + values.sort(reverse=True) + + current_count = 0 + for e, v in enumerate(values, start=1): + current_count += v + if current_count > total_count/2: + if e >= cutoff: + return True + + break + + print("N50 check for", filename, "failed.", e, cutoff) + return False + + + + + + diff --git a/pipeline/transcriptome.py b/pipeline/transcriptome.py index 89bf354..5147a93 100644 --- a/pipeline/transcriptome.py +++ b/pipeline/transcriptome.py @@ -4,6 +4,7 @@ from cluster import wait_for_job from utils.matrix import read_matrix, write_matrix, normalize_matrix_counts, normalize_matrix_length from pipeline.base import PipelineBase +from pipeline.check.quality import htseq_count_quality class TranscriptomePipeline(PipelineBase): @@ -267,15 +268,15 @@ def htseq_to_matrix(self): for file in htseq_files: full_path = os.path.join(htseq_output, file) + if htseq_count_quality(full_path, 1): + with open(full_path, "r") as f: + for row in f: + gene_id, count = row.strip().split('\t') - with open(full_path, "r") as f: - for row in f: - gene_id, count = row.strip().split('\t') + if gene_id not in counts.keys(): + counts[gene_id] = {} - if gene_id not in counts.keys(): - counts[gene_id] = {} - - counts[gene_id][file] = count + counts[gene_id][file] = count output_file = self.dp[g]['exp_matrix_output'] with open(output_file, "w") as f_out: