From 3228d4169c7769d5e19b1a33832c5e995a0d46c9 Mon Sep 17 00:00:00 2001
From: sepro <sebastian.proost@gmail.com>
Date: Mon, 25 Apr 2016 15:27:36 +0200
Subject: [PATCH] further work on QC

---
 pipeline/check/quality.py | 39 +++++++++++++++++++++++++++++++++++++++
 pipeline/transcriptome.py | 15 ++++++++-------
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/pipeline/check/quality.py b/pipeline/check/quality.py
index e69de29..9dfa1ff 100644
--- a/pipeline/check/quality.py
+++ b/pipeline/check/quality.py
@@ -0,0 +1,39 @@
+import sys
+
+__bad_fields = ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique']
+
+
+def htseq_count_quality(filename, cutoff):
+    values = []
+    total_count = 0
+
+    with open(filename, "r") as fin:
+        for l in fin:
+            gene, count = l.strip().split('\t')
+            if all([gene not in bf for bf in __bad_fields]):
+                values.append(int(count))
+                total_count += int(count)
+
+    if total_count == 0:
+        print("N50 check for", filename, "failed. No reads found")
+        return False
+
+    values.sort(reverse=True)
+
+    current_count = 0
+    for e, v in enumerate(values, start=1):
+        current_count += v
+        if current_count > total_count/2:
+            if e >= cutoff:
+                return True
+
+            break
+
+    print("N50 check for", filename, "failed.", e, cutoff)
+    return False
+
+
+
+
+
+
diff --git a/pipeline/transcriptome.py b/pipeline/transcriptome.py
index 89bf354..5147a93 100644
--- a/pipeline/transcriptome.py
+++ b/pipeline/transcriptome.py
@@ -4,6 +4,7 @@
 from cluster import wait_for_job
 from utils.matrix import read_matrix, write_matrix, normalize_matrix_counts, normalize_matrix_length
 from pipeline.base import PipelineBase
+from pipeline.check.quality import htseq_count_quality
 
 
 class TranscriptomePipeline(PipelineBase):
@@ -267,15 +268,15 @@ def htseq_to_matrix(self):
 
             for file in htseq_files:
                 full_path = os.path.join(htseq_output, file)
+                if htseq_count_quality(full_path, 1):
+                    with open(full_path, "r") as f:
+                        for row in f:
+                            gene_id, count = row.strip().split('\t')
 
-                with open(full_path, "r") as f:
-                    for row in f:
-                        gene_id, count = row.strip().split('\t')
+                            if gene_id not in counts.keys():
+                                counts[gene_id] = {}
 
-                        if gene_id not in counts.keys():
-                            counts[gene_id] = {}
-
-                        counts[gene_id][file] = count
+                            counts[gene_id][file] = count
 
             output_file = self.dp[g]['exp_matrix_output']
             with open(output_file, "w") as f_out: