From 3c9d1ec35e2347114629e00adfee1d184a669be6 Mon Sep 17 00:00:00 2001
From: Jens Preussner
Date: Fri, 6 Apr 2018 09:14:59 +0200
Subject: [PATCH] Implemented collection of statistics

---
 config/default.yml  | 22 +++++++++++++++++-----
 sc-preprocess.snake |  8 ++++++--
 src/clean.snake     | 40 +++++++++++++++++++++++++++++++++++++++-
 src/salmon.snake    | 32 ++++++++++++++++++++++++++++++--
 4 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/config/default.yml b/config/default.yml
index 1ebada4..2b86fdf 100644
--- a/config/default.yml
+++ b/config/default.yml
@@ -3,10 +3,12 @@
 #
 # demultiplex - whether or not to perform demultiplexing. If disabled, one fastq file per sample is expected (e.g. from Fluidigm C1).
 # clean - whether or not to perform the read cleaning step (see below).
+# stats - whether or not to collect statistics after data processing.
 #
 action:
   demultiplex: False
   clean: True
+  stats: True
 
 #
 # The samplesheet section
@@ -57,11 +59,17 @@ reference:
 salmon:
   index_flags: '--gencode'
   quant_flags: '-l ISR --gcBias'
+  collect:
+    - 'num_processed'
+    - 'num_mapped'
+    - 'percent_mapped'
 
 #
 # The HTStream section
 #
-# chain - actions of HTStream (will be prefixed with hts_) and their flags
+# actions - a list of HTStream actions to run (each will be prefixed with hts_)
+# flags - parameters passed to the individual actions. Use --notes to set that action's column prefix in the stats table
+# collect - a list of statistics that are collected from the HTStream logs
 #
 htstream:
   actions:
@@ -70,10 +78,14 @@ htstream:
     - SuperDeduper
     - Stats
   flags:
-    - '-StOA --min-trim 8 --max-mismatch 3'
-    - '-StOA --window-size 20 --avg-qual 20 --min-length 50'
-    - '-StOA --start 1 --length 50'
-    - '-SFfgA --notes stats_after_qc'
+    - '-StOA --min-trim 8 --max-mismatch 3 --notes polyATTrim'
+    - '-StOA --window-size 20 --avg-qual 20 --min-length 50 --notes qualityTrim'
+    - '-StOA --start 1 --length 50 --notes Dedup'
+    - '-SFfgA --notes AfterQC'
+  collect:
+    - 'totalFragmentsOutput'
+    - 'duplicate'
+    - 'ignored'
 
 #
 # The fastq-multx section
diff --git a/sc-preprocess.snake b/sc-preprocess.snake
index f0b59e9..1d13d2a 100644
--- a/sc-preprocess.snake
+++ b/sc-preprocess.snake
@@ -33,17 +33,21 @@ output_files = [
     join(config['dirs']['ref'], 'tx2gene', basename(REFERENCE_FASTA).rstrip(".fa"))
 ]
 
-if config["action"]["demultiplex"]:
+if config['action']['demultiplex']:
     demultiplexed_files = expand('{o}/{s}.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
     output_files.extend(demultiplexed_files)
 
-if config["action"]["clean"]:
+if config['action']['clean']:
     clean_files = expand('{o}/{s}.clean.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
     output_files.extend(clean_files)
 
 quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES)
 output_files.extend(quant_files)
 
+if config['action']['stats']:
+    stat_files = expand('{o}/{t}.txt', o = config['dirs']['tables'], t = ['htstream', 'salmon'])
+    output_files.extend(stat_files)
+
 #--------------------------------------------------------------------------------#
 #---------------------------------- RUN :-) ------------------------------------#
 #--------------------------------------------------------------------------------#
diff --git a/src/clean.snake b/src/clean.snake
index 0244f1c..bb56280 100644
--- a/src/clean.snake
+++ b/src/clean.snake
@@ -25,6 +25,44 @@ rule htstream_clean_se_gz:
     threads: 4
     shell:
         """
-        hts_Stats -U {input} -L {log} -tO | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
+        hts_Stats -U {input} -L {log} -tO --notes BeforeQC | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
         mv {config[dirs][fastq]}/{wildcards.sample}_SE.fastq.gz {output}
         """
+
+rule htstream_stats:
+    version: "1.0"
+    input:
+        expand('{dir}/htstream/{sample}.txt', dir = config['dirs']['log'], sample = SAMPLE_NAMES)
+    output:
+        table = config['dirs']['tables'] + '/htstream.txt'
+    params:
+        columns = config['htstream']['collect'] if 'collect' in config.get('htstream', {}) else [],
+        samples = SAMPLE_NAMES,
+        label = config['samplesheet']['index']
+    message: 'Collecting statistics from HTStream runs.'
+    run:
+        import json
+        import pandas as pd
+
+        stats = list()
+
+        for file in input:
+            with open(file, "r") as f:
+                data = json.load(f)
+
+            task_row = dict()
+
+            for task in data.keys():
+                try:
+                    task_data = data[task]
+                    column_base = task_data['Notes'] if task_data['Notes'] != "" else task
+                    column_keys = list(set(params.columns).intersection(set(task_data.keys())))
+                    column_values = [task_data[x] for x in column_keys]
+                    task_row.update(dict(zip([column_base + '_' + s for s in column_keys], column_values)))
+
+                except KeyError:
+                    continue
+            stats.append(task_row)
+
+        df = pd.DataFrame(stats, index = params.samples)
+        df.to_csv(output.table, sep = "\t", index_label = params.label)
diff --git a/src/salmon.snake b/src/salmon.snake
index 4b97be3..4645e8a 100644
--- a/src/salmon.snake
+++ b/src/salmon.snake
@@ -18,7 +18,7 @@ rule salmon_index:
     params:
         flags = config['salmon']['index_flags'] if 'index_flags' in config.get('salmon', {}) else ''
     log:
-        'logs/salmon_index.log'
+        config['dirs']['log'] + '/salmon/index.log'
     message: 'Creating Salmon index for {input}.'
     version: SALMON_VERSION
     shell:
@@ -41,7 +41,7 @@ rule salmon_quant_se:
     params:
         flags = config['salmon']['quant_flags'] if 'quant_flags' in config.get('salmon', {}) else ''
     log:
-        'logs/salmon_quant_{sample}.log'
+        config['dirs']['log'] + '/salmon/{sample}.quant.log'
     message: 'Quantifying {wildcards.sample} with Salmon using {threads} threads.'
     threads: 30
     version: SALMON_VERSION
@@ -49,3 +49,31 @@ shell:
         """
         salmon quant -p {threads} -i {input.index} -r {input.r1} -o {config[dirs][quant]}/{wildcards.sample} {params.flags} &> {log}
         """
+
+rule salmon_stats:
+    input:
+        expand('{dir}/{sample}/aux_info/meta_info.json', dir = config['dirs']['quant'], sample = SAMPLE_NAMES)
+    output:
+        table = config['dirs']['tables'] + '/salmon.txt'
+    params:
+        columns = config['salmon']['collect'] if 'collect' in config.get('salmon', {}) else [],
+        samples = SAMPLE_NAMES,
+        label = config['samplesheet']['index']
+    message: 'Collecting statistics from Salmon runs.'
+    run:
+        import json
+        import pandas as pd
+
+        stats = list()
+
+        for file in input:
+            with open(file, "r") as f:
+                data = json.load(f)
+
+            column_keys = list(set(params.columns).intersection(set(data.keys())))
+            column_values = [data[x] for x in column_keys]
+            sample_row = dict(zip(column_keys, column_values))
+            stats.append(sample_row)
+
+        df = pd.DataFrame(stats, index = params.samples)
+        df.to_csv(output.table, sep = "\t", index_label = params.label)
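
The two tables written by htstream_stats and salmon_stats share the samplesheet's index column as their row label, so they can be joined downstream into a single per-sample QC overview. A minimal sketch of such a join follows (not part of this patch; the paths assume dirs.tables resolves to 'tables' and the index column is named 'sample' — both depend on the local config):

# Sketch only: combine the per-tool stats tables produced by the rules above.
# The table locations and the 'sample' index label are assumptions taken from
# a hypothetical default config, not fixed by the patch itself.
import pandas as pd

htstream = pd.read_csv('tables/htstream.txt', sep='\t', index_col='sample')
salmon = pd.read_csv('tables/salmon.txt', sep='\t', index_col='sample')

# Align both tables on the shared sample index; the suffixes keep any
# identically named columns from the two tools apart.
qc = htstream.join(salmon, how='inner', lsuffix='_htstream', rsuffix='_salmon')
qc.to_csv('tables/qc_overview.txt', sep='\t')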