From 3c9d1ec35e2347114629e00adfee1d184a669be6 Mon Sep 17 00:00:00 2001
From: Jens Preussner
Date: Fri, 6 Apr 2018 09:14:59 +0200
Subject: [PATCH] Implemented collection of statistics

---
 config/default.yml  | 22 +++++++++++++++++-----
 sc-preprocess.snake |  8 ++++++--
 src/clean.snake     | 40 +++++++++++++++++++++++++++++++++++++++-
 src/salmon.snake    | 32 ++++++++++++++++++++++++++++++--
 4 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/config/default.yml b/config/default.yml
index 1ebada4..2b86fdf 100644
--- a/config/default.yml
+++ b/config/default.yml
@@ -3,10 +3,12 @@
 #
 # demultiplex - whether or not to perform demultiplexing. If disabled, one fastq file per sample is expected (e.g. from Fluidigm C1).
 # clean - whether or not to perform the read cleaning step (see below).
+# stats - whether or not to collect statistics after data processing.
 #
 action:
   demultiplex: False
   clean: True
+  stats: True
 
 #
 # The samplesheet section
@@ -57,11 +59,17 @@ reference:
 salmon:
   index_flags: '--gencode'
   quant_flags: '-l ISR --gcBias'
+  collect:
+    - 'num_processed'
+    - 'num_mapped'
+    - 'percent_mapped'
 
 #
 # The HTStream section
 #
-# chain - actions of HTStream (will be prefixed with hts_) and their flags
+# actions - a list of HTStream actions to run (each will be prefixed with hts_)
+# flags - parameters passed to the individual actions. Use --notes to set that action's column prefix in the stats table
+# collect - a list of statistics that are collected from the HTStream logs
 #
 htstream:
   actions:
@@ -70,10 +78,14 @@ htstream:
     - SuperDeduper
     - Stats
   flags:
-    - '-StOA --min-trim 8 --max-mismatch 3'
-    - '-StOA --window-size 20 --avg-qual 20 --min-length 50'
-    - '-StOA --start 1 --length 50'
-    - '-SFfgA --notes stats_after_qc'
+    - '-StOA --min-trim 8 --max-mismatch 3 --notes polyATTrim'
+    - '-StOA --window-size 20 --avg-qual 20 --min-length 50 --notes qualityTrim'
+    - '-StOA --start 1 --length 50 --notes Dedup'
+    - '-SFfgA --notes AfterQC'
+  collect:
+    - 'totalFragmentsOutput'
+    - 'duplicate'
+    - 'ignored'
 
 #
 # The fastq-multx section
diff --git a/sc-preprocess.snake b/sc-preprocess.snake
index f0b59e9..1d13d2a 100644
--- a/sc-preprocess.snake
+++ b/sc-preprocess.snake
@@ -33,17 +33,21 @@ output_files = [
     join(config['dirs']['ref'], 'tx2gene', basename(REFERENCE_FASTA).rstrip(".fa"))
 ]
 
-if config["action"]["demultiplex"]:
+if config['action']['demultiplex']:
     demultiplexed_files = expand('{o}/{s}.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
     output_files.extend(demultiplexed_files)
 
-if config["action"]["clean"]:
+if config['action']['clean']:
     clean_files = expand('{o}/{s}.clean.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
     output_files.extend(clean_files)
 
 quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES)
 output_files.extend(quant_files)
 
+if config['action']['stats']:
+    stat_files = expand('{o}/{t}.txt', o = config['dirs']['tables'], t = ['htstream', 'salmon'])
+    output_files.extend(stat_files)
+
 #--------------------------------------------------------------------------------#
 #---------------------------------- RUN :-) ------------------------------------#
 #--------------------------------------------------------------------------------#
diff --git a/src/clean.snake b/src/clean.snake
index 0244f1c..bb56280 100644
--- a/src/clean.snake
+++ b/src/clean.snake
@@ -25,6 +25,44 @@ rule htstream_clean_se_gz:
     threads: 4
     shell:
         """
-        hts_Stats -U {input} -L {log} -tO | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
+        hts_Stats -U {input} -L {log} -tO --notes BeforeQC | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
         mv {config[dirs][fastq]}/{wildcards.sample}_SE.fastq.gz {output}
         """
+
+rule htstream_stats:
+    version: "1.0"
+    input:
+        expand('{dir}/htstream/{sample}.txt', dir = config['dirs']['log'], sample = SAMPLE_NAMES)
+    output:
+        table = config['dirs']['tables'] + '/htstream.txt'
+    params:
+        columns = config['htstream']['collect'] if 'collect' in config.get('htstream', {}) else [],
+        samples = SAMPLE_NAMES,
+        label = config['samplesheet']['index']
+    message: 'Collecting statistics from HTStream runs.'
+    run:
+        import json
+        import pandas as pd
+
+        stats = list()
+
+        for file in input:
+            with open(file, "r") as f:
+                data = json.load(f)
+
+            task_row = dict()
+
+            for task in data.keys():
+                try:
+                    task_data = data[task]
+                    column_base = task_data['Notes'] if task_data['Notes'] != "" else task
+                    column_keys = list(set(params.columns).intersection(set(task_data.keys())))
+                    column_values = [task_data[x] for x in column_keys]
+                    task_row.update(dict(zip([column_base + '_' + s for s in column_keys], column_values)))
+
+                except KeyError:
+                    continue
+            stats.append(task_row)
+
+        df = pd.DataFrame(stats, index = params.samples)
+        df.to_csv(output.table, sep = "\t", index_label = params.label)
diff --git a/src/salmon.snake b/src/salmon.snake
index 4b97be3..4645e8a 100644
--- a/src/salmon.snake
+++ b/src/salmon.snake
@@ -18,7 +18,7 @@ rule salmon_index:
     params:
         flags = config['salmon']['index_flags'] if 'index_flags' in config.get('salmon', {}) else ''
     log:
-        'logs/salmon_index.log'
+        config['dirs']['log'] + '/salmon/index.log'
     message: 'Creating Salmon index for {input}.'
     version: SALMON_VERSION
     shell:
@@ -41,7 +41,7 @@ rule salmon_quant_se:
     params:
         flags = config['salmon']['quant_flags'] if 'quant_flags' in config.get('salmon', {}) else ''
     log:
-        'logs/salmon_quant_{sample}.log'
+        config['dirs']['log'] + '/salmon/{sample}.quant.log'
     message: 'Quantifying {wildcards.sample} with Salmon using {threads} threads.'
     threads: 30
     version: SALMON_VERSION
@@ -49,3 +49,31 @@ shell:
         """
         salmon quant -p {threads} -i {input.index} -r {input.r1} -o {config[dirs][quant]}/{wildcards.sample} {params.flags} &> {log}
         """
+
+rule salmon_stats:
+    input:
+        expand('{dir}/{sample}/aux_info/meta_info.json', dir = config['dirs']['quant'], sample = SAMPLE_NAMES)
+    output:
+        table = config['dirs']['tables'] + '/salmon.txt'
+    params:
+        columns = config['salmon']['collect'] if 'collect' in config.get('salmon', {}) else [],
+        samples = SAMPLE_NAMES,
+        label = config['samplesheet']['index']
+    message: 'Collecting statistics from Salmon runs.'
+    run:
+        import json
+        import pandas as pd
+
+        stats = list()
+
+        for file in input:
+            with open(file, "r") as f:
+                data = json.load(f)
+
+            column_keys = list(set(params.columns).intersection(set(data.keys())))
+            column_values = [data[x] for x in column_keys]
+            sample_row = dict(zip(column_keys, column_values))
+            stats.append(sample_row)
+
+        df = pd.DataFrame(stats, index = params.samples)
+        df.to_csv(output.table, sep = "\t", index_label = params.label)
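
The two tables written by htstream_stats and salmon_stats share the samplesheet's index column as their row label, so they can be joined downstream into a single per-sample QC overview. A minimal sketch of such a join follows (not part of this patch; the paths assume dirs.tables resolves to 'tables' and the index column is named 'sample' — both depend on the local config):

# Sketch only: combine the per-tool stats tables produced by the rules above.
# The table locations and the 'sample' index label are assumptions taken from
# a hypothetical default config, not fixed by the patch itself.
import pandas as pd

htstream = pd.read_csv('tables/htstream.txt', sep='\t', index_col='sample')
salmon = pd.read_csv('tables/salmon.txt', sep='\t', index_col='sample')

# Align both tables on the shared sample index; the suffixes keep any
# identically named columns from the two tools apart.
qc = htstream.join(salmon, how='inner', lsuffix='_htstream', rsuffix='_salmon')
qc.to_csv('tables/qc_overview.txt', sep='\t')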