Implemented collection of statistics
jenzopr committed Apr 6, 2018
1 parent ef3f5bc commit 3c9d1ec
Showing 4 changed files with 92 additions and 10 deletions.
22 changes: 17 additions & 5 deletions config/default.yml
@@ -3,10 +3,12 @@
#
# demultiplex - whether or not to perform demultiplexing. If disabled, one fastq file per sample is expected (e.g. from Fluidigm C1).
# clean - whether or not to skip the prinseq cleaning step (see below).
# stats - whether or not to collect statistics after data processing.
#
action:
demultiplex: False
clean: True
stats: True

#
# The samplesheet section
@@ -57,11 +59,17 @@ reference:
salmon:
index_flags: '--gencode'
quant_flags: '-l ISR --gcBias'
collect:
- 'num_processed'
- 'num_mapped'
- 'percent_mapped'

#
# The HTStream section
#
# chain - actions of HTStream (will be prefixed with hts_) and their flags
# actions - actions of HTStream (will be prefixed with hts_) and their flags
# flags - parameters for the individual actions. Add a --notes value to use it as a column prefix in the stats table
# collect - a list of statistics to collect from the HTStream JSON logs
#
htstream:
actions:
@@ -70,10 +78,14 @@
- SuperDeduper
- Stats
flags:
- '-StOA --min-trim 8 --max-mismatch 3'
- '-StOA --window-size 20 --avg-qual 20 --min-length 50'
- '-StOA --start 1 --length 50'
- '-SFfgA --notes stats_after_qc'
- '-StOA --min-trim 8 --max-mismatch 3 --notes polyATTrim'
- '-StOA --window-size 20 --avg-qual 20 --min-length 50 --notes qualityTrim'
- '-StOA --start 1 --length 50 --notes Dedup'
- '-SFfgA --notes AfterQC'
collect:
- 'totalFragmentsOutput'
- 'duplicate'
- 'ignored'

#
# The fastq-multx section
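As an aside on how these collect lists are consumed: the sketch below filters a stats record down to the configured keys, mirroring the set-intersection logic in the rules further down. The config path and the example record are invented for illustration.

import yaml

# Load the pipeline configuration (path assumed for illustration).
with open("config/default.yml") as f:
    config = yaml.safe_load(f)

# A Salmon meta_info-style record with invented values.
meta_info = {"num_processed": 100000, "num_mapped": 87000,
             "percent_mapped": 87.0, "library_types": ["SR"]}

# Keep only the keys named under salmon.collect, as the new rules do.
wanted = set(config.get("salmon", {}).get("collect", []))
row = {key: meta_info[key] for key in meta_info.keys() & wanted}
print(row)  # num_processed, num_mapped and percent_mapped survive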
8 changes: 6 additions & 2 deletions sc-preprocess.snake
@@ -33,17 +33,21 @@ output_files = [
join(config['dirs']['ref'], 'tx2gene', basename(REFERENCE_FASTA).rstrip(".fa"))
]

if config["action"]["demultiplex"]:
if config['action']['demultiplex']:
demultiplexed_files = expand('{o}/{s}.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
output_files.extend(demultiplexed_files)

if config["action"]["clean"]:
if config['action']['clean']:
clean_files = expand('{o}/{s}.clean.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
output_files.extend(clean_files)

quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES)
output_files.extend(quant_files)

if config['action']['stats']:
stat_files = expand('{o}/{t}.txt', o = config['dirs']['tables'], t = ['htstream', 'salmon'])
output_files.extend(stat_files)
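
For reference, Snakemake's expand builds the two new stats targets like this (assuming config['dirs']['tables'] is 'tables'):

from snakemake.io import expand

# With dirs.tables set to 'tables' (an assumption for this sketch),
# the stats block requests one summary table per tool:
print(expand('{o}/{t}.txt', o='tables', t=['htstream', 'salmon']))
# ['tables/htstream.txt', 'tables/salmon.txt']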

#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#
40 changes: 39 additions & 1 deletion src/clean.snake
@@ -25,6 +25,44 @@ rule htstream_clean_se_gz:
threads: 4
shell:
"""
hts_Stats -U {input} -L {log} -tO | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
hts_Stats -U {input} -L {log} -tO --notes BeforeQC | {params.call} -p {config[dirs][fastq]}/{wildcards.sample}
mv {config[dirs][fastq]}/{wildcards.sample}_SE.fastq.gz {output}
"""

rule htstream_stats:
version: "1.0"
input:
expand('{dir}/htstream/{sample}.txt', dir = config['dirs']['log'], sample = SAMPLE_NAMES)
output:
table = config['dirs']['tables'] + '/htstream.txt'
params:
columns = config['htstream']['collect'] if 'collect' in config.get('htstream', {}) else [],
samples = SAMPLE_NAMES,
label = config['samplesheet']['index']
message: 'Collecting statistics from HTStream runs.'
run:
import json
import pandas as pd

stats = list()

for file in input:
with open(file, "r") as f:
data = json.load(f)

task_row = dict()

for task in data.keys():
try:
task_data = data[task]
column_base = task_data['Notes'] if task_data['Notes'] != "" else task
column_keys = list(set(params.columns).intersection(set(task_data.keys())))
column_values = [task_data[x] for x in column_keys]
task_row.update(dict(zip([column_base + '_' + s for s in column_keys], column_values)))

except KeyError:
continue
stats.append(task_row)

df = pd.DataFrame(stats, index = params.samples)
df.to_csv(output.table, sep = "\t", index_label = params.label)
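
To make the column-naming scheme concrete: each collected key gets prefixed with the task's Notes value, falling back to the task name when Notes is empty. A standalone sketch, with an invented HTStream-style log:

import json

# Invented log structure, modeled on what the rule above reads from
# logs/htstream/<sample>.txt.
log = json.loads("""{
  "hts_SuperDeduper": {"Notes": "Dedup", "duplicate": 1200, "ignored": 3},
  "hts_Stats": {"Notes": "AfterQC", "totalFragmentsOutput": 98000}
}""")

collect = ["totalFragmentsOutput", "duplicate", "ignored"]

row = {}
for task, task_data in log.items():
    base = task_data.get("Notes") or task  # Notes wins; fall back to the task name
    for key in collect:
        if key in task_data:
            row[base + "_" + key] = task_data[key]

print(row)
# {'Dedup_duplicate': 1200, 'Dedup_ignored': 3, 'AfterQC_totalFragmentsOutput': 98000}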
32 changes: 30 additions & 2 deletions src/salmon.snake
@@ -18,7 +18,7 @@ rule salmon_index:
params:
flags = config['salmon']['index_flags'] if 'index_flags' in config.get('salmon', {}) else ''
log:
'logs/salmon_index.log'
config['dirs']['log']+'/salmon/index.log'
message: 'Creating Salmon index for {input}.'
version: SALMON_VERSION
shell:
@@ -41,11 +41,39 @@ rule salmon_quant_se:
params:
flags = config['salmon']['quant_flags'] if 'quant_flags' in config.get('salmon', {}) else ''
log:
'logs/salmon_quant_{sample}.log'
config['dirs']['log']+'/salmon/{sample}.quant.log'
message: 'Quantifying {wildcards.sample} with Salmon using {threads} threads.'
threads: 30
version: SALMON_VERSION
shell:
"""
salmon quant -p {threads} -i {input.index} -r {input.r1} -o {config[dirs][quant]}/{wildcards.sample} {params.flags} &> {log}
"""

rule salmon_stats:
input:
expand('{dir}/{sample}/aux_info/meta_info.json', dir = config['dirs']['quant'], sample = SAMPLE_NAMES)
output:
table = config['dirs']['tables'] + '/salmon.txt'
params:
columns = config['salmon']['collect'] if 'collect' in config.get('salmon', {}) else [],
samples = SAMPLE_NAMES,
label = config['samplesheet']['index']
message: 'Collecting statistics from Salmon runs.'
run:
import json
import pandas as pd

stats = list()

for file in input:
with open(file, "r") as f:
data = json.load(f)

column_keys = list(set(params.columns).intersection(set(data.keys())))
column_values = [data[x] for x in column_keys]
sample_row = dict(zip(column_keys, column_values))
stats.append(sample_row)

df = pd.DataFrame(stats, index = params.samples)
df.to_csv(output.table, sep = "\t", index_label = params.label)
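
Finally, a sketch of the table assembly step, with invented rows and sample names standing in for the real SAMPLE_NAMES:

import pandas as pd

# Rows as salmon_stats would collect them (values invented).
stats = [
    {"num_processed": 100000, "num_mapped": 87000, "percent_mapped": 87.0},
    {"num_processed": 120000, "num_mapped": 99600, "percent_mapped": 83.0},
]
samples = ["cell_001", "cell_002"]  # hypothetical sample names

df = pd.DataFrame(stats, index=samples)
# index_label mirrors config['samplesheet']['index'] in the rule above.
df.to_csv("salmon.txt", sep="\t", index_label="sample")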
