From 82e0c73bd281981456d3da4fe165f3d84682cc7f Mon Sep 17 00:00:00 2001 From: Jens Preussner Date: Tue, 3 Apr 2018 16:24:28 +0200 Subject: [PATCH] Initial commit of prinseq-cleaning --- .gitignore | 5 +++ config/default.yml | 98 +++++++++++++++++++++++++++++++++++++++++++++ environment.yml | 11 +++++ sc-preprocess.snake | 52 ++++++++++++++++++++++++ src/auxiliary.snake | 17 ++++++++ src/clean.snake | 16 ++++++++ 6 files changed, 199 insertions(+) create mode 100644 .gitignore create mode 100644 config/default.yml create mode 100644 environment.yml create mode 100644 sc-preprocess.snake create mode 100644 src/auxiliary.snake create mode 100644 src/clean.snake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b9ba23 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.fastq +*.fastq.gz +.snakemake/* +barcodes.txt +SampleSheet.txt diff --git a/config/default.yml b/config/default.yml new file mode 100644 index 0000000..556c083 --- /dev/null +++ b/config/default.yml @@ -0,0 +1,98 @@ +# +# The action section +# +# demultiplex - whether or not to perform demultiplexing. If False, a fastq file per sample is expected (e.g. from Fluidigm C1). +# clean - whether or not to perform the prinseq cleaning step (see below). +# +action: + demultiplex: False + clean: True + +# +# The samplesheet section +# +# file - The path to the SampleSheet text file +# index - The name of the Column that should be used as index +# barcode - The name of the Column containing a cell barcode. Set to None if barcodes should be inferred from the data (e.g. in DropSeq) +# +samplesheet: + file: 'SampleSheet.txt' + index: 'Sample' + barcode: 'Barcode' + +# +# The data section +# +# files - The input sequencing files that will be used for demultiplexing. +# +data: + files: + r1: '' + r2: '' + +# +# The reference section +# +# URL - The URL from where the reference transcriptome can be downloaded. 
Wildcards are allowed in curly brackets +# organism - Wildcard for the reference organism +# release - Wildcard for the reference release +# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome +# spikeIns - The path to a fasta file with additional cDNA sequences. Names should follow the "symbol|ID" convention +# +reference: + URL: 'ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.fa.gz' + organism: 'mouse' + release: 'M16' + file: + - pc_transcripts + - lncRNA_transcripts + spikeIns: 'ref/humanDux.fasta' + +# +# The salmon section +# +# index_flags - flags to add to the salmon call while indexing +# quant_flags - flags to add to the salmon call during quantification +# +salmon: + index_flags: '--gencode' + quant_flags: '--gcBias' + +# +# The PRINSEQ section +# +# flags - flags to add to the PRINSEQ call while cleaning fastq files +# +prinseq: + flags: '-derep 1 -custom_params "A 8"' + +# +# The fastq-multx section +# +# flags - flags to add to the fastq-multx call while demultiplexing fastq files +# +fastq_multx: + flags: '-m 1' + +# +# The directory section +# +# ref - directory name for reference files +# fastq - directory name for fastq files +# bam - directory name for STAR alignments +# quant - directory name for Salmon/RSEM quant files +# tables - directory name for aggregated tables +# log - directory name for program logs +# +dirs: + ref: 'ref' + fastq: 'fastq' + bam: 'bam' + quant: 'quant' + tables: 'tables' + log: 'log' + +# +# The debug section +# +debug: False diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6b66da8 --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +name: sc-preprocess + +channels: + - bioconda + - conda-forge + +dependencies: + - snakemake + - salmon + - prinseq + - fastq-multx diff --git a/sc-preprocess.snake b/sc-preprocess.snake new file mode 100644 index 0000000..a44d728 --- /dev/null +++ 
b/sc-preprocess.snake
@@ -0,0 +1,57 @@
+import pandas as pd
+from os.path import join, basename, dirname
+
+# A config file given on the command line takes the place of the bundled default.
+# NOTE(review): relies on `workflow.overwrite_configfile` - confirm the attribute exists in the Snakemake version in use.
+if workflow.overwrite_configfile is not None:
+    configfile: str(workflow.overwrite_configfile)
+else:
+    configfile: 'config/default.yml'
+
+#-------------------------------------------------------------------------------#
+#-------------- Handle reference related paths and file names ------------------#
+#-------------------------------------------------------------------------------#
+
+# The spike-in augmented fasta is only the reference when a 'spikeIns' entry is configured.
+GENCODE_FASTA = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '.fa')
+REFERENCE_FASTA = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '-withSpikeIns.fa') if 'spikeIns' in config['reference'] else GENCODE_FASTA
+
+#-------------------------------------------------------------------------------#
+#------------- Read samplesheet and provide necessary variables ----------------#
+#-------------------------------------------------------------------------------#
+
+# The samplesheet is tab-separated; the configured index column supplies the sample names.
+samplesheet = pd.read_table(config['samplesheet']['file'], sep = '\t', index_col = config['samplesheet']['index'])
+SAMPLES = samplesheet.to_dict(orient = 'index')
+SAMPLE_NAMES = sorted(SAMPLES)
+
+#-------------------------------------------------------------------------------#
+#--------------------------- Generate output files -----------------------------#
+#-------------------------------------------------------------------------------#
+
+# Each enabled action contributes one target file per sample.
+output_files = []
+
+if config["action"]["demultiplex"]:
+    demultiplexed_files = expand('{o}/{s}.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
+    output_files.extend(demultiplexed_files)
+
+if config["action"]["clean"]:
+    clean_files = expand('{o}/{s}.clean.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)
+    output_files.extend(clean_files)
+
+#-------------------------------------------------------------------------------#
+#---------------------------------- RUN :-) ------------------------------------#
+#-------------------------------------------------------------------------------#
+
+include: "src/auxiliary.snake"
+#include: "src/demultiplex.snake" # not yet implemented
+include: "src/clean.snake"
+
+if config["debug"]:
+    print_debug()
+
+rule all:
+    input:
+        output_files
+    message: "Done."
diff --git a/src/auxiliary.snake b/src/auxiliary.snake
new file mode 100644
index 0000000..3e252d6
--- /dev/null
+++ b/src/auxiliary.snake
@@ -0,0 +1,22 @@
+import textwrap
+
+def print_debug():
+    """Print the registered samples, the reference and the output files.
+
+    Uses the globals SAMPLE_NAMES, REFERENCE_FASTA and output_files; they are
+    not defined in this file and must exist before the function is called.
+    """
+    debug_msg = textwrap.dedent("""\
+    Single-cell preprocessing pipeline
+
+    The following samples are registered:
+    {samples}
+
+    I will use {reference} as reference for transcript quantification.
+
+    The following output files will be produced:
+    {output_files}
+    """)
+    print(debug_msg.format(samples = SAMPLE_NAMES,
+                           reference = REFERENCE_FASTA,
+                           output_files = output_files))
diff --git a/src/clean.snake b/src/clean.snake
new file mode 100644
index 0000000..50c7e7c
--- /dev/null
+++ b/src/clean.snake
@@ -0,0 +1,24 @@
+
+# Clean one gzipped single-end fastq file with prinseq-lite: reads written via
+# -out_good become the 'clean' output, reads written via -out_bad the 'grubby' one.
+rule prinseq_clean_se_gz:
+    version: "1.0"
+    input:
+        '{out}/{sample}.fastq.gz'
+    output:
+        clean = '{out}/{sample}.clean.fastq.gz',
+        grubby = '{out}/{sample}.grubby.fastq.gz',
+    params:
+        flags = config.get('prinseq', {}).get('flags', '')
+    message: 'Cleaning {input} using prinseq-lite.'
+    # The log path carries the same wildcards as the outputs so that jobs do not share one file.
+    log: join(config['dirs']['log'], 'prinseq', '{out}', '{sample}.clean-se.log')
+    # prinseq-lite reads the pipe only when -fastq is given the literal 'stdin', and it
+    # writes uncompressed <prefix>.fastq files, hence the gzip step to obtain the outputs.
+    # touch covers a category that received no reads (assumed to leave no file - TODO confirm).
+    shell:
+        """
+        zcat {input} | prinseq-lite.pl -fastq stdin -out_good {wildcards.out}/{wildcards.sample}.clean -out_bad {wildcards.out}/{wildcards.sample}.grubby -graph_data {log} {params.flags}
+        touch {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
+        gzip -f {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
+        """