From 82e0c73bd281981456d3da4fe165f3d84682cc7f Mon Sep 17 00:00:00 2001
From: Jens Preussner <jens.preussner@mpi-bn.mpg.de>
Date: Tue, 3 Apr 2018 16:24:28 +0200
Subject: [PATCH] Initial commit of prinseq-cleaning

---
 .gitignore          |  5 +++
 config/default.yml  | 98 +++++++++++++++++++++++++++++++++++++++++++++
 environment.yml     | 11 +++++
 sc-preprocess.snake | 52 ++++++++++++++++++++++++
 src/auxiliary.snake | 17 ++++++++
 src/clean.snake     | 16 ++++++++
 6 files changed, 199 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 config/default.yml
 create mode 100644 environment.yml
 create mode 100644 sc-preprocess.snake
 create mode 100644 src/auxiliary.snake
 create mode 100644 src/clean.snake

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b9ba23
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.fastq
+*.fastq.gz
+.snakemake/*
+barcodes.txt
+SampleSheet.txt
diff --git a/config/default.yml b/config/default.yml
new file mode 100644
index 0000000..556c083
--- /dev/null
+++ b/config/default.yml
@@ -0,0 +1,98 @@
+#
+# The action section
+#
+# demultiplex - whether or not to perform demultiplexing. If False, one fastq file per sample is expected (e.g. from Fluidigm C1).
+# clean - whether or not to perform the PRINSEQ cleaning step (see below).
+#
+action:
+  demultiplex: False
+  clean: True
+
+#
+# The samplesheet section
+#
+# file - The path to the SampleSheet text file
+# index - The name of the Column that should be used as index
+# barcode - The name of the Column containing a cell barcode. Set to None if barcodes should be inferred from the data (e.g. in DropSeq)
+#
+samplesheet:
+  file: 'SampleSheet.txt'
+  index: 'Sample'
+  barcode: 'Barcode'
+
+#
+# The data section
+#
+# files - The input sequencing files that will be used for demultiplexing.
+#
+data:
+  files:
+    r1: ''
+    r2: ''
+
+#
+# The reference section
+#
+# URL - The URL from where the reference transcriptome can be downloaded. Wildcards are allowed in curly brackets
+# organism - Wildcard for the reference organism
+# release - Wildcard for the reference release
+# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome
+# spikeIns - The path to a fasta file with additional cDNA sequences. Names should follow the "symbol|ID" convention
+#
+reference:
+  URL: 'ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.fa.gz'
+  organism: 'mouse'
+  release: 'M16'
+  file:
+    - pc_transcripts
+    - lncRNA_transcripts
+  spikeIns: 'ref/humanDux.fasta'
+
+#
+# The salmon section
+#
+# index_flags - flags to add to the salmon call during indexing
+# quant_flags - flags to add to the salmon call during quantification
+#
+salmon:
+  index_flags: '--gencode'
+  quant_flags: '--gcBias'
+
+#
+# The PRINSEQ section
+#
+# flags - flags to add to the PRINSEQ call while cleaning fastq files
+#
+prinseq:
+  flags: '-derep 1 -custom_params "A 8"'
+
+#
+# The fastq-multx section
+#
+# flags - flags to add to the fastq-multx call while demultiplexing fastq files
+#
+fastq_multx:
+  flags: '-m 1'
+
+#
+# The directory section
+#
+# ref - directory name for reference files
+# fastq - directory name for fastq files
+# bam - directory name for STAR alignments
+# quant - directory name for Salmon/RSEM quant files
+# tables - directory name for aggregated tables
+# log - directory name for program logs
+#
+dirs:
+  ref: 'ref'
+  fastq: 'fastq'
+  bam: 'bam'
+  quant: 'quant'
+  tables: 'tables'
+  log: 'log'
+
+#
+# The debug section
+#
+debug: False
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..6b66da8
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,11 @@
+name: sc-preprocess
+
+channels:
+  - bioconda
+  - conda-forge
+
+dependencies:
+  - snakemake
+  - salmon
+  - prinseq
+  - fastq-multx
diff --git a/sc-preprocess.snake b/sc-preprocess.snake
new file mode 100644
index 0000000..a44d728
--- /dev/null
+++ b/sc-preprocess.snake
@@ -0,0 +1,52 @@
+import pandas as pd
+from os.path import join, basename, dirname
+
+if workflow.overwrite_configfile is not None:  # identity test against None; an explicit config file was supplied to the workflow
+	configfile: str(workflow.overwrite_configfile)
+else:
+	configfile: 'config/default.yml'  # fall back to the defaults shipped with the pipeline
+
+#-------------------------------------------------------------------------------#
+#-------------- Handle reference related paths and file names ------------------#
+#-------------------------------------------------------------------------------#
+
+GENCODE_FASTA = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '.fa')  # <ref dir>/<organism>/<release>.fa
+REFERENCE_FASTA = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '-withSpikeIns.fa') if 'spikeIns' in config.get('reference', {}) else GENCODE_FASTA  # spike-in variant whenever the 'spikeIns' key is present (key presence, not truthiness), else the plain GENCODE fasta
+
+#-------------------------------------------------------------------------------#
+#------------- Read samplesheet and provide necessary variables ----------------#
+#-------------------------------------------------------------------------------#
+
+samplesheet = pd.read_table(config['samplesheet']['file'], sep = '\t', index_col = config['samplesheet']['index'])  # tab-separated table indexed by the configured column
+SAMPLES = samplesheet.to_dict(orient = 'index')  # {index value: {column name: cell value}}
+SAMPLE_NAMES = sorted(SAMPLES.keys())  # sorted list of the index values
+
+#-------------------------------------------------------------------------------#
+#--------------------------- Generate output files -----------------------------#
+#-------------------------------------------------------------------------------#
+
+output_files = []  # targets later requested by rule all
+
+if config["action"]["demultiplex"]:
+  demultiplexed_files = expand('{o}/{s}.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)  # one fastq.gz per sample; NOTE: no rule in this patch produces these (demultiplex.snake is not included yet)
+  output_files.extend(demultiplexed_files)
+
+if config["action"]["clean"]:
+  clean_files = expand('{o}/{s}.clean.fastq.gz', o = config['dirs']['fastq'], s = SAMPLE_NAMES)  # one cleaned fastq.gz per sample, matching the 'clean' output of rule prinseq_clean_se_gz
+  output_files.extend(clean_files)
+
+#-------------------------------------------------------------------------------#
+#---------------------------------- RUN :-) ------------------------------------#
+#-------------------------------------------------------------------------------#
+
+include: "src/auxiliary.snake"  # defines print_debug()
+#include: "src/demultiplex.snake" # not yet implemented
+include: "src/clean.snake"  # defines rule prinseq_clean_se_gz
+
+if config["debug"]:
+  print_debug()  # prints samples, reference and planned output files
+
+rule all:
+  input:
+    output_files
+  message: "Done."
diff --git a/src/auxiliary.snake b/src/auxiliary.snake
new file mode 100644
index 0000000..3e252d6
--- /dev/null
+++ b/src/auxiliary.snake
@@ -0,0 +1,17 @@
+import textwrap
+
+def print_debug():
+  """Print the registered samples, the reference fasta and the planned output files."""
+  template = textwrap.dedent("""\
+  Single-cell preprocessing pipeline
+
+  The following samples are registered:
+  {samples}
+
+  I will use {reference} as reference for transcript quantification.
+
+  The following output files will be produced:
+  {output_files}
+  """)
+  summary = dict(samples = SAMPLE_NAMES, reference = REFERENCE_FASTA, output_files = output_files)
+  print(template.format(**summary))
diff --git a/src/clean.snake b/src/clean.snake
new file mode 100644
index 0000000..50c7e7c
--- /dev/null
+++ b/src/clean.snake
@@ -0,0 +1,16 @@
+# Clean one gzipped single-end fastq with prinseq-lite: passing reads end up in *.clean.fastq.gz, rejected reads in *.grubby.fastq.gz.
+rule prinseq_clean_se_gz:
+  version: "1.0"
+  input:
+    '{out}/{sample}.fastq.gz'
+  output:
+    clean = '{out}/{sample}.clean.fastq.gz',
+    grubby = '{out}/{sample}.grubby.fastq.gz',  # prinseq-lite itself writes uncompressed <prefix>.fastq; the shell step gzips both files
+  params:
+    flags = config.get('prinseq', {}).get('flags', '')  # extra prinseq-lite options; empty when not configured
+  message: 'Cleaning {input} using prinseq-lite.'
+  log: join(config['dirs']['log'], 'prinseq', '{out}', '{sample}.clean-se.log')  # the log path must carry the same wildcards as the outputs; receives the -graph_data file
+  shell:
+    """
+    zcat {input} | prinseq-lite.pl -fastq stdin -out_good {wildcards.out}/{wildcards.sample}.clean -out_bad {wildcards.out}/{wildcards.sample}.grubby -graph_data {log} {params.flags} && touch {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq && gzip -f {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
+    """