Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Initial commit of prinseq-cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
jenzopr committed Apr 3, 2018
1 parent e67ad59 commit 82e0c73
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.fastq
*.fastq.gz
.snakemake/*
barcodes.txt
SampleSheet.txt
98 changes: 98 additions & 0 deletions config/default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#
# The action section
#
# demultiplex - whether or not to perform demultiplexing. If False, one fastq file per sample is expected (e.g. from Fluidigm C1).
# clean - whether or not to perform the prinseq cleaning step (see below).
#
action:
demultiplex: False
clean: True

#
# The samplesheet section
#
# file - The path to the SampleSheet text file
# index - The name of the Column that should be used as index
# barcode - The name of the Column containing a cell barcode. Set to None if barcodes should be inferred from the data (e.g. in DropSeq)
#
samplesheet:
file: 'SampleSheet.txt'
index: 'Sample'
barcode: 'Barcode'

#
# The data section
#
# files - The input sequencing files that will be used for demultiplexing.
#
data:
files:
r1: ''
r2: ''

#
# The reference section
#
# URL - The URL from where the reference transcriptome can be downloaded. Wildcards are allowed in curly brackets
# organism - Wildcard for the reference organism
# release - Wildcard for the reference release
# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome
# spikeIns - The path to a fasta file with additional cDNA sequences. Names should follow the "symbol|ID" convention
#
reference:
URL: 'ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.fa.gz'
organism: 'mouse'
release: 'M16'
file:
- pc_transcripts
- lncRNA_transcripts
spikeIns: 'ref/humanDux.fasta'

#
# The salmon section
#
# index_flags - flags to add to the salmon call while indexing
# quant_flags - flags to add to the salmon call during quantification
#
salmon:
index_flags: '--gencode'
quant_flags: '--gcBias'

#
# The PRINSEQ section
#
# flags - flags to add to the PRINSEQ call while cleaning fastq files
#
prinseq:
flags: '-derep 1 -custom_params "A 8"'

#
# The fastq-multx section
#
# flags - flags to add to the fastq-multx call while demultiplexing fastq files
#
fastq_multx:
flags: '-m 1'

#
# The directory section
#
# ref - directory name for reference files
# fastq - directory name for fastq files
# bam - directory name for STAR alignments
# quant - directory name for Salmon/RSEM quant files
# tables - directory name for aggregated tables
# log - directory name for program logs
#
dirs:
ref: 'ref'
fastq: 'fastq'
bam: 'bam'
quant: 'quant'
tables: 'tables'
log: 'log'

#
# The debug section
#
debug: False
11 changes: 11 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: sc-preprocess

channels:
- bioconda
- conda-forge

dependencies:
- snakemake
- salmon
- prinseq
- fastq-multx
52 changes: 52 additions & 0 deletions sc-preprocess.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
from os.path import join, basename, dirname

# Load the configuration: prefer a config file that was handed to Snakemake
# as an override, otherwise fall back to the defaults shipped with the
# pipeline. `None` is a singleton, so test for it by identity.
if workflow.overwrite_configfile is not None:
    configfile: str(workflow.overwrite_configfile)
else:
    configfile: 'config/default.yml'

#-------------------------------------------------------------------------------#
#-------------- Handle reference related paths and file names ------------------#
#-------------------------------------------------------------------------------#

# Path of the plain reference transcriptome: <ref dir>/<organism>/<release>.fa
GENCODE_FASTA = join(
    config['dirs']['ref'],
    config['reference']['organism'],
    config['reference']['release'] + '.fa',
)

# When the config names a spike-in file, quantify against the
# '-withSpikeIns' variant; otherwise use the plain reference.
if 'spikeIns' in config.get('reference', {}):
    REFERENCE_FASTA = join(
        config['dirs']['ref'],
        config['reference']['organism'],
        config['reference']['release'] + '-withSpikeIns.fa',
    )
else:
    REFERENCE_FASTA = GENCODE_FASTA

#-------------------------------------------------------------------------------#
#------------- Read samplesheet and provide necessary variables ----------------#
#-------------------------------------------------------------------------------#

# Load the tab-separated sample sheet, indexed by the configured column.
samplesheet = pd.read_table(
    config['samplesheet']['file'],
    sep='\t',
    index_col=config['samplesheet']['index'],
)

# One entry per index value, mapping to a dict of that row's other columns.
SAMPLES = samplesheet.to_dict(orient='index')

# Sorted sample identifiers (iterating a dict yields its keys).
SAMPLE_NAMES = sorted(SAMPLES)

#-------------------------------------------------------------------------------#
#--------------------------- Generate output files -----------------------------#
#-------------------------------------------------------------------------------#

# Targets requested from the workflow; filled in according to the actions
# that are switched on in the config.
output_files = []

# One gzipped fastq file per sample when demultiplexing is enabled.
if config['action']['demultiplex']:
    demultiplexed_files = expand(
        '{o}/{s}.fastq.gz',
        o=config['dirs']['fastq'],
        s=SAMPLE_NAMES,
    )
    output_files += demultiplexed_files

# One cleaned gzipped fastq file per sample when cleaning is enabled.
if config['action']['clean']:
    clean_files = expand(
        '{o}/{s}.clean.fastq.gz',
        o=config['dirs']['fastq'],
        s=SAMPLE_NAMES,
    )
    output_files += clean_files

#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#

include: "src/auxiliary.snake"
#include: "src/demultiplex.snake" # not yet implemented
include: "src/clean.snake"

# Print the pipeline summary when debugging is switched on. `debug` is an
# optional setting: a config file that omits it is treated as debug-off
# instead of failing with a KeyError.
if config.get("debug", False):
    print_debug()

# Target rule: requests every file collected in `output_files`.
rule all:
    input:
        output_files
    message: "Done."
17 changes: 17 additions & 0 deletions src/auxiliary.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import textwrap

def print_debug():
    """Print a summary of the pipeline setup to standard output.

    The summary lists the registered sample names, the reference used for
    transcript quantification and the output files that will be produced.
    These are read from the module-level names ``SAMPLE_NAMES``,
    ``REFERENCE_FASTA`` and ``output_files``.
    """
    template = textwrap.dedent("""\
        Single-cell preprocessing pipeline

        The following samples are registered:
        {samples}

        I will use {reference} as reference for transcript quantification.

        The following output files will be produced:
        {output_files}
        """)
    fields = {
        'samples': SAMPLE_NAMES,
        'reference': REFERENCE_FASTA,
        'output_files': output_files,
    }
    print(template.format(**fields))
16 changes: 16 additions & 0 deletions src/clean.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

# Clean one gzipped single-end fastq file with prinseq-lite.
#
# input:  {out}/{sample}.fastq.gz
# output: {out}/{sample}.clean.fastq.gz   reads that passed the filters
#         {out}/{sample}.grubby.fastq.gz  reads that were rejected
# params: flags - extra prinseq-lite options from config['prinseq']['flags']
#                 (empty when not configured)
rule prinseq_clean_se_gz:
    version: "1.0"
    input:
        '{out}/{sample}.fastq.gz'
    output:
        clean = '{out}/{sample}.clean.fastq.gz',
        grubby = '{out}/{sample}.grubby.fastq.gz',
    params:
        flags = (config.get('prinseq') or {}).get('flags', '')
    message: 'Cleaning {input} using prinseq-lite.'
    # The log path must carry the same wildcards as the outputs; otherwise
    # Snakemake rejects the rule and all samples would share a single file.
    log: join(config['dirs']['log'], 'prinseq', '{out}', '{sample}.clean-se.log')
    shell:
        # - `-fastq stdin` makes prinseq-lite read the decompressed stream;
        #   a bare `-fastq` would take the next option as its file name.
        # - prinseq-lite appends `.fastq` to the -out_good/-out_bad prefixes
        #   and writes uncompressed files, so they are gzipped afterwards to
        #   produce the declared `.fastq.gz` outputs.
        # - `touch` guards against prinseq-lite leaving out a file when no
        #   read falls into that category (NOTE(review): confirm against the
        #   installed prinseq-lite version).
        """
        zcat {input} | prinseq-lite.pl -fastq stdin -out_good {wildcards.out}/{wildcards.sample}.clean -out_bad {wildcards.out}/{wildcards.sample}.grubby -graph_data {log} {params.flags}
        touch {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
        gzip -f {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
        """

0 comments on commit 82e0c73

Please sign in to comment.