This repository has been archived by the owner. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
199 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
*.fastq | ||
*.fastq.gz | ||
.snakemake/* | ||
barcodes.txt | ||
SampleSheet.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# | ||
# The action section | ||
# | ||
# demultiplex - whether or not to perform demultiplexing. If set to False, one fastq file per sample is expected (e.g. from Fluidigm C1). | ||
# clean - whether or not to run the prinseq cleaning step (see below). | ||
# | ||
action: | ||
demultiplex: False | ||
clean: True | ||
|
||
# | ||
# The samplesheet section | ||
# | ||
# file - The path to the SampleSheet text file | ||
# index - The name of the Column that should be used as index | ||
# barcode - The name of the Column containing a cell barcode. Set to None if barcodes should be inferred from the data (e.g. in DropSeq) | ||
# | ||
samplesheet: | ||
file: 'SampleSheet.txt' | ||
index: 'Sample' | ||
barcode: 'Barcode' | ||
|
||
# | ||
# The data section | ||
# | ||
# files - The input sequencing files that will be used for demultiplexing. | ||
# | ||
data: | ||
files: | ||
r1: '' | ||
r2: '' | ||
|
||
# | ||
# The reference section | ||
# | ||
# URL - The URL from where the reference transcriptome can be downloaded. Wildcards are allowed in curly brackets | ||
# organism - Wildcard for the reference organism | ||
# release - Wildcard for the reference release | ||
# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome | ||
# spikeIns - The path to a fasta file with additional cDNA sequences. Names should follow the "symbol|ID" convention | ||
# | ||
reference: | ||
URL: 'ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.fa.gz' | ||
organism: 'mouse' | ||
release: 'M16' | ||
file: | ||
- pc_transcripts | ||
- lncRNA_transcripts | ||
spikeIns: 'ref/humanDux.fasta' | ||
|
||
# | ||
# The salmon section | ||
# | ||
# index_flags - flags to add to the salmon call while indexing | ||
# quant_flags - flags to add to the salmon call during quantification | ||
# | ||
salmon: | ||
index_flags: '--gencode' | ||
quant_flags: '--gcBias' | ||
|
||
# | ||
# The PRINSEQ section | ||
# | ||
# flags - flags to add to the PRINSEQ call while cleaning fastq files | ||
# | ||
prinseq: | ||
flags: '-derep 1 -custom_params "A 8"' | ||
|
||
# | ||
# The fastq-multx section | ||
# | ||
# flags - flags to add to the fastq-multx call while demultiplexing fastq files | ||
# | ||
fastq_multx: | ||
flags: '-m 1' | ||
|
||
# | ||
# The directory section | ||
# | ||
# ref - directory name for reference files | ||
# fastq - directory name for fastq files | ||
# bam - directory name for STAR alignments | ||
# quant - directory name for Salmon/RSEM quant files | ||
# tables - directory name for aggregated tables | ||
# log - directory name for program logs | ||
# | ||
dirs: | ||
ref: 'ref' | ||
fastq: 'fastq' | ||
bam: 'bam' | ||
quant: 'quant' | ||
tables: 'tables' | ||
log: 'log' | ||
|
||
# | ||
# The debug section | ||
# | ||
debug: False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
name: sc-preprocess | ||
|
||
channels: | ||
- bioconda | ||
- conda-forge | ||
|
||
dependencies: | ||
- snakemake | ||
- salmon | ||
- prinseq | ||
- fastq-multx |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import pandas as pd | ||
from os.path import join, basename, dirname | ||
|
||
# Load the configuration: prefer the config file Snakemake reports as an
# override, otherwise fall back to the default shipped with the pipeline.
# None is a singleton, so it is tested by identity rather than equality.
if workflow.overwrite_configfile is not None:
    configfile: str(workflow.overwrite_configfile)
else:
    configfile: 'config/default.yml'
|
||
#-------------------------------------------------------------------------------# | ||
#-------------- Handle reference related paths and file names ------------------# | ||
#-------------------------------------------------------------------------------# | ||
|
||
# Both FASTA files share the prefix <ref dir>/<organism>/<release>; only the
# suffix differs depending on whether spike-in sequences are configured.
_release_prefix = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'])

GENCODE_FASTA = _release_prefix + '.fa'

if 'spikeIns' in config.get('reference', {}):
    REFERENCE_FASTA = _release_prefix + '-withSpikeIns.fa'
else:
    REFERENCE_FASTA = GENCODE_FASTA
|
||
#-------------------------------------------------------------------------------# | ||
#------------- Read samplesheet and provide necessary variables ----------------# | ||
#-------------------------------------------------------------------------------# | ||
|
||
# Parse the tab-separated sample sheet, using the configured column as index.
samplesheet = pd.read_table(
    config['samplesheet']['file'],
    sep = '\t',
    index_col = config['samplesheet']['index'],
)

# SAMPLES maps each index value to a dict of the remaining columns;
# SAMPLE_NAMES is the sorted list of those index values.
SAMPLES = samplesheet.to_dict(orient = 'index')
SAMPLE_NAMES = sorted(SAMPLES)
|
||
#-------------------------------------------------------------------------------# | ||
#--------------------------- Generate output files -----------------------------# | ||
#-------------------------------------------------------------------------------# | ||
|
||
# Collect the target files requested by the enabled actions; the list is
# later used as the input of the final rule.
output_files = []

# One gzipped fastq per sample when demultiplexing is switched on.
if config["action"]["demultiplex"]:
    demultiplexed_files = expand(
        '{o}/{s}.fastq.gz',
        o = config['dirs']['fastq'],
        s = SAMPLE_NAMES,
    )
    output_files += demultiplexed_files

# One cleaned, gzipped fastq per sample when cleaning is switched on.
if config["action"]["clean"]:
    clean_files = expand(
        '{o}/{s}.clean.fastq.gz',
        o = config['dirs']['fastq'],
        s = SAMPLE_NAMES,
    )
    output_files += clean_files
|
||
#-------------------------------------------------------------------------------# | ||
#---------------------------------- RUN :-) ------------------------------------# | ||
#-------------------------------------------------------------------------------# | ||
|
||
# Pull in the helper code and the rule definitions.
# NOTE(review): print_debug() is presumably provided by one of these
# includes - confirm.
include: "src/auxiliary.snake"
#include: "src/demultiplex.snake" # not yet implemented
include: "src/clean.snake"

# Optionally print a summary of the configuration before running.
if config["debug"]:
    print_debug()

# Default target: request every file collected in output_files.
rule all:
    input:
        output_files
    message: "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import textwrap | ||
|
||
def print_debug():
    """Print a summary of the pipeline setup to standard output.

    The summary lists the registered sample names, the reference FASTA used
    for transcript quantification and the output files that were requested.
    These are read from the module-level names SAMPLE_NAMES, REFERENCE_FASTA
    and output_files, which must be defined before this function is called.
    """
    # The trailing empty entry keeps the final newline of the message.
    template = '\n'.join([
        'Single-cell preprocessing pipeline',
        '',
        'The following samples are registered:',
        '{samples}',
        '',
        'I will use {reference} as reference for transcript quantification.',
        '',
        'The following output files will be produced:',
        '{output_files}',
        '',
    ])
    summary = template.format(
        samples = SAMPLE_NAMES,
        reference = REFERENCE_FASTA,
        output_files = output_files,
    )
    print(summary)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
|
||
# Clean a gzipped single-end fastq file with prinseq-lite.
#
# input  - '{out}/{sample}.fastq.gz'
# output - clean:  reads that passed the filters, gzipped
#          grubby: reads that were rejected, gzipped
# params - flags: extra prinseq-lite options from the 'prinseq' config
#          section (empty string when none are configured)
# log    - one file per sample; Snakemake requires the log path to carry the
#          same wildcards as the outputs. As before, it receives the prinseq
#          graph data (-graph_data).
rule prinseq_clean_se_gz:
    version: "1.0"
    input:
        '{out}/{sample}.fastq.gz'
    output:
        clean = '{out}/{sample}.clean.fastq.gz',
        grubby = '{out}/{sample}.grubby.fastq.gz',
    params:
        flags = config.get('prinseq', {}).get('flags', '')
    message: 'Cleaning {input} using prinseq-lite.'
    log: join(config['dirs']['log'], 'prinseq', '{out}', '{sample}.clean-se.log')
    shell:
        # prinseq-lite reads the piped data only when told '-fastq stdin', and
        # it appends '.fastq' to the -out_good/-out_bad prefixes itself, so the
        # results are compressed afterwards to match the declared outputs.
        # NOTE(review): 'touch' guards against prinseq not creating a file
        # when no read falls into that category - confirm this is needed.
        """
        zcat {input} | prinseq-lite.pl -fastq stdin -out_good {wildcards.out}/{wildcards.sample}.clean -out_bad {wildcards.out}/{wildcards.sample}.grubby -graph_data {log} {params.flags}
        touch {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
        gzip -f {wildcards.out}/{wildcards.sample}.clean.fastq {wildcards.out}/{wildcards.sample}.grubby.fastq
        """