Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
jenzopr committed Jul 3, 2018
1 parent 13769bc commit af7a558
Showing 4 changed files with 187 additions and 0 deletions.
84 changes: 84 additions & 0 deletions config/default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@

#
# The action section
#
# clean - whether or not to skip the prinseq cleaning step (see below).
# snv - whether or not to perform SNV analysis.
# cnv - whether or not to perform CNV analysis.
# container - whether or not to create a RDS container.
#
action:
clean: True
snv: True
cnv: True
container: True

#
# The samplesheet section
#
# file - The path to the SampleSheet text file. Set to None if no samplesheet is available
# index - The name of the Column that should be used as index
# case - The name of the Column containing the case identifier to which the index belongs
#
samplesheet:
file: 'SampleSheet.txt'
index: 'Sample'
case: 'Case'

#
# The data section
#
# Contains named dictionaries with input sequencing files:
# r1 - The fastq file containing R1.
# r2 - The fastq file containing R2.
#
data:
sample1:
r1: ''
r2: ''

#
# The reference section
#
# annotation_URL - The URL from where the reference annotation can be downloaded. Wildcards are allowed in curly brackets
# genome_URL - The URL from where the reference genome can be downloaded. Wildcards are allowed in curly brackets
# organism - Wildcard for the reference organism
# release - Wildcard for the reference release
# assembly - Wildcard for the reference genome assembly
# file - Wildcard(s) for the reference files that should be merged into the final reference annotation
# target - The path to a bed file with coordinates of the target regions
#
reference:
  # Both URLs carry an explicit ftp:// scheme so they are written consistently.
  # GENCODE publishes the annotation as gencode.v<release>.<file>.gtf.gz
  # (e.g. gencode.vM17.basic.annotation.gtf.gz), matching the .gtf.gz name the
  # workflow uses for the downloaded annotation.
  annotation_URL: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.gtf.gz'
  genome_URL: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/{assembly}.primary_assembly.genome.fa.gz'
  organism: 'mouse'
  release: 'M17'
  assembly: 'GRCm38'
  # Each entry fills the {file} wildcard of annotation_URL.
  file:
    - basic.annotation
  target: 'path/to/target_regions.bed'

#
# The directory section
#
# ref - directory name for reference files
# fastq - directory name for fastq files
# bam - directory name for STAR alignments
# cnv - directory name for Copy Number Variation analysis
# snv - directory name for Single Nucleotide Variation analysis
# tables - directory name for aggregated tables
# log - directory name for program logs
# R - directory name for Rdata objects
#
dirs:
ref: 'ref'
fastq: 'fastq'
bam: 'bam'
cnv: 'cnv'
snv: 'snv'
tables: 'tables'
log: 'log'
R: 'rds'

#
# The debug section
#
debug: False
18 changes: 18 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: target-dnaseq

channels:
- bioconda
- conda-forge

dependencies:
- python=3.5
- snakemake
- trimmomatic
- star
- samtools
- biobambam
- qualimap
- varscan
- vcfanno
- vcflib
- control-freec
36 changes: 36 additions & 0 deletions src/reference.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# vim: syntax=python tabstop=4 expandtab
# coding: utf-8

'''
@author: jpreuss
Provides rules for reference related modifications
'''

from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
# Remote provider instance used by the download rules in this file to declare FTP inputs.
FTP = FTPRemoteProvider()

# Download the genome reference named by config['reference']['genome_URL'] over
# FTP and write it to GENOME.
#
# Only scalar entries of the reference section are forwarded to expand().
# Forwarding the whole section (**config['reference']) also passes the
# list-valued `file` key; where expand() builds the product over every supplied
# key, several `file` entries would repeat the same genome URL and zcat would
# then concatenate the genome more than once.
#
# NOTE(review): zcat -f writes decompressed data to {output}; confirm that this
# matches the file extension chosen for GENOME.
rule genome_download:
    input:
        FTP.remote(
            expand(
                config['reference']['genome_URL'],
                **{
                    key: value
                    for key, value in config['reference'].items()
                    if not isinstance(value, (list, tuple))
                }
            ),
            keep_local = True
        )
    output:
        GENOME
    threads: 1
    message:
        'Downloading gencode genome reference.'
    shell:
        """
        zcat -f {input} > {output}
        """

# Download every annotation file obtained by expanding
# config['reference']['annotation_URL'] with the reference section of the
# config, and concatenate them (via zcat -f) into ANNOTATION.
#
# NOTE(review): zcat -f writes decompressed data to {output}; confirm that this
# matches the file extension chosen for ANNOTATION.
rule annotation_download:
    input:
        FTP.remote(
            expand(config['reference']['annotation_URL'], **config['reference']),
            keep_local = True
        )
    output:
        ANNOTATION
    threads: 1
    message: 'Downloading gencode annotation reference.'
    shell:
        """
        zcat -f {input} > {output}
        """
49 changes: 49 additions & 0 deletions target-dnaseq.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd
from os.path import join, basename, dirname

# Use the config file named by workflow.overwrite_configfile when one is set,
# otherwise fall back to the bundled defaults.
# None is a singleton, so it is tested by identity rather than equality.
if workflow.overwrite_configfile is not None:
    configfile: str(workflow.overwrite_configfile)
else:
    configfile: 'config/default.yml'

#-------------------------------------------------------------------------------#
#-------------- Handle reference related paths and file names ------------------#
#-------------------------------------------------------------------------------#

# Both reference files live in <ref dir>/<organism>/; the genome is named after
# the assembly and the annotation after the release.
_reference_cfg = config['reference']
_reference_dir = join(config['dirs']['ref'], _reference_cfg['organism'])

GENOME = join(_reference_dir, _reference_cfg['assembly'] + '.fa.gz')
ANNOTATION = join(_reference_dir, _reference_cfg['release'] + '.gtf.gz')

#-------------------------------------------------------------------------------#
#------------- Read samplesheet and provide necessary variables ----------------#
#-------------------------------------------------------------------------------#

# Load the tab-separated samplesheet, indexed by the configured column.
_sheet_cfg = config['samplesheet']
samplesheet = pd.read_table(
    _sheet_cfg['file'],
    sep = '\t',
    index_col = _sheet_cfg['index']
)

# When the sheet has no URL_r1 column, fill it with a default path of the form
# <fastq dir>/<sample>.fastq.gz for every sample in the index.
if 'URL_r1' not in samplesheet.columns:
    _fastq_dir = config['dirs']['fastq']
    samplesheet['URL_r1'] = [
        '{}/{}.{}'.format(_fastq_dir, name, 'fastq.gz')
        for name in samplesheet.index
    ]

# Per-sample dictionaries keyed by index value, plus the sorted sample names.
SAMPLES = samplesheet.to_dict(orient = 'index')
SAMPLE_NAMES = sorted(SAMPLES)

#-------------------------------------------------------------------------------#
#--------------------------- Generate output files -----------------------------#
#-------------------------------------------------------------------------------#

# Final targets requested by rule `all`: the downloaded genome and annotation.
output_files = [GENOME, ANNOTATION]

#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#

# Pull in the reference download rules.
include: "src/reference.snake"

# NOTE(review): print_debug is not defined in any file visible here; confirm it
# exists before enabling `debug` in the config.
_debug_enabled = config["debug"]
if _debug_enabled:
    print_debug()

# Default target rule: requests every path collected in output_files.
rule all:
    input: output_files
    message: "Done."

0 comments on commit af7a558

Please sign in to comment.