From af7a558b8d18514339ae5a53429f0ead312b2573 Mon Sep 17 00:00:00 2001
From: Jens Preussner <jens.preussner@mpi-bn.mpg.de>
Date: Tue, 3 Jul 2018 13:07:03 +0200
Subject: [PATCH] Initial commit

---
 config/default.yml  | 84 +++++++++++++++++++++++++++++++++++++++++++++
 environment.yml     | 18 ++++++++++
 src/reference.snake | 36 +++++++++++++++++++
 target-dnaseq.snake | 49 ++++++++++++++++++++++++++
 4 files changed, 187 insertions(+)
 create mode 100644 config/default.yml
 create mode 100644 environment.yml
 create mode 100644 src/reference.snake
 create mode 100644 target-dnaseq.snake

diff --git a/config/default.yml b/config/default.yml
new file mode 100644
index 0000000..8b171f8
--- /dev/null
+++ b/config/default.yml
@@ -0,0 +1,84 @@
+
+#
+# The action section
+#
+# clean - whether or not to skip the prinseq cleaning step (see below).
+# snv - whether or not to perform SNV analysis.
+# cnv - whether or not to perform CNV analysis.
+# container - whether or not to create a RDS container.
+#
+action:
+  clean: True
+  snv: True
+  cnv: True
+  container: True
+
+#
+# The samplesheet section
+#
+# file - The path to the SampleSheet text file. Set to None if no samplesheet is available
+# index - The name of the Column that should be used as index
+# case - The name of the Column containing the case identifier to which the index belongs
+#
+samplesheet:
+  file: 'SampleSheet.txt'
+  index: 'Sample'
+  case: 'Case'
+
+#
+# The data section
+#
+# Contains named dictionaries with input sequencing files:
+# r1 - The fastq file containing R1.
+# r2 - The fastq file containing R2.
+#
+data:
+  sample1:
+    r1: ''
+    r2: ''
+
+#
+# The reference section
+#
+# annotation_URL, genome_URL - The URLs from which the reference annotation and genome can be downloaded. Wildcards are allowed in curly brackets
+# organism - Wildcard for the reference organism
+# release, assembly - Wildcards for the reference release and the genome assembly
+# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome
+# target - The path to a bed file with coordinates of the target regions
+#
+reference:
+  annotation_URL: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.gtf.gz'
+  genome_URL: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/{assembly}.primary_assembly.genome.fa.gz'
+  organism: 'mouse'
+  release: 'M17'
+  assembly: 'GRCm38'  # fills {assembly} in genome_URL
+  file:
+    - basic.annotation  # GENCODE file stem, e.g. gencode.vM17.basic.annotation.gtf.gz
+  target: 'path/to/target_regions.bed'
+
+#
+# The directory section
+#
+# ref - directory name for reference files
+# fastq - directory name for fastq files
+# bam - directory name for STAR alignments
+# cnv - directory name for Copy Number Variation analysis
+# snv - directory name for Single Nucleotide Variation analysis
+# tables - directory name for aggregated tables
+# log - directory name for program logs
+# R - directory name for Rdata objects
+#
+dirs:
+  ref: 'ref'
+  fastq: 'fastq'
+  bam: 'bam'
+  cnv: 'cnv'
+  snv: 'snv'
+  tables: 'tables'
+  log: 'log'
+  R: 'rds'
+
+#
+# The debug section
+#
+debug: False
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..561984c
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,18 @@
+name: target-dnaseq
+
+channels:
+  - bioconda
+  - conda-forge
+
+dependencies:
+  - python=3.5
+  - snakemake
+  - trimmomatic
+  - star
+  - samtools
+  - biobambam
+  - qualimap
+  - varscan
+  - vcfanno
+  - vcflib
+  - control-freec
diff --git a/src/reference.snake b/src/reference.snake
new file mode 100644
index 0000000..31af219
--- /dev/null
+++ b/src/reference.snake
@@ -0,0 +1,36 @@
+# vim: syntax=python tabstop=4 expandtab
+# coding: utf-8
+
+'''
+@author: jpreuss
+Provides rules for reference related modifications
+'''
+
+from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
+FTP = FTPRemoteProvider()
+
+rule genome_download:
+  input:
+    FTP.remote(expand(config['reference']['genome_URL'], **config['reference']), keep_local = True)  # genome_URL template filled with the keys of config['reference'], fetched through the FTP remote provider
+  output:
+    GENOME  # NOTE(review): the shell command writes decompressed data, yet GENOME is built with a '.fa.gz' suffix in target-dnaseq.snake - confirm which is intended
+  threads: 1
+  message:
+    'Downloading gencode genome reference.'
+  shell:
+    """
+    zcat -f {input} > {output}
+    """
+
+rule annotation_download:
+  input:
+    FTP.remote(expand(config['reference']['annotation_URL'], **config['reference']), keep_local = True)  # annotation_URL template filled with the keys of config['reference']; presumably one URL per entry of the 'file' list
+  output:
+    ANNOTATION  # NOTE(review): the shell command writes decompressed data, yet ANNOTATION is built with a '.gtf.gz' suffix in target-dnaseq.snake - confirm which is intended
+  threads: 1
+  message:
+    'Downloading gencode annotation reference.'
+  shell:
+    """
+    zcat -f {input} > {output}
+    """
diff --git a/target-dnaseq.snake b/target-dnaseq.snake
new file mode 100644
index 0000000..c98f595
--- /dev/null
+++ b/target-dnaseq.snake
@@ -0,0 +1,49 @@
+import pandas as pd
+from os.path import join, basename, dirname
+
+if workflow.overwrite_configfile is not None:  # a config file recorded on the workflow object takes precedence
+  configfile: str(workflow.overwrite_configfile)
+else:  # otherwise fall back to the defaults shipped with the workflow
+  configfile: 'config/default.yml'
+
+#-------------------------------------------------------------------------------#
+#-------------- Handle reference related paths and file names ------------------#
+#-------------------------------------------------------------------------------#
+
+GENOME = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['assembly'] + '.fa.gz')
+ANNOTATION = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '.gtf.gz')
+
+#-------------------------------------------------------------------------------#
+#------------- Read samplesheet and provide necessary variables ----------------#
+#-------------------------------------------------------------------------------#
+
+samplesheet = pd.read_table(config['samplesheet']['file'], sep = '\t', index_col = config['samplesheet']['index'])
+
+if 'URL_r1' not in samplesheet.columns:  # no explicit R1 location given: default to <fastq dir>/<sample>.fastq.gz
+  samplesheet['URL_r1'] = ['{dir}/{sample}.{format}'.format(dir = config['dirs']['fastq'], sample = sample, format = 'fastq.gz') for sample in samplesheet.index]
+
+SAMPLES = samplesheet.to_dict(orient = 'index')  # {sample name: {column: value}}
+SAMPLE_NAMES = sorted(SAMPLES.keys())
+
+#-------------------------------------------------------------------------------#
+#--------------------------- Generate output files -----------------------------#
+#-------------------------------------------------------------------------------#
+
+output_files = [
+  GENOME,
+  ANNOTATION,
+]
+
+#-------------------------------------------------------------------------------#
+#---------------------------------- RUN :-) ------------------------------------#
+#-------------------------------------------------------------------------------#
+
+include: "src/reference.snake"
+
+if config["debug"]:
+  print_debug()  # NOTE(review): print_debug is not defined in any file visible in this patch - confirm where it comes from before enabling debug
+
+rule all:
+  input:
+    output_files  # every final target collected above; requesting them drives the whole workflow
+  message: "Done."