diff --git a/publication/pyvalid.zip b/publication/pyvalid.zip new file mode 100644 index 0000000..39f041e Binary files /dev/null and b/publication/pyvalid.zip differ diff --git a/publication/supp_file_1_process_raw.xml b/publication/supp_file_1_process_raw.xml new file mode 100644 index 0000000..98fd0ea --- /dev/null +++ b/publication/supp_file_1_process_raw.xml @@ -0,0 +1,68 @@ + + + EXAMPLE + 1 + + John Doe + john@doe.org + + + This file illustrates the basic specification of a process. The example analysis in this process counts - in a rather complicated way - the number + of lines in an input file that contain two specified words and checks if this number is equal to the number of lines in a reference file. The output file generated by an analysis run of this process contains a yes/no answer. + This process describes an analysis that can be executed on most computers running a common Linux installation such as Debian. All command lines listed below can be tested in a shell. + + + + input_file + txt + single + The input file can be of arbitrary length. + + + + + ref_file + txt + single + The reference file can be of arbitrary length. + + + + + result_file + txt + single + The result file contains only yes or no. + + + + + grep + 2.12 + temp_file ]]> + + All lines having a match for the first word are saved to a temporary file. + + + grep + 2.12 + temp_file2 ]]> + + All lines having a match for both words are saved to a temporary file. + + + wc, cut + 8.13, 8.13 + temp_file3 ]]> + + wc outputs the number of lines and the corresponding filename; we reduce the output to just the number using cut + + + wc, cut, cat + 8.13, 8.13, 8.13 + {result_file}; else echo "no" > {result_file}; fi; ]]> + + Since this command line contains some bash syntax, the process author should state the precise version of the shell environment: GNU bash ver. 4.2.37 + + + diff --git a/publication/supp_file_2_process_css.xml b/publication/supp_file_2_process_css.xml new file mode 100644 index 0000000..535c152 --- /dev/null +++ b/publication/supp_file_2_process_css.xml @@ -0,0 +1,69 @@ + + + + EXAMPLE + 1 + + John Doe + john@doe.org + + + This file illustrates the basic specification of a process. The example analysis in this process counts - in a rather complicated way - the number + of lines in an input file that contain two specified words and checks if this number is equal to the number of lines in a reference file. The output file generated by an analysis run of this process contains a yes/no answer. + This process describes an analysis that can be executed on most computers running a common Linux installation such as Debian. All command lines listed below can be tested in a shell. + + + + input_file + txt + single + The input file can be of arbitrary length. + + + + + ref_file + txt + single + The reference file can be of arbitrary length. + + + + + result_file + txt + single + The result file contains only yes or no. + + + + + grep + 2.12 + temp_file ]]> + + All lines having a match for the first word are saved to a temporary file. + + + grep + 2.12 + temp_file2 ]]> + + All lines having a match for both words are saved to a temporary file. + + + wc, cut + 8.13, 8.13 + temp_file3 ]]> + + wc outputs the number of lines and the corresponding filename; we reduce the output to just the number using cut + + + wc, cut, cat + 8.13, 8.13, 8.13 + {result_file}; else echo "no" > {result_file}; fi; ]]> + + Since this command line contains some bash syntax, the process author should state the precise version of the shell environment: GNU bash ver. 4.2.37 + + + diff --git a/publication/supp_file_3_style.css b/publication/supp_file_3_style.css new file mode 100644 index 0000000..66e064c --- /dev/null +++ b/publication/supp_file_3_style.css @@ -0,0 +1,217 @@ + +/* Design for top level elements */ + +process { + margin: 25px; + background-color: white; + width: auto; + font-family: "Arial", sans-serif; +} + +*:before { + font-weight: bold; +} + +process>name:before { + content: "Process: "; +} + +process>name { + display: block; +} + +version:before { + content: "Version: "; + color: black; +} + +process>version { + display: block; + font-weight: bold; + color: red; +} + + +author:before { + content: "Contact:"; + font-weight: bold; +} + +author { + top: 10px; + display: block; + margin-bottom: 10px; +} + +author>name { + display: block; + padding-left: 10px; + padding-top: 3px; + padding-bottom: 3px; +} + +author>email { + display: block; + padding-left: 10px; +} + +description:before { + content: "Process description: "; + font-weight: bold; + display: block; +} + +description { + display: block; + border: 0px; + padding-bottom: 10px; + padding-top: 10px; + border-bottom-width: 2px; + border-top-width: 2px; + border-style: solid; + border-color: lightgrey; +} + + +inputs:before { + content: "Input files"; + font-weight: bold; +} + +inputs { + margin-top: 10px; + display: block; +} + +filetype:before { + content: "File"; + display: block; +} +filetype { + position: relative; + left: 20px; + display: block; +} + +identifier:before { + content: "Identifier: "; + font-family: "Arial", sans-serif; +} + +identifier, format, quantity, comment { + display: block; + position: relative; + left: 25px; + padding-bottom: 3px; +} + +identifier, command_line { + font-family: "Courier New", Monospace; +} + +format:before { + content: "File format: "; +} + +quantity:before { + content: "Quantity: "; +} + +comment:before { + content: "Comment: "; +} + +references:before { + content: "Reference files"; +} + +references { + margin-top: 10px; + display: block; + background-color: #F0F0F0; + padding-top: 10px; + padding-bottom: 10px; +} + +outputs:before { + content: "Output files"; + font-weight: bold; +} +outputs { + margin-top: 10px; + margin-bottom: 10px; + display: block; +} +software:before { + content: "Process Steps"; + font-weight: bold; + display: block; + margin-bottom: 10px; +} + +software { + display: block; + padding-top: 10px; + border: 0px; + border-top-width: 2px; + border-style: solid; + border-color: lightgrey; + counter-reset: subsection; + padding-bottom: 10px; +} + +tool>name:before { + counter-increment: subsection; + content: "Step " counter(subsection) ": "; +} + +tool>name { + position: relative; + margin-left: 15px; + padding: 5px; + border-style: solid; + border-width: 2px; + border-color: #47D147; + border-radius: 5px; + box-sizing: border-box; + text-align: center; +} + +tool>version:before { + content: "Software version: "; +} + +tool>version { + display: block; + margin-left: 25px; + margin-top: 10px; + padding-bottom: 3px; +} + +command_line { + display: block; + position: relative; + left: 40px; + padding-top: 10px; + padding-bottom: 10px; + padding-left: 5px; +} + +command_line:hover { + background-color: lightgrey; +} + +loop:before { + content: "Loop: "; +} + +loop { + display: block; + margin-left: 25px; + padding-bottom: 5px; + +} + +tool>comment { + margin-bottom: 15px; +} \ No newline at end of file diff --git a/publication/supp_file_4_analysis_metadata.amd.tsv b/publication/supp_file_4_analysis_metadata.amd.tsv new file mode 100644 index 0000000..a9c1f28 --- /dev/null +++ b/publication/supp_file_4_analysis_metadata.amd.tsv @@ -0,0 +1,20 @@ +[Description] +process EXAMPLEv1 +user jdoe +date 20150105 +analysis_id ex1_run_20150105 + +[Inputs] +input_file pg218.txt + +[References] +ref_file 38503-0.txt + +[Outputs] +result_file res.txt + +[Parameters] +word1 fines +word2 sunt + +[Metrics] \ No newline at end of file diff --git a/publication/supp_file_5_CHPv2.xml b/publication/supp_file_5_CHPv2.xml new file mode 100644 index 0000000..f5df08b --- /dev/null +++ b/publication/supp_file_5_CHPv2.xml @@ -0,0 +1,185 @@ + + + + CHP + 2 + + Andreas Richter, Peter Ebert + arichter@ie-freiburg.mpg.de, pebert@mpi-inf.mpg.de + + + Process CHPv2 has been created to correct a couple of mistakes in the v1 process description and - more importantly - since new software versions have been installed on the DEEP cluster at DAC/MPI-Inf. This process takes as input aligned reads coming from the DCC/DKFZ and creates individual and comparative signal tracks as well as peak files for the different histone marks. Note that before the correlation among all files is computed, a couple of known problematic regions are removed that usually show a spurious read distribution that would subsequently lead to an inaccurate correlation among the files. The last step of this process plots the coverage of the histone signal (and, if available, of the input control) in a few selected control regions (for details, contact Andreas Richter). Note that these plots are by no means suited to interpret the data or judge the quality of the entire dataset - the plot of the control regions just shows regions with expected high or low signal compared to the input; the scaling of the values is performed for layout reasons and independent for each region, i.e. plots of different regions cannot be compared directly. + + + + GALvX_histone + BAM + collection + + + + GALvX_input + BAM + single + + + + GALvX_index + BAM index + collection + Index files are renamed internally to .bam.bai since deepTools is expecting index naming like this + + + + + filtered_regions + BED + single + ENCODE blacklist extended by A. Richter (FB); see DCC/download/results/references/annotations + + + reference_genome + 2bit + single + The reference genome file; see DCC/download/results/references/genomes + + + plot_regions + BED + single + Control regions obtained from A. Richter (FB) for quality control of ChIPseq samples; see DCC/download/results/references/annotations + + + + + samplesID.PROCESS.DATE.corplot.cormethod + deepTools graphics PNG + single + + + + samplesID.PROCESS.DATE.fgprplot + deepTools graphics PNG + single + + + + sampleID.PROCESS.DATE.gcbplot + deepTools graphics PNG + collection + + + + sampleID.PROCESS.DATE.gcbfreq + tab-separated text file + collection + + + + sampleID.PROCESS.DATE._peaks.xls + XLS table + collection + Standard MACS2 output XLS table for broad and narrow marks + + + sampleID.PROCESS.DATE._peaks.broadPeak + broadPeak + collection + Standard MACS2 output in ENCODE's broadPeak format for broad marks, this file is usually used for subsequent analyses + + + sampleID.PROCESS.DATE._peaks.gappedPeak + gappedPeak + collection + Standard MACS2 output in ENCODE's gappedPeak format for broad marks + + + sampleID.PROCESS.DATE._summits.bed + BED + collection + Standard MACS2 output for narrow marks + + + sampleID.PROCESS.DATE._peaks.narrowPeak + narrowPeak + collection + Standard MACS2 output for narrow marks, this file is usually used for subsequent analyses + + + sampleIDs.PROCESS.DATE.bamcomp.scalemethod + bigwig + collection + Input-normalized histone signal tracks + + + sampleID.PROCESS.DATE.bamcov.seqDepthNorm + bigwig + collection + Sequencing-depth normalized signal coverage tracks + + + sampleID.PROCESS.DATE.ctrlreg + graphics PNG + collection + A plot of a set of control regions for each histone mark (histone and input signal). Attention: this plot can only be used for a rough quality assessment (experiment fail or success), you cannot base any interpretation on this plot. + + + + + region_filter.py + 0.1 + + GALvX_histone, GALvX_input + Script to generate a temporary BAM file with ENCODE blacklist regions excluded, only relevant for bamCorrelate tool. The filtered BAM files are discarded at the end of this process. One temporary file per histone mark plus the input is generated. + + + bamCorrelate (deepTools) + 1.5.8.1 + + + Window/bin size of 1kb since multiple narrow signals will be merged with default value (10kb), 1m samples + + + bamFingerprint (deepTools) + 1.5.8.1 + + + + + + computeGCBias (deepTools) + 1.5.8.1 + + GALvX_histone, GALvX_input + + + + MACS2 + macs2 2.1.0.20140616 + + GALvX_histone + parameter "--broad" for samples H3K4me1/H3K27me3/H3K36me/H3K9me3; default q-value cutoff of 0.05 is recommended by the author at least for broad marks and approved by A. Richter for all marks + + + bamCompare (deepTools) + 1.5.8.1 + + GALvX_histone + scaling_method: "readCount" for samples H3K27me3/H3K9me3, "SES" else + + + bamCoverage (deepTools) + 1.5.8.1 + + GALvX_histone, GALvX_input + report read coverage normalized to 1x sequencing depth + + + signal_plotter.py + 0.1 + + only for histone marks: sampleID.PROCESS.DATE.bamcov.seqDepthNorm + + + + diff --git a/publication/supp_file_6.amd.tsv b/publication/supp_file_6.amd.tsv new file mode 100644 index 0000000..99d21eb --- /dev/null +++ b/publication/supp_file_6.amd.tsv @@ -0,0 +1,88 @@ +# Comment: +# real name of file is 44_Mm04_WEAd_C11_chipseq_F.CHPv2.20141024.amd + +[Description] +run_by_user pebert +run_machine deep11 +process CHPv2 +date 20141024 +run_start 2014-10-24 17:52:39.814533 +dac_run_id DAC141024s277 +software 0.2r375 +run_end 2014-10-25 12:33:21.202072 + +[Inputs] +GALvX_index 44_Mm04_WEAd_C11_H3K36me3_F.bwa.20141009.bam.bai,44_Mm04_WEAd_C11_H3K4me1_F.bwa.20141009.bam.bai,44_Mm04_WEAd_C11_Input_F.bwa.20141010.bam.bai,44_Mm04_WEAd_C11_H3K4me3_F.bwa.20141009.bam.bai,44_Mm04_WEAd_C11_H3K9me3_F.bwa.20141010.bam.bai,44_Mm04_WEAd_C11_H3K27ac_F.bwa.20141009.bam.bai,44_Mm04_WEAd_C11_H3K27me3_F.bwa.20141010.bam.bai +GALvX_histone 44_Mm04_WEAd_C11_H3K36me3_F.bwa.20141009.bam,44_Mm04_WEAd_C11_H3K4me1_F.bwa.20141009.bam,44_Mm04_WEAd_C11_H3K4me3_F.bwa.20141009.bam,44_Mm04_WEAd_C11_H3K9me3_F.bwa.20141010.bam,44_Mm04_WEAd_C11_H3K27ac_F.bwa.20141009.bam,44_Mm04_WEAd_C11_H3K27me3_F.bwa.20141010.bam +GALvX_input 44_Mm04_WEAd_C11_Input_F.bwa.20141010.bam + +[Outputs] +samplesID.PROCESS.DATE.corplot.cormethod 44_Mm04_WEAd_C11_chipseq_F.CHPv2.20141024.corplot.pearson.png +samplesID.PROCESS.DATE.fgprplot 44_Mm04_WEAd_C11_chipseq_F.CHPv2.20141024.fgpplot.png +sampleID.PROCESS.DATE.gcbplot 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_Input_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024.gcbplot.png,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024.gcbplot.png +sampleID.PROCESS.DATE.gcbfreq 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_Input_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024.gcbfreq.txt,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024.gcbfreq.txt +sampleID.PROCESS.DATE._peaks.xls 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024._peaks.xls,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024._peaks.xls,44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024._peaks.xls,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024._peaks.xls,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024._peaks.xls,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024._peaks.xls +sampleID.PROCESS.DATE._peaks.broadPeak 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024._peaks.broadPeak,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024._peaks.broadPeak,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024._peaks.broadPeak,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024._peaks.broadPeak +sampleID.PROCESS.DATE._peaks.gappedPeak 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024._peaks.gappedPeak,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024._peaks.gappedPeak,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024._peaks.gappedPeak,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024._peaks.gappedPeak +sampleID.PROCESS.DATE._summits.bed 44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024._summits.bed,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024._summits.bed +sampleID.PROCESS.DATE._peaks.narrowPeak 44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024._peaks.narrowPeak,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024._peaks.narrowPeak +sampleIDs.PROCESS.DATE.bamcomp.scalemethod 44_Mm04_WEAd_C11_H3K36me3-log2-Input_F.CHPv2.20141024.bamcomp.SES.bw,44_Mm04_WEAd_C11_H3K4me1-log2-Input_F.CHPv2.20141024.bamcomp.SES.bw,44_Mm04_WEAd_C11_H3K4me3-log2-Input_F.CHPv2.20141024.bamcomp.SES.bw,44_Mm04_WEAd_C11_H3K9me3-log2-Input_F.CHPv2.20141024.bamcomp.readCount.bw,44_Mm04_WEAd_C11_H3K27ac-log2-Input_F.CHPv2.20141024.bamcomp.SES.bw,44_Mm04_WEAd_C11_H3K27me3-log2-Input_F.CHPv2.20141024.bamcomp.readCount.bw +sampleID.PROCESS.DATE.bamcov.seqdepthnorm 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_Input_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024.bamcov.seqDepthNorm.bw,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024.bamcov.seqDepthNorm.bw +sampleID.PROCESS.DATE.ctrlreg 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024.ctrlreg.png,44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024.ctrlreg.png,44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024.ctrlreg.png,44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024.ctrlreg.png,44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024.ctrlreg.png,44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024.ctrlreg.png + +[References] +filtered_regions mm10_ENC.DKFZ_blacklist.bed +reference_genome GRCm38mm10.2bit +plot_regions histone_ChIP_pos_ctrl_regions_mm10.bed + +[Parameters] +labels 44_Mm04_WEAd_C11_H3K36me3_F,44_Mm04_WEAd_C11_H3K4me1_F,44_Mm04_WEAd_C11_Input_F,44_Mm04_WEAd_C11_H3K4me3_F,44_Mm04_WEAd_C11_H3K9me3_F,44_Mm04_WEAd_C11_H3K27ac_F,44_Mm04_WEAd_C11_H3K27me3_F +all_median_fraglen 190 +numproc 16 +genomesize 2650000000 +h3k36me3_median_fraglen 239 +h3k4me1_median_fraglen 190 +input_median_fraglen 121 +h3k4me3_median_fraglen 173 +h3k9me3_median_fraglen 252 +h3k27ac_median_fraglen 175 +h3k27me3_median_fraglen 255 +h3k36me3_broad True +h3k36me3_name_prefix 44_Mm04_WEAd_C11_H3K36me3_F.CHPv2.20141024. +h3k4me1_broad True +h3k4me1_name_prefix 44_Mm04_WEAd_C11_H3K4me1_F.CHPv2.20141024. +h3k4me3_broad False +h3k4me3_name_prefix 44_Mm04_WEAd_C11_H3K4me3_F.CHPv2.20141024. +h3k9me3_broad True +h3k9me3_name_prefix 44_Mm04_WEAd_C11_H3K9me3_F.CHPv2.20141024. +h3k27ac_name_prefix 44_Mm04_WEAd_C11_H3K27ac_F.CHPv2.20141024. +h3k27ac_broad False +h3k27me3_name_prefix 44_Mm04_WEAd_C11_H3K27me3_F.CHPv2.20141024. +h3k27me3_broad True +h3k36me3_scaling_method SES +h3k4me1_scaling_method SES +h3k4me3_scaling_method SES +h3k9me3_scaling_method readCount +h3k27ac_scaling_method SES +h3k27me3_scaling_method readCount + +[Metrics] +read_count_h3k36me3 149416256 +peak_count_h3k36me3 41290 +frip_h3k36me3 0.56 +read_count_h3k4me1 123384550 +peak_count_h3k4me1 103773 +frip_h3k4me1 0.46 +read_count_input 214965156 +read_count_h3k4me3 117531972 +peak_count_h3k4me3 30418 +frip_h3k4me3 0.62 +read_count_h3k9me3 232803908 +peak_count_h3k9me3 125819 +frip_h3k9me3 0.44 +read_count_h3k27ac 98939872 +peak_count_h3k27ac 98324 +frip_h3k27ac 0.51 +read_count_h3k27me3 280728496 +peak_count_h3k27me3 98020 +frip_h3k27me3 0.27 \ No newline at end of file diff --git a/publication/supp_file_7_schema.xsd b/publication/supp_file_7_schema.xsd new file mode 100644 index 0000000..7c2b4c4 --- /dev/null +++ b/publication/supp_file_7_schema.xsd @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/publication/supp_files.zip b/publication/supp_files.zip new file mode 100644 index 0000000..512c3ea Binary files /dev/null and b/publication/supp_files.zip differ diff --git a/publication/suppmat.md b/publication/suppmat.md new file mode 100644 index 0000000..4f19320 --- /dev/null +++ b/publication/suppmat.md @@ -0,0 +1,8 @@ +# Supplemental Material + +The material hosted here is **for reference only**. Please **do not use** it to create new *Process* documents or to validate existing documents. It is kept *as-is* to reflect the state at the time of publication. + +The supplemental material belongs to the following publication: + +["A general concept for consistent documentation of computational analyses", *Database* 2015](https://dx.doi.org/10.1093/database/bav050) +