Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
pHDee/run_pipeline.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
894 lines (863 sloc)
35.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Python pipeline for end-to-end analysis of HiChIP data using various tools. | |
# This script uses the following softwares/tools when run from start to end. | |
# 1. HiC-Pro -- from fastq files to (normalized) contact matrices | |
# 2. HiCPlotter -- static visualization of matrices and tracks etc. | |
# 4. HiCcompare -- differential comparison of matrices | |
# 5. Fit-Hi-C -- calling siginificant interactions | |
# 6. Juicebox -- dynamic visualization | |
# | |
# Author: Sarvesh Nikumbh | |
# Email: snikumbh@mpi-inf.mpg.de | |
# | |
# Date: May 07, 2018 | |
# | |
# | |
# import statements | |
import os, sys, datetime, ast | |
import shutil, subprocess | |
import argparse, glob | |
from random import randint | |
from HiCPlotter import HiCplotter | |
import logging | |
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\ | |
add_help = False, \ | |
description =""" | |
Pipeline for processing HiChIP data from end-to-end. | |
Usage example: $python run_pipeline.py -c umbrella-config-file.txt -s do_all | |
""") | |
pR = parser.add_argument_group('Required arguments') | |
pR.add_argument('-c', '--config_file', \ | |
help = "Specify the umbrella config file.", \ | |
default = 'umbrella-config-file.txt', \ | |
required = True) | |
# get the user to specify which function to perform; correspondingly use only | |
# that portion of variables from config file | |
pR.add_argument('-st', '--stage', | |
help = 'Specify what to run. Possible values: \ | |
[do_hicpro, do_hicplotter, do_fithic, do_juicebox, \ | |
do_differential, suggest_resolution, do_all]. \ | |
Non-relevant details from the config file will \ | |
be ignored. When *do_all*, we perform sequentially \ | |
do_hicpro, do_fithic, and do_juicebox. \ | |
For others, run the pipeline individually.', \ | |
default = 'all', \ | |
nargs = '+', \ | |
required = True) | |
pO = parser.add_argument_group('Optional arguments') | |
# execute or do not execute commands; just print them | |
pO.add_argument('-ne', '--no_execute', | |
help = 'Specify this if you do not \ | |
want to execute commands right away; just \ | |
print them in the commands logfile.', \ | |
action = 'store_true') | |
pO.add_argument('-h', '--help', help = "Show this message and exit.", action = 'help') | |
args = parser.parse_args() | |
def log_pipeline_end_message(endsWithMsg): | |
""" | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
logger.info("---- Pipeline pHDee ends with {} ----".format(endsWithMsg)) | |
if endsWithMsg == "ERROR": | |
logger.info("See the error log for any errors related to running the tools") | |
logger.info("Any other errors are reported in this log") | |
return | |
def assert_config_options(config, stages_to_run): | |
""" | |
""" | |
#DONE: 1. pHDee.resolutions cannot be an empty string // done when | |
# reading config file | |
#DONE: 2. hiccompare resolutions should be a subset of resolutions | |
#DONE: 3. number of inner and outer sample names should be same | |
#TODO: 4. For hadling more than one inner sample names per outer sample name, | |
# we could use comma as a separator | |
# | |
logger = logging.getLogger(pipeline_run_name) | |
# Assert 3 | |
if len(config['pHDee.outer_sample_names']) != \ | |
len(config['pHDee.inner_sample_names']): | |
logger.error("Check the config file, #inner and outer sample names should be equal") | |
log_pipeline_end_message("ERROR") | |
sys.exit(1) | |
# Assert 2 | |
# check if we should care about this; if do_differential is not asked, we can defer | |
if stages_to_run['do_differential']: | |
# assertions relevant to the 'do_differential' stage | |
if not set(config['pHDee.hiccompare.resolutions']).issubset(config['pHDee.resolutions']): | |
if len(set(config['pHDee.resolutions'])) >= set(config['pHDee.hiccompare.resolutions']): | |
d = set(config['pHDee.resolutions']).difference(\ | |
config['pHDee.hiccompare.resolutions']) | |
else: | |
d = set(config['pHDee.hiccompare.resolutions']).difference(\ | |
config['pHDee.resolutions']) | |
# It could be that pHDee.resolutions specified in the config file are unrelated | |
# Then, this is not necessarily an error; thus, we only warn! | |
logger.warn("Check the following resolutions specified for HiCcompare {}. HiCPro only run for resolution(s) {}".\ | |
format(list(d), config['pHDee.resolutions'])) | |
# | |
return | |
def read_config_file(config_file): | |
""" | |
Read the config file | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
param_dict = {} | |
with open(config_file, 'r') as f: | |
param_lines = [x.strip() for x in f.readlines() if x.find('#') == -1] | |
param_lines = [x for x in param_lines if len(x) > 0] | |
for line in param_lines: | |
param_dict[line.split('=')[0].strip()] = line.split('=')[1].strip() | |
# Collect the multiple sample names as list | |
param_dict.__setitem__('pHDee.inner_sample_names', \ | |
param_dict['pHDee.inner_sample_names'].split(' ')) | |
param_dict.__setitem__('pHDee.outer_sample_names', | |
param_dict['pHDee.outer_sample_names'].split(' ')) | |
param_dict.__setitem__('pHDee.hiccompare.group_id', | |
param_dict['pHDee.hiccompare.group_id'].split(' ')) | |
# Store the pHDee.resolutions as a separate list. | |
# Note: The check that it is not empty has to happen here instead | |
# of the function 'assert_config_options' | |
if param_dict['pHDee.resolutions'] != '': | |
param_dict['pHDee.resolutions'] = [int(x) for x in param_dict['pHDee.resolutions'].split(' ')] | |
else: | |
logger.warn("Resolutions is empty. Specify at least one value for resolutions") | |
# Similarly, for hiccompare.resolutions | |
if param_dict['pHDee.hiccompare.resolutions'] != '': | |
param_dict['pHDee.hiccompare.resolutions'] = [int(x) for x in param_dict['pHDee.hiccompare.resolutions'].split(' ')] | |
else: | |
logger.warn("Resolutions is empty. Specify at least one value for resolutions") | |
# | |
param_dict['no_execute'] = False | |
# | |
return param_dict | |
def create_hicpro_input_folder(config, sampleID): | |
""" | |
Create input folder for HiC-Pro | |
Assumption: Only one sample (i.e. inner sample name) | |
@param: config | |
""" | |
#NOTE: creating this input folder can be omitted altogether | |
# in case there are multiple samples to be combined. | |
# The multiple samples condition is not handled at the moment. | |
# Handling this will warrant changes in the way inner_sample_names | |
# are specified in the config file. | |
logger = logging.getLogger(pipeline_run_name) | |
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, sampleID) | |
logger.info("Creating input folder {}".format(hicpro_input_folder)) | |
if config['no_execute'] is False: | |
if os.path.exists(hicpro_input_folder) is False: | |
os.makedirs(hicpro_input_folder \ | |
+ '/' + config['pHDee.inner_sample_names'][sampleID]) | |
else: | |
# Maybe, raise a warning for the user? | |
# # warnings.warn("Input folder exists, will be overwritten") | |
logger.warning("Input folder exists. Exiting.") | |
sys.exit(1) | |
# copy fastq files | |
relevant_files = [x for x in os.listdir(config['pHDee.input_folder']) if x.find(config['pHDee.outer_sample_names'][sampleID]) != -1] | |
relevant_files = [x for x in relevant_files if x.find('fastq.gz') != -1] | |
assert(len(relevant_files) == 2) # for the two mates | |
for f in relevant_files: | |
if os.path.exists(hicpro_input_folder+ '/' + config['pHDee.inner_sample_names'][sampleID]) is True: | |
shutil.copyfile('/'.join([config['pHDee.input_folder'], f]), \ | |
'/'.join([hicpro_input_folder, config['pHDee.inner_sample_names'][sampleID], f])) | |
# copy config file | |
shutil.copyfile(config['pHDee.hicpro.config_filename'], hicpro_input_folder +'/config-hicpro.txt') | |
# Replace the resolution values with given values for HiC-Pro | |
with open(config['pHDee.hicpro.config_filename'], 'r') as f: | |
old_config_lines = f.readlines() | |
new_config_lines = old_config_lines | |
for k,l in enumerate(old_config_lines): | |
if l.find('BIN_SIZE') != -1: | |
new_config_lines[k] = 'BIN_SIZE = ' + ' '.join([str(x) for x in config['pHDee.resolutions']])+'\n' | |
with open(hicpro_input_folder +'/config-hicpro.txt', 'w') as f: | |
f.writelines(new_config_lines) | |
# | |
config['pHDee.hicpro.config_filename'] = '/'.join([ hicpro_input_folder, config['pHDee.hicpro.config_filename'] ]) | |
logger.info("HiC-Pro config file name updated to {}".format(config['pHDee.hicpro.config_filename'])) | |
return | |
def create_hicpro_foldernames(config, sampleID): | |
""" | |
Returns the folder names for HiC-Pro input and output | |
@param: config | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
location = config['pHDee.output_folder'] + '/' | |
# | |
input_folder_l = [] | |
if config['pHDee.experiment_type'] != '': | |
input_folder_l.append(config['pHDee.experiment_type']) | |
if config['pHDee.celltype'] != '': | |
input_folder_l.append(config['pHDee.celltype']) | |
if config['pHDee.outer_sample_names'][sampleID] != '': | |
input_folder_l.append(config['pHDee.outer_sample_names'][sampleID]) | |
if config['pHDee.condition'] != '': | |
input_folder_l.append(config['pHDee.condition']) | |
if config['pHDee.replicate'] != '': | |
input_folder_l.append(config['pHDee.replicate']) | |
# | |
input_folder = '_'.join(input_folder_l) \ | |
+ config['pHDee.hicpro.input_folder_suffix'] | |
input_folder = location + input_folder | |
# This if-else can be removed. It exists currently to handle | |
# the difference in naming schemes in the untreated, tumor | |
# and dox conditions | |
if config['pHDee.condition'] == 'untreated' or config['pHDee.condition'] == 'tumor': | |
output_folder_l = [] | |
if config['pHDee.experiment_type'] != '': | |
output_folder_l.append(config['pHDee.experiment_type']) | |
if config['pHDee.celltype'] != '': | |
output_folder_l.append(config['pHDee.celltype']) | |
if config['pHDee.outer_sample_names'][sampleID] != '': | |
output_folder_l.append(config['pHDee.outer_sample_names'][sampleID]) | |
if config['pHDee.condition'] != '': | |
output_folder_l.append(config['pHDee.condition']) | |
if config['pHDee.replicate'] != '': | |
output_folder_l.append(config['pHDee.replicate']) | |
output_folder = '_'.join(output_folder_l) \ | |
+ config['pHDee.hicpro.output_folder_suffix'] | |
else: | |
output_folder_l = [] | |
if config['pHDee.experiment_type'] != '': | |
output_folder_l.append(config['pHDee.experiment_type']) | |
if config['pHDee.celltype'] != '': | |
output_folder_l.append(config['pHDee.celltype']) | |
if config['pHDee.outer_sample_names'][sampleID] != '': | |
output_folder_l.append(config['pHDee.outer_sample_names'][sampleID]) | |
if config['pHDee.condition'] != '': | |
output_folder_l.append(config['pHDee.condition']) | |
if config['pHDee.replicate'] != '': | |
output_folder_l.append(config['pHDee.replicate']) | |
output_folder = '_'.join(output_folder_l) \ | |
+ config['pHDee.hicpro.output_folder_suffix'] | |
output_folder = location + output_folder | |
return input_folder, output_folder | |
def run_hicpro(config): | |
""" | |
Run HiC-Pro | |
@param: config | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
logger.info("Running HiC-Pro") | |
for idx in range(len(config['pHDee.outer_sample_names'])): | |
# Create input folder | |
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx) | |
create_hicpro_input_folder(config, idx) | |
## Get the BAM or fastq files ready | |
##get_bam_or_fastq_ready(config) | |
# Run HiC-Pro | |
hicpro_path = config['pHDee.hicpro.path'] + '/bin/HiC-Pro' | |
hicpro_cmd = ['time', hicpro_path, '-i', hicpro_input_folder, '-o', hicpro_output_folder, \ | |
'-c', config['pHDee.hicpro.config_filename']] | |
logger.debug("# Run HiC-Pro:") | |
logger.debug(' '.join(hicpro_cmd)) | |
if config['no_execute'] is False: | |
err_log = open(config['pHDee.err_log'], "a") | |
cmd_log = open(config['pHDee.hicpro.output_file'], "a") | |
subprocess.call(hicpro_cmd, stdout=cmd_log, stderr=err_log) | |
cmd_log.close() | |
err_log.close() | |
# Clear intermediate files from the mapping/alignment step | |
logger.info("Clearing intermediate bowtie files") | |
if os.path.exists(hicpro_output_folder+ '/bowtie_results/bwt2_global/') is False: | |
logger.error("Check if HiC-Pro completed successfully. Exiting.") | |
log_pipeline_end_message("ERROR") | |
sys.exit(1) | |
else: | |
shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_global/')) | |
shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_local/')) | |
return | |
def hicpro_matrix_foldername(config, sampleID): | |
""" | |
Returns the path to raw or iced folder | |
in the HiCPro structure | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
_, hicpro_output_folder = create_hicpro_foldernames(config, sampleID) | |
name = os.path.join(hicpro_output_folder, 'hic_results', 'matrix', \ | |
config['pHDee.inner_sample_names'][sampleID]) | |
raw_name = os.path.join(name, 'raw') | |
iced_name = os.path.join(name, 'iced') | |
return raw_name, iced_name | |
def get_identifier(config, sampleID): | |
""" | |
Concat params to create an identifier for files | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
iName_l = [] | |
if config['pHDee.condition'] != '': | |
iName_l.append(config['pHDee.condition']) | |
if config['pHDee.experiment_type'] != '': | |
iName_l.append(config['pHDee.experiment_type']) | |
if config['pHDee.celltype'] != '': | |
iName_l.append(config['pHDee.celltype']) | |
if config['pHDee.outer_sample_names'][sampleID] != '': | |
iName_l.append(config['pHDee.outer_sample_names'][sampleID]) | |
if config['pHDee.replicate'] != '': | |
iName_l.append(config['pHDee.replicate']) | |
iName = '_'.join(iName_l) | |
return iName | |
def run_fithic(config): | |
""" | |
Run Fit-Hi-C | |
@param config | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
#location = config['pHDee.output_folder'] + '/' | |
logger.info("Running Fit-Hi-C") | |
for idx in range(len(config['pHDee.outer_sample_names'])): | |
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx) | |
raw_folder, iced_folder = hicpro_matrix_foldername(config, idx) | |
for res in config['pHDee.resolutions']: | |
# Assert input files exist | |
iced_matrix_filename = os.path.join(iced_folder, str(res), \ | |
'_'.join([config['pHDee.inner_sample_names'][idx], \ | |
str(res), 'iced.matrix']) ) | |
fithic_folder_location = os.path.join( iced_folder, str(res), \ | |
config['pHDee.fithic.output_folder'] ) | |
if config['no_execute'] is False: | |
if os.path.exists(iced_matrix_filename) is False: | |
logger.error("Check if iced matrix exists (or HiC-Pro completed successfully)") | |
log_pipeline_end_message("ERROR") | |
sys.exit(1) | |
# Create Fit-Hi-C folder | |
if os.path.exists(fithic_folder_location) is False: | |
logger.info("Creating folder '{}' at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location)) | |
os.mkdir(fithic_folder_location) | |
else: | |
logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location)) | |
else: | |
logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location)) | |
# Run Fit-HiC using Rscript | |
fithic_cmd = ['Rscript', 'scripts/run_fithic.R', \ | |
# fragsfile | |
os.path.join(raw_folder, str(res), \ | |
config['pHDee.inner_sample_names'][idx] + '_'\ | |
+ str(res) + '_ord.bed'), | |
# intersFile | |
os.path.join(iced_folder, str(res), \ | |
config['pHDee.inner_sample_names'][idx] + '_' \ | |
+ str(res) + '_iced.matrix'), | |
# biasFile | |
os.path.join(iced_folder, str(res), \ | |
config['pHDee.inner_sample_names'][idx] + '_'\ | |
+ str(res) + '_iced.matrix.biases'), | |
# outputDir | |
fithic_folder_location, | |
# name_of_lib | |
'_'.join([ get_identifier(config, idx), \ | |
str(res) ]), | |
# noOfBins, | |
config['pHDee.fithic.noOfBins'], \ | |
# whether_toUseHiCPro | |
'TRUE', | |
os.path.join(fithic_folder_location, \ | |
config['pHDee.fithic.output_log']) | |
] | |
logger.debug("# Run Fit-Hi-C:") | |
logger.debug(' '.join(fithic_cmd)) | |
if config['no_execute'] is False: | |
err_log = open(config['pHDee.err_log'], "a") | |
subprocess.call(fithic_cmd, stderr=err_log, shell=False) | |
#process.wait() | |
logger.info("Resolution: {} ==> Fit-HiC output directed to '{}'".format(res, '/'.join([ fithic_folder_location, \ | |
config['pHDee.fithic.output_log'] ]))) | |
err_log.close() | |
return | |
def call_HiCPlotter(wholeGenomeBool, given_chromosome, hicplotter_params): | |
""" | |
Makes the HiCPlotter function call | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
thisSampleID = hicplotter_params['umbrella_config']\ | |
['pHDee.outer_sample_names'].index(\ | |
hicplotter_params['curr_outer_sample_name']) | |
HiCplotter(files=[hicplotter_params['iced_matrix_file']],\ | |
names = [hicplotter_params['this_plot_title']],\ | |
resolution = hicplotter_params['resolution'],\ | |
chromosome = hicplotter_params['umbrella_config']\ | |
['pHDee.hicplotter.given_chromosome'],\ | |
# this is prefix for the output filename by hicplotter | |
output = os.path.join(hicplotter_params['output_folder'], \ | |
get_identifier(hicplotter_params['umbrella_config'], \ | |
thisSampleID)),\ | |
histograms = [],\ | |
histLabels = [],\ | |
fillHist = [],\ | |
histMax = [],\ | |
verbose = False,\ | |
fileHeader = 0,\ | |
fileFooter = 0,\ | |
matrixMax = 0,\ | |
histColors = [],\ | |
barPlots = [],\ | |
barLabels = [],\ | |
plotGenes = '',\ | |
superImpose = False,\ | |
start = 0,\ | |
end = 0,\ | |
tileLabels = [],\ | |
tilePlots = [],\ | |
tileColors = [],\ | |
tileText = False,\ | |
arcLabels = [],\ | |
arcPlots = [],\ | |
arcColors = [],\ | |
peakFiles = [],\ | |
epiLogos = '',\ | |
window = 5,\ | |
tadRange = 8,\ | |
tripleColumn = True,\ | |
bedFile = hicplotter_params['bed_file'],\ | |
barColors = [],\ | |
dPixels = 600,\ | |
compareEx = '',\ | |
compareSm = '',\ | |
upSide = False,\ | |
smoothNoise = 0.5,\ | |
cleanNANs = True,\ | |
plotTriangular = False,\ | |
plotTadDomains = False,\ | |
randomBins = False,\ | |
wholeGenome = hicplotter_params['umbrella_config']['pHDee.hicplotter.whole_genome'],\ | |
plotPublishedTadDomains = False,\ | |
plotDomainsAsBars = False,\ | |
imputed = False,\ | |
barMax = [],\ | |
spine = False,\ | |
plotDomainTicks = False,\ | |
triangularHeight = False,\ | |
highlights = 0,\ | |
highFile = '',\ | |
heatmapColor = 3,\ | |
highResolution = True,\ | |
plotInsulation = False,\ | |
plotCustomDomains = False,\ | |
publishedTadDomainOrganism = False,\ | |
customDomainsFile = [],\ | |
compare = False,\ | |
pair = False,\ | |
domColors = [],\ | |
oExtension = 'pdf',\ | |
geneLabels = False,\ | |
dark = False) | |
return | |
def plot_with_hicplotter(config): | |
""" | |
Handles static plotting with HiCPlotter: | |
wholeGenome, perChromosome or only givenChromosome | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
logger.info("Running HiCPlotter") | |
hicplotter_params = {} | |
chromosomes = ['chr'+str(x) for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 'X']] | |
if config['no_execute'] is False: | |
for idx in range(len(config['pHDee.outer_sample_names'])): | |
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx) | |
hicplotter_params['curr_outer_sample_name'] = config['pHDee.outer_sample_names'][idx] | |
hicplotter_params['output_folder'] = os.path.join( hicpro_output_folder, \ | |
config['pHDee.hicplotter.output_folder'] ) | |
if os.path.exists(hicplotter_params['output_folder']) is False: | |
logger.info("Creating hicplotter_results directory") | |
os.mkdir(hicplotter_params['output_folder']) | |
else: | |
logger.info("Using existing hicplotter_results directory") | |
hicplotter_params['iced_matrix_file'] = '' | |
hicplotter_params['this_plot_title'] = '' | |
hicplotter_params['bed_file'] = '' | |
hicplotter_params['umbrella_config'] = config | |
raw_folder, iced_folder = hicpro_matrix_foldername(config, idx) | |
for res in config['pHDee.resolutions']: | |
hicplotter_params['resolution'] = res | |
input_filenames_prefix = os.path.join( iced_folder, \ | |
str(res), \ | |
config['pHDee.inner_sample_names'][idx] ) | |
# iced matrix file | |
hicplotter_params['iced_matrix_file'] = '_'.join([ \ | |
input_filenames_prefix, str(res), 'iced.matrix']) | |
# plot title | |
hicplotter_params['this_plot_title'] = '_'.join([ \ | |
get_identifier(config, idx), str(res) ]) | |
# bed file | |
hicplotter_params['bed_file'] = '_'.join([ \ | |
input_filenames_prefix.replace('iced', 'raw'), \ | |
str(res), 'ord.bed']) | |
# | |
if config['pHDee.hicplotter.given_chromosome'] != '': | |
logger.info("Plotting matrix for given chromosome") | |
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \ | |
config['pHDee.hicplotter.given_chromosome'], \ | |
hicplotter_params) | |
# if whole_genome is set, given_chr is overwritten and set to chrY | |
if ast.literal_eval(config['pHDee.hicplotter.whole_genome']): | |
logger.info("Plotting whole genome matrix") | |
config['pHDee.hicplotter.given_chromosome'] = 'chrY' | |
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \ | |
config['pHDee.hicplotter.given_chromosome'], \ | |
hicplotter_params) | |
# if per chromosome intra-chromosomal heatmaps are to be plotted | |
if ast.literal_eval(config['pHDee.hicplotter.per_chromosome']): | |
# iterate with chr set to each chromosome and call; wholeGenome to False | |
logger.info("Plotting matrices per chromosome") | |
for thisChr in chromosomes: | |
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \ | |
thisChr, \ | |
hicplotter_params) | |
logger.info("{}".format(thisChr)) | |
return | |
def create_files_for_juicebox(config): | |
""" | |
Convert HiC-Pro allValidPairs files into | |
juicebox format for viewing | |
@param: config | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
logger.info("Running conversion of files for juicebox") | |
with open(config['pHDee.hicpro.config_filename'], 'r') as f: | |
for line in f.readlines(): | |
#if line.split('=')[0].strip() == 'GENOME_SIZE': | |
# genome_size_file = config['pHDee.hicpro.path'] + '/annotation/' \ | |
# + line.strip().split('=')[1].strip() | |
# # should use a file named hg19.chrom.sizes | |
if line.split('=')[0].strip() == 'GENOME_FRAGMENT': | |
restriction_fragment_file = config['pHDee.hicpro.path'] + '/annotation/' \ | |
+ line.strip().split('=')[1].strip() | |
# | |
for idx in range(len(config['pHDee.outer_sample_names'])): | |
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx) | |
logger.debug(hicpro_output_folder) | |
# | |
if ast.literal_eval(config['pHDee.juicebox.independent_folder']): | |
results_folder = config['pHDee.juicebox.output_folder'] | |
else: | |
results_folder = os.path.join( hicpro_output_folder, config['pHDee.juicebox.output_folder'] ) | |
tmp_folder = os.path.join( results_folder, config['pHDee.juicebox.temp_folder'] ) | |
if os.path.exists( results_folder ) is False: | |
if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False: | |
logger.info("Creating folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \ | |
hicpro_output_folder)) | |
else: | |
logger.info("Creating folder '{}'".format(config['pHDee.juicebox.output_folder'])) | |
os.mkdir(results_folder) | |
os.mkdir(tmp_folder) | |
else: | |
if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False: | |
logger.info("Using existing folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \ | |
hicpro_output_folder)) | |
else: | |
logger.info("Using existing folder '{}'".format(config['pHDee.juicebox.output_folder'])) | |
# | |
# #INFO: Also copy allValidPairs file | |
#logger.info("Copying allValidPairs file") | |
# #INFO: If the allValidPairs is in the same folder (which was the case after copyiing them), | |
# use them from the current location. Thus, filename is changed from the more generic | |
# HiC-Pro named version to the more specific one used by/identified by this pipeline | |
# | |
#input_filename = os.path.join(hicpro_output_folder, \ | |
# 'hic_results', 'data', \ | |
# config['pHDee.inner_sample_names'][idx], \ | |
# config['pHDee.inner_sample_names'][idx] \ | |
# + config['pHDee.hicpro.allValidPairs_suffix'] ) | |
# | |
#shutil.copyfile(input_filename, '/'.join([ results_folder, \ | |
# get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']])) | |
input_filename = '/'.join([results_folder, \ | |
get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']]) | |
# | |
output_filename = '/'.join([ results_folder, \ | |
get_identifier(config, idx) ]) | |
logger.info("Creating '{}'".format(output_filename+'.hic')) | |
logger.info("Using '{}' as input file".format(input_filename)) | |
logger.info("Using '{}' as temp folder".format(tmp_folder)) | |
logger.info("Using '{}' for reading genome size".format(config['pHDee.juicebox.genome_size_file'])) | |
logger.info("Using '{}' for restriction fragments".format(restriction_fragment_file)) | |
# | |
hicpro2juicebox_cmd = ['bash', config['pHDee.juicebox.hicpro2juicebox_script'], \ | |
'-i', input_filename, \ | |
'-g', config['pHDee.juicebox.genome_size_file'], \ | |
'-j', config['pHDee.juicebox.jar_file'] , \ | |
'-r', restriction_fragment_file, \ | |
'-t', tmp_folder, | |
'-o', output_filename | |
] | |
logger.debug("# Run hicpro2juicebox script") | |
logger.debug(' '.join(hicpro2juicebox_cmd)) | |
if config['no_execute'] is False: | |
err_log = open(config['pHDee.err_log'], "a") | |
cmd_log = open(config['pHDee.juicebox.script_output_file'], "a") | |
subprocess.call(hicpro2juicebox_cmd, stdout=cmd_log, stderr=err_log, shell=False) | |
cmd_log.close() | |
err_log.close() | |
logger.info("You can now use the '{}.hic' file with juicebox on desktop or on the web browser".format(output_filename)) | |
return | |
def do_differential_comparison(config): | |
""" | |
Perform a differential comparison between matrices | |
using HiCcompare. Use raw matrices since HiCcompare | |
jointly normalizes them. | |
@param config | |
Output generated includes | |
-- PDFs of MD plots? | |
-- PDFs of distance-normalization plots? | |
-- TSV file giving statistics on number | |
of total interactions, those with p-val < 0.01, | |
0.01 > = p-val < 0.05 and | |
p-val > 0.05 (one file for all chromosomes) | |
-- CSV file listing all interactions with p-val < 0.01, | |
0.01 > = p-val < 0.05 | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
logger.info("Running differential comparison using HiCcompare") | |
if config['no_execute'] is False: | |
if os.path.exists(os.path.join(config['pHDee.output_folder'], \ | |
config['pHDee.hiccompare.output_folder'])) is False: | |
logger.info("Creating folder '{}' at '{}'".format(\ | |
config['pHDee.hiccompare.output_folder'], \ | |
config['pHDee.output_folder'])) | |
os.mkdir( os.path.join(config['pHDee.output_folder'], \ | |
config['pHDee.hiccompare.output_folder']) ) | |
else: | |
logger.info("Using existing folder '{}' at '{}'".format(\ | |
config['pHDee.hiccompare.output_folder'], \ | |
config['pHDee.output_folder'])) | |
for g in range(len(config['pHDee.hiccompare.group_id'])): | |
logger.info("Comparisons for group '{}'".format(\ | |
config['pHDee.hiccompare.group_id'][g])) | |
# A folder for each group | |
group_foldername = os.path.join(config['pHDee.output_folder'], \ | |
config['pHDee.hiccompare.output_folder'], \ | |
config['pHDee.hiccompare.group_id'][g]) | |
csv_suffix = config['pHDee.hiccompare.csv_folder'] | |
# | |
mdplot_suffix = config['pHDee.hiccompare.md_plots_folder'] | |
# | |
dnplot_suffix = config['pHDee.hiccompare.dist_norm_plots_folder'] | |
if config['no_execute'] is False: | |
if os.path.exists(group_foldername) is False: | |
logger.info("Creating folder for group '{}'".format(\ | |
config['pHDee.hiccompare.group_id'][g])) | |
# Create folder for group with separate folders | |
# for MD plots and distance normalization plots | |
os.makedirs(os.path.join(group_foldername, \ | |
mdplot_suffix)) | |
# Now folder for csv files | |
os.mkdir(os.path.join(group_foldername,\ | |
csv_suffix)) | |
# Now folder for distance normalization plots | |
os.mkdir(os.path.join(group_foldername,\ | |
dnplot_suffix)) | |
# | |
else: | |
logger.info("Using existing folder for group '{}'".format(\ | |
config['pHDee.hiccompare.group_id'][g])) | |
# Folder for CSV files | |
if os.path.exists(os.path.join(group_foldername, \ | |
csv_suffix)) is False: | |
logger.info("Creating folder for CSV files") | |
os.mkdir(os.path.join(group_foldername,\ | |
csv_suffix)) | |
else: | |
logger.info("Using existing folder for CSV files") | |
# Folder for MD plots | |
if os.path.exists(os.path.join(group_foldername, \ | |
mdplot_suffix)) is False: | |
logger.info("Creating folder for MD plots") | |
os.mkdir(os.path.join(group_foldername,\ | |
mdplot_suffix)) | |
else: | |
logger.info("Using existing folder for MD plots") | |
# Folder for distance normalization plots | |
if os.path.exists(os.path.join(group_foldername,\ | |
dnplot_suffix)) is False: | |
logger.info("Creating folder for distance normalization plots") | |
os.mkdir(os.path.join(group_foldername,\ | |
dnplot_suffix)) | |
else: | |
logger.info("Using existing folder for distance normalization plots") | |
# | |
for r in range(len(config['pHDee.hiccompare.resolutions'])): | |
# Read Name1, Name2 and groupID from groupInfo file | |
with open(config['pHDee.hiccompare.groups_info_file'], "r") as f: | |
groupInfo_lines = [x.strip() for x in f.readlines() if x.find("#") == -1] | |
groupIDs = [] | |
INames = [] | |
ONames = [] | |
base_locations = [] | |
for line in groupInfo_lines: | |
[groupID, OName, IName, base_location, uniqueValInt] = [x.strip() for x in line.split("\t")] | |
if(groupID == config['pHDee.hiccompare.group_id'][g]): | |
groupIDs.append(groupID) | |
INames.append(IName) | |
ONames.append(OName) | |
base_locations.append(base_location) | |
# | |
for aIDX in range(0,len(ONames)): | |
n1 = ONames[aIDX] | |
mf1 = os.path.join(base_locations[aIDX], \ | |
ONames[aIDX] + \ | |
config['pHDee.hicpro.output_folder_suffix'], \ | |
"hic_results", "matrix", INames[aIDX], "raw", \ | |
str(config['pHDee.hiccompare.resolutions'][r]), \ | |
INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix") | |
# | |
bf1 = os.path.join(base_locations[aIDX], \ | |
ONames[aIDX] + \ | |
config['pHDee.hicpro.output_folder_suffix'],\ | |
"hic_results", "matrix", INames[aIDX], "raw",\ | |
str(config['pHDee.hiccompare.resolutions'][r]), | |
INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed") | |
# | |
for bIDX in range(aIDX, len(ONames)): | |
if ONames[aIDX] != ONames[bIDX]: | |
n2 = ONames[bIDX] | |
mf2 = os.path.join(base_locations[aIDX], \ | |
ONames[bIDX] + \ | |
config['pHDee.hicpro.output_folder_suffix'], \ | |
"hic_results", "matrix", INames[bIDX], "raw", \ | |
str(config['pHDee.hiccompare.resolutions'][r]), \ | |
INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix") | |
# | |
bf2 = os.path.join(base_locations[aIDX], \ | |
ONames[bIDX] + \ | |
config['pHDee.hicpro.output_folder_suffix'],\ | |
"hic_results", "matrix", INames[bIDX], "raw",\ | |
str(config['pHDee.hiccompare.resolutions'][r]), | |
INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed") | |
# | |
diff_comparison_cmd = [ 'Rscript', \ | |
'scripts/make_differential_comparisons/compare_contact_matrices_for_chromosomes.R', \ | |
# Name1 and Name2 | |
n1, n2, \ | |
# resolution | |
str(config['pHDee.hiccompare.resolutions'][r]), \ | |
# matfile, bedfile / 1 and 2 | |
mf1, bf1, \ | |
mf2, bf2, \ | |
# result_location | |
os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder']), \ | |
# groupID | |
groupIDs[0], \ | |
# rOut file | |
os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder'], | |
"output_comparing_contact_matrices_group"+ config['pHDee.hiccompare.group_id'][g] + "_" + \ | |
str(config['pHDee.hiccompare.resolutions'][r])+".rOut") | |
] | |
logger.info("# Running differential comparison") | |
logger.info("Resolution: {}".format(\ | |
str(config['pHDee.hiccompare.resolutions'][r]))) | |
logger.debug(' '.join(diff_comparison_cmd)) | |
if config['no_execute'] is False: | |
logger.debug("Resolution: {}".format(\ | |
str(config['pHDee.hiccompare.resolutions'][r]))) | |
err_log = open(config['pHDee.err_log'], "a") | |
subprocess.call(diff_comparison_cmd, stderr=err_log, shell=False) | |
err_log.close() | |
return | |
def setup_stages_for_pipeline_to_run(args): | |
""" | |
Manage the stages to run | |
""" | |
logger = logging.getLogger(pipeline_run_name) | |
stages_to_run = {'do_all': False, \ | |
'do_hicpro': False, \ | |
'do_fithic': False, \ | |
'do_hicplotter': False, \ | |
'do_juicebox': False, \ | |
'do_differential': False, \ | |
'suggest_resolution': False | |
} | |
given_stages = args.stage | |
# Check if all stages are specified using valid keywords | |
for given_stage in given_stages: | |
if given_stage not in stages_to_run.keys(): | |
logger.error("'{}' is not a valid keyword for stage. \nValid stages: {}".format(given_stage, ', '.join(stages_to_run.keys() + ['do_all'] ))) | |
log_pipeline_end_message("ERROR") | |
sys.exit(1) | |
logger.info("Stage(s) to run: {}".format(', '.join(given_stages))) | |
for given_stage in given_stages: | |
if given_stage == 'do_all': | |
stages_to_run['do_hicpro'] = True | |
stages_to_run['do_fithic'] = True | |
stages_to_run['do_juicebox'] = True | |
elif given_stage == 'do_hicpro': | |
stages_to_run['do_hicpro'] = True | |
elif given_stage == 'do_fithic': | |
stages_to_run['do_fithic'] = True | |
elif given_stage == 'do_juicebox': | |
stages_to_run['do_juicebox'] = True | |
elif given_stage == 'do_hicplotter': | |
stages_to_run['do_hicplotter'] = True | |
elif given_stage == 'do_differential': | |
stages_to_run['do_differential'] = True | |
return stages_to_run | |
def handle_logging(): | |
""" | |
""" | |
#set up logging to file and console, both | |
rand_id = randint(1000, 9999) | |
log_filename = 'pHDee-' + str(rand_id) +'.log' | |
global pipeline_run_name | |
pipeline_run_name = 'pHDee-' + str(rand_id) | |
logging.basicConfig(level=logging.DEBUG, format='\n%(asctime)s - %(name)s - %(levelname)s - %(message)s',\ | |
datefmt='%Y-%m-%d %H:%M:%S', filename=log_filename, filemode='a') | |
# define a Handler which writes INFO messages or higher to the sys.stderr | |
console = logging.StreamHandler() | |
console.setLevel(logging.INFO) | |
# format | |
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') | |
console.setFormatter(formatter) | |
# add console handler | |
logging.getLogger('').addHandler(console) | |
# get a logger | |
logger = logging.getLogger(pipeline_run_name) | |
return logger | |
def main(): | |
""" | |
Main function | |
""" | |
logger = handle_logging() | |
logger.info("---- Pipeline pHDee begins ----") | |
logger.info("Log file: {}.log".format(pipeline_run_name)) | |
stages_to_run = setup_stages_for_pipeline_to_run(args) | |
# Read the config file; check assertions | |
config = read_config_file(args.config_file) | |
assert_config_options(config, stages_to_run) | |
if args.no_execute is True: | |
config['no_execute'] = True | |
# Does this ensure the order? | |
if stages_to_run['do_hicpro']: | |
# Run HiC-Pro | |
run_hicpro(config) | |
logger.info("do_hicpro [DONE].") | |
# | |
if stages_to_run['do_fithic']: | |
# Run Fit-HiC | |
run_fithic(config) | |
logger.info("do_fithic [DONE].") | |
# | |
if stages_to_run['do_hicplotter']: | |
# 1. User should ensure that hicpro completed successfully | |
# 2. Run HiCPlotter | |
plot_with_hicplotter(config) | |
logger.info("do_hicplotter [DONE].") | |
# | |
if stages_to_run['do_juicebox']: | |
# 1. Check that hicpro run is completed | |
# 2. Convert to juicebox format | |
create_files_for_juicebox(config) | |
logger.info("do_juicebox [DONE].") | |
# | |
if stages_to_run['do_differential']: | |
# Perform differential comparisons | |
do_differential_comparison(config) | |
logger.info("do_differential [DONE].") | |
# | |
if stages_to_run['suggest_resolution']: | |
# TODO | |
# Juicer tools has a shell script to do this, but it | |
# requires the Juicer-specific merge_nodups.txt file | |
logger.info("suggest_resolution [TODO].") | |
log_pipeline_end_message("SUCCESS") | |
logging.shutdown() | |
if __name__ == '__main__': | |
main() | |