Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python
# Python pipeline for end-to-end analysis of HiChIP data using various tools.
# This script uses the following softwares/tools when run from start to end.
# 1. HiC-Pro -- from fastq files to (normalized) contact matrices
# 2. HiCPlotter -- static visualization of matrices and tracks etc.
# 4. HiCcompare -- differential comparison of matrices
# 5. Fit-Hi-C -- calling siginificant interactions
# 6. Juicebox -- dynamic visualization
#
# Author: Sarvesh Nikumbh
# Email: snikumbh@mpi-inf.mpg.de
#
# Date: May 07, 2018
#
#
# import statements
import os, sys, datetime, ast
import shutil, subprocess
import argparse, glob
from random import randint
from HiCPlotter import HiCplotter
import logging
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
add_help = False, \
description ="""
Pipeline for processing HiChIP data from end-to-end.
Usage example: $python run_pipeline.py -c umbrella-config-file.txt -s do_all
""")
pR = parser.add_argument_group('Required arguments')
pR.add_argument('-c', '--config_file', \
help = "Specify the umbrella config file.", \
default = 'umbrella-config-file.txt', \
required = True)
# get the user to specify which function to perform; correspondingly use only
# that portion of variables from config file
pR.add_argument('-st', '--stage',
help = 'Specify what to run. Possible values: \
[do_hicpro, do_hicplotter, do_fithic, do_juicebox, \
do_differential, suggest_resolution, do_all]. \
Non-relevant details from the config file will \
be ignored. When *do_all*, we perform sequentially \
do_hicpro, do_fithic, and do_juicebox. \
For others, run the pipeline individually.', \
default = 'all', \
nargs = '+', \
required = True)
pO = parser.add_argument_group('Optional arguments')
# execute or do not execute commands; just print them
pO.add_argument('-ne', '--no_execute',
help = 'Specify this if you do not \
want to execute commands right away; just \
print them in the commands logfile.', \
action = 'store_true')
pO.add_argument('-h', '--help', help = "Show this message and exit.", action = 'help')
args = parser.parse_args()
def log_pipeline_end_message(endsWithMsg):
"""
"""
logger = logging.getLogger(pipeline_run_name)
logger.info("---- Pipeline pHDee ends with {} ----".format(endsWithMsg))
if endsWithMsg == "ERROR":
logger.info("See the error log for any errors related to running the tools")
logger.info("Any other errors are reported in this log")
return
def assert_config_options(config, stages_to_run):
"""
"""
#DONE: 1. pHDee.resolutions cannot be an empty string // done when
# reading config file
#DONE: 2. hiccompare resolutions should be a subset of resolutions
#DONE: 3. number of inner and outer sample names should be same
#TODO: 4. For hadling more than one inner sample names per outer sample name,
# we could use comma as a separator
#
logger = logging.getLogger(pipeline_run_name)
# Assert 3
if len(config['pHDee.outer_sample_names']) != \
len(config['pHDee.inner_sample_names']):
logger.error("Check the config file, #inner and outer sample names should be equal")
log_pipeline_end_message("ERROR")
sys.exit(1)
# Assert 2
# check if we should care about this; if do_differential is not asked, we can defer
if stages_to_run['do_differential']:
# assertions relevant to the 'do_differential' stage
if not set(config['pHDee.hiccompare.resolutions']).issubset(config['pHDee.resolutions']):
if len(set(config['pHDee.resolutions'])) >= set(config['pHDee.hiccompare.resolutions']):
d = set(config['pHDee.resolutions']).difference(\
config['pHDee.hiccompare.resolutions'])
else:
d = set(config['pHDee.hiccompare.resolutions']).difference(\
config['pHDee.resolutions'])
# It could be that pHDee.resolutions specified in the config file are unrelated
# Then, this is not necessarily an error; thus, we only warn!
logger.warn("Check the following resolutions specified for HiCcompare {}. HiCPro only run for resolution(s) {}".\
format(list(d), config['pHDee.resolutions']))
#
return
def read_config_file(config_file):
"""
Read the config file
"""
logger = logging.getLogger(pipeline_run_name)
param_dict = {}
with open(config_file, 'r') as f:
param_lines = [x.strip() for x in f.readlines() if x.find('#') == -1]
param_lines = [x for x in param_lines if len(x) > 0]
for line in param_lines:
param_dict[line.split('=')[0].strip()] = line.split('=')[1].strip()
# Collect the multiple sample names as list
param_dict.__setitem__('pHDee.inner_sample_names', \
param_dict['pHDee.inner_sample_names'].split(' '))
param_dict.__setitem__('pHDee.outer_sample_names',
param_dict['pHDee.outer_sample_names'].split(' '))
param_dict.__setitem__('pHDee.hiccompare.group_id',
param_dict['pHDee.hiccompare.group_id'].split(' '))
# Store the pHDee.resolutions as a separate list.
# Note: The check that it is not empty has to happen here instead
# of the function 'assert_config_options'
if param_dict['pHDee.resolutions'] != '':
param_dict['pHDee.resolutions'] = [int(x) for x in param_dict['pHDee.resolutions'].split(' ')]
else:
logger.warn("Resolutions is empty. Specify at least one value for resolutions")
# Similarly, for hiccompare.resolutions
if param_dict['pHDee.hiccompare.resolutions'] != '':
param_dict['pHDee.hiccompare.resolutions'] = [int(x) for x in param_dict['pHDee.hiccompare.resolutions'].split(' ')]
else:
logger.warn("Resolutions is empty. Specify at least one value for resolutions")
#
param_dict['no_execute'] = False
#
return param_dict
def create_hicpro_input_folder(config, sampleID):
"""
Create input folder for HiC-Pro
Assumption: Only one sample (i.e. inner sample name)
@param: config
"""
#NOTE: creating this input folder can be omitted altogether
# in case there are multiple samples to be combined.
# The multiple samples condition is not handled at the moment.
# Handling this will warrant changes in the way inner_sample_names
# are specified in the config file.
logger = logging.getLogger(pipeline_run_name)
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, sampleID)
logger.info("Creating input folder {}".format(hicpro_input_folder))
if config['no_execute'] is False:
if os.path.exists(hicpro_input_folder) is False:
os.makedirs(hicpro_input_folder \
+ '/' + config['pHDee.inner_sample_names'][sampleID])
else:
# Maybe, raise a warning for the user?
# # warnings.warn("Input folder exists, will be overwritten")
logger.warning("Input folder exists. Exiting.")
sys.exit(1)
# copy fastq files
relevant_files = [x for x in os.listdir(config['pHDee.input_folder']) if x.find(config['pHDee.outer_sample_names'][sampleID]) != -1]
relevant_files = [x for x in relevant_files if x.find('fastq.gz') != -1]
assert(len(relevant_files) == 2) # for the two mates
for f in relevant_files:
if os.path.exists(hicpro_input_folder+ '/' + config['pHDee.inner_sample_names'][sampleID]) is True:
shutil.copyfile('/'.join([config['pHDee.input_folder'], f]), \
'/'.join([hicpro_input_folder, config['pHDee.inner_sample_names'][sampleID], f]))
# copy config file
shutil.copyfile(config['pHDee.hicpro.config_filename'], hicpro_input_folder +'/config-hicpro.txt')
# Replace the resolution values with given values for HiC-Pro
with open(config['pHDee.hicpro.config_filename'], 'r') as f:
old_config_lines = f.readlines()
new_config_lines = old_config_lines
for k,l in enumerate(old_config_lines):
if l.find('BIN_SIZE') != -1:
new_config_lines[k] = 'BIN_SIZE = ' + ' '.join([str(x) for x in config['pHDee.resolutions']])+'\n'
with open(hicpro_input_folder +'/config-hicpro.txt', 'w') as f:
f.writelines(new_config_lines)
#
config['pHDee.hicpro.config_filename'] = '/'.join([ hicpro_input_folder, config['pHDee.hicpro.config_filename'] ])
logger.info("HiC-Pro config file name updated to {}".format(config['pHDee.hicpro.config_filename']))
return
def create_hicpro_foldernames(config, sampleID):
"""
Returns the folder names for HiC-Pro input and output
@param: config
"""
logger = logging.getLogger(pipeline_run_name)
location = config['pHDee.output_folder'] + '/'
#
input_folder_l = []
if config['pHDee.experiment_type'] != '':
input_folder_l.append(config['pHDee.experiment_type'])
if config['pHDee.celltype'] != '':
input_folder_l.append(config['pHDee.celltype'])
if config['pHDee.outer_sample_names'][sampleID] != '':
input_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
if config['pHDee.condition'] != '':
input_folder_l.append(config['pHDee.condition'])
if config['pHDee.replicate'] != '':
input_folder_l.append(config['pHDee.replicate'])
#
input_folder = '_'.join(input_folder_l) \
+ config['pHDee.hicpro.input_folder_suffix']
input_folder = location + input_folder
# This if-else can be removed. It exists currently to handle
# the difference in naming schemes in the untreated, tumor
# and dox conditions
if config['pHDee.condition'] == 'untreated' or config['pHDee.condition'] == 'tumor':
output_folder_l = []
if config['pHDee.experiment_type'] != '':
output_folder_l.append(config['pHDee.experiment_type'])
if config['pHDee.celltype'] != '':
output_folder_l.append(config['pHDee.celltype'])
if config['pHDee.outer_sample_names'][sampleID] != '':
output_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
if config['pHDee.condition'] != '':
output_folder_l.append(config['pHDee.condition'])
if config['pHDee.replicate'] != '':
output_folder_l.append(config['pHDee.replicate'])
output_folder = '_'.join(output_folder_l) \
+ config['pHDee.hicpro.output_folder_suffix']
else:
output_folder_l = []
if config['pHDee.experiment_type'] != '':
output_folder_l.append(config['pHDee.experiment_type'])
if config['pHDee.celltype'] != '':
output_folder_l.append(config['pHDee.celltype'])
if config['pHDee.outer_sample_names'][sampleID] != '':
output_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
if config['pHDee.condition'] != '':
output_folder_l.append(config['pHDee.condition'])
if config['pHDee.replicate'] != '':
output_folder_l.append(config['pHDee.replicate'])
output_folder = '_'.join(output_folder_l) \
+ config['pHDee.hicpro.output_folder_suffix']
output_folder = location + output_folder
return input_folder, output_folder
def run_hicpro(config):
"""
Run HiC-Pro
@param: config
"""
logger = logging.getLogger(pipeline_run_name)
logger.info("Running HiC-Pro")
for idx in range(len(config['pHDee.outer_sample_names'])):
# Create input folder
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
create_hicpro_input_folder(config, idx)
## Get the BAM or fastq files ready
##get_bam_or_fastq_ready(config)
# Run HiC-Pro
hicpro_path = config['pHDee.hicpro.path'] + '/bin/HiC-Pro'
hicpro_cmd = ['time', hicpro_path, '-i', hicpro_input_folder, '-o', hicpro_output_folder, \
'-c', config['pHDee.hicpro.config_filename']]
logger.debug("# Run HiC-Pro:")
logger.debug(' '.join(hicpro_cmd))
if config['no_execute'] is False:
err_log = open(config['pHDee.err_log'], "a")
cmd_log = open(config['pHDee.hicpro.output_file'], "a")
subprocess.call(hicpro_cmd, stdout=cmd_log, stderr=err_log)
cmd_log.close()
err_log.close()
# Clear intermediate files from the mapping/alignment step
logger.info("Clearing intermediate bowtie files")
if os.path.exists(hicpro_output_folder+ '/bowtie_results/bwt2_global/') is False:
logger.error("Check if HiC-Pro completed successfully. Exiting.")
log_pipeline_end_message("ERROR")
sys.exit(1)
else:
shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_global/'))
shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_local/'))
return
def hicpro_matrix_foldername(config, sampleID):
"""
Returns the path to raw or iced folder
in the HiCPro structure
"""
logger = logging.getLogger(pipeline_run_name)
_, hicpro_output_folder = create_hicpro_foldernames(config, sampleID)
name = os.path.join(hicpro_output_folder, 'hic_results', 'matrix', \
config['pHDee.inner_sample_names'][sampleID])
raw_name = os.path.join(name, 'raw')
iced_name = os.path.join(name, 'iced')
return raw_name, iced_name
def get_identifier(config, sampleID):
"""
Concat params to create an identifier for files
"""
logger = logging.getLogger(pipeline_run_name)
iName_l = []
if config['pHDee.condition'] != '':
iName_l.append(config['pHDee.condition'])
if config['pHDee.experiment_type'] != '':
iName_l.append(config['pHDee.experiment_type'])
if config['pHDee.celltype'] != '':
iName_l.append(config['pHDee.celltype'])
if config['pHDee.outer_sample_names'][sampleID] != '':
iName_l.append(config['pHDee.outer_sample_names'][sampleID])
if config['pHDee.replicate'] != '':
iName_l.append(config['pHDee.replicate'])
iName = '_'.join(iName_l)
return iName
def run_fithic(config):
"""
Run Fit-Hi-C
@param config
"""
logger = logging.getLogger(pipeline_run_name)
#location = config['pHDee.output_folder'] + '/'
logger.info("Running Fit-Hi-C")
for idx in range(len(config['pHDee.outer_sample_names'])):
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
raw_folder, iced_folder = hicpro_matrix_foldername(config, idx)
for res in config['pHDee.resolutions']:
# Assert input files exist
iced_matrix_filename = os.path.join(iced_folder, str(res), \
'_'.join([config['pHDee.inner_sample_names'][idx], \
str(res), 'iced.matrix']) )
fithic_folder_location = os.path.join( iced_folder, str(res), \
config['pHDee.fithic.output_folder'] )
if config['no_execute'] is False:
if os.path.exists(iced_matrix_filename) is False:
logger.error("Check if iced matrix exists (or HiC-Pro completed successfully)")
log_pipeline_end_message("ERROR")
sys.exit(1)
# Create Fit-Hi-C folder
if os.path.exists(fithic_folder_location) is False:
logger.info("Creating folder '{}' at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
os.mkdir(fithic_folder_location)
else:
logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
else:
logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
# Run Fit-HiC using Rscript
fithic_cmd = ['Rscript', 'scripts/run_fithic.R', \
# fragsfile
os.path.join(raw_folder, str(res), \
config['pHDee.inner_sample_names'][idx] + '_'\
+ str(res) + '_ord.bed'),
# intersFile
os.path.join(iced_folder, str(res), \
config['pHDee.inner_sample_names'][idx] + '_' \
+ str(res) + '_iced.matrix'),
# biasFile
os.path.join(iced_folder, str(res), \
config['pHDee.inner_sample_names'][idx] + '_'\
+ str(res) + '_iced.matrix.biases'),
# outputDir
fithic_folder_location,
# name_of_lib
'_'.join([ get_identifier(config, idx), \
str(res) ]),
# noOfBins,
config['pHDee.fithic.noOfBins'], \
# whether_toUseHiCPro
'TRUE',
os.path.join(fithic_folder_location, \
config['pHDee.fithic.output_log'])
]
logger.debug("# Run Fit-Hi-C:")
logger.debug(' '.join(fithic_cmd))
if config['no_execute'] is False:
err_log = open(config['pHDee.err_log'], "a")
subprocess.call(fithic_cmd, stderr=err_log, shell=False)
#process.wait()
logger.info("Resolution: {} ==> Fit-HiC output directed to '{}'".format(res, '/'.join([ fithic_folder_location, \
config['pHDee.fithic.output_log'] ])))
err_log.close()
return
def call_HiCPlotter(wholeGenomeBool, given_chromosome, hicplotter_params):
"""
Makes the HiCPlotter function call
"""
logger = logging.getLogger(pipeline_run_name)
thisSampleID = hicplotter_params['umbrella_config']\
['pHDee.outer_sample_names'].index(\
hicplotter_params['curr_outer_sample_name'])
HiCplotter(files=[hicplotter_params['iced_matrix_file']],\
names = [hicplotter_params['this_plot_title']],\
resolution = hicplotter_params['resolution'],\
chromosome = hicplotter_params['umbrella_config']\
['pHDee.hicplotter.given_chromosome'],\
# this is prefix for the output filename by hicplotter
output = os.path.join(hicplotter_params['output_folder'], \
get_identifier(hicplotter_params['umbrella_config'], \
thisSampleID)),\
histograms = [],\
histLabels = [],\
fillHist = [],\
histMax = [],\
verbose = False,\
fileHeader = 0,\
fileFooter = 0,\
matrixMax = 0,\
histColors = [],\
barPlots = [],\
barLabels = [],\
plotGenes = '',\
superImpose = False,\
start = 0,\
end = 0,\
tileLabels = [],\
tilePlots = [],\
tileColors = [],\
tileText = False,\
arcLabels = [],\
arcPlots = [],\
arcColors = [],\
peakFiles = [],\
epiLogos = '',\
window = 5,\
tadRange = 8,\
tripleColumn = True,\
bedFile = hicplotter_params['bed_file'],\
barColors = [],\
dPixels = 600,\
compareEx = '',\
compareSm = '',\
upSide = False,\
smoothNoise = 0.5,\
cleanNANs = True,\
plotTriangular = False,\
plotTadDomains = False,\
randomBins = False,\
wholeGenome = hicplotter_params['umbrella_config']['pHDee.hicplotter.whole_genome'],\
plotPublishedTadDomains = False,\
plotDomainsAsBars = False,\
imputed = False,\
barMax = [],\
spine = False,\
plotDomainTicks = False,\
triangularHeight = False,\
highlights = 0,\
highFile = '',\
heatmapColor = 3,\
highResolution = True,\
plotInsulation = False,\
plotCustomDomains = False,\
publishedTadDomainOrganism = False,\
customDomainsFile = [],\
compare = False,\
pair = False,\
domColors = [],\
oExtension = 'pdf',\
geneLabels = False,\
dark = False)
return
def plot_with_hicplotter(config):
"""
Handles static plotting with HiCPlotter:
wholeGenome, perChromosome or only givenChromosome
"""
logger = logging.getLogger(pipeline_run_name)
logger.info("Running HiCPlotter")
hicplotter_params = {}
chromosomes = ['chr'+str(x) for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 'X']]
if config['no_execute'] is False:
for idx in range(len(config['pHDee.outer_sample_names'])):
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
hicplotter_params['curr_outer_sample_name'] = config['pHDee.outer_sample_names'][idx]
hicplotter_params['output_folder'] = os.path.join( hicpro_output_folder, \
config['pHDee.hicplotter.output_folder'] )
if os.path.exists(hicplotter_params['output_folder']) is False:
logger.info("Creating hicplotter_results directory")
os.mkdir(hicplotter_params['output_folder'])
else:
logger.info("Using existing hicplotter_results directory")
hicplotter_params['iced_matrix_file'] = ''
hicplotter_params['this_plot_title'] = ''
hicplotter_params['bed_file'] = ''
hicplotter_params['umbrella_config'] = config
raw_folder, iced_folder = hicpro_matrix_foldername(config, idx)
for res in config['pHDee.resolutions']:
hicplotter_params['resolution'] = res
input_filenames_prefix = os.path.join( iced_folder, \
str(res), \
config['pHDee.inner_sample_names'][idx] )
# iced matrix file
hicplotter_params['iced_matrix_file'] = '_'.join([ \
input_filenames_prefix, str(res), 'iced.matrix'])
# plot title
hicplotter_params['this_plot_title'] = '_'.join([ \
get_identifier(config, idx), str(res) ])
# bed file
hicplotter_params['bed_file'] = '_'.join([ \
input_filenames_prefix.replace('iced', 'raw'), \
str(res), 'ord.bed'])
#
if config['pHDee.hicplotter.given_chromosome'] != '':
logger.info("Plotting matrix for given chromosome")
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
config['pHDee.hicplotter.given_chromosome'], \
hicplotter_params)
# if whole_genome is set, given_chr is overwritten and set to chrY
if ast.literal_eval(config['pHDee.hicplotter.whole_genome']):
logger.info("Plotting whole genome matrix")
config['pHDee.hicplotter.given_chromosome'] = 'chrY'
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
config['pHDee.hicplotter.given_chromosome'], \
hicplotter_params)
# if per chromosome intra-chromosomal heatmaps are to be plotted
if ast.literal_eval(config['pHDee.hicplotter.per_chromosome']):
# iterate with chr set to each chromosome and call; wholeGenome to False
logger.info("Plotting matrices per chromosome")
for thisChr in chromosomes:
call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
thisChr, \
hicplotter_params)
logger.info("{}".format(thisChr))
return
def create_files_for_juicebox(config):
"""
Convert HiC-Pro allValidPairs files into
juicebox format for viewing
@param: config
"""
logger = logging.getLogger(pipeline_run_name)
logger.info("Running conversion of files for juicebox")
with open(config['pHDee.hicpro.config_filename'], 'r') as f:
for line in f.readlines():
#if line.split('=')[0].strip() == 'GENOME_SIZE':
# genome_size_file = config['pHDee.hicpro.path'] + '/annotation/' \
# + line.strip().split('=')[1].strip()
# # should use a file named hg19.chrom.sizes
if line.split('=')[0].strip() == 'GENOME_FRAGMENT':
restriction_fragment_file = config['pHDee.hicpro.path'] + '/annotation/' \
+ line.strip().split('=')[1].strip()
#
for idx in range(len(config['pHDee.outer_sample_names'])):
hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
logger.debug(hicpro_output_folder)
#
if ast.literal_eval(config['pHDee.juicebox.independent_folder']):
results_folder = config['pHDee.juicebox.output_folder']
else:
results_folder = os.path.join( hicpro_output_folder, config['pHDee.juicebox.output_folder'] )
tmp_folder = os.path.join( results_folder, config['pHDee.juicebox.temp_folder'] )
if os.path.exists( results_folder ) is False:
if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False:
logger.info("Creating folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \
hicpro_output_folder))
else:
logger.info("Creating folder '{}'".format(config['pHDee.juicebox.output_folder']))
os.mkdir(results_folder)
os.mkdir(tmp_folder)
else:
if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False:
logger.info("Using existing folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \
hicpro_output_folder))
else:
logger.info("Using existing folder '{}'".format(config['pHDee.juicebox.output_folder']))
#
# #INFO: Also copy allValidPairs file
#logger.info("Copying allValidPairs file")
# #INFO: If the allValidPairs is in the same folder (which was the case after copyiing them),
# use them from the current location. Thus, filename is changed from the more generic
# HiC-Pro named version to the more specific one used by/identified by this pipeline
#
#input_filename = os.path.join(hicpro_output_folder, \
# 'hic_results', 'data', \
# config['pHDee.inner_sample_names'][idx], \
# config['pHDee.inner_sample_names'][idx] \
# + config['pHDee.hicpro.allValidPairs_suffix'] )
#
#shutil.copyfile(input_filename, '/'.join([ results_folder, \
# get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']]))
input_filename = '/'.join([results_folder, \
get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']])
#
output_filename = '/'.join([ results_folder, \
get_identifier(config, idx) ])
logger.info("Creating '{}'".format(output_filename+'.hic'))
logger.info("Using '{}' as input file".format(input_filename))
logger.info("Using '{}' as temp folder".format(tmp_folder))
logger.info("Using '{}' for reading genome size".format(config['pHDee.juicebox.genome_size_file']))
logger.info("Using '{}' for restriction fragments".format(restriction_fragment_file))
#
hicpro2juicebox_cmd = ['bash', config['pHDee.juicebox.hicpro2juicebox_script'], \
'-i', input_filename, \
'-g', config['pHDee.juicebox.genome_size_file'], \
'-j', config['pHDee.juicebox.jar_file'] , \
'-r', restriction_fragment_file, \
'-t', tmp_folder,
'-o', output_filename
]
logger.debug("# Run hicpro2juicebox script")
logger.debug(' '.join(hicpro2juicebox_cmd))
if config['no_execute'] is False:
err_log = open(config['pHDee.err_log'], "a")
cmd_log = open(config['pHDee.juicebox.script_output_file'], "a")
subprocess.call(hicpro2juicebox_cmd, stdout=cmd_log, stderr=err_log, shell=False)
cmd_log.close()
err_log.close()
logger.info("You can now use the '{}.hic' file with juicebox on desktop or on the web browser".format(output_filename))
return
def do_differential_comparison(config):
"""
Perform a differential comparison between matrices
using HiCcompare. Use raw matrices since HiCcompare
jointly normalizes them.
@param config
Output generated includes
-- PDFs of MD plots?
-- PDFs of distance-normalization plots?
-- TSV file giving statistics on number
of total interactions, those with p-val < 0.01,
0.01 > = p-val < 0.05 and
p-val > 0.05 (one file for all chromosomes)
-- CSV file listing all interactions with p-val < 0.01,
0.01 > = p-val < 0.05
"""
logger = logging.getLogger(pipeline_run_name)
logger.info("Running differential comparison using HiCcompare")
if config['no_execute'] is False:
if os.path.exists(os.path.join(config['pHDee.output_folder'], \
config['pHDee.hiccompare.output_folder'])) is False:
logger.info("Creating folder '{}' at '{}'".format(\
config['pHDee.hiccompare.output_folder'], \
config['pHDee.output_folder']))
os.mkdir( os.path.join(config['pHDee.output_folder'], \
config['pHDee.hiccompare.output_folder']) )
else:
logger.info("Using existing folder '{}' at '{}'".format(\
config['pHDee.hiccompare.output_folder'], \
config['pHDee.output_folder']))
for g in range(len(config['pHDee.hiccompare.group_id'])):
logger.info("Comparisons for group '{}'".format(\
config['pHDee.hiccompare.group_id'][g]))
# A folder for each group
group_foldername = os.path.join(config['pHDee.output_folder'], \
config['pHDee.hiccompare.output_folder'], \
config['pHDee.hiccompare.group_id'][g])
csv_suffix = config['pHDee.hiccompare.csv_folder']
#
mdplot_suffix = config['pHDee.hiccompare.md_plots_folder']
#
dnplot_suffix = config['pHDee.hiccompare.dist_norm_plots_folder']
if config['no_execute'] is False:
if os.path.exists(group_foldername) is False:
logger.info("Creating folder for group '{}'".format(\
config['pHDee.hiccompare.group_id'][g]))
# Create folder for group with separate folders
# for MD plots and distance normalization plots
os.makedirs(os.path.join(group_foldername, \
mdplot_suffix))
# Now folder for csv files
os.mkdir(os.path.join(group_foldername,\
csv_suffix))
# Now folder for distance normalization plots
os.mkdir(os.path.join(group_foldername,\
dnplot_suffix))
#
else:
logger.info("Using existing folder for group '{}'".format(\
config['pHDee.hiccompare.group_id'][g]))
# Folder for CSV files
if os.path.exists(os.path.join(group_foldername, \
csv_suffix)) is False:
logger.info("Creating folder for CSV files")
os.mkdir(os.path.join(group_foldername,\
csv_suffix))
else:
logger.info("Using existing folder for CSV files")
# Folder for MD plots
if os.path.exists(os.path.join(group_foldername, \
mdplot_suffix)) is False:
logger.info("Creating folder for MD plots")
os.mkdir(os.path.join(group_foldername,\
mdplot_suffix))
else:
logger.info("Using existing folder for MD plots")
# Folder for distance normalization plots
if os.path.exists(os.path.join(group_foldername,\
dnplot_suffix)) is False:
logger.info("Creating folder for distance normalization plots")
os.mkdir(os.path.join(group_foldername,\
dnplot_suffix))
else:
logger.info("Using existing folder for distance normalization plots")
#
for r in range(len(config['pHDee.hiccompare.resolutions'])):
# Read Name1, Name2 and groupID from groupInfo file
with open(config['pHDee.hiccompare.groups_info_file'], "r") as f:
groupInfo_lines = [x.strip() for x in f.readlines() if x.find("#") == -1]
groupIDs = []
INames = []
ONames = []
base_locations = []
for line in groupInfo_lines:
[groupID, OName, IName, base_location, uniqueValInt] = [x.strip() for x in line.split("\t")]
if(groupID == config['pHDee.hiccompare.group_id'][g]):
groupIDs.append(groupID)
INames.append(IName)
ONames.append(OName)
base_locations.append(base_location)
#
for aIDX in range(0,len(ONames)):
n1 = ONames[aIDX]
mf1 = os.path.join(base_locations[aIDX], \
ONames[aIDX] + \
config['pHDee.hicpro.output_folder_suffix'], \
"hic_results", "matrix", INames[aIDX], "raw", \
str(config['pHDee.hiccompare.resolutions'][r]), \
INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix")
#
bf1 = os.path.join(base_locations[aIDX], \
ONames[aIDX] + \
config['pHDee.hicpro.output_folder_suffix'],\
"hic_results", "matrix", INames[aIDX], "raw",\
str(config['pHDee.hiccompare.resolutions'][r]),
INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed")
#
for bIDX in range(aIDX, len(ONames)):
if ONames[aIDX] != ONames[bIDX]:
n2 = ONames[bIDX]
mf2 = os.path.join(base_locations[aIDX], \
ONames[bIDX] + \
config['pHDee.hicpro.output_folder_suffix'], \
"hic_results", "matrix", INames[bIDX], "raw", \
str(config['pHDee.hiccompare.resolutions'][r]), \
INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix")
#
bf2 = os.path.join(base_locations[aIDX], \
ONames[bIDX] + \
config['pHDee.hicpro.output_folder_suffix'],\
"hic_results", "matrix", INames[bIDX], "raw",\
str(config['pHDee.hiccompare.resolutions'][r]),
INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed")
#
diff_comparison_cmd = [ 'Rscript', \
'scripts/make_differential_comparisons/compare_contact_matrices_for_chromosomes.R', \
# Name1 and Name2
n1, n2, \
# resolution
str(config['pHDee.hiccompare.resolutions'][r]), \
# matfile, bedfile / 1 and 2
mf1, bf1, \
mf2, bf2, \
# result_location
os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder']), \
# groupID
groupIDs[0], \
# rOut file
os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder'],
"output_comparing_contact_matrices_group"+ config['pHDee.hiccompare.group_id'][g] + "_" + \
str(config['pHDee.hiccompare.resolutions'][r])+".rOut")
]
logger.info("# Running differential comparison")
logger.info("Resolution: {}".format(\
str(config['pHDee.hiccompare.resolutions'][r])))
logger.debug(' '.join(diff_comparison_cmd))
if config['no_execute'] is False:
logger.debug("Resolution: {}".format(\
str(config['pHDee.hiccompare.resolutions'][r])))
err_log = open(config['pHDee.err_log'], "a")
subprocess.call(diff_comparison_cmd, stderr=err_log, shell=False)
err_log.close()
return
def setup_stages_for_pipeline_to_run(args):
"""
Manage the stages to run
"""
logger = logging.getLogger(pipeline_run_name)
stages_to_run = {'do_all': False, \
'do_hicpro': False, \
'do_fithic': False, \
'do_hicplotter': False, \
'do_juicebox': False, \
'do_differential': False, \
'suggest_resolution': False
}
given_stages = args.stage
# Check if all stages are specified using valid keywords
for given_stage in given_stages:
if given_stage not in stages_to_run.keys():
logger.error("'{}' is not a valid keyword for stage. \nValid stages: {}".format(given_stage, ', '.join(stages_to_run.keys() + ['do_all'] )))
log_pipeline_end_message("ERROR")
sys.exit(1)
logger.info("Stage(s) to run: {}".format(', '.join(given_stages)))
for given_stage in given_stages:
if given_stage == 'do_all':
stages_to_run['do_hicpro'] = True
stages_to_run['do_fithic'] = True
stages_to_run['do_juicebox'] = True
elif given_stage == 'do_hicpro':
stages_to_run['do_hicpro'] = True
elif given_stage == 'do_fithic':
stages_to_run['do_fithic'] = True
elif given_stage == 'do_juicebox':
stages_to_run['do_juicebox'] = True
elif given_stage == 'do_hicplotter':
stages_to_run['do_hicplotter'] = True
elif given_stage == 'do_differential':
stages_to_run['do_differential'] = True
return stages_to_run
def handle_logging():
"""
"""
#set up logging to file and console, both
rand_id = randint(1000, 9999)
log_filename = 'pHDee-' + str(rand_id) +'.log'
global pipeline_run_name
pipeline_run_name = 'pHDee-' + str(rand_id)
logging.basicConfig(level=logging.DEBUG, format='\n%(asctime)s - %(name)s - %(levelname)s - %(message)s',\
datefmt='%Y-%m-%d %H:%M:%S', filename=log_filename, filemode='a')
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
console.setLevel(logging.INFO)
# format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
console.setFormatter(formatter)
# add console handler
logging.getLogger('').addHandler(console)
# get a logger
logger = logging.getLogger(pipeline_run_name)
return logger
def main():
"""
Main function
"""
logger = handle_logging()
logger.info("---- Pipeline pHDee begins ----")
logger.info("Log file: {}.log".format(pipeline_run_name))
stages_to_run = setup_stages_for_pipeline_to_run(args)
# Read the config file; check assertions
config = read_config_file(args.config_file)
assert_config_options(config, stages_to_run)
if args.no_execute is True:
config['no_execute'] = True
# Does this ensure the order?
if stages_to_run['do_hicpro']:
# Run HiC-Pro
run_hicpro(config)
logger.info("do_hicpro [DONE].")
#
if stages_to_run['do_fithic']:
# Run Fit-HiC
run_fithic(config)
logger.info("do_fithic [DONE].")
#
if stages_to_run['do_hicplotter']:
# 1. User should ensure that hicpro completed successfully
# 2. Run HiCPlotter
plot_with_hicplotter(config)
logger.info("do_hicplotter [DONE].")
#
if stages_to_run['do_juicebox']:
# 1. Check that hicpro run is completed
# 2. Convert to juicebox format
create_files_for_juicebox(config)
logger.info("do_juicebox [DONE].")
#
if stages_to_run['do_differential']:
# Perform differential comparisons
do_differential_comparison(config)
logger.info("do_differential [DONE].")
#
if stages_to_run['suggest_resolution']:
# TODO
# Juicer tools has a shell script to do this, but it
# requires the Juicer-specific merge_nodups.txt file
logger.info("suggest_resolution [TODO].")
log_pipeline_end_message("SUCCESS")
logging.shutdown()
if __name__ == '__main__':
main()