run_pipeline.py

#!/usr/bin/env python

# Python pipeline for end-to-end analysis of HiChIP data using various tools.
# This script uses the following softwares/tools when run from start to end.
# 1. HiC-Pro -- from fastq files to (normalized) contact matrices
# 2. HiCPlotter -- static visualization of matrices and tracks etc.
# 4. HiCcompare -- differential comparison of matrices
# 5. Fit-Hi-C -- calling siginificant interactions
# 6. Juicebox -- dynamic visualization
#
# Author: Sarvesh Nikumbh
# Email: snikumbh@mpi-inf.mpg.de
#
# Date: May 07, 2018
#
#

# import statements
import os, sys, datetime, ast
import shutil, subprocess
import argparse, glob
from random import randint
from HiCPlotter import HiCplotter
import logging

parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
				add_help = False, \
				description ="""
Pipeline for processing HiChIP data from end-to-end.
Usage example: $python run_pipeline.py -c umbrella-config-file.txt -s do_all
				""")
pR = parser.add_argument_group('Required arguments')
pR.add_argument('-c', '--config_file', \
				help = "Specify the umbrella config file.", \
				default = 'umbrella-config-file.txt', \
				required = True)
# get the user to specify which function to perform; correspondingly use only
# that portion of variables from config file
pR.add_argument('-st', '--stage',
				help = 'Specify what to run. Possible values: \
				[do_hicpro, do_hicplotter, do_fithic, do_juicebox, \
				do_differential, suggest_resolution, do_all]. \
				Non-relevant details from the config file will \
				be ignored. When *do_all*, we perform sequentially \
				do_hicpro, do_fithic, and do_juicebox. \
				For others, run the pipeline individually.', \
				default = 'all', \
				nargs = '+', \
				required = True)
pO = parser.add_argument_group('Optional arguments')
# execute or do not execute commands; just print them
pO.add_argument('-ne', '--no_execute',
				help = 'Specify this if you do not \
				want to execute commands right away; just \
				print them in the commands logfile.', \
				action = 'store_true')

pO.add_argument('-h', '--help', help = "Show this message and exit.", action = 'help')

args = parser.parse_args()

def log_pipeline_end_message(endsWithMsg):
	"""
	"""
	logger = logging.getLogger(pipeline_run_name)
	logger.info("---- Pipeline pHDee ends with {} ----".format(endsWithMsg))
	if endsWithMsg == "ERROR":
		logger.info("See the error log for any errors related to running the tools")
		logger.info("Any other errors are reported in this log")
	return

def assert_config_options(config, stages_to_run):
	"""
	"""
	#DONE: 1. pHDee.resolutions cannot be an empty string // done when
	#	reading config file
	#DONE: 2. hiccompare resolutions should be a subset of resolutions
	#DONE: 3. number of inner and outer sample names should be same
	#TODO: 4. For hadling more than one inner sample names per outer sample name,
	#	we could use comma as a separator
	#
	logger = logging.getLogger(pipeline_run_name)
        # Assert 3
	if len(config['pHDee.outer_sample_names']) != \
		len(config['pHDee.inner_sample_names']):
		logger.error("Check the config file, #inner and outer sample names should be equal")
		log_pipeline_end_message("ERROR")
		sys.exit(1)
	# Assert 2
	# check if we should care about this; if do_differential is not asked, we can defer
	if stages_to_run['do_differential']:
		# assertions relevant to the 'do_differential' stage
		if not set(config['pHDee.hiccompare.resolutions']).issubset(config['pHDee.resolutions']):
			if len(set(config['pHDee.resolutions'])) >= set(config['pHDee.hiccompare.resolutions']):
				d = set(config['pHDee.resolutions']).difference(\
						config['pHDee.hiccompare.resolutions'])
			else:
				d = set(config['pHDee.hiccompare.resolutions']).difference(\
						config['pHDee.resolutions'])
			# It could be that pHDee.resolutions specified in the config file are unrelated
			# Then, this is not necessarily an error; thus, we only warn!
			logger.warn("Check the following resolutions specified for HiCcompare {}. HiCPro only run for resolution(s) {}".\
				format(list(d), config['pHDee.resolutions']))
	#
	return

def read_config_file(config_file):
	"""
	Read the config file
	"""
	logger = logging.getLogger(pipeline_run_name)
	param_dict = {}
	with open(config_file, 'r') as f:
		param_lines = [x.strip() for x in f.readlines() if x.find('#') == -1]
	param_lines = [x for x in param_lines if len(x) > 0]
	for line in param_lines:
		param_dict[line.split('=')[0].strip()] = line.split('=')[1].strip()
	# Collect the multiple sample names as list
	param_dict.__setitem__('pHDee.inner_sample_names', \
			param_dict['pHDee.inner_sample_names'].split(' '))
	param_dict.__setitem__('pHDee.outer_sample_names',
			param_dict['pHDee.outer_sample_names'].split(' '))
	param_dict.__setitem__('pHDee.hiccompare.group_id',
                        param_dict['pHDee.hiccompare.group_id'].split(' '))
	# Store the pHDee.resolutions as a separate list.
	# Note: The check that it is not empty has to happen here instead
	# of the function 'assert_config_options'
	if param_dict['pHDee.resolutions'] != '':
		param_dict['pHDee.resolutions'] = [int(x) for x in param_dict['pHDee.resolutions'].split(' ')]
	else:
		logger.warn("Resolutions is empty. Specify at least one value for resolutions")
	# Similarly, for hiccompare.resolutions
	if param_dict['pHDee.hiccompare.resolutions'] != '':
		param_dict['pHDee.hiccompare.resolutions'] = [int(x) for x in param_dict['pHDee.hiccompare.resolutions'].split(' ')]
	else:
		logger.warn("Resolutions is empty. Specify at least one value for resolutions")
	#
	param_dict['no_execute'] = False
	#
	return param_dict


def create_hicpro_input_folder(config, sampleID):
	"""
	Create input folder for HiC-Pro
	Assumption: Only one sample (i.e. inner sample name)
	@param: config
	"""
	#NOTE: creating this input folder can be omitted altogether
	# in case there are multiple samples to be combined.
	# The multiple samples condition is not handled at the moment.
	# Handling this will warrant changes in the way inner_sample_names
	# are specified in the config file.
	logger = logging.getLogger(pipeline_run_name)
	hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, sampleID)
	logger.info("Creating input folder {}".format(hicpro_input_folder))
	if config['no_execute'] is False:
		if os.path.exists(hicpro_input_folder) is False:
			os.makedirs(hicpro_input_folder \
				+ '/' + config['pHDee.inner_sample_names'][sampleID])
		else:
			# Maybe, raise a warning for the user?
			# # warnings.warn("Input folder exists, will be overwritten")
			logger.warning("Input folder exists. Exiting.")
			sys.exit(1)
		# copy fastq files
		relevant_files = [x for x in os.listdir(config['pHDee.input_folder']) if x.find(config['pHDee.outer_sample_names'][sampleID]) != -1]
		relevant_files = [x for x in relevant_files if x.find('fastq.gz') != -1]
		assert(len(relevant_files) == 2) # for the two mates
		for f in relevant_files:
			if os.path.exists(hicpro_input_folder+ '/' + config['pHDee.inner_sample_names'][sampleID]) is True:
				shutil.copyfile('/'.join([config['pHDee.input_folder'], f]), \
					'/'.join([hicpro_input_folder, config['pHDee.inner_sample_names'][sampleID], f]))
		# copy config file
		shutil.copyfile(config['pHDee.hicpro.config_filename'], hicpro_input_folder +'/config-hicpro.txt')
		# Replace the resolution values with given values for HiC-Pro
		with open(config['pHDee.hicpro.config_filename'], 'r') as f:
			old_config_lines = f.readlines()
		new_config_lines = old_config_lines
		for k,l in enumerate(old_config_lines):
			if l.find('BIN_SIZE') != -1:
				new_config_lines[k] = 'BIN_SIZE = ' + ' '.join([str(x) for x in config['pHDee.resolutions']])+'\n'
		with open(hicpro_input_folder +'/config-hicpro.txt', 'w') as f:
			f.writelines(new_config_lines)
		#
		config['pHDee.hicpro.config_filename'] = '/'.join([ hicpro_input_folder, config['pHDee.hicpro.config_filename'] ])
		logger.info("HiC-Pro config file name updated to {}".format(config['pHDee.hicpro.config_filename']))
	return

def create_hicpro_foldernames(config, sampleID):
	"""
	Returns the folder names for HiC-Pro input and output
	@param: config
	"""
	logger = logging.getLogger(pipeline_run_name)
	location = config['pHDee.output_folder'] + '/'
	#
	input_folder_l = []
	if config['pHDee.experiment_type'] != '':
		input_folder_l.append(config['pHDee.experiment_type'])
	if config['pHDee.celltype'] != '':
		input_folder_l.append(config['pHDee.celltype'])
	if config['pHDee.outer_sample_names'][sampleID] != '':
		input_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
	if config['pHDee.condition'] != '':
		input_folder_l.append(config['pHDee.condition'])
	if config['pHDee.replicate'] != '':
		input_folder_l.append(config['pHDee.replicate'])
	#
	input_folder = '_'.join(input_folder_l) \
			+ config['pHDee.hicpro.input_folder_suffix']
	input_folder = location + input_folder
	# This if-else can be removed. It exists currently to handle
	# the difference in naming schemes in the untreated, tumor
	# and dox conditions
	if config['pHDee.condition'] == 'untreated' or config['pHDee.condition'] == 'tumor':
		output_folder_l = []
		if config['pHDee.experiment_type'] != '':
			output_folder_l.append(config['pHDee.experiment_type'])
		if config['pHDee.celltype'] != '':
			output_folder_l.append(config['pHDee.celltype'])
		if config['pHDee.outer_sample_names'][sampleID] != '':
			output_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
		if config['pHDee.condition'] != '':
			output_folder_l.append(config['pHDee.condition'])
		if config['pHDee.replicate'] != '':
			output_folder_l.append(config['pHDee.replicate'])
		output_folder = '_'.join(output_folder_l) \
				+ config['pHDee.hicpro.output_folder_suffix']
	else:
		output_folder_l = []
		if config['pHDee.experiment_type'] != '':
			output_folder_l.append(config['pHDee.experiment_type'])
		if config['pHDee.celltype'] != '':
			output_folder_l.append(config['pHDee.celltype'])
		if config['pHDee.outer_sample_names'][sampleID] != '':
			output_folder_l.append(config['pHDee.outer_sample_names'][sampleID])
		if config['pHDee.condition'] != '':
			output_folder_l.append(config['pHDee.condition'])
		if config['pHDee.replicate'] != '':
			output_folder_l.append(config['pHDee.replicate'])
		output_folder = '_'.join(output_folder_l) \
				+ config['pHDee.hicpro.output_folder_suffix']
	output_folder = location + output_folder
	return input_folder, output_folder

def run_hicpro(config):
	"""
	Run HiC-Pro
	@param: config
	"""
	logger = logging.getLogger(pipeline_run_name)
	logger.info("Running HiC-Pro")
	for idx in range(len(config['pHDee.outer_sample_names'])):
		# Create input folder
		hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
		create_hicpro_input_folder(config, idx)
		## Get the BAM or fastq files ready
		##get_bam_or_fastq_ready(config)
		# Run HiC-Pro
		hicpro_path = config['pHDee.hicpro.path'] + '/bin/HiC-Pro'
		hicpro_cmd = ['time', hicpro_path, '-i', hicpro_input_folder, '-o', hicpro_output_folder, \
				'-c', config['pHDee.hicpro.config_filename']]
		logger.debug("# Run HiC-Pro:")
		logger.debug(' '.join(hicpro_cmd))
		if config['no_execute'] is False:
			err_log = open(config['pHDee.err_log'], "a")
			cmd_log = open(config['pHDee.hicpro.output_file'], "a")
			subprocess.call(hicpro_cmd, stdout=cmd_log, stderr=err_log)
			cmd_log.close()
			err_log.close()
			# Clear intermediate files from the mapping/alignment step
			logger.info("Clearing intermediate bowtie files")
			if os.path.exists(hicpro_output_folder+ '/bowtie_results/bwt2_global/') is False:
				logger.error("Check if HiC-Pro completed successfully. Exiting.")
				log_pipeline_end_message("ERROR")
				sys.exit(1)
			else:
				shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_global/'))
				shutil.rmtree(str(hicpro_output_folder + '/bowtie_results/bwt2_local/'))
	return

def hicpro_matrix_foldername(config, sampleID):
	"""
	Returns the path to raw or iced folder
	in the HiCPro structure
	"""
	logger = logging.getLogger(pipeline_run_name)
	_, hicpro_output_folder = create_hicpro_foldernames(config, sampleID)
	name = os.path.join(hicpro_output_folder, 'hic_results', 'matrix', \
                                config['pHDee.inner_sample_names'][sampleID])
	raw_name = os.path.join(name, 'raw')
	iced_name = os.path.join(name, 'iced')
	return raw_name, iced_name

def get_identifier(config, sampleID):
	"""
	Concat params to create an identifier for files
	"""
	logger = logging.getLogger(pipeline_run_name)
	iName_l = []
	if config['pHDee.condition'] != '':
		iName_l.append(config['pHDee.condition'])
	if config['pHDee.experiment_type'] != '':
		iName_l.append(config['pHDee.experiment_type'])
	if config['pHDee.celltype'] != '':
		iName_l.append(config['pHDee.celltype'])
	if config['pHDee.outer_sample_names'][sampleID] != '':
		iName_l.append(config['pHDee.outer_sample_names'][sampleID])
	if config['pHDee.replicate'] != '':
		iName_l.append(config['pHDee.replicate'])
	iName = '_'.join(iName_l)
	return iName

def run_fithic(config):
	"""
	Run Fit-Hi-C
	@param config
	"""
	logger = logging.getLogger(pipeline_run_name)
	#location = config['pHDee.output_folder'] + '/'
	logger.info("Running Fit-Hi-C")
	for idx in range(len(config['pHDee.outer_sample_names'])):
		hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
		raw_folder, iced_folder = hicpro_matrix_foldername(config, idx)
		for res in config['pHDee.resolutions']:
			# Assert input files exist
			iced_matrix_filename = os.path.join(iced_folder, str(res), \
						'_'.join([config['pHDee.inner_sample_names'][idx], \
						str(res), 'iced.matrix']) )
			fithic_folder_location = os.path.join( iced_folder, str(res), \
				config['pHDee.fithic.output_folder'] )
			if config['no_execute'] is False:
				if os.path.exists(iced_matrix_filename) is False:
					logger.error("Check if iced matrix exists (or HiC-Pro completed successfully)")
					log_pipeline_end_message("ERROR")
					sys.exit(1)
                        	# Create Fit-Hi-C folder
				if os.path.exists(fithic_folder_location) is False:
					logger.info("Creating folder '{}' at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
					os.mkdir(fithic_folder_location)
				else:
					logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
			else:
					logger.info("Using existing '{}' folder at '{}'".format(config['pHDee.fithic.output_folder'], fithic_folder_location))
			# Run Fit-HiC using Rscript
			fithic_cmd = ['Rscript', 'scripts/run_fithic.R', \
					# fragsfile
					os.path.join(raw_folder, str(res), \
					config['pHDee.inner_sample_names'][idx] + '_'\
					+ str(res) + '_ord.bed'),
					# intersFile
					os.path.join(iced_folder, str(res), \
                                        config['pHDee.inner_sample_names'][idx] + '_' \
                                        + str(res) + '_iced.matrix'),
					# biasFile
					os.path.join(iced_folder, str(res), \
					config['pHDee.inner_sample_names'][idx] + '_'\
                                        + str(res) + '_iced.matrix.biases'),
					# outputDir
					fithic_folder_location,
					# name_of_lib
					'_'.join([ get_identifier(config, idx), \
					str(res) ]),
					# noOfBins,
					config['pHDee.fithic.noOfBins'], \
					# whether_toUseHiCPro
					'TRUE',
					os.path.join(fithic_folder_location, \
					config['pHDee.fithic.output_log'])
					]
			logger.debug("# Run Fit-Hi-C:")
			logger.debug(' '.join(fithic_cmd))
			if config['no_execute'] is False:
				err_log = open(config['pHDee.err_log'], "a")
				subprocess.call(fithic_cmd, stderr=err_log, shell=False)
				#process.wait()
				logger.info("Resolution: {} ==> Fit-HiC output directed to '{}'".format(res, '/'.join([ fithic_folder_location, \
                                        config['pHDee.fithic.output_log'] ])))
				err_log.close()
	return

def call_HiCPlotter(wholeGenomeBool, given_chromosome, hicplotter_params):
	"""
	Makes the HiCPlotter function call
	"""
	logger = logging.getLogger(pipeline_run_name)
	thisSampleID = hicplotter_params['umbrella_config']\
				['pHDee.outer_sample_names'].index(\
				hicplotter_params['curr_outer_sample_name'])
	HiCplotter(files=[hicplotter_params['iced_matrix_file']],\
		names = [hicplotter_params['this_plot_title']],\
		resolution = hicplotter_params['resolution'],\
		chromosome = hicplotter_params['umbrella_config']\
				['pHDee.hicplotter.given_chromosome'],\
		# this is prefix for the output filename by hicplotter
		output = os.path.join(hicplotter_params['output_folder'], \
				get_identifier(hicplotter_params['umbrella_config'], \
				thisSampleID)),\
		histograms = [],\
		histLabels = [],\
		fillHist = [],\
		histMax = [],\
		verbose = False,\
		fileHeader = 0,\
		fileFooter = 0,\
		matrixMax = 0,\
		histColors = [],\
		barPlots = [],\
		barLabels = [],\
		plotGenes = '',\
		superImpose = False,\
		start = 0,\
		end = 0,\
		tileLabels = [],\
		tilePlots = [],\
		tileColors = [],\
		tileText = False,\
		arcLabels = [],\
		arcPlots = [],\
		arcColors = [],\
		peakFiles = [],\
		epiLogos = '',\
		window = 5,\
		tadRange = 8,\
		tripleColumn = True,\
		bedFile = hicplotter_params['bed_file'],\
		barColors = [],\
		dPixels = 600,\
		compareEx = '',\
		compareSm = '',\
		upSide = False,\
		smoothNoise = 0.5,\
		cleanNANs = True,\
		plotTriangular = False,\
		plotTadDomains = False,\
		randomBins = False,\
		wholeGenome = hicplotter_params['umbrella_config']['pHDee.hicplotter.whole_genome'],\
		plotPublishedTadDomains = False,\
		plotDomainsAsBars = False,\
		imputed = False,\
		barMax = [],\
		spine = False,\
		plotDomainTicks = False,\
		triangularHeight = False,\
		highlights = 0,\
		highFile = '',\
		heatmapColor = 3,\
		highResolution = True,\
		plotInsulation = False,\
		plotCustomDomains = False,\
		publishedTadDomainOrganism = False,\
		customDomainsFile = [],\
		compare = False,\
		pair = False,\
		domColors = [],\
		oExtension = 'pdf',\
		geneLabels = False,\
		dark = False)
	return

def plot_with_hicplotter(config):
	"""
	Handles static plotting with HiCPlotter:
	wholeGenome, perChromosome or only givenChromosome
	"""
	logger = logging.getLogger(pipeline_run_name)
	logger.info("Running HiCPlotter")
	hicplotter_params = {}
	chromosomes = ['chr'+str(x) for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 'X']]
	if config['no_execute'] is False:
		for idx in range(len(config['pHDee.outer_sample_names'])):
			hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
			hicplotter_params['curr_outer_sample_name'] = config['pHDee.outer_sample_names'][idx]
			hicplotter_params['output_folder'] = os.path.join( hicpro_output_folder, \
								config['pHDee.hicplotter.output_folder'] )
			if os.path.exists(hicplotter_params['output_folder']) is False:
				logger.info("Creating hicplotter_results directory")
				os.mkdir(hicplotter_params['output_folder'])
			else:
				logger.info("Using existing hicplotter_results directory")
			hicplotter_params['iced_matrix_file'] = ''
			hicplotter_params['this_plot_title'] = ''
			hicplotter_params['bed_file'] = ''
			hicplotter_params['umbrella_config'] = config
			raw_folder, iced_folder = hicpro_matrix_foldername(config, idx)
			for res in config['pHDee.resolutions']:
				hicplotter_params['resolution'] = res
				input_filenames_prefix = os.path.join( iced_folder, \
							str(res), \
							config['pHDee.inner_sample_names'][idx] )
				# iced matrix file
				hicplotter_params['iced_matrix_file'] = '_'.join([ \
						input_filenames_prefix, str(res), 'iced.matrix'])
				# plot title
				hicplotter_params['this_plot_title'] = '_'.join([ \
						get_identifier(config, idx), str(res) ])
				# bed file
				hicplotter_params['bed_file'] = '_'.join([ \
						input_filenames_prefix.replace('iced', 'raw'), \
						str(res), 'ord.bed'])
				#
				if config['pHDee.hicplotter.given_chromosome'] != '':
					logger.info("Plotting matrix for given chromosome")
					call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
						config['pHDee.hicplotter.given_chromosome'], \
						hicplotter_params)
				# if whole_genome is set, given_chr is overwritten and set to chrY
				if ast.literal_eval(config['pHDee.hicplotter.whole_genome']):
					logger.info("Plotting whole genome matrix")
					config['pHDee.hicplotter.given_chromosome'] = 'chrY'
					call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
						config['pHDee.hicplotter.given_chromosome'], \
						hicplotter_params)
				# if per chromosome intra-chromosomal heatmaps are to be plotted
				if ast.literal_eval(config['pHDee.hicplotter.per_chromosome']):
					# iterate with chr set to each chromosome and call; wholeGenome to False
					logger.info("Plotting matrices per chromosome")
					for thisChr in chromosomes:
						call_HiCPlotter(config['pHDee.hicplotter.whole_genome'], \
						thisChr, \
						hicplotter_params)
						logger.info("{}".format(thisChr))
	return

def create_files_for_juicebox(config):
	"""
	Convert HiC-Pro allValidPairs files into
	juicebox format for viewing
	@param: config
	"""
	logger = logging.getLogger(pipeline_run_name)
	logger.info("Running conversion of files for juicebox")
	with open(config['pHDee.hicpro.config_filename'], 'r') as f:
		for line in f.readlines():
			#if line.split('=')[0].strip() == 'GENOME_SIZE':
			#	genome_size_file = config['pHDee.hicpro.path'] + '/annotation/' \
			#		+ line.strip().split('=')[1].strip()
			#	# should use a file named hg19.chrom.sizes
			if line.split('=')[0].strip() == 'GENOME_FRAGMENT':
				restriction_fragment_file =  config['pHDee.hicpro.path'] + '/annotation/' \
					+ line.strip().split('=')[1].strip()
	#
	for idx in range(len(config['pHDee.outer_sample_names'])):
		hicpro_input_folder, hicpro_output_folder = create_hicpro_foldernames(config, idx)
		logger.debug(hicpro_output_folder)
		#
		if ast.literal_eval(config['pHDee.juicebox.independent_folder']):
			results_folder = config['pHDee.juicebox.output_folder']
		else:
			results_folder = os.path.join( hicpro_output_folder, config['pHDee.juicebox.output_folder'] )
		tmp_folder = os.path.join( results_folder, config['pHDee.juicebox.temp_folder'] )
		if os.path.exists( results_folder ) is False:
			if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False:
				logger.info("Creating folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \
                                	hicpro_output_folder))
			else:
				logger.info("Creating folder '{}'".format(config['pHDee.juicebox.output_folder']))
			os.mkdir(results_folder)
			os.mkdir(tmp_folder)
		else:
			if ast.literal_eval(config['pHDee.juicebox.independent_folder']) is False:
				logger.info("Using existing folder '{}' at '{}'".format(config['pHDee.juicebox.output_folder'], \
                                	hicpro_output_folder))
			else:
				logger.info("Using existing folder '{}'".format(config['pHDee.juicebox.output_folder']))
		#
		# #INFO: Also copy allValidPairs file
		#logger.info("Copying allValidPairs file")
		# #INFO: If the allValidPairs is in the same folder (which was the case after copyiing them),
		# use them from the current location. Thus, filename is changed from the more generic
		# HiC-Pro named version to the more specific one used by/identified by this pipeline
		#
		#input_filename = os.path.join(hicpro_output_folder, \
		#		'hic_results', 'data', \
		#		config['pHDee.inner_sample_names'][idx], \
		#		config['pHDee.inner_sample_names'][idx] \
		#		+ config['pHDee.hicpro.allValidPairs_suffix'] )
		#
		#shutil.copyfile(input_filename, '/'.join([ results_folder, \
		#					get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']]))
		input_filename = '/'.join([results_folder, \
					get_identifier(config, idx) + config['pHDee.hicpro.allValidPairs_suffix']])
		#
		output_filename = '/'.join([ results_folder, \
					get_identifier(config, idx) ])
		logger.info("Creating '{}'".format(output_filename+'.hic'))
		logger.info("Using '{}' as input file".format(input_filename))
		logger.info("Using '{}' as temp folder".format(tmp_folder))
		logger.info("Using '{}' for reading genome size".format(config['pHDee.juicebox.genome_size_file']))
		logger.info("Using '{}' for restriction fragments".format(restriction_fragment_file))
		#
		hicpro2juicebox_cmd = ['bash', config['pHDee.juicebox.hicpro2juicebox_script'], \
					'-i', input_filename, \
					'-g', config['pHDee.juicebox.genome_size_file'], \
					'-j', config['pHDee.juicebox.jar_file'] , \
					'-r', restriction_fragment_file, \
					'-t', tmp_folder,
					'-o', output_filename
					]
		logger.debug("# Run hicpro2juicebox script")
		logger.debug(' '.join(hicpro2juicebox_cmd))
		if config['no_execute'] is False:
			err_log = open(config['pHDee.err_log'], "a")
			cmd_log = open(config['pHDee.juicebox.script_output_file'], "a")
			subprocess.call(hicpro2juicebox_cmd, stdout=cmd_log, stderr=err_log, shell=False)
			cmd_log.close()
			err_log.close()
			logger.info("You can now use the '{}.hic' file with juicebox on desktop or on the web browser".format(output_filename))
	return

def do_differential_comparison(config):
	"""
	Perform a differential comparison between matrices
	using HiCcompare. Use raw matrices since HiCcompare
	jointly normalizes them.
	@param	config
	Output generated includes
	-- PDFs of MD plots?
	-- PDFs of distance-normalization plots?
	-- TSV file giving statistics on number
	   of total interactions, those with p-val < 0.01,
	   0.01 > = p-val < 0.05 and
	   p-val > 0.05 (one file for all chromosomes)
	-- CSV file listing all interactions with p-val < 0.01,
	   0.01 > = p-val < 0.05
	"""
	logger = logging.getLogger(pipeline_run_name)
	logger.info("Running differential comparison using HiCcompare")
	if config['no_execute'] is False:
		if os.path.exists(os.path.join(config['pHDee.output_folder'], \
                                        config['pHDee.hiccompare.output_folder'])) is False:
			logger.info("Creating folder '{}' at '{}'".format(\
                                config['pHDee.hiccompare.output_folder'], \
                                config['pHDee.output_folder']))
			os.mkdir( os.path.join(config['pHDee.output_folder'], \
					config['pHDee.hiccompare.output_folder']) )
		else:
			logger.info("Using existing folder '{}' at '{}'".format(\
                                config['pHDee.hiccompare.output_folder'], \
                                config['pHDee.output_folder']))
	for g in range(len(config['pHDee.hiccompare.group_id'])):
		logger.info("Comparisons for group '{}'".format(\
                        config['pHDee.hiccompare.group_id'][g]))
		# A folder for each group
		group_foldername = os.path.join(config['pHDee.output_folder'], \
                                config['pHDee.hiccompare.output_folder'], \
                                config['pHDee.hiccompare.group_id'][g])
		csv_suffix = config['pHDee.hiccompare.csv_folder']
		#
		mdplot_suffix = config['pHDee.hiccompare.md_plots_folder']
		#
		dnplot_suffix = config['pHDee.hiccompare.dist_norm_plots_folder']
		if config['no_execute'] is False:
			if os.path.exists(group_foldername) is False:
				logger.info("Creating folder for group '{}'".format(\
                                        config['pHDee.hiccompare.group_id'][g]))
				# Create folder for group with separate folders
				# for MD plots and distance normalization plots
				os.makedirs(os.path.join(group_foldername, \
							mdplot_suffix))
				# Now folder for csv files
				os.mkdir(os.path.join(group_foldername,\
							csv_suffix))
				# Now folder for distance normalization plots
				os.mkdir(os.path.join(group_foldername,\
							dnplot_suffix))
			#
			else:
				logger.info("Using existing folder for group '{}'".format(\
                                        config['pHDee.hiccompare.group_id'][g]))
				# Folder for CSV files
				if os.path.exists(os.path.join(group_foldername, \
							csv_suffix)) is False:
					logger.info("Creating folder for CSV files")
					os.mkdir(os.path.join(group_foldername,\
							csv_suffix))
				else:
					logger.info("Using existing folder for CSV files")
				# Folder for MD plots
				if os.path.exists(os.path.join(group_foldername, \
							mdplot_suffix)) is False:
					logger.info("Creating folder for MD plots")
					os.mkdir(os.path.join(group_foldername,\
							mdplot_suffix))
				else:
					logger.info("Using existing folder for MD plots")
				# Folder for distance normalization plots
				if os.path.exists(os.path.join(group_foldername,\
							dnplot_suffix)) is False:
					logger.info("Creating folder for distance normalization plots")
					os.mkdir(os.path.join(group_foldername,\
							dnplot_suffix))
				else:
					logger.info("Using existing folder for distance normalization plots")
		#
		for r in range(len(config['pHDee.hiccompare.resolutions'])):
			# Read Name1, Name2 and groupID from groupInfo file
			with open(config['pHDee.hiccompare.groups_info_file'], "r") as f:
				groupInfo_lines = [x.strip() for x in f.readlines() if x.find("#") == -1]
			groupIDs = []
			INames = []
			ONames = []
			base_locations = []
			for line in groupInfo_lines:
				[groupID, OName, IName, base_location, uniqueValInt] = [x.strip() for x in line.split("\t")]
				if(groupID == config['pHDee.hiccompare.group_id'][g]):
					groupIDs.append(groupID)
					INames.append(IName)
					ONames.append(OName)
					base_locations.append(base_location)
			#
			for aIDX in range(0,len(ONames)):
				n1 = ONames[aIDX]
				mf1 = os.path.join(base_locations[aIDX], \
						ONames[aIDX] + \
						config['pHDee.hicpro.output_folder_suffix'], \
						"hic_results", "matrix", INames[aIDX], "raw", \
						str(config['pHDee.hiccompare.resolutions'][r]), \
						INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix")
				#
				bf1 = os.path.join(base_locations[aIDX], \
                                                ONames[aIDX] + \
                                                config['pHDee.hicpro.output_folder_suffix'],\
                                                "hic_results", "matrix", INames[aIDX], "raw",\
                                                str(config['pHDee.hiccompare.resolutions'][r]),
						INames[aIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed")
				#
				for bIDX in range(aIDX, len(ONames)):
					if ONames[aIDX] != ONames[bIDX]:
						n2 = ONames[bIDX]
						mf2 = os.path.join(base_locations[aIDX], \
							ONames[bIDX] + \
							config['pHDee.hicpro.output_folder_suffix'], \
							"hic_results", "matrix", INames[bIDX], "raw", \
							str(config['pHDee.hiccompare.resolutions'][r]), \
							INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + ".matrix")
						#
						bf2 = os.path.join(base_locations[aIDX], \
							ONames[bIDX] + \
							config['pHDee.hicpro.output_folder_suffix'],\
							"hic_results", "matrix", INames[bIDX], "raw",\
							str(config['pHDee.hiccompare.resolutions'][r]),
							INames[bIDX] + '_' + str(config['pHDee.hiccompare.resolutions'][r]) + "_ord.bed")
						#
						diff_comparison_cmd = [ 'Rscript', \
							'scripts/make_differential_comparisons/compare_contact_matrices_for_chromosomes.R', \
							# Name1 and Name2
							n1, n2, \
							# resolution
							str(config['pHDee.hiccompare.resolutions'][r]), \
							# matfile, bedfile / 1 and 2
							mf1, bf1, \
							mf2, bf2, \
							# result_location
							os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder']), \
							# groupID
							groupIDs[0], \
							# rOut file
							os.path.join(config['pHDee.output_folder'], config['pHDee.hiccompare.output_folder'],
							"output_comparing_contact_matrices_group"+ config['pHDee.hiccompare.group_id'][g] + "_" + \
							str(config['pHDee.hiccompare.resolutions'][r])+".rOut")
							]
						logger.info("# Running differential comparison")
						logger.info("Resolution: {}".format(\
                                                                str(config['pHDee.hiccompare.resolutions'][r])))
						logger.debug(' '.join(diff_comparison_cmd))
						if config['no_execute'] is False:
							logger.debug("Resolution: {}".format(\
								str(config['pHDee.hiccompare.resolutions'][r])))
							err_log = open(config['pHDee.err_log'], "a")
							subprocess.call(diff_comparison_cmd, stderr=err_log, shell=False)
							err_log.close()
	return

def setup_stages_for_pipeline_to_run(args):
	"""
	Manage the stages to run
	"""
	logger = logging.getLogger(pipeline_run_name)
	stages_to_run = {'do_all': False, \
			'do_hicpro': False, \
                        'do_fithic': False, \
                        'do_hicplotter': False, \
                        'do_juicebox': False, \
                        'do_differential': False, \
                        'suggest_resolution': False
                        }

	given_stages = args.stage
	# Check if all stages are specified using valid keywords
	for given_stage in given_stages:
		if given_stage not in stages_to_run.keys():
			logger.error("'{}' is not a valid keyword for stage. \nValid stages: {}".format(given_stage, ', '.join(stages_to_run.keys() + ['do_all'] )))
			log_pipeline_end_message("ERROR")
			sys.exit(1)
	logger.info("Stage(s) to run: {}".format(', '.join(given_stages)))
	for given_stage in given_stages:
		if given_stage == 'do_all':
			stages_to_run['do_hicpro'] = True
			stages_to_run['do_fithic'] = True
			stages_to_run['do_juicebox'] = True
		elif given_stage == 'do_hicpro':
			stages_to_run['do_hicpro'] = True
		elif given_stage == 'do_fithic':
			stages_to_run['do_fithic'] = True
		elif given_stage == 'do_juicebox':
			stages_to_run['do_juicebox'] = True
		elif given_stage == 'do_hicplotter':
			stages_to_run['do_hicplotter'] = True
		elif given_stage == 'do_differential':
			stages_to_run['do_differential'] = True
	return stages_to_run

def handle_logging():
	"""
	"""
	#set up logging to file and console, both
	rand_id = randint(1000, 9999)
	log_filename = 'pHDee-' + str(rand_id) +'.log'
	global pipeline_run_name
	pipeline_run_name = 'pHDee-' + str(rand_id)
	logging.basicConfig(level=logging.DEBUG, format='\n%(asctime)s  - %(name)s - %(levelname)s - %(message)s',\
	            datefmt='%Y-%m-%d %H:%M:%S', filename=log_filename, filemode='a')
	# define a Handler which writes INFO messages or higher to the sys.stderr
	console = logging.StreamHandler()
	console.setLevel(logging.INFO)
	# format
	formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
	console.setFormatter(formatter)
	# add console handler
	logging.getLogger('').addHandler(console)
	# get a logger
	logger = logging.getLogger(pipeline_run_name)
	return logger


def main():
	"""
	Main function
	"""
	logger = handle_logging()

	logger.info("---- Pipeline pHDee begins ----")
	logger.info("Log file: {}.log".format(pipeline_run_name))
	stages_to_run = setup_stages_for_pipeline_to_run(args)

	# Read the config file; check assertions
	config = read_config_file(args.config_file)
	assert_config_options(config, stages_to_run)
	if args.no_execute is True:
		config['no_execute'] = True

	# Does this ensure the order?
	if stages_to_run['do_hicpro']:
		# Run HiC-Pro
		run_hicpro(config)
		logger.info("do_hicpro [DONE].")
		#
	if stages_to_run['do_fithic']:
		# Run Fit-HiC
		run_fithic(config)
		logger.info("do_fithic [DONE].")
		#
	if stages_to_run['do_hicplotter']:
		# 1. User should ensure that hicpro completed successfully
		# 2. Run HiCPlotter
		plot_with_hicplotter(config)
		logger.info("do_hicplotter [DONE].")
		#
	if stages_to_run['do_juicebox']:
		# 1. Check that hicpro run is completed
		# 2. Convert to juicebox format
		create_files_for_juicebox(config)
		logger.info("do_juicebox [DONE].")
		#
	if stages_to_run['do_differential']:
		# Perform differential comparisons
		do_differential_comparison(config)
		logger.info("do_differential [DONE].")
		#
	if stages_to_run['suggest_resolution']:
		# TODO
		# Juicer tools has a shell script to do this, but it
		# requires the Juicer-specific merge_nodups.txt file
		logger.info("suggest_resolution [TODO].")
	log_pipeline_end_message("SUCCESS")
	logging.shutdown()


if __name__ == '__main__':
	main()