weighted_MEA/haystack_mea.py
# python modules
import multiprocessing as mp
from functools import partial
import uuid
import subprocess as sb
import os
import sys
import random
import argparse
import ntpath
import shutil
from collections import defaultdict
import codecs
try:
    import cPickle as cp
except ImportError:
    import pickle as cp
# common functions
from haystack_common import determine_path, which, check_file, initialize_genome, HAYSTACK_VERSION
# dependencies
from bioutilities import Coordinate, Sequence, Fimo
import numpy as np
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(levelname)-5s @ %(asctime)s:\n\t %(message)s \n',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    stream=sys.stderr,
                    filemode="w")
error = logging.critical
warn = logging.warning
debug = logging.debug
info = logging.info
np.seterr(divide='ignore')
np.seterr(invalid='ignore')
from scipy import stats
from scipy.misc import factorial
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as pl
pl.rc('font', **{'size': 18})
from jinja2 import Environment, FileSystemLoader
# external
from external import estimate_qvalues, generate_weblogo
####SUPPORT CLASSES AND FUNCTIONS#################################################################

def smooth(x, window_len=200, window='hanning'):
    # reflect the signal at both ends, convolve with the chosen window and
    # return the central part of the smoothed signal
    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
    s = np.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]]
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        w = getattr(np, window)(window_len)
    y = np.convolve(w / w.sum(), s, mode='valid')
    return y[window_len / 2:-window_len / 2 + 1]
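
# Note: with an even window_len the slice above returns an array of the same
# length as the input profile (e.g. a 5000-entry profile stays 5000 entries long).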

def generate_motif_profile(profile_target, profile_bg, motif_id, output_filename, smooth_size=200, window_size=5000):
    s_smoothed = smooth(profile_target, smooth_size)
    # bg_mean=profile_bg.mean()
    # fc_s_bg=s_smoothed/bg_mean
    #
    bg_smoothed = smooth(profile_bg, smooth_size)
    fc_s_bg = (s_smoothed + 0.1) / (bg_smoothed + 0.1)
    #
    fig = pl.figure(figsize=(9, 8))
    ax = fig.add_subplot(111)
    ax.plot(range(-window_size / 2, window_size / 2), fc_s_bg, '-g', linewidth=3)
    ax.hold(True)
    ax.plot([-window_size / 2, window_size / 2], [1, 1], '--k')
    pl.title(motif_id)
    pl.xlabel('bp')
    pl.ylabel('Fold change')
    fig.savefig(output_filename)
    fig.savefig(output_filename.replace('.png', '.pdf'))
    pl.close()

def combine_pvalues(x):
    # k=x.prod()
    k = np.exp(np.log(x).sum())
    p = 0
    for i in range(len(x)):
        p += np.power(-np.log(k), i) / factorial(i)
    # print k --i
    # print p
    return k * p
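
# combine_pvalues() returns the exact tail probability of a product of
# independent uniform p-values (Fisher's product method):
# P(prod <= k) = k * sum_{i=0}^{n-1} (-ln k)^i / i!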

def sample_wr(population, k):
    "Chooses k random elements (with replacement) from a population"
    n = len(population)
    _random, _int = random.random, int  # speed hack
    result = [None] * k
    for i in xrange(k):
        j = _int(_random() * n)
        result[i] = population[j]
    return result

class constant_missing_dict(dict):
    def __missing__(self, key):
        return 0

def pickable_defaultdict(t=list):
    return defaultdict(t)

def calculate_average_ngram_presence(coordinates, genome, ngram):
    ngram_presence = np.zeros(len(coordinates))
    ngram_reversed = Sequence.reverse_complement(ngram)
    for idx, c in enumerate(coordinates):
        seq = genome.extract_sequence(c)
        ngram_presence[idx] = (seq.count(ngram) + seq.count(ngram_reversed)) / float(len(c))
    return ngram_presence
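
# calculate_average_ngram_presence() counts the ngram on both strands of each
# region and normalizes by the region length; with ngram='g' (reverse complement
# 'c') this gives the per-region C+G density used below for background matching.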

def get_random_coordinates(coords, genome):
    random_coords = []
    for c in coords:
        random_bpstart = np.random.randint(1, genome.chr_len[c.chr_id] - len(c) + 1)
        random_coords.append(Coordinate(c.chr_id, random_bpstart, random_bpstart + len(c) - 1))
    return random_coords

def call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename, nucleotide_bg_filename, temp_directory,
              p_value, motif_id):
    output_filename = os.path.join(temp_directory, '%s_%s.motifs' % (prefix, motif_id))
    FNULL = open(os.devnull, 'w')
    if motif_id == 'ALL_MOTIFS':
        sb.call('fimo --verbosity 0 --thresh %f --text --bgfile "%s" "%s" "%s" | sed "1d" | cut -f1,2,3,4 > "%s" 2>/dev/null' %
                (p_value, nucleotide_bg_filename, meme_motifs_filename, target_coords_fasta_filename, output_filename),
                stdout=FNULL,
                stderr=sb.STDOUT,
                shell=True)
    else:
        sb.call('fimo --verbosity 0 --motif %s --thresh %f --text --bgfile "%s" "%s" "%s" | sed "1d" | cut -f1,2,3,4 > "%s" 2>/dev/null' %
                (motif_id, p_value, nucleotide_bg_filename, meme_motifs_filename, target_coords_fasta_filename, output_filename),
                stdout=FNULL,
                stderr=sb.STDOUT,
                shell=True)
    return output_filename

def parallel_fimo_scanning(target_coords,
                           meme_motifs_filename,
                           genome, nucleotide_bg_filename,
                           temp_directory,
                           p_value,
                           mask_repetitive,
                           window_length,
                           internal_window_length,
                           num_consumers):
    fimo = Fimo(meme_motifs_filename,
                nucleotide_bg_filename,
                temp_directory=temp_directory,
                p_value=p_value)
    # init variables
    prefix = 'haystack_motifs_' + str(uuid.uuid4())
    motifs_profiles_in_sequences = dict()
    idxs_seqs_with_motif = dict()
    motif_coords_in_seqs_with_motif = dict()
    # extend with flanking
    original_target_coords = target_coords
    if window_length:
        internal_bpstart = window_length / 2 - internal_window_length / 2
        internal_bpend = window_length / 2 + internal_window_length / 2
        target_coords = Coordinate.coordinates_of_intervals_around_center(target_coords, window_length)
    # write fasta
    target_coords_fasta_filename = os.path.join(temp_directory, prefix + '.fa')
    Coordinate.coordinates_to_fasta(target_coords, target_coords_fasta_filename, genome)
    # mapping
    coord_to_idx = dict()
    for idx, c in enumerate(target_coords):
        coord_to_idx[str(c).split()[0]] = idx
    for motif_id in fimo.motif_ids:
        # all extended coordinates have the same length, so len(c) equals the profile length
        motifs_profiles_in_sequences[motif_id] = np.zeros(len(c))
        idxs_seqs_with_motif[motif_id] = set()
        motif_coords_in_seqs_with_motif[motif_id] = pickable_defaultdict()
    motifs_in_sequences_matrix = np.zeros((len(target_coords), len(fimo.motif_ids)))
    # num_consumers= num_consumers -2
    # compute motifs with fimo
    if num_consumers > 1:
        # partial function for multiprocessing
        compute_single_motif = partial(call_fimo, target_coords_fasta_filename, prefix, meme_motifs_filename,
                                       nucleotide_bg_filename, temp_directory, p_value)
        pool = mp.Pool(processes=num_consumers)
        pool.map(compute_single_motif, fimo.motif_ids)
        pool.close()
        pool.join()
        fimo_output_filename = os.path.join(temp_directory, prefix + '_fimo_output.motifs')
        sb.call('cat %s*.motifs > "%s"' % (os.path.join(temp_directory, prefix), fimo_output_filename), shell=True)
    else:
        call_fimo(target_coords_fasta_filename, prefix, meme_motifs_filename, nucleotide_bg_filename, temp_directory,
                  p_value, 'ALL_MOTIFS')
        fimo_output_filename = os.path.join(temp_directory, '%s_%s.motifs' % (prefix, 'ALL_MOTIFS'))
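    # Each line of the FIMO output (after the "cut -f1,2,3,4" above) contains:
    # motif id, sequence/coordinate name, hit start, hit end.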
    with open(fimo_output_filename) as infile:
        for line in infile:
            try:
                motif_id, motif_coord, motif_start, motif_end = line.split()
                motif_start = int(motif_start)
                motif_end = int(motif_end)
                idx_seq = coord_to_idx[motif_coord]
                motifs_profiles_in_sequences[motif_id][motif_start:motif_end] += 1.0
                if motif_start >= internal_bpstart and motif_end <= internal_bpend:  # keep track only if the hit is in the internal window!
                    idxs_seqs_with_motif[motif_id].add(idx_seq)
                    motifs_in_sequences_matrix[idx_seq, fimo.motif_id_to_index[motif_id]] = 1
                    motif_coords_in_seqs_with_motif[motif_id][original_target_coords[idx_seq]].append(
                        (motif_start + target_coords[idx_seq].bpstart - 1,
                         motif_end + target_coords[idx_seq].bpstart - 1))
            except:
                print line
    sb.call('rm %s* ' % os.path.join(temp_directory, prefix), shell=True)
    return motifs_in_sequences_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, fimo.motif_names, fimo.motif_ids

def get_target_motifs_filepaths(target_motifs_filepaths_file):
    # check folder or sample filename
    if not os.path.exists(target_motifs_filepaths_file):
        error("The file or folder %s doesn't exist. Exiting." %
              target_motifs_filepaths_file)
        sys.exit(1)
    if os.path.isfile(target_motifs_filepaths_file):
        specific_regions_filenames = []
        bg_regions_filenames = []
        sample_names = []
        with open(target_motifs_filepaths_file) as infile:
            for line in infile:
                if not line.strip():
                    continue
                if line.startswith('#'):  # skip optional header or comment lines
                    info('Skipping header/comment line: %s' % line)
                    continue
                fields = line.strip().split()
                n_fields = len(fields)
                if n_fields == 2:
                    sample_names.append(fields[0])
                    specific_regions_filenames.append(fields[1])
                    bg_regions_filenames.append("random_background")
                elif n_fields == 3:
                    sample_names.append(fields[0])
                    specific_regions_filenames.append(fields[1])
                    bg_regions_filenames.append(fields[2])
                else:
                    error('The samples file format is wrong!')
                    sys.exit(1)
    # check all the files before starting
    info('Checking files location...')
    for specific_regions_filename in specific_regions_filenames:
        check_file(specific_regions_filename)
    if n_fields == 3:
        # check all the background files before starting
        info('Checking files location...')
        for bg_regions_filename in bg_regions_filenames:
            check_file(bg_regions_filename)
    return sample_names, specific_regions_filenames, bg_regions_filenames

def get_args_motif():
    # mandatory
    parser = argparse.ArgumentParser(description='Haystack Motifs Parameters')
    parser.add_argument('bed_target_filename', type=str,
                        help='A bed file containing the target coordinates on the genome of reference')
    parser.add_argument('genome_name', type=str, help='Genome assembly to use from UCSC (for example hg19, mm9, etc.)')
    # optional
    parser.add_argument('--bed_bg_filename', type=str,
                        help='A bed file containing the background coordinates on the genome of reference (default: regions randomly sampled from the genome)',
                        default='random_background')
    parser.add_argument('--meme_motifs_filename', type=str,
                        help='Motifs database in MEME format (default: JASPAR CORE 2016)')
    parser.add_argument('--nucleotide_bg_filename', type=str,
                        help='Nucleotide background probabilities in MEME format (default: precomputed on the genome)')
    parser.add_argument('--p_value', type=float,
                        help='FIMO p-value for calling a motif hit significant (default: 1e-4)', default=1e-4)
    parser.add_argument('--no_c_g_correction', help='Disable the matching of the C+G density of the background',
                        action='store_true')
    parser.add_argument('--c_g_bins', type=int, help='Number of bins for the C+G density correction (default: 8)',
                        default=8)
    parser.add_argument('--mask_repetitive', help='Mask repetitive sequences', action='store_true')
    parser.add_argument('--n_target_coordinates', type=int, help='Number of target coordinates to use (default: all)',
                        default=np.inf)
    parser.add_argument('--use_entire_bg',
                        help='Use the entire background file (use only when the C+G correction is disabled)',
                        action='store_true')
    parser.add_argument('--bed_score_column', type=int,
                        help='Column in the bed file that represents the score (default: 5)', default=5)
    parser.add_argument('--bg_target_ratio', type=int, help='Background size / target size ratio (default: 2)',
                        default=2)
    parser.add_argument('--bootstrap',
                        help='Enable bootstrapping if the target set or the background set is too small (default: False)',
                        action='store_true')
    parser.add_argument('--temp_directory', help='Directory to store temporary files (default: /tmp)', default='/tmp')
    parser.add_argument('--no_random_sampling_target',
                        help='Select the best --n_target_coordinates using the score column from the target file instead of selecting them randomly',
                        action='store_true')
    parser.add_argument('--name', help='Define a custom output filename for the report', default='')
    parser.add_argument('--internal_window_length', type=int,
                        help='Window length in bp for the enrichment (default: average length of the target sequences)')
    parser.add_argument('--window_length', type=int,
                        help='Window length in bp for the profiler (default: internal_window_length*5)')
    parser.add_argument('--min_central_enrichment', type=float,
                        help='Minimum central enrichment to report a motif (default: 1.0)', default=1.0)
    parser.add_argument('--disable_ratio', help='Disable the target/bg ratio filter', action='store_true')
    parser.add_argument('--dump', help='Dump all the intermediate data (default: False)',
                        action='store_true')
    parser.add_argument('--output_directory', type=str, help='Output directory (default: current directory)',
                        default='')
    parser.add_argument('--smooth_size', type=int,
                        help='Size in bp for the smoothing window (default: internal_window_length/5)')
    parser.add_argument('--gene_annotations_filename', type=str,
                        help='Optional gene annotations file from the UCSC Genome Browser in bed format to map each region to its closest gene')
    parser.add_argument('--gene_ids_to_names_filename', type=str,
                        help='Optional mapping file between gene ids and gene names (relevant only if --gene_annotations_filename is used)')
    parser.add_argument('--n_processes', type=int,
                        help='Number of processes to use (default: min(4, number of available cores))',
                        default=min(4, mp.cpu_count()))
    parser.add_argument('--version', help='Print version and exit.', action='version',
                        version='Version %s' % HAYSTACK_VERSION)
    return parser
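
# Example invocation (illustrative only; the bed file names are placeholders):
#   python haystack_mea.py target_peaks.bed hg19 --bed_bg_filename control_peaks.bed --p_value 1e-4 --dump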
###############################################################################

def main(input_args=None):
    print '\n[H A Y S T A C K M O T I F S]'
    print '\n-MOTIF ENRICHMENT ANALYSIS- [Luca Pinello - lpinello@jimmy.harvard.edu]\n'
    print 'Version %s\n' % HAYSTACK_VERSION
    bootstrap = False
    ngram_correction = 'g'
    parser = get_args_motif()
    args = parser.parse_args(input_args)
    args.n_processes = max(1, args.n_processes - 1)
    args_dict = vars(args)
    # promote each parsed argument to a local variable with the same name
    for key, value in args_dict.items():
        if key == 'n_target_coordinates':
            n_target_coordinates = value
        else:
            exec ('%s=%s' % (key, repr(value)))
    bed_score_column -= 1
    c_g_correction = not no_c_g_correction
    random_sampling_target = not no_random_sampling_target
    check_file(bed_target_filename)
    if not bed_bg_filename == 'random_background':
        check_file(bed_bg_filename)
    if meme_motifs_filename:
        check_file(meme_motifs_filename)
    else:
        meme_motifs_filename = os.path.join(determine_path('motif_databases'), 'JASPAR_CORE_2016_vertebrates.meme')
    annotation_directory = determine_path('gene_annotations')
    if gene_annotations_filename:
        if which('java') is None:
            error('The mapping to the closest gene requires Java, freely available from: http://java.com/en/download/')
            use_gene_annotations = False
        else:
            check_file(gene_annotations_filename)
            info('Using %s as gene annotations file' % gene_annotations_filename)
            use_gene_annotations = True
    else:
        gene_annotations_filename = os.path.join(annotation_directory, '%s_genes.bed' % genome_name)
        gene_ids_to_names_filename = os.path.join(annotation_directory, '%s_genes_id_to_names' % genome_name)
        if os.path.exists(gene_annotations_filename) and os.path.exists(gene_ids_to_names_filename):
            use_gene_annotations = True
        else:
            use_gene_annotations = False
            info('No gene annotations file specified')
    genome, _, nucleotide_bg_filename = initialize_genome(genome_name)
    target_name = ntpath.basename(bed_target_filename.replace('.bed', ''))
    bg_name = ntpath.basename(bed_bg_filename.replace('.bed', ''))
    # timestamp=(datetime.datetime.now().isoformat()[:-3].replace('T','(')+str(np.random.randint(10000))+')').replace(':','.')
    if name:
        directory_name = 'HAYSTACK_MOTIFS_on_' + name
    else:
        directory_name = 'HAYSTACK_on_' + target_name + '_VS_' + bg_name
    if output_directory:
        output_directory = os.path.join(output_directory, directory_name)
    else:
        output_directory = directory_name
    info('###PARAMETERS USED###\n\t\t -TARGET: %s \n\t\t -BACKGROUND: %s \n\t\t -BG_TARGET_RATIO: %s\n\t\t -C+G CORRECTION: %s\n\t\t -MASKING REPETITIVE: %s\n\t\t -COORDINATES TO ANALYZE: %s\n\t\t -OUTPUT DIRECTORY: %s\n'
         % (bed_target_filename, bed_bg_filename, str(bg_target_ratio), str(c_g_correction), str(mask_repetitive),
            'ALL' if np.isinf(n_target_coordinates) else str(n_target_coordinates), output_directory))
    N_TARGET = None
    N_BG = None
    COMMAND_USED = ' '.join(sys.argv)
    _n_target_coordinates = n_target_coordinates
    info('Loading Target coordinates from bed: %s' % bed_target_filename)
    target_coords = Coordinate.bed_to_coordinates(bed_target_filename, cl_score=bed_score_column)
    if len(target_coords) == 0:
        info('No coordinates to analyze in your input file. Exiting.')
        sys.exit(1)
    # automatically calculate the average length of the target regions
    if internal_window_length:
        info('Using the user defined internal window length: %d' % internal_window_length)
        if internal_window_length % 2:
            internal_window_length += 1
    else:
        # target_coords is a list of Coordinate objects, so this is the average region length
        internal_window_length = int(np.mean(map(len, target_coords)))
        if internal_window_length % 2:
            internal_window_length += 1
        info('Using the average length of target coordinates as internal window length: %d' % internal_window_length)
    if not window_length:
        window_length = internal_window_length * 5
        info('Total window length: %d' % window_length)
    if not smooth_size:
        smooth_size = internal_window_length / 5
    target_coords = Coordinate.coordinates_of_intervals_around_center(target_coords, internal_window_length)
    if len(target_coords) > n_target_coordinates:
        if random_sampling_target:
            info('Sampling %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            target_coords = random.sample(target_coords, n_target_coordinates)
        else:
            info('Selecting the best %d coordinates among the %d total' % (n_target_coordinates, len(target_coords)))
            sorted_idxs_by_score = np.argsort([c.score for c in target_coords])[::-1]
            target_coords = [target_coords[idx] for idx in sorted_idxs_by_score[:n_target_coordinates]]
    else:
        if random_sampling_target and bootstrap and not np.isinf(n_target_coordinates):
            warn('Number of target regions < %d' % n_target_coordinates)
            info('bootstrapping to obtain enough target regions')
            target_coords = sample_wr(target_coords, n_target_coordinates)
        else:
            info('Using all the %d target coordinates' % len(target_coords))
    info('Extracting Motifs in target coordinates')
    positive_matrix, motifs_profiles_in_sequences, idxs_seqs_with_motif, motif_coords_in_seqs_with_motif, motif_names, motif_ids = parallel_fimo_scanning(
        target_coords,
        meme_motifs_filename,
        genome, nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)
    n_target_coordinates = len(target_coords)  # fix for the bootstrap!
    if bed_bg_filename == 'random_background':
        info('Extracting Random Coordinates from the genome...')
        if c_g_correction:
            info('Calculating the C+G content of the target coordinates')
            bg_coords = []
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)
            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            # for each target region, draw random regions of the same length until one
            # falls in the same C+G bin (simple rejection sampling), bg_target_ratio times
            for _ in range(bg_target_ratio):
                for idx_c, c in enumerate(target_coords):
                    c_bin = np.nonzero(np.histogram(c_g_content_target[idx_c], bins)[0])[0][0]
                    c_random_bin = -1
                    while c_random_bin != c_bin:
                        random_bpstart = np.random.randint(1, genome.chr_len[c.chr_id] - len(c) + 1)
                        c_random = Coordinate(c.chr_id, random_bpstart, random_bpstart + len(c) - 1)
                        seq = genome.extract_sequence(c_random)
                        c_g_content_c_random = (seq.count('c') + seq.count('g')) / float(len(c))
                        c_random_bin = np.nonzero(np.histogram(c_g_content_c_random, bins)[0])[0][0]
                        # print bg_target_ratio,c_bin,c_random_bin, ' still to match:',len(target_coords)-idx_c
                    bg_coords.append(c_random)
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            debug('original: ' + str(np.histogram(c_g_content_target, bins)[0]))
            debug('obtained: ' + str(np.histogram(c_g_content_bg, bins)[0]))
        else:
            bg_coords = get_random_coordinates(target_coords, genome)
        info('Done!')
    else:
        info('Loading Background Coordinates from: %s' % bed_bg_filename)
        bg_coords = Coordinate.bed_to_coordinates(bed_bg_filename)
        bg_coords = Coordinate.coordinates_of_intervals_around_center(bg_coords, internal_window_length)
        if use_entire_bg:
            bg_target_ratio = float(len(bg_coords)) / n_target_coordinates
            info('Using all the coordinates in the BG, BG/TG: %f', bg_target_ratio)
        if c_g_correction:
            info('Calculating the C+G content')
            c_g_content_target = calculate_average_ngram_presence(target_coords, genome, ngram_correction)
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            info('Extract a Matching C+G Background')
            bins = np.hstack((np.linspace(0, 1, c_g_bins), np.inf))
            target_hist = np.histogram(c_g_content_target, bins)[0]
            bg_hist = np.histogram(c_g_content_bg, bins)[0]
            ratios = bg_hist / (target_hist * 1.0)
            debug('original:%s' % target_hist)
            debug('bg:%s' % bg_hist)
            debug('ratios:%s' % ratios)
            # K_MATCH: how many background regions can be kept per target region while
            # preserving the target C+G distribution (capped at bg_target_ratio)
            K_MATCH = min(bg_target_ratio, ratios[~np.isnan(ratios) & ~np.isinf(ratios) & (ratios > 0) & (
                target_hist / float(target_hist.sum()) > 0.05)].min())
            debug('K_MATCH:%d' % K_MATCH)
            to_match = np.int32(np.floor(K_MATCH * target_hist))
            debug('to_match:%s' % to_match)
            idxs_corrected_bg = np.array([], dtype=int)
            for idx_bin in range(len(bins) - 1):
                idxs_matching_regions = \
                    np.nonzero((c_g_content_bg >= bins[idx_bin]) & (c_g_content_bg < bins[idx_bin + 1]))[0]
                to_take = np.random.permutation(len(idxs_matching_regions))
                to_take = to_take[range(min(len(idxs_matching_regions), to_match[idx_bin]))]
                idxs_corrected_bg = np.hstack((idxs_corrected_bg, idxs_matching_regions[to_take]))
            debug('original:%s' % target_hist)
            debug('K:%d' % K_MATCH)
            debug('to sample:%s' % to_match)
            debug('obtained:%s' % np.histogram(c_g_content_bg[idxs_corrected_bg], bins)[0])
            bg_coords = [bg_coords[idx] for idx in idxs_corrected_bg]
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            debug(np.histogram(c_g_content_bg, bins)[0])
            if np.array_equal(K_MATCH * target_hist, np.histogram(c_g_content_bg, bins)[0]):
                info('C+G content perfectly matched!\n\ttarget:%s\n\tbg    :%s' % (
                    target_hist, np.histogram(c_g_content_bg, bins)[0]))
            else:
                warn('C+G content not perfectly matched\n\ttarget:%s\n\tbg    :%s' % (
                    target_hist, np.histogram(c_g_content_bg, bins)[0]))
                debug(target_hist / np.histogram(c_g_content_bg, bins)[0])
    if len(bg_coords) >= bg_target_ratio * n_target_coordinates:
        bg_coords = random.sample(bg_coords, int(bg_target_ratio * n_target_coordinates))
    else:
        if bootstrap and len(bg_coords) < (bg_target_ratio * n_target_coordinates * 0.95):  # allow a small tolerance!
            info('bootstrapping to obtain enough background regions')
            bg_coords = sample_wr(bg_coords, int(bg_target_ratio * n_target_coordinates))
            c_g_content_bg = calculate_average_ngram_presence(bg_coords, genome, ngram_correction)
            debug('After bootstrap:\n\ttarget:%s\n\tbg    :%s' % (target_hist, np.histogram(c_g_content_bg, bins)[0]))
    info('Extracting Motifs in background coordinates')
    negative_matrix, motifs_profiles_in_bg, idxs_seqs_with_motif_bg = parallel_fimo_scanning(
        bg_coords,
        meme_motifs_filename,
        genome,
        nucleotide_bg_filename,
        temp_directory=temp_directory,
        p_value=p_value,
        mask_repetitive=mask_repetitive,
        window_length=window_length,
        internal_window_length=internal_window_length,
        num_consumers=n_processes)[0:3]
    # allocate data for reports
    N_MOTIFS = len(motif_ids)
    rankings = np.zeros(N_MOTIFS, dtype=np.int16)
    motif_ratios = np.zeros(N_MOTIFS)
    support_p = np.zeros(N_MOTIFS)
    support_n = np.zeros(N_MOTIFS)
    fisher_p_values = np.zeros(N_MOTIFS)
    central_enrichment = np.zeros(N_MOTIFS)
    N_seq_p = positive_matrix.shape[0]  # number of target coords
    N_seq_n = negative_matrix.shape[0]  # number of background coords
    profile_presence_p = (positive_matrix > 0).sum(0)  # per motif, number of target regions with at least one hit
    profile_presence_n = (negative_matrix > 0).sum(0)
    support_p = profile_presence_p / float(N_seq_p)
    support_n = profile_presence_n / float(N_seq_n)
    internal_bpstart = window_length / 2 - internal_window_length / 2
    internal_bpend = window_length / 2 + internal_window_length / 2
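    # For each motif: Fisher's exact test on motif presence/absence in target vs
    # background regions, and central enrichment = mean hit density inside the
    # internal window divided by the mean hit density in the flanking regions.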
    for idx, motif_id in enumerate(motif_ids):
        fisher_p_values[idx] = stats.fisher_exact([[profile_presence_p[idx], N_seq_p - profile_presence_p[idx]],
                                                   [profile_presence_n[idx], N_seq_n - profile_presence_n[idx]]])[1]
        central_enrichment[idx] = motifs_profiles_in_sequences[motif_id][internal_bpstart:internal_bpend].mean() / \
            np.hstack([motifs_profiles_in_sequences[motif_id][:internal_bpstart],
                       motifs_profiles_in_sequences[motif_id][internal_bpend:]]).mean()
    motif_ratios = (support_p + 0.01) / (support_n + 0.01)
    # Fundamental!
    if not disable_ratio:
        motif_ratios[support_p < 0.03] = 1
    rankings = stats.rankdata(-motif_ratios)
    # keep only the motifs enriched in the target (ratio > 1), unless the filter is disabled
    if not disable_ratio:
        idxs_to_keep = np.nonzero(motif_ratios > 1)[0]
    else:
        idxs_to_keep = range(len(motif_ratios))
    rankings = rankings[idxs_to_keep]
    motif_ratios = motif_ratios[idxs_to_keep]
    support_p = support_p[idxs_to_keep]
    support_n = support_n[idxs_to_keep]
    fisher_p_values = fisher_p_values[idxs_to_keep]
    central_enrichment = central_enrichment[idxs_to_keep]
    motif_ids = [motif_ids[_] for _ in idxs_to_keep]
    motif_names = [motif_names[_] for _ in idxs_to_keep]
    motif_idxs = [_ for _ in idxs_to_keep]
    try:
        qvalues = estimate_qvalues(fisher_p_values)  # we test only the motifs with ratio > 1
    except:
        print fisher_p_values
    # qvalues=estimate_qvalues(fisher_p_values,m=len(motif_ids))
    ################################################################################
    # generate reports in html
    info('Generating HTML report...')
    imgs_directory = os.path.join(output_directory, 'images')
    genes_list_directory = os.path.join(output_directory, 'genes_lists')
    motif_regions_directory = os.path.join(output_directory, 'motifs_regions')
    # create folders
    if not os.path.exists(imgs_directory):
        os.makedirs(imgs_directory)
    if use_gene_annotations and not os.path.exists(genes_list_directory):
        os.makedirs(genes_list_directory)
    if not os.path.exists(motif_regions_directory):
        os.makedirs(motif_regions_directory)
    j2_env = Environment(loader=FileSystemLoader(determine_path('extra') + '/templates/'), trim_blocks=True)
    info('DIRECTORY:%s' % determine_path('extra') + '/templates/')
    template = j2_env.get_template('report_template.html')
    # copy haystack logo and background image
    shutil.copyfile(determine_path('extra') + '/templates/haystack_logo.png',
                    os.path.join(imgs_directory, 'haystack_logo.png'))
    shutil.copyfile(determine_path('extra') + '/templates/noise.png', os.path.join(imgs_directory, 'noise.png'))
    motifs_dump = []
    for i in np.argsort(rankings):
        if (support_p[i] >= 0.03 or disable_ratio) and fisher_p_values[i] < 0.01 and (
                motif_ratios[i] > 1 or disable_ratio) and central_enrichment[i] > min_central_enrichment:
            # if (support_p[i]>=0.01 or support_n[i]>=0.01) and fisher_p_values[i]<0.1 and (central_enrichment[i]>1.1 or central_enrichment[i]<0.9) and ( motif_ratios[i]>1.1 or motif_ratios[i]<0.9):
            info('Generating logo and profile for:' + motif_ids[i])
            # create motif logo
            img_logo = os.path.join(imgs_directory, 'logo_' + motif_ids[i])
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i])
            generate_weblogo(motif_ids[i], meme_motifs_filename, img_logo, title=motif_ids[i], file_format='pdf')
            # fix the weblogo prefix problem
            img_logo_url = os.path.join('images', 'logo_' + motif_ids[i] + '.png')
            # create motif enrichment profile
            img_profile = os.path.join(imgs_directory, 'profile_' + motif_ids[i] + '.png')
            motif_profile_target = motifs_profiles_in_sequences[motif_ids[i]] / N_seq_p
            motif_profile_bg = motifs_profiles_in_bg[motif_ids[i]] / N_seq_n
            # print motif_profile_target.shape, motif_profile_bg.shape
            generate_motif_profile(motif_profile_target, motif_profile_bg, motif_ids[i], img_profile,
                                   smooth_size=smooth_size, window_size=window_length)
            img_profile_url = os.path.join('images', 'profile_' + motif_ids[i] + '.png')
            # create regions
            info('Extracting regions with:' + motif_ids[i])
            regions = os.path.join(motif_regions_directory, motif_ids[i] + '_motif_region_in_target.bed')
            with open(regions, 'w+') as outfile:
                outfile.write('Chromosome\tStart\tEnd\tMotif hits inside region\tNumber of hits\n')
                for c, locations in motif_coords_in_seqs_with_motif[motif_ids[i]].items():
                    outfile.write('\t'.join([c.chr_id, str(c.bpstart), str(c.bpend),
                                             ';'.join(['-'.join(map(str, map(int, l))) for l in locations]),
                                             str(len(locations))]) + '\n')
            regions_url = os.path.join('motifs_regions', motif_ids[i] + '_motif_region_in_target.bed')
            # map closest downstream genes
            genes_url = None
            if use_gene_annotations:
                info('Mapping regions with:%s to the closest genes' % motif_ids[i])
                peak_annotator_path = os.path.join(determine_path('extra/'), 'PeakAnnotator.jar')
                if gene_ids_to_names_filename:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -s %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, gene_ids_to_names_filename, genes_list_directory),
                            shell=True)
                else:
                    sb.call('java -jar ' + peak_annotator_path + ' -u TSS -p %s -a %s -o %s >/dev/null 2>&1'
                            % (regions, gene_annotations_filename, genes_list_directory), shell=True)
                genes_url = os.path.join('genes_lists', motif_ids[i] + '_motif_region_in_target.tss.bed')
            motifs_dump.append({'id': motif_ids[i], 'name': motif_names[i], 'support_p': support_p[i] * 100,
                                'support_n': support_n[i] * 100, 'ratio': motif_ratios[i], 'rank': float(rankings[i]),
                                'pvalue': fisher_p_values[i], 'qvalue': qvalues[i],
                                'central_enrichment': central_enrichment[i],
                                'img_logo': img_logo_url, 'img_profile': img_profile_url, 'regions': regions_url,
                                'genes': genes_url, 'idx_motif': motif_idxs[i]})
    outfile = codecs.open(os.path.join(output_directory, "Haystack_report.html"), "w", "utf-8")
    outfile.write(template.render(motifs_dump=motifs_dump, bed_target_filename=bed_target_filename,
                                  bed_bg_filename=bed_bg_filename, N_TARGET=N_seq_p, N_BG=N_seq_n,
                                  meme_motifs_filename=meme_motifs_filename, COMMAND_USED=COMMAND_USED,
                                  use_gene_annotations=use_gene_annotations))
    outfile.close()
    if dump:
        info('Saving all the intermediate data on: %s ...' % output_directory)
        dump_directory = os.path.join(output_directory, 'dump')
        if not os.path.exists(dump_directory):
            os.makedirs(dump_directory)
        np.save(os.path.join(dump_directory, 'matrix_' + target_name), positive_matrix)
        np.save(os.path.join(dump_directory, 'matrix_BG_' + target_name), negative_matrix)
        cp.dump(motifs_dump, open(os.path.join(dump_directory, target_name + '_motif_dumps.pickle'), 'w'))
        # cp.dump( motifs_profiles_in_sequences,open( os.path.join(dump_directory,target_name+'_profiles.pickle'),'w'))
        # cp.dump( motifs_profiles_in_bg,open( os.path.join(dump_directory,bg_name+'_profiles.pickle'),'w'))
        cp.dump(idxs_seqs_with_motif, open(os.path.join(dump_directory, target_name + '_motif_seqs_idxs.pickle'), 'w'))
        cp.dump(idxs_seqs_with_motif_bg, open(os.path.join(dump_directory, bg_name + '_motif_seqs_idxs.pickle'), 'w'))
        cp.dump(motif_coords_in_seqs_with_motif,
                open(os.path.join(dump_directory, target_name + '_motif_coords_in_seqs_with_motif.pickle'), 'w'))
        Coordinate.coordinates_to_bed(target_coords,
                                      os.path.join(dump_directory, 'Target_coordinates_selected_on_' + target_name + '.bed'),
                                      minimal_format=False)
        Coordinate.coordinates_to_bed(bg_coords,
                                      os.path.join(dump_directory, 'BG_coordinates_selected_on_' + bg_name + '.bed'),
                                      minimal_format=True)
    # info('Motif analysis for Sample %s completed' % name)
    info('Motif analysis completed! Ciao!')

if __name__ == '__main__':
    main()
    sys.exit(0)