Skip to content

Commit

Permalink
working with groups of genes the user is interested in
Browse files Browse the repository at this point in the history
  • Loading branch information
anastasiia committed Dec 11, 2018
1 parent 9bc6edd commit bf0d295
Showing 1 changed file with 78 additions and 63 deletions.
141 changes: 78 additions & 63 deletions visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

#from gtfparse import read_gtf #makes the logger to look different o_o

logger = logging.getLogger('visualise')
logger = logging.getLogger('visualize')
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M")
Expand All @@ -37,7 +37,8 @@ def parse_args():

required_arguments = parser.add_argument_group('required arguments')
required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True)
required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)
#required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True)
required_arguments.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column')

#all other arguments are optional
parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for')
Expand All @@ -46,6 +47,12 @@ def parse_args():

return args

def find_directory(args):
return os.path.splitext(os.path.dirname(args.correlation))[0]

def get_name_from_path(full_path):
return os.path.splitext(os.path.basename(full_path))[0]

def check_existing_input_files(args):

print('checking the input file')
Expand All @@ -54,57 +61,63 @@ def check_existing_input_files(args):
print('please make sure the .txt file with genes-exons correlation exists')
sys.exit()

if len(args.genes_of_interest) == 1:
if not os.path.isfile(args.genes_of_interest[0]):
print('working with one gene of interest')
genes_of_interest = args.genes_of_interest
groups = []
for group_file in args.groups:
if not os.path.isfile(group_file):
print('the path for the input file ' + str(group_file) + ' could not be opened')
sys.exit()
else:
print('working with file of genes')
#there is a file with list of genes. Extract them and return in an array
genes = []
with open(args.genes_of_interest[0]) as genes_file:
for line in genes_file:
#group_name = "group_" + str(args.groups.index(group_file) + 1)
one_group = []
with open(group_file) as opened_group_file:
for line in opened_group_file:
line_array = re.split(r'\t', line.rstrip('\n'))
for gene in line_array:
if ' ' in gene:
print('please make sure that genes in the file with genes of interest are separated with tabs or are written each on a new line')
print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line')
sys.exit()
else:
genes.append(gene)
one_group.append(gene)
opened_group_file.close()
groups.append(one_group)

genes_file.close()
genes_of_interest = genes
return groups

else: #there is a list of genes of interest
print('working with a list of genes')
genes_of_interest = args.genes_of_interest
def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name):

return genes_of_interest
groups_names = []

def make_box_plot(genes_of_interest, output_directory, figure_name, color_name):
for group_file in groups_files_names:
groups_names.append(get_name_from_path(group_file))

groups_names = []
data = []

#first sort the genes_of_interest dictionary
sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))
for genes_of_interest in groups_genes_of_interest:

group_data = []

#first sort the genes_of_interest dictionary
sorted_genes_of_interest = sorted(genes_of_interest, key = lambda x: np.mean(genes_of_interest[x]))

for gene in sorted_genes_of_interest:
print(genes_of_interest[gene])
group_data.extend(genes_of_interest[gene])

for gene in sorted_genes_of_interest:
groups_names.append(gene)
data.append(genes_of_interest[gene])
data.append(group_data)

fig, ax = plt.subplots(figsize=(10,10))

bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True)

color_name = 'navy'
for patch in bp['boxes']:
patch.set_facecolor(color_name)

# Add a horizontal grid to the plot, but make it very light in color
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

ax.set_title('The visualization of correlation between gene and exons length')
ax.set_xlabel('Genes')
ax.set_xlabel('Genes groups')
ax.set_ylabel('Percentage of exons len')

ax.set_xticklabels(groups_names, rotation=45, fontsize=8)
Expand Down Expand Up @@ -134,54 +147,60 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color

#plt.show()

def read_correlation(correlation_file, genes_of_interest, variable):
def read_correlation(correlation_file, groups, variable):
#read the text file and make a dictionary containing genes with array of their scores

genes_to_show = {}
genes_to_show_groups = []

for gene_name in genes_of_interest:
for genes_of_interest in groups:
genes_to_show = {}

genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene

#first look for the genes in the gtf_genes file
genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)

if len(genes_family) != 0:
logger.info("working with the gene " + gene_name)

for genes_line in re.split(r'\n', genes_family):
genes_array = re.split(r'\t', genes_line)
if genes_array[0] == gene_name:
#this is a perfect match <3
genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array
elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
if genes_array[0] in genes_to_show.keys():
genes_to_show[genes_array[0]].append(float(genes_array[2]))
else: #this gene is not yet saved
#make a new place in the dictionary for this gene
genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], [])
genes_to_show[genes_array[0]].append(float(genes_array[2]))
for gene_name in genes_of_interest:

else:
logger.info("no similar genes to " + gene_name + " were found")
genes_to_show[gene_name] = genes_to_show.get(gene_name, []) #get a place in the dictionary for this gene

#first look for the genes in the gtf_genes file
genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file)

if len(genes_family) != 0:
logger.info("working with the gene " + gene_name)

for genes_line in re.split(r'\n', genes_family):
genes_array = re.split(r'\t', genes_line)
if genes_array[0] == gene_name:
#this is a perfect match <3
genes_to_show[gene_name].append(float(genes_array[2])) #append score to the array
elif genes_array[0].startswith(gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name
if genes_array[0] in genes_to_show.keys():
genes_to_show[genes_array[0]].append(float(genes_array[2]))
else: #this gene is not yet saved
#make a new place in the dictionary for this gene
genes_to_show[genes_array[0]] = genes_to_show.get(genes_array[0], [])
genes_to_show[genes_array[0]].append(float(genes_array[2]))

#delete genes without information from the dictionary
genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
else:
logger.info("no similar genes to " + gene_name + " were found")

return genes_to_show
#delete genes without information from the dictionary
genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0])
genes_to_show_groups.append(genes_to_show)

return genes_to_show_groups

def main():

start = time.time()

args = parse_args()

genes_of_interest = check_existing_input_files(args)
groups = check_existing_input_files(args)

#check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user
#check_directory(args.output_directory)

fh = logging.FileHandler("visualize_log.txt")
output_directory = find_directory(args)

fh = logging.FileHandler(os.path.join(output_directory, "visualize.log"))
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
Expand All @@ -191,15 +210,11 @@ def main():
ch.setFormatter(formatter)
logger.addHandler(ch)

genes_to_show = read_correlation(args.correlation, genes_of_interest, args.variable)
genes_to_show_groups = read_correlation(args.correlation, groups, args.variable)

logger.info("making a plot")
#make_plot(gene_lengths, exons_lengths, gene_names, "./output", "test1.png", 'gold')

#group1 = ["OTX1", "OTX2", "PITX1", "GSC2", "NFYA", "NFYB", "DUX4"] #important genes in early development
#group2 = ["MYCL", "LTBP3", "ERGIC3", "GCM2", "LHX8", "HIF1A", "FLI1"] #not so significant for early dev

make_box_plot(genes_to_show, "./output", "test2.png", 'navy')
make_box_plot(args.groups, genes_to_show_groups, output_directory, "boxplot.png")

logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2)))

Expand Down

0 comments on commit bf0d295

Please sign in to comment.