From 4056411fdaa99d4b2d91da5dacc3fc54df0ad4f8 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Sun, 20 Jan 2019 21:24:59 +0100 Subject: [PATCH] adding R skripts and modification in find_exons.py and visualize.py --- concatenate_rna_info.R | 72 +++++++++++++++++++ create_groups.R | 70 ++++++++++++++++++ find_exons.py | 2 +- find_exons.yaml | 12 +++- using_rnaseq.R | 92 ++++++++++++++++++++++++ visualize.py | 156 +++++++++++++++++++++++++++++------------ 6 files changed, 356 insertions(+), 48 deletions(-) create mode 100644 concatenate_rna_info.R create mode 100644 create_groups.R create mode 100644 using_rnaseq.R diff --git a/concatenate_rna_info.R b/concatenate_rna_info.R new file mode 100644 index 0000000..373bbe0 --- /dev/null +++ b/concatenate_rna_info.R @@ -0,0 +1,72 @@ +library(data.table) + +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-b", "--rna_input"), default = NULL, help = "Input TSV-file with RNA-seq information", metavar = "character"), + make_option(opt_str = c("-t", "--treshold"), default = 10, help = "Input the threshold to cut the baseMeanB", type = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "This script creates a TSV-file with information from the RNA-seq with EnsemblID and B-baseMean", + epilogue = "Author: Anastasiia Petrova ") + +opt <- parse_args(opt_parser) + +test_fun <- function(tobias_input, sample_size, rna_input, threshold){ + #message(input) + + #read the input file as data table + rna_seq_file <- rna_input + rna_seq <- fread(rna_seq_file, header = TRUE, sep = "\t", fill = TRUE) + + subset_rna_seq <- rna_seq[rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna` > threshold] + + #sort the table by the column baseMean descending + rna_seq_sorted <- subset_rna_seq[order(-subset_rna_seq$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`), ] + + + ens_ids <- unlist(rna_seq_sorted$`Ensembl gene id`) + base_Means <- unlist(rna_seq_sorted$`mdux-GFPneg-rna_vs_mdux-GFPpos-rna baseMeanB mdux-GFPpos-rna`) + + output <- cbind(ens_ids, base_Means) + + file_test <- file("test.txt") + write.table(output, file = file_test, row.names = FALSE, col.names = FALSE, quote = FALSE) + close(file_test) + + + #tobias_results_file <- "./bindetect_results.txt" + #sample_size = 10 + #tobias_results_file <- tobias_input + #tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE) + + #sort the table by the column mDuxNeg_mDuxPos_change + #tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ] + + #take top 10 from the tobias results and save only the gene names + #top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name + #c_top_10 <- unlist(top_10, use.names = FALSE) + + #concatenate the Jaspar ID + #c_top_10 <- gsub("_.*", "", c_top_10) + + #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice + #random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name + #c_random_10 <- unlist(random_10, use.names = FALSE) + + #concatenate the Jaspar ID + #c_random_10 <- gsub("_.*", "", c_random_10) + + + #genes_exons_correlation <- "./genes_exons_correlation.txt" + + #genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t") + + #genes <- genes_exons_table$gene_name + #genes_ids <- unlist(genes_exons_table$gene_id) +} + +#delete the help message from the parameter +params <- opt[-length(opt)] +do.call(test_fun, args = params) \ No newline at end of file diff --git a/create_groups.R b/create_groups.R new file mode 100644 index 0000000..af21990 --- /dev/null +++ b/create_groups.R @@ -0,0 +1,70 @@ +library(data.table) + +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file from TOBIAS", + metavar = "character"), + make_option(opt_str = c("-s", "--sample_size"), default = 10, help = "Input the size of the sample to test", type = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "This script creates files for visualize.py containing two groups to compare. The first group is the top 10 + best genes from the input file, and the other group is a random sampe of genes from the input file.", + epilogue = "Author: Anastasiia Petrova ") + +opt <- parse_args(opt_parser) + +test_fun <- function(input, sample_size){ + #message(input) + #message(sample_size) + + #read the input file as data table + #tobias_results_file <- "./bindetect_results.txt" + tobias_results_file <- input + tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE) + + #sort the table by the column mDuxNeg_mDuxPos_change + tobias_results_sorted <- tobias_results[order(-tobias_results$mDuxNeg_mDuxPos_change), ] + + #take top 10 from the tobias results and save only the gene names + top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name + c_top_10 <- unlist(top_10, use.names = FALSE) + + #concatenate the Jaspar ID + c_top_10 <- gsub("_.*", "", c_top_10) + + #make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice + random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name + c_random_10 <- unlist(random_10, use.names = FALSE) + + #concatenate the Jaspar ID + c_random_10 <- gsub("_.*", "", c_random_10) + + #write the top_10 and the random_10 to the txt files + file_top_10 <- file("top_group.txt") + writeLines(c_top_10, con = file_top_10, sep = "\n") + close(file_top_10) + cat("The best ", sample_size, " samples are saved to the file ./top_group.txt \n") + + file_random_10 <- file("random_group.txt") + writeLines(c_random_10, con = file_random_10, sep = "\n") + close(file_random_10) + cat("The random ", sample_size, " samples are saved to the file ./random_group.txt \n") + + #make the group of bottom genes + tobias_results_sorted_a <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ] + + bottom_10 <- tobias_results_sorted_a[1:sample_size, ]$TF_name + c_bottom_10 <- unlist(bottom_10, use.names = FALSE) + c_bottom_10 <- gsub("_.*", "", c_bottom_10) + + file_bottom_10 <- file("bottom_group.txt") + writeLines(c_bottom_10, con = file_bottom_10, sep = "\n") + close(file_bottom_10) + cat("The bottom ", sample_size, " samples are saved to the file ./bottom_group.txt \n") +} + +#delete the help message from the parameter +params <- opt[-length(opt)] +do.call(test_fun, args = params) \ No newline at end of file diff --git a/find_exons.py b/find_exons.py index 5487a29..853ff5c 100644 --- a/find_exons.py +++ b/find_exons.py @@ -25,7 +25,7 @@ def parse_args(): parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' - This script produces plots and a tsv-file to show the number of exons for the list of genes of interest. + This script produces a tsv-file to show the number of exons for the list of genes of interest. '''), epilog='That is what you need to make this script work for you. Enjoy it') required_arguments = parser.add_argument_group('required arguments') diff --git a/find_exons.yaml b/find_exons.yaml index be3c8ea..ffd9b14 100644 --- a/find_exons.yaml +++ b/find_exons.yaml @@ -5,9 +5,17 @@ channels: - conda-forge dependencies: - - python + - python >=3 - snakemake + - r-seqinr + - r-base>=3.5.1 + - r-data.table + - r-pbapply + - r-igraph + - r-stringi + - r-stringr + - r-optparse - biopython - matplotlib - scipy - - statsmodels + - statsmodels \ No newline at end of file diff --git a/using_rnaseq.R b/using_rnaseq.R new file mode 100644 index 0000000..04a4eeb --- /dev/null +++ b/using_rnaseq.R @@ -0,0 +1,92 @@ +library(data.table) +library(stringr) +library(stringi) + +genes_exons_correlation <- "./genes_exons_correlation.txt" + +genes_exons_table <- fread(genes_exons_correlation, header = TRUE, sep = "\t") +View(genes_exons_table) + +rnaseq_file <- "./bigmatrix_norm.txt" + +#use read.delim as read.table produces warning message EOF within quoted string +rnaseq_table <- fread(rnaseq_file, header = TRUE, sep = "\t", fill = TRUE) +View(rnaseq_table) + +tobias_results_file <- "./bindetect_results.txt" +tobias_results <- fread(tobias_results_file, header = TRUE, sep = "\t", fill = TRUE) +View(tobias_results) + +#sort the table by the column mDuxNeg_mDuxPos_change +tobias_results_sorted <- tobias_results[order(tobias_results$mDuxNeg_mDuxPos_change), ] +View(tobias_results_sorted) + +#testing around +i <- grep("CEBPD_MA0836.1", tobias_results_sorted$TF_name) #the number of the row +tobias_results_sorted[i,] #print this row + +sample_size = 10 +#take top 10 from the tobias results and save only the gene names +top_10 <- tobias_results_sorted[1:sample_size, ]$TF_name +View(top_10) +c_top_10 <- unlist(top_10, use.names = FALSE) + +#make a subset of the same length as top_10, with random samples, replace = False excludes using one gene twice +random_10 <- tobias_results_sorted[sample(sample_size + 1:nrow(tobias_results_sorted), sample_size, replace=FALSE), ]$TF_name +View(random_10) +c_random_10 <- unlist(random_10, use.names = FALSE) + +#write the top_10 and the random_10 to the txt files +file_top_10 <- file("top_10.txt") +writeLines(c_top_10, con = file_top_10, sep = "\n") +close(file_top_10) + +file_random_10 <- file("random_10.txt") +writeLines(c_random_10, con = file_random_10, sep = "\n") +close(file_random_10) + + + +#apply(genes_exons_table, 1, function(r) any(r %in% c_top_10)) #bebe + +#look_for <- grepl("Cphx", genes_exons_table$gene_name) +#look_for + +#genes_exons_table[look_for] + +#look_for2 <- stri_subset(genes_exons_table$gene_name, regex = c_top_10) + +#look_for3 <- apply(outer(genes_exons_table$gene_name, c_top_10, stri_detect), 1, all) +#genes_exons_table[look_for3] + +#stri_detect(str = genes_exons_table$gene_name, regex = c_top_10) + +small_list <- lapply(c_top_10, function (x){ + as.data.table(stri_detect(str = genes_exons_table$gene_name, regex = x)) +}) #liste von datatables + +?do.call +new_table <- do.call(cbind, small_list) #containing rows with true/false + +new_vector <- apply(new_table, MARGIN = 1, any) #margin 1 über die zeilen + +any(new_vector) + + + +small_list2 <- lapply(c_top_10, function (y){ + as.data.table(grepl(x = genes_exons_table$gene_name, pattern = y, ignore.case = TRUE, perl = TRUE)) +}) + +new_table2 <- do.call(cbind, small_list2) #containing rows with true/false + +new_vector2 <- apply(new_table2, MARGIN = 1, any) #margin 1 über die zeilen + +any(new_vector2) #false +any(grepl(x = genes_exons_table$gene_name, pattern = "CPHX", ignore.case = TRUE, perl = TRUE)) #true + +#------------------ + + + + diff --git a/visualize.py b/visualize.py index 32ddc02..5e89eff 100644 --- a/visualize.py +++ b/visualize.py @@ -37,18 +37,20 @@ def parse_args(): required_arguments = parser.add_argument_group('required arguments') required_arguments.add_argument('--correlation', help='a .txt file with correlation between genes length and the length of exons', required=True) - #required_arguments.add_argument('--genes_of_interest', nargs='*', dest='genes_of_interest', help='a .txt file or a list with genes one is interested in', required=True) - required_arguments.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column') - #all other arguments are optional parser.add_argument('--variable', type=int, default=1, help='enter how many chars in the name of a gene can be different from the name of gene you are searching for') + parser.add_argument('--groups', nargs='*', dest='groups', help='enter one or more files containing groups of genes you are interested in, the genes should be separated with tabs or printed in one column') + parser.add_argument('--rna_seq', help='enter the file with rna seq data') + parser.add_argument('--bindetect', help='enter the file with results from TOBIAS') + parser.add_argument('--group_size', type=int, default=10, help='if the file from TOBIAS was provided, enter the size of groups to create, by default 10') + parser.add_argument('--output', help='enter the name of the output file, by default ./boxplot', default='./boxplot') args = parser.parse_args() return args def find_directory(args): - return os.path.splitext(os.path.dirname(args.correlation))[0] + return os.path.splitext(os.path.dirname(args.output))[0] def get_name_from_path(full_path): return os.path.splitext(os.path.basename(full_path))[0] @@ -61,28 +63,60 @@ def check_existing_input_files(args): print('please make sure the .txt file with genes-exons correlation exists') sys.exit() + if args.rna_seq and not os.path.isfile(args.rna_seq): + print('please make sure the file with RNA-seq data exists') + sys.exit() + groups = [] - for group_file in args.groups: - if not os.path.isfile(group_file): - print('the path for the input file ' + str(group_file) + ' could not be opened') + if args.bindetect: + if not os.path.isfile(args.bindetect): + print('please make sure the file with information from TOBIAS exists') + sys.exit() + else: + if not args.groups: + print('please provide your own groups for visualisation or the file from TOBIAS to make the groups') sys.exit() else: - #group_name = "group_" + str(args.groups.index(group_file) + 1) - one_group = [] - with open(group_file) as opened_group_file: - for line in opened_group_file: - line_array = re.split(r'\t', line.rstrip('\n')) - for gene in line_array: - if ' ' in gene: - print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line') - sys.exit() - else: - one_group.append(gene) - opened_group_file.close() - groups.append(one_group) + for group_file in args.groups: + if not os.path.isfile(group_file): + print('the path for the input file ' + str(group_file) + ' could not be opened') + sys.exit() + else: + #group_name = "group_" + str(args.groups.index(group_file) + 1) + one_group = [] + with open(group_file) as opened_group_file: + for line in opened_group_file: + line_array = re.split(r'\t', line.rstrip('\n')) + for gene in line_array: + if ' ' in gene: + print('please make sure that genes in the file ' + str(group_file) + ' are separated with tabs or are written each on a new line') + sys.exit() + else: + one_group.append(gene) + opened_group_file.close() + groups.append(one_group) return groups +def compare_with_each(groups, groups_names): + if len(groups) >= 2: + start = 1 + for group in groups: + for i in range(start, len(groups)): + make_ttest(group, groups[i], groups_names[groups.index(group)], groups_names[i]) + start += 1 + else: + logger.info("There is nothing to compare!") + +def make_ttest(group1, group2, group1_name, group2_name): + stat_score, p_value = stats.ttest_ind(group1, group2) #, equal_var = False) + if p_value <= 0.05: + logger.info("The average of " + str(group1_name) + " is NOT EQUAL with " + str(group2_name)) + logger.info("Ttest p-value is " + str(p_value)) + else: + logger.info("The average of " + str(group1_name) + " is EQUAL with " + str(group2_name)) + logger.info("Ttest p-value is " + str(p_value)) + def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory, figure_name): groups_names = [] @@ -104,6 +138,10 @@ def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory data.append(group_data) + logger.info("Making a statistical test") + + compare_with_each(data, groups_names) + fig, ax = plt.subplots(figsize=(10,10)) bp = ax.boxplot(data, notch=False, sym='o', vert=True, whis=1.5, patch_artist=True) @@ -122,6 +160,7 @@ def make_box_plot(groups_files_names, groups_genes_of_interest, output_directory ax.set_xticklabels(groups_names, rotation=45, fontsize=8) fig.savefig(os.path.join(output_directory, figure_name)) + logger.info("saving the plot to the file " + str(os.path.join(output_directory, figure_name))) plt.show() @@ -146,9 +185,9 @@ def make_plot(x_array, y_array, gene_names, output_directory, figure_name, color #plt.show() -def read_correlation(correlation_file, groups, variable): - #read the text file and make a dictionary containing genes with array of their scores +def read_correlation(correlation_file, groups, variable, rna_dict): + #read the text file and make a dictionary containing genes with array of their scores genes_to_show_groups = [] for genes_of_interest in groups: @@ -162,29 +201,31 @@ def read_correlation(correlation_file, groups, variable): #first look for the genes in the gtf_genes file genes_family = subprocess.getoutput("grep -i " + gene_name + " " + correlation_file) - print(genes_family) - + if len(genes_family) != 0: logger.info("working with the gene " + gene_name) - #the score is in [3], the tsl is in [2] + #the score is in [3], the tsl is in [2], the ensemle_gene_id is in [7] for genes_line in re.split(r'\n', genes_family): genes_array = re.split(r'\t', genes_line) - - check_gene_name = genes_array[0].lower() - - print(genes_array) - if check_gene_name == orig_gene_name: - #this is a perfect match <3 - genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array - elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name - if check_gene_name in genes_to_show.keys(): - genes_to_show[check_gene_name].append(float(genes_array[3])) - else: #this gene is not yet saved - #make a new place in the dictionary for this gene - genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, []) - genes_to_show[check_gene_name].append(float(genes_array[3])) + + if genes_array[7] in rna_dict.keys(): + #if this geneis is saved in the rnaseq data + check_gene_name = genes_array[0].lower() + + if check_gene_name == orig_gene_name: + #this is a perfect match <3 + genes_to_show[check_gene_name].append(float(genes_array[3])) #append score to the array + elif check_gene_name.startswith(orig_gene_name) and len(genes_array[0]) <= len(gene_name) + variable: #the found gene is by default 1 char longer than the gene name + if check_gene_name in genes_to_show.keys(): + genes_to_show[check_gene_name].append(float(genes_array[3])) + else: #this gene is not yet saved + #make a new place in the dictionary for this gene + genes_to_show[check_gene_name] = genes_to_show.get(check_gene_name, []) + genes_to_show[check_gene_name].append(float(genes_array[3])) + else: + logger.info(gene_name + " has no corresponding information from the RNA-seq data and will not be considered while making a plot") else: logger.info("no similar genes to " + gene_name + " were found") @@ -193,9 +234,29 @@ def read_correlation(correlation_file, groups, variable): genes_to_show = dict([(k, v) for k, v in genes_to_show.items() if len(v) > 0]) genes_to_show_groups.append(genes_to_show) - print(genes_to_show_groups) return genes_to_show_groups +def read_bindetect(output_directory, bindetect, group_size): + #first sort the file after the column mDuxNeg_mDuxPos_change + #use the unix sort sort -k5nr + logger.info("sorting the bindetect file after the column mDuxNeg_mDuxPos_change") + sorted_bindetect = os.path.join(output_directory, "sorted_" + os.path.splitext(os.path.basename(bindetect))[0] + ".txt") + os.system("sort -k5nr " + bindetect + " > " + sorted_bindetect) + logger.info("sorted file is " + str(sorted_bindetect)) + +def read_rna_seq(rna_seq): + rna_dict = {} + + with open(rna_seq) as read_file: + for line in read_file: + line_array = re.split(r'\s', line.rstrip('\n')) + #ensid is in the first column, the basemean is in the second + rna_dict[line_array[0]] = line_array[1] + + read_file.close() + + return rna_dict + def main(): start = time.time() @@ -203,12 +264,11 @@ def main(): args = parse_args() groups = check_existing_input_files(args) + output_directory = find_directory(args) #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user #check_directory(args.output_directory) - output_directory = find_directory(args) - fh = logging.FileHandler(os.path.join(output_directory, "visualize.log")) fh.setLevel(logging.INFO) fh.setFormatter(formatter) @@ -219,13 +279,19 @@ def main(): ch.setFormatter(formatter) logger.addHandler(ch) - genes_to_show_groups = read_correlation(args.correlation, groups, args.variable) + rna_dict = {} + if args.rna_seq: + rna_dict = read_rna_seq(args.rna_seq) + + #read_bindetect(output_directory, args.bindetect, args.group_size) + + genes_to_show_groups = read_correlation(args.correlation, groups, args.variable, rna_dict) logger.info("making a plot") - make_box_plot(args.groups, genes_to_show_groups, output_directory, "boxplot.png") + make_box_plot(args.groups, genes_to_show_groups, output_directory, get_name_from_path(args.output)) - logger.info("find_exons needed %s seconds to generate the output" % (round(time.time() - start, 2))) + logger.info("visualize.py needed %s seconds to generate the output" % (round(time.time() - start, 2))) for handler in logger.handlers: handler.close()