diff --git a/filter_transcripts.py b/filter_transcripts.py
new file mode 100644
--- /dev/null
+++ b/filter_transcripts.py
@@ -0,0 +1,59 @@
+"""Keep only the transcript reports that contain InterProScan hits.
+
+The input is a concatenation of per-transcript reports. Each report starts
+with a ">" header line and has an InterProScan section (opened by a
+"#####InterProScan" marker) that ends at the "#####BrewerySS8 Analysis"
+marker. Reports whose InterProScan section is empty are dropped.
+"""
+
+INTERPROSCAN_MARKER = "#####InterProScan"
+BREWERYSS8_MARKER = "#####BrewerySS8 Analysis"
+
+
+def filter_reports(lines):
+    """Return the stripped lines of every report with an InterProScan hit.
+
+    Lines that precede the first ">" header belong to no report and are
+    dropped. A report that has a BrewerySS8 marker but no line between it
+    and the InterProScan marker is dropped; a report without a BrewerySS8
+    marker is kept unchanged.
+    """
+    kept = []
+    report = []
+    keep_report = False  # stays False until the first header is seen
+    in_interproscan = False
+    hit_lines = 0
+
+    for line in lines:
+        if line.startswith(">"):
+            if keep_report:
+                kept.extend(report)
+            report = []
+            keep_report = True
+            # Reset per-report state so a malformed report cannot leak its
+            # section flag or hit count into the next one.
+            in_interproscan = False
+            hit_lines = 0
+        elif line.startswith(INTERPROSCAN_MARKER):
+            in_interproscan = True
+            hit_lines = 0
+        elif line.startswith(BREWERYSS8_MARKER):
+            if hit_lines < 1:
+                keep_report = False
+            in_interproscan = False
+        elif in_interproscan:
+            hit_lines += 1
+
+        report.append(line.strip())
+
+    # Flush the last report: no later header exists to trigger the flush.
+    if keep_report:
+        kept.extend(report)
+    return kept
+
+
+with open(snakemake.input[0], "r") as input_file:
+    filtered = filter_reports(input_file)
+
+with open(snakemake.output[0], "w") as output_file:
+    output_file.write("\n".join(filtered))
diff --git a/filter_utr.py b/filter_utr.py
new file mode 100644
--- /dev/null
+++ b/filter_utr.py
@@ -0,0 +1,53 @@
+"""Strip UTR sequences from transcripts and write the result as FASTA."""
+
+from Bio import SeqIO
+
+
+def load_utrs(path):
+    """Map every ">" header in *path* to the stripped lines that follow it.
+
+    Lines found before the first header have no owner and are skipped.
+    """
+    utrs_by_id = {}
+    current = None
+    with open(path, "r") as handle:
+        for line in handle:
+            if line.startswith(">"):
+                # A repeated header keeps appending to the existing list.
+                current = utrs_by_id.setdefault(line[1:].strip(), [])
+            elif current is not None:
+                current.append(line.strip())
+    return utrs_by_id
+
+
+def strip_utrs(seq, utrs):
+    """Return *seq* with the first match of each UTR cut out, in order."""
+    for utr in utrs:
+        pos = seq.find(utr)
+        if pos != -1:
+            seq = seq[:pos] + seq[pos + len(utr):]
+    return seq
+
+
+trans_utr = load_utrs(snakemake.input[0])
+
+# Transcripts without an entry of their own are stripped with the UTR list
+# that has the most lines (the first such list on ties). With no usable
+# list the sequence is written unchanged instead of as an empty record.
+fallback_utrs = max(trans_utr.values(), key=len, default=[])
+
+transcripts = SeqIO.index(snakemake.input[1], "fasta")
+output = []
+try:
+    for key in transcripts:
+        record = transcripts[key]
+        # The transcript id is the last "|"-separated field of the record id.
+        transcript_id = str(record.id).split("|")[-1].strip()
+        utrs = trans_utr.get(transcript_id, fallback_utrs)
+        output.append(">" + str(record.id))
+        output.append(strip_utrs(str(record.seq), utrs))
+finally:
+    transcripts.close()
+
+with open(snakemake.output[0], "w") as output_file:
+    output_file.write("\n".join(output))
diff --git a/iupred2a_analysis.py b/iupred2a_analysis.py
new file mode 100644
--- /dev/null
+++ b/iupred2a_analysis.py
@@ -0,0 +1,36 @@
+"""Run the IUPred2A program (second input) on the first FASTA record.
+
+Writes the bare sequence to snakemake.output[0] and appends the record
+header followed by the program's standard output to snakemake.output[1].
+"""
+
+import os
+import subprocess
+
+with open(snakemake.input[0], "r") as transcripts_file:
+    transcripts = [line.strip() for line in transcripts_file]
+
+if len(transcripts) < 2:
+    raise ValueError(
+        "%s must contain a header line and a sequence line" % snakemake.input[0]
+    )
+
+transcript_name = transcripts[0]
+transcript_sequence = transcripts[1]
+
+with open(snakemake.output[0], "w") as sequence_file:
+    sequence_file.write(transcript_sequence)
+
+# Append mode keeps any content the report file already has.
+with open(snakemake.output[1], "a") as report_file:
+    report_file.write(transcript_name + "\n")
+    # Flush before the child process writes through the same descriptor,
+    # otherwise the header could end up after the program output.
+    report_file.flush()
+    # An argument list (no shell) keeps quotes, "$" or spaces in the name
+    # and paths from being interpreted as shell syntax.
+    # NOTE(review): the exit status is not checked; consider check=True.
+    subprocess.run(
+        [snakemake.input[1], "-a", snakemake.output[0], "long"],
+        stdout=report_file,
+    )
diff --git a/transcript_analysis.py b/transcript_analysis.py
new file mode 100644
--- /dev/null
+++ b/transcript_analysis.py
@@ -0,0 +1,46 @@
+"""Merge the per-tool results of one transcript into a single report."""
+
+# Leading lines skipped in the IUPred2A report (the transcript name on the
+# first line is read separately) and in the InterProScan report.
+IUPRED2A_SKIPPED_LINES = 7
+INTERPROSCAN_SKIPPED_LINES = 5
+
+
+def read_lines(path):
+    """Return all lines of *path*, line endings included."""
+    with open(path, "r") as handle:
+        return handle.readlines()
+
+
+idr_positions = read_lines(snakemake.input[0])
+regions = read_lines(snakemake.input[1])
+secondarystructures8 = read_lines(snakemake.input[2])
+sites = read_lines(snakemake.input[3])
+
+transcript_name = idr_positions[0].strip()
+
+output = [transcript_name]
+
+output.append("#####IUPred2A Analysis")
+output.extend(line.strip() for line in idr_positions[IUPRED2A_SKIPPED_LINES:])
+
+output.append("#####InterProScan Analysis")
+for region in regions[INTERPROSCAN_SKIPPED_LINES:]:
+    # The region list ends at the first "#" line.
+    if region.startswith("#"):
+        break
+    output.append(region.strip())
+
+output.append("#####BrewerySS8 Analysis")
+output.extend(line.strip() for line in secondarystructures8)
+
+output.append("#####PrositeScan Analysis")
+# ">" is rewritten to "#", presumably so that Prosite lines are not
+# mistaken for transcript header lines, which also start with ">".
+output.extend(line.strip().replace(">", "#") for line in sites)
+
+# The trailing empty item makes the joined text end with a newline.
+output.append("")
+
+with open(snakemake.output[0], "w") as output_file:
+    output_file.write("\n".join(output))