Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python
# coding: utf-8
# # Creation of files for submission
# This Script creates all files necessary for submission of sub challenge 1 (transcript calling) and sub challenge 2 (isoform quantification)
# The primary results are a gtf for sc1 and a expression table for sc2
#
import sys
import os
# Check the arguments
arguments = sys.argv[1:]
assert len(arguments)==3, f'Usage:{sys.argv[0]} <sample> <protocol> <submission>'
sample=arguments[0]
protocol=arguments[1]
submission_id=arguments[2]
assert sample in ['human_H1_mix', 'human_WTC11','mouse_ES','human_simulation', 'mouse_simulation'], f'unknown sample {sample}'
assert protocol in ['PacBio_CapTrap','PacBio_cDNA'], f'unknown protocol {protocol}'
from isotools.transcriptome import Transcriptome
import isotools
import numpy as np
import pandas as pd
from glob import glob
import os
from scipy.stats import lognorm
from pathlib import Path
import gzip
import json
print (f'This is isotools version {isotools.__version__}')
import logging
logger=logging.getLogger('isotools')
logger.setLevel(logging.INFO)
# In[2]:
path='/project/42/pacbio/LRGASP'
#sample='mouse_ES'
#protocol='PacBio_CapTrap'
#submission_id='IsoTools_final'
out_fn=f'{path}/submission/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}'
ch1_path=f'{path}/submission/iso_detect_ref_{protocol}_{submission_id}/{sample}_{protocol}_long'
ch2_path=f'{path}/submission/iso_quant_{protocol}_{submission_id}/{sample}_{protocol}_long'
selection_strategy='FSM or NIC: tr coverage >2% of gene coverage and tr coverage >= 2 reads NOVEL_SPLICING or NOVEL_GENE: tr coverage >5% of gene coverage and tr coverage >= 5 reads and less than 50% A content and direct repeat len<=6'
norm_description='length dependend normalization factor based on ratio of observed length distribution over reference transcripts'
bam_files={}
for fn in sorted(glob(f'{path}/isoseq/05-raw_align/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}*_trimmed.bam')):
infos=os.path.basename(fn).split('_')[:-1]
sn='_'.join(infos[:-1])
bam_files[sn]=fn
print(f'samples: {",".join(bam_files)}')
# ## Import of long reads:
# * read the aligned long reads
# * corrections:
# * fuzzy junctions: same shift on 5' and 3' wrt reference junction
# * chain chimeric alignments on same chromosome/ orientation
# * group reads to transcripts:
# * same splice junctions <-> same transcript
# * for mono-exons: at least 50% overlap
# * define TSS/PAS: mean of read start/end positions
# * categorize transcripts:
# * FSM: which reference transcript
# * other: how exactly different to what transcripts
#
# In[3]:
if sample[:5]=='human':
ref_fn=f'{path}/data/references/lrgasp_gencode_v38_sirvs.sorted'
genome_fn=f'{path}/data/references/lrgasp_grch38_sirvs.fasta'
else:
ref_fn=f'{path}/data/references/lrgasp_gencode_vM27_sirvs.sorted'
genome_fn=f'{path}/data/references/lrgasp_grcm39_sirvs.fasta'
try:
isoseq=Transcriptome(out_fn+'_isotools.pkl')
except FileNotFoundError:
try:
isoseq=Transcriptome.from_reference(ref_fn+'.isotools.pkl')
isoseq.collapse_immune_genes()
except FileNotFoundError:
isoseq=Transcriptome.from_reference(ref_fn+'.gtf.gz', infer_genes=True)#, chromosomes=spikein)
isoseq.save_reference(ref_fn+'.isotools.pkl')
isoseq.collapse_immune_genes()
#isoseq=Transcriptome('...')
for sa,bam in bam_files.items():
isoseq.add_sample_from_bam(bam, save_readnames=True,min_align_fraction=0.75, sample_name=sa, group=sa, cell_type=sample, protocol=protocol)#, add_chromosomes=False) #33+17 minutes
isoseq.add_qc_metrics(genome_fn) #about 20 min
isoseq.make_index()
isoseq.save(out_fn+'_isotools.pkl')
# In[5]:
gene_filter={'CHIMERIC': 'chimeric', 'EXPRESSED': 'transcripts', 'NOVEL_GENE': 'not reference', 'SPIKE_IN':f'chr[:4] in ["ERCC","SIRV"]'}
transcript_filter=isotools._transcriptome_filter.DEFAULT_TRANSCRIPT_FILTER
transcript_filter['FSM']='annotation[0]==0'
transcript_filter['RTTS']='bool(novel_splice_sites) and any(i%2==0 and i+1 in novel_splice_sites and direct_repeat_len[i%2]>=6 for i in novel_splice_sites)'
ref_transcript_filter=isotools._transcriptome_filter.DEFAULT_REF_TRANSCRIPT_FILTER
ref_transcript_filter['HIGH_SUPPORT']='transcript_support_level in ["1", "2"]'
isoseq.add_filter(gene_filter=gene_filter, transcript_filter=transcript_filter,ref_transcript_filter=ref_transcript_filter)
# In[6]:
# add a Tag "substantial" that the transcript has at least 2% of the total reads of the gene.
#This cannot be done in the common framework, hence add this placeholder and manually add it.
# note that add_filter would crash with this rule, as the gene cannot be accessed.
isoseq.infos['filter']['transcript_filter']['SUBSTANTIAL']=selection_strategy
for g,trnr,tr in isoseq.iter_transcripts():
cov=g.coverage[:,trnr].sum()
if g.is_annotated:
if 'novel_splice_sites' in tr:
if cov > 4 and cov>g.coverage.sum()*.05 and 'RTTS' not in tr['filter']:
tr['filter'].append('SUBSTANTIAL')
elif cov > 1 and cov > g.coverage.sum()*.02:
tr['filter'].append('SUBSTANTIAL')
elif cov > 4 and cov>g.coverage.sum()*.05 and tr['downstream_A_content']>.5 and 'RTTS' not in tr['filter']:
tr['filter'].append('SUBSTANTIAL')
# ## Subchallenge 1: Transcript gtf
# current filter: only substantially expressed - many issues are filtered already
#
# todo: Add more filter criteria
# * truncations
# * variable TSS/PAS?
# * within/between protocols/samples
# * in particular for transcript with deviance from annotation
# * internal priming
# * check odd pattern
# * issue with polyA trimming?
# * RTTS
# * check illumina support of novel junctions
# * alignment quality at junctions?
#
# Also I could add novel gene subcategory infos to the gtf (just to show off)
# In[7]:
#only one entry per challenge, e.g. one for iso_detect_ref and one for iso_quant
#with open(f'{path}/scripts/lrgasp-challenge-1-evaluation/utilities/entry_dummy.json') as f:
# entry = json.load(f)
Path(ch1_path).mkdir(parents=True, exist_ok=True)
try:
with open(f'{ch1_path}/../entry.json') as f:
entry = json.load(f)
except FileNotFoundError:
entry={'entry_id': f'iso_detect_ref_{protocol}_{submission_id}',
'challenge_id': 'iso_detect_ref',
'team_name': 'IsoTools',
'experiment_ids': [],
'data_category': 'long_only',
'samples': [],
'library_preps': [],
'platforms': [],
'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}
#sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
#lib_identifiers=[protocol[protocol.find('_')+1:]] if '_' in protocol else []
#platforms_identifiers= [protocol[:protocol.find('_')]] if '_' in protocol else [protocol]
sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
lib_identifiers=[protocol[protocol.find('_')+1:]]
platforms_identifiers= [protocol[:protocol.find('_')]]
entry['samples']=list(set(entry['samples']+[sample_identifer]))
entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))
with open(f'{ch1_path}/../entry.json', 'w') as json_file:
json.dump(entry, json_file)
# In[8]:
#prepare metadata
lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
lib_accession=['syn25683376']
if sample == 'mouse_simulation':
lib_accession=['syn25683381']
experiment={'experiment_id': f'{sample}_{protocol}_long',
'challenge_id': 'iso_detect_ref', #"iso_quant"
'description': selection_strategy,
'notes': 'processed with IsoTools',
'species': sample[:sample.find('_')],
'data_category': 'long_only',
'samples': [sample_identifer],
'library_preps': lib_identifiers,
'platforms': platforms_identifiers,
'libraries': lib_accession,
'extra_libraries': [],
'software': [{'name': 'IsoTools',
'description': 'Python module for the analysis of LRTS data',
'version': isotools.__version__,
'url': 'https://pypi.org/project/isotools/',
'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
with open(f'{ch1_path}/experiment.json', 'w') as json_file:
json.dump(experiment, json_file)
# In[9]:
# models.gtf.gz - GTF file with model annotations, compressed with gzip.
isoseq.write_gtf(ch1_path+'/models.gtf.gz', source=protocol, use_gene_name=False, include=['SUBSTANTIAL'], gzip=True)
# In[10]:
with gzip.open(f'{ch1_path}/read_model_map.tsv.gz', 'wb') as f:
f.write('read_id\ttranscript_id\n'.encode())
for g,tr_nr, tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
trid=f'{g.id}_{tr_nr}'
for rid in [r for rl in tr['reads'].values() for r in rl]:
f.write(f'{rid}\t{trid}\n'.encode())
if sample=='mouse_ES': # not allowed for challenge 2
os._exit(1)
# ## Subchallenge 2: Expression Table
# * Observation: at least for PacBio cDNA protocol, there is a length dependecy read counts per transcript when compared to Illumina/Kallisto
# * Idea: the counts per transcript are scaled by length dependent factor
# * defined as the ratio of the expected/observed transcript length distribution
# * transcript length distributions are modeled as log normal distributions
# * derive expected transcript length distribution from reference annotation
# * only include transcript types that we expect to sequence.
# In[11]:
#prepare metadata
Path(ch2_path).mkdir(parents=True, exist_ok=True)
try:
with open(f'{ch2_path}/../entry.json') as f:
entry = json.load(f)
except FileNotFoundError:
entry={'entry_id': f'iso_quant_{protocol}_{submission_id}',
'challenge_id': 'iso_quant',
'team_name': 'IsoTools',
'experiment_ids': [],
'data_category': 'long_only',
'samples': [],
'library_preps': [],
'platforms': [],
'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}
entry['samples']=list(set(entry['samples']+[sample_identifer]))
entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))
with open(f'{ch2_path}/../entry.json', 'w') as json_file:
json.dump(entry, json_file)
# In[12]:
lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
lib_accession=['syn25683376']
if sample == 'mouse_simulation':
lib_accession=['syn25683381']
experiment={'experiment_id': f'{sample}_{protocol}_long',
'challenge_id': 'iso_quant',
'description': norm_description,
'notes': 'processed with IsoTools',
'species': sample[:sample.find('_')],
'data_category': 'long_only',
'samples': [sample_identifer],
'library_preps': lib_identifiers,
'platforms': platforms_identifiers,
'libraries': lib_accession,
'extra_libraries': [],
'software': [{'name': 'IsoTools',
'description': 'Python module for the analysis of LRTS data',
'version': isotools.__version__,
'url': 'https://pypi.org/project/isotools/',
'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
with open(f'{ch2_path}/experiment.json', 'w') as json_file:
json.dump(experiment, json_file)
# In[13]:
#number of genes/transcripts in reference annotation per category
ref_g_infos={k:{} for k in ['level', 'tag', 'gene_type', 'gene_biotype']}
ref_tr_infos={k:{} for k in ['transcript_support_level', 'transcript_type']}
for g in isoseq:
if g.is_annotated:
for k in ref_g_infos:
v=g.data['reference'].get(k,'unknown')
ref_g_infos[k].setdefault(v,0)
ref_g_infos[k][v]+=1
for tr in g.ref_transcripts:
for k in ref_tr_infos:
v=tr.get(k,'unknown')
ref_tr_infos[k].setdefault(v,0)
ref_tr_infos[k][v]+=1
# In[14]:
#number of FSM transcripts "substantially expressed" in long reads per category
found={k:{} for k in ['transcript_support_level', 'transcript_type']}
for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
if tr['annotation'][0]:#no FSM
continue
ref_tr_nr=tr['annotation'][1]['FSM'][0]
ref_tr=g.ref_transcripts[ref_tr_nr]
for k in found:
v=ref_tr.get(k,'unknown')
found[k][v]=found[k].get(v,0)+1
# In[15]:
# what types of transcripts are actually sequenced in this protocol
# and thus should be considered for length normalization
k='transcript_type'
trtype_fractions={t: found[k].get(t,0)/n for t,n in ref_tr_infos[k].items()}
for t,f in sorted(trtype_fractions.items(),key=lambda x: - x[1]):
if ref_tr_infos['transcript_type'][t]>100:
print(f'{t}: {f:.2%}')
# "unknown" seems to be the spike ins
# In[16]:
ref_len={}
for tr_type in ref_tr_infos['transcript_type']:
if ref_tr_infos['transcript_type'][tr_type]>100:
ref_len[tr_type]=[sum(e[1]-e[0] for e in tr['exons']) for _,_,tr in isoseq.iter_ref_transcripts() if tr.get("transcript_type",'unknown')==tr_type]
# In[17]:
from scipy.stats import lognorm
#fit lognorm distribution to reference
#reflen_params = {subset:lognorm.fit(ref_len[subset], floc=0) for subset in ref_len}
reflen_params=lognorm.fit([trlen for subset,f in trtype_fractions.items() if f>.05 for trlen in ref_len.get(subset,[])], floc=0)
# In[18]:
#fit lognormal to observed sample transcript length
trlen=[]
cov=[]
trid=[]
current=None
for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
if g!=current:
current=g
current_cov=g.coverage
cov.append(current_cov[:,trnr])
trlen.append(sum(e[1]-e[0] for e in tr['exons']))
trid.append(f'{g.id}_{trnr}')
cov=pd.DataFrame(cov,columns=isoseq.samples, index=trid)
# In[19]:
salen_params={}
for sa in isoseq.samples:
data=[l for l,n in zip(trlen,cov[sa]) for _ in range(n)]
salen_params[sa]=lognorm.fit(data, floc=0)
# In[20]:
def norm_factor(sa, trlen,lower_tail_fraction=.01, log=False):
#for short transcripts, the lognorm fit seems to fail - cap the lower tail
th=lognorm.ppf(lower_tail_fraction, *salen_params[sa])
trlen=np.array([tl if tl>th else th for tl in trlen])
sa_fitted = lognorm.pdf(trlen, *salen_params[sa])
ref_fitted = lognorm.pdf(trlen, *reflen_params)
if log:
return np.log10(ref_fitted)-np.log10(sa_fitted)
return ref_fitted/sa_fitted
# In[21]:
lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
lib_accession=['syn25683376']
if sample == 'mouse_simulation':
lib_accession=['syn25683381']
#apply normalization factor
expr_tab=pd.concat([cov[sa]*norm_factor(sa, trlen) for sa in isoseq.samples], axis=1)
#scale to TPM
expr_tab=expr_tab/expr_tab.sum()*1e6
expr_tab.index.name='ID'
expr_tab.columns=lib_accession
expr_tab.to_csv(f'{ch2_path}/expression.tsv.gz', sep='\t',compression='gzip')