challenge.py

#!/usr/bin/env python
# coding: utf-8

# # Creation of files for submission
# This Script creates all files necessary for submission of sub challenge 1 (transcript calling) and sub challenge 2 (isoform quantification)
# The primary results are a gtf for sc1 and a expression table for sc2
#

import sys
import os

# Check the arguments
arguments = sys.argv[1:]
assert len(arguments)==3, f'Usage:{sys.argv[0]} <sample> <protocol> <submission>'
sample=arguments[0]
protocol=arguments[1]
submission_id=arguments[2]
assert sample  in ['human_H1_mix', 'human_WTC11','mouse_ES','human_simulation', 'mouse_simulation'], f'unknown sample {sample}'
assert protocol  in ['PacBio_CapTrap','PacBio_cDNA'], f'unknown protocol {protocol}'


from  isotools.transcriptome import Transcriptome
import isotools
import numpy as np
import pandas as pd
from glob import glob
import os
from scipy.stats import lognorm
from pathlib import Path
import gzip
import json

print (f'This is isotools version {isotools.__version__}')
import logging
logger=logging.getLogger('isotools')
logger.setLevel(logging.INFO)


# In[2]:


path='/project/42/pacbio/LRGASP'
#sample='mouse_ES'
#protocol='PacBio_CapTrap'
#submission_id='IsoTools_final'
out_fn=f'{path}/submission/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}'
ch1_path=f'{path}/submission/iso_detect_ref_{protocol}_{submission_id}/{sample}_{protocol}_long'
ch2_path=f'{path}/submission/iso_quant_{protocol}_{submission_id}/{sample}_{protocol}_long'
selection_strategy='FSM or NIC: tr coverage >2% of gene coverage and tr coverage >= 2 reads NOVEL_SPLICING or NOVEL_GENE: tr coverage >5% of gene coverage and tr coverage >= 5 reads and less than 50% A content and direct repeat len<=6'
norm_description='length dependend normalization factor based on ratio of observed length distribution over reference transcripts'


bam_files={}
for fn in sorted(glob(f'{path}/isoseq/05-raw_align/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}*_trimmed.bam')):
    infos=os.path.basename(fn).split('_')[:-1]
    sn='_'.join(infos[:-1])
    bam_files[sn]=fn
print(f'samples: {",".join(bam_files)}')


# ## Import of long reads:
# * read the aligned long reads
# * corrections:
#     * fuzzy junctions: same shift on 5' and 3' wrt reference junction
#     * chain chimeric alignments on same chromosome/ orientation
# * group reads to transcripts:
#     * same splice junctions <-> same transcript
#     * for mono-exons: at least 50% overlap
# * define TSS/PAS: mean of read start/end positions
# * categorize transcripts:
#     * FSM: which reference transcript
#     * other: how exactly different to what transcripts
#

# In[3]:


if sample[:5]=='human':
    ref_fn=f'{path}/data/references/lrgasp_gencode_v38_sirvs.sorted'
    genome_fn=f'{path}/data/references/lrgasp_grch38_sirvs.fasta'
else:
    ref_fn=f'{path}/data/references/lrgasp_gencode_vM27_sirvs.sorted'
    genome_fn=f'{path}/data/references/lrgasp_grcm39_sirvs.fasta'

try:
    isoseq=Transcriptome(out_fn+'_isotools.pkl')
except FileNotFoundError:
    try:
        isoseq=Transcriptome.from_reference(ref_fn+'.isotools.pkl')
        isoseq.collapse_immune_genes()
    except FileNotFoundError:
        isoseq=Transcriptome.from_reference(ref_fn+'.gtf.gz', infer_genes=True)#, chromosomes=spikein)
        isoseq.save_reference(ref_fn+'.isotools.pkl')
        isoseq.collapse_immune_genes()
    #isoseq=Transcriptome('...')
    for sa,bam in  bam_files.items():
        isoseq.add_sample_from_bam(bam, save_readnames=True,min_align_fraction=0.75, sample_name=sa, group=sa, cell_type=sample, protocol=protocol)#, add_chromosomes=False) #33+17 minutes

    isoseq.add_qc_metrics(genome_fn) #about 20 min
    isoseq.make_index()
    isoseq.save(out_fn+'_isotools.pkl')


# In[5]:


gene_filter={'CHIMERIC': 'chimeric', 'EXPRESSED': 'transcripts', 'NOVEL_GENE': 'not reference', 'SPIKE_IN':f'chr[:4] in ["ERCC","SIRV"]'}
transcript_filter=isotools._transcriptome_filter.DEFAULT_TRANSCRIPT_FILTER
transcript_filter['FSM']='annotation[0]==0'
transcript_filter['RTTS']='bool(novel_splice_sites) and any(i%2==0 and i+1 in novel_splice_sites and direct_repeat_len[i%2]>=6 for i in novel_splice_sites)'

ref_transcript_filter=isotools._transcriptome_filter.DEFAULT_REF_TRANSCRIPT_FILTER
ref_transcript_filter['HIGH_SUPPORT']='transcript_support_level in ["1", "2"]'
isoseq.add_filter(gene_filter=gene_filter, transcript_filter=transcript_filter,ref_transcript_filter=ref_transcript_filter)


# In[6]:


# add a Tag "substantial" that the transcript has at least 2% of the total reads of the gene.
#This cannot be done in the common framework, hence add this placeholder and manually add it.
# note that add_filter would crash with this rule, as the gene cannot be accessed.
isoseq.infos['filter']['transcript_filter']['SUBSTANTIAL']=selection_strategy
for g,trnr,tr in isoseq.iter_transcripts():
    cov=g.coverage[:,trnr].sum()
    if g.is_annotated:
        if 'novel_splice_sites' in tr:
            if cov > 4 and cov>g.coverage.sum()*.05 and 'RTTS' not in tr['filter']:
                tr['filter'].append('SUBSTANTIAL')
        elif cov > 1 and cov > g.coverage.sum()*.02:
            tr['filter'].append('SUBSTANTIAL')
    elif cov > 4 and cov>g.coverage.sum()*.05 and tr['downstream_A_content']>.5 and 'RTTS' not in tr['filter']:
        tr['filter'].append('SUBSTANTIAL')


# ## Subchallenge 1: Transcript gtf
# current filter: only substantially expressed - many issues are filtered already
#
# todo: Add more filter criteria
# * truncations
#     * variable TSS/PAS?
#     * within/between protocols/samples
#     * in particular for transcript with deviance from annotation
# * internal priming
#     * check odd pattern
#     * issue with polyA trimming?
# * RTTS
#     * check illumina support of novel junctions
#     * alignment quality at junctions?
#
# Also I could add novel gene subcategory infos to the gtf (just to show off)

# In[7]:


#only one entry per challenge, e.g. one for iso_detect_ref and one for iso_quant
#with open(f'{path}/scripts/lrgasp-challenge-1-evaluation/utilities/entry_dummy.json') as f:
#    entry = json.load(f)

Path(ch1_path).mkdir(parents=True, exist_ok=True)

try:
    with open(f'{ch1_path}/../entry.json') as f:
        entry = json.load(f)

except FileNotFoundError:
    entry={'entry_id': f'iso_detect_ref_{protocol}_{submission_id}',
     'challenge_id': 'iso_detect_ref',
     'team_name': 'IsoTools',
     'experiment_ids': [],
     'data_category': 'long_only',
     'samples': [],
     'library_preps': [],
     'platforms': [],
     'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}

#sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
#lib_identifiers=[protocol[protocol.find('_')+1:]] if '_' in protocol else []
#platforms_identifiers= [protocol[:protocol.find('_')]] if '_' in protocol else [protocol]

sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
lib_identifiers=[protocol[protocol.find('_')+1:]]
platforms_identifiers= [protocol[:protocol.find('_')]]


entry['samples']=list(set(entry['samples']+[sample_identifer]))
entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))

with open(f'{ch1_path}/../entry.json', 'w') as json_file:
    json.dump(entry, json_file)


# In[8]:


#prepare metadata
lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
    lib_accession=['syn25683376']
if sample == 'mouse_simulation':
    lib_accession=['syn25683381']


experiment={'experiment_id': f'{sample}_{protocol}_long',
 'challenge_id': 'iso_detect_ref', #"iso_quant"
 'description': selection_strategy,
 'notes': 'processed with IsoTools',
 'species': sample[:sample.find('_')],
 'data_category': 'long_only',
 'samples': [sample_identifer],
 'library_preps': lib_identifiers,
 'platforms': platforms_identifiers,
 'libraries': lib_accession,
 'extra_libraries': [],
 'software': [{'name': 'IsoTools',
   'description': 'Python module for the analysis of LRTS data',
   'version': isotools.__version__,
   'url': 'https://pypi.org/project/isotools/',
   'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
with open(f'{ch1_path}/experiment.json', 'w') as json_file:
    json.dump(experiment, json_file)


# In[9]:


# models.gtf.gz - GTF file with model annotations, compressed with gzip.
isoseq.write_gtf(ch1_path+'/models.gtf.gz', source=protocol, use_gene_name=False, include=['SUBSTANTIAL'], gzip=True)


# In[10]:


with gzip.open(f'{ch1_path}/read_model_map.tsv.gz', 'wb') as f:
    f.write('read_id\ttranscript_id\n'.encode())
    for g,tr_nr, tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
        trid=f'{g.id}_{tr_nr}'
        for rid in [r for rl in tr['reads'].values() for r in rl]:
            f.write(f'{rid}\t{trid}\n'.encode())

if sample=='mouse_ES': # not allowed for challenge 2
    os._exit(1)
# ## Subchallenge 2: Expression Table
# * Observation: at least for PacBio cDNA protocol, there is a length dependecy read counts per transcript when compared to Illumina/Kallisto
# * Idea: the counts per transcript are scaled by length dependent factor
#     * defined as the ratio of the expected/observed transcript length distribution
#     * transcript length distributions are modeled as log normal distributions
#     * derive expected transcript length distribution from reference annotation
#     * only include transcript types that we expect to sequence.

# In[11]:

#prepare metadata
Path(ch2_path).mkdir(parents=True, exist_ok=True)

try:
    with open(f'{ch2_path}/../entry.json') as f:
        entry = json.load(f)

except FileNotFoundError:
    entry={'entry_id': f'iso_quant_{protocol}_{submission_id}',
     'challenge_id': 'iso_quant',
     'team_name': 'IsoTools',
     'experiment_ids': [],
     'data_category': 'long_only',
     'samples': [],
     'library_preps': [],
     'platforms': [],
     'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}

entry['samples']=list(set(entry['samples']+[sample_identifer]))
entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))

with open(f'{ch2_path}/../entry.json', 'w') as json_file:
    json.dump(entry, json_file)


# In[12]:


lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
    lib_accession=['syn25683376']
if sample == 'mouse_simulation':
    lib_accession=['syn25683381']

experiment={'experiment_id': f'{sample}_{protocol}_long',
 'challenge_id': 'iso_quant',
 'description': norm_description,
 'notes': 'processed with IsoTools',
 'species': sample[:sample.find('_')],
 'data_category': 'long_only',
 'samples': [sample_identifer],
 'library_preps': lib_identifiers,
 'platforms': platforms_identifiers,
 'libraries': lib_accession,
 'extra_libraries': [],
 'software': [{'name': 'IsoTools',
   'description': 'Python module for the analysis of LRTS data',
   'version': isotools.__version__,
   'url': 'https://pypi.org/project/isotools/',
   'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
with open(f'{ch2_path}/experiment.json', 'w') as json_file:
    json.dump(experiment, json_file)


# In[13]:


#number of genes/transcripts in reference annotation per category
ref_g_infos={k:{} for k in ['level', 'tag', 'gene_type', 'gene_biotype']}
ref_tr_infos={k:{} for k in ['transcript_support_level', 'transcript_type']}
for g in isoseq:
    if g.is_annotated:
        for k in ref_g_infos:
            v=g.data['reference'].get(k,'unknown')
            ref_g_infos[k].setdefault(v,0)
            ref_g_infos[k][v]+=1
        for tr in g.ref_transcripts:
            for k in ref_tr_infos:
                v=tr.get(k,'unknown')
                ref_tr_infos[k].setdefault(v,0)
                ref_tr_infos[k][v]+=1


# In[14]:


#number of FSM transcripts "substantially expressed" in long reads per category
found={k:{} for k in ['transcript_support_level', 'transcript_type']}
for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
    if tr['annotation'][0]:#no FSM
        continue
    ref_tr_nr=tr['annotation'][1]['FSM'][0]
    ref_tr=g.ref_transcripts[ref_tr_nr]
    for k in found:
        v=ref_tr.get(k,'unknown')
        found[k][v]=found[k].get(v,0)+1


# In[15]:


# what types of transcripts are actually sequenced in this protocol
# and thus should be considered for length normalization
k='transcript_type'
trtype_fractions={t: found[k].get(t,0)/n for t,n in ref_tr_infos[k].items()}
for t,f in sorted(trtype_fractions.items(),key=lambda x: - x[1]):
    if ref_tr_infos['transcript_type'][t]>100:
        print(f'{t}: {f:.2%}')
# "unknown" seems to be the spike ins


# In[16]:


ref_len={}
for tr_type in ref_tr_infos['transcript_type']:
    if ref_tr_infos['transcript_type'][tr_type]>100:
        ref_len[tr_type]=[sum(e[1]-e[0] for e in tr['exons']) for _,_,tr in isoseq.iter_ref_transcripts() if tr.get("transcript_type",'unknown')==tr_type]


# In[17]:


from scipy.stats import lognorm
#fit lognorm distribution to reference

#reflen_params = {subset:lognorm.fit(ref_len[subset], floc=0) for subset in ref_len}
reflen_params=lognorm.fit([trlen for subset,f in trtype_fractions.items() if f>.05 for trlen in ref_len.get(subset,[])], floc=0)


# In[18]:


#fit lognormal to observed sample transcript length
trlen=[]
cov=[]
trid=[]
current=None
for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
    if g!=current:
        current=g
        current_cov=g.coverage
    cov.append(current_cov[:,trnr])
    trlen.append(sum(e[1]-e[0] for e in tr['exons']))
    trid.append(f'{g.id}_{trnr}')
cov=pd.DataFrame(cov,columns=isoseq.samples, index=trid)


# In[19]:


salen_params={}
for sa in isoseq.samples:
    data=[l for l,n in zip(trlen,cov[sa]) for _ in range(n)]
    salen_params[sa]=lognorm.fit(data, floc=0)


# In[20]:


def norm_factor(sa, trlen,lower_tail_fraction=.01, log=False):
    #for short transcripts, the lognorm fit seems to fail - cap the lower tail
    th=lognorm.ppf(lower_tail_fraction, *salen_params[sa])
    trlen=np.array([tl if tl>th else th for tl in trlen])

    sa_fitted = lognorm.pdf(trlen, *salen_params[sa])
    ref_fitted = lognorm.pdf(trlen, *reflen_params)
    if log:
        return np.log10(ref_fitted)-np.log10(sa_fitted)
    return ref_fitted/sa_fitted


# In[21]:


lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
if sample == 'human_simulation':
    lib_accession=['syn25683376']
if sample == 'mouse_simulation':
    lib_accession=['syn25683381']

#apply normalization factor
expr_tab=pd.concat([cov[sa]*norm_factor(sa, trlen) for sa in isoseq.samples], axis=1)
#scale to TPM
expr_tab=expr_tab/expr_tab.sum()*1e6
expr_tab.index.name='ID'
expr_tab.columns=lib_accession
expr_tab.to_csv(f'{ch2_path}/expression.tsv.gz', sep='\t',compression='gzip')
	#!/usr/bin/env python
	# coding: utf-8

	# # Creation of files for submission
	# This Script creates all files necessary for submission of sub challenge 1 (transcript calling) and sub challenge 2 (isoform quantification)
	# The primary results are a gtf for sc1 and a expression table for sc2
	#

	import sys
	import os

	# Check the arguments
	arguments = sys.argv[1:]
	assert len(arguments)==3, f'Usage:{sys.argv[0]} <sample> <protocol> <submission>'
	sample=arguments[0]
	protocol=arguments[1]
	submission_id=arguments[2]
	assert sample in ['human_H1_mix', 'human_WTC11','mouse_ES','human_simulation', 'mouse_simulation'], f'unknown sample {sample}'
	assert protocol in ['PacBio_CapTrap','PacBio_cDNA'], f'unknown protocol {protocol}'



	from isotools.transcriptome import Transcriptome
	import isotools
	import numpy as np
	import pandas as pd
	from glob import glob
	import os
	from scipy.stats import lognorm
	from pathlib import Path
	import gzip
	import json

	print (f'This is isotools version {isotools.__version__}')
	import logging
	logger=logging.getLogger('isotools')
	logger.setLevel(logging.INFO)


	# In[2]:


	path='/project/42/pacbio/LRGASP'
	#sample='mouse_ES'
	#protocol='PacBio_CapTrap'
	#submission_id='IsoTools_final'
	out_fn=f'{path}/submission/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}'
	ch1_path=f'{path}/submission/iso_detect_ref_{protocol}_{submission_id}/{sample}_{protocol}_long'
	ch2_path=f'{path}/submission/iso_quant_{protocol}_{submission_id}/{sample}_{protocol}_long'
	selection_strategy='FSM or NIC: tr coverage >2% of gene coverage and tr coverage >= 2 reads NOVEL_SPLICING or NOVEL_GENE: tr coverage >5% of gene coverage and tr coverage >= 5 reads and less than 50% A content and direct repeat len<=6'
	norm_description='length dependend normalization factor based on ratio of observed length distribution over reference transcripts'


	bam_files={}
	for fn in sorted(glob(f'{path}/isoseq/05-raw_align/{sample}_{protocol.split("_")[0] if "simulation" in sample else protocol}*_trimmed.bam')):
	infos=os.path.basename(fn).split('_')[:-1]
	sn='_'.join(infos[:-1])
	bam_files[sn]=fn
	print(f'samples: {",".join(bam_files)}')


	# ## Import of long reads:
	# * read the aligned long reads
	# * corrections:
	# * fuzzy junctions: same shift on 5' and 3' wrt reference junction
	# * chain chimeric alignments on same chromosome/ orientation
	# * group reads to transcripts:
	# * same splice junctions <-> same transcript
	# * for mono-exons: at least 50% overlap
	# * define TSS/PAS: mean of read start/end positions
	# * categorize transcripts:
	# * FSM: which reference transcript
	# * other: how exactly different to what transcripts
	#

	# In[3]:


	if sample[:5]=='human':
	ref_fn=f'{path}/data/references/lrgasp_gencode_v38_sirvs.sorted'
	genome_fn=f'{path}/data/references/lrgasp_grch38_sirvs.fasta'
	else:
	ref_fn=f'{path}/data/references/lrgasp_gencode_vM27_sirvs.sorted'
	genome_fn=f'{path}/data/references/lrgasp_grcm39_sirvs.fasta'

	try:
	isoseq=Transcriptome(out_fn+'_isotools.pkl')
	except FileNotFoundError:
	try:
	isoseq=Transcriptome.from_reference(ref_fn+'.isotools.pkl')
	isoseq.collapse_immune_genes()
	except FileNotFoundError:
	isoseq=Transcriptome.from_reference(ref_fn+'.gtf.gz', infer_genes=True)#, chromosomes=spikein)
	isoseq.save_reference(ref_fn+'.isotools.pkl')
	isoseq.collapse_immune_genes()
	#isoseq=Transcriptome('...')
	for sa,bam in bam_files.items():
	isoseq.add_sample_from_bam(bam, save_readnames=True,min_align_fraction=0.75, sample_name=sa, group=sa, cell_type=sample, protocol=protocol)#, add_chromosomes=False) #33+17 minutes

	isoseq.add_qc_metrics(genome_fn) #about 20 min
	isoseq.make_index()
	isoseq.save(out_fn+'_isotools.pkl')


	# In[5]:


	gene_filter={'CHIMERIC': 'chimeric', 'EXPRESSED': 'transcripts', 'NOVEL_GENE': 'not reference', 'SPIKE_IN':f'chr[:4] in ["ERCC","SIRV"]'}
	transcript_filter=isotools._transcriptome_filter.DEFAULT_TRANSCRIPT_FILTER
	transcript_filter['FSM']='annotation[0]==0'
	transcript_filter['RTTS']='bool(novel_splice_sites) and any(i%2==0 and i+1 in novel_splice_sites and direct_repeat_len[i%2]>=6 for i in novel_splice_sites)'

	ref_transcript_filter=isotools._transcriptome_filter.DEFAULT_REF_TRANSCRIPT_FILTER
	ref_transcript_filter['HIGH_SUPPORT']='transcript_support_level in ["1", "2"]'
	isoseq.add_filter(gene_filter=gene_filter, transcript_filter=transcript_filter,ref_transcript_filter=ref_transcript_filter)


	# In[6]:


	# add a Tag "substantial" that the transcript has at least 2% of the total reads of the gene.
	#This cannot be done in the common framework, hence add this placeholder and manually add it.
	# note that add_filter would crash with this rule, as the gene cannot be accessed.
	isoseq.infos['filter']['transcript_filter']['SUBSTANTIAL']=selection_strategy
	for g,trnr,tr in isoseq.iter_transcripts():
	cov=g.coverage[:,trnr].sum()
	if g.is_annotated:
	if 'novel_splice_sites' in tr:
	if cov > 4 and cov>g.coverage.sum()*.05 and 'RTTS' not in tr['filter']:
	tr['filter'].append('SUBSTANTIAL')
	elif cov > 1 and cov > g.coverage.sum()*.02:
	tr['filter'].append('SUBSTANTIAL')
	elif cov > 4 and cov>g.coverage.sum()*.05 and tr['downstream_A_content']>.5 and 'RTTS' not in tr['filter']:
	tr['filter'].append('SUBSTANTIAL')


	# ## Subchallenge 1: Transcript gtf
	# current filter: only substantially expressed - many issues are filtered already
	#
	# todo: Add more filter criteria
	# * truncations
	# * variable TSS/PAS?
	# * within/between protocols/samples
	# * in particular for transcript with deviance from annotation
	# * internal priming
	# * check odd pattern
	# * issue with polyA trimming?
	# * RTTS
	# * check illumina support of novel junctions
	# * alignment quality at junctions?
	#
	# Also I could add novel gene subcategory infos to the gtf (just to show off)

	# In[7]:


	#only one entry per challenge, e.g. one for iso_detect_ref and one for iso_quant
	#with open(f'{path}/scripts/lrgasp-challenge-1-evaluation/utilities/entry_dummy.json') as f:
	# entry = json.load(f)

	Path(ch1_path).mkdir(parents=True, exist_ok=True)

	try:
	with open(f'{ch1_path}/../entry.json') as f:
	entry = json.load(f)

	except FileNotFoundError:
	entry={'entry_id': f'iso_detect_ref_{protocol}_{submission_id}',
	'challenge_id': 'iso_detect_ref',
	'team_name': 'IsoTools',
	'experiment_ids': [],
	'data_category': 'long_only',
	'samples': [],
	'library_preps': [],
	'platforms': [],
	'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}

	#sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
	#lib_identifiers=[protocol[protocol.find('_')+1:]] if '_' in protocol else []
	#platforms_identifiers= [protocol[:protocol.find('_')]] if '_' in protocol else [protocol]

	sample_identifer=sample if 'simulation' in sample else sample[sample.find('_')+1:]
	lib_identifiers=[protocol[protocol.find('_')+1:]]
	platforms_identifiers= [protocol[:protocol.find('_')]]


	entry['samples']=list(set(entry['samples']+[sample_identifer]))
	entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
	entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
	entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))

	with open(f'{ch1_path}/../entry.json', 'w') as json_file:
	json.dump(entry, json_file)


	# In[8]:


	#prepare metadata
	lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
	if sample == 'human_simulation':
	lib_accession=['syn25683376']
	if sample == 'mouse_simulation':
	lib_accession=['syn25683381']


	experiment={'experiment_id': f'{sample}_{protocol}_long',
	'challenge_id': 'iso_detect_ref', #"iso_quant"
	'description': selection_strategy,
	'notes': 'processed with IsoTools',
	'species': sample[:sample.find('_')],
	'data_category': 'long_only',
	'samples': [sample_identifer],
	'library_preps': lib_identifiers,
	'platforms': platforms_identifiers,
	'libraries': lib_accession,
	'extra_libraries': [],
	'software': [{'name': 'IsoTools',
	'description': 'Python module for the analysis of LRTS data',
	'version': isotools.__version__,
	'url': 'https://pypi.org/project/isotools/',
	'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
	with open(f'{ch1_path}/experiment.json', 'w') as json_file:
	json.dump(experiment, json_file)


	# In[9]:


	# models.gtf.gz - GTF file with model annotations, compressed with gzip.
	isoseq.write_gtf(ch1_path+'/models.gtf.gz', source=protocol, use_gene_name=False, include=['SUBSTANTIAL'], gzip=True)


	# In[10]:


	with gzip.open(f'{ch1_path}/read_model_map.tsv.gz', 'wb') as f:
	f.write('read_id\ttranscript_id\n'.encode())
	for g,tr_nr, tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
	trid=f'{g.id}_{tr_nr}'
	for rid in [r for rl in tr['reads'].values() for r in rl]:
	f.write(f'{rid}\t{trid}\n'.encode())

	if sample=='mouse_ES': # not allowed for challenge 2
	os._exit(1)
	# ## Subchallenge 2: Expression Table
	# * Observation: at least for PacBio cDNA protocol, there is a length dependecy read counts per transcript when compared to Illumina/Kallisto
	# * Idea: the counts per transcript are scaled by length dependent factor
	# * defined as the ratio of the expected/observed transcript length distribution
	# * transcript length distributions are modeled as log normal distributions
	# * derive expected transcript length distribution from reference annotation
	# * only include transcript types that we expect to sequence.

	# In[11]:

	#prepare metadata
	Path(ch2_path).mkdir(parents=True, exist_ok=True)

	try:
	with open(f'{ch2_path}/../entry.json') as f:
	entry = json.load(f)

	except FileNotFoundError:
	entry={'entry_id': f'iso_quant_{protocol}_{submission_id}',
	'challenge_id': 'iso_quant',
	'team_name': 'IsoTools',
	'experiment_ids': [],
	'data_category': 'long_only',
	'samples': [],
	'library_preps': [],
	'platforms': [],
	'contacts': [{'name': 'Matthias Lienhard', 'email': 'lienhard@molgen.mpg.de'},{'name': 'Ralf Herwig', 'email': 'herwig@molgen.mpg.de'}]}

	entry['samples']=list(set(entry['samples']+[sample_identifer]))
	entry['library_preps']=list(set(entry['library_preps']+lib_identifiers))
	entry['platforms']= list(set(entry['platforms']+platforms_identifiers))
	entry['experiment_ids']= list(set(entry['experiment_ids']+[f'{sample}_{protocol}_long']))

	with open(f'{ch2_path}/../entry.json', 'w') as json_file:
	json.dump(entry, json_file)


	# In[12]:



	lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
	if sample == 'human_simulation':
	lib_accession=['syn25683376']
	if sample == 'mouse_simulation':
	lib_accession=['syn25683381']

	experiment={'experiment_id': f'{sample}_{protocol}_long',
	'challenge_id': 'iso_quant',
	'description': norm_description,
	'notes': 'processed with IsoTools',
	'species': sample[:sample.find('_')],
	'data_category': 'long_only',
	'samples': [sample_identifer],
	'library_preps': lib_identifiers,
	'platforms': platforms_identifiers,
	'libraries': lib_accession,
	'extra_libraries': [],
	'software': [{'name': 'IsoTools',
	'description': 'Python module for the analysis of LRTS data',
	'version': isotools.__version__,
	'url': 'https://pypi.org/project/isotools/',
	'config': 'see https://github.molgen.mpg.de/lienhard/LRGASP_IsoTools'}]}
	with open(f'{ch2_path}/experiment.json', 'w') as json_file:
	json.dump(experiment, json_file)


	# In[13]:


	#number of genes/transcripts in reference annotation per category
	ref_g_infos={k:{} for k in ['level', 'tag', 'gene_type', 'gene_biotype']}
	ref_tr_infos={k:{} for k in ['transcript_support_level', 'transcript_type']}
	for g in isoseq:
	if g.is_annotated:
	for k in ref_g_infos:
	v=g.data['reference'].get(k,'unknown')
	ref_g_infos[k].setdefault(v,0)
	ref_g_infos[k][v]+=1
	for tr in g.ref_transcripts:
	for k in ref_tr_infos:
	v=tr.get(k,'unknown')
	ref_tr_infos[k].setdefault(v,0)
	ref_tr_infos[k][v]+=1


	# In[14]:


	#number of FSM transcripts "substantially expressed" in long reads per category
	found={k:{} for k in ['transcript_support_level', 'transcript_type']}
	for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
	if tr['annotation'][0]:#no FSM
	continue
	ref_tr_nr=tr['annotation'][1]['FSM'][0]
	ref_tr=g.ref_transcripts[ref_tr_nr]
	for k in found:
	v=ref_tr.get(k,'unknown')
	found[k][v]=found[k].get(v,0)+1


	# In[15]:


	# what types of transcripts are actually sequenced in this protocol
	# and thus should be considered for length normalization
	k='transcript_type'
	trtype_fractions={t: found[k].get(t,0)/n for t,n in ref_tr_infos[k].items()}
	for t,f in sorted(trtype_fractions.items(),key=lambda x: - x[1]):
	if ref_tr_infos['transcript_type'][t]>100:
	print(f'{t}: {f:.2%}')
	# "unknown" seems to be the spike ins


	# In[16]:


	ref_len={}
	for tr_type in ref_tr_infos['transcript_type']:
	if ref_tr_infos['transcript_type'][tr_type]>100:
	ref_len[tr_type]=[sum(e[1]-e[0] for e in tr['exons']) for _,_,tr in isoseq.iter_ref_transcripts() if tr.get("transcript_type",'unknown')==tr_type]


	# In[17]:


	from scipy.stats import lognorm
	#fit lognorm distribution to reference

	#reflen_params = {subset:lognorm.fit(ref_len[subset], floc=0) for subset in ref_len}
	reflen_params=lognorm.fit([trlen for subset,f in trtype_fractions.items() if f>.05 for trlen in ref_len.get(subset,[])], floc=0)


	# In[18]:


	#fit lognormal to observed sample transcript length
	trlen=[]
	cov=[]
	trid=[]
	current=None
	for g,trnr,tr in isoseq.iter_transcripts(include=['SUBSTANTIAL']):
	if g!=current:
	current=g
	current_cov=g.coverage
	cov.append(current_cov[:,trnr])
	trlen.append(sum(e[1]-e[0] for e in tr['exons']))
	trid.append(f'{g.id}_{trnr}')
	cov=pd.DataFrame(cov,columns=isoseq.samples, index=trid)


	# In[19]:


	salen_params={}
	for sa in isoseq.samples:
	data=[l for l,n in zip(trlen,cov[sa]) for _ in range(n)]
	salen_params[sa]=lognorm.fit(data, floc=0)


	# In[20]:


	def norm_factor(sa, trlen,lower_tail_fraction=.01, log=False):
	#for short transcripts, the lognorm fit seems to fail - cap the lower tail
	th=lognorm.ppf(lower_tail_fraction, *salen_params[sa])
	trlen=np.array([tl if tl>th else th for tl in trlen])

	sa_fitted = lognorm.pdf(trlen, *salen_params[sa])
	ref_fitted = lognorm.pdf(trlen, *reflen_params)
	if log:
	return np.log10(ref_fitted)-np.log10(sa_fitted)
	return ref_fitted/sa_fitted


	# In[21]:


	lib_accession=isoseq.sample_table['file'].str.split('_').str[-2].tolist()
	if sample == 'human_simulation':
	lib_accession=['syn25683376']
	if sample == 'mouse_simulation':
	lib_accession=['syn25683381']

	#apply normalization factor
	expr_tab=pd.concat([cov[sa]*norm_factor(sa, trlen) for sa in isoseq.samples], axis=1)
	#scale to TPM
	expr_tab=expr_tab/expr_tab.sum()*1e6
	expr_tab.index.name='ID'
	expr_tab.columns=lib_accession
	expr_tab.to_csv(f'{ch2_path}/expression.tsv.gz', sep='\t',compression='gzip')