Skip to content
Permalink
c95019ec5a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
352 lines (278 sloc) 8.78 KB
//
// htsutils.c
// PASSFinder
//
// Created by Georgi Tushev on 07/09/16.
// Copyright © 2016 Scientific Computing. All rights reserved.
//
#include "htsutils.h"
void hts_init(hts_config_t *hts, aux_config_t *aux)
{
/* open input stream */
hts->fh_in = sam_open(aux->file_in, "r");
if (!hts->fh_in)
{
fprintf(stderr, "Error : HTS_INIT : coult not open input stream from %s\n", aux->file_in);
exit(EXIT_FAILURE);
}
/* parse BAM header */
hts->bam_hdr = sam_hdr_read(hts->fh_in);
if (!hts->bam_hdr)
{
fprintf(stderr, "Error : HTS_INIT : coult not read BAM header %s\n", aux->file_in);
exit(EXIT_FAILURE);
}
/* read tabix BED mask */
hts->tabix = regidx_init(aux->file_mask, NULL, NULL, 0, NULL);
if (!hts->tabix)
{
fprintf(stderr, "Error :: failed to load polymask index!\n");
exit(EXIT_FAILURE);
}
/* initialize BAM reocrd */
hts->bam = bam_init1();
/* initialize CHROM hash of hashes */
hts->pstrand = kh_init(pileup_h);
hts->nstrand = kh_init(pileup_h);
return;
}
void hts_parse(hts_config_t *hts, aux_config_t *aux)
{
/* local counters */
int passBaseCondition = 0;
uint32_t bam_lines = 0;
uint32_t polySize_tail = 0;
uint32_t polySize_read = 0;
uint32_t base = 0;
/* hash related variables */
int absent;
uint64_t hkey = 0;
pileup_t *hvalue;
khiter_t hitr;
khash_t(pileup_h) *hstrand;
/* mask poly base variables */
uint32_t polyBaseRegion_start = 0;
uint32_t polyBaseRegion_end = 0;
regitr_t mitr;
/* read BAM record by record */
while (sam_read1(hts->fh_in, hts->bam_hdr, hts->bam) >= 0)
{
/* count bam lines */
bam_lines++;
/* filter based on MAPQ */
if (aux->mapq > hts->bam->core.qual)
continue;
/* strand specific information */
if (hts->bam->core.flag & BAM_FREVERSE)
{
// set current hash pointer
hstrand = hts->nstrand;
// check if tail in read
polySize_tail = findPoly(hts->bam, 'T', POLY_HEAD);
// define last base
base = hts->bam->core.pos + 1;
}
else
{
// set current hash pointer
hstrand = hts->pstrand;
// check if tail in read
polySize_tail = findPoly(hts->bam, 'A', POLY_TAIL);
// define last base
base = bam_calend(&hts->bam->core, bam_get_cigar(hts->bam));
}
/* overall alignment length */
polySize_read = bam_calqlen(&hts->bam->core, bam_get_cigar(hts->bam));
/* define poly base condition */
passBaseCondition = ((polySize_tail >= aux->polysize) && (polySize_read <= (hts->bam->core.l_qseq - aux->polysize))) ? 1 : 0;
/* create 64bit hash key from 32bit chrom and 32bit position */
hkey = (uint64_t)hts->bam->core.tid << 32 | base;
/* check if key exist in hash */
hitr = kh_put(pileup_h, hstrand, hkey, &absent);
if (absent)
{
/* add new element */
// allocate hash value
hvalue = (pileup_p)malloc(sizeof(pileup_t));
if (!hvalue)
{
fprintf(stderr,"Memory failed for new hash value!\n");
exit(EXIT_FAILURE);
}
// populate hash value
hvalue->tid = hts->bam->core.tid;
hvalue->base = base;
hvalue->reads = 1;
hvalue->pass = (passBaseCondition) ? 1 : 0;
hvalue->poly = 0;
// check if base needs to be masked for internal priming
polyBaseRegion_start = (base > aux->masksize) ? (base - aux->masksize) : 0;
polyBaseRegion_end = base + aux->masksize;
if (regidx_overlap(hts->tabix, hts->bam_hdr->target_name[hts->bam->core.tid], polyBaseRegion_start, polyBaseRegion_end, &mitr))
{
hvalue->poly = 1;
}
// insert new value into hash
kh_value(hstrand, hitr) = hvalue;
}
else
{
/* update old element */
// retrieve hash value
hvalue = kh_value(hstrand, hitr);
// update read counter
hvalue->reads++;
// update poly counter
if (passBaseCondition)
hvalue->pass++;
}
}
return;
}
void hts_freehash(khash_t(pileup_h) **href)
{
khash_t(pileup_h) *hash = *href;
khiter_t hitr;
pileup_p hvalue;
for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr)
{
if (kh_exist(hash, hitr))
{
hvalue = kh_value(hash, hitr);
free(hvalue);
}
}
kh_destroy(pileup_h, hash);
return;
}
void hts_destroy(hts_config_t *hts)
{
/* destroy hashes */
hts_freehash(&hts->pstrand);
hts_freehash(&hts->nstrand);
bam_destroy1(hts->bam);
bam_hdr_destroy(hts->bam_hdr);
regidx_destroy(hts->tabix);
sam_close(hts->fh_in);
return;
}
void hts_dumphash(khash_t(pileup_h) *hash, bam_hdr_t *hdr, const char strand)
{
khiter_t hitr;
pileup_p hvalue;
for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr)
{
if (kh_exist(hash, hitr))
{
hvalue = kh_value(hash, hitr);
printf("%s\t%d\t%c\t%d\t%d\t%d\n", hdr->target_name[hvalue->tid], hvalue->base, strand, hvalue->reads, hvalue->pass, hvalue->poly);
}
}
return;
}
void hts_printout(hts_config_t *hts)
{
// print out positive strand hash
hts_dumphash(hts->pstrand, hts->bam_hdr, '+');
// print out negative strand hash
hts_dumphash(hts->nstrand, hts->bam_hdr, '-');
}
/* dumpSeq
* return pointer to sequence in BAM file
*/
void dumpBamSeq(bam1_t *bam)
{
int i;
for (i = 0; i < bam->core.l_qseq; ++i)
{
printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bam),i)]);
}
}
/* findPoly
* modified findTailPolyAMaybeMask
* https://github.com/jstjohn/KentLib/blob/master/lib/dnautil.c
* Kent et.al., 2002
*/
uint32_t findPoly(bam1_t *bam, int base, int side)
{
int i = 0;
int currScore = 10;
int bestScore = 10;
int currPosition = 0;
int bestPosition = -1;
uint32_t polySize = 0;
int smallBase = tolower(base);
int bigBase = toupper(base);
char currBase = 'X';
for (i = 0; i < bam->core.l_qseq; ++i)
{
// define current position based on sequence side
currPosition = i;
if (side == POLY_TAIL)
currPosition = bam->core.l_qseq - i - 1;
// get current base from BAM
currBase = seq_nt16_str[bam_seqi(bam_get_seq(bam), currPosition)];
// skip N match
if (currBase == 'N' || currBase == 'n')
continue;
// compare to query base
if (currBase == bigBase || currBase == smallBase)
{
// add match score
currScore += 1;
// update best score
if (currScore >= (bestScore - 6))
{
bestScore = currScore;
bestPosition = currPosition;
}
}
else
{
// add mismatch score
currScore -= 8;
}
// break if score is negative
if (currScore < 0)
break;
// cap if score is too big
if (currScore > 20)
currScore = 20;
}
/* assign result */
if (bestPosition >= 0)
{
polySize = bestPosition + 1;
if (side == POLY_TAIL)
polySize = bam->core.l_qseq - bestPosition;
//if (polySize < 0)
// polySize = 0;
}
return polySize;
}
/* bam_calqlen
* Calculate query alignment length
*/
static inline uint32_t bam_calqlen(const bam1_core_t *core, const uint32_t *cigar)
{
int i;
uint32_t qlen = 0;
for (i = 0; i < core->n_cigar; ++i)
{
int span = cigar[i] >> 4;
int op = cigar[i] & 0xf;
if ( (op == BAM_CMATCH) || (op == BAM_CDEL) || (op == BAM_CREF_SKIP))
{
qlen += span;
}
}
return qlen;
}
/* bam_calend
* Calculate the rightmost coordinate of an alignment on the reference genome.
* bam.h https://github.com/samtools/samtools/search?utf8=%E2%9C%93&q=bam_calend
*/
static inline uint32_t bam_calend(const bam1_core_t *core, const uint32_t *cigar)
{
return core->pos + (core->n_cigar ? bam_cigar2rlen(core->n_cigar, cigar) : 1);
}