Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
PASSFinder/htsutils.c
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
352 lines (278 sloc)
8.78 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// htsutils.c | |
// PASSFinder | |
// | |
// Created by Georgi Tushev on 07/09/16. | |
// Copyright © 2016 Scientific Computing. All rights reserved. | |
// | |
#include "htsutils.h" | |
void hts_init(hts_config_t *hts, aux_config_t *aux) | |
{ | |
/* open input stream */ | |
hts->fh_in = sam_open(aux->file_in, "r"); | |
if (!hts->fh_in) | |
{ | |
fprintf(stderr, "Error : HTS_INIT : coult not open input stream from %s\n", aux->file_in); | |
exit(EXIT_FAILURE); | |
} | |
/* parse BAM header */ | |
hts->bam_hdr = sam_hdr_read(hts->fh_in); | |
if (!hts->bam_hdr) | |
{ | |
fprintf(stderr, "Error : HTS_INIT : coult not read BAM header %s\n", aux->file_in); | |
exit(EXIT_FAILURE); | |
} | |
/* read tabix BED mask */ | |
hts->tabix = regidx_init(aux->file_mask, NULL, NULL, 0, NULL); | |
if (!hts->tabix) | |
{ | |
fprintf(stderr, "Error :: failed to load polymask index!\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* initialize BAM reocrd */ | |
hts->bam = bam_init1(); | |
/* initialize CHROM hash of hashes */ | |
hts->pstrand = kh_init(pileup_h); | |
hts->nstrand = kh_init(pileup_h); | |
return; | |
} | |
void hts_parse(hts_config_t *hts, aux_config_t *aux) | |
{ | |
/* local counters */ | |
int passBaseCondition = 0; | |
uint32_t bam_lines = 0; | |
uint32_t polySize_tail = 0; | |
uint32_t polySize_read = 0; | |
uint32_t base = 0; | |
/* hash related variables */ | |
int absent; | |
uint64_t hkey = 0; | |
pileup_t *hvalue; | |
khiter_t hitr; | |
khash_t(pileup_h) *hstrand; | |
/* mask poly base variables */ | |
uint32_t polyBaseRegion_start = 0; | |
uint32_t polyBaseRegion_end = 0; | |
regitr_t mitr; | |
/* read BAM record by record */ | |
while (sam_read1(hts->fh_in, hts->bam_hdr, hts->bam) >= 0) | |
{ | |
/* count bam lines */ | |
bam_lines++; | |
/* filter based on MAPQ */ | |
if (aux->mapq > hts->bam->core.qual) | |
continue; | |
/* strand specific information */ | |
if (hts->bam->core.flag & BAM_FREVERSE) | |
{ | |
// set current hash pointer | |
hstrand = hts->nstrand; | |
// check if tail in read | |
polySize_tail = findPoly(hts->bam, 'T', POLY_HEAD); | |
// define last base | |
base = hts->bam->core.pos + 1; | |
} | |
else | |
{ | |
// set current hash pointer | |
hstrand = hts->pstrand; | |
// check if tail in read | |
polySize_tail = findPoly(hts->bam, 'A', POLY_TAIL); | |
// define last base | |
base = bam_calend(&hts->bam->core, bam_get_cigar(hts->bam)); | |
} | |
/* overall alignment length */ | |
polySize_read = bam_calqlen(&hts->bam->core, bam_get_cigar(hts->bam)); | |
/* define poly base condition */ | |
passBaseCondition = ((polySize_tail >= aux->polysize) && (polySize_read <= (hts->bam->core.l_qseq - aux->polysize))) ? 1 : 0; | |
/* create 64bit hash key from 32bit chrom and 32bit position */ | |
hkey = (uint64_t)hts->bam->core.tid << 32 | base; | |
/* check if key exist in hash */ | |
hitr = kh_put(pileup_h, hstrand, hkey, &absent); | |
if (absent) | |
{ | |
/* add new element */ | |
// allocate hash value | |
hvalue = (pileup_p)malloc(sizeof(pileup_t)); | |
if (!hvalue) | |
{ | |
fprintf(stderr,"Memory failed for new hash value!\n"); | |
exit(EXIT_FAILURE); | |
} | |
// populate hash value | |
hvalue->tid = hts->bam->core.tid; | |
hvalue->base = base; | |
hvalue->reads = 1; | |
hvalue->pass = (passBaseCondition) ? 1 : 0; | |
hvalue->poly = 0; | |
// check if base needs to be masked for internal priming | |
polyBaseRegion_start = (base > aux->masksize) ? (base - aux->masksize) : 0; | |
polyBaseRegion_end = base + aux->masksize; | |
if (regidx_overlap(hts->tabix, hts->bam_hdr->target_name[hts->bam->core.tid], polyBaseRegion_start, polyBaseRegion_end, &mitr)) | |
{ | |
hvalue->poly = 1; | |
} | |
// insert new value into hash | |
kh_value(hstrand, hitr) = hvalue; | |
} | |
else | |
{ | |
/* update old element */ | |
// retrieve hash value | |
hvalue = kh_value(hstrand, hitr); | |
// update read counter | |
hvalue->reads++; | |
// update poly counter | |
if (passBaseCondition) | |
hvalue->pass++; | |
} | |
} | |
return; | |
} | |
void hts_freehash(khash_t(pileup_h) **href) | |
{ | |
khash_t(pileup_h) *hash = *href; | |
khiter_t hitr; | |
pileup_p hvalue; | |
for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr) | |
{ | |
if (kh_exist(hash, hitr)) | |
{ | |
hvalue = kh_value(hash, hitr); | |
free(hvalue); | |
} | |
} | |
kh_destroy(pileup_h, hash); | |
return; | |
} | |
void hts_destroy(hts_config_t *hts) | |
{ | |
/* destroy hashes */ | |
hts_freehash(&hts->pstrand); | |
hts_freehash(&hts->nstrand); | |
bam_destroy1(hts->bam); | |
bam_hdr_destroy(hts->bam_hdr); | |
regidx_destroy(hts->tabix); | |
sam_close(hts->fh_in); | |
return; | |
} | |
void hts_dumphash(khash_t(pileup_h) *hash, bam_hdr_t *hdr, const char strand) | |
{ | |
khiter_t hitr; | |
pileup_p hvalue; | |
for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr) | |
{ | |
if (kh_exist(hash, hitr)) | |
{ | |
hvalue = kh_value(hash, hitr); | |
printf("%s\t%d\t%c\t%d\t%d\t%d\n", hdr->target_name[hvalue->tid], hvalue->base, strand, hvalue->reads, hvalue->pass, hvalue->poly); | |
} | |
} | |
return; | |
} | |
void hts_printout(hts_config_t *hts) | |
{ | |
// print out positive strand hash | |
hts_dumphash(hts->pstrand, hts->bam_hdr, '+'); | |
// print out negative strand hash | |
hts_dumphash(hts->nstrand, hts->bam_hdr, '-'); | |
} | |
/* dumpSeq | |
* return pointer to sequence in BAM file | |
*/ | |
void dumpBamSeq(bam1_t *bam) | |
{ | |
int i; | |
for (i = 0; i < bam->core.l_qseq; ++i) | |
{ | |
printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bam),i)]); | |
} | |
} | |
/* findPoly | |
* modified findTailPolyAMaybeMask | |
* https://github.com/jstjohn/KentLib/blob/master/lib/dnautil.c | |
* Kent et.al., 2002 | |
*/ | |
uint32_t findPoly(bam1_t *bam, int base, int side) | |
{ | |
int i = 0; | |
int currScore = 10; | |
int bestScore = 10; | |
int currPosition = 0; | |
int bestPosition = -1; | |
uint32_t polySize = 0; | |
int smallBase = tolower(base); | |
int bigBase = toupper(base); | |
char currBase = 'X'; | |
for (i = 0; i < bam->core.l_qseq; ++i) | |
{ | |
// define current position based on sequence side | |
currPosition = i; | |
if (side == POLY_TAIL) | |
currPosition = bam->core.l_qseq - i - 1; | |
// get current base from BAM | |
currBase = seq_nt16_str[bam_seqi(bam_get_seq(bam), currPosition)]; | |
// skip N match | |
if (currBase == 'N' || currBase == 'n') | |
continue; | |
// compare to query base | |
if (currBase == bigBase || currBase == smallBase) | |
{ | |
// add match score | |
currScore += 1; | |
// update best score | |
if (currScore >= (bestScore - 6)) | |
{ | |
bestScore = currScore; | |
bestPosition = currPosition; | |
} | |
} | |
else | |
{ | |
// add mismatch score | |
currScore -= 8; | |
} | |
// break if score is negative | |
if (currScore < 0) | |
break; | |
// cap if score is too big | |
if (currScore > 20) | |
currScore = 20; | |
} | |
/* assign result */ | |
if (bestPosition >= 0) | |
{ | |
polySize = bestPosition + 1; | |
if (side == POLY_TAIL) | |
polySize = bam->core.l_qseq - bestPosition; | |
//if (polySize < 0) | |
// polySize = 0; | |
} | |
return polySize; | |
} | |
/* bam_calqlen | |
* Calculate query alignment length | |
*/ | |
static inline uint32_t bam_calqlen(const bam1_core_t *core, const uint32_t *cigar) | |
{ | |
int i; | |
uint32_t qlen = 0; | |
for (i = 0; i < core->n_cigar; ++i) | |
{ | |
int span = cigar[i] >> 4; | |
int op = cigar[i] & 0xf; | |
if ( (op == BAM_CMATCH) || (op == BAM_CDEL) || (op == BAM_CREF_SKIP)) | |
{ | |
qlen += span; | |
} | |
} | |
return qlen; | |
} | |
/* bam_calend | |
* Calculate the rightmost coordinate of an alignment on the reference genome. | |
* bam.h https://github.com/samtools/samtools/search?utf8=%E2%9C%93&q=bam_calend | |
*/ | |
static inline uint32_t bam_calend(const bam1_core_t *core, const uint32_t *cigar) | |
{ | |
return core->pos + (core->n_cigar ? bam_cigar2rlen(core->n_cigar, cigar) : 1); | |
} | |