diff --git a/htsutils.c b/htsutils.c new file mode 100644 index 0000000..99460fe --- /dev/null +++ b/htsutils.c @@ -0,0 +1,146 @@ +// +// htsutils.c +// PASSFinder +// +// Created by Georgi Tushev on 07/09/16. +// Copyright © 2016 Scientific Computing. All rights reserved. +// + +#include "htsutils.h" + +int parseBAM(char *file_name) +{ + samFile *fh_in = sam_open(file_name, "r"); + bam_hdr_t *header = sam_hdr_read(fh_in); + bam1_t *bam = bam_init1(); + + int lines = 0; + int poly = 0; + int polyTailA = 0; + int polyTailT = 0; + int polyHeadA = 0; + int polyHeadT = 0; + //int polyHead = 0; + + while (sam_read1(fh_in, header, bam) >= 0) + { + lines++; + + + polyTailA = findPoly(bam, 'A', POLY_TAIL); + polyTailT = findPoly(bam, 'T', POLY_TAIL); + polyHeadA = findPoly(bam, 'A', POLY_HEAD); + polyHeadT = findPoly(bam, 'T', POLY_HEAD); + + //printf("Tail: %d|%d\tHead: %d|%d\tSeq:: ", polyTailA,polyTailT,polyHeadA,polyHeadT); + //dumpBamSeq(bam); + //printf("\n"); + + //if (lines == 10) + // break; + + } + + bam_destroy1(bam); + bam_hdr_destroy(header); + sam_close(fh_in); + + //printf("BAM Lines: %d / %d\n", poly, lines); + + return 0; +} + +/* dumpSeq + * return pointer to sequence in BAM file + */ +void dumpBamSeq(bam1_t *bam) +{ + int i; + + for (i = 0; i < bam->core.l_qseq; ++i) + { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bam),i)]); + } +} + + +/* findPoly + * modified findTailPolyAMaybeMask + * https://github.com/jstjohn/KentLib/blob/master/lib/dnautil.c + * Kent et.al., 2002 + */ +int findPoly(bam1_t *bam, int base, int side) +{ + int i = 0; + + int currScore = 10; + int bestScore = 10; + + int currPosition = 0; + int bestPosition = -1; + + int polySize = 0; + + int smallBase = tolower(base); + int bigBase = toupper(base); + + char currBase = 'X'; + + for (i = 0; i < bam->core.l_qseq; ++i) + { + // define current position based on sequence side + currPosition = i; + if (side == POLY_TAIL) + currPosition = bam->core.l_qseq - i - 1; + + // get current base from BAM + currBase = seq_nt16_str[bam_seqi(bam_get_seq(bam), currPosition)]; + + // skip N match + if (currBase == 'N' || currBase == 'n') + continue; + + // compare to query base + if (currBase == bigBase || currBase == smallBase) + { + // add match score + currScore += 1; + + // update best score + if (currScore >= (bestScore - 6)) + { + bestScore = currScore; + bestPosition = currPosition; + } + } + else + { + // add mismatch score + currScore -= 8; + } + + // break if score is negative + if (currScore < 0) + break; + + // cap if score is too big + if (currScore > 20) + currScore = 20; + + } + + /* assign result */ + if (bestPosition >= 0) + { + polySize = bestPosition + 1; + + if (side == POLY_TAIL) + polySize = bam->core.l_qseq - bestPosition; + + if (polySize < 0) + polySize = 0; + } + + return polySize; +} + diff --git a/htsutils.h b/htsutils.h new file mode 100644 index 0000000..ed85c36 --- /dev/null +++ b/htsutils.h @@ -0,0 +1,24 @@ +// +// htsutils.h +// PASSFinder +// +// Created by Georgi Tushev on 07/09/16. +// Copyright © 2016 Scientific Computing. All rights reserved. +// + +#ifndef htsutils_h +#define htsutils_h + +#include +#include +#include +#include + +#define POLY_HEAD 0 +#define POLY_TAIL 1 + +int parseBAM(char *); +int findPoly(bam1_t *, int, int); +void dumpBamSeq(bam1_t *); + +#endif /* htsutils_h */