From c7525ac25bc1c7e087f5dc730bfdeb6307ace11d Mon Sep 17 00:00:00 2001 From: Georgi Tushev Date: Thu, 13 Oct 2016 16:57:19 +0200 Subject: [PATCH] introduce uint64_t hash key concatenating chrome and base --- config.c | 40 +++------- config.h | 13 +--- htsutils.c | 221 ++++++++++++++++++++++++++++------------------------- htsutils.h | 32 ++++---- main.c | 12 +-- 5 files changed, 153 insertions(+), 165 deletions(-) diff --git a/config.c b/config.c index c830386..48c616b 100644 --- a/config.c +++ b/config.c @@ -14,13 +14,10 @@ const char PROGRAM_NAME[] = "PASSFinder"; void aux_init(aux_config_t *aux) { /* initialize default auxiliary values */ - aux->input = NULL; - aux->polymask = NULL; - aux->output = NULL; - aux->window = AUX_DEFAULT_WINDOW; - aux->depth = AUX_DEFAULT_DEPTH; + aux->file_in = NULL; + aux->file_mask = NULL; aux->polysize = AUX_DEFAULT_POLYSIZE; - aux->maskspan = AUX_DEFAULT_MASKSPAN; + aux->masksize = AUX_DEFAULT_MASKSIZE; aux->mapq = AUX_DEFAULT_MAPQ; return; @@ -50,46 +47,29 @@ int aux_parse(aux_config_t *aux, int argc, const char *argv[]) i += 1; aux->mapq = (uint32_t)strtoul(argv[i], NULL, 10); } - else if((PARAMETER_CHECK("-w", 2, parameterLength)) || (PARAMETER_CHECK("--window", 8, parameterLength))) - { - i += 1; - aux->window = (uint32_t)strtoul(argv[i], NULL, 10); - } - else if((PARAMETER_CHECK("-d", 2, parameterLength)) || - (PARAMETER_CHECK("--depth", 7, parameterLength))) - { - i += 1; - aux->depth = (uint32_t)strtoul(argv[i], NULL, 10); - } else if((PARAMETER_CHECK("-p", 2, parameterLength)) || (PARAMETER_CHECK("--polysize", 10, parameterLength))) { i += 1; aux->polysize = (uint32_t)strtoul(argv[i], NULL, 10); } - else if((PARAMETER_CHECK("-s", 2, parameterLength)) || - (PARAMETER_CHECK("--maskspan", 10, parameterLength))) + else if((PARAMETER_CHECK("-m", 2, parameterLength)) || + (PARAMETER_CHECK("--masksize", 10, parameterLength))) { i += 1; - aux->maskspan = (uint32_t)strtoul(argv[i], NULL, 10); + aux->masksize = (uint32_t)strtoul(argv[i], NULL, 10); } else if((PARAMETER_CHECK("-i", 2, parameterLength)) || (PARAMETER_CHECK("--input", 7, parameterLength))) { i += 1; - aux->input = argv[i]; - } - else if((PARAMETER_CHECK("-m", 2, parameterLength)) || - (PARAMETER_CHECK("--maskpoly", 10, parameterLength))) - { - i += 1; - aux->polymask = argv[i]; + aux->file_in = argv[i]; } - else if((PARAMETER_CHECK("-o", 2, parameterLength)) || - (PARAMETER_CHECK("--output", 8, parameterLength))) + else if((PARAMETER_CHECK("-r", 2, parameterLength)) || + (PARAMETER_CHECK("--regions", 10, parameterLength))) { i += 1; - aux->output = argv[i]; + aux->file_mask = argv[i]; } else { diff --git a/config.h b/config.h index 985c12d..d42977f 100644 --- a/config.h +++ b/config.h @@ -15,10 +15,8 @@ // define some constants #define MAX_LENGTH 1024 -#define AUX_DEFAULT_WINDOW 200 -#define AUX_DEFAULT_DEPTH 5 #define AUX_DEFAULT_POLYSIZE 3 -#define AUX_DEFAULT_MASKSPAN 3 +#define AUX_DEFAULT_MASKSIZE 3 #define AUX_DEFAULT_MAPQ 0 // define MIN & MAX macros @@ -32,12 +30,9 @@ // define parameters typedef struct _aux_config_t { - const char *input; - const char *polymask; - const char *output; - uint32_t window; - uint32_t depth; - uint32_t maskspan; + const char *file_in; + const char *file_mask; + uint32_t masksize; uint32_t polysize; uint32_t mapq; } aux_config_t; diff --git a/htsutils.c b/htsutils.c index f0dcca6..eb61e7a 100644 --- a/htsutils.c +++ b/htsutils.c @@ -8,62 +8,63 @@ #include "htsutils.h" -int hts_init(hts_config_t *hts, aux_config_t *aux) +void hts_init(hts_config_t *hts, aux_config_t *aux) { /* open input stream */ - hts->fh_in = sam_open(aux->input, "r"); + hts->fh_in = sam_open(aux->file_in, "r"); if (!hts->fh_in) { - fprintf(stderr, "Error : HTS_INIT : coult not open input stream from %s\n", aux->input); - return EXIT_FAILURE; + fprintf(stderr, "Error : HTS_INIT : coult not open input stream from %s\n", aux->file_in); + exit(EXIT_FAILURE); } /* parse BAM header */ hts->bam_hdr = sam_hdr_read(hts->fh_in); if (!hts->bam_hdr) { - fprintf(stderr, "Error : HTS_INIT : coult not read BAM header %s\n", aux->input); - return EXIT_FAILURE; + fprintf(stderr, "Error : HTS_INIT : coult not read BAM header %s\n", aux->file_in); + exit(EXIT_FAILURE); } /* read tabix BED mask */ - hts->tabix = regidx_init(aux->polymask, NULL, NULL, 0, NULL); + hts->tabix = regidx_init(aux->file_mask, NULL, NULL, 0, NULL); if (!hts->tabix) { fprintf(stderr, "Error :: failed to load polymask index!\n"); - return EXIT_FAILURE; + exit(EXIT_FAILURE); } /* initialize BAM reocrd */ hts->bam = bam_init1(); /* initialize CHROM hash of hashes */ - hts->hchrom = kh_init(pass_chrom); + hts->pstrand = kh_init(pileup_h); + hts->nstrand = kh_init(pileup_h); - return 0; + return; } -int hts_parse(hts_config_t *hts, aux_config_t *aux) +void hts_parse(hts_config_t *hts, aux_config_t *aux) { /* local counters */ + int passBaseCondition = 0; uint32_t bam_lines = 0; - uint32_t bam_poly = 0; uint32_t polySize_tail = 0; uint32_t polySize_read = 0; - uint32_t polyBase = 0; - uint32_t polyBase_maskedStart = 0; - uint32_t polyBase_maskedEnd = 0; + uint32_t base = 0; - /* hash of hashes vars */ + + /* hash related variables */ int absent; - khiter_t kchrom; - khiter_t kbase; - khash_t(pass_base) *hbase; - pass_pileup_p plp; + uint64_t hkey = 0; + pileup_t *hvalue; + khiter_t hitr; + khash_t(pileup_h) *hstrand; - /* BED mask vars */ - int polyBase_masked = 0; - regitr_t itr_mask; + /* mask poly base variables */ + uint32_t polyBaseRegion_start = 0; + uint32_t polyBaseRegion_end = 0; + regitr_t mitr; /* read BAM record by record */ while (sam_read1(hts->fh_in, hts->bam_hdr, hts->bam) >= 0) @@ -75,140 +76,150 @@ int hts_parse(hts_config_t *hts, aux_config_t *aux) if (aux->mapq > hts->bam->core.qual) continue; - /* get pass position based on strand */ + /* strand specific information */ if (hts->bam->core.flag & BAM_FREVERSE) { + // set current hash pointer + hstrand = hts->nstrand; + + // check if tail in read polySize_tail = findPoly(hts->bam, 'T', POLY_HEAD); - polyBase = hts->bam->core.pos; + + // define last base + base = hts->bam->core.pos; } else { + // set current hash pointer + hstrand = hts->pstrand; + + // check if tail in read polySize_tail = findPoly(hts->bam, 'A', POLY_TAIL); - polyBase = bam_calend(&hts->bam->core, bam_get_cigar(hts->bam)); + + // define last base + base = bam_calend(&hts->bam->core, bam_get_cigar(hts->bam)); } + /* overall alignment length */ polySize_read = bam_calqlen(&hts->bam->core, bam_get_cigar(hts->bam)); - /* check if polyBase is in poly mask */ - polyBase_masked = 0; - polyBase_maskedStart = (polyBase > aux->maskspan) ? (polyBase - aux->maskspan) : 0; - polyBase_maskedEnd = polyBase + aux->maskspan; - if (regidx_overlap(hts->tabix, hts->bam_hdr->target_name[hts->bam->core.tid], polyBase_maskedStart, polyBase_maskedEnd, &itr_mask)) - { - polyBase_masked = 1; - } + /* define poly base condition */ + passBaseCondition = ((polySize_tail >= aux->polysize) && (polySize_read <= (hts->bam->core.l_qseq - aux->polysize))) ? 1 : 0; - /* check PASS condition - * 1. Read contains poly tail :: polySize_tail >= props->polysize - * 2. Read poly tail is not aligned on genome :: polySize_read <= (bam->core.l_seq - props->polysize) - * 3. Read not aligned close to masked poly region :: polyBase_masked == 0 - */ - if ((polySize_tail >= aux->polysize) && (polySize_read <= (hts->bam->core.l_qseq - aux->polysize)) && (polyBase_masked == 0)) + /* create 64bit hash key from 32bit chrom and 32bit position */ + hkey = (uint64_t)hts->bam->core.tid << 32 | base; + + /* check if key exist in hash */ + hitr = kh_put(pileup_h, hstrand, hkey, &absent); + if (absent) { - bam_poly++; - - /* check if chrom is in hash */ - kchrom = kh_put(pass_chrom, hts->hchrom, hts->bam_hdr->target_name[hts->bam->core.tid], &absent); + /* add new element */ - if (absent) + // allocate hash value + hvalue = (pileup_p)malloc(sizeof(pileup_t)); + if (!hvalue) { - plp = (pass_pileup_p) malloc(sizeof(pass_pileup_t)); - plp->base_pos = kh_init(pass_base); - plp->base_neg = kh_init(pass_base); - plp->count_pos = 0; - plp->count_neg = 0; - kh_value(hts->hchrom, kchrom) = plp; + fprintf(stderr,"Memory failed for new hash value!\n"); + exit(EXIT_FAILURE); } - /* retrieve chrom pass value pointer */ - plp = kh_value(hts->hchrom, kchrom); + // populate hash value + hvalue->tid = hts->bam->core.tid; + hvalue->base = base; + hvalue->reads = 1; + hvalue->pass = (passBaseCondition) ? 1 : 0; + hvalue->poly = 0; - /* retrieve strand specific hash pointer */ - if (hts->bam->core.flag & BAM_FREVERSE) - { - hbase = plp->base_neg; - plp->count_neg++; - } - else + // check if base needs to be masked for internal priming + polyBaseRegion_start = (base > aux->masksize) ? (base - aux->masksize) : 0; + polyBaseRegion_end = base + aux->masksize; + if (regidx_overlap(hts->tabix, hts->bam_hdr->target_name[hts->bam->core.tid], polyBaseRegion_start, polyBaseRegion_end, &mitr)) { - hbase = plp->base_pos; - plp->count_pos++; + hvalue->poly = 1; } - /* add current polyBase to base hash */ - kbase = kh_put(pass_base, hbase, polyBase, &absent); - if (absent) - kh_value(hbase, kbase) = 1; - else - kh_value(hbase, kbase)++; + // insert new value into hash + kh_value(hstrand, hitr) = hvalue; + + } + else + { + /* update old element */ + + // retrieve hash value + hvalue = kh_value(hstrand, hitr); + + // update read counter + hvalue->reads++; + + // update poly counter + if (passBaseCondition) + hvalue->pass++; } - } - return 0; + return; } -int hts_destroy(hts_config_t *hts) +void hts_freehash(khash_t(pileup_h) **href) { - /* destroy hash of hashes */ - khiter_t k; - for (k = kh_begin(hts->hchrom); k != kh_end(hts->hchrom); ++k) + khash_t(pileup_h) *hash = *href; + khiter_t hitr; + pileup_p hvalue; + + for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr) { - if (kh_exist(hts->hchrom, k)) + if (kh_exist(hash, hitr)) { - pass_pileup_p plp = kh_val(hts->hchrom, k); - kh_destroy(pass_base, plp->base_pos); - kh_destroy(pass_base, plp->base_neg); - free(plp); + hvalue = kh_value(hash, hitr); + free(hvalue); } } - kh_destroy(pass_chrom, hts->hchrom); + kh_destroy(pileup_h, hash); + + return; +} + +void hts_destroy(hts_config_t *hts) +{ + /* destroy hashes */ + hts_freehash(&hts->pstrand); + hts_freehash(&hts->nstrand); bam_destroy1(hts->bam); bam_hdr_destroy(hts->bam_hdr); regidx_destroy(hts->tabix); sam_close(hts->fh_in); - return 0; + return; } -int hts_dumphash_chrom(hts_config_t *hts) +void hts_dumphash(khash_t(pileup_h) *hash, bam_hdr_t *hdr, const char strand) { - khiter_t k; + khiter_t hitr; + pileup_p hvalue; - for (k = kh_begin(hts->hchrom); k != kh_end(hts->hchrom); ++k) + for (hitr = kh_begin(hash); hitr != kh_end(hash); ++hitr) { - if (kh_exist(hts->hchrom, k)) + if (kh_exist(hash, hitr)) { - const char *chrom = kh_key(hts->hchrom, k); - pass_pileup_p plp = kh_val(hts->hchrom, k); - - hts_dumphash_base(plp->base_pos, chrom, '+'); - hts_dumphash_base(plp->base_neg, chrom, '-'); - + hvalue = kh_value(hash, hitr); + printf("%s\t%d\t%c\t%d\t%d\t%d\n", hdr->target_name[hvalue->tid], hvalue->base, strand, hvalue->reads, hvalue->pass, hvalue->poly); } } - - return 0; + return; } -int hts_dumphash_base(khash_t(pass_base) *hbase, const char *chrom, int strand) +void hts_printout(hts_config_t *hts) { - khiter_t k; + // print out positive strand hash + hts_dumphash(hts->pstrand, hts->bam_hdr, '+'); - for (k = kh_begin(hbase); k != kh_end(hbase); ++k) - { - if (kh_exist(hbase, k)) - { - uint32_t key = kh_key(hbase, k); - uint32_t val = kh_val(hbase, k); - printf("%s\t%d\t%d\t%c\n", chrom, key, val, strand); - } - } - return 0; + // print out negative strand hash + hts_dumphash(hts->nstrand, hts->bam_hdr, '-'); } diff --git a/htsutils.h b/htsutils.h index ec3bd6a..ff7262d 100644 --- a/htsutils.h +++ b/htsutils.h @@ -25,15 +25,16 @@ #define POLY_HEAD 0 #define POLY_TAIL 1 -KHASH_MAP_INIT_INT(pass_base, uint32_t); -typedef struct _pass_pileup +typedef struct _pileup_t { - uint32_t count_pos; - uint32_t count_neg; - khash_t(pass_base) *base_pos; - khash_t(pass_base) *base_neg; -} pass_pileup_t, *pass_pileup_p; -KHASH_MAP_INIT_STR(pass_chrom, pass_pileup_p); + uint32_t tid; + uint32_t base; + uint32_t reads; + uint32_t pass; + uint8_t poly; +} pileup_t, *pileup_p; + +KHASH_MAP_INIT_INT64(pileup_h, pileup_p); typedef struct _hts_config_t { @@ -41,15 +42,16 @@ typedef struct _hts_config_t bam_hdr_t *bam_hdr; regidx_t *tabix; bam1_t *bam; - khash_t(pass_chrom) *hchrom; + khash_t(pileup_h) *pstrand; + khash_t(pileup_h) *nstrand; } hts_config_t; -int hts_init(hts_config_t *, aux_config_t *); -int hts_parse(hts_config_t *, aux_config_t *); -int hts_dumphash_chrom(hts_config_t *); -int hts_dumphash_base(khash_t(pass_base) *, const char *, int); -int hts_destroy(hts_config_t *); - +void hts_init(hts_config_t *, aux_config_t *); +void hts_parse(hts_config_t *, aux_config_t *); +void hts_dumphash(khash_t(pileup_h) *, bam_hdr_t *, const char); +void hts_freehash(khash_t(pileup_h) **); +void hts_printout(hts_config_t *); +void hts_destroy(hts_config_t *); //int parseBAM(Settings *); void dumpBamSeq(bam1_t *); diff --git a/main.c b/main.c index a69fade..f10e9c1 100644 --- a/main.c +++ b/main.c @@ -19,7 +19,7 @@ int main(int argc, const char * argv[]) clock_t begin = clock(); - /* parse auxiliary configuration */ + // parse auxiliary configuration aux_init(&aux); if (aux_parse(&aux, argc, argv)) { @@ -27,16 +27,16 @@ int main(int argc, const char * argv[]) return EXIT_FAILURE; } - /* initialize with hts */ + // initialize with hts hts_init(&hts, &aux); - /* parse with hts */ + // parse with hts hts_parse(&hts, &aux); - /* dump hash */ - hts_dumphash_chrom(&hts); + // dump hash + hts_printout(&hts); - /* free */ + // free hts_destroy(&hts);