static hashtable* ReadReference(const char* const refname) { hashtable* reference = new_hashtable(12); sequence* sp = read_fasta_sequence(refname); while(sp != NULL){ // allocate a coverage array for the sequence chrcoverage* cov = ckallocz(sizeof(chrcoverage)); cov->length = strlen((char*)sp->sequence); cov->map = ckallocz(strlen((char*)sp->sequence)); cov->cov = ckallocz(strlen((char*)sp->sequence)); cov->seq = ckallocz(strlen((char*)sp->sequence)+1); memcpy(cov->seq, sp->sequence, cov->length); // if the name of the sequence has more than one tokens, just use the // first token in the name int i = 0; while((sp->header[i] != '\n') && (sp->header[i] != 0) && (sp->header[i] != '\t') && (sp->header[i] != 32)) i++; sp->header[i] = 0; add_hashtable(reference,(char*)sp->header,strlen((char*)sp->header),cov); sp = get_next_sequence(sp); } close_fasta_sequence(sp); return reference; }
uint calculate_cov_params(const char* const bam_name, const int32_t tid, const int32_t start, const int32_t stop) { bamFile fp = bam_open(bam_name, "r"); bam_index_t* fp_index = bam_index_load(bam_name); bam_plbuf_t *buf; covdata* cvdt = ckallocz(sizeof(covdata)); cvdt->tid = tid; cvdt->begin = start; cvdt->end = stop; cvdt->coverage = ckallocz((cvdt->end - cvdt->begin) * sizeof(uint32_t)); buf = bam_plbuf_init(pileup_func, cvdt); bam_fetch(fp, fp_index, tid, start, stop, buf, fetch_func); bam_plbuf_push(0, buf); bam_plbuf_destroy(buf); // calculate the mean coverage in the region of the putative deletion uint i, covsum; for(i = 0, covsum = 0; i < (cvdt->end - cvdt->begin); i++){ covsum += cvdt->coverage[i]; } uint avgcov = floor(covsum * 1.0/(cvdt->end - cvdt->begin)); ckfree(cvdt->coverage); ckfree(cvdt); bam_close(fp); bam_index_destroy(fp_index); return avgcov; }
static void find_best_band(char* const refseq, const uint zstart1, const uint end1, const uint anchor, char* const readseq, const uint zstart2, const uint end2, int* const plow, int* const pup) { uint numdiagonals = ((end1 - zstart1) - (klength - 1)) + ((end2 - zstart2) - (klength - 1)); forceassert(numdiagonals > numgaps); forceassert(end2 >= zstart2); if((end2 - zstart2) < klength){ *plow = numdiagonals - 1; *pup = numdiagonals - 1; return; } // read the seeds from the reference sequence uint* refptrs = ckallocz((end1 - zstart1) * sizeof(uint)); uint* refseeds = read_seeds(refseq, zstart1, end1, &refptrs); // read the seeds from the sequence of interest uint* readptrs = ckallocz((end2 - zstart2) * sizeof(uint)); uint* readseeds = read_seeds(readseq, zstart2, end2, &readptrs); int* diagonals = ckallocz(sizeof(int) * numdiagonals); int* bands = ckallocz(sizeof(int) * numdiagonals); bin_diagonals(diagonals,numdiagonals, refseeds,refptrs, readseq, zstart2, end2, readseeds,readptrs); bin_bands(bands, diagonals, numdiagonals); uint indx1, indx2; select_band(bands, numdiagonals, 0, numdiagonals, anchor - zstart1, &indx1, &indx2, end1 - zstart1); // printf("%u %u %u %u\n", indx1, end2,zstart2, klength); // printf("%u %u %u %u\n", indx2, end2,zstart2, klength); *plow = indx1 - ((end2 - zstart2) -klength + 1); *pup = indx2 - ((end2 - zstart2) -klength + 1); ckfree(refptrs); ckfree(refseeds); ckfree(readptrs); ckfree(readseeds); ckfree(diagonals); ckfree(bands); }
/* Insert a key/value pair into the hashtable and return the new bin.
 *
 * hash    the hashtable
 * name    the key; must not be NULL
 * length  number of characters of `name` that form the key
 * val     the value stored with the key
 *
 * The key is copied into a freshly allocated, NUL-terminated string. The new
 * bin is pushed at the head of its chain; no search for an existing entry
 * with the same key is made. `collisions` is incremented when the chain was
 * already occupied and `elcount` is incremented on every call. */
bin* add_hashtable(hashtable* const hash,
                   const char* const name,
                   const int length,
                   void* val)
{
    pre(name != NULL);

    /* slot of the key inside the bin array */
    const uint32_t slot = hashfunc(name, length) & hash->mask;

    /* private copy of the key */
    char* key = ckalloc(length + 1);
    memcpy(key, name, length);
    key[length] = '\0';

    /* the new chain node */
    bin* node = ckallocz(sizeof(struct bin_st));
    node->name = key;
    node->val  = val;

    /* link it in front of whatever already lives in the slot */
    bin* head = hash->bins[slot];
    if(head){
        hash->collisions++;
    }
    node->next = head;
    hash->bins[slot] = node;
    hash->elcount++;

    post(node != NULL);
    return node;
}
/* Summarise mapping-quality statistics over a region of an open BAM file.
 *
 * pfp       pointer to the open BAM handle
 * fp_index  index of that BAM file
 * tid       reference id handed to bam_fetch()
 * start     region start handed to bam_fetch()
 * stop      region end handed to bam_fetch()
 * mqrms     out: sqrt(mqsqsum / numcov) of the accumulator, truncated to uint
 *           (presumably the root-mean-square mapping quality); 0 when
 *           numcov is 0
 * mq0       out: numcov0 of the accumulator
 *
 * A zero-allocated mapqcov accumulator is filled by the calcrms callback.
 * Note that the address of the accumulator pointer (&mqds), not the
 * accumulator itself, is what calcrms receives as its user data. */
void calculate_mq_params(bamFile* const pfp,
                         bam_index_t* const fp_index,
                         const int32_t tid,
                         const int32_t start,
                         const int32_t stop,
                         uint* const mqrms,
                         uint* const mq0)
{
    bamFile fp = *pfp;

    mapqcov* mqds = ckallocz(sizeof(mapqcov));
    bam_fetch(fp, fp_index, tid, start, stop, &mqds, calcrms);

    if(mqds->numcov == 0){
        // nothing was counted: 0/0 would give NaN, and converting NaN to an
        // unsigned integer is undefined
        *mqrms = 0;
    }else{
        *mqrms = sqrt(mqds->mqsqsum*1.0/mqds->numcov);
    }
    *mq0 = mqds->numcov0;

    ckfree(mqds);
}
/* Allocate a new, empty hash table with 2^po2size bins.
 *
 * po2size  base-2 logarithm of the number of bins. Values above 24 are
 *          reduced to 24 and negative values are raised to 0; a message is
 *          written to stderr in both cases. A negative value must never
 *          reach the shift below, where it would be undefined.
 *
 * The returned table has `size` = 2^po2size, `mask` = size - 1 and a
 * ckallocz'ed array of `size` bin pointers. */
hashtable* new_hashtable(const int po2size)
{
    int exponent = po2size;
    if(po2size > 24){
        fprintf(stderr,"hash po2size should not exceed 24, using 24\n");
        exponent = 24;
    }else if(po2size < 0){
        fprintf(stderr,"hash po2size should not be negative, using 0\n");
        exponent = 0;
    }

    hashtable* table = ckallocz(sizeof(struct hashtable_st));
    table->po2size = exponent;
    table->size = (1 << exponent);
    table->mask = table->size - 1;
    table->bins = ckallocz(table->size*sizeof(struct bin_st*));

    return table;
}
SEQ *seq_copy(const SEQ *s) { SEQ *ss = ckallocz(sizeof(SEQ)); *ss = *s; ss->seq = (uchar*)copy_string((const char*)s->seq); ss->header = copy_string(s->header); ss->fname = copy_string(s->fname); ss->maskname = copy_string(s->fname); ss->fp = 0; /* XXX - no subsequent seq_read operations allowed */ ss->offset = 0; /* XXX - no subsequent seq_read operations allowed */ return ss; }
// attempt diagonal alignment of refseq:zstart1-end1 to readseq:zstart2-end2 static void attempt_band_alignment(char* const refseq, const uint zstart1, const uint end1, char* const readseq, const uint zstart2, const uint end2, const int low, const int up, int* const pr1, int* const pr2, int* const pq1, int* const pq2, int* const pnumcigarops, uint32_t** const pcigarstring) { if(low > up) fprintf(stderr, "low: %d up: %d\n", low, up); forceassert(low <= up); int* S = ckallocz(((end1 - zstart1) + (end2 - zstart2))*sizeof(int)); int score = local_align(readseq+zstart2, end2 - zstart2, refseq + zstart1, end1 - zstart1, low, up, pq1, pr1, pq2, pr2, S); if(score <= 0){ *pr1 = 0; *pr2 = 0; *pq1 = 0; *pq2 = 0; ckfree(S); return; } fetch_cigar(readseq + zstart2 + *pq1 - 2, refseq + zstart1 + *pr1 - 2, *pq2 - *pq1 + 1, *pr2 - *pr1 + 1, S, *pq1, *pr1, end2 - zstart2, pnumcigarops, pcigarstring); if(TRUE == debug_flag){ fprintf(debug_file, "Aligned read:%d-%d to reference:%d-%d\n", *pq1+zstart2-1, *pq2+zstart2, *pr1+zstart1-1, *pr2+zstart1); print_alignments(refseq+zstart1, *pr1, *pr2, readseq+zstart2, *pq1, *pq2, end2 - zstart2, S); } *pr1 = *pr1 + zstart1-1; *pr2 = *pr2 + zstart1; *pq1 = *pq1 + zstart2 - 1; *pq2 = *pq2 + zstart2; ckfree(S); }
SEQ* seq_open_type(const char *fname, int type) { SEQ *s = ckallocz(sizeof(SEQ)); int r, flags = 0; r = parse_fname(fname, &(s->fname), &(s->from), &(s->slen), &(s->maskname)); if (r == -1) fatalf("improper positions specification: %s", fname); s->type = type; s->flags = check_flags(r|flags); s->fp = ckopen(s->fname, "rb"); s->count = 0; s->offset = 0; return s; }
/* Mark the uniquely mappable positions of the reference sequences.
 *
 * refName    unused by this function
 * mapName    mappability file (format below)
 * reference  hashtable of chrcoverage records keyed by sequence name; every
 *            record named in the file must be present in it
 *
 * The format of the input file includes a description of the encoding used,
 * followed by a fasta-like format:
 *
 *   ~~ENCODING
 *   ' '~[0-0]
 *   '!'~[1-1]
 *   '"'~[2-2]
 *   ...
 *   '&'~[6-7]
 *   ...
 *   ~chr17
 *   !!!!!!!!!!!!!!!!!!!!!!!!!""!!!!!!!!!!%%*&&00,00/66/140.-,,
 *   ,/04:41237?CDDDD??>;7*$$!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *   !!!!!%$$#!!"#$&%"#.7>:><9898899999:.3---1111"!!!!##!!
 *   ...
 *
 * The character that the encoding section associates with [1-1] becomes the
 * sentinel. For every position of a record whose character equals the
 * sentinel, map[position] of the matching chrcoverage is set to '1'. A
 * summary of the number of marked positions is printed to stderr.
 *
 * Map data that appears before any sequence record is ignored, and
 * positions beyond the length of the reference sequence are never written
 * (a warning is printed once per record). */
static void CreateRefMap(const char* const refName,
                         const char* const mapName,
                         hashtable* const reference)
{
    FILE* fp = ckopen(mapName, "r");

    size_t n = 1;
    char* line = ckallocz(n*sizeof(char));

    bool encoding_flag = FALSE;
    bool map_flag = FALSE;
    char sentinel = '~';

    chrcoverage* cc = NULL;     // record currently being filled, if any
    size_t index = 0;           // next position within that record
    u64 num_mappable = 0, num_total = 0;
    signed long ret;

    while ((ret = getline(&line, &n, fp)) != -1) {
        if (line[0] == '~') {
            // beginning of a subsection
            if (strncmp(line, "~~ENCODING", 10) == 0) {
                encoding_flag = TRUE;
            } else if (map_flag == TRUE) {
                // if the name has more than one token, then I only use the
                // first token
                int i = 0;
                while((line[i] != '\n') && (line[i] != 0) &&
                      (line[i] != '\t') && (line[i] != 32)) i++;
                line[i] = 0;

                cc = (chrcoverage*)must_find_hashtable(reference,
                                                       line+1,
                                                       strlen(line+1));
                index = 0;
            }
            continue;
        }

        if (encoding_flag == TRUE) {
            // an encoding line looks like  'c'~[lo-hi] ; only inspect
            // line+3 when the line is long enough to have that position
            if ((ret > 3) && (strncmp(line+3, "~[1-1]",6) == 0)) {
                sentinel = line[1];
                encoding_flag = FALSE;
                map_flag = TRUE;
            }
            continue;
        }

        // map data without a preceding sequence record has no destination
        if (cc == NULL) continue;

        for (signed long j = 0; j < ret; j++) {
            if (line[j] == '\n') continue;

            if (index < (size_t)cc->length) {
                if (line[j] == sentinel) {
                    cc->map[index] = '1';
                    num_mappable += 1;
                }
            } else if (index == (size_t)cc->length) {
                // reached exactly once per record: the first excess position
                fprintf(stderr, "warning: mappability record is longer than its reference sequence, ignoring the excess\n");
            }

            index += 1;
            num_total += 1;
        }
    }

    ckfree(line);
    fclose(fp);

    fprintf(stderr, "%"PRIu64" (%2.2f%%) mappable bases in the genome.\n",
            num_mappable,
            (num_total > 0) ? (num_mappable * 100.0 / num_total) : 0.0);
}
// left2 left1 position right1 right2 // | | | | | // V V V V V // ---------------------------------------------------------------------- static evidence* attempt_diagonal_alignments(readaln* const rln, char* const refseq, const int32_t left1, const int32_t right1, const int32_t left2, const int32_t right2, const int32_t anchor, char* const readseq) { forceassert(anchor >= left1); forceassert(anchor >= left2); forceassert(anchor <= right1); forceassert(anchor <= right2); forceassert(left2 >= 0); forceassert(right2 > 0); uint readlength = strlen(readseq); int low1, up1; find_best_band(refseq, left1, right1, anchor, readseq, 0, readlength, &low1, &up1); int numcigarops1; int r1,r2,q1,q2; uint32_t* cigarstring1 = ckallocz(sizeof(uint32_t)); attempt_band_alignment(refseq, left1, right1, readseq, 0, readlength, low1, up1, &r1, &r2, &q1, &q2, &numcigarops1, &cigarstring1); if(q1 == q2){ // the read did not align ckfree(cigarstring1); return NULL; } // this is over if I was able to align the whole read here if((q1 == 0) && (q2 == (int)readlength)){ update_readsegs(rln, r1, cigarstring1, numcigarops1, readlength, 0, -1, NULL, 0); ckfree(cigarstring1); return add_evidence_from_segment(rln, NULL); } // identify the first non-match base in the alignment on the read int i, j; for(i = 0, j = 0; i < numcigarops1; i++){ int op = (cigarstring1[i] & BAM_CIGAR_MASK); if((i == 0) && (op == BAM_CSOFT_CLIP)) continue; if(op != BAM_CPMATCH) break; j += (cigarstring1[i] >> BAM_CIGAR_SHIFT); } uint f_nonmatch = j; for(i = numcigarops1 - 1, j = 0; i >= 0; i--){ int op = (cigarstring1[i] & BAM_CIGAR_MASK); if((i == (numcigarops1 - 1)) && (op == BAM_CSOFT_CLIP)) continue; if(op != BAM_CPMATCH) break; j += (cigarstring1[i] >> BAM_CIGAR_SHIFT); } uint l_nonmatch = j; int low2, up2; int numcigarops2; int r3,r4,q3,q4; uint32_t* cigarstring2 = ckallocz(sizeof(uint32_t)); if(r1 > anchor){ if(q1 == 0){ forceassert(readlength > f_nonmatch); if(((readlength - f_nonmatch) < ethreshold) || ((right2 - 
r1 - f_nonmatch) < ethreshold)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } find_best_band(refseq, r1 + f_nonmatch, right2, r1, readseq, f_nonmatch, readlength, &low2, &up2); attempt_band_alignment(refseq, r1 + f_nonmatch, right2, readseq, f_nonmatch, readlength, low2, up2, &r3, &r4, &q3, &q4, &numcigarops2, &cigarstring2); if((q4 != (int)readlength) || (q3 == q4)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } add_prefix_soft_clip(f_nonmatch,&numcigarops2, &cigarstring2); }else if(q2 == (int)readlength){ forceassert(readlength > l_nonmatch); if(((readlength - l_nonmatch) < ethreshold) || ((r2 - l_nonmatch - anchor) < ethreshold)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } find_best_band(refseq, anchor, r2 - l_nonmatch, r2, readseq, 0, readlength - l_nonmatch, &low2, &up2); attempt_band_alignment(refseq, anchor, r2 - l_nonmatch, readseq, 0, readlength - l_nonmatch, low2, up2, &r3, &r4, &q3, &q4, &numcigarops2, &cigarstring2); if((q3 != 0) || (q3 == q4)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } add_suffix_soft_clip(l_nonmatch,&numcigarops2, &cigarstring2); }else{ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } }else if(r1 < anchor){ if(r2 >= anchor){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } if(q1 == 0){ forceassert(readlength > f_nonmatch); if(((readlength - f_nonmatch) < ethreshold) || ((anchor - r1 - f_nonmatch) < ethreshold)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } find_best_band(refseq, r1 + f_nonmatch, anchor, r1, readseq, f_nonmatch, readlength, &low2, &up2); attempt_band_alignment(refseq, r1 + f_nonmatch, anchor, readseq, f_nonmatch, readlength, low2, up2, &r3, &r4, &q3, &q4, &numcigarops2, &cigarstring2); if((q4 != (int)readlength) || (q3 == q4)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } add_prefix_soft_clip(f_nonmatch,&numcigarops2, &cigarstring2); }else if(q2 == (int)readlength){ forceassert(readlength > l_nonmatch); 
if(((readlength - l_nonmatch) < ethreshold) || ((r2 - l_nonmatch - left2) < ethreshold)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } find_best_band(refseq, left2, r2 - l_nonmatch, r2, readseq, 0 , readlength - l_nonmatch, &low2, &up2); attempt_band_alignment(refseq, left2, r2 - l_nonmatch, readseq, 0 , readlength - l_nonmatch, low2, up2, &r3, &r4, &q3, &q4, &numcigarops2, &cigarstring2); if((q3 != 0) || (q3 == q4)){ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } add_suffix_soft_clip(l_nonmatch,&numcigarops2, &cigarstring2); }else{ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } }else{ // is this the place to handle insertions? ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } // now find the best alignment out of the possible ones forceassert(q1 < q2); forceassert(q3 < q4); int index = -1; if((q1 > q3) && (q1 <= q4)){ // printf("%s : %d %d %d %d\n", rln->qname, q1, q2, q3, q4); index = find_best_del_candidate(q3,q4,cigarstring2,numcigarops2, q1,q2,cigarstring1,numcigarops1, readlength); update_readsegs(rln, r3, cigarstring2, numcigarops2, index, q1, r1, cigarstring1, numcigarops1); }else if((q3 > q1) && (q3 <= q2)){ // printf("%s: %d %d %d %d\n", rln->qname, q1, q2, q3, q4); index = find_best_del_candidate(q1,q2,cigarstring1,numcigarops1, q3,q4,cigarstring2,numcigarops2, readlength); update_readsegs(rln, r1, cigarstring1, numcigarops1, index, q3, r3, cigarstring2, numcigarops2); }else if((q1 > q4) && (r1 == r4)){ update_readsegs(rln, r3, cigarstring2, numcigarops2, q4, q1, r1, cigarstring1, numcigarops1); }else if((q3 > q2) && (r2 == r3)){ update_readsegs(rln, r1, cigarstring1, numcigarops1, q2, q3, r3, cigarstring2, numcigarops2); }else{ ckfree(cigarstring1); ckfree(cigarstring2); return NULL; } ckfree(cigarstring1); ckfree(cigarstring2); return add_evidence_from_segment(rln, NULL); }