Exemplo n.º 1
0
/*
 * Read every sequence of the fasta file 'refname' into a new hashtable.
 *
 * The key of each entry is the first token of the sequence header (the header
 * is cut at the first newline, tab or space). The value is a newly allocated
 * chrcoverage whose 'length' is the number of bases, whose 'seq' is a copy of
 * the bases, and whose 'map' and 'cov' buffers are 'length' bytes each.
 *
 * Returns the hashtable; the caller owns it and the chrcoverage values.
 */
static hashtable* ReadReference(const char* const refname)
{
    hashtable* reference = new_hashtable(12);

    sequence* sp = read_fasta_sequence(refname);

    while(sp != NULL){
        // measure the sequence once: a reference sequence can be very long and
        // every strlen() is a full pass over it
        const size_t seqlen = strlen((char*)sp->sequence);

        // allocate the per-sequence coverage record and its buffers
        chrcoverage* cov = ckallocz(sizeof(chrcoverage));
        cov->length = seqlen;
        cov->map = ckallocz(seqlen);
        cov->cov = ckallocz(seqlen);
        // one extra byte is left after the copied bases; presumably ckallocz
        // zero-fills, which makes it the string terminator -- TODO confirm
        cov->seq = ckallocz(seqlen + 1);
        memcpy(cov->seq, sp->sequence, seqlen);

        // if the name of the sequence has more than one token, just use the
        // first token in the name
        int i = 0;
        while((sp->header[i] != '\n') &&
              (sp->header[i] != 0)    &&
              (sp->header[i] != '\t') &&
              (sp->header[i] != ' ')){
            i++;
        }
        sp->header[i] = 0;

        // no terminator occurs before position i, so i is the key length
        add_hashtable(reference, (char*)sp->header, i, cov);
        sp = get_next_sequence(sp);
    }

    // NOTE(review): the loop only exits once sp is NULL, so this call always
    // receives NULL and the reader opened by read_fasta_sequence() is never
    // handed back. If get_next_sequence() returns its argument until the end
    // of the input, the reader should be kept in a separate variable and
    // closed here -- confirm against the sequence API before changing.
    close_fasta_sequence(sp);
    return reference;
}
Exemplo n.º 2
0
uint calculate_cov_params(const char* const bam_name,
                          const int32_t tid,
                          const int32_t start,
                          const int32_t stop)
{
    bamFile fp = bam_open(bam_name, "r");
    bam_index_t* fp_index = bam_index_load(bam_name);

    bam_plbuf_t *buf;

    covdata* cvdt = ckallocz(sizeof(covdata));
    cvdt->tid = tid;
    cvdt->begin = start;
    cvdt->end   = stop;
    cvdt->coverage = ckallocz((cvdt->end - cvdt->begin) * sizeof(uint32_t));
    
    buf = bam_plbuf_init(pileup_func, cvdt);
    bam_fetch(fp, fp_index, tid, start, stop, buf, fetch_func);
    bam_plbuf_push(0, buf);
    bam_plbuf_destroy(buf);  

    // calculate the mean coverage in the region of the putative deletion
    uint i, covsum;
    for(i = 0, covsum = 0; i < (cvdt->end - cvdt->begin); i++){
        covsum += cvdt->coverage[i];
    }
  
    uint avgcov = floor(covsum * 1.0/(cvdt->end - cvdt->begin));
    ckfree(cvdt->coverage);
    ckfree(cvdt);

    bam_close(fp);   
    bam_index_destroy(fp_index);
    return avgcov;
}
Exemplo n.º 3
0
static void find_best_band(char* const refseq,
                           const uint zstart1,
                           const uint end1,
                           const uint anchor,
                           char* const readseq,
                           const uint zstart2,
                           const uint end2,
                           int* const plow,
                           int* const pup)
{
    uint numdiagonals = ((end1 - zstart1) - (klength - 1))
                      + ((end2 - zstart2) - (klength - 1));
    forceassert(numdiagonals > numgaps);

    forceassert(end2 >= zstart2);
    if((end2 - zstart2) < klength){
        *plow = numdiagonals - 1;
        *pup  = numdiagonals - 1;
        return;
    }

    // read the seeds from the reference sequence
    uint* refptrs  = ckallocz((end1 - zstart1) * sizeof(uint));
    uint* refseeds = read_seeds(refseq, zstart1, end1, &refptrs);

    // read the seeds from the sequence of interest
    uint* readptrs  = ckallocz((end2 - zstart2) * sizeof(uint));
    uint* readseeds = read_seeds(readseq, zstart2, end2, &readptrs);

    int* diagonals = ckallocz(sizeof(int) * numdiagonals);
    int* bands = ckallocz(sizeof(int) * numdiagonals);

    bin_diagonals(diagonals,numdiagonals,
                  refseeds,refptrs,
                  readseq, zstart2, end2, readseeds,readptrs);
    bin_bands(bands, diagonals, numdiagonals);

    uint indx1, indx2;
    select_band(bands, numdiagonals, 0, numdiagonals, anchor - zstart1, 
                &indx1, &indx2, end1 - zstart1);

    
//    printf("%u %u %u %u\n", indx1, end2,zstart2, klength);
//    printf("%u %u %u %u\n", indx2, end2,zstart2, klength);

    *plow = indx1 - ((end2 - zstart2) -klength + 1);
    *pup  = indx2 - ((end2 - zstart2) -klength + 1);
    
    ckfree(refptrs);
    ckfree(refseeds);
    ckfree(readptrs);
    ckfree(readseeds);
    ckfree(diagonals);
    ckfree(bands);
}
Exemplo n.º 4
0
/* Insert the first 'length' characters of 'name' into the hashtable with the
 * value 'val', and return the newly created bin. The key is copied, so the
 * caller keeps ownership of 'name'. The new bin is pushed at the head of its
 * chain; no check is made for an existing bin with the same key. */
bin* add_hashtable(hashtable* const hash,  /*the hashtable*/
				   const char* const name, /*the string*/
				   const int length, 	   /*use 'length' characters of name*/
				   void* val)				/*the value*/
{
    pre(name != NULL);

	/* which chain does this key belong to? */
	uint32_t slot = hashfunc(name, length);
	slot &= hash->mask;

	/* private, terminated copy of the key */
	char* key = ckalloc(length+1);
	memcpy(key, name, length);
	key[length] = '\0';

	bin* bin = ckallocz(sizeof(struct bin_st));
	bin->name = key;
	bin->val  = val;

	/* link in front of whatever the chain already holds; a non-empty chain
	 * counts as a collision */
	bin->next = hash->bins[slot];
	if(bin->next){
		hash->collisions++;
	}
	hash->bins[slot] = bin;
	hash->elcount++;

    post(bin != NULL);
	return bin;
}
Exemplo n.º 5
0
/*
 * Summarise mapping qualities over [start, stop] on reference 'tid'.
 *
 * bam_fetch() is run over the region with the callback calcrms (defined
 * elsewhere), which receives the address of the accumulator pointer.
 * Afterwards:
 *   *mqrms = sqrt(mqsqsum / numcov), truncated to an unsigned integer, or 0
 *            when numcov is 0
 *   *mq0   = numcov0
 * Presumably mqsqsum is the sum of squared mapping qualities, numcov the
 * number of reads counted and numcov0 the number with mapping quality zero --
 * TODO confirm against calcrms.
 */
void calculate_mq_params(bamFile* const pfp,
                         bam_index_t* const fp_index,
                         const int32_t tid,
                         const int32_t start,
                         const int32_t stop,
                         uint* const mqrms,
                         uint* const mq0)
{
    bamFile fp = *pfp;

    mapqcov* mqds = ckallocz(sizeof(mapqcov));

    // note: the callback is given &mqds (pointer to the pointer), exactly as
    // the original code did
    bam_fetch(fp,
              fp_index,
              tid, start, stop,
              &mqds,
              calcrms);

    if(mqds->numcov == 0){
        // nothing was counted: 0/0 would yield NaN, and converting NaN to an
        // unsigned integer is undefined behaviour
        *mqrms = 0;
    }else{
        *mqrms = sqrt(mqds->mqsqsum*1.0/mqds->numcov);
    }
    *mq0 = mqds->numcov0;

    ckfree(mqds);
}
Exemplo n.º 6
0
/* Allocate a new hash table with 2^po2size bins.
 *
 * po2size is clamped to the range [0, 24]; a warning is printed to stderr
 * whenever the requested value is adjusted. All bins start out empty as long
 * as ckallocz zero-fills (presumed from its name -- TODO confirm). */
hashtable* new_hashtable(const int po2size)
{
	int max_size = po2size;
	if(po2size > 24){
		fprintf(stderr,"hash po2size should not exceed 24, using 24\n");
		max_size = 24;
	}else if(po2size < 0){
		/* shifting by a negative count is undefined behaviour */
		fprintf(stderr,"hash po2size should not be negative, using 0\n");
		max_size = 0;
	}

	hashtable* table = ckallocz(sizeof(struct hashtable_st));
	table->po2size = max_size;
	table->size = (1 << table->po2size);
	/* size is a power of two, so size - 1 masks a hash value into range */
	table->mask = table->size - 1;

	table->bins  = ckallocz(table->size*sizeof(struct bin_st*));

	return table;
}
Exemplo n.º 7
0
Arquivo: seq.c Projeto: cestmoi7/AGAPE
/*
 * Return a newly allocated copy of *s.
 *
 * All scalar fields are copied by value; seq, header, fname and maskname are
 * duplicated so the copy does not share string storage with the original.
 * The copy is detached from the input file (fp and offset are cleared), so it
 * must not be used for further seq_read operations.
 */
SEQ *seq_copy(const SEQ *s)
{
	SEQ *ss = ckallocz(sizeof(SEQ));
	*ss = *s;
	ss->seq = (uchar*)copy_string((const char*)s->seq);
	ss->header = copy_string(s->header);
	ss->fname = copy_string(s->fname);
	/* duplicate the mask name itself; the original copied s->fname here,
	 * which looks like a copy-paste slip. A sequence without a mask may have
	 * no mask name, so do not hand a null pointer to copy_string. */
	ss->maskname = s->maskname ? copy_string(s->maskname) : NULL;
	ss->fp = 0; /* XXX - no subsequent seq_read operations allowed */
	ss->offset = 0; /* XXX - no subsequent seq_read operations allowed */
	return ss;
}
Exemplo n.º 8
0
// Banded local alignment of readseq[zstart2, end2) to refseq[zstart1, end1),
// restricted to the diagonals low..up (low must not exceed up).
//
// On a positive score the aligned interval is returned through pr1/pr2
// (reference) and pq1/pq2 (read), converted from the interval-relative values
// produced by local_align() to positions in the full sequences, and the cigar
// for the alignment is returned through pnumcigarops/pcigarstring.
// Otherwise all four positions are set to 0 and the cigar outputs are left
// untouched.
static void attempt_band_alignment(char* const refseq,
                                   const uint zstart1,
                                   const uint end1,
                                   char* const readseq,
                                   const uint zstart2,
                                   const uint end2,
                                   const int low,
                                   const int up,
                                   int* const pr1,
                                   int* const pr2,
                                   int* const pq1,
                                   int* const pq2,
                                   int* const pnumcigarops,
                                   uint32_t** const pcigarstring)
{
    const uint reflen  = end1 - zstart1;
    const uint readlen = end2 - zstart2;
    char* const ref  = refseq + zstart1;
    char* const read = readseq + zstart2;

    if(low > up){
        // report the offending band before the assertion fires
        fprintf(stderr, "low: %d up: %d\n", low, up);
    }
    forceassert(low <= up);

    // scratch space handed to the aligner
    int* S = ckallocz((reflen + readlen)*sizeof(int));
    const int score = local_align(read, readlen,
                                  ref, reflen,
                                  low, up, pq1, pr1, pq2, pr2, S);

    if(score > 0){
        fetch_cigar(read + *pq1 - 2, ref + *pr1 - 2,
                    *pq2 - *pq1 + 1, *pr2 - *pr1 + 1, S,
                    *pq1, *pr1, readlen, pnumcigarops, pcigarstring);

        if(debug_flag == TRUE){
            fprintf(debug_file, "Aligned read:%d-%d to reference:%d-%d\n", *pq1+zstart2-1, *pq2+zstart2, *pr1+zstart1-1, *pr2+zstart1);
            print_alignments(ref, *pr1, *pr2,
                             read, *pq1, *pq2, readlen, S);
        }

        // translate to positions in the full sequences; the start of each
        // interval is additionally moved back by one
        *pr1 += zstart1 - 1;
        *pr2 += zstart1;
        *pq1 += zstart2 - 1;
        *pq2 += zstart2;
    }else{
        // no alignment found
        *pr1 = 0;
        *pr2 = 0;
        *pq1 = 0;
        *pq2 = 0;
    }

    ckfree(S);
}
Exemplo n.º 9
0
Arquivo: seq.c Projeto: cestmoi7/AGAPE
/*
 * Create a SEQ for the file specification 'fname' and open the file.
 *
 * parse_fname() splits the specification into the file name, the 'from' and
 * 'slen' positions and the mask name; the program terminates through fatalf()
 * if it reports -1. The file is opened in binary read mode and the read
 * counters start at zero. 'type' is stored unchanged.
 */
SEQ* seq_open_type(const char *fname, int type)
{
	SEQ *s = ckallocz(sizeof(SEQ));

	const int r = parse_fname(fname,
		&s->fname, &s->from, &s->slen, &s->maskname);
	if (r == -1)
		fatalf("improper positions specification: %s", fname);

	s->type = type;
	/* no additional flags are OR-ed in; the parse result is used as is */
	s->flags = check_flags(r);
	s->fp = ckopen(s->fname, "rb");
	s->count = 0;
	s->offset = 0;
	return s;
}
Exemplo n.º 10
0
static void CreateRefMap(const char* const refName,
                         const char* const mapName,
                         hashtable* const reference)
{
    /* Mark the uniquely mappable positions of the reference.

        For every position of the map file whose character equals the
        sentinel (the character that encodes the range [1-1]), the matching
        entry of the sequence's chrcoverage 'map' array in 'reference' is set
        to '1'. A summary line is printed to stderr at the end.

        refName is not used.

        The format of the input file includes description of the encoding used,
        followed by fasta like format.
        ~~ENCODING
        ' '~[0-0]
        '!'~[1-1]
        '"'~[2-2]
         ...
        '&'~[6-7]
         ...
        ~chr17
        !!!!!!!!!!!!!!!!!!!!!!!!!""!!!!!!!!!!%%*&&00,00/66/140.-,,
        ,/04:41237?CDDDD??>;7*$$!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        !!!!!%$$#!!"#$&%"#.7>:><9898899999:.3---1111"!!!!##!!
        ...
    */
    (void)refName;

    FILE* fp = ckopen(mapName, "r");
    size_t n = 1;
    char* line = ckallocz(n*sizeof(char));
    bool encoding_flag = FALSE;   // inside the ~~ENCODING section
    bool map_flag = FALSE;        // the sentinel is known; map data may follow
    bool warned = FALSE;          // the out-of-range warning was printed
    char sentinel = '~';

    chrcoverage* cc = NULL;       // sequence currently being filled
    int index = 0;                // next position within that sequence
    u64 num_mappable = 0, num_total = 0;
    signed long ret;

    while ((ret = getline(&line, &n, fp)) != -1) {
        if (line[0] == '~') {
            // beginning of a subsection
            if (strncmp(line, "~~ENCODING", 10) == 0) {
                encoding_flag = TRUE;
            } else if (map_flag == TRUE) {
                // if the name has more than one token, then I only use the
                // first token
                int i = 0;
                while((line[i] != '\n') &&
                      (line[i] != 0)    &&
                      (line[i] != '\t') &&
                      (line[i] != ' ')) i++;
                line[i] = 0;
                // line+1 skips the leading '~'
                cc = (chrcoverage*)must_find_hashtable(reference, line+1, strlen(line+1));
                index = 0;
            }
        } else if (encoding_flag == TRUE) {
            // an encoding line looks like 'c'~[lo-hi]; the check on ret keeps
            // line+3 inside the string on very short lines
            if ((ret >= 3) && (strncmp(line+3, "~[1-1]",6) == 0)) {
                sentinel = line[1];
                encoding_flag = FALSE;
                map_flag = TRUE;
            }
        } else {
            for (signed long j = 0; j < ret; j++) {
                if (line[j] == '\n') continue;
                if (line[j] == sentinel) {
                    // only write inside the map of a known sequence: data
                    // that precedes the first header, or that runs past the
                    // end of the sequence, must not touch memory
                    if ((cc != NULL) && (index >= 0) &&
                        ((u64)index < (u64)cc->length)) {
                        cc->map[index] = '1';
                        num_mappable += 1;
                    } else if (warned == FALSE) {
                        fprintf(stderr,
                            "mappability data in %s does not fit the reference sequences, ignoring the excess\n",
                            mapName);
                        warned = TRUE;
                    }
                }
                index += 1;
                num_total += 1;
            }
        }
    }

    ckfree(line);
    fclose(fp);

    // avoid 0/0 when the file held no map data at all
    fprintf(stderr, "%"PRIu64" (%2.2f%%) mappable bases in the genome.\n",
        num_mappable,
        (num_total != 0) ? (num_mappable * 100.0 / num_total) : 0.0);
}
Exemplo n.º 11
0
// Try to explain a read as two aligned pieces around 'anchor'.
//
// The read is first aligned to refseq[left1, right1). If that fails the
// function returns NULL; if it covers the whole read, the single segment is
// recorded with update_readsegs() and the result of
// add_evidence_from_segment() is returned. Otherwise the leading or trailing
// run of matches of the first alignment is set aside and the rest of the read
// is aligned a second time inside the wider window bounded by left2/right2.
// If the two alignments can be combined they are recorded with
// update_readsegs() and the result of add_evidence_from_segment() is
// returned; in every other case the function returns NULL.
//
// Both cigar buffers are owned by this function and are freed on every path.
//
// left2             left1          position        right1        right2
//   |                |                |              |             |
//   V                V                V              V             V
// ----------------------------------------------------------------------      
static evidence* attempt_diagonal_alignments(readaln* const rln,
                                             char* const refseq,
                                             const int32_t left1,
                                             const int32_t right1,
                                             const int32_t left2,
                                             const int32_t right2,
                                             const int32_t anchor,
                                             char* const readseq)
{
    // the anchor must lie inside both windows
    forceassert(anchor >= left1);      
    forceassert(anchor >= left2);      
    forceassert(anchor <= right1);
    forceassert(anchor <= right2);
    forceassert(left2 >= 0);
    forceassert(right2 > 0);

    // first pass: the whole read against the inner window [left1, right1)
    uint readlength = strlen(readseq);
    int low1, up1;
    find_best_band(refseq, left1, right1, anchor,
                   readseq, 0, readlength, &low1, &up1);

    // r1/r2: aligned interval on the reference, q1/q2: on the read
    int numcigarops1;
    int r1,r2,q1,q2;
    uint32_t* cigarstring1 = ckallocz(sizeof(uint32_t));
    attempt_band_alignment(refseq, left1, right1, 
                           readseq, 0, readlength, 
                           low1, up1, &r1, &r2, &q1, &q2, 
                           &numcigarops1, &cigarstring1);

    if(q1 == q2){
        // the read did not align
        ckfree(cigarstring1);
        return NULL;
    }

    // this is over if I was able to align the whole read here
    if((q1 == 0) && (q2 == (int)readlength)){
        update_readsegs(rln, 
                        r1, cigarstring1, numcigarops1, 
                        readlength,
                        0, -1, NULL, 0);
        ckfree(cigarstring1);
        return add_evidence_from_segment(rln, NULL);
    }
    
    // identify the first non-match base in the alignment on the read
    // f_nonmatch: total length of the BAM_CPMATCH operations at the start of
    // the cigar (a soft clip in the first position is skipped)
    int i, j;
    for(i = 0, j = 0; i < numcigarops1; i++){
        int op = (cigarstring1[i] & BAM_CIGAR_MASK);
        if((i == 0) && (op == BAM_CSOFT_CLIP)) continue;
        if(op != BAM_CPMATCH) break;
        j += (cigarstring1[i] >> BAM_CIGAR_SHIFT);
    } 
    uint f_nonmatch = j;
    // l_nonmatch: the same total, counted from the end of the cigar (a soft
    // clip in the last position is skipped)
    for(i = numcigarops1 - 1, j = 0; i >= 0; i--){
        int op = (cigarstring1[i] & BAM_CIGAR_MASK);
        if((i == (numcigarops1 - 1)) && (op == BAM_CSOFT_CLIP)) continue;
        if(op != BAM_CPMATCH) break;
        j += (cigarstring1[i] >> BAM_CIGAR_SHIFT);
    } 
    uint l_nonmatch = j;

    // second pass: r3/r4 and q3/q4 play the roles of r1/r2 and q1/q2.
    // Which part of the read is re-aligned, and against which reference
    // window, depends on where the first alignment starts relative to the
    // anchor and on which end of the read it reached.
    // NOTE(review): the threshold tests below mix signed and unsigned
    // operands, so a negative difference would compare as a large unsigned
    // value if ethreshold is unsigned -- confirm the intended types.
    int low2, up2;
    int numcigarops2;
    int r3,r4,q3,q4;
    uint32_t* cigarstring2 = ckallocz(sizeof(uint32_t));
    if(r1 > anchor){
        // the first alignment starts to the right of the anchor
        if(q1 == 0){
            // it starts at the first base of the read: align the part of the
            // read after the leading matches against the reference from
            // r1 + f_nonmatch up to right2
            forceassert(readlength > f_nonmatch);
            if(((readlength - f_nonmatch) < ethreshold) ||
               ((right2 - r1 - f_nonmatch) < ethreshold)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            find_best_band(refseq, r1 + f_nonmatch, right2, r1, 
                           readseq, f_nonmatch, readlength, &low2, &up2);

            attempt_band_alignment(refseq, r1 + f_nonmatch, right2, 
                                   readseq, f_nonmatch, readlength, 
                                   low2, up2, 
                                   &r3, &r4, &q3, &q4, 
                                   &numcigarops2, &cigarstring2);

            // the second alignment must exist and reach the end of the read
            if((q4 != (int)readlength) || (q3 == q4)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            add_prefix_soft_clip(f_nonmatch,&numcigarops2, &cigarstring2);
        }else if(q2 == (int)readlength){
            // it reaches the last base of the read: align the part of the
            // read before the trailing matches against the reference from
            // the anchor up to r2 - l_nonmatch
            forceassert(readlength > l_nonmatch);
            if(((readlength - l_nonmatch) < ethreshold) || 
               ((r2 - l_nonmatch - anchor) < ethreshold)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            find_best_band(refseq, anchor, r2 - l_nonmatch, r2, 
                           readseq, 0, readlength - l_nonmatch, &low2, &up2);

            attempt_band_alignment(refseq, anchor, r2 - l_nonmatch, 
                                   readseq, 0, readlength - l_nonmatch, 
                                   low2, up2, &r3, &r4, &q3, &q4, 
                                   &numcigarops2, &cigarstring2);

            // the second alignment must exist and start at the first base
            if((q3 != 0) || (q3 == q4)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            add_suffix_soft_clip(l_nonmatch,&numcigarops2, &cigarstring2);
        }else{
            // the first alignment touches neither end of the read
            ckfree(cigarstring1);
            ckfree(cigarstring2);
            return NULL;
        }
    }else if(r1 < anchor){
        // the first alignment starts to the left of the anchor; it is only
        // used when it also ends before the anchor
        if(r2 >= anchor){
            ckfree(cigarstring1);
            ckfree(cigarstring2);
            return NULL;
        }

        if(q1 == 0){
            // align the part of the read after the leading matches against
            // the reference from r1 + f_nonmatch up to the anchor
            forceassert(readlength > f_nonmatch);
            if(((readlength - f_nonmatch) < ethreshold) || 
               ((anchor - r1 - f_nonmatch) < ethreshold)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            find_best_band(refseq, r1 + f_nonmatch, anchor, r1, 
                           readseq, f_nonmatch, readlength, &low2, &up2);
            attempt_band_alignment(refseq, r1 + f_nonmatch, anchor, 
                                   readseq, f_nonmatch, readlength, 
                                   low2, up2, 
                                   &r3, &r4, &q3, &q4, 
                                   &numcigarops2, &cigarstring2);          

            if((q4 != (int)readlength) || (q3 == q4)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            add_prefix_soft_clip(f_nonmatch,&numcigarops2, &cigarstring2);
        }else if(q2 == (int)readlength){
            // align the part of the read before the trailing matches against
            // the reference from left2 up to r2 - l_nonmatch
            forceassert(readlength > l_nonmatch);
            if(((readlength - l_nonmatch) < ethreshold) || 
               ((r2 - l_nonmatch - left2) < ethreshold)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            find_best_band(refseq, left2, r2 - l_nonmatch, r2, 
                           readseq, 0 , readlength - l_nonmatch, &low2, &up2);

            attempt_band_alignment(refseq, left2, r2 - l_nonmatch,
                                   readseq, 0 , readlength - l_nonmatch, 
                                   low2, up2, &r3, &r4, &q3, &q4, 
                                   &numcigarops2, &cigarstring2);

            if((q3 != 0) || (q3 == q4)){
                ckfree(cigarstring1);
                ckfree(cigarstring2);
                return NULL;
            }
            add_suffix_soft_clip(l_nonmatch,&numcigarops2, &cigarstring2);
        }else{
            ckfree(cigarstring1);
            ckfree(cigarstring2);
            return NULL;
        }
    }else{
        // is this the place to handle insertions?
        ckfree(cigarstring1);
        ckfree(cigarstring2);
        return NULL;
    }     

    // now find the best alignment out of the possible ones
    forceassert(q1 < q2);
    forceassert(q3 < q4);

    // Combine the two alignments. In the first two branches they overlap on
    // the read and find_best_del_candidate() chooses the read position that
    // is passed on; in the last two they do not overlap on the read and are
    // only accepted when they meet at the same reference position.
    int index = -1;
    if((q1 > q3) && (q1 <= q4)){
//        printf("%s : %d %d %d %d\n", rln->qname, q1, q2, q3, q4);
        index = find_best_del_candidate(q3,q4,cigarstring2,numcigarops2,
                                   q1,q2,cigarstring1,numcigarops1, readlength);
        update_readsegs(rln,
                        r3, cigarstring2, numcigarops2, 
                        index,
                        q1, r1, cigarstring1, numcigarops1);
    }else if((q3 > q1) && (q3 <= q2)){
//        printf("%s: %d %d %d %d\n", rln->qname, q1, q2, q3, q4);
        index = find_best_del_candidate(q1,q2,cigarstring1,numcigarops1,
                                   q3,q4,cigarstring2,numcigarops2, readlength);
        update_readsegs(rln,
                        r1, cigarstring1, numcigarops1,
                        index,
                        q3, r3, cigarstring2, numcigarops2);
    }else if((q1 > q4) && (r1 == r4)){
        update_readsegs(rln,
                        r3, cigarstring2, numcigarops2,
                        q4,
                        q1, r1, cigarstring1, numcigarops1);
    }else if((q3 > q2) && (r2 == r3)){
        update_readsegs(rln,
                        r1, cigarstring1, numcigarops1,
                        q2,
                        q3, r3, cigarstring2, numcigarops2);   
    }else{
        // the two alignments cannot be combined
        ckfree(cigarstring1);
        ckfree(cigarstring2);
        return NULL;
    }

    ckfree(cigarstring1);
    ckfree(cigarstring2);
    return add_evidence_from_segment(rln, NULL);
}