/*
 * Read a sorted BAM file from a fosmid pool and process its reads in chunks.
 *
 * Reads are collected in a fixed-size buffer (readlist). The buffer is handed
 * to process_chunk() and then recycled whenever
 *   - the reference id (tid) of the current read differs from the previous one,
 *   - the read buffer is about to overflow, or
 *   - the fragment list is full.
 * Reads flagged unmapped, secondary, QC-fail or duplicate are skipped.
 *
 * Parameters:
 *   bamfile   path of the BAM file to open
 *   ht        hashtable passed to getindex() to map a chromosome name to an index
 *   chromvars, varlist   variant data forwarded to extract_variants_read()
 *   reflist   reference list forwarded to get_chrom_name(), extract_variants_read()
 *             and process_chunk()
 *
 * Returns 1 on success, -1 if the BAM file cannot be opened or memory cannot
 * be allocated. Everything allocated here (and the BAM handle) is released
 * before returning.
 */
int parse_bamfile_fosmid(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VARIANT* varlist, REFLIST* reflist) {
    fprintf(stderr, "reading sorted bamfile %s for fosmid pool\n", bamfile);

    int MAX_READS = 5000000; /* capacity of the read buffer */
    int MAX_FRAGMENTS = MAX_READS / 5; /* capacity of flist */
    int status = -1;
    int reads = 0; /* number of reads processed; must start at 0 for the progress report */
    int fragments = 0;
    int chrom = 0;
    int prevchrom = -1;
    int prevtid = -1;
    int lastread = 0;
    int r = 0, i = 0;
    struct alignedread** readlist = NULL;
    struct alignedread* read_pt;
    FRAGMENT* flist = NULL;
    FRAGMENT fragment;
    bam1_t *b = NULL;
    samfile_t *fp;

    fragment.variants = 0;
    fragment.alist = NULL;

    /* open the file before allocating anything so a bad path cannot leak the buffers */
    if ((fp = samopen(bamfile, "rb", 0)) == 0) {
        fprintf(stderr, "Fail to open BAM file %s\n", bamfile);
        return -1;
    }

    readlist = calloc(MAX_READS, sizeof (struct alignedread*));
    fragment.alist = (allele*) malloc(sizeof (allele)*10000);
    flist = (FRAGMENT*) malloc(sizeof (FRAGMENT) * MAX_FRAGMENTS);
    b = bam_init1();
    if (readlist == NULL || fragment.alist == NULL || flist == NULL || b == NULL) {
        fprintf(stderr, "unable to allocate memory for reading BAM file %s\n", bamfile);
        goto cleanup;
    }
    /* dedicated index: the original reused 'reads' here, which left the
       processed-read counter at MAX_READS before the first read was seen */
    for (i = 0; i < MAX_READS; i++) {
        readlist[i] = calloc(1, sizeof (struct alignedread));
        if (readlist[i] == NULL) {
            fprintf(stderr, "unable to allocate memory for reading BAM file %s\n", bamfile);
            goto cleanup;
        }
    }

    while (samread(fp, b) >= 0) {
        fetch_func(b, fp, readlist[r]);
        if ((readlist[r]->flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP))) // unmapped reads, PCR/optical dups are ignored
        {
            free_readmemory(readlist[r]);
            continue;
        }

        // look up the chromosome again only when the reference changes; also flush when a buffer is full
        if (readlist[r]->tid != prevtid || r >= MAX_READS - 1 || fragments >= MAX_FRAGMENTS) {
            if (r >= MAX_READS - 1) fprintf(stderr, "limit on max reads %d exceeded.. need to clean up buffer \n", MAX_READS);
            // flist has room for MAX_FRAGMENTS entries only; flush before add_fragment() can write past the end
            if (fragments >= MAX_FRAGMENTS) fprintf(stderr, "limit on max fragments %d reached.. need to clean up buffer \n", MAX_FRAGMENTS);
            if (prevtid >= 0) {
                fprintf(stderr, "reads in buffer %d \n", r);
                process_chunk(readlist, lastread, r, flist, varlist, reflist);
                // free up reads in list and move up index
                for (i = lastread; i < r; i++) free_readmemory(readlist[i]);
                // the current read (slot r) has not been processed yet: move it to slot 0
                read_pt = readlist[0];
                readlist[0] = readlist[r];
                readlist[r] = read_pt;
                r = 0;
                for (i = 0; i < fragments; i++) {
                    free(flist[i].alist);
                    free(flist[i].id);
                }
                fprintf(stderr, "free memory for reads from chrom %d cleaning up of fragment list %d\n", prevtid, fragments);
                fragments = 0;
            }

            chrom = getindex(ht, readlist[r]->chrom);
            get_chrom_name(readlist[r], ht, reflist); // reflist->current has chromosome index, -1 if no reflist
            lastread = r;
        } else chrom = prevchrom;

        fragment.variants = 0;
        if (chrom >= 0) extract_variants_read(readlist[r], ht, chromvars, varlist, 1, &fragment, chrom, reflist);
        if (fragment.variants > 0) {
            add_fragment(flist, &fragment, readlist[r], fragments);
            readlist[r]->findex = fragments++;
        } else readlist[r]->findex = -1;

        reads += 1;
        if (reads % 2000000 == 0) fprintf(stderr, "processed %d reads, useful fragments \n", reads);
        prevchrom = chrom;
        prevtid = readlist[r]->tid;
        r++;
    }

    // process whatever is left in the buffer after the last read
    process_chunk(readlist, lastread, r, flist, varlist, reflist);
    for (i = lastread; i < r; i++) free_readmemory(readlist[i]);
    // release the fragments of the final chunk the same way the in-loop flush does
    for (i = 0; i < fragments; i++) {
        free(flist[i].alist);
        free(flist[i].id);
    }
    status = 1;

cleanup:
    if (readlist != NULL) {
        for (i = 0; i < MAX_READS; i++) free(readlist[i]);
        free(readlist);
    }
    free(flist);
    free(fragment.alist);
    if (b != NULL) bam_destroy1(b);
    samclose(fp);
    return status;
}
// extract haplotype informative reads from sorted bam file // // need to discard reads that are marked as duplicates using flag // int parse_bamfile_sorted(char* bamfile,HASHTABLE* ht,CHROMVARS* chromvars,VARIANT* varlist,REFLIST* reflist) { fprintf(stderr,"reading sorted bamfile %s \n",bamfile); int reads=0; struct alignedread* read = (struct alignedread*)malloc(sizeof(struct alignedread)); int i=0; int sl=0; int chrom=0; int v1,v2; int absIS; int prevchrom=-1; int prevtid = -1; FRAGMENT* flist = (FRAGMENT*)malloc(sizeof(FRAGMENT)*MAXFRAG); int fragments =0; int prevfragments =0; FRAGMENT fragment; fragment.variants =0; fragment.alist = (allele*)malloc(sizeof(allele)*4096); samfile_t *fp; if ((fp = samopen(bamfile, "rb", 0)) == 0) { fprintf(stderr, "Fail to open BAM file %s\n", bamfile); return -1; } bam1_t *b = bam_init1(); while (samread(fp, b) >= 0) { fetch_func(b, fp,read); if ((read->flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) || read->mquality < MIN_MQ) { free_readmemory(read); continue; } // find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome if (read->tid != prevtid) { chrom = getindex(ht,read->chrom); // doing this for every read, can replace this by string comparison ..april 4 2012 i = read->tid; if (reflist->ns > 0) { reflist->current = i; if (i >= reflist->ns || i < 0 || strcmp(reflist->names[i],read->chrom) !=0) { reflist->current = -1; for (i=0;i<reflist->ns;i++) { if (strcmp(reflist->names[i],read->chrom) ==0) { reflist->current = i; break; } } } } } else chrom = prevchrom; if (read->tid == read->mtid) // use mateposition to calculate insert size, march 12 2013, wrong since we need to consider the readlength/cigar { //read->IS = read->mateposition - read->position; } absIS = (read->IS < 0) ? 
-1*read->IS: read->IS; // add check to see if the mate and its read are on same chromosome, bug for contigs, july 16 2012 if ((read->flag & 8) || absIS > MAX_IS || absIS < MIN_IS || read->IS ==0 || !(read->flag & 1) || read->tid != read->mtid) // single read { fragment.variants =0; v1 =0; v2=0; if (chrom >=0 && PEONLY ==0) { fragment.id = read->readid; v1 = extract_variants_read(read,ht,chromvars,varlist,0,&fragment,chrom,reflist); if (fragment.variants >= 2 || (SINGLEREADS ==1 && fragment.variants >=1)) { // instead of printing fragment, we could change this to update genotype likelihoods print_fragment(&fragment,varlist,fragment_file); } } } else // paired-end read { //fprintf(stdout,"tid %d %d \n",read->tid,read->mtid); fragment.variants =0; v1 =0; v2=0; fragment.id = read->readid; if (chrom >=0) v1 = extract_variants_read(read,ht,chromvars,varlist,1,&fragment,chrom,reflist); //fprintf(stderr,"paired read stats %s %d flag %d IS %d\n",read->chrom,read->cigs,read->flag,read->IS); if (fragment.variants > 0) { //fprintf(stderr,"variants %d read %s %s \n",fragment.variants,read->chrom,read->readid); add_fragment(flist,&fragment,read,fragments); fragments++; if (fragments >= MAXFRAG) { fprintf(stderr,"exceeded max #cached fragments: %d,increase MAXFRAGMENTS using --maxfragments option \n",MAXFRAG); return -1; } } } // BUG here when the fragment list cannot be cleaned due to long mate-pair fragments (accumulated for large IS) // fragments >= 100000 and we will clean it repeatedly... // need to fix this june 4 2012.... even for long mate-pairs this could be a problem... if ( (fragments-prevfragments >= 100000) || fragments >= MAXFRAG -10000 || (chrom != prevchrom && prevchrom != -1 && fragments > 0)) // chrom of current read is not the same as previous read's chromosome... 
{ if (PFLAG ==1) fprintf(stderr,"cleaning buffer: current chrom %s %d fragments %d\n",read->chrom,read->position,fragments); // BUG HERE when trying to clean empty fragment list (fragments ==0) if (fragments > 0) clean_fragmentlist(flist,&fragments,varlist,chrom,read->position,prevchrom); prevfragments = fragments; //fprintf(stderr,"remaining %d\n",fragments); } reads+=1; if (reads%2000000 ==0) fprintf(stderr,"processed %d reads, useful fragments %d\n",reads,fragments); prevchrom = chrom; prevtid = read->tid; free_readmemory(read); } if (fragments > 0) { fprintf(stderr,"final cleanup of fragment list: %d current chrom %s %d \n",fragments,read->chrom,read->position); clean_fragmentlist(flist,&fragments,varlist,-1,read->position,prevchrom); } bam_destroy1(b); }