// Example no. 1
// 0
/*
 * Parse a sorted BAM file for a fosmid pool.
 *
 * Reads that pass the flag filter are buffered in a preallocated read list.
 * Whenever the target id (tid) of the current read differs from the previous
 * one, or one of the buffers is about to fill up, the buffered reads are
 * handed to process_chunk() and the read/fragment buffers are recycled.
 * For every buffered read, extract_variants_read() is called and, if the read
 * covers at least one variant, a fragment is appended to the fragment list
 * and its index stored in read->findex (-1 otherwise).
 *
 * Parameters:
 *   bamfile    path of the BAM file to read
 *   ht         hash table used to map a chromosome name to an index (getindex)
 *   chromvars, varlist, reflist
 *              passed through to extract_variants_read(), get_chrom_name()
 *              and process_chunk()
 *
 * Returns 1 on success, -1 if the BAM file cannot be opened or a buffer
 * cannot be allocated.
 */
int parse_bamfile_fosmid(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VARIANT* varlist, REFLIST* reflist) {
    fprintf(stderr, "reading sorted bamfile %s for fosmid pool\n", bamfile);

    const int MAX_READS = 5000000; // capacity of the read buffer (5 million)
    const int MAX_FRAGMENTS = MAX_READS / 5; // capacity of the fragment buffer

    int status = -1;
    int allocated = 0; // number of readlist slots that were successfully allocated
    int reads = 0; // number of reads that passed the flag filter
    int fragments = 0;
    int chrom = 0;
    int r = 0, i = 0;
    int prevchrom = -1;
    int prevtid = -1; // tid of the previous read in the sorted bam file
    int lastread = 0;

    struct alignedread** readlist = NULL;
    struct alignedread* read_pt;
    FRAGMENT* flist = NULL;
    FRAGMENT fragment;
    samfile_t* fp = NULL;
    bam1_t* b = NULL;

    fragment.variants = 0;
    fragment.alist = NULL;

    // open the file before allocating anything so that a bad path costs nothing
    if ((fp = samopen(bamfile, "rb", 0)) == 0) {
        fprintf(stderr, "Fail to open BAM file %s\n", bamfile);
        return -1;
    }

    readlist = calloc((size_t) MAX_READS, sizeof *readlist);
    fragment.alist = malloc(sizeof (allele) * 10000);
    flist = malloc(sizeof *flist * (size_t) MAX_FRAGMENTS);
    b = bam_init1();
    if (readlist == NULL || fragment.alist == NULL || flist == NULL || b == NULL) {
        fprintf(stderr, "unable to allocate memory for read/fragment buffers\n");
        goto cleanup;
    }
    // a dedicated counter is used here: reusing 'reads' as the loop index would
    // leave it at MAX_READS and corrupt the progress count reported below
    for (allocated = 0; allocated < MAX_READS; allocated++) {
        readlist[allocated] = calloc(1, sizeof **readlist);
        if (readlist[allocated] == NULL) {
            fprintf(stderr, "unable to allocate memory for read/fragment buffers\n");
            goto cleanup;
        }
    }

    while (samread(fp, b) >= 0) {
        fetch_func(b, fp, readlist[r]);
        if ((readlist[r]->flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP))) // unmapped reads, PCR/optical dups are ignored
        {
            free_readmemory(readlist[r]);
            continue;
        }
        // find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome
        // flush the buffers early if either the read buffer or the fragment buffer is about to overflow
        if (readlist[r]->tid != prevtid || r >= MAX_READS - 1 || fragments >= MAX_FRAGMENTS) {
            if (r >= MAX_READS - 1) fprintf(stderr, "limit on max reads %d exceeded.. need to clean up buffer \n", MAX_READS);
            if (fragments >= MAX_FRAGMENTS) fprintf(stderr, "limit on max fragments %d reached.. need to clean up buffer \n", MAX_FRAGMENTS);
            if (prevtid >= 0) {
                fprintf(stderr, "reads in buffer %d \n", r);
                process_chunk(readlist, lastread, r, flist, varlist, reflist);
                // free up reads in list and move up index
                for (i = lastread; i < r; i++) free_readmemory(readlist[i]);
                // the current read (slot r) has not been processed yet: move it to slot 0
                read_pt = readlist[0];
                readlist[0] = readlist[r];
                readlist[r] = read_pt;
                r = 0;
                for (i = 0; i < fragments; i++) {
                    free(flist[i].alist);
                    free(flist[i].id);
                }
                fprintf(stderr, "free memory for reads from chrom %d cleaning up of fragment list %d\n", prevtid, fragments);
                fragments = 0;
            }

            chrom = getindex(ht, readlist[r]->chrom);
            get_chrom_name(readlist[r], ht, reflist); // reflist->current has chromosome index, -1 if no reflist
            lastread = r;
        } else chrom = prevchrom;

        fragment.variants = 0;
        if (chrom >= 0) extract_variants_read(readlist[r], ht, chromvars, varlist, 1, &fragment, chrom, reflist);
        if (fragment.variants > 0) {
            add_fragment(flist, &fragment, readlist[r], fragments);
            readlist[r]->findex = fragments++;
        } else readlist[r]->findex = -1;

        reads += 1;
        if (reads % 2000000 == 0) fprintf(stderr, "processed %d reads, useful fragments \n", reads);
        prevchrom = chrom;
        prevtid = readlist[r]->tid;
        r++;
    }

    // process whatever is left in the buffers, then release it the same way the in-loop flush does
    process_chunk(readlist, lastread, r, flist, varlist, reflist);
    for (i = lastread; i < r; i++) free_readmemory(readlist[i]);
    for (i = 0; i < fragments; i++) {
        free(flist[i].alist);
        free(flist[i].id);
    }
    status = 1;

cleanup:
    if (readlist != NULL) {
        for (i = 0; i < allocated; i++) free(readlist[i]);
        free(readlist);
    }
    free(flist);
    free(fragment.alist);
    if (b != NULL) bam_destroy1(b);
    samclose(fp);
    return status;
}
// Example no. 2
// 0
// extract haplotype informative reads from sorted bam file //
// need to discard reads that are marked as duplicates using flag //
//
// Every read that passes the flag / mapping-quality filter is classified as
// either a single read or one end of a paired-end fragment:
//   - single reads are passed to extract_variants_read() and printed right away
//     with print_fragment() when they cover enough variants;
//   - paired-end reads are cached in a fragment list (add_fragment) that is
//     periodically flushed with clean_fragmentlist().
//
// bamfile: path of the BAM file to read
// ht: hash table used to map a chromosome name to an index (getindex)
// chromvars, varlist: passed through to the helper functions
// reflist: reflist->current is updated to the index of the read's chromosome
//          in reflist->names (-1 if not found) whenever the tid changes
//
// returns 1 on success, -1 if the BAM file cannot be opened, memory cannot be
// allocated or the fragment cache overflows
int parse_bamfile_sorted(char* bamfile,HASHTABLE* ht,CHROMVARS* chromvars,VARIANT* varlist,REFLIST* reflist)
{
	fprintf(stderr,"reading sorted bamfile %s \n",bamfile);
	int status = -1;
	int reads=0;
	int i=0; int chrom=0;
	int absIS;
	int prevchrom=-1; int prevtid = -1;
	int fragments =0; int prevfragments =0;

	struct alignedread* read = NULL;
	FRAGMENT* flist = NULL;
	FRAGMENT fragment; fragment.variants =0; fragment.alist = NULL;
	bam1_t *b = NULL;

	// open the file before allocating anything so that a bad path costs nothing
	samfile_t *fp;
	if ((fp = samopen(bamfile, "rb", 0)) == 0) { fprintf(stderr, "Fail to open BAM file %s\n", bamfile); return -1; }

	read = calloc(1,sizeof *read);
	flist = malloc(sizeof *flist * MAXFRAG);
	fragment.alist = malloc(sizeof(allele)*4096);
	b = bam_init1();
	if (read == NULL || flist == NULL || fragment.alist == NULL || b == NULL)
	{
		fprintf(stderr,"unable to allocate memory for read/fragment buffers\n");
		goto cleanup;
	}

	while (samread(fp, b) >= 0)
	{
		fetch_func(b, fp,read);
		if ((read->flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) || read->mquality < MIN_MQ)
		{
			free_readmemory(read); continue;
		}
		// find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome
		if (read->tid != prevtid)
		{
			chrom = getindex(ht,read->chrom);
			i = read->tid;
			if (reflist->ns > 0)
			{
				// try the tid as index first, fall back to a linear search by name
				reflist->current = i;
				if (i >= reflist->ns || i < 0 || strcmp(reflist->names[i],read->chrom) !=0)
				{
					reflist->current = -1;
					for (i=0;i<reflist->ns;i++)
					{
						if (strcmp(reflist->names[i],read->chrom) ==0) { reflist->current = i; break; }
					}
				}
			}
		}
		else chrom = prevchrom;
		// note: the insert size is taken from read->IS as-is; deriving it from the mate position
		// would be wrong since the readlength/cigar has to be considered (march 12 2013)

		absIS = (read->IS < 0) ? -1*read->IS: read->IS;
		// add check to see if the mate and its read are on same chromosome, bug for contigs, july 16 2012
		if ((read->flag & 8) || absIS > MAX_IS || absIS < MIN_IS || read->IS ==0 || !(read->flag & 1) || read->tid != read->mtid) // single read
		{
			fragment.variants =0;
			if (chrom >=0 && PEONLY ==0)
			{
				fragment.id = read->readid;
				extract_variants_read(read,ht,chromvars,varlist,0,&fragment,chrom,reflist);
				if (fragment.variants >= 2 || (SINGLEREADS ==1 && fragment.variants >=1))
				{
					// instead of printing fragment, we could change this to update genotype likelihoods
					print_fragment(&fragment,varlist,fragment_file);
				}
			}
		}
		else  // paired-end read
		{
			fragment.variants =0; fragment.id = read->readid;
			if (chrom >=0) extract_variants_read(read,ht,chromvars,varlist,1,&fragment,chrom,reflist);
			if (fragment.variants > 0)
			{
				add_fragment(flist,&fragment,read,fragments); fragments++;
				if (fragments >= MAXFRAG)
				{
					fprintf(stderr,"exceeded max #cached fragments: %d,increase MAXFRAGMENTS using --maxfragments option \n",MAXFRAG);
					// NOTE(review): buffers owned by the cached flist entries are not released here,
					// their ownership after clean_fragmentlist() is not visible from this function
					free_readmemory(read);
					goto cleanup;
				}
			}
		}
		// BUG here when the fragment list cannot be cleaned due to long mate-pair fragments (accumulated for large IS)
		// fragments >= 100000 and we will clean it repeatedly...
		// need to fix this june 4 2012.... even for long mate-pairs this could be a problem...
		if ( (fragments-prevfragments >= 100000) || fragments >= MAXFRAG -10000 || (chrom != prevchrom && prevchrom != -1 && fragments > 0)) // chrom of current read is not the same as previous read's chromosome...
		{
			if (PFLAG ==1) fprintf(stderr,"cleaning buffer: current chrom %s %d fragments %d\n",read->chrom,read->position,fragments);
			// cleaning an empty fragment list (fragments ==0) must be avoided
			if (fragments > 0) clean_fragmentlist(flist,&fragments,varlist,chrom,read->position,prevchrom);
			prevfragments = fragments;
		}

		reads+=1; if (reads%2000000 ==0) fprintf(stderr,"processed %d reads, useful fragments %d\n",reads,fragments);
		prevchrom = chrom; prevtid = read->tid;
		free_readmemory(read);
	}
	if (fragments > 0)
	{
		// NOTE(review): 'read' has already been passed to free_readmemory() at this point; this relies on
		// read->chrom and read->position remaining readable afterwards -- confirm against free_readmemory()
		fprintf(stderr,"final cleanup of fragment list: %d current chrom %s %d \n",fragments,read->chrom,read->position);
		clean_fragmentlist(flist,&fragments,varlist,-1,read->position,prevchrom);
	}
	status = 1;

cleanup:
	if (b != NULL) bam_destroy1(b);
	free(fragment.alist);
	free(flist);
	free(read);
	samclose(fp);
	return status;
}