コード例 #1
0
ファイル: report_hic_pairs.cpp プロジェクト: zilhua/diffHic
/* R-callable test wrapper around parse_cigar() for a single CIGAR string.
 *
 * incoming: character vector of length 1 holding the CIGAR string.
 * reverse:  logical vector of length 1, forwarded to parse_cigar() as its
 *           last argument.
 *
 * Returns an integer vector of length 2 whose elements are the two values
 * filled in by parse_cigar() (passed as 'alen' and 'offset', in that order).
 * If a std::exception is raised, a character vector holding the exception
 * message is returned instead.
 */
SEXP test_parse_cigar (SEXP incoming, SEXP reverse) try {
	if (!isString(incoming) || LENGTH(incoming)!=1) { throw std::runtime_error("need one cigar string"); }
	if (!isLogical(reverse) || LENGTH(reverse)!=1) { throw std::runtime_error("need a reverse specifier"); }

	SEXP output=PROTECT(allocVector(INTSXP, 2));
	try {
		int* optr=INTEGER(output);
		int& alen=optr[0];
		int& offset=optr[1];
		parse_cigar(CHAR(STRING_ELT(incoming, 0)), alen, offset, asLogical(reverse));
	} catch (...) {
		// Keep the protection stack balanced before the outer handler
		// turns the exception into an R string.
		UNPROTECT(1);
		throw;
	}
	UNPROTECT(1);
	return output;
} catch (const std::exception& e) {
	return mkString(e.what());
}
コード例 #2
0
ファイル: readmultiplebams.c プロジェクト: vibansal/crisp
// multi sample variant caller: CRISP, PICALL or low coverage method
//
// Merges the reads of all BAM files listed in 'options' in positional order (a min-heap over
// the current read of each file picks the next read), appends reads that pass BAM_FILTER_MASK
// to a shared read queue, and invokes callvariants() on the queue as the merged position advances.
//
// options : run options; this function reads bamfiles, bamfilelist, regions and the target* fields
// reflist : reference list; names, lengths, sequences and cinterval are used here
// fp      : file handle passed on to read_chromosome()
//
// Returns 1 both on normal completion and when out-of-order reads are detected
// (the two cases cannot be distinguished by the return value).
int multisampleVC(struct OPTIONS* options,REFLIST* reflist,FILE* fp)
{
	// duplicates are only filtered out when USE_DUPLICATES is not set to 1
	if (USE_DUPLICATES ==1) BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL); else BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);

	int bamfiles = options->bamfiles;

	int last=0; // last is the current position s.t. all reads have starting position > last
	int i=0; int h=0;
	unsigned long reads=0; int j=0; int prev_tid = -1;   int rf=0;
	int finishedfiles =0; 
	struct alignedread* pread = NULL;
	// NOTE(review): an older comment here mentioned one extra slot for combined indels, but exactly 'bamfiles' entries are allocated
	struct BAMFILE_data* bamfiles_data = calloc(bamfiles,sizeof(struct BAMFILE_data));

	READQUEUE* RQ = (READQUEUE*)malloc(sizeof(READQUEUE));  RQ->first = NULL; RQ->last = NULL; RQ->reads = 0; 
	int* fcigarlist = (int*)malloc(sizeof(int)*4096);

	// data structure for holding potential variants and read counts, etc 
	struct VARIANT variant;  variant.ploidy = calloc(options->bamfiles,sizeof(int)); 
	init_poolsizes(&variant,options,PICALL); 
	init_variant(&variant,options->bamfiles,options->bamfiles);
	variant.options = options;  // pointer to options

	BAMHEAP bheap; bheap.harray = (int*)malloc(sizeof(int)*bamfiles); bheap.length = bamfiles;
	for (i=0;i<bamfiles;i++) { bheap.harray[i] = i; bamfiles_data[i].finished= 0;}
	
	reflist->cinterval = -1; // first interval to the right of current base

        init_bamfiles(bamfiles_data,options->bamfilelist,bamfiles,options->regions,&options->targettid,&options->targetstart,&options->targetend);

	// error when reading indexed bam files probably due to lack of reads in some files resulting in heap error, fixed oct 17 2012
	// only files that are not already finished are kept in the heap array
        j=0; for (i=0;i<bamfiles;i++) 
	{
		finishedfiles += bamfiles_data[i].finished; 
		if (bamfiles_data[i].finished ==0) bheap.harray[j++] = i; else bheap.length--; 	
	}
	buildminheap(&bheap,bamfiles_data); // initial minheap call
	//fprintf(stderr,"finishedfiles %d \n",finishedfiles);
	
	if (INDEL_REALIGNMENT >=1) allocate_mem_heap(bamfiles_data,bamfiles,100);

	
	// accumulate the total ploidy over all samples into the global counters
	HAPLOTYPES =0,MIN_COVERAGE_FLANKING =0;
	for (i=0;i<variant.samples;i++) 
	{
		MIN_COVERAGE_FLANKING += 2*variant.ploidy[i];  // enforced for regions outside the bedfile target
		HAPLOTYPES += variant.ploidy[i];
	}
	//int min_coverage_target = 1*variant->ploidy*variant->samples;  // enforced for regions outside the bedfile target
	int offset_readlength = 150;  // call variants in window (last,current_read_position-offset_readlength) to allow for indel analysis, set to 0 for original behavior of program
	// the value of offset should not affect the correctness or speed of the code
	int current_position =0;
	
	while (finishedfiles < bamfiles)
	{
		i = bheap.harray[0]; // take the top read off the heap
		if ( !(bamfiles_data[i].read->flag & BAM_FILTER_MASK))
		{
			if (bamfiles_data[i].read->tid != prev_tid) // read's chromosome is different from previousread 
			{
				if (prev_tid >=0)  // finish the processing of previous chromosome and cleanup
				{
					if (RQ->reads >0) 
					{
						fprintf(stderr,"processing %d reads left in queue for chrom %s...",RQ->reads,reflist->names[prev_tid]);
						callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant);
						empty_queue(RQ,bamfiles_data); //clean thequeue
					}
					if (INDEL_REALIGNMENT >=1) clean_indel_lists(bamfiles_data,bamfiles,-1);
					// the resets below are unconditional (they are not guarded by the INDEL_REALIGNMENT test)
					current_position = 0; 
					for(j=0;j<bamfiles;j++) bamfiles_data[j].last=NULL;
					last =0; 
					free(reflist->sequences[prev_tid]); 
					fprintf(stderr,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]);
					fprintf(stdout,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]);
					reflist->cinterval = -1; // reset to -1 
				}
				read_chromosome(reflist,bamfiles_data[i].read->tid,fp); 
				prev_tid =bamfiles_data[i].read->tid;
			}

			if (bamfiles_data[i].read->position <last)
			{
				fprintf(stderr,"reads out of order i:%d h:%d pos: %d %d\n",i,h,bamfiles_data[i].read->position,last);
				fprintf(stderr,"the program will now exit, please sort the bamfiles\n");
				// NOTE(review): this early return does not release any of the buffers allocated above
				return 1;
			}

			if (INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->position > current_position+offset_readlength) 
			{
				// need to clean up indel lists when we encounter a new chromosome... 
				print_indel_lists(bamfiles_data,bamfiles,current_position+offset_readlength); 
				clean_indel_lists(bamfiles_data,bamfiles,current_position);
				current_position = bamfiles_data[i].read->position;
			}
			// realign reads before calling variants, each read is realigned only once

			// small bug here, only call variants when last is less than current read position
			// bug fixed here, update last only when 'callvariants' is invoked, ???
			if (RQ->reads > 0 && bamfiles_data[i].read->position > last+offset_readlength) 
			{
				callvariants(reflist,bamfiles_data[i].read->tid,last,bamfiles_data[i].read->position-offset_readlength,RQ,bamfiles_data,options,&variant);  
			}
			// 'last' trails the current read position by offset_readlength and is clamped at 0
			last = bamfiles_data[i].read->position-offset_readlength; if (last < 0) last =0;

			bamfiles_data[i].read->cflag = 0; 
			// this function should only be called on reads inside/close_to targeted regions..
			parse_cigar(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,fcigarlist); 

			if (INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->gaps > 0 && bamfiles_data[i].read->mquality >= 20) extract_indel_reads(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,i,bamfiles_data[i].ilist);
			
			//fprintf(stdout,"read s:%d IS:%d %s %d \n",i,bamfiles_data[i].read->IS,bamfiles_data[i].read->readid,bamfiles_data[i].read->position);
			// append the read to the tail of the global queue (linked through 'next')
			if (RQ->last == NULL)
			{
				RQ->last = bamfiles_data[i].read; RQ->first = RQ->last; (RQ->last)->next = NULL;
				RQ->reads++;
			}
			else
			{
				(RQ->last)->next = bamfiles_data[i].read; RQ->last = bamfiles_data[i].read; 
				(RQ->last)->next = NULL;
				RQ->reads++;
			}
			// also append it to the per-file list of bam file 'i' (linked through 'nextread')
			if (bamfiles_data[i].last ==NULL) bamfiles_data[i].first = RQ->last;
			else bamfiles_data[i].last->nextread= RQ->last;
			bamfiles_data[i].last = RQ->last; (RQ->last)->nextread =NULL;
			// read that passes filters from 'i'th bam file is inserted in queue, should also add it to OPE queue 
			//if (bamfiles_data[i].read->position < bamfiles_data[i].read->mateposition && bamfiles_data[i].read->lastpos > bamfiles_data[i].read->mateposition) 
			//fprintf(stdout,"B %d %s %d %d %d \n",i,bamfiles_data[i].read->readid,bamfiles_data[i].read->position,bamfiles_data[i].read->mateposition,bamfiles_data[i].read->IS);
		}
		else free_read(bamfiles_data[i].read);
		//fprintf(stdout,"read from %d %d %s\n",i,bamfiles_data[i].read->position,bamfiles_data[i].read->readid);

		// fetch the next record of file 'i': sequential read without regions, iterator read otherwise
		if (options->regions ==NULL) rf =samread(bamfiles_data[i].fp,bamfiles_data[i].b);
		else rf  = bam_iter_read(bamfiles_data[i].fp->x.bam,bamfiles_data[i].iter,bamfiles_data[i].b);
		if (rf >=0)
		{
			bamfiles_data[i].read = get_read_bamfile(bamfiles_data[i].b,bamfiles_data[i].fp,pread); 
			//if (options->samples ==0) bamfiles_data[i].read->sampleid = i;
			//else bamfiles_data[i].read->sampleid = options->BAM_TO_SAMPLE[i];  
			// bug here june 30 2013 commented out .... in 12 T2D pools 
			bamfiles_data[i].read->sampleid = i;
			// the heap is only re-ordered for reads that pass the filter; a filtered read stays on top and is discarded in the next iteration
			if (!(bamfiles_data[i].read->flag & BAM_FILTER_MASK)) minHeapify(&bheap,0,bamfiles_data);
		}
		else // no more reads in file 'i' 
		{ 
			bamfiles_data[i].finished = 1; bamfiles_data[i].read= NULL; 
			bam_destroy1(bamfiles_data[i].b);
			h++; finishedfiles++; 
			//fprintf(stderr,"finished reading bam file %s \n",options->bamfilelist[i]); //return 1;
			bheap.harray[0] = bheap.harray[bheap.length-1]; bheap.length--;
			if (bheap.length > 0) minHeapify(&bheap,0,bamfiles_data);
			// call minheapify like function to push sample i off the heap, reduce heap size
		} 
		// 'reads' is unsigned long, so it is printed with %lu
		if ((++reads)%1000000 ==0 && RQ->reads >0) fprintf(stderr,".....processed %lu reads QSIZE:%d %s:%d:%d variants called %d\n",reads,RQ->reads,RQ->first->chrom,RQ->first->position,RQ->first->lastpos,VARIANTS_CALLED);
	}

	if (prev_tid >=0)  // finish the processing of last chromosome 
	{
		if (RQ->reads >0) 
		{
			fprintf(stderr,"processing %d reads left in queue for chrom %s.....",RQ->reads,reflist->names[prev_tid]);
			if (reflist->lengths[prev_tid] > last) callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant);
			empty_queue(RQ,bamfiles_data); //clean thequeue
		}
		else fprintf(stderr,"queue for chrom %s is empty ",reflist->names[prev_tid]);
		free(reflist->sequences[prev_tid]); 
		fprintf(stderr,"finished processing reads for chrom %s \n\n",reflist->names[prev_tid]);
		if (INDEL_REALIGNMENT >=1) 
		{
			print_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]); 
			clean_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]);
		}
	}
	fprintf(stderr,"CRISP has finished processing bam files: total reads processed %lu total variants called %d \n\n",reads,VARIANTS_CALLED);

	//for (i=0;i<bamfiles;i++) bam_destroy1(bamfiles_data[i].b);
	free(bamfiles_data); free(bheap.harray); free(fcigarlist);
	// the queue header is allocated in this function and still used after empty_queue() above, so it is released here
	free(RQ);
	// NOTE(review): variant.ploidy (and whatever init_poolsizes/init_variant allocate) is not released here - confirm ownership
	//empty_queue(RQ); //clean thequeue
	//fprintf(stdout,"FILE %d %s %d %s %d %d %d mapped %d \n",i,read->readid,read->flag,read->chrom,read->position,read->mquality,read->IS,(read->flag &4));
	return 1;
}
コード例 #3
0
ファイル: report_hic_pairs.cpp プロジェクト: zilhua/diffHic
/* Core loop that converts per-alignment read information into valid read pairs.
 *
 * ffptr:             fragment finder; supplies the number of chromosomes and maps an
 *                    alignment (chromosome, position, strand, length) to a fragment index.
 * check_self_status: callback classifying the two primary segments of a pair; a result
 *                    of ISPET is counted as a dangling end and ISMATE as a self-circle,
 *                    and the pair is skipped in both cases.
 * icptr:             functor that decides whether a chimeric pair is invalid.
 * pairlen:           integer vector; each entry is the number of consecutive alignments
 *                    belonging to one read pair.
 * chrs, pos, flag, mapqual: integer vectors with one entry per alignment.
 * cigar:             character vector with one CIGAR string per alignment.
 * chimera_strict:    logical scalar; if true, invalid chimeras are dropped.
 * minqual:           integer scalar; alignments with mapping quality below it are
 *                    treated as unmapped. NA disables this filter.
 * do_dedup:          logical scalar; if true, marked duplicates are dropped.
 *
 * Returns a list of length 6:
 *   [0] two-column integer matrix of (anchor, target) chromosome indices (1-based)
 *       for every chromosome combination with at least one pair;
 *   [1] list of six-column integer matrices, one per row of [0], with columns
 *       anchor fragment, target fragment (both 1-based), anchor position, target
 *       position, anchor length and target length (lengths negated for reverse strand);
 *   [2] counts: total, duplicated, filtered, mapped;
 *   [3] counts: dangling ends, self-circles;
 *   [4] number of singletons;
 *   [5] chimera counts: total, mapped, multi-segment, invalid.
 *
 * Throws std::runtime_error on malformed or inconsistent input.
 */
SEXP internal_loop (const base_finder * const ffptr, status (*check_self_status)(const segment&, const segment&), const check_invalid_chimera * const icptr,
		SEXP pairlen, SEXP chrs, SEXP pos, SEXP flag, SEXP cigar, SEXP mapqual, SEXP chimera_strict, SEXP minqual, SEXP do_dedup) {

	// Checking input values.
	if (!isInteger(pairlen)) { throw std::runtime_error("length of pairs must be an integer vector"); }
	if (!isInteger(chrs)) { throw std::runtime_error("chromosomes must be an integer vector"); }
	if (!isInteger(pos)) { throw std::runtime_error("positions must be an integer vector"); }
	if (!isInteger(flag)) { throw std::runtime_error("SAM flags must be an integer vector"); }
	if (!isString(cigar)) { throw std::runtime_error("CIGAR strings must be a character vector"); }
	if (!isInteger(mapqual)) { throw std::runtime_error("mapping quality must be an integer vector"); }
	const int nreads=LENGTH(chrs);
	if (LENGTH(pos)!=nreads || LENGTH(flag)!=nreads || LENGTH(cigar)!=nreads || LENGTH(mapqual)!=nreads) {
		throw std::runtime_error("lengths of vectors of read information are not consistent");
	}
	if (!isLogical(chimera_strict) || LENGTH(chimera_strict)!=1) { throw std::runtime_error("chimera removal specification should be a logical scalar"); }
	const int npairs=LENGTH(pairlen);
	if (!isLogical(do_dedup) || LENGTH(do_dedup)!=1) { throw std::runtime_error("duplicate removal specification should be a logical scalar"); }
	if (!isInteger(minqual) || LENGTH(minqual)!=1) { throw std::runtime_error("minimum mapping quality should be an integer scalar"); }

	// Initializing pointers.
	const int* cptr=INTEGER(chrs);
	const int* pptr=INTEGER(pos);
	const int* fptr=INTEGER(flag);
	const int* qptr=INTEGER(mapqual);
	const bool rm_invalid=asLogical(chimera_strict);
	const bool rm_dup=asLogical(do_dedup);
	const int minq=asInteger(minqual);
	// Integer NA must be compared against NA_INTEGER; ISNA() only recognises NA doubles.
	const bool rm_min=(minq!=NA_INTEGER);
	const int * plptr=INTEGER(pairlen);
	const size_t nc=ffptr->nchrs();

	// Constructing output containers; collected[i][j] is only defined for j <= i.
	std::deque<std::deque<std::deque<valid_pair> > > collected(nc);
	for (size_t i=0; i<nc; ++i) { collected[i].resize(i+1); }
	std::deque<segment> read1, read2;
	segment current;
	valid_pair curpair;
	int single=0;
	int total=0, dupped=0, filtered=0, mapped=0;
	int dangling=0, selfie=0;
	int total_chim=0, mapped_chim=0, multi_chim=0, inv_chimeras=0;

	// Running through all reads and identifying the interaction they represent.
	int index=0, limit, pindex=0;
	while (index < nreads) {
		read1.clear();
		read2.clear();
		if (pindex==npairs) { throw std::runtime_error("ran out of pairs before running out of reads"); }
		const int& curpl=plptr[pindex];
		++pindex;
		limit=index+curpl;
		if (limit > nreads) { throw std::runtime_error("ran out of reads before running out of pairs"); }

		// Various flags that will be needed.
		bool isdup=false, isunmap=false, ischimera=false,
		     isfirst=false, hasfirst=false, hassecond=false,
		     curdup=false, curunmap=false;

		// Running through and collecting read segments.
		while (index < limit) {
			const int& curflag=fptr[index];
			current.reverse=(curflag & 0x10);
			current.chrid=cptr[index];
			current.pos=pptr[index];
			parse_cigar(CHAR(STRING_ELT(cigar, index)), current.alen, current.offset, current.reverse);

			// Checking how we should proceed; whether we should bother adding it or not.
			curdup=(curflag & 0x400);
			curunmap=(curflag & 0x4 || (rm_min && qptr[index] < minq));
			if (current.offset==0) {
				// Pair-level duplicate/unmapped status is taken from zero-offset alignments only.
				if (curdup) { isdup=true; }
				if (curunmap) { isunmap=true; }
			} else {
				ischimera=true;
			}

			// Checking what it is.
			isfirst = (curflag & 0x40);
			if (isfirst) { hasfirst=true; }
			else { hassecond=true; }

			// Checking which deque to put it in, if we're going to keep it.
			// Zero-offset segments go to the front so that front() is the 5' segment.
			if (! (curdup && rm_dup) && ! curunmap) {
				std::deque<segment>& current_reads=(isfirst ? read1 : read2);
				if (current.offset==0) {
					current_reads.push_front(current);
				} else {
					current_reads.push_back(current);
				}
			}
			++index;
		}

		// Skipping if it's a singleton; otherwise, reporting it as part of the total read pairs.
		if (! (hasfirst && hassecond)) {
			++single;
			continue;
		}
		++total;

		// Adding to other statistics.
		if (ischimera) { ++total_chim; }
		if (isdup) { ++dupped; }
		if (isunmap) { ++filtered; }

		/* Skipping if unmapped, marked (and we're removing them), and if the first alignment
		 * of either read has any hard 5' clipping. This means that it's not truly 5' terminated
		 * (e.g. the actual 5' end was unmapped, duplicate removed or whatever). Note that
		 * not skipping UNMAP or DUP does not imply non-empty sets, as UNMAP/DUP are only set
		 * for 0-offset alignments; if this isn't in the file, these flags won't get set, but
		 * the sets can still be empty if non-zero-offset alignments are present and filtered
		 * (to escape the singles clause above). Thus, we need to check non-emptiness explicitly.
 		 */
		if (isunmap || (rm_dup && isdup) || read1.empty() || read2.empty() || read1.front().offset || read2.front().offset) { continue; }
		++mapped;

		// Assigning fragment IDs, if everything else is good.
		for (size_t i1=0; i1<read1.size(); ++i1) {
			segment& seg=read1[i1];
			seg.fragid=ffptr->find_fragment(seg.chrid, seg.pos, seg.reverse, seg.alen);
		}
		for (size_t i2=0; i2<read2.size(); ++i2) {
			segment& seg=read2[i2];
			seg.fragid=ffptr->find_fragment(seg.chrid, seg.pos, seg.reverse, seg.alen);
		}

		// Determining the type of construct if they have the same ID.
		switch ((*check_self_status)(read1.front(), read2.front())) {
			case ISPET:
				++dangling;
				continue;
			case ISMATE:
				++selfie;
				continue;
			default:
				break;
		}

		// Pulling out chimera diagnostics.
		if (ischimera) {
			++mapped_chim;
 		   	++multi_chim;	
			bool invalid=false;
			if (read1.size()==1 && read2.size()==1) {
				// Only one segment left per read, so it does not count as multi-segment.
				--multi_chim;
			} else if (read1.size() > 2 || read2.size() > 2) {
				invalid=true;
			} else {
				invalid=(*icptr)(read1, read2);
			}
			if (invalid) {
				++inv_chimeras;
				if (rm_invalid) { continue; }
			}
		}
		
		// Choosing the anchor segment, and reporting it. The anchor is the read with the
		// larger chromosome index, then the larger fragment index, then the larger position.
		bool anchor=false;
		if (read1.front().chrid > read2.front().chrid) {
 		   anchor=true;
	   	} else if (read1.front().chrid==read2.front().chrid) {
			if (read1.front().fragid > read2.front().fragid) {
				anchor=true;
			} else if (read1.front().fragid == read2.front().fragid) {
				if (read1.front().pos > read2.front().pos) {
					anchor=true;
				}
			}
		}
		const segment& anchor_seg=(anchor ? read1.front() : read2.front());
		const segment& target_seg=(anchor ? read2.front() : read1.front());
		
		curpair.anchor=anchor_seg.fragid;
		curpair.target=target_seg.fragid;
		curpair.apos=anchor_seg.pos;
		curpair.alen=anchor_seg.alen;
		if (anchor_seg.reverse) { curpair.alen*=-1; }
		curpair.tpos=target_seg.pos;
		curpair.tlen=target_seg.alen;
		if (target_seg.reverse) { curpair.tlen*=-1; }

		if (curpair.alen==0 || curpair.tlen==0) { throw std::runtime_error("alignment lengths of zero should not be present"); }

		// Guarding the container lookup; the conversion to size_t also rejects negative indices.
		const size_t achr=static_cast<size_t>(anchor_seg.chrid);
		const size_t tchr=static_cast<size_t>(target_seg.chrid);
		if (achr >= nc || tchr > achr) { throw std::runtime_error("chromosome index out of range for the collected pairs"); }
		collected[achr][tchr].push_back(curpair);
	}

	// Checking if all pairs were used up.
	if (pindex!=npairs) { throw std::runtime_error("ran out of reads before running out of pairs"); }

	SEXP total_output=PROTECT(allocVector(VECSXP, 6));
	try {
		// Checking how many are not (doubly) empty.
		std::deque<std::pair<int, int> > good;
		for (size_t i=0; i<nc; ++i) {
			for (size_t j=0; j<=i; ++j) {
				const std::deque<valid_pair>& curpairs=collected[i][j];
				if (!curpairs.empty()) { good.push_back(std::make_pair(i, j)); }
			}
		}	

		// Matrices are filled by column: first column for anchors, second for targets.
		SET_VECTOR_ELT(total_output, 0, allocMatrix(INTSXP, good.size(), 2));
		int* aptr=INTEGER(VECTOR_ELT(total_output, 0));
		int* tptr=aptr+good.size();
		SET_VECTOR_ELT(total_output, 1, allocVector(VECSXP, good.size()));
		SEXP output=VECTOR_ELT(total_output, 1);

		for (size_t i=0; i<good.size(); ++i) {
			aptr[i]=good[i].first+1;
			tptr[i]=good[i].second+1;

			// Filling up those non-empty pairs of chromosomes.
			std::deque<valid_pair>& curpairs=collected[good[i].first][good[i].second];
			SET_VECTOR_ELT(output, i, allocMatrix(INTSXP, curpairs.size(), 6));
			int* axptr=INTEGER(VECTOR_ELT(output, i));
			int* txptr=axptr+curpairs.size();
			int* apxptr=txptr+curpairs.size();
			int* tpxptr=apxptr+curpairs.size();
			int* afxptr=tpxptr+curpairs.size();
			int* tfxptr=afxptr+curpairs.size();
			for (size_t k=0; k<curpairs.size(); ++k) {
				axptr[k]=curpairs[k].anchor+1;
				txptr[k]=curpairs[k].target+1;
				apxptr[k]=curpairs[k].apos;
				tpxptr[k]=curpairs[k].tpos;
				afxptr[k]=curpairs[k].alen;
				tfxptr[k]=curpairs[k].tlen;
			}

			// Emptying out the container once we've processed it, to keep memory usage down.
			std::deque<valid_pair>().swap(curpairs);
		}

		// Dumping mapping diagnostics.
		SET_VECTOR_ELT(total_output, 2, allocVector(INTSXP, 4));
		int* dptr=INTEGER(VECTOR_ELT(total_output, 2));
		dptr[0]=total;
		dptr[1]=dupped;
		dptr[2]=filtered;
		dptr[3]=mapped;
	
		// Dumping the number of dangling ends, self-circles.	
		SET_VECTOR_ELT(total_output, 3, allocVector(INTSXP, 2));
		int * siptr=INTEGER(VECTOR_ELT(total_output, 3));
		siptr[0]=dangling;
		siptr[1]=selfie;

		// Dumping the number designated 'single', as there's no pairs.
		SET_VECTOR_ELT(total_output, 4, ScalarInteger(single));

		// Dumping chimeric diagnostics.
		SET_VECTOR_ELT(total_output, 5, allocVector(INTSXP, 4));
		int* chimptr=INTEGER(VECTOR_ELT(total_output, 5));
		chimptr[0]=total_chim;
		chimptr[1]=mapped_chim;
		chimptr[2]=multi_chim;
		chimptr[3]=inv_chimeras;
	} catch (...) {
		// Balance the protection stack for any exception type before propagating it.
		UNPROTECT(1);
		throw;
	}
	UNPROTECT(1);
	return total_output;
}