/**
 * R-callable wrapper around parse_cigar(), intended for testing it on a single CIGAR string.
 *
 * @param incoming character vector of length 1 holding the CIGAR string.
 * @param reverse  logical vector of length 1, forwarded to parse_cigar() as its last argument.
 * @return an integer vector of length 2 holding the `alen` and `offset` values written by
 *         parse_cigar(), in that order. If any std::exception is raised (including the
 *         argument checks below), a character vector holding the exception message is
 *         returned instead.
 */
SEXP test_parse_cigar (SEXP incoming, SEXP reverse) try {
    if (!isString(incoming) || LENGTH(incoming)!=1) {
        throw std::runtime_error("need one cigar string");
    }
    if (!isLogical(reverse) || LENGTH(reverse)!=1) {
        throw std::runtime_error("need a reverse specifier");
    }

    // Parse into plain locals *before* allocating the R vector: if parse_cigar() throws,
    // nothing has been PROTECTed yet, so the handler below can return without leaving
    // R's protection stack unbalanced.
    int alen=0;
    int offset=0;
    parse_cigar(CHAR(STRING_ELT(incoming, 0)), alen, offset, asLogical(reverse));

    SEXP output=PROTECT(allocVector(INTSXP, 2));
    int* optr=INTEGER(output);
    optr[0]=alen;
    optr[1]=offset;
    UNPROTECT(1);
    return output;
} catch (const std::exception& e) {
    return mkString(e.what());
}
// multi sample variant caller: CRISP, PICALL or low coverage method int multisampleVC(struct OPTIONS* options,REFLIST* reflist,FILE* fp) { if (USE_DUPLICATES ==1) BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL); else BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); int bamfiles = options->bamfiles; int last=0; // last is the current position s.t. all reads have starting position > last int i=0; int h=0; unsigned long reads=0; int j=0; int prev_tid = -1; int rf=0; int finishedfiles =0; struct alignedread* pread = NULL; struct BAMFILE_data* bamfiles_data = calloc(bamfiles,sizeof(struct BAMFILE_data)); // added one extra to list to store indels for all samples combined READQUEUE* RQ = (READQUEUE*)malloc(sizeof(READQUEUE)); RQ->first = NULL; RQ->last = NULL; RQ->reads = 0; int* fcigarlist = (int*)malloc(sizeof(int)*4096); // data structure for holding potential variants and read counts, etc struct VARIANT variant; variant.ploidy = calloc(options->bamfiles,sizeof(int)); init_poolsizes(&variant,options,PICALL); init_variant(&variant,options->bamfiles,options->bamfiles); variant.options = options; // pointer to options BAMHEAP bheap; bheap.harray = (int*)malloc(sizeof(int)*bamfiles); bheap.length = bamfiles; for (i=0;i<bamfiles;i++) { bheap.harray[i] = i; bamfiles_data[i].finished= 0;} reflist->cinterval = -1; // first interval to the right of current base init_bamfiles(bamfiles_data,options->bamfilelist,bamfiles,options->regions,&options->targettid,&options->targetstart,&options->targetend); // error when reading indexed bam files probably due to lack of reads in some files resulting in heap error, fixed oct 17 2012 j=0; for (i=0;i<bamfiles;i++) { finishedfiles += bamfiles_data[i].finished; if (bamfiles_data[i].finished ==0) bheap.harray[j++] = i; else bheap.length--; } buildminheap(&bheap,bamfiles_data); // initial minheap call //fprintf(stderr,"finishedfiles %d \n",finishedfiles); if (INDEL_REALIGNMENT >=1) 
allocate_mem_heap(bamfiles_data,bamfiles,100); HAPLOTYPES =0,MIN_COVERAGE_FLANKING =0; for (i=0;i<variant.samples;i++) { MIN_COVERAGE_FLANKING += 2*variant.ploidy[i]; // enforced for regions outside the bedfile target HAPLOTYPES += variant.ploidy[i]; } //int min_coverage_target = 1*variant->ploidy*variant->samples; // enforced for regions outside the bedfile target int offset_readlength = 150; // call variants in window (last,current_read_position-offset_readlength) to allow for indel analysis, set to 0 for original behavior of program // the value of offset should not affect the correctness or speed of the code int current_position =0; while (finishedfiles < bamfiles) { i = bheap.harray[0]; // take the top read off the heap if ( !(bamfiles_data[i].read->flag & BAM_FILTER_MASK)) { if (bamfiles_data[i].read->tid != prev_tid) // read's chromosome is different from previousread { if (prev_tid >=0) // finish the processing of previous chromosome and cleanup { if (RQ->reads >0) { fprintf(stderr,"processing %d reads left in queue for chrom %s...",RQ->reads,reflist->names[prev_tid]); callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant); empty_queue(RQ,bamfiles_data); //clean thequeue } if (INDEL_REALIGNMENT >=1) clean_indel_lists(bamfiles_data,bamfiles,-1); current_position = 0; for(j=0;j<bamfiles;j++) bamfiles_data[j].last=NULL; last =0; free(reflist->sequences[prev_tid]); fprintf(stderr,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]); fprintf(stdout,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]); reflist->cinterval = -1; // reset to -1 } read_chromosome(reflist,bamfiles_data[i].read->tid,fp); prev_tid =bamfiles_data[i].read->tid; } if (bamfiles_data[i].read->position <last) { fprintf(stderr,"reads out of order i:%d h:%d pos: %d %d\n",i,h,bamfiles_data[i].read->position,last); fprintf(stderr,"the program will now exit, please sort the bamfiles\n"); return 1; } if 
(INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->position > current_position+offset_readlength) { // need to clean up indel lists when we encounter a new chromosome... print_indel_lists(bamfiles_data,bamfiles,current_position+offset_readlength); clean_indel_lists(bamfiles_data,bamfiles,current_position); current_position = bamfiles_data[i].read->position; } // realign reads before calling variants, each read is realigned only once // small bug here, only call variants when last is less than current read position // bug fixed here, update last only when 'callvariants' is invoked, ??? if (RQ->reads > 0 && bamfiles_data[i].read->position > last+offset_readlength) { callvariants(reflist,bamfiles_data[i].read->tid,last,bamfiles_data[i].read->position-offset_readlength,RQ,bamfiles_data,options,&variant); } last = bamfiles_data[i].read->position-offset_readlength; if (last < 0) last =0; bamfiles_data[i].read->cflag = 0; // this function should only be called on reads inside/close_to targeted regions.. 
parse_cigar(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,fcigarlist); if (INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->gaps > 0 && bamfiles_data[i].read->mquality >= 20) extract_indel_reads(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,i,bamfiles_data[i].ilist); //fprintf(stdout,"read s:%d IS:%d %s %d \n",i,bamfiles_data[i].read->IS,bamfiles_data[i].read->readid,bamfiles_data[i].read->position); if (RQ->last == NULL) { RQ->last = bamfiles_data[i].read; RQ->first = RQ->last; (RQ->last)->next = NULL; RQ->reads++; } else { (RQ->last)->next = bamfiles_data[i].read; RQ->last = bamfiles_data[i].read; (RQ->last)->next = NULL; RQ->reads++; } if (bamfiles_data[i].last ==NULL) bamfiles_data[i].first = RQ->last; else bamfiles_data[i].last->nextread= RQ->last; bamfiles_data[i].last = RQ->last; (RQ->last)->nextread =NULL; // read that passes filters from 'i'th bam file is inserted in queue, should also add it to OPE queue //if (bamfiles_data[i].read->position < bamfiles_data[i].read->mateposition && bamfiles_data[i].read->lastpos > bamfiles_data[i].read->mateposition) //fprintf(stdout,"B %d %s %d %d %d \n",i,bamfiles_data[i].read->readid,bamfiles_data[i].read->position,bamfiles_data[i].read->mateposition,bamfiles_data[i].read->IS); } else free_read(bamfiles_data[i].read); //fprintf(stdout,"read from %d %d %s\n",i,bamfiles_data[i].read->position,bamfiles_data[i].read->readid); if (options->regions ==NULL) rf =samread(bamfiles_data[i].fp,bamfiles_data[i].b); else rf = bam_iter_read(bamfiles_data[i].fp->x.bam,bamfiles_data[i].iter,bamfiles_data[i].b); if (rf >=0) { bamfiles_data[i].read = get_read_bamfile(bamfiles_data[i].b,bamfiles_data[i].fp,pread); //if (options->samples ==0) bamfiles_data[i].read->sampleid = i; //else bamfiles_data[i].read->sampleid = options->BAM_TO_SAMPLE[i]; // bug here june 30 2013 commented out .... 
in 12 T2D pools bamfiles_data[i].read->sampleid = i; if (!(bamfiles_data[i].read->flag & BAM_FILTER_MASK)) minHeapify(&bheap,0,bamfiles_data); } else // no more reads in file 'i' { bamfiles_data[i].finished = 1; bamfiles_data[i].read= NULL; bam_destroy1(bamfiles_data[i].b); h++; finishedfiles++; //fprintf(stderr,"finished reading bam file %s \n",options->bamfilelist[i]); //return 1; bheap.harray[0] = bheap.harray[bheap.length-1]; bheap.length--; if (bheap.length > 0) minHeapify(&bheap,0,bamfiles_data); // call minheapify like function to push sample i off the heap, reduce heap size } if ((++reads)%1000000 ==0 && RQ->reads >0) fprintf(stderr,".....processed %ld reads QSIZE:%d %s:%d:%d variants called %d\n",reads,RQ->reads,RQ->first->chrom,RQ->first->position,RQ->first->lastpos,VARIANTS_CALLED); } if (prev_tid >=0) // finish the processing of last chromosome { if (RQ->reads >0) { fprintf(stderr,"processing %d reads left in queue for chrom %s.....",RQ->reads,reflist->names[prev_tid]); if (reflist->lengths[prev_tid] > last) callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant); empty_queue(RQ,bamfiles_data); //clean thequeue } else fprintf(stderr,"queue for chrom %s is empty ",reflist->names[prev_tid]); free(reflist->sequences[prev_tid]); fprintf(stderr,"finished processing reads for chrom %s \n\n",reflist->names[prev_tid]); if (INDEL_REALIGNMENT >=1) { print_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]); clean_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]); } } fprintf(stderr,"CRISP has finished processing bam files: total reads processed %ld total variants called %d \n\n",reads,VARIANTS_CALLED); //for (i=0;i<bamfiles;i++) bam_destroy1(bamfiles_data[i].b); free(bamfiles_data); free(bheap.harray); free(fcigarlist); //empty_queue(RQ); //clean thequeue //fprintf(stdout,"FILE %d %s %d %s %d %d %d mapped %d \n",i,read->readid,read->flag,read->chrom,read->position,read->mquality,read->IS,(read->flag 
&4)); return 1; }
/**
 * Walks through name-grouped read alignments, classifies every read pair and collects the
 * valid pairs per (anchor chromosome, target chromosome) combination.
 *
 * @param ffptr             finder used to map an alignment to a fragment index (find_fragment())
 *                          and to obtain the number of chromosomes (nchrs()).
 * @param check_self_status callback applied to the first segment of each read; a result of
 *                          ISPET counts the pair as a dangling end, ISMATE as a self-circle,
 *                          and both cause the pair to be skipped.
 * @param icptr             functor deciding whether a chimeric pair is invalid.
 * @param pairlen           integer vector; number of alignment records belonging to each pair.
 * @param chrs,pos,flag,cigar,mapqual
 *                          parallel vectors (integer, except character `cigar`) with one entry
 *                          per alignment record; `chrs` holds 0-based chromosome indices.
 * @param chimera_strict    logical scalar; if true, invalid chimeras are dropped.
 * @param minqual           integer scalar; records with a mapping quality below this are
 *                          treated as unmapped. NA disables the filter.
 * @param do_dedup          logical scalar; if true, records flagged as duplicates are dropped.
 *
 * @return list of length 6:
 *   [0] integer matrix (2 columns): 1-based anchor/target chromosome index of each non-empty
 *       chromosome pair;
 *   [1] list with, per such pair, a 6-column integer matrix: anchor fragment, target fragment
 *       (both 1-based), anchor position, target position, anchor length, target length
 *       (lengths are negated for reverse-strand segments);
 *   [2] integer(4): total pairs, pairs marked duplicate, pairs filtered as unmapped, mapped pairs;
 *   [3] integer(2): dangling ends, self-circles;
 *   [4] integer(1): singletons;
 *   [5] integer(4): total chimeras, mapped chimeras, multi-segment chimeras, invalid chimeras.
 *
 * @throws std::runtime_error if an argument has the wrong type or length, if `pairlen` is
 *         inconsistent with the number of records, if a retained segment has a chromosome
 *         index outside [0, nchrs()), or if a zero alignment length is encountered.
 */
SEXP internal_loop (const base_finder * const ffptr, status (*check_self_status)(const segment&, const segment&), const check_invalid_chimera * const icptr,
        SEXP pairlen, SEXP chrs, SEXP pos, SEXP flag, SEXP cigar, SEXP mapqual, SEXP chimera_strict, SEXP minqual, SEXP do_dedup) {

    // Checking input values.
    if (!isInteger(pairlen)) { throw std::runtime_error("length of pairs must be an integer vector"); }
    if (!isInteger(chrs)) { throw std::runtime_error("chromosomes must be an integer vector"); }
    if (!isInteger(pos)) { throw std::runtime_error("positions must be an integer vector"); }
    if (!isInteger(flag)) { throw std::runtime_error("SAM flags must be an integer vector"); }
    if (!isString(cigar)) { throw std::runtime_error("CIGAR strings must be a character vector"); }
    if (!isInteger(mapqual)) { throw std::runtime_error("mapping quality must be an integer vector"); }
    const int nreads=LENGTH(chrs);
    if (LENGTH(pos)!=nreads || LENGTH(flag)!=nreads || LENGTH(cigar)!=nreads || LENGTH(mapqual)!=nreads) {
        throw std::runtime_error("lengths of vectors of read information are not consistent");
    }
    if (!isLogical(chimera_strict) || LENGTH(chimera_strict)!=1) {
        throw std::runtime_error("chimera removal specification should be a logical scalar");
    }
    const int npairs=LENGTH(pairlen);
    if (!isLogical(do_dedup) || LENGTH(do_dedup)!=1) {
        throw std::runtime_error("duplicate removal specification should be a logical scalar");
    }
    if (!isInteger(minqual) || LENGTH(minqual)!=1) {
        throw std::runtime_error("minimum mapping quality should be an integer scalar");
    }

    // Initializing pointers.
    const int* cptr=INTEGER(chrs);
    const int* pptr=INTEGER(pos);
    const int* fptr=INTEGER(flag);
    const int* qptr=INTEGER(mapqual);
    const bool rm_invalid=asLogical(chimera_strict);
    const bool rm_dup=asLogical(do_dedup);
    const int minq=asInteger(minqual);
    // ISNA() only recognises the *double* NA; an integer NA must be compared to NA_INTEGER.
    const bool rm_min=(minq!=NA_INTEGER);
    const int * plptr=INTEGER(pairlen);
    const size_t nc=ffptr->nchrs();

    // Constructing output containers: a lower-triangular layout, collected[anchor][target]
    // with target <= anchor.
    std::deque<std::deque<std::deque<valid_pair> > > collected(nc);
    for (size_t i=0; i<nc; ++i) { collected[i].resize(i+1); }
    std::deque<segment> read1, read2;
    segment current;
    valid_pair curpair;
    int single=0;
    int total=0, dupped=0, filtered=0, mapped=0;
    int dangling=0, selfie=0;
    int total_chim=0, mapped_chim=0, multi_chim=0, inv_chimeras=0;

    // Running through all reads and identifying the interaction they represent.
    int index=0, limit, pindex=0;
    while (index < nreads) {
        read1.clear();
        read2.clear();
        if (pindex==npairs) { throw std::runtime_error("ran out of pairs before running out of reads"); }
        const int& curpl=plptr[pindex];
        ++pindex;
        // Same test as 'index+curpl > nreads', phrased so that the sum cannot overflow.
        if (curpl > nreads-index) { throw std::runtime_error("ran out of reads before running out of pairs"); }
        limit=index+curpl;

        // Various flags that will be needed.
        bool isdup=false, isunmap=false, ischimera=false,
             isfirst=false, hasfirst=false, hassecond=false,
             curdup=false, curunmap=false;

        // Running through and collecting read segments.
        while (index < limit) {
            const int& curflag=fptr[index];
            current.reverse=(curflag & 0x10);
            current.chrid=cptr[index];
            current.pos=pptr[index];
            parse_cigar(CHAR(STRING_ELT(cigar, index)), current.alen, current.offset, current.reverse);

            // Checking how we should proceed; whether we should bother adding it or not.
            curdup=(curflag & 0x400);
            curunmap=(curflag & 0x4 || (rm_min && qptr[index] < minq));
            if (current.offset==0) {
                if (curdup) { isdup=true; }
                if (curunmap) { isunmap=true; }
            } else {
                ischimera=true;
            }

            // Checking what it is.
            isfirst = (curflag & 0x40);
            if (isfirst) { hasfirst=true; }
            else { hassecond=true; }

            // Checking which deque to put it in, if we're going to keep it.
            // Zero-offset segments go to the front so that front() is the 5' segment.
            if (! (curdup && rm_dup) && ! curunmap) {
                std::deque<segment>& current_reads=(isfirst ? read1 : read2);
                if (current.offset==0) {
                    current_reads.push_front(current);
                } else {
                    current_reads.push_back(current);
                }
            }
            ++index;
        }

        // Skipping if it's a singleton; otherwise, reporting it as part of the total read pairs.
        if (! (hasfirst && hassecond)) {
            ++single;
            continue;
        }
        ++total;

        // Adding to other statistics.
        if (ischimera) { ++total_chim; }
        if (isdup) { ++dupped; }
        if (isunmap) { ++filtered; }

        /* Skipping if unmapped, marked (and we're removing them), and if the first alignment
         * of either read has any hard 5' clipping. This means that it's not truly 5' terminated
         * (e.g. the actual 5' end was unmapped, duplicate removed or whatever). Note that
         * not skipping UNMAP or DUP does not imply non-empty sets, as UNMAP/DUP are only set
         * for 0-offset alignments; if this isn't in the file, these flags won't get set, but
         * the sets can still be empty if non-zero-offset alignments are present and filtered
         * (to escape the singles clause above). Thus, we need to check non-emptiness explicitly.
         */
        if (isunmap || (rm_dup && isdup) || read1.empty() || read2.empty() || read1.front().offset || read2.front().offset) { continue; }
        ++mapped;

        // Assigning fragment IDs, if everything else is good. The chromosome index is
        // validated first, as it is used to index per-chromosome containers below.
        for (size_t i1=0; i1<read1.size(); ++i1) {
            segment& seg=read1[i1];
            if (seg.chrid < 0 || static_cast<size_t>(seg.chrid) >= nc) {
                throw std::runtime_error("chromosome index out of range");
            }
            seg.fragid=ffptr->find_fragment(seg.chrid, seg.pos, seg.reverse, seg.alen);
        }
        for (size_t i2=0; i2<read2.size(); ++i2) {
            segment& seg=read2[i2];
            if (seg.chrid < 0 || static_cast<size_t>(seg.chrid) >= nc) {
                throw std::runtime_error("chromosome index out of range");
            }
            seg.fragid=ffptr->find_fragment(seg.chrid, seg.pos, seg.reverse, seg.alen);
        }

        // Determining the type of construct if they have the same ID.
        switch ((*check_self_status)(read1.front(), read2.front())) {
            case ISPET:
                ++dangling;
                continue;
            case ISMATE:
                ++selfie;
                continue;
            default:
                break;
        }

        // Pulling out chimera diagnostics.
        if (ischimera) {
            ++mapped_chim;
            ++multi_chim;
            bool invalid=false;
            if (read1.size()==1 && read2.size()==1) {
                // Only one segment survived per read, so not a multi-segment chimera.
                --multi_chim;
            } else if (read1.size() > 2 || read2.size() > 2) {
                invalid=true;
            } else {
                invalid=(*icptr)(read1, read2);
            }
            if (invalid) {
                ++inv_chimeras;
                if (rm_invalid) { continue; }
            }
        }

        // Choosing the anchor segment, and reporting it. The anchor is the segment with the
        // larger chromosome index, then the larger fragment index, then the larger position.
        bool anchor=false;
        if (read1.front().chrid > read2.front().chrid) {
            anchor=true;
        } else if (read1.front().chrid==read2.front().chrid) {
            if (read1.front().fragid > read2.front().fragid) {
                anchor=true;
            } else if (read1.front().fragid == read2.front().fragid) {
                if (read1.front().pos > read2.front().pos) {
                    anchor=true;
                }
            }
        }
        const segment& anchor_seg=(anchor ? read1.front() : read2.front());
        const segment& target_seg=(anchor ? read2.front() : read1.front());

        curpair.anchor=anchor_seg.fragid;
        curpair.target=target_seg.fragid;
        curpair.apos=anchor_seg.pos;
        curpair.alen=anchor_seg.alen;
        if (anchor_seg.reverse) { curpair.alen*=-1; }
        curpair.tpos=target_seg.pos;
        curpair.tlen=target_seg.alen;
        if (target_seg.reverse) { curpair.tlen*=-1; }

        if (curpair.alen==0 || curpair.tlen==0) { throw std::runtime_error("alignment lengths of zero should not be present"); }
        collected[anchor_seg.chrid][target_seg.chrid].push_back(curpair);
    }

    // Checking if all pairs were used up.
    if (pindex!=npairs) { throw std::runtime_error("ran out of reads before running out of pairs"); }

    SEXP total_output=PROTECT(allocVector(VECSXP, 6));
    try {
        // Checking how many are not (doubly) empty.
        std::deque<std::pair<int, int> > good;
        for (size_t i=0; i<nc; ++i) {
            for (size_t j=0; j<=i; ++j) {
                const std::deque<valid_pair>& curpairs=collected[i][j];
                if (!curpairs.empty()) { good.push_back(std::make_pair(i, j)); }
            }
        }

        SET_VECTOR_ELT(total_output, 0, allocMatrix(INTSXP, good.size(), 2));
        int* aptr=INTEGER(VECTOR_ELT(total_output, 0));
        int* tptr=aptr+good.size();
        SET_VECTOR_ELT(total_output, 1, allocVector(VECSXP, good.size()));
        SEXP output=VECTOR_ELT(total_output, 1);

        for (size_t i=0; i<good.size(); ++i) {
            aptr[i]=good[i].first+1;
            tptr[i]=good[i].second+1;

            // Filling up those non-empty pairs of chromosomes (column-major, one column per field).
            std::deque<valid_pair>& curpairs=collected[good[i].first][good[i].second];
            SET_VECTOR_ELT(output, i, allocMatrix(INTSXP, curpairs.size(), 6));
            int* axptr=INTEGER(VECTOR_ELT(output, i));
            int* txptr=axptr+curpairs.size();
            int* apxptr=txptr+curpairs.size();
            int* tpxptr=apxptr+curpairs.size();
            int* afxptr=tpxptr+curpairs.size();
            int* tfxptr=afxptr+curpairs.size();
            for (size_t k=0; k<curpairs.size(); ++k) {
                axptr[k]=curpairs[k].anchor+1;
                txptr[k]=curpairs[k].target+1;
                apxptr[k]=curpairs[k].apos;
                tpxptr[k]=curpairs[k].tpos;
                afxptr[k]=curpairs[k].alen;
                tfxptr[k]=curpairs[k].tlen;
            }

            // Emptying out the container once we've processed it, to keep memory usage down.
            std::deque<valid_pair>().swap(curpairs);
        }

        // Dumping mapping diagnostics.
        SET_VECTOR_ELT(total_output, 2, allocVector(INTSXP, 4));
        int* dptr=INTEGER(VECTOR_ELT(total_output, 2));
        dptr[0]=total;
        dptr[1]=dupped;
        dptr[2]=filtered;
        dptr[3]=mapped;

        // Dumping the number of dangling ends, self-circles.
        SET_VECTOR_ELT(total_output, 3, allocVector(INTSXP, 2));
        int * siptr=INTEGER(VECTOR_ELT(total_output, 3));
        siptr[0]=dangling;
        siptr[1]=selfie;

        // Dumping the number designated 'single', as there's no pairs.
        SET_VECTOR_ELT(total_output, 4, ScalarInteger(single));

        // Dumping chimeric diagnostics.
        SET_VECTOR_ELT(total_output, 5, allocVector(INTSXP, 4));
        int* chimptr=INTEGER(VECTOR_ELT(total_output, 5));
        chimptr[0]=total_chim;
        chimptr[1]=mapped_chim;
        chimptr[2]=multi_chim;
        chimptr[3]=inv_chimeras;
    } catch (std::exception& e) {
        // Balance the protection stack before letting the caller handle the error.
        UNPROTECT(1);
        throw;
    }

    UNPROTECT(1);
    return total_output;
}