/* Recursive binary search; returns -1 when the value is not found.
 *
 * value    - value to look for.
 * min, max - inclusive index range of v to search; an empty range (min > max)
 *            yields -1.
 * v        - array searched by halving the range around the middle element;
 *            the halving only finds the value if v is sorted in ascending order
 *            over [min, max].
 *
 * Returns the index of an element equal to value, or -1 if there is none.
 */
int binarySearch2(int value, int min, int max, int v[])
{
    if (min > max)
        return -1;

    /* min + (max - min) / 2 instead of (min + max) / 2: the sum of two large
     * indices can overflow a signed int, the difference cannot for min <= max. */
    const int middle = min + (max - min) / 2;

    if (v[middle] == value)
        return middle;
    if (value < v[middle])
        return binarySearch2(value, min, middle - 1, v);
    return binarySearch2(value, middle + 1, max, v);
}
// Binary search using recursion.
//
// Searches the inclusive index range [low, high] of `a` for `x`; the halving
// only finds `x` if `a` is sorted in ascending order over that range.
// Prints one status line to std::cout and returns the index of `x`, or -1 if
// the range is exhausted without a match.
//
// The definition used to share a physical line with the leading `//` comment,
// which turned the whole function into comment text; it now stands on its own
// lines so that it is actually compiled.
int binarySearch2(int a[], int low, int high, int x)
{
    if (low > high) {
        std::cout<<"binary search2: Did not find the number. \n";
        return -1;
    }

    // Overflow-safe midpoint of the inclusive range.
    int mid = low + (high - low)/2;

    if (x == a[mid]) {
        std::cout<< "binary search2: found the number in index "<<mid<<std::endl;
        return mid;
    }
    if (x < a[mid])
        return binarySearch2(a, low, mid - 1, x);
    return binarySearch2(a, mid + 1, high, x);
}
// Demo driver: runs both binary-search variants over a fixed sorted array.
// Return values of the searches are ignored; always returns 0.
int main(int argc, const char * argv[])
{
    int myarray[] = {0,1,2,3,4,5,6,16,20,34,54,89};

    // binarySearch receives the element count (as before).
    binarySearch(myarray, sizeof(myarray)/sizeof(myarray[0]), -1);

    // The recursive binarySearch2(a, low, high, x) treats `high` as an inclusive
    // index (it stops on low > high and recurses with mid - 1 / mid + 1), so the
    // upper bound must be count - 1. Passing the count itself lets a miss above
    // the largest element read one slot past the end of the array.
    binarySearch2(myarray, 0, static_cast<int>(sizeof(myarray)/sizeof(myarray[0])) - 1, 34);
    return 0;
}
// Test driver: reads pairs "n t" from stdin; for each pair fills x[0..n-1] with
// 0, 10, 20, ... and prints the result of binarySearch2(x, t, n).
// Stops at end of input or at the first line that does not parse as two ints.
// Returns 0 normally, 1 if n is outside 1..MAXN.
int main()
{
    // Earlier manual check, kept for reference:
    //   int a[5]={1,3,6,9,11}; int needle = 10;
    //   printf("1:%d:%d\n",needle,binarySearch1(a,needle,0,5));
    //   printf("2:%d:%d\n",needle,binarySearch2(a,needle,5));
    int n, t;
    int x[MAXN];

    // Require both conversions to succeed: comparing against EOF alone loops
    // forever on malformed input, because scanf then returns 0 without
    // consuming the offending characters.
    while (scanf("%d%d", &n, &t) == 2) {
        // x holds MAXN elements; reject sizes that would overflow it, as well as
        // zero/negative sizes (the former assert(n) let negative n through and
        // vanished entirely under NDEBUG).
        if (n <= 0 || n > MAXN) {
            fprintf(stderr, "n must be in 1..MAXN, got %d\n", n);
            return 1;
        }
        for (int i = 0; i < n; ++i)
            x[i] = 10*i;
        printf("result:%d\n", binarySearch2(x, t, n));
    }
    return 0;
}
// stitchWindowAligns: recursively enumerates include/exclude choices for the window
// aligns WA[iA..nA-1] and records each finished transcript in wTr.
//
// NOTE(review): the original line breaks of this function appear to have been lost, so
// every `//` comment below now runs to the end of its physical line and swallows the code
// that follows it. The description here is of the statements as written; the line breaks
// need to be restored before this compiles as intended.
//
// Parameters (as used in the body):
//   iA, nA      - index of the align handled at this recursion level / number of aligns.
//   Score       - score accumulated so far; passed by value, so each branch keeps its own.
//   WAincl      - per-align inclusion flags; all cleared when the first exon is created.
//   tR2, tG2    - read / genome coordinate of the last base of the transcript built so far.
//   trA         - transcript under construction, passed by value.
//   Lread       - read length; bounds the extension at the end and enters roStart.
//   WA          - align table, read through WA_rStart, WA_gStart, WA_Length, WA_iFrag,
//                 WA_sjA, WA_Nrep and WA_Anchor.
//   R, Q, G     - arrays handed to extendAlign / stitchAlignToTranscript; Q is also summed
//                 here as the per-base score of the first align.
//   sigG        - not read here, only forwarded to the recursive calls.
//   P, RA       - parameters and per-read state; RA->maxScoreMate is updated here.
//   wTr, nWinTr - recorded transcripts of this window and their count; both are updated.
//
// Termination (iA>=nA): returns at once when tR2==0. Otherwise it
//   1. extends the transcript at the start and at the end with extendAlign, in the order
//      selected by EXTEND_ORDER;
//   2. returns without recording if any filter fails: soft clipping past the chromosome
//      ends when alignSoftClipAtReferenceEnds=="No", minimum junction overhangs,
//      inconsistent sjStr strands, undefined strand with outSAMstrandField=="intronMotif",
//      outFilterIntronMotifs (an unrecognized value goes to exitWithError), per-mate spliced
//      mapped length, junction missing from P->sjNovelStart/End when
//      outFilterBySJoutStage==2, inconsistent overlap between mates;
//   3. adds the log2 genomic-length term when scoreGenomicLengthLog2scale!=0, sets
//      roStart/maxScore/iFrag and updates RA->maxScoreMate for single-fragment transcripts;
//   4. when the score is within outFilterMultimapScoreRange of wTr[0] or of the mate's best
//      score, or chimSegmentMin>0: compares against recorded transcripts with blocksOverlap
//      (dropping subsets) and inserts the new one so that higher scores come first, ties
//      broken by smaller gLength.
// Recursion (iA<nA): builds trAi = trA plus align iA (via stitchAlignToTranscript, or as
//   the first exon), recurses with it when dScore>-1000000, then recurses without it unless
//   WA[iA][WA_Anchor]==2 and no anchor has been used yet.
//
// NOTE(review): in the mate-overlap check iexM2 stays at trA.nExons when no canonSJ==-3
//   entry is found, and exons[iexM2] is then read - confirm that case cannot occur.
void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \ uint Lread, uiWA* WA, char* R, char* Q, char* G, char* sigG,\ Parameters* P, Transcript** wTr, uint* nWinTr, ReadAlign *RA) { //recursively stitch aligns for one gene //*nWinTr - number of transcripts for the current window if (iA>=nA && tR2==0) return; //no aligns in the transcript if (iA>=nA) {//no more aligns to add, finalize the transcript //extend first Transcript trAstep1; int vOrder[2]; //decide in which order to extend: extend the 5' of the read first #if EXTEND_ORDER==1 if ( trA.roStr==0 ) {//decide in which order to extend: extend the 5' of the read first vOrder[0]=0; vOrder[1]=1; } else { vOrder[0]=1; vOrder[1]=0; }; #elif EXTEND_ORDER==2 vOrder[0]=0; vOrder[1]=1; #else #error "EXTEND_ORDER value unrecognized" #endif for (int iOrd=0;iOrd<2;iOrd++) { switch (vOrder[iOrd]) { case 0: //extend at start if (trA.rStart>0) {// if transcript does not start at base, extend to the read start //calculate # of allowed mismatches that has been left double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[0][EX_iFrag]==0 && trA.Str==0) ? -1 : P->outFilterMismatchNoverLmax1; trAstep1.reset(); // //avoid extending before Chr start if ( extendAlign(R, Q, G, trA.rStart-1, trA.gStart-1, -1, -1, min(trA.rStart, trA.gStart - P->chrStart[trA.Chr]), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend trA.add(&trAstep1); Score += trAstep1.maxScore; trA.exons[0][EX_R] = trA.rStart = trA.rStart - trAstep1.extendL; trA.exons[0][EX_G] = trA.gStart = trA.gStart - trAstep1.extendL; trA.exons[0][EX_L] += trAstep1.extendL; }; //TODO penalize the unmapped bases at the start }; break; case 1: //extend at end if ( tR2+1<Lread ) {//extend alignment to the read end //calculate # of allowed mismatches that has been left double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[trA.nExons-1][EX_iFrag]==0 && trA.Str==0) ? 
-1 : P->outFilterMismatchNoverLmax1; trAstep1.reset(); // //to prevent extension past the Chr end if ( extendAlign(R, Q, G, tR2+1, tG2+1, +1, +1, min(Lread-tR2-1,P->chrStart[trA.Chr+1]-tG2-2), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend trA.add(&trAstep1); Score += trAstep1.maxScore; tR2 += trAstep1.extendL; tG2 += trAstep1.extendL; trA.exons[trA.nExons-1][EX_L] += trAstep1.extendL;//extend the length of the last exon }; //TODO penalize unmapped bases at the end }; }; }; if (P->alignSoftClipAtReferenceEnds=="No" && \ ( (trA.exons[trA.nExons-1][EX_G] + Lread-trA.exons[trA.nExons-1][EX_R]) > (P->chrStart[trA.Chr]+P->chrLength[trA.Chr]) || \ trA.exons[0][EX_G]<(P->chrStart[trA.Chr]+trA.exons[0][EX_R]) ) ) { return; //no soft clipping past the ends of the chromosome }; trA.rLength = 0; for (uint isj=0;isj<trA.nExons;isj++) { trA.rLength += trA.exons[isj][EX_L]; }; trA.gLength = tG2+1-trA.gStart; //check exons lenghts including repeats, do not report a transcript with short exons for (uint isj=0;isj<trA.nExons-1;isj++) {//check exons for min length, if they are not annotated and precede a junction if ( trA.canonSJ[isj]>=0 ) {//junction if (trA.sjAnnot[isj]==1) {//sjdb if ( ( trA.exons[isj][EX_L] < P->alignSJDBoverhangMin && (isj==0 || trA.canonSJ[isj-1]==-3 || (trA.sjAnnot[isj-1]==0 && trA.canonSJ[isj-1]>=0) ) )\ || ( trA.exons[isj+1][EX_L] < P->alignSJDBoverhangMin && (isj==trA.nExons-2 || trA.canonSJ[isj+1]==-3 || (trA.sjAnnot[isj+1]==0 && trA.canonSJ[isj+1]>=0) ) ) )return; } else {//non-sjdb if ( trA.exons[isj][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][0] \ || trA.exons[isj+1][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][1] ) return; }; }; }; if (trA.nExons>1 && trA.sjAnnot[trA.nExons-2]==1 && trA.exons[trA.nExons-1][EX_L] < P->alignSJDBoverhangMin) return; //this exon was not checkedin the cycle above trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0; for (uint 
iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]==0) { ++trA.intronMotifs[0]; } else if (trA.canonSJ[iex]>0) { ++trA.intronMotifs[2-trA.canonSJ[iex]%2]; }; }; //filter strand consistency trA.sjMotifStrand=0; uint sjN=0; for (uint iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]>=0) sjN++; if (trA.sjStr[iex]>0) {//only these sjs have defined strand if (trA.sjMotifStrand==0) { trA.sjMotifStrand=trA.sjStr[iex]; } else if (trA.sjMotifStrand != trA.sjStr[iex]) {//inconsistent strand return; //kill this transcript }; }; }; if (sjN>0 && trA.sjMotifStrand==0 && P->outSAMstrandField=="intronMotif") {//strand not defined for a junction return; }; if (P->outFilterIntronMotifs=="None") {//no filtering } else if (P->outFilterIntronMotifs=="RemoveNoncanonical") { for (uint iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]==0) return; }; } else if (P->outFilterIntronMotifs=="RemoveNoncanonicalUnannotated") { for (uint iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]==0 && trA.sjAnnot[iex]==0) return; }; } else { ostringstream errOut; errOut << "EXITING because of FATAL INPUT error: unrecognized value of --outFilterIntronMotifs=" <<P->outFilterIntronMotifs <<"\n"; errOut << "SOLUTION: re-run STAR with --outFilterIntronMotifs = None -OR- RemoveNoncanonical -OR- RemoveNoncanonicalUnannotated\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; {//check mapped length for each mate uint nsj=0,exl=0; for (uint iex=0;iex<trA.nExons;iex++) {// exl+=trA.exons[iex][EX_L]; if (iex==trA.nExons-1 || trA.canonSJ[iex]==-3) {//mate is completed, make the checks if (nsj>0 && (exl<P->alignSplicedMateMapLmin || exl < (uint) (P->alignSplicedMateMapLminOverLmate*RA->readLength[trA.exons[iex][EX_iFrag]])) ) { return; //do not record this transcript }; exl=0;nsj=0; } else if (trA.canonSJ[iex]>=0) { nsj++; }; }; }; if (P->outFilterBySJoutStage==2) {//junctions have to be present in the filtered set P->sjnovel for (uint iex=0;iex<trA.nExons-1;iex++) { 
if (trA.canonSJ[iex]>=0 && trA.sjAnnot[iex]==0) { uint jS=trA.exons[iex][EX_G]+trA.exons[iex][EX_L]; uint jE=trA.exons[iex+1][EX_G]-1; if ( binarySearch2(jS,jE,P->sjNovelStart,P->sjNovelEnd,P->sjNovelN) < 0 ) return; }; }; }; if ( trA.exons[0][EX_iFrag]!=trA.exons[trA.nExons-1][EX_iFrag] ) {//check for correct overlap between mates if (trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] <= trA.exons[0][EX_G]) return; //to avoid negative insert size uint iexM2=trA.nExons; for (uint iex=0;iex<trA.nExons-1;iex++) {//find the first exon of the second mate if (trA.canonSJ[iex]==-3) {// iexM2=iex+1; break; }; }; if ( trA.exons[iexM2-1][EX_G] + trA.exons[iexM2-1][EX_L] > trA.exons[iexM2][EX_G] ) {//mates overlap - check consistency of junctions if (trA.exons[0][EX_G] > trA.exons[iexM2][EX_G]+trA.exons[0][EX_R]) return; //LeftMateStart > RightMateStart if (trA.exons[iexM2-1][EX_G]+trA.exons[iexM2-1][EX_L] > trA.exons[trA.nExons-1][EX_G]+Lread-trA.exons[trA.nExons-1][EX_R]) return; //LeftMateEnd > RightMateEnd //check for junctions consistency uint iex1=1, iex2=iexM2+1; //last exons of the junction for (; iex1<iexM2; iex1++) {//find first junction that overlaps 2nd mate if (trA.exons[iex1][EX_G] >= trA.exons[iex2-1][EX_G] + trA.exons[iex2-1][EX_L]) break; }; while (iex1<iexM2 && iex2<trA.nExons) {//cycle through all overlapping exons if (trA.canonSJ[iex1-1]<0) {//skip non-junctions iex1++; continue; }; if (trA.canonSJ[iex2-1]<0) {//skip non-junctions iex2++; continue; }; if ( ( trA.exons[iex1][EX_G]!=trA.exons[iex2][EX_G] ) || ( (trA.exons[iex1-1][EX_G]+trA.exons[iex1-1][EX_L]) != (trA.exons[iex2-1][EX_G]+trA.exons[iex2-1][EX_L]) ) ) { return; //inconsistent junctions on overlapping mates }; iex1++; iex2++; };//cycle through all overlapping exons };//mates overlap - check consistency of junctions };//check for correct overlap between mates if (P->scoreGenomicLengthLog2scale!=0) {//add gap length score Score += int(ceil( log2( (double) ( 
trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] - trA.exons[0][EX_G]) ) \ * P->scoreGenomicLengthLog2scale - 0.5)); Score = max(0,Score); }; //calculate some final values for the transcript trA.roStart = (trA.roStr == 0) ? trA.rStart : Lread - trA.rStart - trA.rLength; trA.maxScore=Score; if (trA.exons[0][EX_iFrag]==trA.exons[trA.nExons-1][EX_iFrag]) {//mark single fragment transcripts trA.iFrag=trA.exons[0][EX_iFrag]; RA->maxScoreMate[trA.iFrag] = max (RA->maxScoreMate[trA.iFrag] , Score); } else { trA.iFrag=-1; }; if ( Score+P->outFilterMultimapScoreRange >= wTr[0]->maxScore \ || ( trA.iFrag>=0 && Score+P->outFilterMultimapScoreRange >= RA->maxScoreMate[trA.iFrag] ) \ || P->chimSegmentMin>0) { //only record the transcripts within the window that are in the Score range //OR within the score range of each mate //OR all transcript if chimeric detection is activated if (P->outFilterMismatchNoverLmax1<0) {//check that the alignment is end-to-end uint rTotal=trA.rLength+trA.lIns; // for (uint iex=1;iex<trA.nExons;iex++) {//find the inside exons // rTotal+=trA.exons[iex][EX_R]-trA.exons[iex-1][EX_R]; // }; if ( (trA.iFrag<0 && rTotal<(RA->readLength[0]+RA->readLength[1])) || (trA.iFrag>=0 && rTotal<RA->readLength[trA.iFrag])) return; }; uint iTr=0; //transcript insertion/replacement place trA.mappedLength=0; for (uint iex=0;iex<trA.nExons;iex++) {//caclulate total mapped length trA.mappedLength += trA.exons[iex][EX_L]; }; while (iTr < *nWinTr) {//scan through all recorded transcripts for this window - check for duplicates //another way to calculate uOld, uNew: w/o gMap uint nOverlap=blocksOverlap(trA,*wTr[iTr]); uint uNew=trA.mappedLength-nOverlap; uint uOld=wTr[iTr]->mappedLength-nOverlap; if (uNew==0 && Score < wTr[iTr]->maxScore) {//new transript is a subset of the old ones break; } else if (uOld==0) {//old transcript is a subset of the new one, remove old transcript Transcript *pTr=wTr[iTr]; for (uint ii=iTr+1;ii<*nWinTr;ii++) wTr[ii-1]=wTr[ii]; //shift 
transcripts (*nWinTr)--; wTr[*nWinTr]=pTr; } else if (uOld>0 && (uNew>0 || Score >= wTr[iTr]->maxScore) ) {//check next transcript iTr++; }; }; if (iTr==*nWinTr) {//insert the new transcript for (iTr=0;iTr<*nWinTr;iTr++) {//find inseriton location if (Score>wTr[iTr]->maxScore || (Score==wTr[iTr]->maxScore && trA.gLength<wTr[iTr]->gLength) ) break; }; Transcript *pTr=wTr[*nWinTr]; for (int ii=*nWinTr; ii> int(iTr); ii--) {//shift all the transcript pointers down from iTr wTr[ii]=wTr[ii-1]; }; wTr[iTr]=pTr; //the new transcript pointer is now at *nWinTr+1, move it into the iTr *(wTr[iTr])=trA; if (*nWinTr<P->alignTranscriptsPerWindowNmax) { (*nWinTr)++; //increment number of transcripts per window; } else { //"WARNING: too many recorded transcripts per window: iRead="<<RA->iRead<< "\n"; }; }; }; return; }; /////////////////////////////////////////////////////////////////////////////////// int dScore=0; Transcript trAi=trA; //trA copy with this align included, to be used in the 1st recursive call of StitchAlign if (trA.nExons>0) {//stitch, a transcript has already been originated dScore=stitchAlignToTranscript(tR2, tG2, WA[iA][WA_rStart], WA[iA][WA_gStart], WA[iA][WA_Length], WA[iA][WA_iFrag], WA[iA][WA_sjA], P, R, Q, G, &trAi, RA->outFilterMismatchNmaxTotal); //TODO check if the new stitching creates too many MM, quit this transcript if so } else { //this is the first align in the transcript trAi.exons[0][EX_R]=trAi.rStart=WA[iA][WA_rStart]; //transcript start/end trAi.exons[0][EX_G]=trAi.gStart=WA[iA][WA_gStart]; trAi.exons[0][EX_L]=WA[iA][WA_Length]; trAi.exons[0][EX_iFrag]=WA[iA][WA_iFrag]; trAi.exons[0][EX_sjA]=WA[iA][WA_sjA]; trAi.nExons=1; //recorded first exon for (uint ii=0;ii<WA[iA][WA_Length];ii++) dScore+=int(Q [ WA[iA][WA_rStart] + ii ]); //sum all the scores trAi.nMatch=WA[iA][WA_Length]; //# of matches for (uint ii=0; ii<nA; ii++) WAincl[ii]=false; }; if (dScore>-1000000) {//include this align WAincl[iA]=true; if ( WA[iA][WA_Nrep]==1 ) trAi.nUnique++; 
//unique piece if ( WA[iA][WA_Anchor]>0 ) trAi.nAnchor++; //anchor piece piece stitchWindowAligns(iA+1, nA, Score+dScore, WAincl, WA[iA][WA_rStart]+WA[iA][WA_Length]-1, WA[iA][WA_gStart]+WA[iA][WA_Length]-1, trAi, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA); } else { }; //also run a transcript w/o including this align if (WA[iA][WA_Anchor]!=2 || trA.nAnchor>0) {//only allow exclusion if this is not the last anchor, or other anchors have been used WAincl[iA]=false; stitchWindowAligns(iA+1, nA, Score, WAincl, tR2, tG2, trA, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA); }; return; };
// stitchAlignToTranscript: appends align B to transcript *trA and returns the score change.
//
// NOTE(review): the original line breaks of this function appear to have been lost, so
// every `//` comment below now runs to the end of its physical line and swallows the code
// that follows it. The description here is of the statements as written; the line breaks
// need to be restored before this compiles as intended.
//
// Parameters (as used in the body):
//   rAend, gAend     - read / genome coordinate of the last base of the current last exon (A).
//   rBstart, gBstart - read / genome coordinate of the first base of align B.
//   L                - length of align B.
//   iFragB           - fragment (mate) index of B; stored as EX_iFrag of the new last exon.
//   sjAB             - junction-database index carried by B, (uint)-1 meaning none;
//                      stored as EX_sjA of the new last exon.
//   P                - parameters: scores, penalties, sjdb arrays, intron limits.
//   R, Q, G          - R and G are compared base by base (values <4 are treated as non-N),
//                      Q supplies the per-base score that is added or subtracted.
//   trA              - transcript modified in place (exons, nExons, canonSJ, sjAnnot, sjStr,
//                      shiftSJ, nMM, nMatch, gap/deletion/insertion counters).
//   outFilterMismatchNmaxTotal - upper bound on trA->nMM plus the new mismatches.
//
// Paths:
//   * Database junction: sjAB matches the last exon's EX_sjA on the same fragment with
//     rBstart==rAend+1 and gAend+1<gBstart; the new exon is added directly and the score
//     is the sum of Q over B plus P->sjdbScore.
//   * Same fragment, general: any read overlap of B with A is trimmed from B, then by gap shape
//       gGap==rGap  - the gap bases are scored as matches / mismatches;
//       gGap>rGap   - deletion or junction: the junction position jR is scanned, the motif is
//                     classified into jCan (1..6, 0 for none, -1 for a deletion), the repeat
//                     shifts jjL/jjR are measured, and the junction is looked up with
//                     binarySearch2 in P->sjdbStart/P->sjdbEnd when P->sjdbN>0;
//       rGap>gGap   - insertion (canonSJ of the previous exon is set to -2).
//     The result is accepted only if the mismatch limits hold.
//   * Different fragments: checks alignMatesGapMax, extends A to the right and B to the left
//     with extendAlign, and marks the junction with canonSJ=-3.
//
// Returns the score, or one of these rejection codes (the caller in this file accepts
// only values > -1000000):
//   -1000001 B ends at or before rAend in the read; -1000002 B ends at or before gAend;
//   -1000003 Del > alignIntronMax; -1000004 gap between mates > alignMatesGapMax;
//   -1000005 shifting the junction left would leave the previous exon shorter than 1;
//   -1000006 repeats too large around a non-canonical database junction;
//   -1000007 mismatch limits exceeded; -1000008 fragments cannot be stitched.
//
// NOTE(review): extendAlign is called here with 14 arguments and P->alignEndsType.ext[..],
//   while stitchWindowAligns in this file calls it with 13 arguments and compares
//   P->alignEndsType with a string - the two functions may come from different versions;
//   confirm before building them together.
intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, Parameters* P, char* R, char* Q, char* G, Transcript *trA, const uint outFilterMismatchNmaxTotal) { //stitch together A and B, extend in the gap, returns max score //Q is assumed modified already int Score=0; // int score2; if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]==sjAB \ && trA->exons[trA->nExons-1][EX_iFrag]==iFragB && rBstart==rAend+1 && gAend+1<gBstart ) {//simple stitching if junction belongs to a database if (P->sjdbMotif[sjAB]==0 && (L<=P->sjdbShiftRight[sjAB] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjAB]) ) { return -1000006; //too large repeats around non-canonical junction }; trA->exons[trA->nExons][EX_L] = L; //new exon length trA->exons[trA->nExons][EX_R] = rBstart; //new exon r-start trA->exons[trA->nExons][EX_G] = gBstart; //new exon g-start trA->canonSJ[trA->nExons-1]=P->sjdbMotif[sjAB]; //mark sj-db trA->shiftSJ[trA->nExons-1][0]=P->sjdbShiftLeft[sjAB]; trA->shiftSJ[trA->nExons-1][1]=P->sjdbShiftRight[sjAB]; trA->sjAnnot[trA->nExons-1]=1; trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjAB];; trA->nExons++; trA->nMatch+=L; for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions Score+=P->sjdbScore; } else {//general stitching trA->sjAnnot[trA->nExons-1]=0; trA->sjStr[trA->nExons-1]=0; if (trA->exons[trA->nExons-1][EX_iFrag]==iFragB) {//stitch aligns on the same fragment uint gBend=gBstart+L-1; uint rBend=rBstart+L-1; // {//debug // if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]!=((uint) -1) && rBend<=rAend) {// // Score -= rAend-rBstart+1; // gAend -= rAend-rBstart+1; // rAend = rBstart-1; // trA->exons[trA->nExons-1][EX_L] =rAend-trA->exons[trA->nExons-1][EX_R]+1; // }; // }; //check if r-overlapping fully and exit if (rBend<=rAend) return -1000001; if (gBend<=gAend && trA->exons[trA->nExons-1][EX_iFrag]==iFragB) return -1000002; //shift the B 5' if overlaps A 3' if 
(rBstart<=rAend) { gBstart+=rAend-rBstart+1; rBstart=rAend+1; L=rBend-rBstart+1; }; for (uint ii=rBstart;ii<=rBend;ii++) Score+=int(Q[ii]); //add QS for mapped portions int gGap=gBstart-gAend-1; //could be < 0 for insertions int rGap=rBstart-rAend-1;//>0 always since we removed overlap uint nMatch=L; uint nMM=0; uint Del=0, Ins=0; uint nIns=0, nDel=0; int jR=0; //junction location in R-space int jCan=999; //canonical junction type uint gBstart1=gBstart-rGap-1;//the last base of the intron if all read gap belongs to acceptor, i.e. jR=0 // check all the different combinations of gGap and rGap if ( gGap==0 && rGap==0 ) {//just joined the pieces, w/o stiching or gaps //do nothing for now } else if ( gGap>0 && rGap>0 && rGap==gGap ) {//no gaps, just try to fill space //simple stitching, assuming no insertion in the read for (int ii=1;ii<=rGap;ii++) { if (G[gAend+ii]<4 && R[rAend+ii]<4) {//only score genome bases that are not Ns if ( R[rAend+ii]==G[gAend+ii] ) { Score+=int(Q[rAend+ii]); nMatch++; // if (Q[rAend+ii]>=P->Qgood) nMatchGood++; } else { Score-=int(Q[rAend+ii]); // trA->rMM[trA->nMM + nMM] = rAend+ii; nMM++; // if (Q[rAend+ii]>=P->Qgood) nMMgood++; }; }; }; } else if ( gGap>rGap ) {//genomic gap (Deletion) nDel=1; Del=gGap-rGap; //gGap>0 here if (Del>P->alignIntronMax && P->alignIntronMax>0) { return -1000003; //large gaps not allowed }; int Score1=0; int jR1=1; //junction location in R-space do { // 1. move left, until the score for MM is less than canonical advantage jR1--; if ( R[rAend+jR1]!=G[gBstart1+jR1] && G[gBstart1+jR1]<4 && R[rAend+jR1]==G[gAend+jR1]) Score1 -= int(Q[rAend+jR1]); } while ( Score1+P->scoreStitchSJshift >= 0 && int(trA->exons[trA->nExons-1][EX_L]) + jR1 > 1);//>=P->alignSJoverhangMin); //also check that we are still within the exon int maxScore2=-999999; Score1=0; int jPen=0; do { // 2. scan to the right to find the best junction locus // ?TODO? if genome base is N, how to score? 
if ( R[rAend+jR1]==G[gAend+jR1] && R[rAend+jR1]!=G[gBstart1+jR1] ) Score1+=int(Q[rAend+jR1]); if ( R[rAend+jR1]!=G[gAend+jR1] && R[rAend+jR1]==G[gBstart1+jR1] ) Score1-=int(Q[rAend+jR1]); int jCan1=-1; //this marks Deletion int jPen1=0; int Score2=Score1; if (Del>=P->alignIntronMin) {//only check intron motif for large gaps= non-Dels //check if the intron is canonical, or semi-canonical if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GTAG jCan1=1; } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//CTAC jCan1=2; } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==1 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GCAG jCan1=3; jPen1=P->scoreGapGCAG; } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==2 && G[gBstart1+jR1]==1 ) {//CTGC jCan1=4; jPen1=P->scoreGapGCAG; } else if ( G[gAend+jR1+1]==0 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//ATAC jCan1=5; jPen1=P->scoreGapATAC; } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==3 ) {//GTAT jCan1=6; jPen1=P->scoreGapATAC; } else { jCan1=0; jPen1=P->scoreGapNoncan; }; Score2 += jPen1; }; if (maxScore2 < Score2 ) {//check if the score is the highest. 
TODO: record the next highest score maxScore2=Score2; jR=jR1; //this is the last base of donor jCan=jCan1; jPen=jPen1; }; jR1++; } while ( jR1 < int(rBend) - int(rAend) );// - int(P->alignSJoverhangMin) );//TODO: do not need to search the full B-transcript, can stop as soon as Score goes down by more than //repeat length: go back and forth around jR to find repeat length uint jjL=0,jjR=0; while ( gAend+jR>=jjL && G[gAend-jjL+jR]==G[gBstart1-jjL+jR] && G[gAend-jjL+jR]<4 && jjL<=MAX_SJ_REPEAT_SEARCH) {//go back jjL++; }; while ( gAend+jjR+jR+1<P->nGenome && G[gAend+jjR+jR+1]==G[gBstart1+jjR+jR+1] && G[gAend+jjR+jR+1]<4 && jjR<=MAX_SJ_REPEAT_SEARCH) {//go forward jjR++; }; if (jCan<=0) {//flush deletions and non-canonical junction to the left jR-=jjL; if (int(trA->exons[trA->nExons-1][EX_L])+jR<1) return -1000005; jjR+=jjL; jjL=0; }; //TODO check here if the internal exon length < minDa, if so exit w/o stitiching for (int ii=min(1,jR+1);ii<=max(rGap,jR);ii++) {//score donor and acceptor uint g1=(ii<=jR) ? 
(gAend+ii):(gBstart1+ii); if (G[g1]<4 && R[rAend+ii]<4) {//only penalize non-N bases if ( R[rAend+ii]==G[g1] ) { if (ii>=1 && ii <=rGap) {//only add +score and matches within the gap Score+=int(Q[rAend+ii]); nMatch++; }; } else {//add -score and MM for all bases Score-=int(Q[rAend+ii]); nMM++; if (ii<1 || ii>rGap) {//subtract previuosly presumed matches Score-=int(Q[rAend+ii]); nMatch--; // if (ii<=jR) nMM--; }; }; }; }; //score the gap if (P->sjdbN>0) {//check if the junction is annotated uint jS=gAend+jR+1, jE=gBstart1+jR;//intron start/end int sjdbInd=binarySearch2(jS,jE,P->sjdbStart,P->sjdbEnd,P->sjdbN); if (sjdbInd<0) { if (Del>=P->alignIntronMin) { Score += P->scoreGap + jPen; //genome gap penalty + non-canonical penalty } else {//deletion Score += Del*P->scoreDelBase + P->scoreDelOpen; jCan=-1; trA->sjAnnot[trA->nExons-1]=0; // jjR-=jjL; // jR-=jjL; // jjL=0; // trA->shiftSJ[trA->nExons-1][0]=0; // trA->shiftSJ[trA->nExons-1][1]=jjR; }; } else {//annotated jCan=P->sjdbMotif[sjdbInd]; if (P->sjdbMotif[sjdbInd]==0) {//shift to match annotations if (L<=P->sjdbShiftLeft[sjdbInd] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjdbInd]) { return -1000006; }; jR += (int) P->sjdbShiftLeft[sjdbInd]; jjL=P->sjdbShiftLeft[sjdbInd]; jjR=P->sjdbShiftRight[sjdbInd]; }; trA->sjAnnot[trA->nExons-1]=1; trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjdbInd]; Score += P->sjdbScore; }; } else {//no annotation if (Del>=P->alignIntronMin) {//junction, not short deletion Score += P->scoreGap + jPen; } else { Score += Del*P->scoreDelBase + P->scoreDelOpen; jCan=-1; trA->sjAnnot[trA->nExons-1]=0; }; }; trA->shiftSJ[trA->nExons-1][0]=jjL; trA->shiftSJ[trA->nExons-1][1]=jjR; trA->canonSJ[trA->nExons-1]=jCan; if (trA->sjAnnot[trA->nExons-1]==0) {//strand for unannotated junctions if (jCan>0) { trA->sjStr[trA->nExons-1]=2-jCan%2; //1=+,2=- } else { trA->sjStr[trA->nExons-1]=0; }; }; } else if ( rGap>gGap ) {//insertion: if also gGap>0, need to stitch Ins=rGap-gGap; nIns=1; if (gGap==0) 
{//simple insertion, no need to stitch jR=0; } else if (gGap<0) {//reduce the score jR=0; for (int ii=0; ii<-gGap; ii++) Score -= int(Q[rBstart+ii]); } else {//stitch: define the exon boundary jR int Score1=0; int maxScore1=0; for (int jR1=1;jR1<=gGap;jR1++) {//scan to the right to find the best score if (G[gAend+jR1]<4) {//only penalize goog genome bases Score1+=( R[rAend+jR1]==G[gAend+jR1] ) ? int(Q[rAend+jR1]):-int(Q[rAend+jR1]); Score1+=( R[rAend+Ins+jR1]==G[gAend+jR1] ) ? -int(Q[rAend+Ins+jR1]):+int(Q[rAend+Ins+jR1]); }; if (Score1>maxScore1) { maxScore1=Score1; jR=jR1; }; }; for (int ii=1;ii<=gGap;ii++) {//score donor and acceptor uint r1=rAend+ii+(ii<=jR ? 0:Ins); if (G[gAend+ii]<4 && R[r1]<4) { if ( R[r1]==G[gAend+ii] ) { Score+=int(Q[r1]); nMatch++; } else {//add -score and MM for all bases Score-=int(Q[r1]); nMM++; }; }; }; }; Score += Ins*P->scoreInsBase + P->scoreInsOpen; jCan=-3; }; //different types of gaps selection #ifdef COMPILE_FOR_LONG_READS if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal ) // if ( Score>0 && nMM<=200 ) #else if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal \ && ( jCan<0 || (jCan<7 && nMM<= (uint) P->alignSJstitchMismatchNmax[(jCan+1)/2]) ) ) #endif {//stitching worked only if there no mis-matches for non-GT/AG junctions trA->nMM += nMM; trA->nMatch += nMatch; if (Del>=P->alignIntronMin) { trA->nGap += nDel; trA->lGap += Del; } else { trA->nDel += nDel; trA->lDel += Del; }; //modify exons if (Del==0 && Ins==0) {//no gap => no new exon, extend the boundary of the previous exon trA->exons[trA->nExons-1][EX_L] += rBend-rAend; } else if (Del>0) { //deletion:ca only have Del> or Ins>0 trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary trA->exons[trA->nExons][EX_L] = rBend-rAend-jR; //new exon length trA->exons[trA->nExons][EX_R] = rAend+jR+1; //new exon r-start trA->exons[trA->nExons][EX_G] = gBstart1+jR+1; //new exon g-start trA->nExons++; } else if (Ins>0) { //Ins>0; trA->nIns += nIns; trA->lIns += Ins; 
trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary trA->exons[trA->nExons][EX_L] = rBend-rAend-jR-Ins; //new exon length trA->exons[trA->nExons][EX_R] = rAend+jR+Ins+1; //new exon r-start trA->exons[trA->nExons][EX_G] = gAend+1+jR; //new exon g-start trA->canonSJ[trA->nExons-1]=-2; //mark insertion trA->sjAnnot[trA->nExons-1]=0; trA->nExons++; }; } else {//stitching was not accepted return -1000007; }; } else if (gBstart+trA->exons[0][EX_R] >= trA->exons[0][EX_G] || trA->exons[0][EX_G] < trA->exons[0][EX_R]){//if (iFragA==iFragB) stitch aligns from different fragments if (P->alignMatesGapMax>0 && gBstart > trA->exons[trA->nExons-1][EX_G] + trA->exons[trA->nExons-1][EX_L] + P->alignMatesGapMax) { return -1000004; //gap between mates too large }; //extend the fragments inside //note, that this always works, i.e. Score>0 for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions Transcript trExtend; //TODO: compare extensions to the left and right, pick the best one to be performed first //otherwise if a large nMM is reached in the 2st extension, it will prevent the 2nd extension //use the following example: //>1 //TTCTGTGTCTCCCCCTCCCCCACTGGCTACATGGAGACAGGGGGGGGGGGCCGGGCGGTTCCCGGGCAGAAAAAAA //>1 //AATATTTGGAACACTTATGTGAAAAATGATTTGTTTTTCTGAAATTTACGTTTCTCTCTGAGTCCTGTAACTGTCC trExtend.reset(); if ( extendAlign(R, Q, G, rAend+1, gAend+1, 1, 1, DEF_readSeqLengthMax, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \ P->alignEndsType.ext[trA->exons[trA->nExons-1][EX_iFrag]][1], &trExtend) ) { trA->add(&trExtend); Score += trExtend.maxScore; trA->exons[trA->nExons-1][EX_L] += trExtend.extendL; };// if extendAlign for read A trA->exons[trA->nExons][EX_R] = rBstart; trA->exons[trA->nExons][EX_G] = gBstart; trA->exons[trA->nExons][EX_L] = L; trA->nMatch += L; trExtend.reset(); //if ( extendAlign(R, Q, G, rBstart-1, gBstart-1, -1, -1, gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R], 
trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, //if end extension needs to be forced, use large length. Otherwise, only extend until the beginning of the transcript uint extlen=P->alignEndsType.ext[iFragB][1] ? DEF_readSeqLengthMax : gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R]; if ( extendAlign(R, Q, G, rBstart-1, gBstart-1, -1, -1, extlen, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \ P->alignEndsType.ext[iFragB][1], &trExtend) ) { trA->add(&trExtend); Score += trExtend.maxScore; trA->exons[trA->nExons][EX_R] -= trExtend.extendL; trA->exons[trA->nExons][EX_G] -= trExtend.extendL; trA->exons[trA->nExons][EX_L] += trExtend.extendL; }; //if extendAlign B trA->canonSJ[trA->nExons-1]=-3; //mark different fragments junction trA->sjAnnot[trA->nExons-1]=0; trA->nExons++; } else {//no stitching possible return -1000008; }; }; trA->exons[trA->nExons-1][EX_iFrag]=iFragB; //the new exon belongs to fragment iFragB trA->exons[trA->nExons-1][EX_sjA]=sjAB; return Score; };
void sjdbBuildIndex (Parameters *P, Parameters *P1, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray &SAi) { #define SPACER_CHAR GENOME_spacingChar if (P->sjdbN==0) {//no junctions to insert return; }; time_t rawtime; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl; *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl; uint nGsj=P->sjdbLength*P->sjdbN; for (uint ii=1; ii<=P->sjdbN; ii++) { Gsj[ii*P->sjdbLength-1]=SPACER_CHAR; //to make sure this is > than any genome char }; Gsj[nGsj*2]=SPACER_CHAR+1;//mark the end of the text for (uint ii=0; ii<nGsj; ii++) {//reverse complement junction sequences Gsj[nGsj*2-1-ii]=Gsj[ii]<4 ? 3-Gsj[ii] : Gsj[ii]; //reverse complement }; char* G1c=new char[nGsj*2+1]; complementSeqNumbers(Gsj, G1c, nGsj*2+1); uint32* oldSJind=new uint32[P1->sjdbN]; // uint nIndicesSJ1=P->sjdbOverhang; uint nIndicesSJ1=P->sjdbLength;//keep all indices - this is pre-2.4.1 of generating the genome uint64* indArray=new uint64[2*P->sjdbN*(nIndicesSJ1+1)*2];//8+4 bytes for SA index and index in the genome * nJunction * nIndices per junction * 2 for reverse compl uint64 sjNew=0; #pragma omp parallel num_threads(P->runThreadN) #pragma omp for schedule (dynamic,1000) reduction(+:sjNew) for (uint isj=0; isj<2*P->sjdbN; isj++) {//find insertion points for each of the sequences char** seq1=new char*[2]; seq1[0]=Gsj+isj*P->sjdbLength; seq1[1]=G1c+isj*P->sjdbLength; uint isj1=isj<P->sjdbN ? isj : 2*P->sjdbN-1-isj; int sjdbInd = P1->sjdbN==0 ? -1 : binarySearch2(P->sjdbStart[isj1],P->sjdbEnd[isj1],P1->sjdbStart,P1->sjdbEnd,P1->sjdbN); if (sjdbInd<0) {//count new junctions ++sjNew; } else {//record new index of the old junctions oldSJind[sjdbInd]=isj1; }; for (uint istart1=0; istart1<nIndicesSJ1;istart1++) { uint istart=istart1; // uint istart=isj<P->sjdbN ? 
istart1 : istart1+1; //for rev-compl junction, shift by one base to start with the 1st non-spacer base uint ind1=2*(isj*nIndicesSJ1+istart1); if (sjdbInd>=0 || seq1[0][istart]>3) {//no index for already included junctions, or suffices starting with N indArray[ind1]=-1; } else { //indArray[ind1] = suffixArraySearch(seq1, istart, P->sjdbLength-istart1, G, SA, true, 0, P->nSA-1, 0, P) ; indArray[ind1] = suffixArraySearch(seq1, istart, 10000, G, SA, true, 0, P->nSA-1, 0, P) ; indArray[ind1+1] = isj*P->sjdbLength+istart; }; }; }; // for (int ii=0;ii<P1->sjdbN;ii++) {if ( oldSJind[ii]==0){cout <<ii<<endl;};}; sjNew = sjNew/2;//novel junctions were double counted on two strands time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SA search: number of new junctions=" << sjNew <<", old junctions="<<P->sjdbN-sjNew<<endl; uint nInd=0;//true number of new indices for (uint ii=0; ii<2*P->sjdbN*nIndicesSJ1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above if (indArray[ii*2]!= (uint) -1) { indArray[nInd*2]=indArray[ii*2]; indArray[nInd*2+1]=indArray[ii*2+1]; ++nInd; }; }; globalGsj=Gsj; qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished sorting SA indicesL nInd="<<nInd <<endl; indArray[2*nInd]=-999; //mark the last junction indArray[2*nInd+1]=-999; //mark the last junction P->nGenome=P->chrStart[P->nChrReal]+nGsj; P->nSA+=nInd; uint GstrandBit1 = (uint) floor(log(P->nGenome)/log(2))+1; if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA if ( GstrandBit1 != P->GstrandBit) {//too many junctions were added - GstrandBit changed ostringstream errOut; errOut << "EXITING because of FATAL ERROR: cannot insert junctions on the fly because of strand GstrandBit problem\n"; errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n"; 
exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; SA2.defineBits(P->GstrandBit+1,P->nSA); uint nGsjNew=sjNew*P->sjdbLength; //this is the actual number of bytes added to the genome, while nGsj is the total size of all junctions uint N2bit= 1LLU << P->GstrandBit; uint strandMask=~N2bit; //testing // PackedArray SAo; // SAo.defineBits(P->GstrandBit+1,P->nSA); // SAo.allocateArray(); // ifstream oldSAin("./DirTrue/SA"); // oldSAin.read(SAo.charArray,SAo.lengthByte); // oldSAin.close(); uint isj=0, isa2=0; for (uint isa=0;isa<P1->nSA;isa++) { //testing // if (isa2>0 && SA2[isa2-1]!=SAo[isa2-1]) { // cout <<isa2 <<" "<< SA2[isa2-1]<<" "<<SAo[isa2-1]<<endl; // }; // if (isa==69789089) // { // cout <<isa; // }; uint ind1=SA[isa]; if ( (ind1 & N2bit)>0 ) {//- strand uint ind1s = P1->nGenome - (ind1 & strandMask); if (ind1s>P->chrStart[P->nChrReal]) {//this index was an old sj, may need to shift it uint sj1 = (ind1s-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index ind1s += (oldSJind[sj1]-sj1)*P->sjdbLength; ind1 = (P->nGenome - ind1s) | N2bit; } else { ind1+=nGsjNew; //reverse complementary indices are all shifted by the length of junctions }; } else {//+ strand if (ind1>P->chrStart[P->nChrReal]) {//this index was an old sj, may need to shift it uint sj1 = (ind1-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index ind1 += (oldSJind[sj1]-sj1)*P->sjdbLength; }; }; SA2.writePacked(isa2,ind1); //TODO make sure that the first sj index is not before the first array index ++isa2; while (isa==indArray[isj*2]) {//insert sj index after the existing index uint ind1=indArray[isj*2+1]; if (ind1<nGsj) { ind1+=P->chrStart[P->nChrReal]; } else {//reverse strand ind1=(ind1-nGsj) | N2bit; }; SA2.writePacked(isa2,ind1); ++isa2; ++isj; }; }; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished inserting junction indices" <<endl; //SAi insertions for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { uint iSJ=0; 
uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index uint iSA1=SAi[ii]; uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask; if ( iSJ<nInd && (iSA1 & P->SAiMarkAbsentMaskC)>0 ) {//index missing from the old genome uint iSJ1=iSJ; int64 ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL); while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSJ]<iSA2) { ++iSJ; ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL); }; if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) { SAi.writePacked(ii,indArray[2*iSJ]+iSJ+1); for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value SAi.writePacked(ii0,(indArray[2*iSJ]+iSJ+1) | P->SAiMarkAbsentMaskC); }; ++iSJ; ind0=ii; } else { iSJ=iSJ1; }; } else {//index was present in the old genome while (iSJ<nInd && indArray[2*iSJ]+1<iSA2) {//for this index insert "smaller" junctions ++iSJ; }; while (iSJ<nInd && indArray[2*iSJ]+1==iSA2) {//special case, the index falls right behind SAi if (funCalcSAi(Gsj+indArray[2*iSJ+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to the next index break; }; ++iSJ; }; SAi.writePacked(ii,iSA1+iSJ); for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value SAi.writePacked(ii0,(iSA2+iSJ) | P->SAiMarkAbsentMaskC); }; ind0=ii; }; }; }; // time ( &rawtime ); cout << timeMonthDayTime(rawtime) << "SAi first" <<endl; for (uint isj=0;isj<nInd;isj++) { int64 ind1=0; for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { uint g=(uint) Gsj[indArray[2*isj+1]+iL]; ind1 <<= 2; if (g>3) {//this iSA contains N, need to mark the previous for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) { ind1+=3; int64 ind2=P->genomeSAindexStart[iL1]+ind1; for (; ind2>=0; ind2--) {//find previous index that is not absent if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) { break; }; }; SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC); 
ind1 <<= 2; }; break; } else { ind1 += g; }; }; }; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SAi" <<endl; //change parameters, most parameters are already re-defined in sjdbPrepare.cpp SA.defineBits(P->GstrandBit+1,P->nSA);//same as SA2 SA.pointArray(SA2.charArray); P->nSAbyte=SA.lengthByte; P->sjGstart=P->chrStart[P->nChrReal]; memcpy(G+P->chrStart[P->nChrReal],Gsj, nGsj); /* testing PackedArray SAio=SAi; SAio.allocateArray(); ifstream oldSAiin("./DirTrue/SAindex"); // oldSAin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes oldSAiin.read(SAio.charArray,SAio.lengthByte); oldSAiin.close(); // for (uint ii=0;ii<P->nSA;ii++) { // if (SA2[ii]!=SAo[ii]) { // cout <<ii <<" "<< SA2[ii]<<" "<<SAo[ii]<<endl; // }; // }; for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index if ( SAio[ii]!=SAi[ii] ) { cout <<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl; }; }; }; */ /* ofstream genomeOut("/home/dobin/Genome"); fstreamWriteBig(genomeOut,G,P->nGenome+nGsj,"777","777",P); genomeOut.close(); genomeOut.open("/home/dobin/SA"); fstreamWriteBig(genomeOut,SA2.charArray,SA2.lengthByte,"777","777",P); genomeOut.close(); */ delete [] indArray; delete [] G1c; delete [] oldSJind; };