Beispiel #1
0
/* Recursive binary search.
 *
 * Looks for `value` in v[min..max] (both bounds inclusive); the branch
 * directions below assume v is sorted in ascending order.
 *
 * Returns the index of a matching element, or -1 if nothing is found
 * (including when the range is empty, i.e. min > max).
 */
int binarySearch2(int value,int min,int max,int v[]){
   if(min > max)
      return -1;

   // min + (max - min) / 2 yields the same midpoint as (min + max) / 2 for
   // valid (non-negative) indices but cannot overflow when both are large.
   int middle = min + (max - min) / 2;
   if(v[middle] == value)
      return middle;
   if(value < v[middle])
      return binarySearch2(value,min,middle - 1,v);

   // Only remaining possibility: v[middle] < value, so search the upper half.
   return binarySearch2(value,middle + 1,max,v);
}
Beispiel #2
0
// Recursive binary search for x within a[low..high] (both bounds inclusive).
// The branch directions assume a is sorted in ascending order.
// Prints the outcome to std::cout and returns the matching index, or -1 when
// the range has been exhausted without a match.
int binarySearch2(int a[], int low, int high, int x){
    if(high < low){
        std::cout<<"binary search2: Did not find the number. \n";
        return -1;
    }

    const int halfSpan = (high - low)/2;
    const int mid = low + halfSpan;
    const int probe = a[mid];

    if(probe != x){
        // Narrow to the half that can still contain x.
        const bool searchLower = x < probe;
        return searchLower ? binarySearch2(a, low, mid - 1, x)
                           : binarySearch2(a, mid + 1, high, x);
    }

    std::cout<< "binary search2: found the number in index "<<mid<<std::endl;
    return mid;
}
Beispiel #3
0
// Demo driver: runs both binary-search variants over a small sorted array.
// argc/argv are accepted but not used.
int main(int argc, const char * argv[])
{
    int myarray[] = {0,1,2,3,4,5,6,16,20,34,54,89};
    const int count = sizeof(myarray)/sizeof(myarray[0]);

    binarySearch(myarray, count, -1);

    // The recursive binarySearch2(a, low, high, x) treats `high` as an
    // inclusive index (it stops on low > high and probes a[mid] with
    // mid <= high), so the last valid index is count - 1. Passing `count`
    // would let a key larger than every element read one past the array.
    binarySearch2(myarray, 0, count - 1, 34);

    return 0;
}
Beispiel #4
0
// Interactive test driver: repeatedly reads a pair "n t" from stdin, fills
// x[0..n-1] with 0,10,20,... and prints the result of binarySearch2(x,t,n).
// Stops at end of input or at the first line that does not parse as two ints.
int main()
{
	/*int a[5]={1,3,6,9,11};
	int needle = 10;
	printf("1:%d:%d\n",needle,binarySearch1(a,needle,0,5));
	printf("2:%d:%d\n",needle,binarySearch2(a,needle,5));*/

	int n,t,i;
	int x[MAXN];
	// scanf returns the number of items assigned; comparing against EOF only
	// would spin forever on malformed input, because a matching failure
	// returns 0 or 1 and leaves the offending characters unread.
	while(scanf("%d%d",&n,&t) == 2){
		assert(n);
		assert(n <= MAXN); // x[] holds MAXN elements; larger n would overflow it
		for(i=0; i<n; ++i)
			x[i] = 10*i;
		printf("result:%d\n",binarySearch2(x,t,n));
	}
	return 0;
}
Beispiel #5
0
// Recursively builds candidate transcripts for one alignment window by trying,
// for every align WA[iA..nA-1], both "include this align" and "exclude this
// align", and records each finished candidate that passes the filters into wTr.
//
// Recursion:
//   * iA <  nA : stitch align iA onto the transcript built so far (or start a
//                new transcript with it) and recurse with iA+1; then, if the
//                anchor rule near the end of the function allows it, recurse
//                with iA+1 without this align.
//   * iA >= nA : finalize. The ends are extended with extendAlign(), a series of
//                filters is applied (any failing filter simply returns, dropping
//                the candidate), the final score is computed, and the transcript
//                is inserted into wTr, which is kept ordered by decreasing
//                maxScore (ties: smaller gLength first).
//
// Parameters:
//   iA, nA   - index of the align to process now / total number of aligns in WA.
//   Score    - score accumulated so far along this recursion branch.
//   WAincl   - per-align flags, set true/false as aligns are included/excluded.
//   tR2, tG2 - last read / genome coordinate covered by the transcript so far
//              (passed as rStart+Length-1 and gStart+Length-1 of the last
//              included align); tR2==0 with iA>=nA is treated as "no aligns".
//   trA      - transcript built so far; taken by value, so every recursion
//              branch works on its own copy.
//   Lread    - read length used for the end-extension limits.
//   WA       - aligns of this window, indexed as WA[i][WA_*].
//   R, Q, G  - forwarded to extendAlign()/stitchAlignToTranscript(); Q is also
//              summed per read base as the score of the first align.
//              NOTE(review): presumably read sequence, per-base score and genome
//              sequence - confirm against the callers.
//   sigG     - not used here other than being forwarded to the recursive calls.
//   P, RA    - run parameters and per-read state (RA->maxScoreMate is updated).
//   wTr, nWinTr - output: array of transcript pointers for this window and the
//              number of valid entries in it.
void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \
                        uint Lread, uiWA* WA, char* R, char* Q, char* G, char* sigG,\
                        Parameters* P, Transcript** wTr, uint* nWinTr, ReadAlign *RA) {
    //recursively stitch aligns for one gene
    //*nWinTr - number of transcripts for the current window   
    
    if (iA>=nA && tR2==0) return; //no aligns in the transcript

    if (iA>=nA) {//no more aligns to add, finalize the transcript
        
        //extend first
        Transcript trAstep1;

        int vOrder[2]; //decide in which order to extend: extend the 5' of the read first

        #if EXTEND_ORDER==1
        if ( trA.roStr==0 ) {//decide in which order to extend: extend the 5' of the read first
            vOrder[0]=0; vOrder[1]=1;
        } else {
            vOrder[0]=1; vOrder[1]=0;
        };
        #elif EXTEND_ORDER==2
            vOrder[0]=0; vOrder[1]=1;
        #else
            #error "EXTEND_ORDER value unrecognized"
        #endif
        
        //run the two extensions (0 = at start, 1 = at end) in the order chosen above
        for (int iOrd=0;iOrd<2;iOrd++) {
            
            switch (vOrder[iOrd]) {
            
            case 0: //extend at start

            if (trA.rStart>0) {// if transcript does not start at base, extend to the read start 

                //calculate # of allowed mismatches that has been left
               
                
                //pMMmax is set to -1 instead of the configured ratio when alignEndsType is "Extend5pOfRead1"
                //and the first exon is on fragment 0 with Str==0
                double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[0][EX_iFrag]==0 && trA.Str==0) ? -1 : P->outFilterMismatchNoverLmax1;
                
                trAstep1.reset();
                //                                                            //avoid extending before Chr start
                if ( extendAlign(R, Q, G, trA.rStart-1, trA.gStart-1, -1, -1, min(trA.rStart, trA.gStart - P->chrStart[trA.Chr]), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend
        
                    trA.add(&trAstep1);
                    Score += trAstep1.maxScore;

                    //move the start of the transcript and of its first exon back by the extended length
                    trA.exons[0][EX_R] = trA.rStart = trA.rStart - trAstep1.extendL;
                    trA.exons[0][EX_G] = trA.gStart = trA.gStart - trAstep1.extendL;
                    trA.exons[0][EX_L] += trAstep1.extendL;                   
                    
                };                   
            //TODO penalize the unmapped bases at the start 
            };        
            break;
            
            case 1: //extend at end

            if ( tR2+1<Lread ) {//extend alignment to the read end

                //calculate # of allowed mismatches that has been left
                
                //same rule as for the start extension, but tested on the last exon
                double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[trA.nExons-1][EX_iFrag]==0 && trA.Str==0) ? -1 : P->outFilterMismatchNoverLmax1;
                
                trAstep1.reset();            
                //                                              //to prevent extension past the Chr end
                if ( extendAlign(R, Q, G, tR2+1, tG2+1, +1, +1, min(Lread-tR2-1,P->chrStart[trA.Chr+1]-tG2-2), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend
                    
                    trA.add(&trAstep1);
                    Score += trAstep1.maxScore;

                    tR2 += trAstep1.extendL;
                    tG2 += trAstep1.extendL;
                    
                    trA.exons[trA.nExons-1][EX_L] += trAstep1.extendL;//extend the length of the last exon

                };
            //TODO penalize unmapped bases at the end            
            };
        };
        };
        
        if (P->alignSoftClipAtReferenceEnds=="No" &&  \
                ( (trA.exons[trA.nExons-1][EX_G] + Lread-trA.exons[trA.nExons-1][EX_R]) > (P->chrStart[trA.Chr]+P->chrLength[trA.Chr]) || \
                   trA.exons[0][EX_G]<(P->chrStart[trA.Chr]+trA.exons[0][EX_R]) ) ) {
            return; //no soft clipping past the ends of the chromosome
        };
        
        
        //read-space length = sum of exon lengths; genome-space length = span from gStart to tG2
        trA.rLength = 0;
        for (uint isj=0;isj<trA.nExons;isj++) {
            trA.rLength += trA.exons[isj][EX_L];
        };
        trA.gLength = tG2+1-trA.gStart;
        
        //check exons lengths including repeats, do not report a transcript with short exons
        for (uint isj=0;isj<trA.nExons-1;isj++) {//check exons for min length, if they are not annotated and precede a junction
            if ( trA.canonSJ[isj]>=0 ) {//junction
                if (trA.sjAnnot[isj]==1) {//sjdb
                    if (  ( trA.exons[isj][EX_L]   < P->alignSJDBoverhangMin && (isj==0            || trA.canonSJ[isj-1]==-3 || (trA.sjAnnot[isj-1]==0 && trA.canonSJ[isj-1]>=0) ) )\
                       || ( trA.exons[isj+1][EX_L] < P->alignSJDBoverhangMin && (isj==trA.nExons-2 || trA.canonSJ[isj+1]==-3 || (trA.sjAnnot[isj+1]==0 && trA.canonSJ[isj+1]>=0) ) ) )return;
                } else {//non-sjdb
                    if (  trA.exons[isj][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][0] \
                       || trA.exons[isj+1][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][1]   ) return;
                };
            };
        };        
        if (trA.nExons>1 && trA.sjAnnot[trA.nExons-2]==1 && trA.exons[trA.nExons-1][EX_L] < P->alignSJDBoverhangMin) return; //this exon was not checked in the cycle above
        
        //count junctions by motif class: [0] for canonSJ==0, [1] for odd canonSJ>0, [2] for even canonSJ>0
        trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0;
        for (uint iex=0;iex<trA.nExons-1;iex++) {
            if (trA.canonSJ[iex]==0) {
                ++trA.intronMotifs[0]; 
            } else if (trA.canonSJ[iex]>0) {
                ++trA.intronMotifs[2-trA.canonSJ[iex]%2];
            };
        };

        //filter strand consistency
        trA.sjMotifStrand=0;
        uint sjN=0;
        for (uint iex=0;iex<trA.nExons-1;iex++) {
            if (trA.canonSJ[iex]>=0) sjN++;
            if (trA.sjStr[iex]>0) {//only these sjs have defined strand
                if (trA.sjMotifStrand==0) {
                    trA.sjMotifStrand=trA.sjStr[iex];
                } else if (trA.sjMotifStrand != trA.sjStr[iex]) {//inconsistent strand
                    return; //kill this transcript
                };  
            };
        };

        if (sjN>0 && trA.sjMotifStrand==0 && P->outSAMstrandField=="intronMotif") {//strand not defined for a junction
            return;
        };

        if (P->outFilterIntronMotifs=="None") {//no filtering

        } else if (P->outFilterIntronMotifs=="RemoveNoncanonical") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0) return;
            };
        } else if (P->outFilterIntronMotifs=="RemoveNoncanonicalUnannotated") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0 && trA.sjAnnot[iex]==0) return;
            };
        } else {
            ostringstream errOut;
            errOut << "EXITING because of FATAL INPUT error: unrecognized value of --outFilterIntronMotifs=" <<P->outFilterIntronMotifs <<"\n";
            errOut << "SOLUTION: re-run STAR with --outFilterIntronMotifs = None -OR- RemoveNoncanonical -OR- RemoveNoncanonicalUnannotated\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);        
        };

        {//check mapped length for each mate
            //exl accumulates exon lengths and nsj counts junctions (canonSJ>=0) of the current mate;
            //both are reset at a mate boundary (canonSJ==-3) or at the last exon
            uint nsj=0,exl=0;
            for (uint iex=0;iex<trA.nExons;iex++) {//
                exl+=trA.exons[iex][EX_L];
                if (iex==trA.nExons-1 || trA.canonSJ[iex]==-3) {//mate is completed, make the checks
                    if (nsj>0 && (exl<P->alignSplicedMateMapLmin || exl < (uint) (P->alignSplicedMateMapLminOverLmate*RA->readLength[trA.exons[iex][EX_iFrag]])) ) {
                        return; //do not record this transcript
                    };
                    exl=0;nsj=0;
                } else if (trA.canonSJ[iex]>=0) {
                    nsj++;
                };
            };
        };
        
        if (P->outFilterBySJoutStage==2) {//junctions have to be present in the filtered set P->sjnovel
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]>=0 && trA.sjAnnot[iex]==0) {
                    //intron start/end = first base after exon iex / last base before exon iex+1
                    uint jS=trA.exons[iex][EX_G]+trA.exons[iex][EX_L];
                    uint jE=trA.exons[iex+1][EX_G]-1;
                    if ( binarySearch2(jS,jE,P->sjNovelStart,P->sjNovelEnd,P->sjNovelN) < 0 ) return;
                };
            };
            
        };        
        
        if ( trA.exons[0][EX_iFrag]!=trA.exons[trA.nExons-1][EX_iFrag] ) {//check for correct overlap between mates
            if (trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] <= trA.exons[0][EX_G]) return; //to avoid negative insert size
            uint iexM2=trA.nExons;
            for (uint iex=0;iex<trA.nExons-1;iex++) {//find the first exon of the second mate
                if (trA.canonSJ[iex]==-3) {//        
                    iexM2=iex+1;
                    break;
                };
            };
            
            if ( trA.exons[iexM2-1][EX_G] + trA.exons[iexM2-1][EX_L] > trA.exons[iexM2][EX_G] ) {//mates overlap - check consistency of junctions
                
                if (trA.exons[0][EX_G] > trA.exons[iexM2][EX_G]+trA.exons[0][EX_R]) return; //LeftMateStart > RightMateStart
                if (trA.exons[iexM2-1][EX_G]+trA.exons[iexM2-1][EX_L] > trA.exons[trA.nExons-1][EX_G]+Lread-trA.exons[trA.nExons-1][EX_R]) return; //LeftMateEnd   > RightMateEnd
                
                //check for junctions consistency
                uint iex1=1, iex2=iexM2+1; //last exons of the junction
                for  (; iex1<iexM2; iex1++) {//find first junction that overlaps 2nd mate
                    if (trA.exons[iex1][EX_G] >= trA.exons[iex2-1][EX_G] + trA.exons[iex2-1][EX_L]) break;
                };
                while (iex1<iexM2 && iex2<trA.nExons) {//cycle through all overlapping exons
                    if (trA.canonSJ[iex1-1]<0) {//skip non-junctions
                        iex1++;
                        continue;
                    };
                    if (trA.canonSJ[iex2-1]<0) {//skip non-junctions
                        iex2++;
                        continue;
                    };
                    
                    //both junctions must have the same acceptor start and the same donor end
                    if ( ( trA.exons[iex1][EX_G]!=trA.exons[iex2][EX_G] ) || ( (trA.exons[iex1-1][EX_G]+trA.exons[iex1-1][EX_L]) != (trA.exons[iex2-1][EX_G]+trA.exons[iex2-1][EX_L]) ) ) {
                        return; //inconsistent junctions on overlapping mates
                    };
                    iex1++;
                    iex2++;
                    
                };//cycle through all overlapping exons
            };//mates overlap - check consistency of junctions
        };//check for correct overlap between mates
        
        if (P->scoreGenomicLengthLog2scale!=0) {//add gap length score
            //the score term is log2 of the genomic span (first exon start to last exon end) times the scale;
            //the total is then clamped at 0
            Score += int(ceil( log2( (double) ( trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] - trA.exons[0][EX_G]) ) \
                     * P->scoreGenomicLengthLog2scale - 0.5));
            Score = max(0,Score);
        };
        
        //calculate some final values for the transcript
        trA.roStart = (trA.roStr == 0) ? trA.rStart : Lread - trA.rStart - trA.rLength;     
        trA.maxScore=Score;
        
        if (trA.exons[0][EX_iFrag]==trA.exons[trA.nExons-1][EX_iFrag]) {//mark single fragment transcripts
            trA.iFrag=trA.exons[0][EX_iFrag];
            RA->maxScoreMate[trA.iFrag] = max (RA->maxScoreMate[trA.iFrag] , Score);
        } else {
            trA.iFrag=-1;
        };

        if (       Score+P->outFilterMultimapScoreRange >= wTr[0]->maxScore \
                || ( trA.iFrag>=0 && Score+P->outFilterMultimapScoreRange >= RA->maxScoreMate[trA.iFrag] ) \
                || P->chimSegmentMin>0) {
                //only record the transcripts within the window that are in the Score range
                //OR within the score range of each mate
                //OR all transcript if chimeric detection is activated
            
            if (P->outFilterMismatchNoverLmax1<0) {//check that the alignment is end-to-end
                uint rTotal=trA.rLength+trA.lIns;
//                 for (uint iex=1;iex<trA.nExons;iex++) {//find the inside exons
//                     rTotal+=trA.exons[iex][EX_R]-trA.exons[iex-1][EX_R];
//                 };                
                if ( (trA.iFrag<0 && rTotal<(RA->readLength[0]+RA->readLength[1])) || (trA.iFrag>=0 && rTotal<RA->readLength[trA.iFrag])) return;
            };
            
            uint iTr=0; //transcript insertion/replacement place
          
            trA.mappedLength=0;
            for (uint iex=0;iex<trA.nExons;iex++) {//calculate total mapped length
                trA.mappedLength += trA.exons[iex][EX_L];
            };
            
            //The loop ends with iTr==*nWinTr when the new transcript should be recorded,
            //and with iTr<*nWinTr (via break) when it is a lower-scoring subset of a recorded one.
            while (iTr < *nWinTr) {//scan through all recorded transcripts for this window - check for duplicates

                //another way to calculate uOld, uNew: w/o gMap
                //uNew / uOld = mapped bases of the new / old transcript not shared with the other one
                uint nOverlap=blocksOverlap(trA,*wTr[iTr]);
                uint uNew=trA.mappedLength-nOverlap;
                uint uOld=wTr[iTr]->mappedLength-nOverlap;
                
                if (uNew==0 && Score < wTr[iTr]->maxScore) {//new transcript is a subset of the old ones
                    break;
                } else if (uOld==0) {//old transcript is a subset of the new one, remove old transcript
                    //the removed object is parked at the end of the used range so its storage can be reused;
                    //iTr is not advanced because a different transcript now sits at this index
                    Transcript *pTr=wTr[iTr];
                    for  (uint ii=iTr+1;ii<*nWinTr;ii++) wTr[ii-1]=wTr[ii]; //shift transcripts                    
                    (*nWinTr)--;
                    wTr[*nWinTr]=pTr;
                } else if (uOld>0 && (uNew>0 || Score >= wTr[iTr]->maxScore) ) {//check next transcript
                    iTr++;
                };
                
            };
            
            if (iTr==*nWinTr) {//insert the new transcript
                for (iTr=0;iTr<*nWinTr;iTr++) {//find insertion location
                    if (Score>wTr[iTr]->maxScore || (Score==wTr[iTr]->maxScore && trA.gLength<wTr[iTr]->gLength) ) break;
                };

                //reuse the Transcript object currently pointed to by wTr[*nWinTr] as storage for the new entry
                Transcript *pTr=wTr[*nWinTr];
                for (int ii=*nWinTr; ii> int(iTr); ii--) {//shift all the transcript pointers down from iTr
                    wTr[ii]=wTr[ii-1];
                };
                wTr[iTr]=pTr; //the new transcript pointer is now at *nWinTr+1, move it into the iTr
                *(wTr[iTr])=trA;
                if (*nWinTr<P->alignTranscriptsPerWindowNmax) {
                    (*nWinTr)++; //increment number of transcripts per window;
                } else {
                        //the count is not incremented here, so the entry shifted to index *nWinTr is no longer counted
                        //"WARNING: too many recorded transcripts per window: iRead="<<RA->iRead<< "\n";
                };                
            };
        };

                
        return;
    };

    ///////////////////////////////////////////////////////////////////////////////////
    //recursive step: try to include align iA, then (if allowed) try to exclude it
    int dScore=0;
    Transcript trAi=trA; //trA copy with this align included, to be used in the 1st recursive call of StitchAlign
    if (trA.nExons>0) {//stitch, a transcript has already been originated

        dScore=stitchAlignToTranscript(tR2, tG2, WA[iA][WA_rStart], WA[iA][WA_gStart], WA[iA][WA_Length], WA[iA][WA_iFrag],  WA[iA][WA_sjA], P, R, Q, G, &trAi, RA->outFilterMismatchNmaxTotal);        
        //TODO check if the new stitching creates too many MM, quit this transcript if so
        
    } else { //this is the first align in the transcript
            trAi.exons[0][EX_R]=trAi.rStart=WA[iA][WA_rStart]; //transcript start/end
            trAi.exons[0][EX_G]=trAi.gStart=WA[iA][WA_gStart];
            trAi.exons[0][EX_L]=WA[iA][WA_Length];
            trAi.exons[0][EX_iFrag]=WA[iA][WA_iFrag];
            trAi.exons[0][EX_sjA]=WA[iA][WA_sjA];
            
            trAi.nExons=1; //recorded first exon
            
            for (uint ii=0;ii<WA[iA][WA_Length];ii++) dScore+=int(Q [ WA[iA][WA_rStart] + ii ]); //sum all the scores           
       
            trAi.nMatch=WA[iA][WA_Length]; //# of matches
            
            //starting a new transcript: clear all inclusion flags
            for (uint ii=0; ii<nA; ii++) WAincl[ii]=false;

        
    };
    
    //stitchAlignToTranscript signals failure with values below -1000000 (e.g. -1000001 ... -1000006)
    if (dScore>-1000000) {//include this align
        WAincl[iA]=true;

        if ( WA[iA][WA_Nrep]==1 ) trAi.nUnique++; //unique piece
        if ( WA[iA][WA_Anchor]>0 ) trAi.nAnchor++; //anchor piece piece     
       
        //the new transcript end is the last read/genome base of align iA
        stitchWindowAligns(iA+1, nA, Score+dScore, WAincl, WA[iA][WA_rStart]+WA[iA][WA_Length]-1, WA[iA][WA_gStart]+WA[iA][WA_Length]-1, trAi, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA);
    } else {

    };   
    
    //also run a transcript w/o including this align
    if (WA[iA][WA_Anchor]!=2 || trA.nAnchor>0) {//only allow exclusion if this is not the last anchor, or other anchors have been used
        WAincl[iA]=false;
        //the original trA (without align iA) and the unchanged Score/tR2/tG2 are passed on
        stitchWindowAligns(iA+1, nA, Score, WAincl, tR2, tG2, trA, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA);       
    };
    return;
};
intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, Parameters* P, char* R, char* Q, char* G,  Transcript *trA, const uint outFilterMismatchNmaxTotal) {
    //stitch together A and B, extend in the gap, returns max score
    //Q is assumed modified already

    int Score=0;
//     int score2;
            
    if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]==sjAB \
            && trA->exons[trA->nExons-1][EX_iFrag]==iFragB && rBstart==rAend+1 && gAend+1<gBstart ) {//simple stitching if junction belongs to a database 
        if (P->sjdbMotif[sjAB]==0 && (L<=P->sjdbShiftRight[sjAB] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjAB]) ) {
            return -1000006; //too large repeats around non-canonical junction
        };
        trA->exons[trA->nExons][EX_L] = L; //new exon length
        trA->exons[trA->nExons][EX_R] = rBstart; //new exon r-start
        trA->exons[trA->nExons][EX_G] = gBstart; //new exon g-start
        trA->canonSJ[trA->nExons-1]=P->sjdbMotif[sjAB]; //mark sj-db
        trA->shiftSJ[trA->nExons-1][0]=P->sjdbShiftLeft[sjAB];
        trA->shiftSJ[trA->nExons-1][1]=P->sjdbShiftRight[sjAB];
        trA->sjAnnot[trA->nExons-1]=1;
        trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjAB];;
        trA->nExons++;            
        trA->nMatch+=L;
        for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions    
        Score+=P->sjdbScore;
    } else {//general stitching       
        trA->sjAnnot[trA->nExons-1]=0;
        trA->sjStr[trA->nExons-1]=0;
        
        if (trA->exons[trA->nExons-1][EX_iFrag]==iFragB) {//stitch aligns on the same fragment
            uint gBend=gBstart+L-1;
            uint rBend=rBstart+L-1;    

//             {//debug 
//                 if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]!=((uint) -1) && rBend<=rAend) {//
//                     Score -= rAend-rBstart+1;
//                     gAend -= rAend-rBstart+1;
//                     rAend = rBstart-1;
//                     trA->exons[trA->nExons-1][EX_L] =rAend-trA->exons[trA->nExons-1][EX_R]+1;
//                 };
//             };
            
            //check if r-overlapping fully and exit
            if (rBend<=rAend) return -1000001; 
            if (gBend<=gAend && trA->exons[trA->nExons-1][EX_iFrag]==iFragB) return -1000002; 

            //shift the B 5' if overlaps A 3'
            if (rBstart<=rAend) {
                gBstart+=rAend-rBstart+1;        
                rBstart=rAend+1;
                L=rBend-rBstart+1;
            };  

            for (uint ii=rBstart;ii<=rBend;ii++) Score+=int(Q[ii]); //add QS for mapped portions               

            int gGap=gBstart-gAend-1; //could be < 0 for insertions      
            int rGap=rBstart-rAend-1;//>0 always since we removed overlap

            uint nMatch=L;
            uint nMM=0;
            uint Del=0, Ins=0;
            uint nIns=0, nDel=0;
            int jR=0; //junction location in R-space
            int jCan=999; //canonical junction type
            uint gBstart1=gBstart-rGap-1;//the last base of the intron if all read gap belongs to acceptor, i.e. jR=0
            

            // check all the different combinations of gGap and rGap
            if ( gGap==0 && rGap==0 ) {//just joined the pieces, w/o stiching or gaps
                //do nothing for now
            } else if ( gGap>0 && rGap>0 && rGap==gGap ) {//no gaps, just try to fill space
                //simple stitching, assuming no insertion in the read

                for (int ii=1;ii<=rGap;ii++) {
                    if (G[gAend+ii]<4 && R[rAend+ii]<4) {//only score genome bases that are not Ns
                        if ( R[rAend+ii]==G[gAend+ii] ) {                    
                            Score+=int(Q[rAend+ii]);
                            nMatch++;
    //                         if (Q[rAend+ii]>=P->Qgood) nMatchGood++;
                        } else {
                            Score-=int(Q[rAend+ii]);
//                             trA->rMM[trA->nMM + nMM] = rAend+ii;                
                            nMM++;
    //                         if (Q[rAend+ii]>=P->Qgood) nMMgood++;                        
                        };
                    };
                }; 

            } else if ( gGap>rGap ) {//genomic gap (Deletion)

                nDel=1;
                Del=gGap-rGap; //gGap>0 here
                
                if (Del>P->alignIntronMax && P->alignIntronMax>0) {
                    return -1000003; //large gaps not allowed
                };

                int Score1=0;
                int jR1=1; //junction location in R-space        
                do { // 1. move left, until the score for MM is less than canonical advantage
                    jR1--;
                    if ( R[rAend+jR1]!=G[gBstart1+jR1] && G[gBstart1+jR1]<4 && R[rAend+jR1]==G[gAend+jR1]) Score1 -= int(Q[rAend+jR1]);
                }  while ( Score1+P->scoreStitchSJshift >= 0 && int(trA->exons[trA->nExons-1][EX_L]) + jR1 > 1);//>=P->alignSJoverhangMin); //also check that we are still within the exon

                int maxScore2=-999999;
                Score1=0;
                int jPen=0;
                do { // 2. scan to the right to find the best junction locus
                    // ?TODO? if genome base is N, how to score?
                    if  ( R[rAend+jR1]==G[gAend+jR1] && R[rAend+jR1]!=G[gBstart1+jR1] )  Score1+=int(Q[rAend+jR1]);          
                    if  ( R[rAend+jR1]!=G[gAend+jR1] && R[rAend+jR1]==G[gBstart1+jR1] )  Score1-=int(Q[rAend+jR1]);

                    int jCan1=-1; //this marks Deletion
                    int jPen1=0;
                    int Score2=Score1;

                    if (Del>=P->alignIntronMin) {//only check intron motif for large gaps= non-Dels
                        //check if the intron is canonical, or semi-canonical
                        if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GTAG
                            jCan1=1;
                        } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//CTAC
                            jCan1=2;
                        } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==1 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GCAG
                            jCan1=3;
                            jPen1=P->scoreGapGCAG;
                        } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==2 && G[gBstart1+jR1]==1 ) {//CTGC
                            jCan1=4;
                            jPen1=P->scoreGapGCAG;
                        } else if ( G[gAend+jR1+1]==0 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//ATAC
                            jCan1=5;
                            jPen1=P->scoreGapATAC;                        
                        } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==3 ) {//GTAT
                            jCan1=6;             
                            jPen1=P->scoreGapATAC;
                        } else {
                            jCan1=0;
                            jPen1=P->scoreGapNoncan;
                        };

                        Score2 += jPen1;
                    };

                    if (maxScore2 < Score2 ) {//check if the score is the highest. TODO: record the next highest score
                        maxScore2=Score2;
                        jR=jR1; //this is the last base of donor
                        jCan=jCan1;
                        jPen=jPen1;
                    };            
                        jR1++;
                } while ( jR1 < int(rBend) - int(rAend) );// - int(P->alignSJoverhangMin) );//TODO: do not need to search the full B-transcript, can stop as soon as Score goes down by more than

                //repeat length: go back and forth around jR to find repeat length
                uint jjL=0,jjR=0;
                while ( gAend+jR>=jjL && G[gAend-jjL+jR]==G[gBstart1-jjL+jR] && G[gAend-jjL+jR]<4 && jjL<=MAX_SJ_REPEAT_SEARCH) {//go back
                    jjL++;
                };

                while ( gAend+jjR+jR+1<P->nGenome && G[gAend+jjR+jR+1]==G[gBstart1+jjR+jR+1] && G[gAend+jjR+jR+1]<4 && jjR<=MAX_SJ_REPEAT_SEARCH) {//go forward
                    jjR++;
                };

                if (jCan<=0) {//flush deletions and non-canonical junction to the left
                    jR-=jjL;
                    if (int(trA->exons[trA->nExons-1][EX_L])+jR<1) return -1000005;
                    jjR+=jjL;
                    jjL=0;
                };

                //TODO check here if the internal exon length < minDa, if so exit w/o stitiching            

                for (int ii=min(1,jR+1);ii<=max(rGap,jR);ii++) {//score donor and acceptor
                    uint g1=(ii<=jR) ? (gAend+ii):(gBstart1+ii);
                    if (G[g1]<4 && R[rAend+ii]<4) {//only penalize non-N bases
                        if ( R[rAend+ii]==G[g1] ) {
                            if (ii>=1 && ii <=rGap) {//only add +score and matches within the gap
                                Score+=int(Q[rAend+ii]);
                                nMatch++;
                            };
                        } else {//add -score and MM for all bases
                            Score-=int(Q[rAend+ii]); 
                            nMM++;
                            if (ii<1 || ii>rGap) {//subtract previuosly presumed matches
                                Score-=int(Q[rAend+ii]);                         
                                nMatch--;
//                                 if (ii<=jR) nMM--;
                            };
                        };
                    };
                };           

                //score the gap
                if (P->sjdbN>0) {//check if the junction is annotated
                        uint jS=gAend+jR+1, jE=gBstart1+jR;//intron start/end
                        int sjdbInd=binarySearch2(jS,jE,P->sjdbStart,P->sjdbEnd,P->sjdbN);
                        if (sjdbInd<0) {
                            if (Del>=P->alignIntronMin) {
                                Score += P->scoreGap + jPen; //genome gap penalty + non-canonical penalty  
                            } else {//deletion
                                Score += Del*P->scoreDelBase + P->scoreDelOpen;
                                jCan=-1;
                                trA->sjAnnot[trA->nExons-1]=0;
//                                 jjR-=jjL;
//                                 jR-=jjL;
//                                 jjL=0;
//                                 trA->shiftSJ[trA->nExons-1][0]=0;
//                                 trA->shiftSJ[trA->nExons-1][1]=jjR;                                  
                            };
                        } else {//annotated
                            jCan=P->sjdbMotif[sjdbInd];
                            if (P->sjdbMotif[sjdbInd]==0) {//shift to match annotations
                                if (L<=P->sjdbShiftLeft[sjdbInd] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjdbInd]) {
                                    return -1000006;
                                };
                                jR += (int) P->sjdbShiftLeft[sjdbInd];
                                jjL=P->sjdbShiftLeft[sjdbInd];
                                jjR=P->sjdbShiftRight[sjdbInd];
                            };
                            trA->sjAnnot[trA->nExons-1]=1;
                            trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjdbInd];
                            Score += P->sjdbScore;                           
                        };                        
                } else {//no annotation
                    if (Del>=P->alignIntronMin) {//junction, not short deletion
                        Score += P->scoreGap + jPen;
                    } else {
                        Score += Del*P->scoreDelBase + P->scoreDelOpen;       
                        jCan=-1;
                        trA->sjAnnot[trA->nExons-1]=0;
                    };
                };
                
                trA->shiftSJ[trA->nExons-1][0]=jjL;
                trA->shiftSJ[trA->nExons-1][1]=jjR;            
                trA->canonSJ[trA->nExons-1]=jCan;                

                if (trA->sjAnnot[trA->nExons-1]==0) {//strand for unannotated junctions
                    if (jCan>0) {
                         trA->sjStr[trA->nExons-1]=2-jCan%2; //1=+,2=-
                    } else {
                         trA->sjStr[trA->nExons-1]=0;
                    };
                };
                
            } else if ( rGap>gGap ) {//insertion: if also gGap>0, need to stitch 
                Ins=rGap-gGap;
                nIns=1;
                if (gGap==0) {//simple insertion, no need to stitch
                    jR=0; 
                } else if (gGap<0) {//reduce the score

                    jR=0;
                    for (int ii=0; ii<-gGap; ii++) Score -= int(Q[rBstart+ii]);

                } else {//stitch: define the exon boundary jR               
                    int Score1=0; int maxScore1=0;                
                    for (int jR1=1;jR1<=gGap;jR1++) {//scan to the right to find the best score

                        if (G[gAend+jR1]<4) {//only penalize goog genome bases
                            Score1+=( R[rAend+jR1]==G[gAend+jR1] ) ? int(Q[rAend+jR1]):-int(Q[rAend+jR1]);
                            Score1+=( R[rAend+Ins+jR1]==G[gAend+jR1] ) ? -int(Q[rAend+Ins+jR1]):+int(Q[rAend+Ins+jR1]);
                        };

                        if (Score1>maxScore1) {
                            maxScore1=Score1;
                            jR=jR1;
                        };                 
                    };


                    for (int ii=1;ii<=gGap;ii++) {//score donor and acceptor
                        uint r1=rAend+ii+(ii<=jR ? 0:Ins);
                        if (G[gAend+ii]<4 && R[r1]<4) {
                            if ( R[r1]==G[gAend+ii] ) {
                                Score+=int(Q[r1]);
                                nMatch++;
                            } else {//add -score and MM for all bases
                                Score-=int(Q[r1]); 
                                nMM++;
                            };
                        };               
                    };
                };

                Score += Ins*P->scoreInsBase + P->scoreInsOpen;
                jCan=-3;
            }; //different types of gaps selection

            
            
        #ifdef COMPILE_FOR_LONG_READS
            if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal )
//             if ( Score>0 && nMM<=200 )
                
        #else
            if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal  \
                         && ( jCan<0 || (jCan<7 && nMM<= (uint) P->alignSJstitchMismatchNmax[(jCan+1)/2]) ) ) 
        #endif
            {//stitching worked only if there no mis-matches for non-GT/AG junctions
                trA->nMM += nMM;
                trA->nMatch += nMatch;

                if (Del>=P->alignIntronMin) {
                    trA->nGap += nDel;
                    trA->lGap += Del;
                } else {
                    trA->nDel += nDel;
                    trA->lDel += Del;
                };

                //modify exons
                if (Del==0 && Ins==0) {//no gap => no new exon, extend the boundary of the previous exon
                    trA->exons[trA->nExons-1][EX_L] += rBend-rAend;
                } else if (Del>0) { //deletion:ca only have Del> or Ins>0
                    trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary
                    trA->exons[trA->nExons][EX_L] = rBend-rAend-jR; //new exon length
                    trA->exons[trA->nExons][EX_R] = rAend+jR+1; //new exon r-start
                    trA->exons[trA->nExons][EX_G] = gBstart1+jR+1; //new exon g-start            
                    trA->nExons++;
                } else if (Ins>0) { //Ins>0;
                    trA->nIns += nIns;        
                    trA->lIns += Ins;
                    trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary
                    trA->exons[trA->nExons][EX_L] = rBend-rAend-jR-Ins; //new exon length
                    trA->exons[trA->nExons][EX_R] = rAend+jR+Ins+1; //new exon r-start
                    trA->exons[trA->nExons][EX_G] = gAend+1+jR; //new exon g-start
                    trA->canonSJ[trA->nExons-1]=-2; //mark insertion
                    trA->sjAnnot[trA->nExons-1]=0;
                    trA->nExons++;            
                };
            } else {//stitching was not accepted
                return -1000007;
            };
        } else if (gBstart+trA->exons[0][EX_R] >= trA->exons[0][EX_G] || trA->exons[0][EX_G] < trA->exons[0][EX_R]){//if (iFragA==iFragB) stitch aligns from different fragments

            if (P->alignMatesGapMax>0 && gBstart > trA->exons[trA->nExons-1][EX_G] + trA->exons[trA->nExons-1][EX_L] + P->alignMatesGapMax) {
                return -1000004; //gap between mates too large
            };
            //extend the fragments inside
            //note, that this always works, i.e. Score>0

            for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions                           
            
            Transcript trExtend;

            //TODO: compare extensions to the left and right, pick the best one to be performed first
            //otherwise if a large nMM is reached in the 2st extension, it will prevent the 2nd extension
            //use the following example:
            //>1
            //TTCTGTGTCTCCCCCTCCCCCACTGGCTACATGGAGACAGGGGGGGGGGGCCGGGCGGTTCCCGGGCAGAAAAAAA
            //>1
            //AATATTTGGAACACTTATGTGAAAAATGATTTGTTTTTCTGAAATTTACGTTTCTCTCTGAGTCCTGTAACTGTCC

            
            trExtend.reset();
            if ( extendAlign(R, Q, G, rAend+1, gAend+1, 1, 1, DEF_readSeqLengthMax, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \
                             P->alignEndsType.ext[trA->exons[trA->nExons-1][EX_iFrag]][1], &trExtend) ) {

                trA->add(&trExtend);
                Score += trExtend.maxScore;

                trA->exons[trA->nExons-1][EX_L] += trExtend.extendL;
            };// if extendAlign for read A

            trA->exons[trA->nExons][EX_R] = rBstart;
            trA->exons[trA->nExons][EX_G] = gBstart;
            trA->exons[trA->nExons][EX_L] = L;
            trA->nMatch += L;

            trExtend.reset();
            //if ( extendAlign(R, Q, G, rBstart-1, gBstart-1, -1, -1, gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R], trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, 
            //if end extension needs to be forced, use large length. Otherwise, only extend until the beginning of the transcript
            uint extlen=P->alignEndsType.ext[iFragB][1] ? DEF_readSeqLengthMax : gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R];
            if ( extendAlign(R, Q, G, rBstart-1, gBstart-1, -1, -1, extlen, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \
                             P->alignEndsType.ext[iFragB][1], &trExtend) ) {

                trA->add(&trExtend);
                Score += trExtend.maxScore;               

                trA->exons[trA->nExons][EX_R] -= trExtend.extendL;
                trA->exons[trA->nExons][EX_G] -= trExtend.extendL;
                trA->exons[trA->nExons][EX_L] += trExtend.extendL;
            }; //if extendAlign B

            trA->canonSJ[trA->nExons-1]=-3; //mark different fragments junction
            trA->sjAnnot[trA->nExons-1]=0;

            trA->nExons++;        
        } else {//no stitching possible
            return -1000008;
        };
    };
    
    trA->exons[trA->nExons-1][EX_iFrag]=iFragB; //the new exon belongs to fragment iFragB
    trA->exons[trA->nExons-1][EX_sjA]=sjAB;     

    return Score;         
};
Beispiel #7
0
/**
 * Insert splice-junction sequences into already existing genome indices.
 *
 * The function
 *   1. finishes the junction text Gsj (spacers, end marker, reverse complement in the 2nd half),
 *   2. searches the old suffix array SA for the insertion point of every junction suffix,
 *   3. sorts the new suffixes and merges them with the old SA into SA2,
 *   4. patches the SAi prefix index in place,
 *   5. re-points SA to the storage of SA2 and copies the junction text into G.
 *
 * @param P    parameters of the new junction set (reads sjdbN, sjdbLength, sjdbStart, sjdbEnd,
 *             chrStart, nChrReal, GstrandBit, genomeSAindex*, SAiMark*); this function updates
 *             nGenome, nSA, nSAbyte and sjGstart.
 * @param P1   parameters of the previous state; only sjdbN, sjdbStart, sjdbEnd, nSA and nGenome
 *             are read here.
 * @param Gsj  junction text. Indices 0..2*sjdbLength*sjdbN are written, so the buffer has to hold
 *             at least 2*sjdbLength*sjdbN+1 chars. The first half is expected to be filled by the
 *             caller; the second half is generated here.
 * @param G    genome text; the first sjdbLength*sjdbN chars of Gsj are copied to
 *             G+chrStart[nChrReal] at the end.
 * @param SA   old suffix array (read); on return it shares the char array of SA2.
 * @param SA2  new suffix array. defineBits() is called here; presumably the caller provides the
 *             storage - TODO confirm against the caller.
 * @param SAi  prefix index, modified in place.
 *
 * On a GstrandBit mismatch the function reports a fatal error through exitWithError().
 */
void sjdbBuildIndex (Parameters *P, Parameters *P1, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray &SAi) {
    
    #define SPACER_CHAR GENOME_spacingChar

    if (P->sjdbN==0)
    {//no junctions to insert
        return;
    };
    
    time_t rawtime;
    time ( &rawtime );
    P->inOut->logMain   << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl;    
    *P->inOut->logStdOut  << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl;
    
    //total length of all junction sequences on one strand
    uint nGsj=P->sjdbLength*P->sjdbN;
    for (uint ii=1; ii<=P->sjdbN; ii++) 
    {//the last char of every junction sequence is a spacer
        Gsj[ii*P->sjdbLength-1]=SPACER_CHAR; //to make sure this is > than any genome char
    };
    Gsj[nGsj*2]=SPACER_CHAR+1;//mark the end of the text

    for (uint ii=0; ii<nGsj; ii++) {//reverse complement junction sequences
        //codes <4 are complemented as 3-code, all other codes are copied unchanged
        Gsj[nGsj*2-1-ii]=Gsj[ii]<4 ? 3-Gsj[ii] : Gsj[ii]; //reverse complement
    };

    //second text, filled by complementSeqNumbers from Gsj; passed to the SA search together with Gsj
    char* G1c=new char[nGsj*2+1];
    complementSeqNumbers(Gsj, G1c, nGsj*2+1);

    //oldSJind[old junction index] = index of the same junction in the new junction list
    uint32* oldSJind=new uint32[P1->sjdbN];
    
//     uint nIndicesSJ1=P->sjdbOverhang;
    uint   nIndicesSJ1=P->sjdbLength;//keep all indices - this is pre-2.4.1 of generating the genome
    
    //indArray holds pairs: [2*k] = SA insertion point (or -1 if not inserted), [2*k+1] = suffix start inside Gsj
    uint64* indArray=new uint64[2*P->sjdbN*(nIndicesSJ1+1)*2];//8+4 bytes for SA index and index in the genome * nJunction * nIndices per junction * 2 for reverse compl
    uint64 sjNew=0;
    #pragma omp parallel num_threads(P->runThreadN)
    #pragma omp for schedule (dynamic,1000) reduction(+:sjNew)
    for (uint isj=0; isj<2*P->sjdbN; isj++) {//find insertion points for each of the sequences

        //the two texts for this junction sequence. The pointer pair lives on the stack:
        //it used to be allocated with new char*[2] in every iteration and was never released.
        char* seqPair[2];
        seqPair[0]=Gsj+isj*P->sjdbLength;
        seqPair[1]=G1c+isj*P->sjdbLength;
        char** seq1=seqPair;
        
        //junction index on the forward strand: the second half of Gsj is stored in reverse order
        uint isj1=isj<P->sjdbN ? isj : 2*P->sjdbN-1-isj;
        //look the junction up among the old junctions; negative result = not present
        int sjdbInd = P1->sjdbN==0 ? -1 : binarySearch2(P->sjdbStart[isj1],P->sjdbEnd[isj1],P1->sjdbStart,P1->sjdbEnd,P1->sjdbN);
        if (sjdbInd<0) 
        {//count new junctions
            ++sjNew;
        } else 
        {//record new index of the old junctions
            //both strands of a junction map to the same isj1 and store the same value here
            oldSJind[sjdbInd]=isj1;
        };
        
        for (uint istart1=0; istart1<nIndicesSJ1;istart1++) {
            
            uint istart=istart1;
//             uint istart=isj<P->sjdbN ? istart1 : istart1+1; //for rev-compl junction, shift by one base to start with the 1st non-spacer base
            uint ind1=2*(isj*nIndicesSJ1+istart1);
            if (sjdbInd>=0 || seq1[0][istart]>3) 
            {//no index for already included junctions, or suffices starting with N
                indArray[ind1]=-1;
            } else 
            {
                //indArray[ind1] =  suffixArraySearch(seq1, istart, P->sjdbLength-istart1, G, SA, true, 0, P->nSA-1, 0, P) ;
                indArray[ind1] =  suffixArraySearch(seq1, istart, 10000, G, SA, true, 0, P->nSA-1, 0, P) ;
                indArray[ind1+1] = isj*P->sjdbLength+istart;
            };
        };
    };
//     for (int ii=0;ii<P1->sjdbN;ii++) {if ( oldSJind[ii]==0){cout <<ii<<endl;};};
    sjNew = sjNew/2;//novel junctions were double counted on two strands
    
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SA search: number of new junctions=" << sjNew <<", old junctions="<<P->sjdbN-sjNew<<endl;
    
    uint nInd=0;//true number of new indices
    for (uint ii=0; ii<2*P->sjdbN*nIndicesSJ1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above
        if (indArray[ii*2]!= (uint) -1) {
            //compact the kept pairs towards the beginning of the array
            indArray[nInd*2]=indArray[ii*2];
            indArray[nInd*2+1]=indArray[ii*2+1];
            ++nInd;
        };
    };

    //the comparison function reads the junction text through this global
    globalGsj=Gsj;
    qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes);
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished sorting SA indicesL nInd="<<nInd <<endl;

    //sentinel pair behind the last valid entry: terminates the insertion loop in the SA merge below
    indArray[2*nInd]=-999; //mark the last junction
    indArray[2*nInd+1]=-999; //mark the last junction
    
    P->nGenome=P->chrStart[P->nChrReal]+nGsj;    
    P->nSA+=nInd;
    
    //number of bits needed for the new genome length, but not less than 32
    uint GstrandBit1 = (uint) floor(log(P->nGenome)/log(2))+1;
    if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA
    if ( GstrandBit1 != P->GstrandBit) 
    {//too many junctions were added - GstrandBit changed
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: cannot insert junctions on the fly because of strand GstrandBit problem\n";
        errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };
    
    SA2.defineBits(P->GstrandBit+1,P->nSA);
    uint nGsjNew=sjNew*P->sjdbLength; //this is the actual number of bytes added to the genome, while nGsj is the total size of all junctions
    
    //bit GstrandBit of an SA value flags the reverse strand; strandMask clears that bit
    uint N2bit= 1LLU << P->GstrandBit;
    uint strandMask=~N2bit;
    
    //testing
//     PackedArray SAo;
//     SAo.defineBits(P->GstrandBit+1,P->nSA);
//     SAo.allocateArray();
//     ifstream oldSAin("./DirTrue/SA");
//     oldSAin.read(SAo.charArray,SAo.lengthByte);
//     oldSAin.close();
    
    
    //merge: copy every old SA entry (shifted where needed) and insert the new junction suffixes behind it
    uint isj=0, isa2=0;
    for (uint isa=0;isa<P1->nSA;isa++) {
        //testing
//         if (isa2>0 && SA2[isa2-1]!=SAo[isa2-1]) {
//             cout <<isa2 <<" "<< SA2[isa2-1]<<" "<<SAo[isa2-1]<<endl;
//         };        

//         if (isa==69789089)
//      	{ 
//           cout <<isa;
//         };

        uint ind1=SA[isa];
        
        if ( (ind1 & N2bit)>0 ) 
        {//- strand
            //convert to a position counted from the start of the old genome
            uint ind1s = P1->nGenome - (ind1 & strandMask);
            if (ind1s>P->chrStart[P->nChrReal])
            {//this index was an old sj, may need to shift it
                uint sj1 = (ind1s-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index
                ind1s += (oldSJind[sj1]-sj1)*P->sjdbLength;
                ind1 = (P->nGenome - ind1s) | N2bit;
            } else
            {
                ind1+=nGsjNew; //reverse complementary indices are all shifted by the length of junctions
            };
        } else
        {//+ strand
            if (ind1>P->chrStart[P->nChrReal])
            {//this index was an old sj, may need to shift it
                uint sj1 = (ind1-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index
                ind1 += (oldSJind[sj1]-sj1)*P->sjdbLength;
            };
        };
        
        SA2.writePacked(isa2,ind1); //TODO make sure that the first sj index is not before the first array index
        ++isa2;
        
        //the sentinel pair written after the sort stops this loop once all new entries are used
        while (isa==indArray[isj*2]) {//insert sj index after the existing index
            uint ind1=indArray[isj*2+1];
            if (ind1<nGsj) {
                //forward-strand suffix: position inside Gsj -> position inside the genome
                ind1+=P->chrStart[P->nChrReal];
            } else {//reverse strand
                ind1=(ind1-nGsj) | N2bit;
            };
            SA2.writePacked(isa2,ind1);
            ++isa2; ++isj;
        };
    };
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished inserting junction indices" <<endl;
    
    //SAi insertions
    for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
        uint iSJ=0;
        uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome
        for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index
            uint iSA1=SAi[ii];
            //SAi value with both mark masks applied
            uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask;
            
            if ( iSJ<nInd && (iSA1 &  P->SAiMarkAbsentMaskC)>0 ) 
            {//index missing from the old genome
                uint iSJ1=iSJ;
                int64 ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL);
                //NOTE(review): iSJ is not bounded by nInd inside this loop, so funCalcSAi may be
                //called with the sentinel entry indArray[2*nInd+1] - confirm this cannot happen
                while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSJ]<iSA2) {
                    ++iSJ;
                    ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL);
                };
                if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) {
                    //a new suffix has exactly this prefix: the entry is no longer absent
                    SAi.writePacked(ii,indArray[2*iSJ]+iSJ+1);
                    for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
                        SAi.writePacked(ii0,(indArray[2*iSJ]+iSJ+1) | P->SAiMarkAbsentMaskC);
                    };
                    ++iSJ;
                    ind0=ii;
                } else {
                    //nothing found for this prefix: restore the scan position
                    iSJ=iSJ1;
                };
            } else 
            {//index was present in the old genome
                while (iSJ<nInd && indArray[2*iSJ]+1<iSA2) {//for this index insert "smaller" junctions
                    ++iSJ;
                };
                
                while (iSJ<nInd && indArray[2*iSJ]+1==iSA2) {//special case, the index falls right behind SAi
                    if (funCalcSAi(Gsj+indArray[2*iSJ+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to the next index
                        break;
                    };
                    ++iSJ;
                };   
                
                //shift the old value by the number of new suffixes counted so far
                SAi.writePacked(ii,iSA1+iSJ);
                
                for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
                    SAi.writePacked(ii0,(iSA2+iSJ) | P->SAiMarkAbsentMaskC);
                };
                ind0=ii;
            };
        };

    };
//     time ( &rawtime );    cout << timeMonthDayTime(rawtime) << "SAi first" <<endl;

    //set the N mark in SAi for new suffixes whose first genomeSAindexNbases chars contain a code >3
    for (uint isj=0;isj<nInd;isj++) {
        int64 ind1=0;
        for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
            uint g=(uint) Gsj[indArray[2*isj+1]+iL];
            ind1 <<= 2;
            if (g>3) {//this iSA contains N, need to mark the previous
                for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) {
                    ind1+=3;
                    int64 ind2=P->genomeSAindexStart[iL1]+ind1;
                    //NOTE(review): if no non-absent entry exists ind2 ends at -1 and is used below - confirm unreachable
                    for (; ind2>=0; ind2--) {//find previous index that is not absent
                        if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) {
                            break;
                        };
                    };
                    SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC);
                    ind1 <<= 2;
                };
                break;
            } else {
                ind1 += g;
            };
        };
    };
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SAi" <<endl;
    
    //change parameters, most parameters are already re-defined in sjdbPrepare.cpp
    SA.defineBits(P->GstrandBit+1,P->nSA);//same as SA2
    SA.pointArray(SA2.charArray);
    P->nSAbyte=SA.lengthByte;
    P->sjGstart=P->chrStart[P->nChrReal];
    //only the first nGsj chars of the junction text are copied into the genome
    memcpy(G+P->chrStart[P->nChrReal],Gsj, nGsj);
    
    /* testing
    PackedArray SAio=SAi;
    SAio.allocateArray();
    ifstream oldSAiin("./DirTrue/SAindex");
//     oldSAin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes
    oldSAiin.read(SAio.charArray,SAio.lengthByte);
    oldSAiin.close();  
    

//     for (uint ii=0;ii<P->nSA;ii++) {
//         if (SA2[ii]!=SAo[ii]) {
//             cout <<ii <<" "<< SA2[ii]<<" "<<SAo[ii]<<endl;
//         };
//     };


    for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
        for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index
                if ( SAio[ii]!=SAi[ii] ) {
                    cout <<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl;
                };
        };
    };    
    */
    
    /*
    ofstream genomeOut("/home/dobin/Genome");
    fstreamWriteBig(genomeOut,G,P->nGenome+nGsj,"777","777",P);
    genomeOut.close(); 
    genomeOut.open("/home/dobin/SA");
    fstreamWriteBig(genomeOut,SA2.charArray,SA2.lengthByte,"777","777",P);
    genomeOut.close();
    */
    
    //release the temporary buffers allocated in this function
    delete [] indArray;
    delete [] G1c;
    delete [] oldSJind;       
    
};