Ejemplo n.º 1
0
/**
 * Builds the genome index in P->genomeDir from the fasta input described by P.
 *
 * Steps, in order:
 *   1. validate --sjdbOverhang against the annotation options;
 *   2. write genomeParameters.txt and load the genome sequence (two fasta scans:
 *      sizes first, then the sequence itself);
 *   3. write chrName.txt, chrStart.txt, chrLength.txt, chrNameLength.txt;
 *   4. append the reverse-complement strand and sort all suffixes: indices are
 *      partitioned into chunks by their 4-character prefix, each chunk is sorted
 *      with qsort (chunks are processed in an OpenMP parallel loop) and written
 *      to a temporary SA_<n> file, then the chunks are read back in order and
 *      packed into the suffix array;
 *   5. build the suffix array index (genomeSAindex);
 *   6. if annotations were given, insert splice junctions (sjdbInsertJunctions);
 *   7. write Genome, SA and SAindex files.
 *
 * Fatal problems are reported through exitWithError().
 *
 * @param P  run parameters; several fields are (re)computed here, e.g. sjdbOverhang,
 *           genomeChrBinNbases, nGenome, nSA, nSAbyte, GstrandBit, GstrandMask.
 */
void genomeGenerate(Parameters *P) {

    //check parameters
    if (P->sjdbOverhang<=0 && (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-"))
    {
        ostringstream errOut;
        errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n";
        errOut << "you need to specify >0 --sjdbOverhang\n";
        errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    }
    if (P->sjdbFileChrStartEnd.at(0)=="-" && P->sjdbGTFfile=="-")
    {
        if (P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang>0)
        {
            ostringstream errOut;
            errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n";
            errOut << "do not specify >0 --sjdbOverhang\n";
            errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        P->sjdbOverhang=0; //no annotations: the overhang is forced to 0
    };

    //time
    time_t rawTime;

    time(&rawTime);
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush;

    //define some parameters from input parameters
    P->genomeChrBinNbases=1LLU << P->genomeChrBinNbits;
    //write genome parameters file
    genomeParametersWrite(P->genomeDir+("/genomeParameters.txt"), P, "ERROR_00102");

    char *G=NULL, *G1=NULL;
    uint nGenomeReal=genomeScanFastaFiles(P,G,false);//first scan the fasta file to find all the sizes
    P->chrBinFill();

    uint L=10000;//maximum length of genome suffix
    uint nG1alloc=(nGenomeReal + L)*2;
    G1=new char[nG1alloc];
    G=G1+L; //G is preceded by L padding bytes, so small negative offsets from G stay inside G1

    memset(G1,GENOME_spacingChar,nG1alloc);//initialize to K-1 all bytes

    genomeScanFastaFiles(P,G,true);    //load the genome sequence

    uint N = nGenomeReal;
    P->nGenome=N;
    uint N2 = N*2; //length of + and - strands together

    ofstream chrN,chrS,chrL,chrNL;

    ofstrOpen(P->genomeDir+"/chrName.txt","ERROR_00103", P, chrN);
    ofstrOpen(P->genomeDir+"/chrStart.txt","ERROR_00103", P, chrS);
    ofstrOpen(P->genomeDir+"/chrLength.txt","ERROR_00103", P, chrL);
    ofstrOpen(P->genomeDir+"/chrNameLength.txt","ERROR_00103", P, chrNL);

    for (uint ii=0;ii<P->nChrReal;ii++) {//output names, starts, lengths
        chrN<<P->chrName[ii]<<"\n";
        chrS<<P->chrStart[ii]<<"\n";
        chrL<<P->chrLength.at(ii)<<"\n";
        chrNL<<P->chrName[ii]<<"\t"<<P->chrLength.at(ii)<<"\n";
    };
    chrS<<P->chrStart[P->nChrReal]<<"\n";//size of the genome
    chrN.close();chrL.close();chrS.close(); chrNL.close();

    if (P->limitGenomeGenerateRAM < (nG1alloc+nG1alloc/3)) {//allocate nG1alloc/3 for SA generation
        ostringstream errOut;
        errOut <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM="<< (P->limitGenomeGenerateRAM) <<" is too small for your genome\n";
        errOut <<"SOLUTION: please specify limitGenomeGenerateRAM not less than "<< nG1alloc+nG1alloc/3 <<" and make that much RAM available \n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    };

    //preparing to generate SA
    for (uint ii=0;ii<N;ii++) {//- strand: reverse complement of G[0..N) is stored in G[N..2N)
        G[N2-1-ii]=G[ii]<4 ? 3-G[ii] : G[ii]; //codes >=4 are copied unchanged
    };

    //count suffix array entries: every genomeSAsparseD-th position whose code is <4
    P->nSA=0;
    for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) {
        if (G[ii]<4) {
            P->nSA++;
        };
    };

    P->GstrandBit = (uint) floor(log(N)/log(2))+1;
    if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA

    P->GstrandMask = ~(1LLU<<P->GstrandBit);
    PackedArray SA1;//SA without sjdb
    SA1.defineBits(P->GstrandBit+1,P->nSA);
    PackedArray SA2;//SA with sjdb, reserve more space
    if (P->sjdbInsert.yes)
    {//reserve space for junction insertion
        SA2.defineBits(P->GstrandBit+1,P->nSA+2*P->limitSjdbInsertNsj*P->sjdbLength);//TODO: this allocation is wasteful, get a better estimate of the number of junctions
    } else
    {//same as SA1
        SA2.defineBits(P->GstrandBit+1,P->nSA);
    };

    P->nSAbyte=SA2.lengthByte;

    P->inOut->logMain  << "Number of SA indices: "<< P->nSA << "\n"<<flush;

    //sort SA
    time ( &rawTime );
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... starting to sort  Suffix Array. This may take a long time...\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... starting to sort  Suffix Array. This may take a long time...\n" <<flush;

    {//sort SA chunks

        for (uint ii=0;ii<N;ii++) {//re-fill the array backwards for sorting
            swap(G[N2-1-ii],G[ii]);
        };
        globalG=G;
        globalL=L/sizeof(uint);
        //count the number of indices with 4nt prefix
        //the prefix is read at decreasing addresses because the array is reversed; 4 bits per character -> 16-bit key
        uint indPrefN=1LLU << 16;
        uint* indPrefCount = new uint [indPrefN];
        memset(indPrefCount,0,indPrefN*sizeof(indPrefCount[0]));
        P->nSA=0;
        for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) {
            if (G[ii]<4) {
                uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3];
                indPrefCount[p1]++;
                P->nSA++;
            };
        };

        uint saChunkSize=(P->limitGenomeGenerateRAM-nG1alloc)/8/P->runThreadN; //number of SA indexes per chunk
        saChunkSize=saChunkSize*6/10; //allow extra space for qsort
        if (P->runThreadN>1) saChunkSize=min(saChunkSize,P->nSA/(P->runThreadN-1));

        //The partition loop below opens at most one new chunk per prefix value, so there are never
        //more than indPrefN chunks, and indPrefStart needs one extra slot for the end marker.
        //Sizing by this hard bound (instead of an estimate 2*nSA/saChunkSize, which is 0 for small
        //genomes and divides by zero when saChunkSize==0) keeps every write below inside the arrays.
        uint* indPrefStart = new uint [indPrefN+1]; //first prefix of each chunk, plus end marker
        uint* indPrefChunkCount = new uint [indPrefN+1]; //number of SA indices in each chunk
        indPrefStart[0]=0;
        uint saChunkN=0;//start counting chunks
        uint chunkSize1=indPrefCount[0];
        for (uint ii=1; ii<indPrefN; ii++) {
            chunkSize1 += indPrefCount[ii];
            if (chunkSize1 > saChunkSize) {//adding prefix ii overflows the chunk: close it before ii
                saChunkN++;
                indPrefStart[saChunkN]=ii;
                indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii];
                chunkSize1=indPrefCount[ii];
            };
        };
        saChunkN++;
        indPrefStart[saChunkN]=indPrefN+1;
        indPrefChunkCount[saChunkN-1]=chunkSize1;

        P->inOut->logMain  << "Number of chunks: " << saChunkN <<";   chunks size limit: " << saChunkSize*8 <<" bytes\n" <<flush;

        time ( &rawTime );
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush;
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush;

        #pragma omp parallel for num_threads(P->runThreadN) ordered schedule(dynamic,1)
        for (int iChunk=0; iChunk < (int) saChunkN; iChunk++) {//start the chunk cycle: sort each chunk with qsort and write to a file
            uint* saChunk=new uint [indPrefChunkCount[iChunk]];//allocate local array for each chunk
            for (uint ii=0,jj=0;ii<N2;ii+=P->genomeSAsparseD) {//fill the chunk with SA indices
                if (G[ii]<4) {
                    uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3];
                    if (p1>=indPrefStart[iChunk] && p1<indPrefStart[iChunk+1]) {
                        saChunk[jj]=ii;
                        jj++;
                    };
                    //TODO: if (jj==indPrefChunkCount[iChunk]) break;
                };
            };


            //sort the chunk
            qsort(saChunk,indPrefChunkCount[iChunk],sizeof(saChunk[0]),funCompareSuffixes);
            for (uint ii=0;ii<indPrefChunkCount[iChunk];ii++) {//convert positions in the reversed array back to normal coordinates
                saChunk[ii]=N2-1-saChunk[ii];
            };
            //write files
            ofstream saChunkFile;
            string chunkFileName=P->genomeDir+"/SA_"+to_string( (uint) iChunk);
            ofstrOpen(chunkFileName,"ERROR_00105", P, saChunkFile);
            fstreamWriteBig(saChunkFile, (char*) saChunk, sizeof(saChunk[0])*indPrefChunkCount[iChunk],chunkFileName,"ERROR_00121",P);
            saChunkFile.close();
            delete [] saChunk;
            saChunk=NULL;
        };

        time ( &rawTime );
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush;
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush;

        //read chunks and pack into full SA1
        SA2.allocateArray();
        SA1.pointArray(SA2.charArray + SA2.lengthByte-SA1.lengthByte); //SA1 is shifted to have space for junction insertion
        uint N2bit= 1LLU << P->GstrandBit;
        uint packedInd=0;

        #define SA_CHUNK_BLOCK_SIZE 10000000
        uint* saIn=new uint[SA_CHUNK_BLOCK_SIZE]; //TODO make adjustable

        #ifdef genenomeGenerate_SA_textOutput
                ofstream SAtxtStream ((P->genomeDir + "/SAtxt").c_str());
        #endif

        for (uint iChunk=0;iChunk<saChunkN;iChunk++) {//load files one by one and convert to packed
            ostringstream saChunkFileNameStream("");
            saChunkFileNameStream<< P->genomeDir << "/SA_" << iChunk;
            ifstream saChunkFile(saChunkFileNameStream.str().c_str());
            //good() rather than !eof(): a file that failed to open or a read error sets failbit without
            //eofbit, which would otherwise loop forever; here it ends the loop and the nSA check below reports it
            while (saChunkFile.good()) {//read blocks from each file
                uint chunkBytesN=fstreamReadBig(saChunkFile,(char*) saIn,SA_CHUNK_BLOCK_SIZE*sizeof(saIn[0]));
                for (uint ii=0;ii<chunkBytesN/sizeof(saIn[0]);ii++) {
                    //- strand indices (>=N) are stored relative to N with the strand bit set
                    SA1.writePacked( packedInd+ii, (saIn[ii]<N) ? saIn[ii] : ( (saIn[ii]-N) | N2bit ) );

                    #ifdef genenomeGenerate_SA_textOutput
                        SAtxtStream << saIn[ii] << "\n";
                    #endif
                };
                packedInd += chunkBytesN/sizeof(saIn[0]);
            };
            saChunkFile.close();
            remove(saChunkFileNameStream.str().c_str());//remove the chunk file
        };

        #ifdef genenomeGenerate_SA_textOutput
                SAtxtStream.close();
        #endif
        delete [] saIn;

        if (packedInd != P->nSA ) {//
            ostringstream errOut;
            errOut << "EXITING because of FATAL problem while generating the suffix array\n";
            errOut << "The number of indices read from chunks = "<<packedInd<<" is not equal to expected nSA="<<P->nSA<<"\n";
            errOut << "SOLUTION: try to re-run suffix array generation, if it still does not work, report this problem to the author\n"<<flush;
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };

        //DONE with suffix array generation

        for (uint ii=0;ii<N;ii++) {//return to normal order for future use
            swap(G[N2-1-ii],G[ii]);
        };
        delete [] indPrefCount;
        delete [] indPrefStart;
        delete [] indPrefChunkCount;
    };

    time ( &rawTime );
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush;

////////////////////////////////////////
//          SA index

    PackedArray SAip;
    genomeSAindex(G,SA1,P,SAip);

    if (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")
    {//insert junctions
        SjdbClass sjdbLoci;

        Genome mainGenome(P);
        mainGenome.G=G;
        mainGenome.SA=SA1;
        mainGenome.SApass1=SA2;
        mainGenome.SAi=SAip;
        P->sjdbInsert.outDir=P->genomeDir;
        P->sjdbN=0;//no junctions are loaded yet
        P->twoPass.pass2=false;

        //NOTE(review): P1 is never deleted in this function. It is left as is because Parameters'
        //copy/destructor semantics are not visible here (the copy shares P's pointers, e.g. inOut) - confirm before freeing.
        Parameters *P1=new Parameters;
        *P1=*P;

        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);

        //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk
        //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare.
        SA2.writePacked(P->nSA,0);
    };

    //write genome to disk
    time ( &rawTime );
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush;

    ofstream genomeOut;
    ofstrOpen(P->genomeDir+"/Genome","ERROR_00104", P, genomeOut);
    fstreamWriteBig(genomeOut,G,P->nGenome,P->genomeDir+"/Genome","ERROR_00120",P);
    genomeOut.close();

    //write SA
    time ( &rawTime );
    P->inOut->logMain  << "SA size in bytes: "<< P->nSAbyte << "\n"<<flush;

    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush;

    ofstream SAout;
    ofstrOpen(P->genomeDir+"/SA","ERROR_00106", P, SAout);
    fstreamWriteBig(SAout,(char*) SA2.charArray, (streamsize) P->nSAbyte,P->genomeDir+"/SA","ERROR_00122",P);
    SAout.close();

    //write SAi
    time(&rawTime);
    P->inOut->logMain    << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush;
    *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush;

    //write SAi to disk: genomeSAindexNbases, then the genomeSAindexStart table, then the packed index
    ofstream SAiOut;
    ofstrOpen(P->genomeDir+"/SAindex","ERROR_00107", P, SAiOut);

    fstreamWriteBig(SAiOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->genomeDir+"/SAindex","ERROR_00123",P);
    fstreamWriteBig(SAiOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->genomeDir+"/SAindex","ERROR_00124",P);
    fstreamWriteBig(SAiOut,  SAip.charArray, SAip.lengthByte,P->genomeDir+"/SAindex","ERROR_00125",P);
    SAiOut.close();

    SA2.deallocateArray();

    time(&rawTime);
    P->inOut->logMain    << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush;
    *P->inOut->logStdOut << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush;
};
Ejemplo n.º 2
0
/**
 * Inserts the suffixes of a new sequence G1, and of its reverse complement, into an
 * existing packed suffix array, then regenerates the suffix array index.
 *
 * Existing entries are first re-based: + strand indices >= nG and - strand indices
 * (strand bit set) whose value is >= nG2 are shifted by nG1. Every position of the
 * new sequence and of its reverse complement whose code is <4 gets an insertion point
 * from suffixArraySearch1(); the new indices are sorted and merged with the old
 * entries into SA1.
 *
 * @param SA   existing suffix array; its entries are shifted in place, and on return it is
 *             re-defined with SA.length+nInd entries and pointed at SA1's storage
 * @param SA1  receives the merged suffix array; it is re-defined here but written through
 *             writePacked without being allocated in this function
 *             (NOTE(review): presumably the caller provides the storage - confirm)
 * @param SAi  suffix array index, rebuilt by genomeSAindex()
 * @param G    genome sequence, passed to suffixArraySearch1() and genomeSAindex()
 * @param G1   sequence to insert, nG1 characters
 * @param nG   + strand threshold: old indices >= nG are shifted; new + strand indices are offset by nG
 * @param nG1  length of the inserted sequence
 * @param nG2  - strand threshold: old indices >= nG2 are shifted; new - strand indices are offset by nG2
 * @param P    run parameters; nSA and nSAbyte are updated, GstrandBit and runThreadN are read
 * @return number of new suffix array indices inserted
 */
uint insertSeqSA(PackedArray & SA, PackedArray & SA1, PackedArray & SAi, char * G, char * G1, uint64 nG, uint64 nG1, uint64 nG2, Parameters * P)
{//insert new sequences into the SA

    uint GstrandBit1 = (uint) floor(log(nG+nG1)/log(2))+1;
    if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA
    if ( GstrandBit1+1 != SA.wordLength)
    {//sequence is too long - GstrandBit changed
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: cannot insert sequence on the fly because of strand GstrandBit problem\n";
        errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };

    uint N2bit= 1LLU << (SA.wordLength-1); //top bit of each SA word marks the - strand
    uint strandMask=~N2bit;
    for (uint64 isa=0;isa<SA.length; isa++)
    {//shift existing indices to make room for the inserted sequence
        uint64 ind1=SA[isa];
        if ( (ind1 & N2bit)>0 )
        {//- strand
            if ( (ind1 & strandMask)>=nG2 )
            {//the first nG bases
                ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence
                SA.writePacked(isa,ind1);
            };
        } else
        {//+ strand
            if ( ind1>=nG )
            {//the last nG2 bases
                ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence
                SA.writePacked(isa,ind1);
            };
        };
    };

    char** seq1=new char*[2];

    #define GENOME_endFillL 16
    //layout of seqq: [fill][seq1[0]: 2*nG1][fill][seq1[1]: 2*nG1][fill]
    char* seqq=new char [4*nG1+3*GENOME_endFillL];//ends shouldbe filled with 5 to mark boundaries

    seq1[0]=seqq+GENOME_endFillL;//TODO: avoid defining an extra array, use reverse search
    seq1[1]=seqq+2*GENOME_endFillL+2*nG1;

    memset(seqq,GENOME_spacingChar,GENOME_endFillL);
    memset(seqq+2*nG1+GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL);
    memset(seqq+4*nG1+2*GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL);

    memcpy(seq1[0], G1, nG1);
    for (uint ii=0; ii<nG1; ii++)
    {//reverse complement sequence, stored right after the forward copy
        seq1[0][2*nG1-1-ii]=seq1[0][ii]<4 ? 3-seq1[0][ii] : seq1[0][ii];
    };
    complementSeqNumbers(seq1[0], seq1[1], 2*nG1);//complement

    uint64* indArray=new uint64[nG1*2*2+2];// for each base, 1st number - insertion place in SA, 2nd number - index, *2 for reverse compl; +2 for the end marker


    #pragma omp parallel num_threads(P->runThreadN)
    #pragma omp for schedule (dynamic,1000)
    for (uint ii=0; ii<2*nG1; ii++) {//find insertion points for each of the sequences

        if (seq1[0][ii]>3)
        {//no index for suffices starting with N
            indArray[ii*2]=-1;
        } else
        {
            indArray[ii*2] =  suffixArraySearch1(seq1, ii, 10000, G, nG, SA, ii<nG1, 0, SA.length-1, 0, P) ;
            indArray[ii*2+1] = ii;
        };
    };

    uint64 nInd=0;//true number of new indices
    for (uint ii=0; ii<2*nG1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above
        if (indArray[ii*2]!= (uint) -1) {
            indArray[nInd*2]=indArray[ii*2];
            indArray[nInd*2+1]=indArray[ii*2+1];
            ++nInd;
        };
    };

    time_t rawtime;
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SA search, number of new SA indices = "<<nInd<<endl;

    globalGenomeArray=seq1[0]; //set before qsort with funCompareUintAndSuffixes
    qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes);
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished sorting SA indices"<<endl;

    //end marker: the merge loop below reads indArray[2*isa1] with isa1==nInd once all new indices are consumed
    indArray[2*nInd]=-999; //mark the last junction
    indArray[2*nInd+1]=-999; //mark the last junction

    SA1.defineBits(SA.wordLength,SA.length+nInd);

    //merge: walk the old SA and emit the new indices in front of the old entry they were located at
    uint isa1=0, isa2=0; //isa1 - next new index to insert, isa2 - write position in SA1
    for (uint isa=0;isa<SA.length;isa++) {
        while (isa==indArray[isa1*2]) {//insert new index before the existing index
            uint ind1=indArray[isa1*2+1];
            if (ind1<nG1) {
                ind1+=nG;
            } else {//reverse strand
                ind1=(ind1-nG1+nG2) | N2bit;
            };
            SA1.writePacked(isa2,ind1);
            ++isa2; ++isa1;

        };

        SA1.writePacked(isa2,SA[isa]); //TODO make sure that the first sj index is not before the first array index
        ++isa2;
    };
    for (;isa1<nInd;isa1++)
    {//insert the last indices
        uint ind1=indArray[isa1*2+1];
        if (ind1<nG1)
        {
            ind1+=nG;
        } else
        {//reverse strand
            ind1=(ind1-nG1+nG2) | N2bit;
        };
        SA1.writePacked(isa2,ind1);
        ++isa2;
    };

    //the insertion table and the pointer pair are not used past this point: release them
    //(they were previously leaked on every call)
    delete [] indArray;
    indArray=NULL;
    delete [] seq1;
    seq1=NULL;
    //NOTE(review): seqq is intentionally NOT freed here because the global globalGenomeArray still
    //points into it; other readers of that global are not visible from this function - confirm, then free.

    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished inserting SA indices" <<endl;

    //An earlier, commented-out implementation patched SAi incrementally at this point;
    //SAi is rebuilt by genomeSAindex() below.

    //change parameters, most parameters are already re-defined in sjdbPrepare.cpp
    SA.defineBits(P->GstrandBit+1,SA.length+nInd);//same as SA2
    SA.pointArray(SA1.charArray);
    P->nSA=SA.length;
    P->nSAbyte=SA.lengthByte;

    //generate SAi
    genomeSAindex(G,SA,P,SAi);

    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SAi" <<endl;

    return nInd;
};