예제 #1
0
void Genome::insertSequences()
{
if (P->genomeFastaFiles.at(0)!="-")
{
    time_t rawtime;
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << " ..... Inserting extra sequences into genome indexes" <<endl;       
    //move the junctions to free up space for seqs
    // chrStart/Name/Length nChrReal include the extra sequences
    // nGenome is the old, small genome size
    uint sjdblen=P->nGenome-(P->chrStart.back()-P->genomeInsertL);//length of sjdb sequences
    memmove(G+P->chrStart.back(),G+P->chrStart.back()-P->genomeInsertL,sjdblen);
    memset(G+P->chrStart.back()-P->genomeInsertL, GENOME_spacingChar, P->genomeInsertL);//fill empty space with spacing characters
        
    genomeScanFastaFiles(P, G+P->chrStart.back()-P->genomeInsertL, true); //read the seqs from file(s) into the free space
    uint64 nGenomeOld=P->nGenome;
    P->nGenome=P->chrStart.back()+sjdblen; 
    //insert new sequences into the SA
    insertSeqSA(SA, SAinsert, SAi, G, G+P->chrStart.back()-P->genomeInsertL, nGenomeOld-sjdblen, P->genomeInsertL, sjdblen, P);

    //insert new sequences into the SAi
    //update P
    //save the genome if necessary
};
};
예제 #2
0
void Stats::progressReport(ofstream &progressStream) {
    
    time_t timeCurrent;
    time( &timeCurrent);
    
    if (difftime(timeCurrent,timeLastReport)>=60.0 && readN>0) {//make the report
        //progressStream.imbue(std::locale(""));
        progressStream <<setw(15)<< timeMonthDayTime(timeCurrent) \
                <<SETW1<< setiosflags(ios::fixed) << setprecision(1) \
                << double(readN)/1e6/difftime(timeCurrent,timeStartMap)*3600 \
                <<SETW3<< readN \
                <<SETW1<< (readN>0 ? readBases/readN : 0) \
                <<SETW2<< (readN>0 ? double(mappedReadsU)/double(readN)*100 : 0)  <<'%' \
                <<SETW1<< (readN>0 ? double(mappedBases)/double(mappedReadsU) : 0)  
                <<SETW2<< (readN>0 ? double(mappedMismatchesN)/double(mappedBases)*100 : 0) <<'%' \
                <<SETW2<< (readN>0 ? double(mappedReadsM)/double(readN)*100 : 0) <<'%'\
                <<SETW2<< (readN>0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%'\
                <<SETW2<< (readN>0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%'\
                <<SETW2<< (readN>0 ? double(unmappedShort)/double(readN)*100 : 0)<<'%'\
                <<SETW2<< (readN>0 ? double(unmappedOther)/double(readN)*100 : 0) <<'%'\
                <<"\n"<<flush;
        timeLastReport=timeCurrent;
        
    };
};
예제 #3
0
void Stats::reportFinal(ofstream &streamOut, Parameters *P) {    
    int w1=50;
    time( &timeFinish);    
    
                                            //<<setiosflags(ios::left)
    streamOut  <<setiosflags(ios::fixed)  << setprecision(2) \
               <<setw(w1)<< "Started job on |\t" << timeMonthDayTime(timeStart)<<"\n" \
               <<setw(w1)<< "Started mapping on |\t" << timeMonthDayTime(timeStartMap)<<"\n" \
               <<setw(w1)<< "Finished on |\t"<< timeMonthDayTime(timeFinish)<<"\n" \
               <<setw(w1)<< "Mapping speed, Million of reads per hour |\t"<< double(readN)/1e6/difftime(timeFinish,timeStartMap)*3600<<"\n" \
               <<"\n" \
               <<setw(w1)<< "Number of input reads |\t"                        << readN <<"\n" \
               <<setw(w1)<< "Average input read length |\t"                    << (readN>0 ? readBases/readN : 0) <<"\n" \
               <<setw(w1)<< "UNIQUE READS:\n" \
               <<setw(w1)<< "Uniquely mapped reads number |\t"                 << mappedReadsU <<"\n" \
               <<setw(w1)<< "Uniquely mapped reads % |\t"                      << (readN>0 ? double(mappedReadsU)/double(readN)*100 : 0) <<'%'<<"\n" \
               <<setw(w1)<< "Average mapped length |\t"                        << (mappedReadsU>0 ? double(mappedBases)/double(mappedReadsU) : 0) <<"\n";

    streamOut  <<setw(w1)<< "Number of splices: Total |\t"                     << splicesN[0]+splicesN[1]+splicesN[2]+splicesN[3]+splicesN[4]+splicesN[5]+splicesN[6]<< "\n" \
               <<setw(w1)<< "Number of splices: Annotated (sjdb) |\t"          << splicesNsjdb << "\n" \
               <<setw(w1)<< "Number of splices: GT/AG |\t"                     << splicesN[1]+splicesN[2] << "\n" \
               <<setw(w1)<< "Number of splices: GC/AG |\t"                     << splicesN[3]+splicesN[4] << "\n" \
               <<setw(w1)<< "Number of splices: AT/AC |\t"                     << splicesN[5]+splicesN[6] << "\n" \
               <<setw(w1)<< "Number of splices: Non-canonical |\t"             << splicesN[0] << "\n";
    
    streamOut  <<setw(w1)<< "Mismatch rate per base, % |\t"                << double(mappedMismatchesN)/double(mappedBases)*100 <<'%' <<"\n" \
               <<setw(w1)<< "Deletion rate per base |\t"                       << (mappedBases>0 ? double(mappedDelL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \
               <<setw(w1)<< "Deletion average length |\t"                      << (mappedDelN>0 ? double(mappedDelL)/double(mappedDelN) : 0) <<"\n" \
               <<setw(w1)<< "Insertion rate per base |\t"                      << (mappedBases>0 ? double(mappedInsL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \
               <<setw(w1)<< "Insertion average length |\t"                     << (mappedInsN>0 ? double(mappedInsL)/double(mappedInsN) : 0) <<"\n" \
               <<setw(w1)<< "MULTI-MAPPING READS:\n" \
               <<setw(w1)<< "Number of reads mapped to multiple loci |\t"      << mappedReadsM <<"\n" \
               <<setw(w1)<< "% of reads mapped to multiple loci |\t"           << (readN>0 ? double(mappedReadsM)/double(readN)*100 : 0)<<'%' <<"\n" \
               <<setw(w1)<< "Number of reads mapped to too many loci |\t"      << unmappedMulti <<"\n" \
               <<setw(w1)<< "% of reads mapped to too many loci |\t"           << (readN>0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%' <<"\n" \
               <<setw(w1)<< "UNMAPPED READS:\n" \
               <<setw(w1)<< "% of reads unmapped: too many mismatches |\t"     << (readN>0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%' <<"\n" \
               <<setw(w1)<< "% of reads unmapped: too short |\t"               << (readN>0 ? double(unmappedShort)/double(readN)*100 : 0) <<'%' <<"\n" \
               <<setw(w1)<< "% of reads unmapped: other |\t"                   << (readN>0 ? double(unmappedOther)/double(readN)*100 :0) <<'%'<<"\n" \
               <<setw(w1)<< "CHIMERIC READS:\n" \
               <<setw(w1)<< "Number of chimeric reads |\t"                     << chimericAll <<"\n" \
               <<setw(w1)<< "% of chimeric reads |\t"                          << (readN>0 ? double(chimericAll)/double(readN)*100 :0) <<'%'<<"\n" <<flush;

};
예제 #4
0
void sjdbInsertJunctions(Parameters *P, Genome &genome) {
        
    SjdbClass sjdbLoci;        
    time_t rawtime;

    //load 1st pass junctions
    if (P->twoPass.pass1sjFile.size()>0)
    {
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );   
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };
    
    //load from junction files
    if (P->sjdbFileChrStartEnd.at(0)!="-")
    {
        sjdbLoadFromFiles(P,sjdbLoci);
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";        
    };
    
    //load from GTF
    if (P->sjdbGTFfile!="-")
    {
        loadGTF(sjdbLoci, P, P->genomeDirOut);
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    sjdbPrepare (sjdbLoci, P, genome.G, P->nGenome, P->twoPass.dir);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished preparing junctions" <<endl;

    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, genome.G, genome.SA, genome.SA2, genome.SAi);
    time ( &rawtime ); 
    *P->inOut->logStdOut  << timeMonthDayTime(rawtime) << " ..... Finished inserting 1st pass junctions into genome" <<endl;
    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
};
예제 #5
0
파일: Genome.cpp 프로젝트: alexf101/STAR
void Genome::genomeLoad(){//allocate and load Genome
    
    time_t rawtime;
    time ( &rawtime );
    *(P->inOut->logStdOut) << timeMonthDayTime(rawtime) << " ..... Loading genome\n" <<flush;           

    uint *shmNG=NULL, *shmNSA=NULL;   //pointers to shm stored values , *shmSG, *shmSSA
    uint64 shmSize=0;//, shmStartG=0; shmStartSA=0;
    
    uint L=200,K=6;    
    
    Parameters *P1 = new Parameters;
    
    ifstream parFile((P->genomeDir+("/genomeParameters.txt")).c_str());
    if (parFile.good()) {
        P->inOut->logMain << "Reading genome generation parameters:\n";
        P1->inOut = P->inOut;
        P1->scanAllLines(parFile,3,-1);
        parFile.close();
    } else {
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: could not open genome file "<< P->genomeDir+("/genomeParameters.txt") << endl;
        errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush;     
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };            
    
    //check genome version
    if (P1->versionGenome.size()==0 || P1->versionGenome[0]==0) {//
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: read no value for the versionGenome parameter from genomeParameters.txt file\n";
        errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    } else if (P->sjdbFileChrStartEnd.at(0)=="-" && P1->versionGenome.at(0) >= P->versionGenome.at(0)) {//
        P->inOut->logMain << "Genome version is compatible with current STAR version\n";
    } else if (P->sjdbFileChrStartEnd.at(0)!="-" && P1->versionGenome.at(0) >= P->versionGenome.at(1)) {//
        P->inOut->logMain << "Genome version is compatible with current STAR version\n";        
    } else {
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: Genome version is INCOMPATIBLE with current STAR version\n";
        errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };

    //check if sjdbInfo.txt exists => genome was generated with junctions
    bool sjdbInfoExists=false;
    struct stat sjdb1;
    if ( stat( (P->genomeDir+"/sjdbInfo.txt").c_str(), &sjdb1) == 0 )
    {//file exists
        sjdbInfoExists=true;
    };
    
    if ( P->sjdbInsert.yes && sjdbInfoExists && P1->sjdbInsert.save=="") 
    {//if sjdbInsert, and genome had junctions, and genome is old - it should be re-generated with new STAR
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: old Genome is INCOMPATIBLE with on the fly junction insertion\n";
        errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };
    
    //record required genome parameters in P
    P->genomeSAindexNbases=P1->genomeSAindexNbases;
    P->genomeChrBinNbits=P1->genomeChrBinNbits;
    P->genomeSAsparseD=P1->genomeSAsparseD;
    if (P->parArray.at(P->sjdbOverhang_par)->inputLevel==0 && P1->sjdbOverhang>0) 
    {//if --sjdbOverhang was not defined by user and it was defined >0 at the genome generation step, then use sjdbOverhang from the genome generation step
        P->sjdbOverhang=P1->sjdbOverhang;
        P->inOut->logMain << "--sjdbOverhang = " << P->sjdbOverhang << " taken from the generated genome\n";
    } else if (sjdbInfoExists && P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang!=P1->sjdbOverhang) 
    {//if sjdbOverhang was defined at the genome generation step,the mapping step value has to agree with it
        ostringstream errOut;
        errOut << "EXITING because of fatal PARAMETERS error: present --sjdbOverhang="<<P->sjdbOverhang << " is not equal to the value at the genome generation step ="<< P1->sjdbOverhang << "\n";
        errOut << "SOLUTION: \n" <<flush;     
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };
    
    P->sjdbLength = P->sjdbOverhang==0 ? 0 : P->sjdbOverhang*2+1;


    P->inOut->logMain << "Started loading the genome: " << asctime (localtime ( &rawtime ))<<"\n"<<flush;    
  
    ifstream GenomeIn, SAin, SAiIn;

    P->nGenome = OpenStream("Genome",GenomeIn);
    P->nSAbyte = OpenStream("SA",SAin);
    OpenStream("/SAindex",SAiIn);

    uint SAiInBytes=0;
    SAiInBytes += fstreamReadBig(SAiIn,(char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases));
    P->genomeSAindexStart = new uint[P->genomeSAindexNbases+1];
    SAiInBytes += fstreamReadBig(SAiIn,(char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1));  
    P->nSAi=P->genomeSAindexStart[P->genomeSAindexNbases];
    P->inOut->logMain << "Read from SAindex: genomeSAindexNbases=" << P->genomeSAindexNbases <<"  nSAi="<< P->nSAi <<endl;
    

    /////////////////////////////////// at this point all array sizes should be known: calculate packed array lengths
    P->GstrandBit = (uint) floor(log(P->nGenome)/log(2))+1;
    if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA

    P->GstrandMask = ~(1LLU<<P->GstrandBit);
    P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1);
    SA.defineBits(P->GstrandBit+1,P->nSA);  
    
    P->SAiMarkNbit=P->GstrandBit+1;
    P->SAiMarkAbsentBit=P->GstrandBit+2;
    
    P->SAiMarkNmaskC=1LLU << P->SAiMarkNbit;
    P->SAiMarkNmask=~P->SAiMarkNmaskC;
    P->SAiMarkAbsentMaskC=1LLU << P->SAiMarkAbsentBit;
    P->SAiMarkAbsentMask=~P->SAiMarkAbsentMaskC;
    
    SAi.defineBits(P->GstrandBit+3,P->nSAi); 

    P->inOut->logMain << "nGenome=" << P->nGenome << ";  nSAbyte=" << P->nSAbyte <<endl<< flush;       
    P->inOut->logMain <<"GstrandBit="<<int(P->GstrandBit)<<"   SA number of indices="<<P->nSA<<endl<<flush;      
    
    shmSize=SA.lengthByte + P->nGenome+L+L+SHM_startG+8;
    shmSize+= SAi.lengthByte;                
    if (P->annotScoreScale>0) shmSize+=P->nGenome;


    if ((P->genomeLoad=="LoadAndKeep" ||
         P->genomeLoad=="LoadAndRemove" ||
         P->genomeLoad=="LoadAndExit" ||
         P->genomeLoad=="Remove") && sharedMemory == NULL)
    {
        bool unloadLast = P->genomeLoad=="LoadAndRemove";
        try
        {
            sharedMemory = new SharedMemory(shmKey, unloadLast);
            sharedMemory->SetErrorStream(P->inOut->logStdOut);

            if (!sharedMemory->NeedsAllocation())
            P->inOut->logMain <<"Found genome in shared memory\n"<<flush;
    
    if (P->genomeLoad=="Remove") {//kill the genome and exit
                if (sharedMemory->NeedsAllocation()) {//did not find genome in shared memory, nothing to kill
            ostringstream errOut;
            errOut << "EXITING: Did not find the genome in memory, did not remove any genomes from shared memory\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
        } else {
                    sharedMemory->Clean();
            P->inOut->logMain <<"DONE: removed the genome from shared memory\n"<<flush;            
                    return;
        };
            }
     
            if (sharedMemory->NeedsAllocation()){
                P->inOut->logMain <<"Allocating shared memory for genome\n"<<flush;
                sharedMemory->Allocate(shmSize);
            }
        }
        catch (const SharedMemoryException & exc)
        {
            HandleSharedMemoryException(exc, shmSize);
        }
    
        shmStart = (char*) sharedMemory->GetMapped();
        shmNG= (uint*) (shmStart+SHM_sizeG);
        shmNSA= (uint*) (shmStart+SHM_sizeSA);       
   
        if (!sharedMemory->IsAllocator())
        {
            // genome is in shared memory or being loaded
            // wait for the process that will populate it
            // and record the sizes
        
        uint iwait=0;
            while (*shmNG != P->nGenome) {
            iwait++;
            P->inOut->logMain <<"Another job is still loading the genome, sleeping for 1 min\n" <<flush;
            sleep(60);                    
            if (iwait==100) {
                ostringstream errOut;
                errOut << "EXITING because of FATAL ERROR: waited too long for the other job to finish loading the genome" << strerror(errno) << "\n" <<flush;
                    errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" <<flush;     
                    exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_LOADING_WAITED_TOO_LONG, *P);                
            };
        };

            if (P->nSAbyte!=*shmNSA)
            {
                ostringstream errOut;
                errOut << "EXITING because of FATAL ERROR: the SA file size did not match what we found in shared memory" << "\n" << flush;
                errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" << flush;     
                exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INCONSISTENT_DATA, *P);  
            }
        
            P->inOut->logMain << "Using shared memory for genome. key=0x" <<hex<<shmKey<<dec<< ";   shmid="<< sharedMemory->GetId() <<endl<<flush;
        }

        G1=shmStart+SHM_startG;
        SA.pointArray(G1+P->nGenome+L+L);
        char* shmNext=SA.charArray+P->nSAbyte;

        SAi.pointArray(shmNext);
        shmNext += SAi.lengthByte;
    
//     if (twoPass.pass1readsN==0) {//not 2-pass
//         shmStartG=SHM_startSHM;
//         shmStartSA=0;
//     } else {//2-pass
//         ostringstream errOut;
//         errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory'  "\n" ;
//         errOut << "SOLUTION: check shared memory settigns as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush;     
//         exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P);
//     };
     if (P->annotScoreScale>0) {//optional allocation
            sigG = shmNext;
            shmNext += P->nGenome;
        }    
    }
    else if (P->genomeLoad=="NoSharedMemory") // simply allocate memory, do not use shared memory
    {
        try {
            
            if (P->sjdbInsert.pass1 || P->sjdbInsert.pass2)
            {//reserve extra memory for insertion at the 1st and/or 2nd step
                nGenomePass1=P->nGenome;
                nSApass1=P->nSA;
                if (P->sjdbInsert.pass1)
                {
                    nGenomePass1+=P->limitSjdbInsertNsj*P->sjdbLength;
                    nSApass1+=2*P->limitSjdbInsertNsj*P->sjdbLength;
                };
                nGenomePass2=nGenomePass1;
                nSApass2=nSApass1;
                if (P->sjdbInsert.pass2)
                {
                    nGenomePass2+=P->limitSjdbInsertNsj*P->sjdbLength;
                    nSApass2+=2*P->limitSjdbInsertNsj*P->sjdbLength;                    
                };                
                
                G1=new char[nGenomePass2+L+L];        
                
                SApass2.defineBits(P->GstrandBit+1,nSApass2);
                SApass2.allocateArray();
                
                SApass1.defineBits(P->GstrandBit+1,nSApass1);
                SApass1.pointArray(SApass2.charArray+SApass2.lengthByte-SApass1.lengthByte);
                
                SA.pointArray(SApass1.charArray+SApass1.lengthByte-SA.lengthByte);
            } else 
            {//no insertions
                G1=new char[P->nGenome+L+L];        
                SA.allocateArray();
                
            };            
            SAi.allocateArray();
            P->inOut->logMain <<"Shared memory is not used for genomes. Allocated a private copy of the genome.\n"<<flush;                
        } catch (exception & exc) {
            ostringstream errOut;           
            errOut <<"EXITING: fatal error trying to allocate genome arrays, exception thrown: "<<exc.what()<<endl;
            errOut <<"Possible cause 1: not enough RAM. Check if you have enough RAM " << P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000 << " bytes\n";
            errOut <<"Possible cause 2: not enough virtual memory allowed with ulimit. SOLUTION: run ulimit -v " <<  P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000<<endl <<flush;
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_MEMORY_ALLOCATION, *P);            
        };
        
    }

        
//     if (twopass1readsN==0) {//not 2-pass
//         shmStartG=SHM_startSHM;
//         shmStartSA=0;
//     } else {//2-pass
//         ostringstream errOut;
//         errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory'  "\n" ;
//         errOut << "SOLUTION: check shared memory settings as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush;     
//         exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P);
//     };


    G=G1+L;

    bool isAllocatorProcess = sharedMemory != NULL && sharedMemory->IsAllocator();

    if (P->genomeLoad=="NoSharedMemory" || isAllocatorProcess) {//load genome and SAs from files
        //load genome
        P->inOut->logMain <<"Genome file size: "<<P->nGenome <<" bytes; state: good=" <<GenomeIn.good()\
                <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"\n"<<flush;        
        P->inOut->logMain <<"Loading Genome ... " << flush;        
        uint genomeReadBytesN=fstreamReadBig(GenomeIn,G,P->nGenome);    
        P->inOut->logMain <<"done! state: good=" <<GenomeIn.good()\
                <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush;            
        GenomeIn.close();
        
        for (uint ii=0;ii<L;ii++) {// attach a tail with the largest symbol
            G1[ii]=K-1;
            G[P->nGenome+ii]=K-1;        
        };    
      
        //load SAs
        P->inOut->logMain <<"SA file size: "<<SA.lengthByte <<" bytes; state: good=" <<SAin.good()\
                <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"\n"<<flush;        
        P->inOut->logMain <<"Loading SA ... " << flush;               
        genomeReadBytesN=fstreamReadBig(SAin,SA.charArray, SA.lengthByte);
        P->inOut->logMain <<"done! state: good=" <<SAin.good()\
                <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush;            
        SAin.close();
        
        P->inOut->logMain <<"Loading SAindex ... " << flush;             
        SAiInBytes +=fstreamReadBig(SAiIn,SAi.charArray, SAi.lengthByte);
        P->inOut->logMain <<"done: "<<SAiInBytes<<" bytes\n" << flush;       
    };
    
    SAiIn.close();            

    if ((P->genomeLoad=="LoadAndKeep" || 
         P->genomeLoad=="LoadAndRemove" || 
         P->genomeLoad=="LoadAndExit") && isAllocatorProcess ) 
    {
        //record sizes. This marks the end of genome loading
        *shmNG=P->nGenome;
        *shmNSA=P->nSAbyte;
    };
    
    time ( &rawtime );
    P->inOut->logMain << "Finished loading the genome: " << asctime (localtime ( &rawtime )) <<"\n"<<flush;    
      
    #ifdef COMPILE_FOR_MAC
    {
        uint sum1=0;
        for (uint ii=0;ii<P->nGenome; ii++) sum1 +=  (uint) (unsigned char) G[ii];
        P->inOut->logMain << "Sum of all Genome bytes: " <<sum1 <<"\n"<<flush;  
        sum1=0;        
        for (uint ii=0;ii<SA.lengthByte; ii++) sum1 +=  (uint) (unsigned char) SA.charArray[ii];
        P->inOut->logMain << "Sum of all SA bytes: " <<sum1 <<"\n"<<flush;
        sum1=0;        
        for (uint ii=0;ii<SAi.lengthByte; ii++) sum1 +=  (uint) (unsigned char) SAi.charArray[ii];
        P->inOut->logMain << "Sum of all SAi bytes: " <<sum1 <<"\n"<<flush;
    };
    #endif
    
    if (P->genomeLoad=="LoadAndExit") {
	uint shmSum=0;
	for (uint ii=0;ii<shmSize;ii++) shmSum+=shmStart[ii];
        P->inOut->logMain << "genomeLoad=LoadAndExit: completed, the genome is loaded and kept in RAM, EXITING now.\n"<<flush;
//         system("echo `date` ..... Finished genome loading >> Log.timing.out");
        return;
    };
    
    //find chr starts from files
    P->chrInfoLoad();

    P->chrBinFill();
 
    //splice junctions database
    if (P->nGenome==P->chrStart[P->nChrReal]) {//no sjdb
        P->sjdbN=0;
        P->sjGstart=P->chrStart[P->nChrReal]+1; //not sure why I need that
    } else {//there are sjdb chromosomes
        ifstream sjdbInfo((P->genomeDir+"/sjdbInfo.txt").c_str());
        if (sjdbInfo.fail()) {
            ostringstream errOut;                            
            errOut << "EXITING because of FATAL error, could not open file " << (P->genomeDir+"/sjdbInfo.txt") <<"\n";
            errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush;     
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
    
        
        sjdbInfo >> P->sjdbN >> P->sjdbOverhang;
        P->inOut->logMain << "Processing splice junctions database sjdbN=" <<P->sjdbN<<",   sjdbOverhang=" <<P->sjdbOverhang <<" \n";    
        
        P->sjChrStart=P->nChrReal;
        P->sjGstart=P->chrStart[P->sjChrStart];

        //fill the sj-db to genome translation array
        P->sjDstart=new uint [P->sjdbN];
        P->sjAstart=new uint [P->sjdbN];
        P->sjdbStart=new uint [P->sjdbN];
        P->sjdbEnd=new uint [P->sjdbN];
        
        P->sjdbMotif=new uint8 [P->sjdbN];
        P->sjdbShiftLeft=new uint8 [P->sjdbN];
        P->sjdbShiftRight=new uint8 [P->sjdbN];
        P->sjdbStrand=new uint8 [P->sjdbN];

        for (uint ii=0;ii<P->sjdbN;ii++) {//get the info about junctions from sjdbInfo.txt       
            {
                uint16 d1,d2,d3,d4;
                sjdbInfo >> P->sjdbStart[ii] >> P->sjdbEnd[ii] >> d1 >> d2 >> d3 >> d4;
                P->sjdbMotif[ii]      = (uint8) d1;
                P->sjdbShiftLeft[ii]  = (uint8) d2;
                P->sjdbShiftRight[ii] = (uint8) d3;
                P->sjdbStrand[ii] = (uint8) d4;
            };
            P->sjDstart[ii]   = P->sjdbStart[ii]  - P->sjdbOverhang; 
            P->sjAstart[ii]   = P->sjdbEnd[ii] + 1;     
            if (P->sjdbMotif[ii]==0) {//shinon-canonical junctions back to their true coordinates
                P->sjDstart[ii] += P->sjdbShiftLeft[ii];
                P->sjAstart[ii] += P->sjdbShiftLeft[ii];
            };
        };
    };     
    
    //check and redefine some parameters
    //max intron size
    if (P->alignIntronMax==0 && P->alignMatesGapMax==0) {
        P->inOut->logMain << "alignIntronMax=alignMatesGapMax=0, the max intron size will be approximately determined by (2^winBinNbits)*winAnchorDistNbins=" \
                << (1LLU<<P->winBinNbits)*P->winAnchorDistNbins <<endl;
    } else {
        //redefine winBinNbits
        P->winBinNbits=max( (uint) floor(log2(P->nGenome/40000)+0.5), (uint) floor(log2(max(max(4LLU,P->alignIntronMax),P->alignMatesGapMax)/4)+0.5) );
        P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" redefined winBinNbits="<< P->winBinNbits <<endl;
    };
    
    if (P->winBinNbits > P->genomeChrBinNbits) {
       P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<" > " << "genomeChrBinNbits=" << P->genomeChrBinNbits << "   redefining:\n";
       P->winBinNbits=P->genomeChrBinNbits;
       P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<endl;
    };    
    
    
    if (P->alignIntronMax==0 && P->alignMatesGapMax==0) {
    } else {
        //redefine winFlankNbins,winAnchorDistNbins
        P->winFlankNbins=max(P->alignIntronMax,P->alignMatesGapMax)/(1LLU<<P->winBinNbits)+1;
        P->winAnchorDistNbins=2*P->winFlankNbins;
        P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" and alignMatesGapMax="<<P->alignMatesGapMax<<\
                ", redefined winFlankNbins="<<P->winFlankNbins<<" and winAnchorDistNbins="<<P->winAnchorDistNbins<<endl;
    };
    
    P->winBinChrNbits=P->genomeChrBinNbits-P->winBinNbits;
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;//this may be chenaged later
};
예제 #6
0
파일: STAR.cpp 프로젝트: alosdiallo/STAR
int main(int argInN, char* argIn[]) {
   
    time(&g_statsAll.timeStart);
   
    Parameters *P = new Parameters; //all parameters
       
    P->inputParameters(argInN, argIn);
    
    *(P->inOut->logStdOut) << timeMonthDayTime(g_statsAll.timeStart) << " ..... Started STAR run\n" <<flush;           
    
    //generate genome
    if (P->runMode=="genomeGenerate") {
        genomeGenerate(P);
        (void) sysRemoveDir (P->outFileTmp);        
        P->inOut->logMain << "DONE: Genome generation, EXITING\n" << flush;
        exit(0);
    } else if (P->runMode!="alignReads") {
        P->inOut->logMain << "EXITING because of INPUT ERROR: unknown value of input parameter runMode=" <<P->runMode<<endl<<flush;
        exit(1);
    };
    
    Genome mainGenome (P);
    mainGenome.genomeLoad();
    if (P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") 
    {
        return 0;
    };
    
    P->twoPass.pass2=false; //this is the 1st pass    
    
    SjdbClass sjdbLoci;

    if (P->sjdbInsert.pass1) 
    {
        Parameters *P1=new Parameters;
        *P1=*P;
        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);
    };

    //calculate genome-related parameters
    Transcriptome *mainTranscriptome=NULL;
    
    
/////////////////////////////////////////////////////////////////////////////////////////////////START
    if (P->runThreadN>1) {
        g_threadChunks.threadArray=new pthread_t[P->runThreadN];
        pthread_mutex_init(&g_threadChunks.mutexInRead, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutSAM, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutBAM1, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutUnmappedFastx, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutFilterBySJout, NULL);
        pthread_mutex_init(&g_threadChunks.mutexStats, NULL);
        pthread_mutex_init(&g_threadChunks.mutexBAMsortBins, NULL);
    };

    g_statsAll.progressReportHeader(P->inOut->logProgress);    
    
    if (P->twoPass.yes) {//2-pass
        //re-define P for the pass1
        
        Parameters *P1=new Parameters;
        *P1=*P;
        //turn off unnecessary calculations
        P1->outSAMtype[0]="None";
        P1->outSAMbool=false;
        P1->outBAMunsorted=false;
        P1->outBAMcoord=false;
    
        P1->chimSegmentMin=0;
        
        P1->quant.yes=false;
        P1->quant.trSAM.yes=false;
        P1->quant.geCount.yes=false;
        
        P1->outFilterBySJoutStage=0;
        
        P1->outReadsUnmapped="None";
        
        P1->outFileNamePrefix=P->twoPass.dir;

        P1->readMapNumber=P->twoPass.pass1readsN;
//         P1->inOut->logMain.open((P1->outFileNamePrefix + "Log.out").c_str());

        g_statsAll.resetN();
        time(&g_statsAll.timeStartMap);
        P->inOut->logProgress << timeMonthDayTime(g_statsAll.timeStartMap) <<"\tStarted 1st pass mapping\n" <<flush;
        *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started 1st pass mapping\n" <<flush;

        //run mapping for Pass1
        ReadAlignChunk *RAchunk1[P->runThreadN];        
        for (int ii=0;ii<P1->runThreadN;ii++) {
            RAchunk1[ii]=new ReadAlignChunk(P1, mainGenome, mainTranscriptome, ii);
        };    
        mapThreadsSpawn(P1, RAchunk1);
        outputSJ(RAchunk1,P1); //collapse and output junctions
//         for (int ii=0;ii<P1->runThreadN;ii++) {
//             delete [] RAchunk[ii];
//         };          
        
        time_t rawtime; time (&rawtime);
        P->inOut->logProgress << timeMonthDayTime(rawtime) <<"\tFinished 1st pass mapping\n";
        *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished 1st pass mapping\n" <<flush;
        ofstream logFinal1 ( (P->twoPass.dir + "/Log.final.out").c_str());
        g_statsAll.reportFinal(logFinal1,P1);

        P->twoPass.pass2=true;//starting the 2nd pass
        P->twoPass.pass1sjFile=P->twoPass.dir+"/SJ.out.tab";
        
        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);

        //reopen reads files
        P->closeReadsFiles();
        P->openReadsFiles();
    } else {//not 2-pass
        //nothing for now
    };
    
    if ( P->quant.yes ) {//load transcriptome
        mainTranscriptome=new Transcriptome(P);
    };    
    
    //initialize Stats
    g_statsAll.resetN();
    time(&g_statsAll.timeStartMap);
    *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started mapping\n" <<flush;
    
    g_statsAll.timeLastReport=g_statsAll.timeStartMap;

    //open SAM/BAM files for output
    if (P->outSAMmode != "None") {//open SAM file and write header
        ostringstream samHeaderStream;
        
        for (uint ii=0;ii<P->nChrReal;ii++) {
            samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n";
        };

        if (P->outSAMheaderPG.at(0)!="-") {
            samHeaderStream << P->outSAMheaderPG.at(0);
            for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) {
                samHeaderStream << "\t" << P->outSAMheaderPG.at(ii);
            };
            samHeaderStream << "\n";
        };        
        
        samHeaderStream << "@PG\tID:STAR\tPN:STAR\tVN:" << STAR_VERSION <<"\tCL:" << P->commandLineFull <<"\n";
        
        if (P->outSAMheaderCommentFile!="-") {
            ifstream comstream (P->outSAMheaderCommentFile);
            while (comstream.good()) {
                string line1;
                getline(comstream,line1);
                if (line1.find_first_not_of(" \t\n\v\f\r")!=std::string::npos) {//skip blank lines
                    samHeaderStream << line1 <<"\n";
                };
            };
        };         
        

        for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines
            samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n";
        };
 
        samHeaderStream <<  "@CO\t" <<"user command line: " << P->commandLine <<"\n";
        
        if (P->outSAMheaderHD.at(0)!="-") {
            P->samHeaderHD = P->outSAMheaderHD.at(0);
            for (uint ii=1;ii<P->outSAMheaderHD.size(); ii++) {
                P->samHeaderHD +="\t" + P->outSAMheaderHD.at(ii);
            };
        } else {
            P->samHeaderHD = "@HD\tVN:1.4";
        };        
        
        
        P->samHeader=P->samHeaderHD+"\n"+samHeaderStream.str();
        //for the sorted BAM, need to add SO:cooridnate to the header line
        P->samHeaderSortedCoord=P->samHeaderHD + (P->outSAMheaderHD.size()==0 ? "" : "\tSO:coordinate") + "\n" + samHeaderStream.str();
        
        if (P->outSAMbool) {//
            *P->inOut->outSAM << P->samHeader;
        };
        if (P->outBAMunsorted){
            outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength);
        };
//             if (P->outBAMcoord){
//                 outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength);            
//             };
        
        if ( P->quant.trSAM.yes ) {
            samHeaderStream.str("");
            vector <uint> trlength;
            for (uint32 ii=0;ii<mainTranscriptome->trID.size();ii++) {
                uint32 iex1=mainTranscriptome->trExI[ii]+mainTranscriptome->trExN[ii]-1; //last exon of the transcript
                trlength.push_back(mainTranscriptome->exLenCum[iex1]+mainTranscriptome->exSE[2*iex1+1]-mainTranscriptome->exSE[2*iex1]+1);          
                samHeaderStream << "@SQ\tSN:"<< mainTranscriptome->trID.at(ii) <<"\tLN:"<<trlength.back()<<"\n";
            };
            for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines
                samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n";
            };
            outBAMwriteHeader(P->inOut->outQuantBAMfile,samHeaderStream.str(),mainTranscriptome->trID,trlength);        
        };
        
    };
    
    if (P->chimSegmentMin>0) {
        P->inOut->outChimJunction.open((P->outFileNamePrefix + "Chimeric.out.junction").c_str());
        P->inOut->outChimSAM.open((P->outFileNamePrefix + "Chimeric.out.sam").c_str());
        P->inOut->outChimSAM << P->samHeader;
        pthread_mutex_init(&g_threadChunks.mutexOutChimSAM, NULL);   
        pthread_mutex_init(&g_threadChunks.mutexOutChimJunction, NULL);
    };
         
    // P->inOut->logMain << "mlock value="<<mlockall(MCL_CURRENT|MCL_FUTURE) <<"\n"<<flush;

    // prepare chunks and spawn mapping threads    
    ReadAlignChunk *RAchunk[P->runThreadN];
    for (int ii=0;ii<P->runThreadN;ii++) {
        RAchunk[ii]=new ReadAlignChunk(P, mainGenome, mainTranscriptome, ii);
    };    
    
    mapThreadsSpawn(P, RAchunk);
   
    if (P->outFilterBySJoutStage==1) {//completed stage 1, go to stage 2
        P->inOut->logMain << "Completed stage 1 mapping of outFilterBySJout mapping\n"<<flush;
        outputSJ(RAchunk,P);//collapse novel junctions
        P->readFilesIndex=-1;
        

        P->outFilterBySJoutStage=2;
        if (P->outBAMcoord) {
            for (int it=0; it<P->runThreadN; it++) {//prepare the unmapped bin 
                RAchunk[it]->chunkOutBAMcoord->coordUnmappedPrepareBySJout();
            };
        };

        mapThreadsSpawn(P, RAchunk);
    };
    
    //close some BAM files
    if (P->inOut->outBAMfileUnsorted!=NULL) {
        bgzf_flush(P->inOut->outBAMfileUnsorted);
        bgzf_close(P->inOut->outBAMfileUnsorted);
    };
    if (P->inOut->outQuantBAMfile!=NULL) {
        bgzf_flush(P->inOut->outQuantBAMfile);
        bgzf_close(P->inOut->outQuantBAMfile);
    };      
    
    if (P->outBAMcoord && P->limitBAMsortRAM==0) {//make it equal ot the genome size
        P->limitBAMsortRAM=P->nGenome+mainGenome.SA.lengthByte+mainGenome.SAi.lengthByte;
    };
        
    //no need for genome anymore, free the memory
    mainGenome.freeMemory();
    
    if ( P->quant.geCount.yes )
    {//output gene quantifications
        for (int ichunk=1; ichunk<P->runThreadN; ichunk++)
        {//sum counts from all chunks into 0th chunk
            RAchunk[0]->chunkTr->quants->addQuants(*(RAchunk[ichunk]->chunkTr->quants));
        };
        RAchunk[0]->chunkTr->quantsOutput();
    };
    
    if (P->runThreadN>1 && P->outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files
        RAchunk[0]->chunkFilesCat(P->inOut->outSAM, P->outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN);
    };    

    
    if (P->outBAMcoord) {//sort BAM if needed
        *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush;
        P->inOut->logMain << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush;
        uint32 nBins=P->outBAMcoordNbins;
        
        //check max size needed for sorting
        uint maxMem=0;
        for (uint32 ibin=0; ibin<nBins-1; ibin++) {//check akk bins
            uint binS=0;
            for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads
                binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]+24*RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin];
            };        
            if (binS>maxMem) maxMem=binS;
        };
        P->inOut->logMain << "Max memory needed for sorting = "<<maxMem<<endl;
        if (maxMem>P->limitBAMsortRAM) {
            ostringstream errOut;
            errOut <<"EXITING because of fatal ERROR: not enough memory for BAM sorting: \n";
            errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<maxMem+1000000000;
            exitWithError(errOut.str(), std::cerr, P->inOut->logMain, EXIT_CODE_PARAMETER, *P);                                    
        };
        
        
        uint totalMem=0;
//         P->inOut->logMain << "Started sorting BAM ..." <<endl;
        #pragma omp parallel num_threads(P->outBAMsortingThreadNactual) 
        #pragma omp for schedule (dynamic,1)
        for (uint32 ibin1=0; ibin1<nBins; ibin1++) {
            uint32 ibin=nBins-1-ibin1;//reverse order to start with the last bin - unmapped reads
            
            uint binN=0, binS=0;
            for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads
                binN += RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin];
                binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin];
            };
            
            if (binS==0) continue; //empty bin
  
            if (ibin == nBins-1) {//last bin for unmapped reads
                BAMbinSortUnmapped(ibin,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P);
            } else {
            uint newMem=binS+binN*24;
            bool boolWait=true;
            while (boolWait) {
                #pragma omp critical
                if (totalMem+newMem < P->limitBAMsortRAM) {
                    boolWait=false;
                    totalMem+=newMem;
                };
                sleep(0.1);
            };
            BAMbinSortByCoordinate(ibin,binN,binS,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P);
            #pragma omp critical
            totalMem-=newMem;//"release" RAM
        };
        };
        //concatenate all BAM files, using bam_cat
        char **bamBinNames = new char* [nBins];
        vector <string> bamBinNamesV;
        for (uint32 ibin=0; ibin<nBins; ibin++) {
            
            bamBinNamesV.push_back(P->outBAMsortTmpDir+"/b"+to_string((uint) ibin));            
            struct stat buffer;
            if (stat (bamBinNamesV.back().c_str(), &buffer) != 0) {//check if file exists
                bamBinNamesV.pop_back();
            };
        };
        for (uint32 ibin=0; ibin<bamBinNamesV.size(); ibin++) {
                bamBinNames[ibin] = (char*) bamBinNamesV.at(ibin).c_str();
        };
        bam_cat(bamBinNamesV.size(), bamBinNames, 0, P->outBAMfileCoordName.c_str());
    };
    //wiggle output
    if (P->outWigFlags.yes) {
        *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush;
        P->inOut->logMain << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush;
        string wigOutFileNamePrefix=P->outFileNamePrefix + "Signal";
        signalFromBAM(P->outBAMfileCoordName, wigOutFileNamePrefix, *P);
    };
    
    //aggregate output junctions
    //collapse splice junctions from different threads/chunks, and output them
    outputSJ(RAchunk,P);
    
    g_statsAll.progressReport(P->inOut->logProgress);
    P->inOut->logProgress  << "ALL DONE!\n"<<flush;
    P->inOut->logFinal.open((P->outFileNamePrefix + "Log.final.out").c_str());
    g_statsAll.reportFinal(P->inOut->logFinal,P);
    *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeFinish) << " ..... Finished successfully\n" <<flush;
    
    P->inOut->logMain  << "ALL DONE!\n"<<flush;
    sysRemoveDir (P->outFileTmp);
    
    P->closeReadsFiles();//this will kill the readFilesCommand processes if necessary
    mainGenome.~Genome(); //need explicit call because of the 'delete P->inOut' below, which will destroy P->inOut->logStdOut
    
    delete P->inOut; //to close files
    delete P;
    
    return 0;    
};
예제 #7
0
파일: loadGTF.cpp 프로젝트: alexf101/STAR
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P, string dirOut) {//load gtf file, add junctions to P->sjdb
    //returns number of added junctions
    if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") {       
        time_t rawTime;
        time(&rawTime);
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush;
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush;           
        
        ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() );   
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };    
        
        if (P->chrNameIndex.size()==0) 
        {
            for (uint ii=0;ii<P->nChrReal;ii++) {
                P->chrNameIndex[P->chrName[ii]]=ii;
            };        
        };
        std::map <string,uint> transcriptIDnumber, geneIDnumber;

        uint exonN=0;
        while (sjdbStreamIn.good()) {//count the number of exons
            string chr1,ddd2,featureType;            
            sjdbStreamIn >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {
                exonN++;
            };
            sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line
        };
        
        if (exonN==0)
        {
            P->inOut->logMain << "WARNING: found no exons in sjdbGTFfile=" << P->sjdbGTFfile <<endl;
            return 0;
        };
        uint* exonLoci=new uint [exonN*GTF_exonLoci_size];
        char* transcriptStrand = new char [exonN];
        vector <string> transcriptID, geneID;

        exonN=0;//re-calculate
        sjdbStreamIn.clear();
        sjdbStreamIn.seekg(0,ios::beg);
        while (sjdbStreamIn.good()) {

            string oneLine,chr1,ddd2,featureType;
            getline(sjdbStreamIn,oneLine);
            istringstream oneLineStream (oneLine);
            
            oneLineStream >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process
                
                if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1;
                
                if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome
                    P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in Genome fasta files for line:\n";
                    P->inOut->logMain << oneLine <<"\n"<<flush;          
                    continue; //do not process exons/transcripts on missing chromosomes
                };
                
                uint ex1,ex2;
                char str1;
                oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last

                string oneLine1;
                getline(oneLineStream, oneLine1);//get the last field
                replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes
                replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing
                oneLineStream.str(oneLine1);
                oneLineStream.clear();

                string trID(""), gID(""), attr1("");
                while (oneLineStream.good()) {
                    oneLineStream >> attr1;
                    if (attr1==P->sjdbGTFtagExonParentTranscript) {
                        oneLineStream >> trID;
                        trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end());
                        trID.erase(remove(trID.begin(),trID.end(),';'),trID.end());
                    } else if (attr1==P->sjdbGTFtagExonParentGene) {
                        oneLineStream >> gID;
                        gID.erase(remove(gID.begin(),gID.end(),'"'),gID.end());
                        gID.erase(remove(gID.begin(),gID.end(),';'),gID.end());
                    };
                };
예제 #8
0
void sjdbInsertJunctions(Parameters * P, Parameters * P1, Genome & genome, SjdbClass & sjdbLoci)
{
    time_t rawtime;

    if (P->sjdbN>0 && sjdbLoci.chr.size()==0)
    {//load from the saved genome, only if the loading did not happen already (if sjdb insertion happens at the 1st pass, sjdbLoci will be populated
        ifstream & sjdbStreamIn = ifstrOpen(P->genomeDir+"/sjdbList.out.tab", ERROR_OUT, "SOLUTION: re-generate the genome in genomeDir=" + P->genomeDir, P);
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        sjdbLoci.priority.resize(sjdbLoci.chr.size(),30);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the generated genome " << P->genomeDir+"/sjdbList.out.tab" <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    if (P->twoPass.pass2)
    {//load 1st pass new junctions
     //sjdbLoci already contains the junctions from before 1st pass
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        sjdbLoci.priority.resize(sjdbLoci.chr.size(),0);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    } else
    {//loading junctions from GTF or tab or from the saved genome is only allowed at the 1st pass
     //at the 2nd pass these are already in the sjdbLoci

        if (P->sjdbFileChrStartEnd.at(0)!="-")
        {//load from junction files
            sjdbLoadFromFiles(P,sjdbLoci);
            sjdbLoci.priority.resize(sjdbLoci.chr.size(),10);
            time ( &rawtime );
            P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";
        };

        if (P->sjdbGTFfile!="-")
        {//load from GTF
            loadGTF(sjdbLoci, P, P->sjdbInsert.outDir);
            sjdbLoci.priority.resize(sjdbLoci.chr.size(),20);
            time ( &rawtime );
            P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
        };
    };

    char *Gsj=new char [2*P->sjdbLength*sjdbLoci.chr.size()+1];//array to store junction sequences, will be filled in sjdbPrepare
    sjdbPrepare (sjdbLoci, P, P->chrStart[P->nChrReal], P->sjdbInsert.outDir, genome.G, Gsj);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished preparing junctions" <<endl;

    if (P->sjdbN>P->limitSjdbInsertNsj)
    {
        ostringstream errOut;
        errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n";                errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n";
        errOut << "SOLUTION: re-run with at least --limitSjdbInsertNsj "<<P->sjdbN<<"\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    };
    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, P1, Gsj, genome.G, genome.SA, (P->twoPass.pass2 ? genome.SApass2 : genome.SApass1), genome.SAi);
    delete [] Gsj; //junction sequences have been added to G
    time ( &rawtime );
    P->inOut->logMain     << timeMonthDayTime(rawtime) << " ..... finished inserting junctions into genome" <<endl;

    if (P->sjdbInsert.save=="All")
    {//save and copy all genome files into sjdbInsert.outDir, except those created above
        if (P->genomeDir != P->sjdbInsert.outDir)
        {
            copyFile(P->genomeDir+"/chrName.txt", P->sjdbInsert.outDir+"/chrName.txt");
            copyFile(P->genomeDir+"/chrStart.txt", P->sjdbInsert.outDir+"/chrStart.txt");
            copyFile(P->genomeDir+"/chrNameLength.txt", P->sjdbInsert.outDir+"/chrNameLength.txt");
            copyFile(P->genomeDir+"/chrLength.txt", P->sjdbInsert.outDir+"/chrLength.txt");
        };

        genomeParametersWrite(P->sjdbInsert.outDir+("/genomeParameters.txt"), P, ERROR_OUT);

        ofstream & genomeOut = ofstrOpen(P->sjdbInsert.outDir+"/Genome",ERROR_OUT, P);
        fstreamWriteBig(genomeOut,genome.G,P->nGenome,P->sjdbInsert.outDir+"/Genome",ERROR_OUT,P);
        genomeOut.close();

        ofstream & saOut = ofstrOpen(P->sjdbInsert.outDir+"/SA",ERROR_OUT, P);
        fstreamWriteBig(saOut,(char*) genome.SA.charArray, (streamsize) genome.SA.lengthByte, P->sjdbInsert.outDir+"/SA",ERROR_OUT,P);
        saOut.close();

        ofstream & saIndexOut = ofstrOpen(P->sjdbInsert.outDir+"/SAindex",ERROR_OUT, P);
        fstreamWriteBig(saIndexOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        fstreamWriteBig(saIndexOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        fstreamWriteBig(saIndexOut,  genome.SAi.charArray, genome.SAi.lengthByte,P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        saIndexOut.close();
    };

    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
};
예제 #9
0
void sjdbBuildIndex (Parameters *P, Parameters *P1, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray &SAi) {
    
    #define SPACER_CHAR GENOME_spacingChar

    if (P->sjdbN==0)
    {//no junctions to insert
        return;
    };
    
    time_t rawtime;
    time ( &rawtime );
    P->inOut->logMain   << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl;    
    *P->inOut->logStdOut  << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl;
    
    uint nGsj=P->sjdbLength*P->sjdbN;
    for (uint ii=1; ii<=P->sjdbN; ii++) 
    {
        Gsj[ii*P->sjdbLength-1]=SPACER_CHAR; //to make sure this is > than any genome char
    };
    Gsj[nGsj*2]=SPACER_CHAR+1;//mark the end of the text

    for (uint ii=0; ii<nGsj; ii++) {//reverse complement junction sequences
        Gsj[nGsj*2-1-ii]=Gsj[ii]<4 ? 3-Gsj[ii] : Gsj[ii]; //reverse complement
    };

    char* G1c=new char[nGsj*2+1];
    complementSeqNumbers(Gsj, G1c, nGsj*2+1);

    uint32* oldSJind=new uint32[P1->sjdbN];
    
//     uint nIndicesSJ1=P->sjdbOverhang;
    uint   nIndicesSJ1=P->sjdbLength;//keep all indices - this is pre-2.4.1 of generating the genome
    
    uint64* indArray=new uint64[2*P->sjdbN*(nIndicesSJ1+1)*2];//8+4 bytes for SA index and index in the genome * nJunction * nIndices per junction * 2 for reverse compl
    uint64 sjNew=0;
    #pragma omp parallel num_threads(P->runThreadN)
    #pragma omp for schedule (dynamic,1000) reduction(+:sjNew)
    for (uint isj=0; isj<2*P->sjdbN; isj++) {//find insertion points for each of the sequences

        char** seq1=new char*[2];
        seq1[0]=Gsj+isj*P->sjdbLength;
        seq1[1]=G1c+isj*P->sjdbLength;
        
        uint isj1=isj<P->sjdbN ? isj : 2*P->sjdbN-1-isj;
        int sjdbInd = P1->sjdbN==0 ? -1 : binarySearch2(P->sjdbStart[isj1],P->sjdbEnd[isj1],P1->sjdbStart,P1->sjdbEnd,P1->sjdbN);
        if (sjdbInd<0) 
        {//count new junctions
            ++sjNew;
        } else 
        {//record new index of the old junctions
            oldSJind[sjdbInd]=isj1;
        };
        
        for (uint istart1=0; istart1<nIndicesSJ1;istart1++) {
            
            uint istart=istart1;
//             uint istart=isj<P->sjdbN ? istart1 : istart1+1; //for rev-compl junction, shift by one base to start with the 1st non-spacer base
            uint ind1=2*(isj*nIndicesSJ1+istart1);
            if (sjdbInd>=0 || seq1[0][istart]>3) 
            {//no index for already included junctions, or suffices starting with N
                indArray[ind1]=-1;
            } else 
            {
                //indArray[ind1] =  suffixArraySearch(seq1, istart, P->sjdbLength-istart1, G, SA, true, 0, P->nSA-1, 0, P) ;
                indArray[ind1] =  suffixArraySearch(seq1, istart, 10000, G, SA, true, 0, P->nSA-1, 0, P) ;
                indArray[ind1+1] = isj*P->sjdbLength+istart;
            };
        };
    };
//     for (int ii=0;ii<P1->sjdbN;ii++) {if ( oldSJind[ii]==0){cout <<ii<<endl;};};
    sjNew = sjNew/2;//novel junctions were double counted on two strands
    
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SA search: number of new junctions=" << sjNew <<", old junctions="<<P->sjdbN-sjNew<<endl;
    
    uint nInd=0;//true number of new indices
    for (uint ii=0; ii<2*P->sjdbN*nIndicesSJ1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above
        if (indArray[ii*2]!= (uint) -1) {
            indArray[nInd*2]=indArray[ii*2];
            indArray[nInd*2+1]=indArray[ii*2+1];
            ++nInd;
        };
    };

    globalGsj=Gsj;
    qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes);
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished sorting SA indicesL nInd="<<nInd <<endl;

    indArray[2*nInd]=-999; //mark the last junction
    indArray[2*nInd+1]=-999; //mark the last junction
    
    P->nGenome=P->chrStart[P->nChrReal]+nGsj;    
    P->nSA+=nInd;
    
    uint GstrandBit1 = (uint) floor(log(P->nGenome)/log(2))+1;
    if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA
    if ( GstrandBit1 != P->GstrandBit) 
    {//too many junctions were added - GstrandBit changed
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: cannot insert junctions on the fly because of strand GstrandBit problem\n";
        errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };
    
    SA2.defineBits(P->GstrandBit+1,P->nSA);
    uint nGsjNew=sjNew*P->sjdbLength; //this is the actual number of bytes added to the genome, while nGsj is the total size of all junctions
    
    uint N2bit= 1LLU << P->GstrandBit;
    uint strandMask=~N2bit;
    
    //testing
//     PackedArray SAo;
//     SAo.defineBits(P->GstrandBit+1,P->nSA);
//     SAo.allocateArray();
//     ifstream oldSAin("./DirTrue/SA");
//     oldSAin.read(SAo.charArray,SAo.lengthByte);
//     oldSAin.close();
    
    
    uint isj=0, isa2=0;
    for (uint isa=0;isa<P1->nSA;isa++) {
        //testing
//         if (isa2>0 && SA2[isa2-1]!=SAo[isa2-1]) {
//             cout <<isa2 <<" "<< SA2[isa2-1]<<" "<<SAo[isa2-1]<<endl;
//         };        

//         if (isa==69789089)
//      	{ 
//           cout <<isa;
//         };

        uint ind1=SA[isa];
        
        if ( (ind1 & N2bit)>0 ) 
        {//- strand
            uint ind1s = P1->nGenome - (ind1 & strandMask);
            if (ind1s>P->chrStart[P->nChrReal])
            {//this index was an old sj, may need to shift it
                uint sj1 = (ind1s-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index
                ind1s += (oldSJind[sj1]-sj1)*P->sjdbLength;
                ind1 = (P->nGenome - ind1s) | N2bit;
            } else
            {
                ind1+=nGsjNew; //reverse complementary indices are all shifted by the length of junctions
            };
        } else
        {//+ strand
            if (ind1>P->chrStart[P->nChrReal])
            {//this index was an old sj, may need to shift it
                uint sj1 = (ind1-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index
                ind1 += (oldSJind[sj1]-sj1)*P->sjdbLength;
            };
        };
        
        SA2.writePacked(isa2,ind1); //TODO make sure that the first sj index is not before the first array index
        ++isa2;
        
        while (isa==indArray[isj*2]) {//insert sj index after the existing index
            uint ind1=indArray[isj*2+1];
            if (ind1<nGsj) {
                ind1+=P->chrStart[P->nChrReal];
            } else {//reverse strand
                ind1=(ind1-nGsj) | N2bit;
            };
            SA2.writePacked(isa2,ind1);
            ++isa2; ++isj;
        };
    };
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished inserting junction indices" <<endl;
    
    //SAi insertions
    for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
        uint iSJ=0;
        uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome
        for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index
            uint iSA1=SAi[ii];
            uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask;
            
            if ( iSJ<nInd && (iSA1 &  P->SAiMarkAbsentMaskC)>0 ) 
            {//index missing from the old genome
                uint iSJ1=iSJ;
                int64 ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL);
                while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSJ]<iSA2) {
                    ++iSJ;
                    ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL);
                };
                if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) {
                    SAi.writePacked(ii,indArray[2*iSJ]+iSJ+1);
                    for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
                        SAi.writePacked(ii0,(indArray[2*iSJ]+iSJ+1) | P->SAiMarkAbsentMaskC);
                    };
                    ++iSJ;
                    ind0=ii;
                } else {
                    iSJ=iSJ1;
                };
            } else 
            {//index was present in the old genome
                while (iSJ<nInd && indArray[2*iSJ]+1<iSA2) {//for this index insert "smaller" junctions
                    ++iSJ;
                };
                
                while (iSJ<nInd && indArray[2*iSJ]+1==iSA2) {//special case, the index falls right behind SAi
                    if (funCalcSAi(Gsj+indArray[2*iSJ+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to the next index
                        break;
                    };
                    ++iSJ;
                };   
                
                SAi.writePacked(ii,iSA1+iSJ);
                
                for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
                    SAi.writePacked(ii0,(iSA2+iSJ) | P->SAiMarkAbsentMaskC);
                };
                ind0=ii;
            };
        };

    };
//     time ( &rawtime );    cout << timeMonthDayTime(rawtime) << "SAi first" <<endl;

    for (uint isj=0;isj<nInd;isj++) {
        int64 ind1=0;
        for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
            uint g=(uint) Gsj[indArray[2*isj+1]+iL];
            ind1 <<= 2;
            if (g>3) {//this iSA contains N, need to mark the previous
                for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) {
                    ind1+=3;
                    int64 ind2=P->genomeSAindexStart[iL1]+ind1;
                    for (; ind2>=0; ind2--) {//find previous index that is not absent
                        if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) {
                            break;
                        };
                    };
                    SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC);
                    ind1 <<= 2;
                };
                break;
            } else {
                ind1 += g;
            };
        };
    };
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SAi" <<endl;
    
    //change parameters, most parameters are already re-defined in sjdbPrepare.cpp
    SA.defineBits(P->GstrandBit+1,P->nSA);//same as SA2
    SA.pointArray(SA2.charArray);
    P->nSAbyte=SA.lengthByte;
    P->sjGstart=P->chrStart[P->nChrReal];
    memcpy(G+P->chrStart[P->nChrReal],Gsj, nGsj);
    
    /* testing
    PackedArray SAio=SAi;
    SAio.allocateArray();
    ifstream oldSAiin("./DirTrue/SAindex");
//     oldSAin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes
    oldSAiin.read(SAio.charArray,SAio.lengthByte);
    oldSAiin.close();  
    

//     for (uint ii=0;ii<P->nSA;ii++) {
//         if (SA2[ii]!=SAo[ii]) {
//             cout <<ii <<" "<< SA2[ii]<<" "<<SAo[ii]<<endl;
//         };
//     };


    for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
        for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index
                if ( SAio[ii]!=SAi[ii] ) {
                    cout <<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl;
                };
        };
    };    
    */
    
    /*
    ofstream genomeOut("/home/dobin/Genome");
    fstreamWriteBig(genomeOut,G,P->nGenome+nGsj,"777","777",P);
    genomeOut.close(); 
    genomeOut.open("/home/dobin/SA");
    fstreamWriteBig(genomeOut,SA2.charArray,SA2.lengthByte,"777","777",P);
    genomeOut.close();
    */
    
    delete [] indArray;
    delete [] G1c;
    delete [] oldSJind;       
    
};
예제 #10
0
void genomeGenerate(Parameters *P) {
    
    //check parameters
    if (P->sjdbOverhang<=0 && (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")) 
    {
        ostringstream errOut;
        errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n";
        errOut << "you need to specify >0 --sjdbOverhang\n";
        errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n";        
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    } 
    if (P->sjdbFileChrStartEnd.at(0)=="-" && P->sjdbGTFfile=="-") 
    {
        if (P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang>0)
        {
            ostringstream errOut;
            errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n";
            errOut << "do not specify >0 --sjdbOverhang\n";
            errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n";        
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        P->sjdbOverhang=0;
    };
    
    //time
    time_t rawTime;
    string timeString;
    
    time(&rawTime);
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush;
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush;
    
    //define some parameters from input parameters
    P->genomeChrBinNbases=1LLU << P->genomeChrBinNbits;
    //write genome parameters file
    genomeParametersWrite(P->genomeDir+("/genomeParameters.txt"), P, "ERROR_00102");
    
    char *G=NULL, *G1=NULL;        
    uint nGenomeReal=genomeScanFastaFiles(P,G,false);//first scan the fasta file to find all the sizes  
    P->chrBinFill();

    uint L=10000;//maximum length of genome suffix    
    uint nG1alloc=(nGenomeReal + L)*2;
    G1=new char[nG1alloc];
    G=G1+L;
    
    memset(G1,GENOME_spacingChar,nG1alloc);//initialize to K-1 all bytes
 
    genomeScanFastaFiles(P,G,true);    //load the genome sequence   

    uint N = nGenomeReal;
    P->nGenome=N;
    uint N2 = N*2;     

    ofstream chrN,chrS,chrL,chrNL;
    
    ofstrOpen(P->genomeDir+"/chrName.txt","ERROR_00103", P, chrN);   
    ofstrOpen(P->genomeDir+"/chrStart.txt","ERROR_00103", P, chrS);   
    ofstrOpen(P->genomeDir+"/chrLength.txt","ERROR_00103", P, chrL);   
    ofstrOpen(P->genomeDir+"/chrNameLength.txt","ERROR_00103", P, chrNL);   
    
    for (uint ii=0;ii<P->nChrReal;ii++) {//output names, starts, lengths               
        chrN<<P->chrName[ii]<<"\n";
        chrS<<P->chrStart[ii]<<"\n";
        chrL<<P->chrLength.at(ii)<<"\n";
        chrNL<<P->chrName[ii]<<"\t"<<P->chrLength.at(ii)<<"\n";        
    };
    chrS<<P->chrStart[P->nChrReal]<<"\n";//size of the genome
    chrN.close();chrL.close();chrS.close(); chrNL.close();   
    
    if (P->limitGenomeGenerateRAM < (nG1alloc+nG1alloc/3)) {//allocate nG1alloc/3 for SA generation
        ostringstream errOut;                            
        errOut <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM="<< (P->limitGenomeGenerateRAM) <<"is too small for your genome\n";
        errOut <<"SOLUTION: please specify limitGenomeGenerateRAM not less than"<< nG1alloc+nG1alloc/3 <<" and make that much RAM available \n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    };     
    
    //preparing to generate SA
    for (uint ii=0;ii<N;ii++) {//- strand
        G[N2-1-ii]=G[ii]<4 ? 3-G[ii] : G[ii];
    };      
    
    P->nSA=0;
    for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) {
        if (G[ii]<4) {
            P->nSA++;
        };
    };     
    
    P->GstrandBit = (uint) floor(log(N)/log(2))+1; 
    if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA
    
    P->GstrandMask = ~(1LLU<<P->GstrandBit);
    PackedArray SA1;//SA without sjdb
    SA1.defineBits(P->GstrandBit+1,P->nSA);
    PackedArray SA2;//SA with sjdb, reserve more space
    if (P->sjdbInsert.yes)
    {//reserve space for junction insertion
        SA2.defineBits(P->GstrandBit+1,P->nSA+2*P->limitSjdbInsertNsj*P->sjdbLength);//TODO: this allocation is wasteful, get a better estimate of the number of junctions
    } else
    {//same as SA1
        SA2.defineBits(P->GstrandBit+1,P->nSA);
    };
        
    P->nSAbyte=SA2.lengthByte;
    
    P->inOut->logMain  << "Number of SA indices: "<< P->nSA << "\n"<<flush;    

    //sort SA
    time ( &rawTime );
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... starting to sort  Suffix Array. This may take a long time...\n" <<flush;   
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... starting to sort  Suffix Array. This may take a long time...\n" <<flush;
   

//     if (false)
    {//sort SA chunks
        
        for (uint ii=0;ii<N;ii++) {//re-fill the array backwards for sorting
            swap(G[N2-1-ii],G[ii]);
        };          
        globalG=G;
        globalL=L/sizeof(uint);
        //count the number of indices with 4nt prefix
        uint indPrefN=1LLU << 16;
        uint* indPrefCount = new uint [indPrefN];
        memset(indPrefCount,0,indPrefN*sizeof(indPrefCount[0]));
        P->nSA=0;
        for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) {
            if (G[ii]<4) {
                uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3];
                indPrefCount[p1]++;
                P->nSA++;
            };
        };

        uint saChunkSize=(P->limitGenomeGenerateRAM-nG1alloc)/8/P->runThreadN; //number of SA indexes per chunk
        saChunkSize=saChunkSize*6/10; //allow extra space for qsort            
        //uint saChunkN=((P->nSA/saChunkSize+1)/P->runThreadN+1)*P->runThreadN;//ensure saChunkN is divisible by P->runThreadN
        //saChunkSize=P->nSA/saChunkN+100000;//final chunk size
        if (P->runThreadN>1) saChunkSize=min(saChunkSize,P->nSA/(P->runThreadN-1));

        uint saChunkN=P->nSA/saChunkSize;//estimate
        uint* indPrefStart = new uint [saChunkN*2]; //start and stop, *2 just in case
        uint* indPrefChunkCount = new uint [saChunkN*2];
        indPrefStart[0]=0;
        saChunkN=0;//start counting chunks
        uint chunkSize1=indPrefCount[0];
        for (uint ii=1; ii<indPrefN; ii++) {
            chunkSize1 += indPrefCount[ii];
            if (chunkSize1 > saChunkSize) {
                saChunkN++;
                indPrefStart[saChunkN]=ii;
                indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii];                    
                chunkSize1=indPrefCount[ii];
            };
        };
        saChunkN++;
        indPrefStart[saChunkN]=indPrefN+1;
        indPrefChunkCount[saChunkN-1]=chunkSize1;

        P->inOut->logMain  << "Number of chunks: " << saChunkN <<";   chunks size limit: " << saChunkSize*8 <<" bytes\n" <<flush;

        time ( &rawTime );
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush;   
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush;

        #pragma omp parallel for num_threads(P->runThreadN) ordered schedule(dynamic,1)
        for (int iChunk=0; iChunk < (int) saChunkN; iChunk++) {//start the chunk cycle: sort each chunk with qsort and write to a file
            uint* saChunk=new uint [indPrefChunkCount[iChunk]];//allocate local array for each chunk
            for (uint ii=0,jj=0;ii<N2;ii+=P->genomeSAsparseD) {//fill the chunk with SA indices
                if (G[ii]<4) {
                    uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3];
                    if (p1>=indPrefStart[iChunk] && p1<indPrefStart[iChunk+1]) {
                        saChunk[jj]=ii;
                        jj++;
                    };
                    //TODO: if (jj==indPrefChunkCount[iChunk]) break;
                };
            };


            //sort the chunk
            qsort(saChunk,indPrefChunkCount[iChunk],sizeof(saChunk[0]),funCompareSuffixes);
            for (uint ii=0;ii<indPrefChunkCount[iChunk];ii++) {    
                saChunk[ii]=N2-1-saChunk[ii];
            };  
            //write files
            ofstream saChunkFile;
            string chunkFileName=P->genomeDir+"/SA_"+to_string( (uint) iChunk);
            ofstrOpen(chunkFileName,"ERROR_00105", P, saChunkFile);   
            fstreamWriteBig(saChunkFile, (char*) saChunk, sizeof(saChunk[0])*indPrefChunkCount[iChunk],chunkFileName,"ERROR_00121",P);
            saChunkFile.close();
            delete [] saChunk;
            saChunk=NULL;
        };

        time ( &rawTime );
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush;   
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush;    

        //read chunks and pack into full SA1
        SA2.allocateArray();
        SA1.pointArray(SA2.charArray + SA2.lengthByte-SA1.lengthByte); //SA1 is shifted to have space for junction insertion
        uint N2bit= 1LLU << P->GstrandBit;          
        uint packedInd=0;

        #define SA_CHUNK_BLOCK_SIZE 10000000
        uint* saIn=new uint[SA_CHUNK_BLOCK_SIZE]; //TODO make adjustable
        
        #ifdef genenomeGenerate_SA_textOutput
                ofstream SAtxtStream ((P->genomeDir + "/SAtxt").c_str());
        #endif

        for (uint iChunk=0;iChunk<saChunkN;iChunk++) {//load files one by one and convert to packed
            ostringstream saChunkFileNameStream("");
            saChunkFileNameStream<< P->genomeDir << "/SA_" << iChunk;
            ifstream saChunkFile(saChunkFileNameStream.str().c_str());
            while (! saChunkFile.eof()) {//read blocks from each file
                uint chunkBytesN=fstreamReadBig(saChunkFile,(char*) saIn,SA_CHUNK_BLOCK_SIZE*sizeof(saIn[0]));
                for (uint ii=0;ii<chunkBytesN/sizeof(saIn[0]);ii++) {
                    SA1.writePacked( packedInd+ii, (saIn[ii]<N) ? saIn[ii] : ( (saIn[ii]-N) | N2bit ) );
                    
                    #ifdef genenomeGenerate_SA_textOutput
                        SAtxtStream << saIn[ii] << "\n";
                    #endif
                };
                packedInd += chunkBytesN/sizeof(saIn[0]);
            };
            saChunkFile.close();
            remove(saChunkFileNameStream.str().c_str());//remove the chunk file
        };

        #ifdef genenomeGenerate_SA_textOutput
                SAtxtStream.close();
        #endif        
        delete [] saIn;

        if (packedInd != P->nSA ) {//
            ostringstream errOut;                            
            errOut << "EXITING because of FATAL problem while generating the suffix array\n";
            errOut << "The number of indices read from chunks = "<<packedInd<<" is not equal to expected nSA="<<P->nSA<<"\n";
            errOut << "SOLUTION: try to re-run suffix array generation, if it still does not work, report this problem to the author\n"<<flush;
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        
        //DONE with suffix array generation
        
        for (uint ii=0;ii<N;ii++) {//return to normal order for future use
            swap(G[N2-1-ii],G[ii]);
        };         
        delete [] indPrefCount;
        delete [] indPrefStart;
        delete [] indPrefChunkCount;
    };    

    time ( &rawTime );
    timeString=asctime(localtime ( &rawTime ));
    timeString.erase(timeString.end()-1,timeString.end());
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush;  
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush;          

////////////////////////////////////////
//          SA index
//
//     PackedArray SAold;
// 
//     if (true)
//     {//testing: load SA from disk
//             //read chunks and pack into full SA1
//         
//         ifstream oldSAin("./DirTrue/SA");
//         oldSAin.seekg (0, ios::end);
//         P->nSAbyte=(uint) oldSAin.tellg();
//         oldSAin.clear();        
//         oldSAin.seekg (0, ios::beg);
// 
//         P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1);
//         SAold.defineBits(P->GstrandBit+1,P->nSA);  
//         SAold.allocateArray();
//         
//         oldSAin.read(SAold.charArray,SAold.lengthByte);
//         oldSAin.close();
//         
//         SA1=SAold;
//         SA2=SAold;
//     };
    
    PackedArray SAip;
    genomeSAindex(G,SA1,P,SAip);

    if (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")
    {//insert junctions
        SjdbClass sjdbLoci;

        Genome mainGenome(P);
        mainGenome.G=G;
        mainGenome.SA=SA1;
        mainGenome.SApass1=SA2;
        mainGenome.SAi=SAip;
        P->sjdbInsert.outDir=P->genomeDir;
        P->sjdbN=0;//no junctions are loaded yet
        P->twoPass.pass2=false;
        
        Parameters *P1=new Parameters;
        *P1=*P;        
        
        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);
        
        //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk
        //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare.
        SA2.writePacked(P->nSA,0);
    };
    
    //write genome to disk
    time ( &rawTime );
    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush;   
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush;   
    
    ofstream genomeOut;
    ofstrOpen(P->genomeDir+"/Genome","ERROR_00104", P, genomeOut);   
    fstreamWriteBig(genomeOut,G,P->nGenome,P->genomeDir+"/Genome","ERROR_00120",P);
    genomeOut.close();  

    //write SA                
    time ( &rawTime );
    P->inOut->logMain  << "SA size in bytes: "<< P->nSAbyte << "\n"<<flush;

    P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush;   
    *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush;   

    ofstream SAout;
    ofstrOpen(P->genomeDir+"/SA","ERROR_00106", P, SAout);   
    fstreamWriteBig(SAout,(char*) SA2.charArray, (streamsize) P->nSAbyte,P->genomeDir+"/SA","ERROR_00122",P);
    SAout.close();    
    
    //write SAi
    time(&rawTime);    
    P->inOut->logMain    << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush;   
    *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush;   
    
    //write SAi to disk
    ofstream SAiOut;
    ofstrOpen(P->genomeDir+"/SAindex","ERROR_00107", P, SAiOut);   

    fstreamWriteBig(SAiOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->genomeDir+"/SAindex","ERROR_00123",P);
    fstreamWriteBig(SAiOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->genomeDir+"/SAindex","ERROR_00124",P);        
    fstreamWriteBig(SAiOut,  SAip.charArray, SAip.lengthByte,P->genomeDir+"/SAindex","ERROR_00125",P);
    SAiOut.close();    

    SA2.deallocateArray();

    time(&rawTime);
    timeString=asctime(localtime ( &rawTime ));
    timeString.erase(timeString.end()-1,timeString.end());
    
    time(&rawTime);        
    P->inOut->logMain    << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush;    
    *P->inOut->logStdOut << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush;
};
예제 #11
0
uint insertSeqSA(PackedArray & SA, PackedArray & SA1, PackedArray & SAi, char * G, char * G1, uint64 nG, uint64 nG1, uint64 nG2, Parameters * P)
{//insert new sequences into the SA

    uint GstrandBit1 = (uint) floor(log(nG+nG1)/log(2))+1;
    if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA
    if ( GstrandBit1+1 != SA.wordLength)
    {//sequence is too long - GstrandBit changed
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: cannot insert sequence on the fly because of strand GstrandBit problem\n";
        errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P);
    };

    uint N2bit= 1LLU << (SA.wordLength-1);
    uint strandMask=~N2bit;
    for (uint64 isa=0;isa<SA.length; isa++)
    {
        uint64 ind1=SA[isa];
        if ( (ind1 & N2bit)>0 )
        {//- strand
            if ( (ind1 & strandMask)>=nG2 )
            {//the first nG bases
                ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence
                SA.writePacked(isa,ind1);
            };
        } else
        {//+ strand
            if ( ind1>=nG )
            {//the last nG2 bases
                ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence
                SA.writePacked(isa,ind1);
            };
        };
    };

    char** seq1=new char*[2];

    #define GENOME_endFillL 16
    char* seqq=new char [4*nG1+3*GENOME_endFillL];//ends shouldbe filled with 5 to mark boundaries

    seq1[0]=seqq+GENOME_endFillL;//TODO: avoid defining an extra array, use reverse search
    seq1[1]=seqq+2*GENOME_endFillL+2*nG1;

    memset(seqq,GENOME_spacingChar,GENOME_endFillL);
    memset(seqq+2*nG1+GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL);
    memset(seqq+4*nG1+2*GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL);

    memcpy(seq1[0], G1, nG1);
    for (uint ii=0; ii<nG1; ii++)
    {//reverse complement sequence
        seq1[0][2*nG1-1-ii]=seq1[0][ii]<4 ? 3-seq1[0][ii] : seq1[0][ii];
    };
    complementSeqNumbers(seq1[0], seq1[1], 2*nG1);//complement

    uint64* indArray=new uint64[nG1*2*2+2];// for each base, 1st number - insertion place in SA, 2nd number - index, *2 for reverse compl


    #pragma omp parallel num_threads(P->runThreadN)
    #pragma omp for schedule (dynamic,1000)
    for (uint ii=0; ii<2*nG1; ii++) {//find insertion points for each of the sequences

        if (seq1[0][ii]>3)
        {//no index for suffices starting with N
            indArray[ii*2]=-1;
        } else
        {
            indArray[ii*2] =  suffixArraySearch1(seq1, ii, 10000, G, nG, SA, (ii<nG1 ? true:false), 0, SA.length-1, 0, P) ;
            indArray[ii*2+1] = ii;
        };
    };

    uint64 nInd=0;//true number of new indices
    for (uint ii=0; ii<2*nG1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above
        if (indArray[ii*2]!= (uint) -1) {
            indArray[nInd*2]=indArray[ii*2];
            indArray[nInd*2+1]=indArray[ii*2+1];
            ++nInd;
        };
    };

    time_t rawtime;
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SA search, number of new SA indices = "<<nInd<<endl;

    globalGenomeArray=seq1[0];
    qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes);
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished sorting SA indices"<<endl;

    indArray[2*nInd]=-999; //mark the last junction
    indArray[2*nInd+1]=-999; //mark the last junction

    SA1.defineBits(SA.wordLength,SA.length+nInd);

    /*testing
    PackedArray SAo;
    SAo.defineBits(P->GstrandBit+1,P->nSA+nInd);
    SAo.allocateArray();
    ifstream oldSAin("./DirTrue/SA");
    oldSAin.read(SAo.charArray,SAo.lengthByte);
    oldSAin.close();
    */

    uint isa1=0, isa2=0;
    for (uint isa=0;isa<SA.length;isa++) {
        while (isa==indArray[isa1*2]) {//insert new index before the existing index
            uint ind1=indArray[isa1*2+1];
            if (ind1<nG1) {
                ind1+=nG;
            } else {//reverse strand
                ind1=(ind1-nG1+nG2) | N2bit;
            };
            SA1.writePacked(isa2,ind1);
            /*testing
            if (SA1[isa2]!=SAo[isa2]) {
               cout <<isa2 <<" "<< SA1[isa2]<<" "<<SAo[isa2]<<endl;
               //sleep(100);
            };
            */
            ++isa2; ++isa1;

        };

        SA1.writePacked(isa2,SA[isa]); //TODO make sure that the first sj index is not before the first array index
            /*testing
            if (SA1[isa2]!=SAo[isa2]) {
               cout <<isa2 <<" "<< SA1[isa2]<<" "<<SAo[isa2]<<endl;
               //sleep(100);
            };
            */
        ++isa2;
    };
    for (;isa1<nInd;isa1++)
    {//insert the last indices
        uint ind1=indArray[isa1*2+1];
        if (ind1<nG1)
        {
            ind1+=nG;
        } else
        {//reverse strand
            ind1=(ind1-nG1+nG2) | N2bit;
        };
        SA1.writePacked(isa2,ind1);
        ++isa2;
    };

    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished inserting SA indices" <<endl;

//     //SAi insertions
//     for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
//         uint iSeq=0;
//         uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome
//         for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index
//             if (ii==798466)
//                 cout <<ii;
//
//             uint iSA1=SAi[ii];
//             uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask;
//
//             if ( iSeq<nInd && (iSA1 &  P->SAiMarkAbsentMaskC)>0 )
//             {//index missing from the old genome
//                 uint iSeq1=iSeq;
//                 int64 ind1=funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL);
//                 while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSeq]<iSA2) {
//                     ++iSeq;
//                     ind1=funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL);
//                 };
//                 if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) {
//                     SAi.writePacked(ii,indArray[2*iSeq]+iSeq+1);
//                     for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
//                         SAi.writePacked(ii0,(indArray[2*iSeq]+iSeq+1) | P->SAiMarkAbsentMaskC);
//                     };
//                     ++iSeq;
//                     ind0=ii;
//                 } else {
//                     iSeq=iSeq1;
//                 };
//             } else
//             {//index was present in the old genome
//                 while (iSeq<nInd && indArray[2*iSeq]+1<iSA2) {//for this index insert "smaller" junctions
//                     ++iSeq;
//                 };
//
//                 while (iSeq<nInd && indArray[2*iSeq]+1==iSA2) {//special case, the index falls right behind SAi
//                     if (funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to the next index
//                         break;
//                     };
//                     ++iSeq;
//                 };
//
//                 SAi.writePacked(ii,iSA1+iSeq);
//
//                 for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value
//                     SAi.writePacked(ii0,(iSA2+iSeq) | P->SAiMarkAbsentMaskC);
//                 };
//                 ind0=ii;
//             };
//         };
//
//     };
// //     time ( &rawtime );    cout << timeMonthDayTime(rawtime) << "SAi first" <<endl;
//
//     for (uint isj=0;isj<nInd;isj++) {
//         int64 ind1=0;
//         for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
//             uint g=(uint) seq1[0][indArray[2*isj+1]+iL];
//             ind1 <<= 2;
//             if (g>3) {//this iSA contains N, need to mark the previous
//                 for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) {
//                     ind1+=3;
//                     int64 ind2=P->genomeSAindexStart[iL1]+ind1;
//                     for (; ind2>=0; ind2--) {//find previous index that is not absent
//                         if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) {
//                             break;
//                         };
//                     };
//                     SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC);
//                     ind1 <<= 2;
//                 };
//                 break;
//             } else {
//                 ind1 += g;
//             };
//         };
//     };
//     time ( &rawtime );
//     P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SAi" <<endl;
//
//     /* testing
//     PackedArray SAio=SAi;
//     SAio.allocateArray();
//     ifstream oldSAiin("./DirTrue/SAindex");
//     oldSAiin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes
//     oldSAiin.read(SAio.charArray,SAio.lengthByte);
//     oldSAiin.close();
//
//     for (uint iL=0; iL < P->genomeSAindexNbases; iL++) {
//         for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index
//                 if ( SAio[ii]!=SAi[ii] ) {
//                     cout <<iL<<" "<<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl;
//                 };
//         };
//     };
//     */

    //change parameters, most parameters are already re-defined in sjdbPrepare.cpp
    SA.defineBits(P->GstrandBit+1,SA.length+nInd);//same as SA2
    SA.pointArray(SA1.charArray);
    P->nSA=SA.length;
    P->nSAbyte=SA.lengthByte;

    //generate SAi
    genomeSAindex(G,SA,P,SAi);

    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished SAi" <<endl;


//     P->sjGstart=P->chrStart[P->nChrReal];
//     memcpy(G+P->chrStart[P->nChrReal],seq1[0], nseq1[0]);


    return nInd;
};