void Genome::insertSequences() { if (P->genomeFastaFiles.at(0)!="-") { time_t rawtime; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... Inserting extra sequences into genome indexes" <<endl; //move the junctions to free up space for seqs // chrStart/Name/Length nChrReal include the extra sequences // nGenome is the old, small genome size uint sjdblen=P->nGenome-(P->chrStart.back()-P->genomeInsertL);//length of sjdb sequences memmove(G+P->chrStart.back(),G+P->chrStart.back()-P->genomeInsertL,sjdblen); memset(G+P->chrStart.back()-P->genomeInsertL, GENOME_spacingChar, P->genomeInsertL);//fill empty space with spacing characters genomeScanFastaFiles(P, G+P->chrStart.back()-P->genomeInsertL, true); //read the seqs from file(s) into the free space uint64 nGenomeOld=P->nGenome; P->nGenome=P->chrStart.back()+sjdblen; //insert new sequences into the SA insertSeqSA(SA, SAinsert, SAi, G, G+P->chrStart.back()-P->genomeInsertL, nGenomeOld-sjdblen, P->genomeInsertL, sjdblen, P); //insert new sequences into the SAi //update P //save the genome if necessary }; };
/**
 * Appends one line of mapping statistics to the progress log.
 *
 * A line is written only if at least 60 seconds have passed since
 * timeLastReport and at least one read has been processed (readN>0);
 * timeLastReport is then advanced to the current time.
 *
 * Columns, in order: current time, speed (million reads per hour since
 * timeStartMap), number of reads, average read length, % uniquely mapped,
 * average mapped length of unique reads, mismatch rate %, % multi-mapped,
 * % mapped to too many loci, % unmapped (too many mismatches),
 * % unmapped (too short), % unmapped (other).
 *
 * Every ratio is guarded by its own denominator, so a zero count prints 0
 * instead of NaN (e.g. before any read has mapped uniquely).
 *
 * @param progressStream  output stream of the progress log
 */
void Stats::progressReport(ofstream &progressStream) {

    time_t timeCurrent;
    time( &timeCurrent);

    if (difftime(timeCurrent,timeLastReport)>=60.0 && readN>0) {//make the report
        //progressStream.imbue(std::locale(""));
        progressStream <<setw(15)<< timeMonthDayTime(timeCurrent)
                <<SETW1<< setiosflags(ios::fixed) << setprecision(1)
                << double(readN)/1e6/difftime(timeCurrent,timeStartMap)*3600
                <<SETW3<< readN
                <<SETW1<< (readN>0 ? readBases/readN : 0)
                <<SETW2<< (readN>0 ? double(mappedReadsU)/double(readN)*100 : 0) <<'%'
                //denominator is mappedReadsU, so that is what must be non-zero
                <<SETW1<< (mappedReadsU>0 ? double(mappedBases)/double(mappedReadsU) : 0)
                //denominator is mappedBases, so that is what must be non-zero
                <<SETW2<< (mappedBases>0 ? double(mappedMismatchesN)/double(mappedBases)*100 : 0) <<'%'
                <<SETW2<< (readN>0 ? double(mappedReadsM)/double(readN)*100 : 0) <<'%'
                <<SETW2<< (readN>0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%'
                <<SETW2<< (readN>0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%'
                <<SETW2<< (readN>0 ? double(unmappedShort)/double(readN)*100 : 0)<<'%'
                <<SETW2<< (readN>0 ? double(unmappedOther)/double(readN)*100 : 0) <<'%'
                <<"\n"<<flush;
        timeLastReport=timeCurrent;
    };
};
void Stats::reportFinal(ofstream &streamOut, Parameters *P) { int w1=50; time( &timeFinish); //<<setiosflags(ios::left) streamOut <<setiosflags(ios::fixed) << setprecision(2) \ <<setw(w1)<< "Started job on |\t" << timeMonthDayTime(timeStart)<<"\n" \ <<setw(w1)<< "Started mapping on |\t" << timeMonthDayTime(timeStartMap)<<"\n" \ <<setw(w1)<< "Finished on |\t"<< timeMonthDayTime(timeFinish)<<"\n" \ <<setw(w1)<< "Mapping speed, Million of reads per hour |\t"<< double(readN)/1e6/difftime(timeFinish,timeStartMap)*3600<<"\n" \ <<"\n" \ <<setw(w1)<< "Number of input reads |\t" << readN <<"\n" \ <<setw(w1)<< "Average input read length |\t" << (readN>0 ? readBases/readN : 0) <<"\n" \ <<setw(w1)<< "UNIQUE READS:\n" \ <<setw(w1)<< "Uniquely mapped reads number |\t" << mappedReadsU <<"\n" \ <<setw(w1)<< "Uniquely mapped reads % |\t" << (readN>0 ? double(mappedReadsU)/double(readN)*100 : 0) <<'%'<<"\n" \ <<setw(w1)<< "Average mapped length |\t" << (mappedReadsU>0 ? double(mappedBases)/double(mappedReadsU) : 0) <<"\n"; streamOut <<setw(w1)<< "Number of splices: Total |\t" << splicesN[0]+splicesN[1]+splicesN[2]+splicesN[3]+splicesN[4]+splicesN[5]+splicesN[6]<< "\n" \ <<setw(w1)<< "Number of splices: Annotated (sjdb) |\t" << splicesNsjdb << "\n" \ <<setw(w1)<< "Number of splices: GT/AG |\t" << splicesN[1]+splicesN[2] << "\n" \ <<setw(w1)<< "Number of splices: GC/AG |\t" << splicesN[3]+splicesN[4] << "\n" \ <<setw(w1)<< "Number of splices: AT/AC |\t" << splicesN[5]+splicesN[6] << "\n" \ <<setw(w1)<< "Number of splices: Non-canonical |\t" << splicesN[0] << "\n"; streamOut <<setw(w1)<< "Mismatch rate per base, % |\t" << double(mappedMismatchesN)/double(mappedBases)*100 <<'%' <<"\n" \ <<setw(w1)<< "Deletion rate per base |\t" << (mappedBases>0 ? double(mappedDelL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \ <<setw(w1)<< "Deletion average length |\t" << (mappedDelN>0 ? double(mappedDelL)/double(mappedDelN) : 0) <<"\n" \ <<setw(w1)<< "Insertion rate per base |\t" << (mappedBases>0 ? 
double(mappedInsL)/double(mappedBases)*100 : 0) <<'%' <<"\n" \ <<setw(w1)<< "Insertion average length |\t" << (mappedInsN>0 ? double(mappedInsL)/double(mappedInsN) : 0) <<"\n" \ <<setw(w1)<< "MULTI-MAPPING READS:\n" \ <<setw(w1)<< "Number of reads mapped to multiple loci |\t" << mappedReadsM <<"\n" \ <<setw(w1)<< "% of reads mapped to multiple loci |\t" << (readN>0 ? double(mappedReadsM)/double(readN)*100 : 0)<<'%' <<"\n" \ <<setw(w1)<< "Number of reads mapped to too many loci |\t" << unmappedMulti <<"\n" \ <<setw(w1)<< "% of reads mapped to too many loci |\t" << (readN>0 ? double(unmappedMulti)/double(readN)*100 : 0) <<'%' <<"\n" \ <<setw(w1)<< "UNMAPPED READS:\n" \ <<setw(w1)<< "% of reads unmapped: too many mismatches |\t" << (readN>0 ? double(unmappedMismatch)/double(readN)*100 : 0) <<'%' <<"\n" \ <<setw(w1)<< "% of reads unmapped: too short |\t" << (readN>0 ? double(unmappedShort)/double(readN)*100 : 0) <<'%' <<"\n" \ <<setw(w1)<< "% of reads unmapped: other |\t" << (readN>0 ? double(unmappedOther)/double(readN)*100 :0) <<'%'<<"\n" \ <<setw(w1)<< "CHIMERIC READS:\n" \ <<setw(w1)<< "Number of chimeric reads |\t" << chimericAll <<"\n" \ <<setw(w1)<< "% of chimeric reads |\t" << (readN>0 ? double(chimericAll)/double(readN)*100 :0) <<'%'<<"\n" <<flush; };
/**
 * Collects splice junctions from all configured sources and inserts them
 * into the loaded genome and its indexes.
 *
 * Sources, loaded in this order into one SjdbClass:
 *   1. the 1st-pass junction file P->twoPass.pass1sjFile (if non-empty),
 *   2. the files in P->sjdbFileChrStartEnd (if the first entry is not "-"),
 *   3. the GTF file P->sjdbGTFfile (if not "-").
 * The junctions are then prepared with sjdbPrepare(), inserted with
 * sjdbBuildIndex(), and P->winBinN is recomputed from the new P->nGenome.
 *
 * The timestamp is refreshed before every timestamped log line, so it is
 * never read uninitialized when the 1st-pass file is not configured.
 *
 * @param P       run parameters; nGenome and winBinN are updated
 * @param genome  loaded genome whose G, SA, SA2 and SAi are extended
 */
void sjdbInsertJunctions(Parameters *P, Genome &genome)
{
    SjdbClass sjdbLoci;
    time_t rawtime;

    //load 1st pass junctions
    if (P->twoPass.pass1sjFile.size()>0) {
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass=" << P->twoPass.pass1sjFile <<"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    //load from junction files
    if (P->sjdbFileChrStartEnd.at(0)!="-") {
        sjdbLoadFromFiles(P,sjdbLoci);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    //load from GTF
    if (P->sjdbGTFfile!="-") {
        loadGTF(sjdbLoci, P, P->genomeDirOut);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    sjdbPrepare (sjdbLoci, P, genome.G, P->nGenome, P->twoPass.dir);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished preparing junctions" <<endl;

    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, genome.G, genome.SA, genome.SA2, genome.SAi);
    time ( &rawtime );
    *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished inserting 1st pass junctions into genome" <<endl;

    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
};
void Genome::genomeLoad(){//allocate and load Genome time_t rawtime; time ( &rawtime ); *(P->inOut->logStdOut) << timeMonthDayTime(rawtime) << " ..... Loading genome\n" <<flush; uint *shmNG=NULL, *shmNSA=NULL; //pointers to shm stored values , *shmSG, *shmSSA uint64 shmSize=0;//, shmStartG=0; shmStartSA=0; uint L=200,K=6; Parameters *P1 = new Parameters; ifstream parFile((P->genomeDir+("/genomeParameters.txt")).c_str()); if (parFile.good()) { P->inOut->logMain << "Reading genome generation parameters:\n"; P1->inOut = P->inOut; P1->scanAllLines(parFile,3,-1); parFile.close(); } else { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: could not open genome file "<< P->genomeDir+("/genomeParameters.txt") << endl; errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //check genome version if (P1->versionGenome.size()==0 || P1->versionGenome[0]==0) {// ostringstream errOut; errOut << "EXITING because of FATAL ERROR: read no value for the versionGenome parameter from genomeParameters.txt file\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); } else if (P->sjdbFileChrStartEnd.at(0)=="-" && P1->versionGenome.at(0) >= P->versionGenome.at(0)) {// P->inOut->logMain << "Genome version is compatible with current STAR version\n"; } else if (P->sjdbFileChrStartEnd.at(0)!="-" && P1->versionGenome.at(0) >= P->versionGenome.at(1)) {// P->inOut->logMain << "Genome version is compatible with current STAR version\n"; } else { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: Genome version is INCOMPATIBLE with current STAR version\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of 
STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //check if sjdbInfo.txt exists => genome was generated with junctions bool sjdbInfoExists=false; struct stat sjdb1; if ( stat( (P->genomeDir+"/sjdbInfo.txt").c_str(), &sjdb1) == 0 ) {//file exists sjdbInfoExists=true; }; if ( P->sjdbInsert.yes && sjdbInfoExists && P1->sjdbInsert.save=="") {//if sjdbInsert, and genome had junctions, and genome is old - it should be re-generated with new STAR ostringstream errOut; errOut << "EXITING because of FATAL ERROR: old Genome is INCOMPATIBLE with on the fly junction insertion\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //record required genome parameters in P P->genomeSAindexNbases=P1->genomeSAindexNbases; P->genomeChrBinNbits=P1->genomeChrBinNbits; P->genomeSAsparseD=P1->genomeSAsparseD; if (P->parArray.at(P->sjdbOverhang_par)->inputLevel==0 && P1->sjdbOverhang>0) {//if --sjdbOverhang was not defined by user and it was defined >0 at the genome generation step, then use sjdbOverhang from the genome generation step P->sjdbOverhang=P1->sjdbOverhang; P->inOut->logMain << "--sjdbOverhang = " << P->sjdbOverhang << " taken from the generated genome\n"; } else if (sjdbInfoExists && P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang!=P1->sjdbOverhang) {//if sjdbOverhang was defined at the genome generation step,the mapping step value has to agree with it ostringstream errOut; errOut << "EXITING because of fatal PARAMETERS error: present --sjdbOverhang="<<P->sjdbOverhang << " is not equal to the value at the genome generation step ="<< P1->sjdbOverhang << "\n"; errOut << "SOLUTION: \n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; P->sjdbLength = P->sjdbOverhang==0 ? 
0 : P->sjdbOverhang*2+1; P->inOut->logMain << "Started loading the genome: " << asctime (localtime ( &rawtime ))<<"\n"<<flush; ifstream GenomeIn, SAin, SAiIn; P->nGenome = OpenStream("Genome",GenomeIn); P->nSAbyte = OpenStream("SA",SAin); OpenStream("/SAindex",SAiIn); uint SAiInBytes=0; SAiInBytes += fstreamReadBig(SAiIn,(char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases)); P->genomeSAindexStart = new uint[P->genomeSAindexNbases+1]; SAiInBytes += fstreamReadBig(SAiIn,(char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1)); P->nSAi=P->genomeSAindexStart[P->genomeSAindexNbases]; P->inOut->logMain << "Read from SAindex: genomeSAindexNbases=" << P->genomeSAindexNbases <<" nSAi="<< P->nSAi <<endl; /////////////////////////////////// at this point all array sizes should be known: calculate packed array lengths P->GstrandBit = (uint) floor(log(P->nGenome)/log(2))+1; if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA P->GstrandMask = ~(1LLU<<P->GstrandBit); P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1); SA.defineBits(P->GstrandBit+1,P->nSA); P->SAiMarkNbit=P->GstrandBit+1; P->SAiMarkAbsentBit=P->GstrandBit+2; P->SAiMarkNmaskC=1LLU << P->SAiMarkNbit; P->SAiMarkNmask=~P->SAiMarkNmaskC; P->SAiMarkAbsentMaskC=1LLU << P->SAiMarkAbsentBit; P->SAiMarkAbsentMask=~P->SAiMarkAbsentMaskC; SAi.defineBits(P->GstrandBit+3,P->nSAi); P->inOut->logMain << "nGenome=" << P->nGenome << "; nSAbyte=" << P->nSAbyte <<endl<< flush; P->inOut->logMain <<"GstrandBit="<<int(P->GstrandBit)<<" SA number of indices="<<P->nSA<<endl<<flush; shmSize=SA.lengthByte + P->nGenome+L+L+SHM_startG+8; shmSize+= SAi.lengthByte; if (P->annotScoreScale>0) shmSize+=P->nGenome; if ((P->genomeLoad=="LoadAndKeep" || P->genomeLoad=="LoadAndRemove" || P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") && sharedMemory == NULL) { bool unloadLast = P->genomeLoad=="LoadAndRemove"; try { sharedMemory = new SharedMemory(shmKey, unloadLast); 
sharedMemory->SetErrorStream(P->inOut->logStdOut); if (!sharedMemory->NeedsAllocation()) P->inOut->logMain <<"Found genome in shared memory\n"<<flush; if (P->genomeLoad=="Remove") {//kill the genome and exit if (sharedMemory->NeedsAllocation()) {//did not find genome in shared memory, nothing to kill ostringstream errOut; errOut << "EXITING: Did not find the genome in memory, did not remove any genomes from shared memory\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); } else { sharedMemory->Clean(); P->inOut->logMain <<"DONE: removed the genome from shared memory\n"<<flush; return; }; } if (sharedMemory->NeedsAllocation()){ P->inOut->logMain <<"Allocating shared memory for genome\n"<<flush; sharedMemory->Allocate(shmSize); } } catch (const SharedMemoryException & exc) { HandleSharedMemoryException(exc, shmSize); } shmStart = (char*) sharedMemory->GetMapped(); shmNG= (uint*) (shmStart+SHM_sizeG); shmNSA= (uint*) (shmStart+SHM_sizeSA); if (!sharedMemory->IsAllocator()) { // genome is in shared memory or being loaded // wait for the process that will populate it // and record the sizes uint iwait=0; while (*shmNG != P->nGenome) { iwait++; P->inOut->logMain <<"Another job is still loading the genome, sleeping for 1 min\n" <<flush; sleep(60); if (iwait==100) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: waited too long for the other job to finish loading the genome" << strerror(errno) << "\n" <<flush; errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_LOADING_WAITED_TOO_LONG, *P); }; }; if (P->nSAbyte!=*shmNSA) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: the SA file size did not match what we found in shared memory" << "\n" << flush; errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" 
<< flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INCONSISTENT_DATA, *P); } P->inOut->logMain << "Using shared memory for genome. key=0x" <<hex<<shmKey<<dec<< "; shmid="<< sharedMemory->GetId() <<endl<<flush; } G1=shmStart+SHM_startG; SA.pointArray(G1+P->nGenome+L+L); char* shmNext=SA.charArray+P->nSAbyte; SAi.pointArray(shmNext); shmNext += SAi.lengthByte; // if (twoPass.pass1readsN==0) {//not 2-pass // shmStartG=SHM_startSHM; // shmStartSA=0; // } else {//2-pass // ostringstream errOut; // errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory' "\n" ; // errOut << "SOLUTION: check shared memory settigns as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush; // exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P); // }; if (P->annotScoreScale>0) {//optional allocation sigG = shmNext; shmNext += P->nGenome; } } else if (P->genomeLoad=="NoSharedMemory") // simply allocate memory, do not use shared memory { try { if (P->sjdbInsert.pass1 || P->sjdbInsert.pass2) {//reserve extra memory for insertion at the 1st and/or 2nd step nGenomePass1=P->nGenome; nSApass1=P->nSA; if (P->sjdbInsert.pass1) { nGenomePass1+=P->limitSjdbInsertNsj*P->sjdbLength; nSApass1+=2*P->limitSjdbInsertNsj*P->sjdbLength; }; nGenomePass2=nGenomePass1; nSApass2=nSApass1; if (P->sjdbInsert.pass2) { nGenomePass2+=P->limitSjdbInsertNsj*P->sjdbLength; nSApass2+=2*P->limitSjdbInsertNsj*P->sjdbLength; }; G1=new char[nGenomePass2+L+L]; SApass2.defineBits(P->GstrandBit+1,nSApass2); SApass2.allocateArray(); SApass1.defineBits(P->GstrandBit+1,nSApass1); SApass1.pointArray(SApass2.charArray+SApass2.lengthByte-SApass1.lengthByte); SA.pointArray(SApass1.charArray+SApass1.lengthByte-SA.lengthByte); } else {//no insertions G1=new char[P->nGenome+L+L]; SA.allocateArray(); }; SAi.allocateArray(); P->inOut->logMain <<"Shared memory is not used for 
genomes. Allocated a private copy of the genome.\n"<<flush; } catch (exception & exc) { ostringstream errOut; errOut <<"EXITING: fatal error trying to allocate genome arrays, exception thrown: "<<exc.what()<<endl; errOut <<"Possible cause 1: not enough RAM. Check if you have enough RAM " << P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000 << " bytes\n"; errOut <<"Possible cause 2: not enough virtual memory allowed with ulimit. SOLUTION: run ulimit -v " << P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000<<endl <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_MEMORY_ALLOCATION, *P); }; } // if (twopass1readsN==0) {//not 2-pass // shmStartG=SHM_startSHM; // shmStartSA=0; // } else {//2-pass // ostringstream errOut; // errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory' "\n" ; // errOut << "SOLUTION: check shared memory settings as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush; // exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P); // }; G=G1+L; bool isAllocatorProcess = sharedMemory != NULL && sharedMemory->IsAllocator(); if (P->genomeLoad=="NoSharedMemory" || isAllocatorProcess) {//load genome and SAs from files //load genome P->inOut->logMain <<"Genome file size: "<<P->nGenome <<" bytes; state: good=" <<GenomeIn.good()\ <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"\n"<<flush; P->inOut->logMain <<"Loading Genome ... " << flush; uint genomeReadBytesN=fstreamReadBig(GenomeIn,G,P->nGenome); P->inOut->logMain <<"done! 
state: good=" <<GenomeIn.good()\ <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush; GenomeIn.close(); for (uint ii=0;ii<L;ii++) {// attach a tail with the largest symbol G1[ii]=K-1; G[P->nGenome+ii]=K-1; }; //load SAs P->inOut->logMain <<"SA file size: "<<SA.lengthByte <<" bytes; state: good=" <<SAin.good()\ <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"\n"<<flush; P->inOut->logMain <<"Loading SA ... " << flush; genomeReadBytesN=fstreamReadBig(SAin,SA.charArray, SA.lengthByte); P->inOut->logMain <<"done! state: good=" <<SAin.good()\ <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush; SAin.close(); P->inOut->logMain <<"Loading SAindex ... " << flush; SAiInBytes +=fstreamReadBig(SAiIn,SAi.charArray, SAi.lengthByte); P->inOut->logMain <<"done: "<<SAiInBytes<<" bytes\n" << flush; }; SAiIn.close(); if ((P->genomeLoad=="LoadAndKeep" || P->genomeLoad=="LoadAndRemove" || P->genomeLoad=="LoadAndExit") && isAllocatorProcess ) { //record sizes. 
This marks the end of genome loading *shmNG=P->nGenome; *shmNSA=P->nSAbyte; }; time ( &rawtime ); P->inOut->logMain << "Finished loading the genome: " << asctime (localtime ( &rawtime )) <<"\n"<<flush; #ifdef COMPILE_FOR_MAC { uint sum1=0; for (uint ii=0;ii<P->nGenome; ii++) sum1 += (uint) (unsigned char) G[ii]; P->inOut->logMain << "Sum of all Genome bytes: " <<sum1 <<"\n"<<flush; sum1=0; for (uint ii=0;ii<SA.lengthByte; ii++) sum1 += (uint) (unsigned char) SA.charArray[ii]; P->inOut->logMain << "Sum of all SA bytes: " <<sum1 <<"\n"<<flush; sum1=0; for (uint ii=0;ii<SAi.lengthByte; ii++) sum1 += (uint) (unsigned char) SAi.charArray[ii]; P->inOut->logMain << "Sum of all SAi bytes: " <<sum1 <<"\n"<<flush; }; #endif if (P->genomeLoad=="LoadAndExit") { uint shmSum=0; for (uint ii=0;ii<shmSize;ii++) shmSum+=shmStart[ii]; P->inOut->logMain << "genomeLoad=LoadAndExit: completed, the genome is loaded and kept in RAM, EXITING now.\n"<<flush; // system("echo `date` ..... Finished genome loading >> Log.timing.out"); return; }; //find chr starts from files P->chrInfoLoad(); P->chrBinFill(); //splice junctions database if (P->nGenome==P->chrStart[P->nChrReal]) {//no sjdb P->sjdbN=0; P->sjGstart=P->chrStart[P->nChrReal]+1; //not sure why I need that } else {//there are sjdb chromosomes ifstream sjdbInfo((P->genomeDir+"/sjdbInfo.txt").c_str()); if (sjdbInfo.fail()) { ostringstream errOut; errOut << "EXITING because of FATAL error, could not open file " << (P->genomeDir+"/sjdbInfo.txt") <<"\n"; errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; sjdbInfo >> P->sjdbN >> P->sjdbOverhang; P->inOut->logMain << "Processing splice junctions database sjdbN=" <<P->sjdbN<<", sjdbOverhang=" <<P->sjdbOverhang <<" \n"; P->sjChrStart=P->nChrReal; P->sjGstart=P->chrStart[P->sjChrStart]; //fill 
the sj-db to genome translation array P->sjDstart=new uint [P->sjdbN]; P->sjAstart=new uint [P->sjdbN]; P->sjdbStart=new uint [P->sjdbN]; P->sjdbEnd=new uint [P->sjdbN]; P->sjdbMotif=new uint8 [P->sjdbN]; P->sjdbShiftLeft=new uint8 [P->sjdbN]; P->sjdbShiftRight=new uint8 [P->sjdbN]; P->sjdbStrand=new uint8 [P->sjdbN]; for (uint ii=0;ii<P->sjdbN;ii++) {//get the info about junctions from sjdbInfo.txt { uint16 d1,d2,d3,d4; sjdbInfo >> P->sjdbStart[ii] >> P->sjdbEnd[ii] >> d1 >> d2 >> d3 >> d4; P->sjdbMotif[ii] = (uint8) d1; P->sjdbShiftLeft[ii] = (uint8) d2; P->sjdbShiftRight[ii] = (uint8) d3; P->sjdbStrand[ii] = (uint8) d4; }; P->sjDstart[ii] = P->sjdbStart[ii] - P->sjdbOverhang; P->sjAstart[ii] = P->sjdbEnd[ii] + 1; if (P->sjdbMotif[ii]==0) {//shinon-canonical junctions back to their true coordinates P->sjDstart[ii] += P->sjdbShiftLeft[ii]; P->sjAstart[ii] += P->sjdbShiftLeft[ii]; }; }; }; //check and redefine some parameters //max intron size if (P->alignIntronMax==0 && P->alignMatesGapMax==0) { P->inOut->logMain << "alignIntronMax=alignMatesGapMax=0, the max intron size will be approximately determined by (2^winBinNbits)*winAnchorDistNbins=" \ << (1LLU<<P->winBinNbits)*P->winAnchorDistNbins <<endl; } else { //redefine winBinNbits P->winBinNbits=max( (uint) floor(log2(P->nGenome/40000)+0.5), (uint) floor(log2(max(max(4LLU,P->alignIntronMax),P->alignMatesGapMax)/4)+0.5) ); P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" redefined winBinNbits="<< P->winBinNbits <<endl; }; if (P->winBinNbits > P->genomeChrBinNbits) { P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<" > " << "genomeChrBinNbits=" << P->genomeChrBinNbits << " redefining:\n"; P->winBinNbits=P->genomeChrBinNbits; P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<endl; }; if (P->alignIntronMax==0 && P->alignMatesGapMax==0) { } else { //redefine winFlankNbins,winAnchorDistNbins P->winFlankNbins=max(P->alignIntronMax,P->alignMatesGapMax)/(1LLU<<P->winBinNbits)+1; 
P->winAnchorDistNbins=2*P->winFlankNbins; P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" and alignMatesGapMax="<<P->alignMatesGapMax<<\ ", redefined winFlankNbins="<<P->winFlankNbins<<" and winAnchorDistNbins="<<P->winAnchorDistNbins<<endl; }; P->winBinChrNbits=P->genomeChrBinNbits-P->winBinNbits; P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;//this may be chenaged later };
int main(int argInN, char* argIn[]) { time(&g_statsAll.timeStart); Parameters *P = new Parameters; //all parameters P->inputParameters(argInN, argIn); *(P->inOut->logStdOut) << timeMonthDayTime(g_statsAll.timeStart) << " ..... Started STAR run\n" <<flush; //generate genome if (P->runMode=="genomeGenerate") { genomeGenerate(P); (void) sysRemoveDir (P->outFileTmp); P->inOut->logMain << "DONE: Genome generation, EXITING\n" << flush; exit(0); } else if (P->runMode!="alignReads") { P->inOut->logMain << "EXITING because of INPUT ERROR: unknown value of input parameter runMode=" <<P->runMode<<endl<<flush; exit(1); }; Genome mainGenome (P); mainGenome.genomeLoad(); if (P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") { return 0; }; P->twoPass.pass2=false; //this is the 1st pass SjdbClass sjdbLoci; if (P->sjdbInsert.pass1) { Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); }; //calculate genome-related parameters Transcriptome *mainTranscriptome=NULL; /////////////////////////////////////////////////////////////////////////////////////////////////START if (P->runThreadN>1) { g_threadChunks.threadArray=new pthread_t[P->runThreadN]; pthread_mutex_init(&g_threadChunks.mutexInRead, NULL); pthread_mutex_init(&g_threadChunks.mutexOutSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutBAM1, NULL); pthread_mutex_init(&g_threadChunks.mutexOutUnmappedFastx, NULL); pthread_mutex_init(&g_threadChunks.mutexOutFilterBySJout, NULL); pthread_mutex_init(&g_threadChunks.mutexStats, NULL); pthread_mutex_init(&g_threadChunks.mutexBAMsortBins, NULL); }; g_statsAll.progressReportHeader(P->inOut->logProgress); if (P->twoPass.yes) {//2-pass //re-define P for the pass1 Parameters *P1=new Parameters; *P1=*P; //turn off unnecessary calculations P1->outSAMtype[0]="None"; P1->outSAMbool=false; P1->outBAMunsorted=false; P1->outBAMcoord=false; P1->chimSegmentMin=0; P1->quant.yes=false; P1->quant.trSAM.yes=false; P1->quant.geCount.yes=false; 
P1->outFilterBySJoutStage=0; P1->outReadsUnmapped="None"; P1->outFileNamePrefix=P->twoPass.dir; P1->readMapNumber=P->twoPass.pass1readsN; // P1->inOut->logMain.open((P1->outFileNamePrefix + "Log.out").c_str()); g_statsAll.resetN(); time(&g_statsAll.timeStartMap); P->inOut->logProgress << timeMonthDayTime(g_statsAll.timeStartMap) <<"\tStarted 1st pass mapping\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started 1st pass mapping\n" <<flush; //run mapping for Pass1 ReadAlignChunk *RAchunk1[P->runThreadN]; for (int ii=0;ii<P1->runThreadN;ii++) { RAchunk1[ii]=new ReadAlignChunk(P1, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P1, RAchunk1); outputSJ(RAchunk1,P1); //collapse and output junctions // for (int ii=0;ii<P1->runThreadN;ii++) { // delete [] RAchunk[ii]; // }; time_t rawtime; time (&rawtime); P->inOut->logProgress << timeMonthDayTime(rawtime) <<"\tFinished 1st pass mapping\n"; *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished 1st pass mapping\n" <<flush; ofstream logFinal1 ( (P->twoPass.dir + "/Log.final.out").c_str()); g_statsAll.reportFinal(logFinal1,P1); P->twoPass.pass2=true;//starting the 2nd pass P->twoPass.pass1sjFile=P->twoPass.dir+"/SJ.out.tab"; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //reopen reads files P->closeReadsFiles(); P->openReadsFiles(); } else {//not 2-pass //nothing for now }; if ( P->quant.yes ) {//load transcriptome mainTranscriptome=new Transcriptome(P); }; //initialize Stats g_statsAll.resetN(); time(&g_statsAll.timeStartMap); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... 
Started mapping\n" <<flush; g_statsAll.timeLastReport=g_statsAll.timeStartMap; //open SAM/BAM files for output if (P->outSAMmode != "None") {//open SAM file and write header ostringstream samHeaderStream; for (uint ii=0;ii<P->nChrReal;ii++) { samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n"; }; if (P->outSAMheaderPG.at(0)!="-") { samHeaderStream << P->outSAMheaderPG.at(0); for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) { samHeaderStream << "\t" << P->outSAMheaderPG.at(ii); }; samHeaderStream << "\n"; }; samHeaderStream << "@PG\tID:STAR\tPN:STAR\tVN:" << STAR_VERSION <<"\tCL:" << P->commandLineFull <<"\n"; if (P->outSAMheaderCommentFile!="-") { ifstream comstream (P->outSAMheaderCommentFile); while (comstream.good()) { string line1; getline(comstream,line1); if (line1.find_first_not_of(" \t\n\v\f\r")!=std::string::npos) {//skip blank lines samHeaderStream << line1 <<"\n"; }; }; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; samHeaderStream << "@CO\t" <<"user command line: " << P->commandLine <<"\n"; if (P->outSAMheaderHD.at(0)!="-") { P->samHeaderHD = P->outSAMheaderHD.at(0); for (uint ii=1;ii<P->outSAMheaderHD.size(); ii++) { P->samHeaderHD +="\t" + P->outSAMheaderHD.at(ii); }; } else { P->samHeaderHD = "@HD\tVN:1.4"; }; P->samHeader=P->samHeaderHD+"\n"+samHeaderStream.str(); //for the sorted BAM, need to add SO:cooridnate to the header line P->samHeaderSortedCoord=P->samHeaderHD + (P->outSAMheaderHD.size()==0 ? 
"" : "\tSO:coordinate") + "\n" + samHeaderStream.str(); if (P->outSAMbool) {// *P->inOut->outSAM << P->samHeader; }; if (P->outBAMunsorted){ outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength); }; // if (P->outBAMcoord){ // outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength); // }; if ( P->quant.trSAM.yes ) { samHeaderStream.str(""); vector <uint> trlength; for (uint32 ii=0;ii<mainTranscriptome->trID.size();ii++) { uint32 iex1=mainTranscriptome->trExI[ii]+mainTranscriptome->trExN[ii]-1; //last exon of the transcript trlength.push_back(mainTranscriptome->exLenCum[iex1]+mainTranscriptome->exSE[2*iex1+1]-mainTranscriptome->exSE[2*iex1]+1); samHeaderStream << "@SQ\tSN:"<< mainTranscriptome->trID.at(ii) <<"\tLN:"<<trlength.back()<<"\n"; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; outBAMwriteHeader(P->inOut->outQuantBAMfile,samHeaderStream.str(),mainTranscriptome->trID,trlength); }; }; if (P->chimSegmentMin>0) { P->inOut->outChimJunction.open((P->outFileNamePrefix + "Chimeric.out.junction").c_str()); P->inOut->outChimSAM.open((P->outFileNamePrefix + "Chimeric.out.sam").c_str()); P->inOut->outChimSAM << P->samHeader; pthread_mutex_init(&g_threadChunks.mutexOutChimSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutChimJunction, NULL); }; // P->inOut->logMain << "mlock value="<<mlockall(MCL_CURRENT|MCL_FUTURE) <<"\n"<<flush; // prepare chunks and spawn mapping threads ReadAlignChunk *RAchunk[P->runThreadN]; for (int ii=0;ii<P->runThreadN;ii++) { RAchunk[ii]=new ReadAlignChunk(P, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P, RAchunk); if (P->outFilterBySJoutStage==1) {//completed stage 1, go to stage 2 P->inOut->logMain << "Completed stage 1 mapping of outFilterBySJout mapping\n"<<flush; outputSJ(RAchunk,P);//collapse novel junctions P->readFilesIndex=-1; P->outFilterBySJoutStage=2; if 
(P->outBAMcoord) { for (int it=0; it<P->runThreadN; it++) {//prepare the unmapped bin RAchunk[it]->chunkOutBAMcoord->coordUnmappedPrepareBySJout(); }; }; mapThreadsSpawn(P, RAchunk); }; //close some BAM files if (P->inOut->outBAMfileUnsorted!=NULL) { bgzf_flush(P->inOut->outBAMfileUnsorted); bgzf_close(P->inOut->outBAMfileUnsorted); }; if (P->inOut->outQuantBAMfile!=NULL) { bgzf_flush(P->inOut->outQuantBAMfile); bgzf_close(P->inOut->outQuantBAMfile); }; if (P->outBAMcoord && P->limitBAMsortRAM==0) {//make it equal ot the genome size P->limitBAMsortRAM=P->nGenome+mainGenome.SA.lengthByte+mainGenome.SAi.lengthByte; }; //no need for genome anymore, free the memory mainGenome.freeMemory(); if ( P->quant.geCount.yes ) {//output gene quantifications for (int ichunk=1; ichunk<P->runThreadN; ichunk++) {//sum counts from all chunks into 0th chunk RAchunk[0]->chunkTr->quants->addQuants(*(RAchunk[ichunk]->chunkTr->quants)); }; RAchunk[0]->chunkTr->quantsOutput(); }; if (P->runThreadN>1 && P->outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files RAchunk[0]->chunkFilesCat(P->inOut->outSAM, P->outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN); }; if (P->outBAMcoord) {//sort BAM if needed *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... 
Started sorting BAM\n" <<flush; uint32 nBins=P->outBAMcoordNbins; //check max size needed for sorting uint maxMem=0; for (uint32 ibin=0; ibin<nBins-1; ibin++) {//check akk bins uint binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]+24*RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; }; if (binS>maxMem) maxMem=binS; }; P->inOut->logMain << "Max memory needed for sorting = "<<maxMem<<endl; if (maxMem>P->limitBAMsortRAM) { ostringstream errOut; errOut <<"EXITING because of fatal ERROR: not enough memory for BAM sorting: \n"; errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<maxMem+1000000000; exitWithError(errOut.str(), std::cerr, P->inOut->logMain, EXIT_CODE_PARAMETER, *P); }; uint totalMem=0; // P->inOut->logMain << "Started sorting BAM ..." <<endl; #pragma omp parallel num_threads(P->outBAMsortingThreadNactual) #pragma omp for schedule (dynamic,1) for (uint32 ibin1=0; ibin1<nBins; ibin1++) { uint32 ibin=nBins-1-ibin1;//reverse order to start with the last bin - unmapped reads uint binN=0, binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binN += RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]; }; if (binS==0) continue; //empty bin if (ibin == nBins-1) {//last bin for unmapped reads BAMbinSortUnmapped(ibin,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); } else { uint newMem=binS+binN*24; bool boolWait=true; while (boolWait) { #pragma omp critical if (totalMem+newMem < P->limitBAMsortRAM) { boolWait=false; totalMem+=newMem; }; sleep(0.1); }; BAMbinSortByCoordinate(ibin,binN,binS,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); #pragma omp critical totalMem-=newMem;//"release" RAM }; }; //concatenate all BAM files, using bam_cat char **bamBinNames = new char* [nBins]; vector <string> bamBinNamesV; for (uint32 ibin=0; ibin<nBins; ibin++) { 
bamBinNamesV.push_back(P->outBAMsortTmpDir+"/b"+to_string((uint) ibin)); struct stat buffer; if (stat (bamBinNamesV.back().c_str(), &buffer) != 0) {//check if file exists bamBinNamesV.pop_back(); }; }; for (uint32 ibin=0; ibin<bamBinNamesV.size(); ibin++) { bamBinNames[ibin] = (char*) bamBinNamesV.at(ibin).c_str(); }; bam_cat(bamBinNamesV.size(), bamBinNames, 0, P->outBAMfileCoordName.c_str()); }; //wiggle output if (P->outWigFlags.yes) { *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; string wigOutFileNamePrefix=P->outFileNamePrefix + "Signal"; signalFromBAM(P->outBAMfileCoordName, wigOutFileNamePrefix, *P); }; //aggregate output junctions //collapse splice junctions from different threads/chunks, and output them outputSJ(RAchunk,P); g_statsAll.progressReport(P->inOut->logProgress); P->inOut->logProgress << "ALL DONE!\n"<<flush; P->inOut->logFinal.open((P->outFileNamePrefix + "Log.final.out").c_str()); g_statsAll.reportFinal(P->inOut->logFinal,P); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeFinish) << " ..... Finished successfully\n" <<flush; P->inOut->logMain << "ALL DONE!\n"<<flush; sysRemoveDir (P->outFileTmp); P->closeReadsFiles();//this will kill the readFilesCommand processes if necessary mainGenome.~Genome(); //need explicit call because of the 'delete P->inOut' below, which will destroy P->inOut->logStdOut delete P->inOut; //to close files delete P; return 0; };
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P, string dirOut) {//load gtf file, add junctions to P->sjdb //returns number of added junctions if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") { time_t rawTime; time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush; ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() ); if (sjdbStreamIn.fail()) { ostringstream errOut; errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; if (P->chrNameIndex.size()==0) { for (uint ii=0;ii<P->nChrReal;ii++) { P->chrNameIndex[P->chrName[ii]]=ii; }; }; std::map <string,uint> transcriptIDnumber, geneIDnumber; uint exonN=0; while (sjdbStreamIn.good()) {//count the number of exons string chr1,ddd2,featureType; sjdbStreamIn >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) { exonN++; }; sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line }; if (exonN==0) { P->inOut->logMain << "WARNING: found no exons in sjdbGTFfile=" << P->sjdbGTFfile <<endl; return 0; }; uint* exonLoci=new uint [exonN*GTF_exonLoci_size]; char* transcriptStrand = new char [exonN]; vector <string> transcriptID, geneID; exonN=0;//re-calculate sjdbStreamIn.clear(); sjdbStreamIn.seekg(0,ios::beg); while (sjdbStreamIn.good()) { string oneLine,chr1,ddd2,featureType; getline(sjdbStreamIn,oneLine); istringstream oneLineStream (oneLine); oneLineStream >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1; if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in 
Genome fasta files for line:\n"; P->inOut->logMain << oneLine <<"\n"<<flush; continue; //do not process exons/transcripts on missing chromosomes }; uint ex1,ex2; char str1; oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last string oneLine1; getline(oneLineStream, oneLine1);//get the last field replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing oneLineStream.str(oneLine1); oneLineStream.clear(); string trID(""), gID(""), attr1(""); while (oneLineStream.good()) { oneLineStream >> attr1; if (attr1==P->sjdbGTFtagExonParentTranscript) { oneLineStream >> trID; trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end()); trID.erase(remove(trID.begin(),trID.end(),';'),trID.end()); } else if (attr1==P->sjdbGTFtagExonParentGene) { oneLineStream >> gID; gID.erase(remove(gID.begin(),gID.end(),'"'),gID.end()); gID.erase(remove(gID.begin(),gID.end(),';'),gID.end()); }; };
void sjdbInsertJunctions(Parameters * P, Parameters * P1, Genome & genome, SjdbClass & sjdbLoci) { time_t rawtime; if (P->sjdbN>0 && sjdbLoci.chr.size()==0) {//load from the saved genome, only if the loading did not happen already (if sjdb insertion happens at the 1st pass, sjdbLoci will be populated ifstream & sjdbStreamIn = ifstrOpen(P->genomeDir+"/sjdbList.out.tab", ERROR_OUT, "SOLUTION: re-generate the genome in genomeDir=" + P->genomeDir, P); sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),30); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the generated genome " << P->genomeDir+"/sjdbList.out.tab" <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; }; if (P->twoPass.pass2) {//load 1st pass new junctions //sjdbLoci already contains the junctions from before 1st pass ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() ); if (sjdbStreamIn.fail()) { ostringstream errOut; errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),0); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; } else {//loading junctions from GTF or tab or from the saved genome is only allowed at the 1st pass //at the 2nd pass these are already in the sjdbLoci if (P->sjdbFileChrStartEnd.at(0)!="-") {//load from junction files sjdbLoadFromFiles(P,sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),10); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n"; }; if (P->sjdbGTFfile!="-") {//load from 
GTF loadGTF(sjdbLoci, P, P->sjdbInsert.outDir); sjdbLoci.priority.resize(sjdbLoci.chr.size(),20); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; }; }; char *Gsj=new char [2*P->sjdbLength*sjdbLoci.chr.size()+1];//array to store junction sequences, will be filled in sjdbPrepare sjdbPrepare (sjdbLoci, P, P->chrStart[P->nChrReal], P->sjdbInsert.outDir, genome.G, Gsj);//P->nGenome - change when replacing junctions time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished preparing junctions" <<endl; if (P->sjdbN>P->limitSjdbInsertNsj) { ostringstream errOut; errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n"; errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n"; errOut << "SOLUTION: re-run with at least --limitSjdbInsertNsj "<<P->sjdbN<<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //insert junctions into the genome and SA and SAi sjdbBuildIndex (P, P1, Gsj, genome.G, genome.SA, (P->twoPass.pass2 ? genome.SApass2 : genome.SApass1), genome.SAi); delete [] Gsj; //junction sequences have been added to G time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... 
finished inserting junctions into genome" <<endl; if (P->sjdbInsert.save=="All") {//save and copy all genome files into sjdbInsert.outDir, except those created above if (P->genomeDir != P->sjdbInsert.outDir) { copyFile(P->genomeDir+"/chrName.txt", P->sjdbInsert.outDir+"/chrName.txt"); copyFile(P->genomeDir+"/chrStart.txt", P->sjdbInsert.outDir+"/chrStart.txt"); copyFile(P->genomeDir+"/chrNameLength.txt", P->sjdbInsert.outDir+"/chrNameLength.txt"); copyFile(P->genomeDir+"/chrLength.txt", P->sjdbInsert.outDir+"/chrLength.txt"); }; genomeParametersWrite(P->sjdbInsert.outDir+("/genomeParameters.txt"), P, ERROR_OUT); ofstream & genomeOut = ofstrOpen(P->sjdbInsert.outDir+"/Genome",ERROR_OUT, P); fstreamWriteBig(genomeOut,genome.G,P->nGenome,P->sjdbInsert.outDir+"/Genome",ERROR_OUT,P); genomeOut.close(); ofstream & saOut = ofstrOpen(P->sjdbInsert.outDir+"/SA",ERROR_OUT, P); fstreamWriteBig(saOut,(char*) genome.SA.charArray, (streamsize) genome.SA.lengthByte, P->sjdbInsert.outDir+"/SA",ERROR_OUT,P); saOut.close(); ofstream & saIndexOut = ofstrOpen(P->sjdbInsert.outDir+"/SAindex",ERROR_OUT, P); fstreamWriteBig(saIndexOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); fstreamWriteBig(saIndexOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); fstreamWriteBig(saIndexOut, genome.SAi.charArray, genome.SAi.lengthByte,P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); saIndexOut.close(); }; //re-calculate genome-related parameters P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1; };
void sjdbBuildIndex (Parameters *P, Parameters *P1, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray &SAi) { #define SPACER_CHAR GENOME_spacingChar if (P->sjdbN==0) {//no junctions to insert return; }; time_t rawtime; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl; *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Inserting junctions into the genome indices" <<endl; uint nGsj=P->sjdbLength*P->sjdbN; for (uint ii=1; ii<=P->sjdbN; ii++) { Gsj[ii*P->sjdbLength-1]=SPACER_CHAR; //to make sure this is > than any genome char }; Gsj[nGsj*2]=SPACER_CHAR+1;//mark the end of the text for (uint ii=0; ii<nGsj; ii++) {//reverse complement junction sequences Gsj[nGsj*2-1-ii]=Gsj[ii]<4 ? 3-Gsj[ii] : Gsj[ii]; //reverse complement }; char* G1c=new char[nGsj*2+1]; complementSeqNumbers(Gsj, G1c, nGsj*2+1); uint32* oldSJind=new uint32[P1->sjdbN]; // uint nIndicesSJ1=P->sjdbOverhang; uint nIndicesSJ1=P->sjdbLength;//keep all indices - this is pre-2.4.1 of generating the genome uint64* indArray=new uint64[2*P->sjdbN*(nIndicesSJ1+1)*2];//8+4 bytes for SA index and index in the genome * nJunction * nIndices per junction * 2 for reverse compl uint64 sjNew=0; #pragma omp parallel num_threads(P->runThreadN) #pragma omp for schedule (dynamic,1000) reduction(+:sjNew) for (uint isj=0; isj<2*P->sjdbN; isj++) {//find insertion points for each of the sequences char** seq1=new char*[2]; seq1[0]=Gsj+isj*P->sjdbLength; seq1[1]=G1c+isj*P->sjdbLength; uint isj1=isj<P->sjdbN ? isj : 2*P->sjdbN-1-isj; int sjdbInd = P1->sjdbN==0 ? -1 : binarySearch2(P->sjdbStart[isj1],P->sjdbEnd[isj1],P1->sjdbStart,P1->sjdbEnd,P1->sjdbN); if (sjdbInd<0) {//count new junctions ++sjNew; } else {//record new index of the old junctions oldSJind[sjdbInd]=isj1; }; for (uint istart1=0; istart1<nIndicesSJ1;istart1++) { uint istart=istart1; // uint istart=isj<P->sjdbN ? 
istart1 : istart1+1; //for rev-compl junction, shift by one base to start with the 1st non-spacer base uint ind1=2*(isj*nIndicesSJ1+istart1); if (sjdbInd>=0 || seq1[0][istart]>3) {//no index for already included junctions, or suffices starting with N indArray[ind1]=-1; } else { //indArray[ind1] = suffixArraySearch(seq1, istart, P->sjdbLength-istart1, G, SA, true, 0, P->nSA-1, 0, P) ; indArray[ind1] = suffixArraySearch(seq1, istart, 10000, G, SA, true, 0, P->nSA-1, 0, P) ; indArray[ind1+1] = isj*P->sjdbLength+istart; }; }; }; // for (int ii=0;ii<P1->sjdbN;ii++) {if ( oldSJind[ii]==0){cout <<ii<<endl;};}; sjNew = sjNew/2;//novel junctions were double counted on two strands time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SA search: number of new junctions=" << sjNew <<", old junctions="<<P->sjdbN-sjNew<<endl; uint nInd=0;//true number of new indices for (uint ii=0; ii<2*P->sjdbN*nIndicesSJ1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above if (indArray[ii*2]!= (uint) -1) { indArray[nInd*2]=indArray[ii*2]; indArray[nInd*2+1]=indArray[ii*2+1]; ++nInd; }; }; globalGsj=Gsj; qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished sorting SA indicesL nInd="<<nInd <<endl; indArray[2*nInd]=-999; //mark the last junction indArray[2*nInd+1]=-999; //mark the last junction P->nGenome=P->chrStart[P->nChrReal]+nGsj; P->nSA+=nInd; uint GstrandBit1 = (uint) floor(log(P->nGenome)/log(2))+1; if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA if ( GstrandBit1 != P->GstrandBit) {//too many junctions were added - GstrandBit changed ostringstream errOut; errOut << "EXITING because of FATAL ERROR: cannot insert junctions on the fly because of strand GstrandBit problem\n"; errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n"; 
exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; SA2.defineBits(P->GstrandBit+1,P->nSA); uint nGsjNew=sjNew*P->sjdbLength; //this is the actual number of bytes added to the genome, while nGsj is the total size of all junctions uint N2bit= 1LLU << P->GstrandBit; uint strandMask=~N2bit; //testing // PackedArray SAo; // SAo.defineBits(P->GstrandBit+1,P->nSA); // SAo.allocateArray(); // ifstream oldSAin("./DirTrue/SA"); // oldSAin.read(SAo.charArray,SAo.lengthByte); // oldSAin.close(); uint isj=0, isa2=0; for (uint isa=0;isa<P1->nSA;isa++) { //testing // if (isa2>0 && SA2[isa2-1]!=SAo[isa2-1]) { // cout <<isa2 <<" "<< SA2[isa2-1]<<" "<<SAo[isa2-1]<<endl; // }; // if (isa==69789089) // { // cout <<isa; // }; uint ind1=SA[isa]; if ( (ind1 & N2bit)>0 ) {//- strand uint ind1s = P1->nGenome - (ind1 & strandMask); if (ind1s>P->chrStart[P->nChrReal]) {//this index was an old sj, may need to shift it uint sj1 = (ind1s-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index ind1s += (oldSJind[sj1]-sj1)*P->sjdbLength; ind1 = (P->nGenome - ind1s) | N2bit; } else { ind1+=nGsjNew; //reverse complementary indices are all shifted by the length of junctions }; } else {//+ strand if (ind1>P->chrStart[P->nChrReal]) {//this index was an old sj, may need to shift it uint sj1 = (ind1-P->chrStart[P->nChrReal])/P->sjdbLength;//old junction index ind1 += (oldSJind[sj1]-sj1)*P->sjdbLength; }; }; SA2.writePacked(isa2,ind1); //TODO make sure that the first sj index is not before the first array index ++isa2; while (isa==indArray[isj*2]) {//insert sj index after the existing index uint ind1=indArray[isj*2+1]; if (ind1<nGsj) { ind1+=P->chrStart[P->nChrReal]; } else {//reverse strand ind1=(ind1-nGsj) | N2bit; }; SA2.writePacked(isa2,ind1); ++isa2; ++isj; }; }; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished inserting junction indices" <<endl; //SAi insertions for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { uint iSJ=0; 
uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index uint iSA1=SAi[ii]; uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask; if ( iSJ<nInd && (iSA1 & P->SAiMarkAbsentMaskC)>0 ) {//index missing from the old genome uint iSJ1=iSJ; int64 ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL); while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSJ]<iSA2) { ++iSJ; ind1=funCalcSAi(Gsj+indArray[2*iSJ+1],iL); }; if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) { SAi.writePacked(ii,indArray[2*iSJ]+iSJ+1); for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value SAi.writePacked(ii0,(indArray[2*iSJ]+iSJ+1) | P->SAiMarkAbsentMaskC); }; ++iSJ; ind0=ii; } else { iSJ=iSJ1; }; } else {//index was present in the old genome while (iSJ<nInd && indArray[2*iSJ]+1<iSA2) {//for this index insert "smaller" junctions ++iSJ; }; while (iSJ<nInd && indArray[2*iSJ]+1==iSA2) {//special case, the index falls right behind SAi if (funCalcSAi(Gsj+indArray[2*iSJ+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to the next index break; }; ++iSJ; }; SAi.writePacked(ii,iSA1+iSJ); for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value SAi.writePacked(ii0,(iSA2+iSJ) | P->SAiMarkAbsentMaskC); }; ind0=ii; }; }; }; // time ( &rawtime ); cout << timeMonthDayTime(rawtime) << "SAi first" <<endl; for (uint isj=0;isj<nInd;isj++) { int64 ind1=0; for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { uint g=(uint) Gsj[indArray[2*isj+1]+iL]; ind1 <<= 2; if (g>3) {//this iSA contains N, need to mark the previous for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) { ind1+=3; int64 ind2=P->genomeSAindexStart[iL1]+ind1; for (; ind2>=0; ind2--) {//find previous index that is not absent if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) { break; }; }; SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC); 
ind1 <<= 2; }; break; } else { ind1 += g; }; }; }; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SAi" <<endl; //change parameters, most parameters are already re-defined in sjdbPrepare.cpp SA.defineBits(P->GstrandBit+1,P->nSA);//same as SA2 SA.pointArray(SA2.charArray); P->nSAbyte=SA.lengthByte; P->sjGstart=P->chrStart[P->nChrReal]; memcpy(G+P->chrStart[P->nChrReal],Gsj, nGsj); /* testing PackedArray SAio=SAi; SAio.allocateArray(); ifstream oldSAiin("./DirTrue/SAindex"); // oldSAin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes oldSAiin.read(SAio.charArray,SAio.lengthByte); oldSAiin.close(); // for (uint ii=0;ii<P->nSA;ii++) { // if (SA2[ii]!=SAo[ii]) { // cout <<ii <<" "<< SA2[ii]<<" "<<SAo[ii]<<endl; // }; // }; for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index if ( SAio[ii]!=SAi[ii] ) { cout <<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl; }; }; }; */ /* ofstream genomeOut("/home/dobin/Genome"); fstreamWriteBig(genomeOut,G,P->nGenome+nGsj,"777","777",P); genomeOut.close(); genomeOut.open("/home/dobin/SA"); fstreamWriteBig(genomeOut,SA2.charArray,SA2.lengthByte,"777","777",P); genomeOut.close(); */ delete [] indArray; delete [] G1c; delete [] oldSJind; };
void genomeGenerate(Parameters *P) { //check parameters if (P->sjdbOverhang<=0 && (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "you need to specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); } if (P->sjdbFileChrStartEnd.at(0)=="-" && P->sjdbGTFfile=="-") { if (P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang>0) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "do not specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; P->sjdbOverhang=0; }; //time time_t rawTime; string timeString; time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... 
Starting to generate Genome files\n" <<flush; //define some parameters from input parameters P->genomeChrBinNbases=1LLU << P->genomeChrBinNbits; //write genome parameters file genomeParametersWrite(P->genomeDir+("/genomeParameters.txt"), P, "ERROR_00102"); char *G=NULL, *G1=NULL; uint nGenomeReal=genomeScanFastaFiles(P,G,false);//first scan the fasta file to find all the sizes P->chrBinFill(); uint L=10000;//maximum length of genome suffix uint nG1alloc=(nGenomeReal + L)*2; G1=new char[nG1alloc]; G=G1+L; memset(G1,GENOME_spacingChar,nG1alloc);//initialize to K-1 all bytes genomeScanFastaFiles(P,G,true); //load the genome sequence uint N = nGenomeReal; P->nGenome=N; uint N2 = N*2; ofstream chrN,chrS,chrL,chrNL; ofstrOpen(P->genomeDir+"/chrName.txt","ERROR_00103", P, chrN); ofstrOpen(P->genomeDir+"/chrStart.txt","ERROR_00103", P, chrS); ofstrOpen(P->genomeDir+"/chrLength.txt","ERROR_00103", P, chrL); ofstrOpen(P->genomeDir+"/chrNameLength.txt","ERROR_00103", P, chrNL); for (uint ii=0;ii<P->nChrReal;ii++) {//output names, starts, lengths chrN<<P->chrName[ii]<<"\n"; chrS<<P->chrStart[ii]<<"\n"; chrL<<P->chrLength.at(ii)<<"\n"; chrNL<<P->chrName[ii]<<"\t"<<P->chrLength.at(ii)<<"\n"; }; chrS<<P->chrStart[P->nChrReal]<<"\n";//size of the genome chrN.close();chrL.close();chrS.close(); chrNL.close(); if (P->limitGenomeGenerateRAM < (nG1alloc+nG1alloc/3)) {//allocate nG1alloc/3 for SA generation ostringstream errOut; errOut <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM="<< (P->limitGenomeGenerateRAM) <<"is too small for your genome\n"; errOut <<"SOLUTION: please specify limitGenomeGenerateRAM not less than"<< nG1alloc+nG1alloc/3 <<" and make that much RAM available \n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //preparing to generate SA for (uint ii=0;ii<N;ii++) {//- strand G[N2-1-ii]=G[ii]<4 ? 
3-G[ii] : G[ii]; }; P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { P->nSA++; }; }; P->GstrandBit = (uint) floor(log(N)/log(2))+1; if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA P->GstrandMask = ~(1LLU<<P->GstrandBit); PackedArray SA1;//SA without sjdb SA1.defineBits(P->GstrandBit+1,P->nSA); PackedArray SA2;//SA with sjdb, reserve more space if (P->sjdbInsert.yes) {//reserve space for junction insertion SA2.defineBits(P->GstrandBit+1,P->nSA+2*P->limitSjdbInsertNsj*P->sjdbLength);//TODO: this allocation is wasteful, get a better estimate of the number of junctions } else {//same as SA1 SA2.defineBits(P->GstrandBit+1,P->nSA); }; P->nSAbyte=SA2.lengthByte; P->inOut->logMain << "Number of SA indices: "<< P->nSA << "\n"<<flush; //sort SA time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. 
This may take a long time...\n" <<flush; // if (false) {//sort SA chunks for (uint ii=0;ii<N;ii++) {//re-fill the array backwards for sorting swap(G[N2-1-ii],G[ii]); }; globalG=G; globalL=L/sizeof(uint); //count the number of indices with 4nt prefix uint indPrefN=1LLU << 16; uint* indPrefCount = new uint [indPrefN]; memset(indPrefCount,0,indPrefN*sizeof(indPrefCount[0])); P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; indPrefCount[p1]++; P->nSA++; }; }; uint saChunkSize=(P->limitGenomeGenerateRAM-nG1alloc)/8/P->runThreadN; //number of SA indexes per chunk saChunkSize=saChunkSize*6/10; //allow extra space for qsort //uint saChunkN=((P->nSA/saChunkSize+1)/P->runThreadN+1)*P->runThreadN;//ensure saChunkN is divisible by P->runThreadN //saChunkSize=P->nSA/saChunkN+100000;//final chunk size if (P->runThreadN>1) saChunkSize=min(saChunkSize,P->nSA/(P->runThreadN-1)); uint saChunkN=P->nSA/saChunkSize;//estimate uint* indPrefStart = new uint [saChunkN*2]; //start and stop, *2 just in case uint* indPrefChunkCount = new uint [saChunkN*2]; indPrefStart[0]=0; saChunkN=0;//start counting chunks uint chunkSize1=indPrefCount[0]; for (uint ii=1; ii<indPrefN; ii++) { chunkSize1 += indPrefCount[ii]; if (chunkSize1 > saChunkSize) { saChunkN++; indPrefStart[saChunkN]=ii; indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii]; chunkSize1=indPrefCount[ii]; }; }; saChunkN++; indPrefStart[saChunkN]=indPrefN+1; indPrefChunkCount[saChunkN-1]=chunkSize1; P->inOut->logMain << "Number of chunks: " << saChunkN <<"; chunks size limit: " << saChunkSize*8 <<" bytes\n" <<flush; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... 
sorting Suffix Array chunks and saving them to disk...\n" <<flush; #pragma omp parallel for num_threads(P->runThreadN) ordered schedule(dynamic,1) for (int iChunk=0; iChunk < (int) saChunkN; iChunk++) {//start the chunk cycle: sort each chunk with qsort and write to a file uint* saChunk=new uint [indPrefChunkCount[iChunk]];//allocate local array for each chunk for (uint ii=0,jj=0;ii<N2;ii+=P->genomeSAsparseD) {//fill the chunk with SA indices if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; if (p1>=indPrefStart[iChunk] && p1<indPrefStart[iChunk+1]) { saChunk[jj]=ii; jj++; }; //TODO: if (jj==indPrefChunkCount[iChunk]) break; }; }; //sort the chunk qsort(saChunk,indPrefChunkCount[iChunk],sizeof(saChunk[0]),funCompareSuffixes); for (uint ii=0;ii<indPrefChunkCount[iChunk];ii++) { saChunk[ii]=N2-1-saChunk[ii]; }; //write files ofstream saChunkFile; string chunkFileName=P->genomeDir+"/SA_"+to_string( (uint) iChunk); ofstrOpen(chunkFileName,"ERROR_00105", P, saChunkFile); fstreamWriteBig(saChunkFile, (char*) saChunk, sizeof(saChunk[0])*indPrefChunkCount[iChunk],chunkFileName,"ERROR_00121",P); saChunkFile.close(); delete [] saChunk; saChunk=NULL; }; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... 
loading chunks from disk, packing SA...\n" <<flush; //read chunks and pack into full SA1 SA2.allocateArray(); SA1.pointArray(SA2.charArray + SA2.lengthByte-SA1.lengthByte); //SA1 is shifted to have space for junction insertion uint N2bit= 1LLU << P->GstrandBit; uint packedInd=0; #define SA_CHUNK_BLOCK_SIZE 10000000 uint* saIn=new uint[SA_CHUNK_BLOCK_SIZE]; //TODO make adjustable #ifdef genenomeGenerate_SA_textOutput ofstream SAtxtStream ((P->genomeDir + "/SAtxt").c_str()); #endif for (uint iChunk=0;iChunk<saChunkN;iChunk++) {//load files one by one and convert to packed ostringstream saChunkFileNameStream(""); saChunkFileNameStream<< P->genomeDir << "/SA_" << iChunk; ifstream saChunkFile(saChunkFileNameStream.str().c_str()); while (! saChunkFile.eof()) {//read blocks from each file uint chunkBytesN=fstreamReadBig(saChunkFile,(char*) saIn,SA_CHUNK_BLOCK_SIZE*sizeof(saIn[0])); for (uint ii=0;ii<chunkBytesN/sizeof(saIn[0]);ii++) { SA1.writePacked( packedInd+ii, (saIn[ii]<N) ? saIn[ii] : ( (saIn[ii]-N) | N2bit ) ); #ifdef genenomeGenerate_SA_textOutput SAtxtStream << saIn[ii] << "\n"; #endif }; packedInd += chunkBytesN/sizeof(saIn[0]); }; saChunkFile.close(); remove(saChunkFileNameStream.str().c_str());//remove the chunk file }; #ifdef genenomeGenerate_SA_textOutput SAtxtStream.close(); #endif delete [] saIn; if (packedInd != P->nSA ) {// ostringstream errOut; errOut << "EXITING because of FATAL problem while generating the suffix array\n"; errOut << "The number of indices read from chunks = "<<packedInd<<" is not equal to expected nSA="<<P->nSA<<"\n"; errOut << "SOLUTION: try to re-run suffix array generation, if it still does not work, report this problem to the author\n"<<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //DONE with suffix array generation for (uint ii=0;ii<N;ii++) {//return to normal order for future use swap(G[N2-1-ii],G[ii]); }; delete [] indPrefCount; delete [] indPrefStart; delete [] 
indPrefChunkCount; }; time ( &rawTime ); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; //////////////////////////////////////// // SA index // // PackedArray SAold; // // if (true) // {//testing: load SA from disk // //read chunks and pack into full SA1 // // ifstream oldSAin("./DirTrue/SA"); // oldSAin.seekg (0, ios::end); // P->nSAbyte=(uint) oldSAin.tellg(); // oldSAin.clear(); // oldSAin.seekg (0, ios::beg); // // P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1); // SAold.defineBits(P->GstrandBit+1,P->nSA); // SAold.allocateArray(); // // oldSAin.read(SAold.charArray,SAold.lengthByte); // oldSAin.close(); // // SA1=SAold; // SA2=SAold; // }; PackedArray SAip; genomeSAindex(G,SA1,P,SAip); if (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-") {//insert junctions SjdbClass sjdbLoci; Genome mainGenome(P); mainGenome.G=G; mainGenome.SA=SA1; mainGenome.SApass1=SA2; mainGenome.SAi=SAip; P->sjdbInsert.outDir=P->genomeDir; P->sjdbN=0;//no junctions are loaded yet P->twoPass.pass2=false; Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare. SA2.writePacked(P->nSA,0); }; //write genome to disk time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... 
writing Genome to disk ...\n" <<flush; ofstream genomeOut; ofstrOpen(P->genomeDir+"/Genome","ERROR_00104", P, genomeOut); fstreamWriteBig(genomeOut,G,P->nGenome,P->genomeDir+"/Genome","ERROR_00120",P); genomeOut.close(); //write SA time ( &rawTime ); P->inOut->logMain << "SA size in bytes: "<< P->nSAbyte << "\n"<<flush; P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; ofstream SAout; ofstrOpen(P->genomeDir+"/SA","ERROR_00106", P, SAout); fstreamWriteBig(SAout,(char*) SA2.charArray, (streamsize) P->nSAbyte,P->genomeDir+"/SA","ERROR_00122",P); SAout.close(); //write SAi time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; //write SAi to disk ofstream SAiOut; ofstrOpen(P->genomeDir+"/SAindex","ERROR_00107", P, SAiOut); fstreamWriteBig(SAiOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->genomeDir+"/SAindex","ERROR_00123",P); fstreamWriteBig(SAiOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->genomeDir+"/SAindex","ERROR_00124",P); fstreamWriteBig(SAiOut, SAip.charArray, SAip.lengthByte,P->genomeDir+"/SAindex","ERROR_00125",P); SAiOut.close(); SA2.deallocateArray(); time(&rawTime); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; };
uint insertSeqSA(PackedArray & SA, PackedArray & SA1, PackedArray & SAi, char * G, char * G1, uint64 nG, uint64 nG1, uint64 nG2, Parameters * P) {//insert new sequences into the SA uint GstrandBit1 = (uint) floor(log(nG+nG1)/log(2))+1; if (GstrandBit1<32) GstrandBit1=32; //TODO: use simple access function for SA if ( GstrandBit1+1 != SA.wordLength) {//sequence is too long - GstrandBit changed ostringstream errOut; errOut << "EXITING because of FATAL ERROR: cannot insert sequence on the fly because of strand GstrandBit problem\n"; errOut << "SOLUTION: please contact STAR author at https://groups.google.com/forum/#!forum/rna-star\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; uint N2bit= 1LLU << (SA.wordLength-1); uint strandMask=~N2bit; for (uint64 isa=0;isa<SA.length; isa++) { uint64 ind1=SA[isa]; if ( (ind1 & N2bit)>0 ) {//- strand if ( (ind1 & strandMask)>=nG2 ) {//the first nG bases ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence SA.writePacked(isa,ind1); }; } else {//+ strand if ( ind1>=nG ) {//the last nG2 bases ind1+=nG1; //reverse complementary indices are all shifted by the length of the sequence SA.writePacked(isa,ind1); }; }; }; char** seq1=new char*[2]; #define GENOME_endFillL 16 char* seqq=new char [4*nG1+3*GENOME_endFillL];//ends shouldbe filled with 5 to mark boundaries seq1[0]=seqq+GENOME_endFillL;//TODO: avoid defining an extra array, use reverse search seq1[1]=seqq+2*GENOME_endFillL+2*nG1; memset(seqq,GENOME_spacingChar,GENOME_endFillL); memset(seqq+2*nG1+GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL); memset(seqq+4*nG1+2*GENOME_endFillL,GENOME_spacingChar,GENOME_endFillL); memcpy(seq1[0], G1, nG1); for (uint ii=0; ii<nG1; ii++) {//reverse complement sequence seq1[0][2*nG1-1-ii]=seq1[0][ii]<4 ? 
3-seq1[0][ii] : seq1[0][ii]; }; complementSeqNumbers(seq1[0], seq1[1], 2*nG1);//complement uint64* indArray=new uint64[nG1*2*2+2];// for each base, 1st number - insertion place in SA, 2nd number - index, *2 for reverse compl #pragma omp parallel num_threads(P->runThreadN) #pragma omp for schedule (dynamic,1000) for (uint ii=0; ii<2*nG1; ii++) {//find insertion points for each of the sequences if (seq1[0][ii]>3) {//no index for suffices starting with N indArray[ii*2]=-1; } else { indArray[ii*2] = suffixArraySearch1(seq1, ii, 10000, G, nG, SA, (ii<nG1 ? true:false), 0, SA.length-1, 0, P) ; indArray[ii*2+1] = ii; }; }; uint64 nInd=0;//true number of new indices for (uint ii=0; ii<2*nG1; ii++) {//remove entries that cannot be inserted, this cannot be done in the parallel cycle above if (indArray[ii*2]!= (uint) -1) { indArray[nInd*2]=indArray[ii*2]; indArray[nInd*2+1]=indArray[ii*2+1]; ++nInd; }; }; time_t rawtime; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SA search, number of new SA indices = "<<nInd<<endl; globalGenomeArray=seq1[0]; qsort((void*) indArray, nInd, 2*sizeof(uint64), funCompareUintAndSuffixes); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished sorting SA indices"<<endl; indArray[2*nInd]=-999; //mark the last junction indArray[2*nInd+1]=-999; //mark the last junction SA1.defineBits(SA.wordLength,SA.length+nInd); /*testing PackedArray SAo; SAo.defineBits(P->GstrandBit+1,P->nSA+nInd); SAo.allocateArray(); ifstream oldSAin("./DirTrue/SA"); oldSAin.read(SAo.charArray,SAo.lengthByte); oldSAin.close(); */ uint isa1=0, isa2=0; for (uint isa=0;isa<SA.length;isa++) { while (isa==indArray[isa1*2]) {//insert new index before the existing index uint ind1=indArray[isa1*2+1]; if (ind1<nG1) { ind1+=nG; } else {//reverse strand ind1=(ind1-nG1+nG2) | N2bit; }; SA1.writePacked(isa2,ind1); /*testing if (SA1[isa2]!=SAo[isa2]) { cout <<isa2 <<" "<< SA1[isa2]<<" "<<SAo[isa2]<<endl; //sleep(100); }; */ ++isa2; 
++isa1; }; SA1.writePacked(isa2,SA[isa]); //TODO make sure that the first sj index is not before the first array index /*testing if (SA1[isa2]!=SAo[isa2]) { cout <<isa2 <<" "<< SA1[isa2]<<" "<<SAo[isa2]<<endl; //sleep(100); }; */ ++isa2; }; for (;isa1<nInd;isa1++) {//insert the last indices uint ind1=indArray[isa1*2+1]; if (ind1<nG1) { ind1+=nG; } else {//reverse strand ind1=(ind1-nG1+nG2) | N2bit; }; SA1.writePacked(isa2,ind1); ++isa2; }; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished inserting SA indices" <<endl; // //SAi insertions // for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { // uint iSeq=0; // uint ind0=P->genomeSAindexStart[iL]-1;//last index that was present in the old genome // for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longest index // if (ii==798466) // cout <<ii; // // uint iSA1=SAi[ii]; // uint iSA2=iSA1 & P->SAiMarkNmask & P->SAiMarkAbsentMask; // // if ( iSeq<nInd && (iSA1 & P->SAiMarkAbsentMaskC)>0 ) // {//index missing from the old genome // uint iSeq1=iSeq; // int64 ind1=funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL); // while (ind1 < (int64)(ii-P->genomeSAindexStart[iL]) && indArray[2*iSeq]<iSA2) { // ++iSeq; // ind1=funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL); // }; // if (ind1 == (int64)(ii-P->genomeSAindexStart[iL]) ) { // SAi.writePacked(ii,indArray[2*iSeq]+iSeq+1); // for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value // SAi.writePacked(ii0,(indArray[2*iSeq]+iSeq+1) | P->SAiMarkAbsentMaskC); // }; // ++iSeq; // ind0=ii; // } else { // iSeq=iSeq1; // }; // } else // {//index was present in the old genome // while (iSeq<nInd && indArray[2*iSeq]+1<iSA2) {//for this index insert "smaller" junctions // ++iSeq; // }; // // while (iSeq<nInd && indArray[2*iSeq]+1==iSA2) {//special case, the index falls right behind SAi // if (funCalcSAi(seq1[0]+indArray[2*iSeq+1],iL) >= (int64) (ii-P->genomeSAindexStart[iL]) ) {//this belongs to 
the next index // break; // }; // ++iSeq; // }; // // SAi.writePacked(ii,iSA1+iSeq); // // for (uint ii0=ind0+1; ii0<ii; ii0++) {//fill all the absent indices with this value // SAi.writePacked(ii0,(iSA2+iSeq) | P->SAiMarkAbsentMaskC); // }; // ind0=ii; // }; // }; // // }; // // time ( &rawtime ); cout << timeMonthDayTime(rawtime) << "SAi first" <<endl; // // for (uint isj=0;isj<nInd;isj++) { // int64 ind1=0; // for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { // uint g=(uint) seq1[0][indArray[2*isj+1]+iL]; // ind1 <<= 2; // if (g>3) {//this iSA contains N, need to mark the previous // for (uint iL1=iL; iL1 < P->genomeSAindexNbases; iL1++) { // ind1+=3; // int64 ind2=P->genomeSAindexStart[iL1]+ind1; // for (; ind2>=0; ind2--) {//find previous index that is not absent // if ( (SAi[ind2] & P->SAiMarkAbsentMaskC)==0 ) { // break; // }; // }; // SAi.writePacked(ind2,SAi[ind2] | P->SAiMarkNmaskC); // ind1 <<= 2; // }; // break; // } else { // ind1 += g; // }; // }; // }; // time ( &rawtime ); // P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SAi" <<endl; // // /* testing // PackedArray SAio=SAi; // SAio.allocateArray(); // ifstream oldSAiin("./DirTrue/SAindex"); // oldSAiin.read(SAio.charArray,8*(P->genomeSAindexNbases+2));//skip first bytes // oldSAiin.read(SAio.charArray,SAio.lengthByte); // oldSAiin.close(); // // for (uint iL=0; iL < P->genomeSAindexNbases; iL++) { // for (uint ii=P->genomeSAindexStart[iL];ii<P->genomeSAindexStart[iL+1]; ii++) {//scan through the longets index // if ( SAio[ii]!=SAi[ii] ) { // cout <<iL<<" "<<ii<<" "<<SAio[ii]<<" "<<SAi[ii]<<endl; // }; // }; // }; // */ //change parameters, most parameters are already re-defined in sjdbPrepare.cpp SA.defineBits(P->GstrandBit+1,SA.length+nInd);//same as SA2 SA.pointArray(SA1.charArray); P->nSA=SA.length; P->nSAbyte=SA.lengthByte; //generate SAi genomeSAindex(G,SA,P,SAi); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished SAi" <<endl; // 
P->sjGstart=P->chrStart[P->nChrReal]; // memcpy(G+P->chrStart[P->nChrReal],seq1[0], nseq1[0]); return nInd; };