void Genome::insertSequences() { if (P->genomeFastaFiles.at(0)!="-") { time_t rawtime; time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... Inserting extra sequences into genome indexes" <<endl; //move the junctions to free up space for seqs // chrStart/Name/Length nChrReal include the extra sequences // nGenome is the old, small genome size uint sjdblen=P->nGenome-(P->chrStart.back()-P->genomeInsertL);//length of sjdb sequences memmove(G+P->chrStart.back(),G+P->chrStart.back()-P->genomeInsertL,sjdblen); memset(G+P->chrStart.back()-P->genomeInsertL, GENOME_spacingChar, P->genomeInsertL);//fill empty space with spacing characters genomeScanFastaFiles(P, G+P->chrStart.back()-P->genomeInsertL, true); //read the seqs from file(s) into the free space uint64 nGenomeOld=P->nGenome; P->nGenome=P->chrStart.back()+sjdblen; //insert new sequences into the SA insertSeqSA(SA, SAinsert, SAi, G, G+P->chrStart.back()-P->genomeInsertL, nGenomeOld-sjdblen, P->genomeInsertL, sjdblen, P); //insert new sequences into the SAi //update P //save the genome if necessary }; };
void Genome::genomeLoad(){//allocate and load Genome time_t rawtime; time ( &rawtime ); *(P->inOut->logStdOut) << timeMonthDayTime(rawtime) << " ..... Loading genome\n" <<flush; uint *shmNG=NULL, *shmNSA=NULL; //pointers to shm stored values , *shmSG, *shmSSA uint64 shmSize=0;//, shmStartG=0; shmStartSA=0; uint L=200,K=6; Parameters *P1 = new Parameters; ifstream parFile((P->genomeDir+("/genomeParameters.txt")).c_str()); if (parFile.good()) { P->inOut->logMain << "Reading genome generation parameters:\n"; P1->inOut = P->inOut; P1->scanAllLines(parFile,3,-1); parFile.close(); } else { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: could not open genome file "<< P->genomeDir+("/genomeParameters.txt") << endl; errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //check genome version if (P1->versionGenome.size()==0 || P1->versionGenome[0]==0) {// ostringstream errOut; errOut << "EXITING because of FATAL ERROR: read no value for the versionGenome parameter from genomeParameters.txt file\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); } else if (P->sjdbFileChrStartEnd.at(0)=="-" && P1->versionGenome.at(0) >= P->versionGenome.at(0)) {// P->inOut->logMain << "Genome version is compatible with current STAR version\n"; } else if (P->sjdbFileChrStartEnd.at(0)!="-" && P1->versionGenome.at(0) >= P->versionGenome.at(1)) {// P->inOut->logMain << "Genome version is compatible with current STAR version\n"; } else { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: Genome version is INCOMPATIBLE with current STAR version\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //find chr starts from files P->chrInfoLoad(); //check if sjdbInfo.txt exists => genome was generated with junctions bool sjdbInfoExists=false; struct stat sjdb1; if ( stat( (P->genomeDir+"/sjdbInfo.txt").c_str(), &sjdb1) == 0 ) {//file exists sjdbInfoExists=true; }; if ( P->sjdbInsert.yes && sjdbInfoExists && P1->sjdbInsert.save=="") {//if sjdbInsert, and genome had junctions, and genome is old - it should be re-generated with new STAR ostringstream errOut; errOut << "EXITING because of FATAL ERROR: old Genome is INCOMPATIBLE with on the fly junction insertion\n"; errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; //record required genome parameters in P P->genomeSAindexNbases=P1->genomeSAindexNbases; P->genomeChrBinNbits=P1->genomeChrBinNbits; P->genomeSAsparseD=P1->genomeSAsparseD; if (P->parArray.at(P->sjdbOverhang_par)->inputLevel==0 && P1->sjdbOverhang>0) {//if --sjdbOverhang was not defined by user and it was defined >0 at the genome generation step, then use sjdbOverhang from the genome generation step P->sjdbOverhang=P1->sjdbOverhang; P->inOut->logMain << "--sjdbOverhang = " << P->sjdbOverhang << " taken from the generated genome\n"; } else if (sjdbInfoExists && P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang!=P1->sjdbOverhang) {//if sjdbOverhang was defined at the genome generation step,the mapping step value has to agree with it ostringstream errOut; errOut << "EXITING because of fatal PARAMETERS error: present --sjdbOverhang="<<P->sjdbOverhang << " is not equal to the value at the genome generation step ="<< P1->sjdbOverhang << "\n"; errOut << "SOLUTION: \n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); }; P->sjdbLength = P->sjdbOverhang==0 ? 0 : P->sjdbOverhang*2+1; P->inOut->logMain << "Started loading the genome: " << asctime (localtime ( &rawtime ))<<"\n"<<flush; ifstream GenomeIn, SAin, SAiIn; P->nGenome = OpenStream("Genome",GenomeIn); P->nSAbyte = OpenStream("SA",SAin); OpenStream("/SAindex",SAiIn); uint SAiInBytes=0; SAiInBytes += fstreamReadBig(SAiIn,(char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases)); P->genomeSAindexStart = new uint[P->genomeSAindexNbases+1]; SAiInBytes += fstreamReadBig(SAiIn,(char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1)); P->nSAi=P->genomeSAindexStart[P->genomeSAindexNbases]; P->inOut->logMain << "Read from SAindex: genomeSAindexNbases=" << P->genomeSAindexNbases <<" nSAi="<< P->nSAi <<endl; /////////////////////////////////// at this point all array sizes should be known: calculate packed array lengths P->GstrandBit = (uint) floor(log(P->nGenome)/log(2))+1; if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA P->GstrandMask = ~(1LLU<<P->GstrandBit); P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1); SA.defineBits(P->GstrandBit+1,P->nSA); P->SAiMarkNbit=P->GstrandBit+1; P->SAiMarkAbsentBit=P->GstrandBit+2; P->SAiMarkNmaskC=1LLU << P->SAiMarkNbit; P->SAiMarkNmask=~P->SAiMarkNmaskC; P->SAiMarkAbsentMaskC=1LLU << P->SAiMarkAbsentBit; P->SAiMarkAbsentMask=~P->SAiMarkAbsentMaskC; SAi.defineBits(P->GstrandBit+3,P->nSAi); P->inOut->logMain << "nGenome=" << P->nGenome << "; nSAbyte=" << P->nSAbyte <<endl<< flush; P->inOut->logMain <<"GstrandBit="<<int(P->GstrandBit)<<" SA number of indices="<<P->nSA<<endl<<flush; shmSize=SA.lengthByte + P->nGenome+L+L+SHM_startG+8; shmSize+= SAi.lengthByte; if (P->annotScoreScale>0) shmSize+=P->nGenome; if ((P->genomeLoad=="LoadAndKeep" || P->genomeLoad=="LoadAndRemove" || P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") && sharedMemory == NULL) { bool unloadLast = P->genomeLoad=="LoadAndRemove"; try { sharedMemory = new SharedMemory(shmKey, unloadLast); sharedMemory->SetErrorStream(P->inOut->logStdOut); if (!sharedMemory->NeedsAllocation()) P->inOut->logMain <<"Found genome in shared memory\n"<<flush; if (P->genomeLoad=="Remove") {//kill the genome and exit if (sharedMemory->NeedsAllocation()) {//did not find genome in shared memory, nothing to kill ostringstream errOut; errOut << "EXITING: Did not find the genome in memory, did not remove any genomes from shared memory\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_FILES, *P); } else { sharedMemory->Clean(); P->inOut->logMain <<"DONE: removed the genome from shared memory\n"<<flush; return; }; } if (sharedMemory->NeedsAllocation()){ P->inOut->logMain <<"Allocating shared memory for genome\n"<<flush; sharedMemory->Allocate(shmSize); } } catch (const SharedMemoryException & exc) { HandleSharedMemoryException(exc, shmSize); } shmStart = (char*) sharedMemory->GetMapped(); shmNG= (uint*) (shmStart+SHM_sizeG); shmNSA= (uint*) (shmStart+SHM_sizeSA); if (!sharedMemory->IsAllocator()) { // genome is in shared memory or being loaded // wait for the process that will populate it // and record the sizes uint iwait=0; while (*shmNG != P->nGenome) { iwait++; P->inOut->logMain <<"Another job is still loading the genome, sleeping for 1 min\n" <<flush; sleep(60); if (iwait==100) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: waited too long for the other job to finish loading the genome" << strerror(errno) << "\n" <<flush; errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_GENOME_LOADING_WAITED_TOO_LONG, *P); }; }; if (P->nSAbyte!=*shmNSA) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: the SA file size did not match what we found in shared memory" << "\n" << flush; errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" << flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INCONSISTENT_DATA, *P); } P->inOut->logMain << "Using shared memory for genome. key=0x" <<hex<<shmKey<<dec<< "; shmid="<< sharedMemory->GetId() <<endl<<flush; } G1=shmStart+SHM_startG; SA.pointArray(G1+P->nGenome+L+L); char* shmNext=SA.charArray+P->nSAbyte; SAi.pointArray(shmNext); shmNext += SAi.lengthByte; // if (twoPass.pass1readsN==0) {//not 2-pass // shmStartG=SHM_startSHM; // shmStartSA=0; // } else {//2-pass // ostringstream errOut; // errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory' "\n" ; // errOut << "SOLUTION: check shared memory settigns as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush; // exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P); // }; if (P->annotScoreScale>0) {//optional allocation sigG = shmNext; shmNext += P->nGenome; } } else if (P->genomeLoad=="NoSharedMemory") // simply allocate memory, do not use shared memory { P->genomeInsertL=0; if (P->genomeFastaFiles.at(0)!="-") {//will insert sequences in the genome, now estimate the extra size uint oldlen=P->chrStart.back();//record the old length P->genomeInsertL=genomeScanFastaFiles(P,G,false)-oldlen; }; try { if (P->sjdbInsert.pass1 || P->sjdbInsert.pass2) {//reserve extra memory for insertion at the 1st and/or 2nd step nGenomePass1=P->nGenome+P->genomeInsertL; nSApass1=P->nSA+2*P->genomeInsertL; if (P->sjdbInsert.pass1) { nGenomePass1+=P->limitSjdbInsertNsj*P->sjdbLength; nSApass1+=2*P->limitSjdbInsertNsj*P->sjdbLength; }; nGenomePass2=nGenomePass1; nSApass2=nSApass1; if (P->sjdbInsert.pass2) { nGenomePass2+=P->limitSjdbInsertNsj*P->sjdbLength; nSApass2+=2*P->limitSjdbInsertNsj*P->sjdbLength; }; G1=new char[nGenomePass2+L+L]; SApass2.defineBits(P->GstrandBit+1,nSApass2); SApass2.allocateArray(); SApass1.defineBits(P->GstrandBit+1,nSApass1); SApass1.pointArray(SApass2.charArray+SApass2.lengthByte-SApass1.lengthByte); SA.pointArray(SApass1.charArray+SApass1.lengthByte-SA.lengthByte); } else {//no sjdb insertions if (P->genomeInsertL==0) {// no sequence insertion, simple allocation G1=new char[P->nGenome+L+L]; SA.allocateArray(); } else { G1=new char[P->nGenome+L+L+P->genomeInsertL]; SApass1.defineBits(P->GstrandBit+1,P->nSA+2*P->genomeInsertL);//TODO: re-define GstrandBit if necessary SApass1.allocateArray(); SA.pointArray(SApass1.charArray+SApass1.lengthByte-SA.lengthByte); }; }; SAi.allocateArray(); P->inOut->logMain <<"Shared memory is not used for genomes. Allocated a private copy of the genome.\n"<<flush; } catch (exception & exc) { ostringstream errOut; errOut <<"EXITING: fatal error trying to allocate genome arrays, exception thrown: "<<exc.what()<<endl; errOut <<"Possible cause 1: not enough RAM. Check if you have enough RAM " << P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000 << " bytes\n"; errOut <<"Possible cause 2: not enough virtual memory allowed with ulimit. SOLUTION: run ulimit -v " << P->nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000<<endl <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_MEMORY_ALLOCATION, *P); }; } // if (twopass1readsN==0) {//not 2-pass // shmStartG=SHM_startSHM; // shmStartSA=0; // } else {//2-pass // ostringstream errOut; // errOut << "EXITING because of FATAL ERROR: 2-pass procedure cannot be used with genome already loaded im memory' "\n" ; // errOut << "SOLUTION: check shared memory settings as explained in STAR manual, OR run STAR with --genomeLoad NoSharedMemory to avoid using shared memory\n" <<flush; // exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_SHM, *P); // }; G=G1+L; bool isAllocatorProcess = sharedMemory != NULL && sharedMemory->IsAllocator(); if (P->genomeLoad=="NoSharedMemory" || isAllocatorProcess) {//load genome and SAs from files //load genome P->inOut->logMain <<"Genome file size: "<<P->nGenome <<" bytes; state: good=" <<GenomeIn.good()\ <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"\n"<<flush; P->inOut->logMain <<"Loading Genome ... " << flush; uint genomeReadBytesN=fstreamReadBig(GenomeIn,G,P->nGenome); P->inOut->logMain <<"done! state: good=" <<GenomeIn.good()\ <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush; GenomeIn.close(); for (uint ii=0;ii<L;ii++) {// attach a tail with the largest symbol G1[ii]=K-1; G[P->nGenome+ii]=K-1; }; //load SAs P->inOut->logMain <<"SA file size: "<<SA.lengthByte <<" bytes; state: good=" <<SAin.good()\ <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"\n"<<flush; P->inOut->logMain <<"Loading SA ... " << flush; genomeReadBytesN=fstreamReadBig(SAin,SA.charArray, SA.lengthByte); P->inOut->logMain <<"done! state: good=" <<SAin.good()\ <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush; SAin.close(); P->inOut->logMain <<"Loading SAindex ... " << flush; SAiInBytes +=fstreamReadBig(SAiIn,SAi.charArray, SAi.lengthByte); P->inOut->logMain <<"done: "<<SAiInBytes<<" bytes\n" << flush; }; SAiIn.close(); if ((P->genomeLoad=="LoadAndKeep" || P->genomeLoad=="LoadAndRemove" || P->genomeLoad=="LoadAndExit") && isAllocatorProcess ) { //record sizes. This marks the end of genome loading *shmNG=P->nGenome; *shmNSA=P->nSAbyte; }; time ( &rawtime ); P->inOut->logMain << "Finished loading the genome: " << asctime (localtime ( &rawtime )) <<"\n"<<flush; #ifdef COMPILE_FOR_MAC { uint sum1=0; for (uint ii=0;ii<P->nGenome; ii++) sum1 += (uint) (unsigned char) G[ii]; P->inOut->logMain << "Sum of all Genome bytes: " <<sum1 <<"\n"<<flush; sum1=0; for (uint ii=0;ii<SA.lengthByte; ii++) sum1 += (uint) (unsigned char) SA.charArray[ii]; P->inOut->logMain << "Sum of all SA bytes: " <<sum1 <<"\n"<<flush; sum1=0; for (uint ii=0;ii<SAi.lengthByte; ii++) sum1 += (uint) (unsigned char) SAi.charArray[ii]; P->inOut->logMain << "Sum of all SAi bytes: " <<sum1 <<"\n"<<flush; }; #endif if (P->genomeLoad=="LoadAndExit") { uint shmSum=0; for (uint ii=0;ii<shmSize;ii++) shmSum+=shmStart[ii]; P->inOut->logMain << "genomeLoad=LoadAndExit: completed, the genome is loaded and kept in RAM, EXITING now.\n"<<flush; return; }; insertSequences(); P->chrBinFill(); //splice junctions database if (P->nGenome==P->chrStart[P->nChrReal]) {//no sjdb P->sjdbN=0; P->sjGstart=P->chrStart[P->nChrReal]+1; //not sure why I need that } else {//there are sjdb chromosomes ifstream sjdbInfo((P->genomeDir+"/sjdbInfo.txt").c_str()); if (sjdbInfo.fail()) { ostringstream errOut; errOut << "EXITING because of FATAL error, could not open file " << (P->genomeDir+"/sjdbInfo.txt") <<"\n"; errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; sjdbInfo >> P->sjdbN >> P->sjdbOverhang; P->inOut->logMain << "Processing splice junctions database sjdbN=" <<P->sjdbN<<", sjdbOverhang=" <<P->sjdbOverhang <<" \n"; P->sjChrStart=P->nChrReal; P->sjGstart=P->chrStart[P->sjChrStart]; //fill the sj-db to genome translation array P->sjDstart=new uint [P->sjdbN]; P->sjAstart=new uint [P->sjdbN]; P->sjdbStart=new uint [P->sjdbN]; P->sjdbEnd=new uint [P->sjdbN]; P->sjdbMotif=new uint8 [P->sjdbN]; P->sjdbShiftLeft=new uint8 [P->sjdbN]; P->sjdbShiftRight=new uint8 [P->sjdbN]; P->sjdbStrand=new uint8 [P->sjdbN]; for (uint ii=0;ii<P->sjdbN;ii++) {//get the info about junctions from sjdbInfo.txt { uint16 d1,d2,d3,d4; sjdbInfo >> P->sjdbStart[ii] >> P->sjdbEnd[ii] >> d1 >> d2 >> d3 >> d4; P->sjdbMotif[ii] = (uint8) d1; P->sjdbShiftLeft[ii] = (uint8) d2; P->sjdbShiftRight[ii] = (uint8) d3; P->sjdbStrand[ii] = (uint8) d4; }; P->sjDstart[ii] = P->sjdbStart[ii] - P->sjdbOverhang; P->sjAstart[ii] = P->sjdbEnd[ii] + 1; if (P->sjdbMotif[ii]==0) {//shinon-canonical junctions back to their true coordinates P->sjDstart[ii] += P->sjdbShiftLeft[ii]; P->sjAstart[ii] += P->sjdbShiftLeft[ii]; }; }; }; //check and redefine some parameters //max intron size if (P->alignIntronMax==0 && P->alignMatesGapMax==0) { P->inOut->logMain << "alignIntronMax=alignMatesGapMax=0, the max intron size will be approximately determined by (2^winBinNbits)*winAnchorDistNbins=" \ << (1LLU<<P->winBinNbits)*P->winAnchorDistNbins <<endl; } else { //redefine winBinNbits P->winBinNbits=max( (uint) floor(log2(P->nGenome/40000)+0.5), (uint) floor(log2(max(max(4LLU,P->alignIntronMax),P->alignMatesGapMax)/4)+0.5) ); P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" redefined winBinNbits="<< P->winBinNbits <<endl; }; if (P->winBinNbits > P->genomeChrBinNbits) { P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<" > " << "genomeChrBinNbits=" << P->genomeChrBinNbits << " redefining:\n"; P->winBinNbits=P->genomeChrBinNbits; P->inOut->logMain << "winBinNbits=" <<P->winBinNbits <<endl; }; if (P->alignIntronMax==0 && P->alignMatesGapMax==0) { } else { //redefine winFlankNbins,winAnchorDistNbins P->winFlankNbins=max(P->alignIntronMax,P->alignMatesGapMax)/(1LLU<<P->winBinNbits)+1; P->winAnchorDistNbins=2*P->winFlankNbins; P->inOut->logMain << "To accomodate alignIntronMax="<<P->alignIntronMax<<" and alignMatesGapMax="<<P->alignMatesGapMax<<\ ", redefined winFlankNbins="<<P->winFlankNbins<<" and winAnchorDistNbins="<<P->winAnchorDistNbins<<endl; }; P->winBinChrNbits=P->genomeChrBinNbits-P->winBinNbits; P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;//this may be chenaged later };
void genomeGenerate(Parameters *P) { //check parameters if (P->sjdbOverhang<=0 && (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "you need to specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); } if (P->sjdbFileChrStartEnd.at(0)=="-" && P->sjdbGTFfile=="-") { if (P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang>0) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "do not specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; P->sjdbOverhang=0; }; //time time_t rawTime; string timeString; time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush; //define some parameters from input parameters P->genomeChrBinNbases=1LLU << P->genomeChrBinNbits; //write genome parameters file genomeParametersWrite(P->genomeDir+("/genomeParameters.txt"), P, "ERROR_00102"); char *G=NULL, *G1=NULL; uint nGenomeReal=genomeScanFastaFiles(P,G,false);//first scan the fasta file to find all the sizes P->chrBinFill(); uint L=10000;//maximum length of genome suffix uint nG1alloc=(nGenomeReal + L)*2; G1=new char[nG1alloc]; G=G1+L; memset(G1,GENOME_spacingChar,nG1alloc);//initialize to K-1 all bytes genomeScanFastaFiles(P,G,true); //load the genome sequence uint N = nGenomeReal; P->nGenome=N; uint N2 = N*2; ofstream chrN,chrS,chrL,chrNL; ofstrOpen(P->genomeDir+"/chrName.txt","ERROR_00103", P, chrN); ofstrOpen(P->genomeDir+"/chrStart.txt","ERROR_00103", P, chrS); ofstrOpen(P->genomeDir+"/chrLength.txt","ERROR_00103", P, chrL); ofstrOpen(P->genomeDir+"/chrNameLength.txt","ERROR_00103", P, chrNL); for (uint ii=0;ii<P->nChrReal;ii++) {//output names, starts, lengths chrN<<P->chrName[ii]<<"\n"; chrS<<P->chrStart[ii]<<"\n"; chrL<<P->chrLength.at(ii)<<"\n"; chrNL<<P->chrName[ii]<<"\t"<<P->chrLength.at(ii)<<"\n"; }; chrS<<P->chrStart[P->nChrReal]<<"\n";//size of the genome chrN.close();chrL.close();chrS.close(); chrNL.close(); if (P->limitGenomeGenerateRAM < (nG1alloc+nG1alloc/3)) {//allocate nG1alloc/3 for SA generation ostringstream errOut; errOut <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM="<< (P->limitGenomeGenerateRAM) <<"is too small for your genome\n"; errOut <<"SOLUTION: please specify limitGenomeGenerateRAM not less than"<< nG1alloc+nG1alloc/3 <<" and make that much RAM available \n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //preparing to generate SA for (uint ii=0;ii<N;ii++) {//- strand G[N2-1-ii]=G[ii]<4 ? 3-G[ii] : G[ii]; }; P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { P->nSA++; }; }; P->GstrandBit = (uint) floor(log(N)/log(2))+1; if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA P->GstrandMask = ~(1LLU<<P->GstrandBit); PackedArray SA1;//SA without sjdb SA1.defineBits(P->GstrandBit+1,P->nSA); PackedArray SA2;//SA with sjdb, reserve more space if (P->sjdbInsert.yes) {//reserve space for junction insertion SA2.defineBits(P->GstrandBit+1,P->nSA+2*P->limitSjdbInsertNsj*P->sjdbLength);//TODO: this allocation is wasteful, get a better estimate of the number of junctions } else {//same as SA1 SA2.defineBits(P->GstrandBit+1,P->nSA); }; P->nSAbyte=SA2.lengthByte; P->inOut->logMain << "Number of SA indices: "<< P->nSA << "\n"<<flush; //sort SA time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <<flush; // if (false) {//sort SA chunks for (uint ii=0;ii<N;ii++) {//re-fill the array backwards for sorting swap(G[N2-1-ii],G[ii]); }; globalG=G; globalL=L/sizeof(uint); //count the number of indices with 4nt prefix uint indPrefN=1LLU << 16; uint* indPrefCount = new uint [indPrefN]; memset(indPrefCount,0,indPrefN*sizeof(indPrefCount[0])); P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; indPrefCount[p1]++; P->nSA++; }; }; uint saChunkSize=(P->limitGenomeGenerateRAM-nG1alloc)/8/P->runThreadN; //number of SA indexes per chunk saChunkSize=saChunkSize*6/10; //allow extra space for qsort //uint saChunkN=((P->nSA/saChunkSize+1)/P->runThreadN+1)*P->runThreadN;//ensure saChunkN is divisible by P->runThreadN //saChunkSize=P->nSA/saChunkN+100000;//final chunk size if (P->runThreadN>1) saChunkSize=min(saChunkSize,P->nSA/(P->runThreadN-1)); uint saChunkN=P->nSA/saChunkSize;//estimate uint* indPrefStart = new uint [saChunkN*2]; //start and stop, *2 just in case uint* indPrefChunkCount = new uint [saChunkN*2]; indPrefStart[0]=0; saChunkN=0;//start counting chunks uint chunkSize1=indPrefCount[0]; for (uint ii=1; ii<indPrefN; ii++) { chunkSize1 += indPrefCount[ii]; if (chunkSize1 > saChunkSize) { saChunkN++; indPrefStart[saChunkN]=ii; indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii]; chunkSize1=indPrefCount[ii]; }; }; saChunkN++; indPrefStart[saChunkN]=indPrefN+1; indPrefChunkCount[saChunkN-1]=chunkSize1; P->inOut->logMain << "Number of chunks: " << saChunkN <<"; chunks size limit: " << saChunkSize*8 <<" bytes\n" <<flush; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush; #pragma omp parallel for num_threads(P->runThreadN) ordered schedule(dynamic,1) for (int iChunk=0; iChunk < (int) saChunkN; iChunk++) {//start the chunk cycle: sort each chunk with qsort and write to a file uint* saChunk=new uint [indPrefChunkCount[iChunk]];//allocate local array for each chunk for (uint ii=0,jj=0;ii<N2;ii+=P->genomeSAsparseD) {//fill the chunk with SA indices if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; if (p1>=indPrefStart[iChunk] && p1<indPrefStart[iChunk+1]) { saChunk[jj]=ii; jj++; }; //TODO: if (jj==indPrefChunkCount[iChunk]) break; }; }; //sort the chunk qsort(saChunk,indPrefChunkCount[iChunk],sizeof(saChunk[0]),funCompareSuffixes); for (uint ii=0;ii<indPrefChunkCount[iChunk];ii++) { saChunk[ii]=N2-1-saChunk[ii]; }; //write files ofstream saChunkFile; string chunkFileName=P->genomeDir+"/SA_"+to_string( (uint) iChunk); ofstrOpen(chunkFileName,"ERROR_00105", P, saChunkFile); fstreamWriteBig(saChunkFile, (char*) saChunk, sizeof(saChunk[0])*indPrefChunkCount[iChunk],chunkFileName,"ERROR_00121",P); saChunkFile.close(); delete [] saChunk; saChunk=NULL; }; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush; //read chunks and pack into full SA1 SA2.allocateArray(); SA1.pointArray(SA2.charArray + SA2.lengthByte-SA1.lengthByte); //SA1 is shifted to have space for junction insertion uint N2bit= 1LLU << P->GstrandBit; uint packedInd=0; #define SA_CHUNK_BLOCK_SIZE 10000000 uint* saIn=new uint[SA_CHUNK_BLOCK_SIZE]; //TODO make adjustable #ifdef genenomeGenerate_SA_textOutput ofstream SAtxtStream ((P->genomeDir + "/SAtxt").c_str()); #endif for (uint iChunk=0;iChunk<saChunkN;iChunk++) {//load files one by one and convert to packed ostringstream saChunkFileNameStream(""); saChunkFileNameStream<< P->genomeDir << "/SA_" << iChunk; ifstream saChunkFile(saChunkFileNameStream.str().c_str()); while (! saChunkFile.eof()) {//read blocks from each file uint chunkBytesN=fstreamReadBig(saChunkFile,(char*) saIn,SA_CHUNK_BLOCK_SIZE*sizeof(saIn[0])); for (uint ii=0;ii<chunkBytesN/sizeof(saIn[0]);ii++) { SA1.writePacked( packedInd+ii, (saIn[ii]<N) ? saIn[ii] : ( (saIn[ii]-N) | N2bit ) ); #ifdef genenomeGenerate_SA_textOutput SAtxtStream << saIn[ii] << "\n"; #endif }; packedInd += chunkBytesN/sizeof(saIn[0]); }; saChunkFile.close(); remove(saChunkFileNameStream.str().c_str());//remove the chunk file }; #ifdef genenomeGenerate_SA_textOutput SAtxtStream.close(); #endif delete [] saIn; if (packedInd != P->nSA ) {// ostringstream errOut; errOut << "EXITING because of FATAL problem while generating the suffix array\n"; errOut << "The number of indices read from chunks = "<<packedInd<<" is not equal to expected nSA="<<P->nSA<<"\n"; errOut << "SOLUTION: try to re-run suffix array generation, if it still does not work, report this problem to the author\n"<<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //DONE with suffix array generation for (uint ii=0;ii<N;ii++) {//return to normal order for future use swap(G[N2-1-ii],G[ii]); }; delete [] indPrefCount; delete [] indPrefStart; delete [] indPrefChunkCount; }; time ( &rawTime ); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; //////////////////////////////////////// // SA index // // PackedArray SAold; // // if (true) // {//testing: load SA from disk // //read chunks and pack into full SA1 // // ifstream oldSAin("./DirTrue/SA"); // oldSAin.seekg (0, ios::end); // P->nSAbyte=(uint) oldSAin.tellg(); // oldSAin.clear(); // oldSAin.seekg (0, ios::beg); // // P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1); // SAold.defineBits(P->GstrandBit+1,P->nSA); // SAold.allocateArray(); // // oldSAin.read(SAold.charArray,SAold.lengthByte); // oldSAin.close(); // // SA1=SAold; // SA2=SAold; // }; PackedArray SAip; genomeSAindex(G,SA1,P,SAip); if (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-") {//insert junctions SjdbClass sjdbLoci; Genome mainGenome(P); mainGenome.G=G; mainGenome.SA=SA1; mainGenome.SApass1=SA2; mainGenome.SAi=SAip; P->sjdbInsert.outDir=P->genomeDir; P->sjdbN=0;//no junctions are loaded yet P->twoPass.pass2=false; Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare. SA2.writePacked(P->nSA,0); }; //write genome to disk time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush; ofstream genomeOut; ofstrOpen(P->genomeDir+"/Genome","ERROR_00104", P, genomeOut); fstreamWriteBig(genomeOut,G,P->nGenome,P->genomeDir+"/Genome","ERROR_00120",P); genomeOut.close(); //write SA time ( &rawTime ); P->inOut->logMain << "SA size in bytes: "<< P->nSAbyte << "\n"<<flush; P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; ofstream SAout; ofstrOpen(P->genomeDir+"/SA","ERROR_00106", P, SAout); fstreamWriteBig(SAout,(char*) SA2.charArray, (streamsize) P->nSAbyte,P->genomeDir+"/SA","ERROR_00122",P); SAout.close(); //write SAi time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; //write SAi to disk ofstream SAiOut; ofstrOpen(P->genomeDir+"/SAindex","ERROR_00107", P, SAiOut); fstreamWriteBig(SAiOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->genomeDir+"/SAindex","ERROR_00123",P); fstreamWriteBig(SAiOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->genomeDir+"/SAindex","ERROR_00124",P); fstreamWriteBig(SAiOut, SAip.charArray, SAip.lengthByte,P->genomeDir+"/SAindex","ERROR_00125",P); SAiOut.close(); SA2.deallocateArray(); time(&rawTime); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; };