int main(int argInN, char* argIn[]) { time(&g_statsAll.timeStart); Parameters *P = new Parameters; //all parameters P->inputParameters(argInN, argIn); *(P->inOut->logStdOut) << timeMonthDayTime(g_statsAll.timeStart) << " ..... Started STAR run\n" <<flush; //generate genome if (P->runMode=="genomeGenerate") { genomeGenerate(P); (void) sysRemoveDir (P->outFileTmp); P->inOut->logMain << "DONE: Genome generation, EXITING\n" << flush; exit(0); } else if (P->runMode!="alignReads") { P->inOut->logMain << "EXITING because of INPUT ERROR: unknown value of input parameter runMode=" <<P->runMode<<endl<<flush; exit(1); }; Genome mainGenome (P); mainGenome.genomeLoad(); if (P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") { return 0; }; P->twoPass.pass2=false; //this is the 1st pass SjdbClass sjdbLoci; if (P->sjdbInsert.pass1) { Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); }; //calculate genome-related parameters Transcriptome *mainTranscriptome=NULL; /////////////////////////////////////////////////////////////////////////////////////////////////START if (P->runThreadN>1) { g_threadChunks.threadArray=new pthread_t[P->runThreadN]; pthread_mutex_init(&g_threadChunks.mutexInRead, NULL); pthread_mutex_init(&g_threadChunks.mutexOutSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutBAM1, NULL); pthread_mutex_init(&g_threadChunks.mutexOutUnmappedFastx, NULL); pthread_mutex_init(&g_threadChunks.mutexOutFilterBySJout, NULL); pthread_mutex_init(&g_threadChunks.mutexStats, NULL); pthread_mutex_init(&g_threadChunks.mutexBAMsortBins, NULL); }; g_statsAll.progressReportHeader(P->inOut->logProgress); if (P->twoPass.yes) {//2-pass //re-define P for the pass1 Parameters *P1=new Parameters; *P1=*P; //turn off unnecessary calculations P1->outSAMtype[0]="None"; P1->outSAMbool=false; P1->outBAMunsorted=false; P1->outBAMcoord=false; P1->chimSegmentMin=0; P1->quant.yes=false; P1->quant.trSAM.yes=false; P1->quant.geCount.yes=false; P1->outFilterBySJoutStage=0; P1->outReadsUnmapped="None"; P1->outFileNamePrefix=P->twoPass.dir; P1->readMapNumber=P->twoPass.pass1readsN; // P1->inOut->logMain.open((P1->outFileNamePrefix + "Log.out").c_str()); g_statsAll.resetN(); time(&g_statsAll.timeStartMap); P->inOut->logProgress << timeMonthDayTime(g_statsAll.timeStartMap) <<"\tStarted 1st pass mapping\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started 1st pass mapping\n" <<flush; //run mapping for Pass1 ReadAlignChunk *RAchunk1[P->runThreadN]; for (int ii=0;ii<P1->runThreadN;ii++) { RAchunk1[ii]=new ReadAlignChunk(P1, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P1, RAchunk1); outputSJ(RAchunk1,P1); //collapse and output junctions // for (int ii=0;ii<P1->runThreadN;ii++) { // delete [] RAchunk[ii]; // }; time_t rawtime; time (&rawtime); P->inOut->logProgress << timeMonthDayTime(rawtime) <<"\tFinished 1st pass mapping\n"; *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished 1st pass mapping\n" <<flush; ofstream logFinal1 ( (P->twoPass.dir + "/Log.final.out").c_str()); g_statsAll.reportFinal(logFinal1,P1); P->twoPass.pass2=true;//starting the 2nd pass P->twoPass.pass1sjFile=P->twoPass.dir+"/SJ.out.tab"; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //reopen reads files P->closeReadsFiles(); P->openReadsFiles(); } else {//not 2-pass //nothing for now }; if ( P->quant.yes ) {//load transcriptome mainTranscriptome=new Transcriptome(P); }; //initialize Stats g_statsAll.resetN(); time(&g_statsAll.timeStartMap); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started mapping\n" <<flush; g_statsAll.timeLastReport=g_statsAll.timeStartMap; //open SAM/BAM files for output if (P->outSAMmode != "None") {//open SAM file and write header ostringstream samHeaderStream; for (uint ii=0;ii<P->nChrReal;ii++) { samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n"; }; if (P->outSAMheaderPG.at(0)!="-") { samHeaderStream << P->outSAMheaderPG.at(0); for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) { samHeaderStream << "\t" << P->outSAMheaderPG.at(ii); }; samHeaderStream << "\n"; }; samHeaderStream << "@PG\tID:STAR\tPN:STAR\tVN:" << STAR_VERSION <<"\tCL:" << P->commandLineFull <<"\n"; if (P->outSAMheaderCommentFile!="-") { ifstream comstream (P->outSAMheaderCommentFile); while (comstream.good()) { string line1; getline(comstream,line1); if (line1.find_first_not_of(" \t\n\v\f\r")!=std::string::npos) {//skip blank lines samHeaderStream << line1 <<"\n"; }; }; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; samHeaderStream << "@CO\t" <<"user command line: " << P->commandLine <<"\n"; if (P->outSAMheaderHD.at(0)!="-") { P->samHeaderHD = P->outSAMheaderHD.at(0); for (uint ii=1;ii<P->outSAMheaderHD.size(); ii++) { P->samHeaderHD +="\t" + P->outSAMheaderHD.at(ii); }; } else { P->samHeaderHD = "@HD\tVN:1.4"; }; P->samHeader=P->samHeaderHD+"\n"+samHeaderStream.str(); //for the sorted BAM, need to add SO:cooridnate to the header line P->samHeaderSortedCoord=P->samHeaderHD + (P->outSAMheaderHD.size()==0 ? "" : "\tSO:coordinate") + "\n" + samHeaderStream.str(); if (P->outSAMbool) {// *P->inOut->outSAM << P->samHeader; }; if (P->outBAMunsorted){ outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength); }; // if (P->outBAMcoord){ // outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength); // }; if ( P->quant.trSAM.yes ) { samHeaderStream.str(""); vector <uint> trlength; for (uint32 ii=0;ii<mainTranscriptome->trID.size();ii++) { uint32 iex1=mainTranscriptome->trExI[ii]+mainTranscriptome->trExN[ii]-1; //last exon of the transcript trlength.push_back(mainTranscriptome->exLenCum[iex1]+mainTranscriptome->exSE[2*iex1+1]-mainTranscriptome->exSE[2*iex1]+1); samHeaderStream << "@SQ\tSN:"<< mainTranscriptome->trID.at(ii) <<"\tLN:"<<trlength.back()<<"\n"; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; outBAMwriteHeader(P->inOut->outQuantBAMfile,samHeaderStream.str(),mainTranscriptome->trID,trlength); }; }; if (P->chimSegmentMin>0) { P->inOut->outChimJunction.open((P->outFileNamePrefix + "Chimeric.out.junction").c_str()); P->inOut->outChimSAM.open((P->outFileNamePrefix + "Chimeric.out.sam").c_str()); P->inOut->outChimSAM << P->samHeader; pthread_mutex_init(&g_threadChunks.mutexOutChimSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutChimJunction, NULL); }; // P->inOut->logMain << "mlock value="<<mlockall(MCL_CURRENT|MCL_FUTURE) <<"\n"<<flush; // prepare chunks and spawn mapping threads ReadAlignChunk *RAchunk[P->runThreadN]; for (int ii=0;ii<P->runThreadN;ii++) { RAchunk[ii]=new ReadAlignChunk(P, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P, RAchunk); if (P->outFilterBySJoutStage==1) {//completed stage 1, go to stage 2 P->inOut->logMain << "Completed stage 1 mapping of outFilterBySJout mapping\n"<<flush; outputSJ(RAchunk,P);//collapse novel junctions P->readFilesIndex=-1; P->outFilterBySJoutStage=2; if (P->outBAMcoord) { for (int it=0; it<P->runThreadN; it++) {//prepare the unmapped bin RAchunk[it]->chunkOutBAMcoord->coordUnmappedPrepareBySJout(); }; }; mapThreadsSpawn(P, RAchunk); }; //close some BAM files if (P->inOut->outBAMfileUnsorted!=NULL) { bgzf_flush(P->inOut->outBAMfileUnsorted); bgzf_close(P->inOut->outBAMfileUnsorted); }; if (P->inOut->outQuantBAMfile!=NULL) { bgzf_flush(P->inOut->outQuantBAMfile); bgzf_close(P->inOut->outQuantBAMfile); }; if (P->outBAMcoord && P->limitBAMsortRAM==0) {//make it equal ot the genome size P->limitBAMsortRAM=P->nGenome+mainGenome.SA.lengthByte+mainGenome.SAi.lengthByte; }; //no need for genome anymore, free the memory mainGenome.freeMemory(); if ( P->quant.geCount.yes ) {//output gene quantifications for (int ichunk=1; ichunk<P->runThreadN; ichunk++) {//sum counts from all chunks into 0th chunk RAchunk[0]->chunkTr->quants->addQuants(*(RAchunk[ichunk]->chunkTr->quants)); }; RAchunk[0]->chunkTr->quantsOutput(); }; if (P->runThreadN>1 && P->outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files RAchunk[0]->chunkFilesCat(P->inOut->outSAM, P->outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN); }; if (P->outBAMcoord) {//sort BAM if needed *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush; uint32 nBins=P->outBAMcoordNbins; //check max size needed for sorting uint maxMem=0; for (uint32 ibin=0; ibin<nBins-1; ibin++) {//check akk bins uint binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]+24*RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; }; if (binS>maxMem) maxMem=binS; }; P->inOut->logMain << "Max memory needed for sorting = "<<maxMem<<endl; if (maxMem>P->limitBAMsortRAM) { ostringstream errOut; errOut <<"EXITING because of fatal ERROR: not enough memory for BAM sorting: \n"; errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<maxMem+1000000000; exitWithError(errOut.str(), std::cerr, P->inOut->logMain, EXIT_CODE_PARAMETER, *P); }; uint totalMem=0; // P->inOut->logMain << "Started sorting BAM ..." <<endl; #pragma omp parallel num_threads(P->outBAMsortingThreadNactual) #pragma omp for schedule (dynamic,1) for (uint32 ibin1=0; ibin1<nBins; ibin1++) { uint32 ibin=nBins-1-ibin1;//reverse order to start with the last bin - unmapped reads uint binN=0, binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binN += RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]; }; if (binS==0) continue; //empty bin if (ibin == nBins-1) {//last bin for unmapped reads BAMbinSortUnmapped(ibin,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); } else { uint newMem=binS+binN*24; bool boolWait=true; while (boolWait) { #pragma omp critical if (totalMem+newMem < P->limitBAMsortRAM) { boolWait=false; totalMem+=newMem; }; sleep(0.1); }; BAMbinSortByCoordinate(ibin,binN,binS,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); #pragma omp critical totalMem-=newMem;//"release" RAM }; }; //concatenate all BAM files, using bam_cat char **bamBinNames = new char* [nBins]; vector <string> bamBinNamesV; for (uint32 ibin=0; ibin<nBins; ibin++) { bamBinNamesV.push_back(P->outBAMsortTmpDir+"/b"+to_string((uint) ibin)); struct stat buffer; if (stat (bamBinNamesV.back().c_str(), &buffer) != 0) {//check if file exists bamBinNamesV.pop_back(); }; }; for (uint32 ibin=0; ibin<bamBinNamesV.size(); ibin++) { bamBinNames[ibin] = (char*) bamBinNamesV.at(ibin).c_str(); }; bam_cat(bamBinNamesV.size(), bamBinNames, 0, P->outBAMfileCoordName.c_str()); }; //wiggle output if (P->outWigFlags.yes) { *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; string wigOutFileNamePrefix=P->outFileNamePrefix + "Signal"; signalFromBAM(P->outBAMfileCoordName, wigOutFileNamePrefix, *P); }; //aggregate output junctions //collapse splice junctions from different threads/chunks, and output them outputSJ(RAchunk,P); g_statsAll.progressReport(P->inOut->logProgress); P->inOut->logProgress << "ALL DONE!\n"<<flush; P->inOut->logFinal.open((P->outFileNamePrefix + "Log.final.out").c_str()); g_statsAll.reportFinal(P->inOut->logFinal,P); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeFinish) << " ..... Finished successfully\n" <<flush; P->inOut->logMain << "ALL DONE!\n"<<flush; sysRemoveDir (P->outFileTmp); P->closeReadsFiles();//this will kill the readFilesCommand processes if necessary mainGenome.~Genome(); //need explicit call because of the 'delete P->inOut' below, which will destroy P->inOut->logStdOut delete P->inOut; //to close files delete P; return 0; };
void genomeGenerate(Parameters *P) { //check parameters if (P->sjdbOverhang<=0 && (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-")) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: for generating genome with annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "you need to specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation specifying non-zero --sjdbOverhang, which ideally should be equal to OneMateLength-1, or could be chosen generically as ~100\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); } if (P->sjdbFileChrStartEnd.at(0)=="-" && P->sjdbGTFfile=="-") { if (P->parArray.at(P->sjdbOverhang_par)->inputLevel>0 && P->sjdbOverhang>0) { ostringstream errOut; errOut << "EXITING because of FATAL INPUT PARAMETER ERROR: when generating genome without annotations (--sjdbFileChrStartEnd or --sjdbGTFfile options)\n"; errOut << "do not specify >0 --sjdbOverhang\n"; errOut << "SOLUTION: re-run genome generation without --sjdbOverhang option\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; P->sjdbOverhang=0; }; //time time_t rawTime; string timeString; time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... Starting to generate Genome files\n" <<flush; //define some parameters from input parameters P->genomeChrBinNbases=1LLU << P->genomeChrBinNbits; //write genome parameters file genomeParametersWrite(P->genomeDir+("/genomeParameters.txt"), P, "ERROR_00102"); char *G=NULL, *G1=NULL; uint nGenomeReal=genomeScanFastaFiles(P,G,false);//first scan the fasta file to find all the sizes P->chrBinFill(); uint L=10000;//maximum length of genome suffix uint nG1alloc=(nGenomeReal + L)*2; G1=new char[nG1alloc]; G=G1+L; memset(G1,GENOME_spacingChar,nG1alloc);//initialize to K-1 all bytes genomeScanFastaFiles(P,G,true); //load the genome sequence uint N = nGenomeReal; P->nGenome=N; uint N2 = N*2; ofstream chrN,chrS,chrL,chrNL; ofstrOpen(P->genomeDir+"/chrName.txt","ERROR_00103", P, chrN); ofstrOpen(P->genomeDir+"/chrStart.txt","ERROR_00103", P, chrS); ofstrOpen(P->genomeDir+"/chrLength.txt","ERROR_00103", P, chrL); ofstrOpen(P->genomeDir+"/chrNameLength.txt","ERROR_00103", P, chrNL); for (uint ii=0;ii<P->nChrReal;ii++) {//output names, starts, lengths chrN<<P->chrName[ii]<<"\n"; chrS<<P->chrStart[ii]<<"\n"; chrL<<P->chrLength.at(ii)<<"\n"; chrNL<<P->chrName[ii]<<"\t"<<P->chrLength.at(ii)<<"\n"; }; chrS<<P->chrStart[P->nChrReal]<<"\n";//size of the genome chrN.close();chrL.close();chrS.close(); chrNL.close(); if (P->limitGenomeGenerateRAM < (nG1alloc+nG1alloc/3)) {//allocate nG1alloc/3 for SA generation ostringstream errOut; errOut <<"EXITING because of FATAL PARAMETER ERROR: limitGenomeGenerateRAM="<< (P->limitGenomeGenerateRAM) <<"is too small for your genome\n"; errOut <<"SOLUTION: please specify limitGenomeGenerateRAM not less than"<< nG1alloc+nG1alloc/3 <<" and make that much RAM available \n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //preparing to generate SA for (uint ii=0;ii<N;ii++) {//- strand G[N2-1-ii]=G[ii]<4 ? 3-G[ii] : G[ii]; }; P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { P->nSA++; }; }; P->GstrandBit = (uint) floor(log(N)/log(2))+1; if (P->GstrandBit<32) P->GstrandBit=32; //TODO: use simple access function for SA P->GstrandMask = ~(1LLU<<P->GstrandBit); PackedArray SA1;//SA without sjdb SA1.defineBits(P->GstrandBit+1,P->nSA); PackedArray SA2;//SA with sjdb, reserve more space if (P->sjdbInsert.yes) {//reserve space for junction insertion SA2.defineBits(P->GstrandBit+1,P->nSA+2*P->limitSjdbInsertNsj*P->sjdbLength);//TODO: this allocation is wasteful, get a better estimate of the number of junctions } else {//same as SA1 SA2.defineBits(P->GstrandBit+1,P->nSA); }; P->nSAbyte=SA2.lengthByte; P->inOut->logMain << "Number of SA indices: "<< P->nSA << "\n"<<flush; //sort SA time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... starting to sort Suffix Array. This may take a long time...\n" <<flush; // if (false) {//sort SA chunks for (uint ii=0;ii<N;ii++) {//re-fill the array backwards for sorting swap(G[N2-1-ii],G[ii]); }; globalG=G; globalL=L/sizeof(uint); //count the number of indices with 4nt prefix uint indPrefN=1LLU << 16; uint* indPrefCount = new uint [indPrefN]; memset(indPrefCount,0,indPrefN*sizeof(indPrefCount[0])); P->nSA=0; for (uint ii=0;ii<N2;ii+=P->genomeSAsparseD) { if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; indPrefCount[p1]++; P->nSA++; }; }; uint saChunkSize=(P->limitGenomeGenerateRAM-nG1alloc)/8/P->runThreadN; //number of SA indexes per chunk saChunkSize=saChunkSize*6/10; //allow extra space for qsort //uint saChunkN=((P->nSA/saChunkSize+1)/P->runThreadN+1)*P->runThreadN;//ensure saChunkN is divisible by P->runThreadN //saChunkSize=P->nSA/saChunkN+100000;//final chunk size if (P->runThreadN>1) saChunkSize=min(saChunkSize,P->nSA/(P->runThreadN-1)); uint saChunkN=P->nSA/saChunkSize;//estimate uint* indPrefStart = new uint [saChunkN*2]; //start and stop, *2 just in case uint* indPrefChunkCount = new uint [saChunkN*2]; indPrefStart[0]=0; saChunkN=0;//start counting chunks uint chunkSize1=indPrefCount[0]; for (uint ii=1; ii<indPrefN; ii++) { chunkSize1 += indPrefCount[ii]; if (chunkSize1 > saChunkSize) { saChunkN++; indPrefStart[saChunkN]=ii; indPrefChunkCount[saChunkN-1]=chunkSize1-indPrefCount[ii]; chunkSize1=indPrefCount[ii]; }; }; saChunkN++; indPrefStart[saChunkN]=indPrefN+1; indPrefChunkCount[saChunkN-1]=chunkSize1; P->inOut->logMain << "Number of chunks: " << saChunkN <<"; chunks size limit: " << saChunkSize*8 <<" bytes\n" <<flush; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... sorting Suffix Array chunks and saving them to disk...\n" <<flush; #pragma omp parallel for num_threads(P->runThreadN) ordered schedule(dynamic,1) for (int iChunk=0; iChunk < (int) saChunkN; iChunk++) {//start the chunk cycle: sort each chunk with qsort and write to a file uint* saChunk=new uint [indPrefChunkCount[iChunk]];//allocate local array for each chunk for (uint ii=0,jj=0;ii<N2;ii+=P->genomeSAsparseD) {//fill the chunk with SA indices if (G[ii]<4) { uint p1=(G[ii]<<12) + (G[ii-1]<<8) + (G[ii-2]<<4) + G[ii-3]; if (p1>=indPrefStart[iChunk] && p1<indPrefStart[iChunk+1]) { saChunk[jj]=ii; jj++; }; //TODO: if (jj==indPrefChunkCount[iChunk]) break; }; }; //sort the chunk qsort(saChunk,indPrefChunkCount[iChunk],sizeof(saChunk[0]),funCompareSuffixes); for (uint ii=0;ii<indPrefChunkCount[iChunk];ii++) { saChunk[ii]=N2-1-saChunk[ii]; }; //write files ofstream saChunkFile; string chunkFileName=P->genomeDir+"/SA_"+to_string( (uint) iChunk); ofstrOpen(chunkFileName,"ERROR_00105", P, saChunkFile); fstreamWriteBig(saChunkFile, (char*) saChunk, sizeof(saChunk[0])*indPrefChunkCount[iChunk],chunkFileName,"ERROR_00121",P); saChunkFile.close(); delete [] saChunk; saChunk=NULL; }; time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... loading chunks from disk, packing SA...\n" <<flush; //read chunks and pack into full SA1 SA2.allocateArray(); SA1.pointArray(SA2.charArray + SA2.lengthByte-SA1.lengthByte); //SA1 is shifted to have space for junction insertion uint N2bit= 1LLU << P->GstrandBit; uint packedInd=0; #define SA_CHUNK_BLOCK_SIZE 10000000 uint* saIn=new uint[SA_CHUNK_BLOCK_SIZE]; //TODO make adjustable #ifdef genenomeGenerate_SA_textOutput ofstream SAtxtStream ((P->genomeDir + "/SAtxt").c_str()); #endif for (uint iChunk=0;iChunk<saChunkN;iChunk++) {//load files one by one and convert to packed ostringstream saChunkFileNameStream(""); saChunkFileNameStream<< P->genomeDir << "/SA_" << iChunk; ifstream saChunkFile(saChunkFileNameStream.str().c_str()); while (! saChunkFile.eof()) {//read blocks from each file uint chunkBytesN=fstreamReadBig(saChunkFile,(char*) saIn,SA_CHUNK_BLOCK_SIZE*sizeof(saIn[0])); for (uint ii=0;ii<chunkBytesN/sizeof(saIn[0]);ii++) { SA1.writePacked( packedInd+ii, (saIn[ii]<N) ? saIn[ii] : ( (saIn[ii]-N) | N2bit ) ); #ifdef genenomeGenerate_SA_textOutput SAtxtStream << saIn[ii] << "\n"; #endif }; packedInd += chunkBytesN/sizeof(saIn[0]); }; saChunkFile.close(); remove(saChunkFileNameStream.str().c_str());//remove the chunk file }; #ifdef genenomeGenerate_SA_textOutput SAtxtStream.close(); #endif delete [] saIn; if (packedInd != P->nSA ) {// ostringstream errOut; errOut << "EXITING because of FATAL problem while generating the suffix array\n"; errOut << "The number of indices read from chunks = "<<packedInd<<" is not equal to expected nSA="<<P->nSA<<"\n"; errOut << "SOLUTION: try to re-run suffix array generation, if it still does not work, report this problem to the author\n"<<flush; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //DONE with suffix array generation for (uint ii=0;ii<N;ii++) {//return to normal order for future use swap(G[N2-1-ii],G[ii]); }; delete [] indPrefCount; delete [] indPrefStart; delete [] indPrefChunkCount; }; time ( &rawTime ); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... Finished generating suffix array\n" <<flush; //////////////////////////////////////// // SA index // // PackedArray SAold; // // if (true) // {//testing: load SA from disk // //read chunks and pack into full SA1 // // ifstream oldSAin("./DirTrue/SA"); // oldSAin.seekg (0, ios::end); // P->nSAbyte=(uint) oldSAin.tellg(); // oldSAin.clear(); // oldSAin.seekg (0, ios::beg); // // P->nSA=(P->nSAbyte*8)/(P->GstrandBit+1); // SAold.defineBits(P->GstrandBit+1,P->nSA); // SAold.allocateArray(); // // oldSAin.read(SAold.charArray,SAold.lengthByte); // oldSAin.close(); // // SA1=SAold; // SA2=SAold; // }; PackedArray SAip; genomeSAindex(G,SA1,P,SAip); if (P->sjdbFileChrStartEnd.at(0)!="-" || P->sjdbGTFfile!="-") {//insert junctions SjdbClass sjdbLoci; Genome mainGenome(P); mainGenome.G=G; mainGenome.SA=SA1; mainGenome.SApass1=SA2; mainGenome.SAi=SAip; P->sjdbInsert.outDir=P->genomeDir; P->sjdbN=0;//no junctions are loaded yet P->twoPass.pass2=false; Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //write an extra 0 at the end of the array, filling the last bytes that otherwise are not accessible, but will be written to disk //this is - to avoid valgrind complaints. Note that SA2 is allocated with plenty of space to spare. SA2.writePacked(P->nSA,0); }; //write genome to disk time ( &rawTime ); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing Genome to disk ...\n" <<flush; ofstream genomeOut; ofstrOpen(P->genomeDir+"/Genome","ERROR_00104", P, genomeOut); fstreamWriteBig(genomeOut,G,P->nGenome,P->genomeDir+"/Genome","ERROR_00120",P); genomeOut.close(); //write SA time ( &rawTime ); P->inOut->logMain << "SA size in bytes: "<< P->nSAbyte << "\n"<<flush; P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing Suffix Array to disk ...\n" <<flush; ofstream SAout; ofstrOpen(P->genomeDir+"/SA","ERROR_00106", P, SAout); fstreamWriteBig(SAout,(char*) SA2.charArray, (streamsize) P->nSAbyte,P->genomeDir+"/SA","ERROR_00122",P); SAout.close(); //write SAi time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ... writing SAindex to disk\n" <<flush; //write SAi to disk ofstream SAiOut; ofstrOpen(P->genomeDir+"/SAindex","ERROR_00107", P, SAiOut); fstreamWriteBig(SAiOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->genomeDir+"/SAindex","ERROR_00123",P); fstreamWriteBig(SAiOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->genomeDir+"/SAindex","ERROR_00124",P); fstreamWriteBig(SAiOut, SAip.charArray, SAip.lengthByte,P->genomeDir+"/SAindex","ERROR_00125",P); SAiOut.close(); SA2.deallocateArray(); time(&rawTime); timeString=asctime(localtime ( &rawTime )); timeString.erase(timeString.end()-1,timeString.end()); time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) << " ..... Finished successfully\n" <<flush; };