/**
 * Reads splice junctions from each file named in P->sjdbFileChrStartEnd and
 * appends them to sjdbLoci through sjdbLoadFromStream().
 *
 * Nothing is loaded when the first entry of the file list is "-". A file
 * that cannot be opened is reported through exitWithError() with
 * EXIT_CODE_INPUT_FILES. After each file the running junction count
 * (sjdbLoci.chr.size()) is written to the main log.
 *
 * @param P        run parameters; supplies the file list and the main log stream
 * @param sjdbLoci junction container handed to sjdbLoadFromStream() for every file
 */
void sjdbLoadFromFiles(Parameters *P, SjdbClass &sjdbLoci)
{
    // First entry "-": skip loading altogether.
    if (P->sjdbFileChrStartEnd.at(0) == "-")
    {
        return;
    }

    for (uint fileIdx = 0; fileIdx < P->sjdbFileChrStartEnd.size(); ++fileIdx)
    {
        const string &junctionFile = P->sjdbFileChrStartEnd.at(fileIdx);

        ifstream junctionStream(junctionFile.c_str());
        if (junctionStream.fail())
        {
            ostringstream message;
            message << "FATAL INPUT error, could not open input file sjdbFileChrStartEnd=" << junctionFile << "\n";
            exitWithError(message.str(), std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        }

        sjdbLoadFromStream(junctionStream, sjdbLoci);

        // The count is cumulative over all files read so far.
        P->inOut->logMain << "Loaded database junctions from file: " << junctionFile
                          << ", total number of junctions: " << sjdbLoci.chr.size() << " junctions\n\n";
    }
}
/**
 * Gathers database splice junctions from every configured source and inserts
 * them into the genome sequence and its suffix-array structures.
 *
 * Sources are read in this order, each one appending to the same SjdbClass:
 *   1. the 1st-pass junction file P->twoPass.pass1sjFile (when non-empty),
 *   2. the files listed in P->sjdbFileChrStartEnd (unless the first entry is "-"),
 *   3. the GTF annotation P->sjdbGTFfile (unless it is "-").
 * The collected junctions are then handed to sjdbPrepare() and
 * sjdbBuildIndex(), after which P->winBinN is recomputed from P->nGenome.
 *
 * @param P      run parameters; also supplies the log streams
 * @param genome genome whose G, SA, SA2 and SAi members are passed to the index builder
 */
void sjdbInsertJunctions(Parameters *P, Genome &genome) {
    SjdbClass sjdbLoci;
    time_t rawtime;

    //load 1st pass junctions
    if (P->twoPass.pass1sjFile.size()>0) {
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass=" << P->twoPass.pass1sjFile <<"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    //load from junction files
    if (P->sjdbFileChrStartEnd.at(0)!="-") {
        sjdbLoadFromFiles(P,sjdbLoci);
        //refresh the timestamp: without a 1st pass file rawtime has not been set yet
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    //load from GTF
    if (P->sjdbGTFfile!="-") {
        loadGTF(sjdbLoci, P, P->genomeDirOut);
        //refresh the timestamp for the same reason as above
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    sjdbPrepare (sjdbLoci, P, genome.G, P->nGenome, P->twoPass.dir);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished preparing junctions" <<endl;

    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, genome.G, genome.SA, genome.SA2, genome.SAi);
    time ( &rawtime );
    *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished inserting 1st pass junctions into genome" <<endl;

    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
}
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P) {//load gtf file, add junctions to P->sjdb //returns number of added junctions if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") { ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() ); if (sjdbStreamIn.fail()) { ostringstream errOut; errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; std::map <string,uint> transcriptIDnumber; uint exonN=0; while (sjdbStreamIn.good()) {//count the number of exons string chr1,ddd2,featureType; sjdbStreamIn >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) { exonN++; }; sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line }; uint* exonLoci=new uint [exonN*GTF_exonLoci_size]; char* transcriptStrand = new char [exonN]; vector <string> transcriptID; exonN=0;//re-calculate sjdbStreamIn.clear(); sjdbStreamIn.seekg(0,ios::beg); while (sjdbStreamIn.good()) { string oneLine,chr1,ddd2,featureType; getline(sjdbStreamIn,oneLine); istringstream oneLineStream (oneLine); oneLineStream >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1; if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in Genome fasta files for line:\n"; P->inOut->logMain << oneLine <<"\n"<<flush; continue; //do not process exons/transcripts on missing chromosomes }; uint ex1,ex2; char str1; oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last string oneLine1; getline(oneLineStream, oneLine1);//get the last field replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing 
oneLineStream.str(oneLine1); oneLineStream.clear(); string trID(""), attr1(""); while (oneLineStream.good()) { oneLineStream >> attr1; if (attr1==P->sjdbGTFtagExonParentTranscript) { oneLineStream >> trID; trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end()); trID.erase(remove(trID.begin(),trID.end(),';'),trID.end()); // cout <<trID<<endl; }; }; if (trID=="") {//no transcript ID P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": no transcript_id for exon feature for line:\n"; P->inOut->logMain << oneLine <<"\n"<<flush; } else { transcriptIDnumber.insert(std::pair <string,uint> (trID,(uint) transcriptIDnumber.size()));//insert new element if necessary with a new numeric value if (transcriptID.size() < transcriptIDnumber.size()) transcriptID.push_back(trID); if (str1=='+') { transcriptStrand[transcriptIDnumber[trID]]=1; } else if (str1=='-') { transcriptStrand[transcriptIDnumber[trID]]=2; } else { transcriptStrand[transcriptIDnumber[trID]]=0; }; }; exonLoci[GTF_exonTrID(exonN)]=transcriptIDnumber[trID]; exonLoci[GTF_exonStart(exonN)]=ex1+P->chrStart[P->chrNameIndex[chr1]]-1; exonLoci[GTF_exonEnd(exonN)]=ex2+P->chrStart[P->chrNameIndex[chr1]]-1; ++exonN; };//if (chr1.substr(0,1)!="#" && featureType=="exon") };//
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P, string dirOut) {//load gtf file, add junctions to P->sjdb //returns number of added junctions if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") { time_t rawTime; time(&rawTime); P->inOut->logMain << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush; ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() ); if (sjdbStreamIn.fail()) { ostringstream errOut; errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; if (P->chrNameIndex.size()==0) { for (uint ii=0;ii<P->nChrReal;ii++) { P->chrNameIndex[P->chrName[ii]]=ii; }; }; std::map <string,uint> transcriptIDnumber, geneIDnumber; uint exonN=0; while (sjdbStreamIn.good()) {//count the number of exons string chr1,ddd2,featureType; sjdbStreamIn >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) { exonN++; }; sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line }; if (exonN==0) { P->inOut->logMain << "WARNING: found no exons in sjdbGTFfile=" << P->sjdbGTFfile <<endl; return 0; }; uint* exonLoci=new uint [exonN*GTF_exonLoci_size]; char* transcriptStrand = new char [exonN]; vector <string> transcriptID, geneID; exonN=0;//re-calculate sjdbStreamIn.clear(); sjdbStreamIn.seekg(0,ios::beg); while (sjdbStreamIn.good()) { string oneLine,chr1,ddd2,featureType; getline(sjdbStreamIn,oneLine); istringstream oneLineStream (oneLine); oneLineStream >> chr1 >> ddd2 >> featureType; if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1; if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in 
Genome fasta files for line:\n"; P->inOut->logMain << oneLine <<"\n"<<flush; continue; //do not process exons/transcripts on missing chromosomes }; uint ex1,ex2; char str1; oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last string oneLine1; getline(oneLineStream, oneLine1);//get the last field replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing oneLineStream.str(oneLine1); oneLineStream.clear(); string trID(""), gID(""), attr1(""); while (oneLineStream.good()) { oneLineStream >> attr1; if (attr1==P->sjdbGTFtagExonParentTranscript) { oneLineStream >> trID; trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end()); trID.erase(remove(trID.begin(),trID.end(),';'),trID.end()); } else if (attr1==P->sjdbGTFtagExonParentGene) { oneLineStream >> gID; gID.erase(remove(gID.begin(),gID.end(),'"'),gID.end()); gID.erase(remove(gID.begin(),gID.end(),';'),gID.end()); }; };
void sjdbInsertJunctions(Parameters * P, Parameters * P1, Genome & genome, SjdbClass & sjdbLoci) { time_t rawtime; if (P->sjdbN>0 && sjdbLoci.chr.size()==0) {//load from the saved genome, only if the loading did not happen already (if sjdb insertion happens at the 1st pass, sjdbLoci will be populated ifstream & sjdbStreamIn = ifstrOpen(P->genomeDir+"/sjdbList.out.tab", ERROR_OUT, "SOLUTION: re-generate the genome in genomeDir=" + P->genomeDir, P); sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),30); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the generated genome " << P->genomeDir+"/sjdbList.out.tab" <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; }; if (P->twoPass.pass2) {//load 1st pass new junctions //sjdbLoci already contains the junctions from before 1st pass ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() ); if (sjdbStreamIn.fail()) { ostringstream errOut; errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),0); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; } else {//loading junctions from GTF or tab or from the saved genome is only allowed at the 1st pass //at the 2nd pass these are already in the sjdbLoci if (P->sjdbFileChrStartEnd.at(0)!="-") {//load from junction files sjdbLoadFromFiles(P,sjdbLoci); sjdbLoci.priority.resize(sjdbLoci.chr.size(),10); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n"; }; if (P->sjdbGTFfile!="-") {//load from 
GTF loadGTF(sjdbLoci, P, P->sjdbInsert.outDir); sjdbLoci.priority.resize(sjdbLoci.chr.size(),20); time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n"; }; }; char *Gsj=new char [2*P->sjdbLength*sjdbLoci.chr.size()+1];//array to store junction sequences, will be filled in sjdbPrepare sjdbPrepare (sjdbLoci, P, P->chrStart[P->nChrReal], P->sjdbInsert.outDir, genome.G, Gsj);//P->nGenome - change when replacing junctions time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " Finished preparing junctions" <<endl; if (P->sjdbN>P->limitSjdbInsertNsj) { ostringstream errOut; errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n"; errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n"; errOut << "SOLUTION: re-run with at least --limitSjdbInsertNsj "<<P->sjdbN<<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P); }; //insert junctions into the genome and SA and SAi sjdbBuildIndex (P, P1, Gsj, genome.G, genome.SA, (P->twoPass.pass2 ? genome.SApass2 : genome.SApass1), genome.SAi); delete [] Gsj; //junction sequences have been added to G time ( &rawtime ); P->inOut->logMain << timeMonthDayTime(rawtime) << " ..... 
finished inserting junctions into genome" <<endl; if (P->sjdbInsert.save=="All") {//save and copy all genome files into sjdbInsert.outDir, except those created above if (P->genomeDir != P->sjdbInsert.outDir) { copyFile(P->genomeDir+"/chrName.txt", P->sjdbInsert.outDir+"/chrName.txt"); copyFile(P->genomeDir+"/chrStart.txt", P->sjdbInsert.outDir+"/chrStart.txt"); copyFile(P->genomeDir+"/chrNameLength.txt", P->sjdbInsert.outDir+"/chrNameLength.txt"); copyFile(P->genomeDir+"/chrLength.txt", P->sjdbInsert.outDir+"/chrLength.txt"); }; genomeParametersWrite(P->sjdbInsert.outDir+("/genomeParameters.txt"), P, ERROR_OUT); ofstream & genomeOut = ofstrOpen(P->sjdbInsert.outDir+"/Genome",ERROR_OUT, P); fstreamWriteBig(genomeOut,genome.G,P->nGenome,P->sjdbInsert.outDir+"/Genome",ERROR_OUT,P); genomeOut.close(); ofstream & saOut = ofstrOpen(P->sjdbInsert.outDir+"/SA",ERROR_OUT, P); fstreamWriteBig(saOut,(char*) genome.SA.charArray, (streamsize) genome.SA.lengthByte, P->sjdbInsert.outDir+"/SA",ERROR_OUT,P); saOut.close(); ofstream & saIndexOut = ofstrOpen(P->sjdbInsert.outDir+"/SAindex",ERROR_OUT, P); fstreamWriteBig(saIndexOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); fstreamWriteBig(saIndexOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); fstreamWriteBig(saIndexOut, genome.SAi.charArray, genome.SAi.lengthByte,P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P); saIndexOut.close(); }; //re-calculate genome-related parameters P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1; };