Esempio n. 1
0
void sjdbLoadFromFiles(Parameters *P, SjdbClass &sjdbLoci) {
   
    if (P->sjdbFileChrStartEnd.at(0)!="-") {       
        for (uint ifile=0;ifile<P->sjdbFileChrStartEnd.size(); ifile++) {
            ifstream sjdbStreamIn ( P->sjdbFileChrStartEnd.at(ifile).c_str() );   
            if (sjdbStreamIn.fail()) {
                ostringstream errOut;
                errOut << "FATAL INPUT error, could not open input file sjdbFileChrStartEnd=" << P->sjdbFileChrStartEnd.at(ifile) <<"\n";
                exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
            };

            sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);

            P->inOut->logMain << "Loaded database junctions from file: " << P->sjdbFileChrStartEnd.at(ifile) <<", total number of junctions: "<<sjdbLoci.chr.size()<<" junctions\n\n";
        };
    }; //if (P->sjdbFileChrStartEnd!="-")

};
Esempio n. 2
0
void sjdbInsertJunctions(Parameters *P, Genome &genome) {
        
    SjdbClass sjdbLoci;        
    time_t rawtime;

    //load 1st pass junctions
    if (P->twoPass.pass1sjFile.size()>0)
    {
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );   
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };
    
    //load from junction files
    if (P->sjdbFileChrStartEnd.at(0)!="-")
    {
        sjdbLoadFromFiles(P,sjdbLoci);
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";        
    };
    
    //load from GTF
    if (P->sjdbGTFfile!="-")
    {
        loadGTF(sjdbLoci, P, P->genomeDirOut);
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    sjdbPrepare (sjdbLoci, P, genome.G, P->nGenome, P->twoPass.dir);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished preparing junctions" <<endl;

    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, genome.G, genome.SA, genome.SA2, genome.SAi);
    time ( &rawtime ); 
    *P->inOut->logStdOut  << timeMonthDayTime(rawtime) << " ..... Finished inserting 1st pass junctions into genome" <<endl;
    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
};
Esempio n. 3
0
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P) {//load gtf file, add junctions to P->sjdb
    //returns number of added junctions
    if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") {       
        ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() );   
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };    
        
        std::map <string,uint> transcriptIDnumber;

        uint exonN=0;
        while (sjdbStreamIn.good()) {//count the number of exons
            string chr1,ddd2,featureType;            
            sjdbStreamIn >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {
                exonN++;
            };
            sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line
        };
        uint* exonLoci=new uint [exonN*GTF_exonLoci_size];
        char* transcriptStrand = new char [exonN];
        vector <string> transcriptID;

        exonN=0;//re-calculate
        sjdbStreamIn.clear();
        sjdbStreamIn.seekg(0,ios::beg);
        while (sjdbStreamIn.good()) {

            string oneLine,chr1,ddd2,featureType;
            getline(sjdbStreamIn,oneLine);
            istringstream oneLineStream (oneLine);
            
            oneLineStream >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process
                
                if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1;
                
                if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome
                    P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in Genome fasta files for line:\n";
                    P->inOut->logMain << oneLine <<"\n"<<flush;          
                    continue; //do not process exons/transcripts on missing chromosomes
                };
                
                uint ex1,ex2;
                char str1;
                oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last

                string oneLine1;
                getline(oneLineStream, oneLine1);//get the last field
                replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes
                replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing
                oneLineStream.str(oneLine1);
                oneLineStream.clear();

                string trID(""), attr1("");
                while (oneLineStream.good()) {
                    oneLineStream >> attr1;
                    if (attr1==P->sjdbGTFtagExonParentTranscript) {
                        oneLineStream >> trID;
                        trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end());
                        trID.erase(remove(trID.begin(),trID.end(),';'),trID.end());
//                         cout <<trID<<endl;
                    };
                };
                if (trID=="") {//no transcript ID
                    P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": no transcript_id for exon feature for line:\n";
                    P->inOut->logMain << oneLine <<"\n"<<flush;
                } else {
                    transcriptIDnumber.insert(std::pair <string,uint> (trID,(uint) transcriptIDnumber.size()));//insert new element if necessary with a new numeric value
                    if (transcriptID.size() < transcriptIDnumber.size()) transcriptID.push_back(trID);
                    if (str1=='+') {
                       transcriptStrand[transcriptIDnumber[trID]]=1;
                    } else if (str1=='-') {
                       transcriptStrand[transcriptIDnumber[trID]]=2;
                    } else {
                       transcriptStrand[transcriptIDnumber[trID]]=0;
                    };
                };
                
                exonLoci[GTF_exonTrID(exonN)]=transcriptIDnumber[trID];
                exonLoci[GTF_exonStart(exonN)]=ex1+P->chrStart[P->chrNameIndex[chr1]]-1;
                exonLoci[GTF_exonEnd(exonN)]=ex2+P->chrStart[P->chrNameIndex[chr1]]-1;
                ++exonN;                

            };//if (chr1.substr(0,1)!="#" && featureType=="exon")
        };//
Esempio n. 4
0
uint loadGTF(SjdbClass &sjdbLoci, Parameters *P, string dirOut) {//load gtf file, add junctions to P->sjdb
    //returns number of added junctions
    if (P->sjdbOverhang>0 && P->sjdbGTFfile!="-") {       
        time_t rawTime;
        time(&rawTime);
        P->inOut->logMain     << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush;
        *P->inOut->logStdOut  << timeMonthDayTime(rawTime) <<" ..... Processing annotations GTF\n" <<flush;           
        
        ifstream sjdbStreamIn ( P->sjdbGTFfile.c_str() );   
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL error, could not open file sjdbGTFfile=" << P->sjdbGTFfile <<"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };    
        
        if (P->chrNameIndex.size()==0) 
        {
            for (uint ii=0;ii<P->nChrReal;ii++) {
                P->chrNameIndex[P->chrName[ii]]=ii;
            };        
        };
        std::map <string,uint> transcriptIDnumber, geneIDnumber;

        uint exonN=0;
        while (sjdbStreamIn.good()) {//count the number of exons
            string chr1,ddd2,featureType;            
            sjdbStreamIn >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {
                exonN++;
            };
            sjdbStreamIn.ignore(1000000000,'\n'); //ignore the rest of the line
        };
        
        if (exonN==0)
        {
            P->inOut->logMain << "WARNING: found no exons in sjdbGTFfile=" << P->sjdbGTFfile <<endl;
            return 0;
        };
        uint* exonLoci=new uint [exonN*GTF_exonLoci_size];
        char* transcriptStrand = new char [exonN];
        vector <string> transcriptID, geneID;

        exonN=0;//re-calculate
        sjdbStreamIn.clear();
        sjdbStreamIn.seekg(0,ios::beg);
        while (sjdbStreamIn.good()) {

            string oneLine,chr1,ddd2,featureType;
            getline(sjdbStreamIn,oneLine);
            istringstream oneLineStream (oneLine);
            
            oneLineStream >> chr1 >> ddd2 >> featureType;
            if (chr1.substr(0,1)!="#" && featureType==P->sjdbGTFfeatureExon) {//exonic line, process
                
                if (P->sjdbGTFchrPrefix!="-") chr1=P->sjdbGTFchrPrefix + chr1;
                
                if (P->chrNameIndex.count(chr1)==0) {//chr not in Genome
                    P->inOut->logMain << "WARNING: while processing sjdbGTFfile=" << P->sjdbGTFfile <<": chromosome '"<<chr1<<"' not found in Genome fasta files for line:\n";
                    P->inOut->logMain << oneLine <<"\n"<<flush;          
                    continue; //do not process exons/transcripts on missing chromosomes
                };
                
                uint ex1,ex2;
                char str1;
                oneLineStream >> ex1 >> ex2 >> ddd2 >> str1 >> ddd2; //read all fields except the last

                string oneLine1;
                getline(oneLineStream, oneLine1);//get the last field
                replace(oneLine1.begin(),oneLine1.end(),';',' ');//to separate attributes
                replace(oneLine1.begin(),oneLine1.end(),'=',' ');//for GFF3 processing
                oneLineStream.str(oneLine1);
                oneLineStream.clear();

                string trID(""), gID(""), attr1("");
                while (oneLineStream.good()) {
                    oneLineStream >> attr1;
                    if (attr1==P->sjdbGTFtagExonParentTranscript) {
                        oneLineStream >> trID;
                        trID.erase(remove(trID.begin(),trID.end(),'"'),trID.end());
                        trID.erase(remove(trID.begin(),trID.end(),';'),trID.end());
                    } else if (attr1==P->sjdbGTFtagExonParentGene) {
                        oneLineStream >> gID;
                        gID.erase(remove(gID.begin(),gID.end(),'"'),gID.end());
                        gID.erase(remove(gID.begin(),gID.end(),';'),gID.end());
                    };
                };
Esempio n. 5
0
void sjdbInsertJunctions(Parameters * P, Parameters * P1, Genome & genome, SjdbClass & sjdbLoci)
{
    time_t rawtime;

    if (P->sjdbN>0 && sjdbLoci.chr.size()==0)
    {//load from the saved genome, only if the loading did not happen already (if sjdb insertion happens at the 1st pass, sjdbLoci will be populated
        ifstream & sjdbStreamIn = ifstrOpen(P->genomeDir+"/sjdbList.out.tab", ERROR_OUT, "SOLUTION: re-generate the genome in genomeDir=" + P->genomeDir, P);
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        sjdbLoci.priority.resize(sjdbLoci.chr.size(),30);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the generated genome " << P->genomeDir+"/sjdbList.out.tab" <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    };

    if (P->twoPass.pass2)
    {//load 1st pass new junctions
     //sjdbLoci already contains the junctions from before 1st pass
        ifstream sjdbStreamIn ( P->twoPass.pass1sjFile.c_str() );
        if (sjdbStreamIn.fail()) {
            ostringstream errOut;
            errOut << "FATAL INPUT error, could not open input file with junctions from the 1st pass="******"\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };
        sjdbLoadFromStream(sjdbStreamIn, sjdbLoci);
        sjdbLoci.priority.resize(sjdbLoci.chr.size(),0);
        time ( &rawtime );
        P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the 1st pass file: " << P->twoPass.pass1sjFile <<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
    } else
    {//loading junctions from GTF or tab or from the saved genome is only allowed at the 1st pass
     //at the 2nd pass these are already in the sjdbLoci

        if (P->sjdbFileChrStartEnd.at(0)!="-")
        {//load from junction files
            sjdbLoadFromFiles(P,sjdbLoci);
            sjdbLoci.priority.resize(sjdbLoci.chr.size(),10);
            time ( &rawtime );
            P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the sjdbFileChrStartEnd file(s), " << sjdbLoci.chr.size()<<" total junctions\n\n";
        };

        if (P->sjdbGTFfile!="-")
        {//load from GTF
            loadGTF(sjdbLoci, P, P->sjdbInsert.outDir);
            sjdbLoci.priority.resize(sjdbLoci.chr.size(),20);
            time ( &rawtime );
            P->inOut->logMain << timeMonthDayTime(rawtime) << "   Loaded database junctions from the GTF file: " << P->sjdbGTFfile<<": "<<sjdbLoci.chr.size()<<" total junctions\n\n";
        };
    };

    char *Gsj=new char [2*P->sjdbLength*sjdbLoci.chr.size()+1];//array to store junction sequences, will be filled in sjdbPrepare
    sjdbPrepare (sjdbLoci, P, P->chrStart[P->nChrReal], P->sjdbInsert.outDir, genome.G, Gsj);//P->nGenome - change when replacing junctions
    time ( &rawtime );
    P->inOut->logMain  << timeMonthDayTime(rawtime) << "   Finished preparing junctions" <<endl;

    if (P->sjdbN>P->limitSjdbInsertNsj)
    {
        ostringstream errOut;
        errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n";                errOut << "Fatal LIMIT error: the number of junctions to be inserted on the fly ="<<P->sjdbN<<" is larger than the limitSjdbInsertNsj="<<P->limitSjdbInsertNsj<<"\n";
        errOut << "SOLUTION: re-run with at least --limitSjdbInsertNsj "<<P->sjdbN<<"\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
    };
    //insert junctions into the genome and SA and SAi
    sjdbBuildIndex (P, P1, Gsj, genome.G, genome.SA, (P->twoPass.pass2 ? genome.SApass2 : genome.SApass1), genome.SAi);
    delete [] Gsj; //junction sequences have been added to G
    time ( &rawtime );
    P->inOut->logMain     << timeMonthDayTime(rawtime) << " ..... finished inserting junctions into genome" <<endl;

    if (P->sjdbInsert.save=="All")
    {//save and copy all genome files into sjdbInsert.outDir, except those created above
        if (P->genomeDir != P->sjdbInsert.outDir)
        {
            copyFile(P->genomeDir+"/chrName.txt", P->sjdbInsert.outDir+"/chrName.txt");
            copyFile(P->genomeDir+"/chrStart.txt", P->sjdbInsert.outDir+"/chrStart.txt");
            copyFile(P->genomeDir+"/chrNameLength.txt", P->sjdbInsert.outDir+"/chrNameLength.txt");
            copyFile(P->genomeDir+"/chrLength.txt", P->sjdbInsert.outDir+"/chrLength.txt");
        };

        genomeParametersWrite(P->sjdbInsert.outDir+("/genomeParameters.txt"), P, ERROR_OUT);

        ofstream & genomeOut = ofstrOpen(P->sjdbInsert.outDir+"/Genome",ERROR_OUT, P);
        fstreamWriteBig(genomeOut,genome.G,P->nGenome,P->sjdbInsert.outDir+"/Genome",ERROR_OUT,P);
        genomeOut.close();

        ofstream & saOut = ofstrOpen(P->sjdbInsert.outDir+"/SA",ERROR_OUT, P);
        fstreamWriteBig(saOut,(char*) genome.SA.charArray, (streamsize) genome.SA.lengthByte, P->sjdbInsert.outDir+"/SA",ERROR_OUT,P);
        saOut.close();

        ofstream & saIndexOut = ofstrOpen(P->sjdbInsert.outDir+"/SAindex",ERROR_OUT, P);
        fstreamWriteBig(saIndexOut, (char*) &P->genomeSAindexNbases, sizeof(P->genomeSAindexNbases),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        fstreamWriteBig(saIndexOut, (char*) P->genomeSAindexStart, sizeof(P->genomeSAindexStart[0])*(P->genomeSAindexNbases+1),P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        fstreamWriteBig(saIndexOut,  genome.SAi.charArray, genome.SAi.lengthByte,P->sjdbInsert.outDir+"/SAindex",ERROR_OUT,P);
        saIndexOut.close();
    };

    //re-calculate genome-related parameters
    P->winBinN = P->nGenome/(1LLU << P->winBinNbits)+1;
};