void readMTAlleleFreq(const string freqFile, map<int, alleleFrequency> & pos2allelefreq){ // map<int, alleleFrequency> pos2allelefreq; string line; igzstream freqAlleleFile; freqAlleleFile.open(freqFile.c_str()); if (freqAlleleFile.good()){ while ( getline (freqAlleleFile,line)){ vector<string> fields = allTokens(line,'\t'); alleleFrequency freqToadd; if(fields.size() != 5){ cerr << "line "<<line<<" in file "<<freqFile<<" does not have 5 fields"<<endl; exit(1); } for(int nuc=0;nuc<4;nuc++){ freqToadd.f[nuc]=destringify<double>(fields[nuc+1]); } pos2allelefreq[ destringify<int>( fields[0]) ] = freqToadd; } freqAlleleFile.close(); }else{ cerr << "Cannot open allele frequency file "<<freqFile<<""<<endl; exit(1); } }
void readMTConsensus(const string consensusFile, map<int, PHREDgeno> & pos2phredgeno, int & sizeGenome, vector<int> & posOfIndels){ string line; igzstream consensusFD; consensusFD.open(consensusFile.c_str()); if (consensusFD.good()){ getline (consensusFD,line); while ( getline (consensusFD,line)){ if (line.empty()) continue; vector<string> fields = allTokens(line,'\t'); PHREDgeno toadd; // cerr<<line<<endl; if(fields.size() != 11){ cerr << "line "<<line<<" in file "<<consensusFile<<" does not have 11 fields"<<endl; exit(1); } if(fields[0][fields[0].size()-1] == 'i'){ //skip insertion posOfIndels.push_back( destringify<int>( fields[0]) ); continue; } if(fields[2] == "D"){ //skip deletions posOfIndels.push_back( destringify<int>( fields[0]) ); continue; } toadd.consensus = fields[2][0]; for(int nuc=0;nuc<4;nuc++){ toadd.phred[nuc] = destringify<double>(fields[nuc+7]); toadd.perror[nuc] = pow(10.0,toadd.phred[nuc]/(-10.0)); } pos2phredgeno[ destringify<int>( fields[0]) ] = toadd; sizeGenome = max( destringify<int>( fields[0]), sizeGenome); // cout<<destringify<int>( fields[0])<<endl; } consensusFD.close(); }else{ cerr << "Cannot open consensus file "<<consensusFile<<""<<endl; exit(1); } }
// Builds a record from one raw line: the line is split on tabs, a new CoreVCF
// is allocated from the resulting fields, and init() parses the remaining
// per-sample data.
//
// deleteCore is set to true here because this constructor allocates the
// CoreVCF itself (presumably so that the destructor releases it; the destructor
// is not visible from here).
SimpleVCF::SimpleVCF(string line){
    vector<string> tabColumns = allTokens(line,'\t');

    deleteCore = true;
    corevcf    = new CoreVCF(tabColumns);

    init(tabColumns,corevcf);
}
// Returns the stored header (without the defline) with `prefix` prepended to
// every non-empty line. Lines are joined with "\n" through vectorToString().
string MistarParser::getHeaderNoDefline(string prefix){
    const vector<string> headerLines = allTokens(headerNoDefline,'\n');

    vector<string> prefixed;
    for(vector<string>::const_iterator it=headerLines.begin(); it!=headerLines.end(); ++it){
        if(it->empty())
            continue; // empty lines are dropped
        prefixed.push_back(prefix + *it);
    }

    return vectorToString(prefixed,"\n");
}
//IMPLEMENT HEADER MultiVCFreader::MultiVCFreader(string file,string indexForFile,string chrName,int start,int end,int indelsAhead) { readAhead=indelsAhead; rt = new ReadTabix (file,indexForFile,chrName,start,end); //reading header istringstream f (rt->getHeader()); string line; numPop=0; while (getline(f, line)) { //std::cout << line << std::endl; if(strBeginsWith(line,"#CHROM")) { vector<string> tok = allTokens(line,'\t'); if(tok.size() < 10 ) { cerr<<"The header line"<<line<<" does not contain enough fields"<<endl; exit(1); } for(unsigned int i=9; i<tok.size(); i++) { //cerr<<tok[i]<<endl; numPop++; populationNames.push_back(tok[i]); } } } if( numPop == 0 ) { cerr<<"No populations have been found for file "<<file<<endl; exit(1); } needToPopulateQueue = true; fullQueue = false; endQueue = false; numberOfTimesHasDataWasCalled = 0; svcfToReturn = 0; repoCalledHasData = false; indexInQueueOfIndels = -1; indexOfLastIndel = 0; previouslyFoundIndel = false; tabixMode = true; textMode = false; }
// Parses the header of a Mistar stream and stops right after the defline.
//
// Expected layout:
//   - first line: exactly "#MISTAR" (not stored)
//   - any number of lines starting with '#': appended to both `header` and
//     `headerNoDefline`
//   - the defline, starting with "#chr": tab-delimited, its first five fields
//     must be "#chr", "coord", "REF,ALT", "root", "anc". It is stored in
//     `defline` and appended to `header` only.
// Every defline field from index 3 on (so including "root" and "anc") is
// appended to populationNames and counted in numberPopulations.
//
// in  stream positioned at the start of the header
//
// Prints a message on stderr and calls exit(1) on any line that does not start
// with '#', on a wrong first line, or on a malformed defline. If the stream
// ends before a defline is seen the function simply returns.
void MistarParser::parseHeader(istream & in){
    bool firstLine=true;
    string line;

    while(getline ( in,line)){
        // An empty line cannot be a header line; testing empty() first avoids
        // reading line[0] on an empty string.
        if(line.empty() || line[0] != '#'){
            cerr << "Error: MistarParser cannot get header"  <<endl;
            exit(1);
        }

        if(firstLine){
            if(line != "#MISTAR"){
                cerr << "Error: MistarParser first line must be #MISTAR found: " << line <<endl;
                exit(1);
            }
            firstLine=false;
            continue;
        }

        if(!strBeginsWith(line, "#chr")){
            // ordinary header line
            header          += line+"\n";
            headerNoDefline += line+"\n";
            continue;
        }

        // defline
        defline=line;
        vector<string> fields=allTokens(line,'\t');

        // fields[0..4] are inspected below: make sure they exist
        if(fields.size() < 5){
            cerr<<"Error: MistarParser the header line must have at least 5 fields, found: "<<line<<endl;
            exit(1);
        }

        if(fields[0] != "#chr")    { cerr<<"Field #1 ("<<fields[0]<<") of header must be #chr in line #"<<line<<"#"<<endl; exit(1); }
        if(fields[1] != "coord")   { cerr<<"Field #2 of header must be coord ";   exit(1); }
        if(fields[2] != "REF,ALT") { cerr<<"Field #3 of header must be REF,ALT "; exit(1); }
        if(fields[3] != "root")    { cerr<<"Field #4 of header must be root ";    exit(1); }
        if(fields[4] != "anc")     { cerr<<"Field #5 of header must be anc ";     exit(1); }

        // root and anc are stored as populations as well (index starts at 3)
        for(unsigned int i=3;i<fields.size();i++){
            populationNames->push_back(fields[i]);
            numberPopulations++;
        }

        header+=line+"\n";
        break;
    }
}
void SimpleVCF::init(const vector<string> & fields, CoreVCF * corevcf_){ //string line){ unresolvedGT=false; homozygousREF=false; heterozygous=false; homozygousALT=false; indexGenotype=-1; indexGenotypeQual=-1; indexDepth=-1; indexPL=-1; typeOfData=1; // fields=allTokens(line,'\t'); // corevcf = corevcf_; int fieldIndex = corevcf->getFieldIndexAndIncrease(); // cerr<<"fieldIndex "<<fieldIndex<<endl; //FORMAT FIELDS rawFormatNames = fields[ corevcf->getFieldIndexINFO()+1 ]; rawFormatValues = fields[fieldIndex]; // cerr<<"rawFormatNames "<<rawFormatNames<<endl; // cerr<<"rawFormatValues "<<rawFormatValues<<endl; formatFieldNames = allTokens(rawFormatNames ,':'); formatFieldValues = allTokens(rawFormatValues,':'); if(rawFormatValues == "./."){ unresolvedGT=true; observedPL=false; observedGL=false; haploidCall=false; }else{ if(formatFieldNames.size() != formatFieldValues.size()){ cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the format field does not have as many fields as the values"<<endl; exit(1); } observedPL=false; observedGL=false; haploidCall=false; for(unsigned int i=0;i<formatFieldNames.size();i++){ // cerr<<"formatFieldNames["<<i<<"] "<<formatFieldNames[i]<<" = "<<formatFieldValues[i]<<endl; if(formatFieldNames[i] == "GT"){ indexGenotype =i; formatFieldGT= formatFieldValues[i]; bool determinedGenotype=false; //Taken from http://www.broadinstitute.org/gatk/guide/topic?name=intro if(formatFieldGT == "./."){ determinedGenotype=true; unresolvedGT=true; } if(formatFieldGT == "0"){ determinedGenotype=true; homozygousREF=true; haploidCall=true; } if(formatFieldGT == "1"){ determinedGenotype=true; homozygousALT=true; haploidCall=true; } if(formatFieldGT == "0/0"){ determinedGenotype=true; homozygousREF=true; } if(formatFieldGT == "0|0"){ determinedGenotype=true; homozygousREF=true; } if(formatFieldGT == "0/1"){ determinedGenotype=true; heterozygous=true; } if(formatFieldGT == "0|1"){ determinedGenotype=true; heterozygous=true; } if(formatFieldGT == 
"1|0"){ determinedGenotype=true; heterozygous=true; } if(formatFieldGT == "1/1"){ determinedGenotype=true; homozygousALT=true; } if(formatFieldGT == "1|1"){ determinedGenotype=true; homozygousALT=true; } if(formatFieldGT == "1/2"){ determinedGenotype=true; heterozygousALT=true; } //has first alt and second alt if(formatFieldGT == "1|2"){ determinedGenotype=true; heterozygousALT=true; } //has first alt and second alt if(formatFieldGT == "2/1"){ determinedGenotype=true; heterozygousALT=true; } //has first alt and second alt if(formatFieldGT == "2|1"){ determinedGenotype=true; heterozygousALT=true; } //has first alt and second alt if(formatFieldGT == "0|2"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref and second alt if(formatFieldGT == "0/2"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref and second alt if(formatFieldGT == "2/0"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref and second alt if(formatFieldGT == "2|0"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref and second alt if(formatFieldGT == "2/2"){ determinedGenotype=true; homozygous2ndALT=true; } //twice the second alt if(formatFieldGT == "2|2"){ determinedGenotype=true; homozygous2ndALT=true; } //twice the second alt //for more than 3 if(!determinedGenotype){ vector<string> fieldsOfGT = allTokens(formatFieldGT ,'/'); if(fieldsOfGT.size() == 2){ // int alleleCFirst = destringify<int> (fieldsOfGT[0]); // int alleleC2nd = destringify<int> (fieldsOfGT[1]); if(isPositiveInt(fieldsOfGT[0]) && isPositiveInt(fieldsOfGT[1]) ){ determinedGenotype=true; unresolvedGT=true; } }else{ vector<string> fieldsOfGT = allTokens(formatFieldGT ,'|'); if(fieldsOfGT.size() == 2){ // int alleleCFirst = destringify<int> (fieldsOfGT[0]); // int alleleC2nd = destringify<int> (fieldsOfGT[1]); if(isPositiveInt(fieldsOfGT[0]) && isPositiveInt(fieldsOfGT[1]) ){ determinedGenotype=true; unresolvedGT=true; } }else{ } } } // if(formatFieldGT == "0/3" || // formatFieldGT == "3/3" 
|| // formatFieldGT == "3/3" || // ){ determinedGenotype=true; unresolvedGT=true; if(!determinedGenotype){ cerr<<"SimpleVCF: unable to determine genotype for line "<<vectorToString(fields,"\t")<<" field=#"<<formatFieldGT<<"#"<<endl; exit(1); } continue; } if(formatFieldNames[i] == "GQ"){ if(formatFieldValues[i] == "."){ indexGenotypeQual =i; formatFieldGQ=0.0; }else{ indexGenotypeQual =i; formatFieldGQ=destringify<float>(formatFieldValues[i]); } continue; } if(formatFieldNames[i] == "DP"){ indexDepth =i; formatFieldDP=destringify<int> (formatFieldValues[i]); continue;} if(formatFieldNames[i] == "GL"){ observedGL=true; if(observedPL){ cerr<<"SimpleVCF: cannot observed both GL and PL "<<vectorToString(fields,"\t")<<""<<endl; exit(1); } indexPL = i; formatFieldGL = formatFieldValues[i]; vector<string> glfields = allTokens(formatFieldGL,','); if(glfields.size() == 2){ //haploid calls (e.g. X for a male) if(!haploidCall){ cerr<<"SimpleVCF: cannot observed 2 GL fields for a non-haploid record "<<vectorToString(fields,"\t")<<""<<endl; exit(1); } formatFieldPLHomoRef = int(-10.0*destringify<double>(glfields[0])); formatFieldPLHetero = -1000000; //very unlikely formatFieldPLHomoAlt = int(-10.0*destringify<double>(glfields[1])); }else{ if(glfields.size() == 3){ //biallelic formatFieldPLHomoRef = int(-10.0*destringify<double>(glfields[0])); formatFieldPLHetero = int(-10.0*destringify<double>(glfields[1])); formatFieldPLHomoAlt = int(-10.0*destringify<double>(glfields[2])); }else{ if(glfields.size() == 6){ //triallelic //according to VCF docs it has the following order AA,AB,BB,AC,BC,CC formatFieldPLHomoRef = int(-10.0*destringify<double>(glfields[0])); //r-r formatFieldPLHetero1 = int(-10.0*destringify<double>(glfields[1])); //r-a1 formatFieldPLHomoAlt1 = int(-10.0*destringify<double>(glfields[2])); //a1-a1 formatFieldPLHetero2 = int(-10.0*destringify<double>(glfields[3])); //r-a2 formatFieldPLHetero12 = int(-10.0*destringify<double>(glfields[4])); //a1-a2 
formatFieldPLHomoAlt2 = int(-10.0*destringify<double>(glfields[5])); //a2-a2 }else{ cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the GL field does not have 3 or 6 fields"<<endl; exit(1); } } } } if(formatFieldNames[i] == "PL"){ observedPL=true; if(observedGL){ cerr<<"SimpleVCF: cannot observed both GL and PL "<<vectorToString(fields,"\t")<<""<<endl; exit(1); } indexPL = i; if(formatFieldValues[i] == "."){ formatFieldPL = formatFieldValues[i]; unresolvedGT=true; continue; } formatFieldPL = formatFieldValues[i]; vector<string> plfields = allTokens(formatFieldPL,','); if(plfields.size() == 3){ //biallelic formatFieldPLHomoRef = destringify<int>(plfields[0]); formatFieldPLHetero = destringify<int>(plfields[1]); formatFieldPLHomoAlt = destringify<int>(plfields[2]); }else{ if(plfields.size() == 6){ //triallelic //according to VCF docs it has the following order AA,AB,BB,AC,BC,CC formatFieldPLHomoRef = destringify<int>(plfields[0]); //r-r formatFieldPLHetero1 = destringify<int>(plfields[1]); //r-a1 formatFieldPLHomoAlt1 = destringify<int>(plfields[2]); //a1-a1 formatFieldPLHetero2 = destringify<int>(plfields[3]); //r-a2 formatFieldPLHetero12 = destringify<int>(plfields[4]); //a1-a2 formatFieldPLHomoAlt2 = destringify<int>(plfields[5]); //a2-a2 }else{ cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the PL field does not have 3 or 6 fields"<<endl; exit(1); } } continue; } //To uncomment the fields to get these fields if(formatFieldNames[i] == "A"){ vector<string> adfield = allTokens( formatFieldValues[i] ,','); for(unsigned int j=0;j<adfield.size();j++){ countA.push_back( destringify<int>( adfield[j]) ); } continue; } if(formatFieldNames[i] == "C"){ vector<string> adfield = allTokens( formatFieldValues[i] ,','); for(unsigned int j=0;j<adfield.size();j++){ countC.push_back( destringify<int>( adfield[j]) ); } continue; } if(formatFieldNames[i] == "G"){ vector<string> adfield = allTokens( formatFieldValues[i] ,','); for(unsigned int 
j=0;j<adfield.size();j++){ countG.push_back( destringify<int>( adfield[j]) ); } continue; } if(formatFieldNames[i] == "T"){ vector<string> adfield = allTokens( formatFieldValues[i] ,','); for(unsigned int j=0;j<adfield.size();j++){ countT.push_back( destringify<int>( adfield[j]) ); } continue; } } } // cout<<getADforA()<<endl; // cout<<getADforC()<<endl; // cout<<getADforG()<<endl; // cout<<getADforT()<<endl; // cerr<<"end"<<endl; }
indexData intern_readIndex(string filename){ string line; ifstream myFile; indexData toReturn; toReturn.mlindex1=0; toReturn.mlindex2=0; bool isFirstLine =true; //initialize the values for the likelihood of matches or mismatches for(int i=0;i<64;i++){ if(i == 0) likeMatch[i] = -3.0; // this is vrong, hope it's never accessed else likeMatch[i] = log1p( -pow(10.0,i/-10.0) )/log(10); likeMismatch[i] = i/-10.0; #ifdef DEBUG2 cout<<"qual = "<<i<<endl; cout<<likeMatch[i]<<endl; cout<<likeMismatch[i]<<endl; #endif } //reading the files // myFile.open(filename.c_str(), ios::in); // if (myFile.is_open()){ vector<string> allLinesIndex = allTokens(filename,'\n'); //while ( getline (myFile,line)){ for(unsigned int i=0;i<allLinesIndex.size();i++){ line = allLinesIndex[i]; if(line.empty()) continue; line+=' '; // cerr<<"line #"<<line<<"#"<<toReturn.isDoubleIndex<<endl; if(isFirstLine){ if(line[0] == '#'){ unsigned int i=0; int numberOfFields=0; bool inWS=true; while(i<line.length()){ if( isspace(line[i])){ inWS=true; }else{ if(inWS){ numberOfFields++; } inWS=false; } i++; } if(numberOfFields==2){ toReturn.isDoubleIndex=false; }else{ if(numberOfFields==3 || numberOfFields==5){ toReturn.isDoubleIndex=true; }else{ cerr << "Must have 2, 3 or 5 fields"<<endl; exit(1); } } }else{ cerr << "First line must begin with #"<<endl; exit(1); } isFirstLine=false; }else{ int i=0; int fieldIndex=0; bool inWS=false; int lastOneNW=0; string foundName; while(i<int(line.length())){ if( isspace(line[i]) && i==0){ cerr<<line<<endl; cerr << "First character cannot be a space"<<endl; exit(1); } if( isspace(line[i]) ){ if(!inWS){ //found a field //first field, first index if(fieldIndex==0){ toReturn.indices1.push_back(toUpperCase(line.substr(lastOneNW,i-lastOneNW))); if(toReturn.mlindex1 < (i-lastOneNW)){ toReturn.mlindex1 =(i-lastOneNW); } }else{ //second field, either name of single ind or second index if(fieldIndex==1){ if(toReturn.isDoubleIndex){ 
toReturn.indices2.push_back(toUpperCase(line.substr(lastOneNW,i-lastOneNW))); if(toReturn.mlindex2 < (i-lastOneNW)){ toReturn.mlindex2 =(i-lastOneNW); } }else{ foundName=line.substr(lastOneNW,i-lastOneNW); //duplicated names ? if(toReturn.namesMap.find( foundName ) != toReturn.namesMap.end()){ cerr<<"Warning: The sequence name is duplicated "<<foundName<<endl; //exit(1); }else{ toReturn.namesMap[ foundName ] = ""; } toReturn.names.push_back( foundName ); } }else if(fieldIndex==2){ //sequence name when two indices if(toReturn.isDoubleIndex){ //duplicated names foundName=line.substr(lastOneNW,i-lastOneNW); if(toReturn.namesMap.find( foundName ) != toReturn.namesMap.end()){ cerr<<"Warning: The sequence name is duplicated "<<foundName<<endl; //exit(1); }else{ toReturn.namesMap[ foundName ] = ""; } toReturn.names.push_back( foundName ); }else{ //it's a comment for single index toReturn.namesMap[ foundName ] += line.substr(lastOneNW,i-lastOneNW); // cerr<<"Single index file cannot have 3 fields"<<endl; // exit(1); } }else{ //it's a comment again toReturn.namesMap[ foundName ] += line.substr(lastOneNW,i-lastOneNW); } } fieldIndex++; } inWS=true; }else{ if(inWS) lastOneNW=i; inWS=false; } i++; } //ending while(i<line.length()){ } // ending else firstline } // ending while myFile.good() ){ //checking for size // cout<<toReturn.indices1.size()<<endl; // cout<<toReturn.indices2.size()<<endl; // cout<<toReturn.names.size()<<endl; if(toReturn.isDoubleIndex) if((toReturn.indices1.size() != toReturn.indices2.size()) ){ cerr << "Size of the fields inconsistent "<<filename<<endl; exit(1); } if(toReturn.indices1.size() != toReturn.names.size() ){ cerr << "Size of the fields inconsistent "<<filename<<endl; exit(1); } //checking for valid dna for(unsigned int i=0;i<toReturn.indices1.size();i++){ if(!isValidDNA(toReturn.indices1[i])){ cerr << "Index " << toReturn.indices1[i] <<" is not a valid DNA sequence"<<endl; exit(1); } if(toReturn.isDoubleIndex) 
if(!isValidDNA(toReturn.indices2[i])){ cerr << "Index " << toReturn.indices2[i] <<" is not a valid DNA sequence"<<endl; exit(1); } } return toReturn; }
// Text-mode constructor: opens `file` through vcfFile and consumes its header
// up to and including the "#CHROM" line, from which the population names
// (columns 9 and above) are taken.
//
// file         path of the VCF file
// indelsAhead  stored in readAhead
//
// Empty lines in the header are skipped. Prints a message on stderr and calls
// exit(1) if the file cannot be opened, ends before the header is complete,
// does not start with '#', has a "#CHROM" line with fewer than 10 fields, or
// yields no population (which includes reaching a non-'#' line before
// "#CHROM").
MultiVCFreader::MultiVCFreader(string file,int indelsAhead) {
    readAhead=indelsAhead;
    numberOfTimesHasDataWasCalled=0;
    svcfToReturn=0;

    vcfFile.open(file.c_str(), ios::in); // open the streams
    if(!vcfFile.good()) {
        cerr<<"Unable to open the file "<<file<<endl;
        exit(1);
    }

    bool firstLine=true;
    bool haveCaptureCHROM=false;
    numPop=0;

    while(true) {
        // The stream is tested directly: its conversion to bool is explicit
        // since C++11, so it cannot be copy-initialised into a bool variable.
        if(!getline(vcfFile,currentline)) {
            cerr<<"ERROR file : "+file+" is probably empty"<<endl;
            exit(1);
        }

        if(firstLine) {
            if(currentline.length() > 0 && currentline[0] != '#') {
                cerr<<"ERROR first line in "<<file<<"does not begin with #"<<endl;
                exit(1);
            }
            firstLine = false;
        }

        if(currentline.empty())
            continue;

        // first non-header line reached: stop, the checks below report it
        if(currentline[0] != '#')
            break;

        if(strBeginsWith(currentline,"#CHROM")) {
            haveCaptureCHROM=true;
            vector<string> tok = allTokens(currentline,'\t');
            if(tok.size() < 10 ) {
                cerr<<"The header line"<<currentline<<" does not contain enough fields for file "<<file<<endl;
                exit(1);
            }
            for(unsigned int i=9; i<tok.size(); i++) {
                numPop++;
                populationNames.push_back(tok[i]);
            }
            break;
        }
    }//end while

    if( numPop == 0 ) {
        cerr<<"No populations have been found for file "<<file<<endl;
        exit(1);
    }

    if(!haveCaptureCHROM) {
        cerr<<"The header with #CHROM has not been found in file:"<<file<<endl;
        exit(1);
    }

    needToPopulateQueue = true;
    fullQueue           = false;
    endQueue            = false;
    repoCalledHasData   = false;

    indexInQueueOfIndels=-1;
    indexOfLastIndel=0;
    previouslyFoundIndel=false;

    tabixMode = false;
    textMode  = true;
}
bool MultiVCFreader::hasData() { if(repoCalledHasData) { repoCalledHasData=false; return true; } numberOfTimesHasDataWasCalled++; //if first call and queue empty, populate if(needToPopulateQueue) { //cout<<"hasData()"<<endl; bool loop=true; int indexQueue=0; while(loop) { if(getNextLine()) { #ifdef DEBUG cout<<"currentline "<<currentline<<endl; #endif vector<SimpleVCF *> * svcfvec = new vector<SimpleVCF *>(); vector<string> fieldTab = allTokens(currentline,'\t'); CoreVCF * corevcf = new CoreVCF(fieldTab); for(int k=0; k<numPop; k++) { SimpleVCF * svcf = new SimpleVCF (fieldTab,corevcf,k==0); svcfvec->push_back(svcf); } //SimpleVCF * svcfvec = new SimpleVCF(currentline); #ifdef DEBUG //cout<<"new1 "<<svcf<<endl; #endif if(queueOfVCFs.size() != 0 ) { flagCpG( queueOfVCFs.back()->at(0) , svcfvec->at(0) ); } // cout<<"Adding "<<*svcf<<endl; queueOfVCFs.push_back(svcfvec); if(svcfvec->at(0)->containsIndel()) { if(indexInQueueOfIndels == -1) indexInQueueOfIndels=indexQueue; } indexQueue++; if(queueOfVCFs.size() == (readAhead+1)) { loop=false; } } else { loop=false; } } if(queueOfVCFs.size() == (readAhead+1)) { //+1 for CPGs fullQueue=true; } else { endQueue=true; } needToPopulateQueue=false; } //if subsequent call, and queue full if(fullQueue) { if(getNextLine()) { // SimpleVCF * svcf = new SimpleVCF(currentline); vector<SimpleVCF *> * svcfvec = new vector<SimpleVCF *>(); vector<string> fieldTab = allTokens(currentline,'\t'); CoreVCF * corevcf = new CoreVCF(fieldTab); for(int k=0; k<numPop; k++) { SimpleVCF * svcf = new SimpleVCF (fieldTab,corevcf,k==0); svcfvec->push_back(svcf); } #ifdef DEBUG //cout<<"new2 "<<*svcf<<endl; #endif // cout<<"size "<<queueOfVCFs.size()<<endl; if(queueOfVCFs.size() != 0 ) { flagCpG( queueOfVCFs.back()->at(0),svcfvec->at(0)); } queueOfVCFs.push_back(svcfvec); if(svcfvec->at(0)->containsIndel()) { if(indexInQueueOfIndels == -1) indexInQueueOfIndels=queueOfVCFs.size()-1; } } else { fullQueue=false; endQueue=true; } } //if final calls and queue 
not max size if(endQueue) { //nothing to do } bool stillHasData=!(queueOfVCFs.empty()); // if(!stillHasData){ //getData() should not get called in this case, hence no deallocation // cout<<"delete1 "<<svcfToReturn<<endl; // delete svcfToReturn; // } return stillHasData; }
void readNucSubstitionFreq(const string filename,vector<probSubstition> & subVec){ igzstream subFP; subFP.open(filename.c_str(), ios::in); // unsigned int counterCont=0; if (subFP.good()){ vector<string> fields; string line; //header if ( !getline (subFP,line)){ cerr << "Unable to open file "<<filename<<endl; exit(1); } fields = allTokens(line,'\t'); if(fields.size() != 12){ cerr << "line from error profile does not have 12 fields "<<line<<endl; exit(1); } //probs while ( getline (subFP,line)){ fields = allTokens(line,'\t'); if(fields.size() != 12){ cerr << "line from error profile does not have 12 fields "<<line<<endl; exit(1); } substitutionRates tempFreq; probSubstition toaddSub; for(unsigned int k=0;k<=9;k+=3){ for(unsigned int t=0;t<=2;t++){ tempFreq.s[k+t]=destringify<double>(fields[k+t]); } } int indexFirstArray =0; int indexSecondArray=0; for(int nuc1=0;nuc1<4;nuc1++){ double sumMismatchProb=0.0; int indexInArrayMatch=1; for(int nuc2=0;nuc2<4;nuc2++){ if(nuc1==nuc2){ // prob of error is 0 if both nucleotides are identical indexInArrayMatch = indexFirstArray; toaddSub.s[indexFirstArray++] = 1.0; }else{ // rely on the substitution frequency sumMismatchProb += tempFreq.s[indexSecondArray]; toaddSub.s[indexFirstArray++] = tempFreq.s[indexSecondArray++]; } } toaddSub.s[indexInArrayMatch] = 1.0 - sumMismatchProb; } // for(int nuc1=0;nuc1<4;nuc1++){ // for(int nuc2=0;nuc2<4;nuc2++){ // cout<<(nuc1*4+nuc2)<<"\t"<<toaddSub.s[nuc1*4+nuc2]<<endl; // } // } // exit(1); subVec.push_back( toaddSub ); } subFP.close(); }else{ cerr << "Unable to open file "<<filename<<endl; exit(1); } }
void readIlluminaError(const string errFile,probSubstition & illuminaErrorsProb){ igzstream errFileSt; errFileSt.open(errFile.c_str(), ios::in); // unsigned int counterCont=0; if (errFileSt.good()){ vector<string> fields; string line; //header if ( !getline (errFileSt,line)){ cerr << "Unable to open file "<<errFile<<endl; exit(1); } fields = allTokens(line,'\t'); if(fields.size() != 12){ cerr << "line from error profile does not have 12 fields "<<line<<endl; exit(1); } //raw sums if ( !getline (errFileSt,line)){ cerr << "Unable to open file "<<errFile<<endl; exit(1); } fields = allTokens(line,'\t'); if(fields.size() != 12){ cerr << "line from error profile does not have 12 fields "<<line<<endl; exit(1); } //probs if ( !getline (errFileSt,line)){ cerr << "Unable to open file "<<errFile<<endl; exit(1); } fields = allTokens(line,'\t'); if(fields.size() != 12){ cerr << "line from error profile does not have 12 fields "<<line<<endl; exit(1); } substitutionRates tempFreq; for(unsigned int k=0;k<=9;k+=3){ for(unsigned int t=0;t<=2;t++){ tempFreq.s[k+t]=destringify<double>(fields[k+t]); //cerr<<freqIlluminaError.s[k+t]<<endl; } } int indexFirstArray =0; int indexSecondArray=0; for(int nuc1=0;nuc1<4;nuc1++){ for(int nuc2=0;nuc2<4;nuc2++){ if(nuc1==nuc2) // prob of error is 0 if both nucleotides are identical illuminaErrorsProb.s[indexFirstArray++]=0.0; else // rely on the substitution frequency illuminaErrorsProb.s[indexFirstArray++]=tempFreq.s[indexSecondArray++]; } } errFileSt.close(); }else{ cerr << "Unable to open file "<<errFile<<endl; exit(1); } }
bool MistarParser::hasData(){ if(numberOfTimesHasDataWasCalled!=-1){ // cout<<"delete"<<endl; //cerr<<"del "<<allRecToReturn<<endl; //delete(allRecToReturn->vectorAlleles); delete(allRecToReturn); numberdel++; }else{ numberOfTimesHasDataWasCalled=0; } numberOfTimesHasDataWasCalled++; // string line; //if(getline ( *myFilezipped,line)){ if(getNextLine()){ numbernew++; allRecToReturn = new AlleleRecords(); // cerr<<"new "<<allRecToReturn<<endl; //allRecToReturn->vectorAlleles = new vector<SingleAllele>(); // cout<<"currentline "<<currentline<<endl; vector<string> fields=allTokens(currentline,'\t'); if(fields.size() != (numberPopulations+3)){ cerr << "Error: MistarParser the following line should have "<<(numberPopulations+3)<<" fields " << currentline <<endl; exit(1); } if(fields[2].length() != 3){ cerr << "Error: MistarParser the following line " << currentline <<" does not have 2 comma separated alleles"<<endl; exit(1); } allRecToReturn->chr = fields[0]; allRecToReturn->coordinate = destringify<unsigned int>(fields[1]); allRecToReturn->ref = fields[2][0]; allRecToReturn->alt = fields[2][2]; if(allRecToReturn->ref == allRecToReturn->alt){ cerr << "Error: MistarParser the following line " << currentline <<" the reference is equal to the alt allele, exiting"<<endl; exit(1); } allRecToReturn->vectorAlleles = new vector<SingleAllele>(); for(unsigned int i=3;i<fields.size();i++){ unsigned int indexComma=0; unsigned int indexColon=0; for(unsigned int k=0;k<fields[i].size();k++){ if(fields[i][k]==',') indexComma=k; if(fields[i][k]==':') indexColon=k; } if(indexComma == 0 || indexColon == 0 ){ cerr << "Error: MistarParser problem with the following line " << currentline <<" cannot get allele count"<<endl; exit(1); } SingleAllele sa (destringify<int>( fields[i].substr(0,indexComma)), destringify<int>( fields[i].substr(indexComma+1,indexColon)), destringify<bool>(fields[i].substr(indexColon+1)) ); allRecToReturn->vectorAlleles->push_back(sa); } if( 
allRecToReturn->vectorAlleles->size() != numberPopulations){ cerr << "Error: MistarParser problem with the following line " << currentline <<" number of allele count read is not "<<numberPopulations<<endl; exit(1); } return true; }else{//if has no data if(textMode){ myFilezipped->close(); } // else // myFile->close(); return false; } return false; }