예제 #1
0
void readMTAlleleFreq(const string freqFile,	map<int, alleleFrequency> & pos2allelefreq){
    // map<int, alleleFrequency> pos2allelefreq;

    string line;
    igzstream freqAlleleFile;
    freqAlleleFile.open(freqFile.c_str());
    if (freqAlleleFile.good()){

	while ( getline (freqAlleleFile,line)){

	    vector<string> fields = allTokens(line,'\t');
	    alleleFrequency freqToadd;
	    
	    if(fields.size() != 5){
		cerr << "line "<<line<<"  in file  "<<freqFile<<" does not have 5 fields"<<endl;
		exit(1);
	    }
	   

	    for(int nuc=0;nuc<4;nuc++){
		freqToadd.f[nuc]=destringify<double>(fields[nuc+1]);
	    }

	    pos2allelefreq[ destringify<int>( fields[0])  ] = freqToadd;
	    	    
	}
	freqAlleleFile.close();

    }else{
	cerr << "Cannot open allele frequency file  "<<freqFile<<""<<endl;
	exit(1);
    }

}
예제 #2
0
void readMTConsensus(const string consensusFile,
		     map<int, PHREDgeno> & pos2phredgeno,
		     int & sizeGenome,
		     vector<int> & posOfIndels){

    string line;
    igzstream consensusFD;
    consensusFD.open(consensusFile.c_str());
    if (consensusFD.good()){
	getline (consensusFD,line);

	while ( getline (consensusFD,line)){
	    if (line.empty())
		continue;

	    vector<string> fields = allTokens(line,'\t');
	    PHREDgeno toadd;
	    // cerr<<line<<endl;


	    if(fields.size() != 11){
		cerr << "line "<<line<<"  in file  "<<consensusFile<<" does not have 11 fields"<<endl;
		exit(1);
	    }
	    

	    if(fields[0][fields[0].size()-1] == 'i'){ //skip insertion
		posOfIndels.push_back( destringify<int>( fields[0]) );
		continue;
	    }

	    if(fields[2] == "D"){ //skip deletions
		posOfIndels.push_back( destringify<int>( fields[0]) );
		continue;
	    }	    

	    toadd.consensus = fields[2][0];
	    for(int nuc=0;nuc<4;nuc++){		
		toadd.phred[nuc]  = destringify<double>(fields[nuc+7]);		
		toadd.perror[nuc] = pow(10.0,toadd.phred[nuc]/(-10.0));		
	    }

	    pos2phredgeno[     destringify<int>( fields[0])   ] = toadd;
	    sizeGenome =  max( destringify<int>( fields[0]), sizeGenome);
	    // cout<<destringify<int>( fields[0])<<endl;
	    
	}
	consensusFD.close();

    }else{
	cerr << "Cannot open consensus file  "<<consensusFile<<""<<endl;
	exit(1);
    }


}
예제 #3
0
/**
 * Builds a record from one raw tab-delimited line.
 *
 * The line is split on tabs, a CoreVCF is allocated from the resulting fields
 * and deleteCore is set to true (presumably so that this object releases the
 * CoreVCF later -- confirm against the destructor). The remaining state is
 * filled in by init().
 */
SimpleVCF::SimpleVCF(string line){
    vector<string> tabFields = allTokens(line,'\t');

    deleteCore = true;
    corevcf    = new CoreVCF(tabFields);

    init(tabFields,corevcf);
}
예제 #4
0
/**
 * Returns the stored header (without the definition line) with `prefix`
 * prepended to every non-empty line.
 *
 * headerNoDefline is split on '\n', empty pieces are dropped, and the prefixed
 * lines are joined again with "\n" through vectorToString().
 */
string MistarParser::getHeaderNoDefline(string prefix){
    const vector<string> headerLines = allTokens(headerNoDefline,'\n');

    vector<string> prefixed;
    for(vector<string>::const_iterator it = headerLines.begin(); it != headerLines.end(); ++it){
        if(it->empty())
            continue;
        prefixed.push_back(prefix + *it);
    }

    return vectorToString(prefixed,"\n");
}
예제 #5
0
//IMPLEMENT HEADER
/**
 * Tabix-mode constructor: reads a region of an indexed file through ReadTabix.
 *
 * The header returned by the ReadTabix object is scanned line by line; for
 * every line starting with "#CHROM" the columns from index 9 onwards are
 * appended to populationNames and counted in numPop.
 *
 * Terminates the program with exit(1) when a "#CHROM" line has fewer than 10
 * tab-separated fields or when no population column was found.
 *
 * @param file          data file handed to ReadTabix
 * @param indexForFile  index file handed to ReadTabix
 * @param chrName       chromosome name of the requested region
 * @param start         start of the requested region
 * @param end           end of the requested region
 * @param indelsAhead   stored in readAhead
 */
MultiVCFreader::MultiVCFreader(string file,string indexForFile,string chrName,int start,int end,int indelsAhead) {
    readAhead = indelsAhead;
    rt        = new ReadTabix (file,indexForFile,chrName,start,end);

    // Collect the population names from the column header line(s).
    numPop = 0;
    istringstream headerStream (rt->getHeader());
    string headerLine;
    while (getline(headerStream, headerLine)) {
        if(!strBeginsWith(headerLine,"#CHROM"))
            continue;

        vector<string> columns = allTokens(headerLine,'\t');
        if(columns.size() < 10 ) {
            cerr<<"The header line"<<headerLine<<" does not contain enough fields"<<endl;
            exit(1);
        }

        for(unsigned int col=9; col<columns.size(); ++col) {
            numPop++;
            populationNames.push_back(columns[col]);
        }
    }

    if( numPop == 0 ) {
        cerr<<"No populations have been found for file "<<file<<endl;
        exit(1);
    }

    // Queue state: nothing has been read yet.
    needToPopulateQueue           = true;
    fullQueue                     = false;
    endQueue                      = false;
    repoCalledHasData             = false;
    numberOfTimesHasDataWasCalled = 0;
    svcfToReturn                  = 0;

    // Indel bookkeeping.
    indexInQueueOfIndels          = -1;
    indexOfLastIndel              = 0;
    previouslyFoundIndel          = false;

    // Input mode.
    tabixMode                     = true;
    textMode                      = false;
}
예제 #6
0
/**
 * Consumes the header section of a MISTAR stream.
 *
 * Lines are read from `in` until the definition line (the one starting with
 * "#chr") has been processed:
 *   - the first line must be exactly "#MISTAR",
 *   - every header line must start with '#',
 *   - lines before the definition line are appended to both `header` and
 *     `headerNoDefline`,
 *   - the definition line is stored in `defline`, appended to `header`, and
 *     its first five fields must be "#chr", "coord", "REF,ALT", "root", "anc".
 *     Every field from index 3 onwards (so including "root" and "anc") is
 *     appended to populationNames and counted in numberPopulations.
 *
 * If the stream ends before a definition line is seen, the function returns
 * without reporting an error.
 *
 * Terminates the program with exit(1) on any malformed header line.
 *
 * @param in  stream positioned at the start of the header
 */
void MistarParser::parseHeader(istream & in){
    bool firstLine=true;
    string line;

    while(getline ( in,line)){
	// An empty line has no first character to test; treat it like any
	// other line that does not start with '#'.
	if(line.empty() || line[0] != '#'){
	    cerr << "Error: MistarParser cannot get header"  <<endl;
	    exit(1);
	}

	if(firstLine){
	    if(line != "#MISTAR"){
		cerr << "Error: MistarParser first line must be #MISTAR found: " << line <<endl;
		exit(1);
	    }
	    firstLine=false;
	    continue;
	}

	if(!strBeginsWith(line, "#chr")){
	    // Ordinary header line.
	    header+=line+"\n";
	    headerNoDefline+=line+"\n";
	    continue;
	}

	// Definition line.
	defline=line;
	vector<string> fields=allTokens(line,'\t');

	// The checks below index fields[0..4]; make sure they exist first.
	if(fields.size() < 5){
	    cerr<<"Error: MistarParser the header line #"<<line<<"# must have at least 5 tab-separated fields"<<endl;
	    exit(1);
	}

	if(fields[0] != "#chr")   { cerr<<"Field #1 ("<<fields[0]<<") of header must be #chr in line #"<<line<<"#"<<endl;    exit(1); }
	if(fields[1] != "coord")  { cerr<<"Field #2 of header must be coord ";   exit(1); }
	if(fields[2] != "REF,ALT"){ cerr<<"Field #3 of header must be REF,ALT "; exit(1); }
	if(fields[3] != "root")   { cerr<<"Field #4 of header must be root ";    exit(1); }
	if(fields[4] != "anc")    { cerr<<"Field #5 of header must be anc ";     exit(1); }

	for(unsigned int i=3;i<fields.size();i++){
	    populationNames->push_back(fields[i]);
	    numberPopulations++;
	}
	header+=line+"\n";

	break; // the definition line ends the header
    }
}
예제 #7
0
// Parses the FORMAT column and one sample column of a VCF record and fills the
// genotype-related members of this object.
//
// fields    : the tab-separated fields of the record.
// corevcf_  : NOTE(review): this parameter is not referenced anywhere in the
//             body; the member `corevcf` is used instead, so it must already
//             be set when init() is called -- confirm this is intended.
//
// The FORMAT names are taken from the field right after the INFO index reported
// by corevcf; the sample values come from the field index returned by
// corevcf->getFieldIndexAndIncrease() (presumably advancing to the next sample
// on each call -- confirm in CoreVCF).
//
// Terminates the program with exit(1) when the names/values counts differ, the
// genotype cannot be interpreted, both GL and PL are present, or GL/PL have an
// unsupported number of entries.
void SimpleVCF::init(const vector<string> & fields, CoreVCF *  corevcf_){ //string line){


    // Reset the genotype flags handled here.
    // NOTE(review): heterozygousALT, heterozygous2ndALT and homozygous2ndALT are
    // set further down but are not reset in this function.
    unresolvedGT=false;
    homozygousREF=false;
    heterozygous=false;
    homozygousALT=false;

    // -1 means "this FORMAT key was not seen".
    indexGenotype=-1; 
    indexGenotypeQual=-1; 
    indexDepth=-1;    
    indexPL=-1;       

    typeOfData=1;


    // fields=allTokens(line,'\t');
    // corevcf = corevcf_;
    
    int fieldIndex  = corevcf->getFieldIndexAndIncrease();
    // cerr<<"fieldIndex "<<fieldIndex<<endl;

    //FORMAT FIELDS
    rawFormatNames  = fields[ corevcf->getFieldIndexINFO()+1 ];
    rawFormatValues = fields[fieldIndex];

    // cerr<<"rawFormatNames  "<<rawFormatNames<<endl;
    // cerr<<"rawFormatValues "<<rawFormatValues<<endl;

    // Both columns are ':'-separated lists that are matched up by position.
    formatFieldNames  = allTokens(rawFormatNames ,':');
    formatFieldValues = allTokens(rawFormatValues,':');
    
    // A sample column that is exactly "./." is flagged as unresolved without
    // looking at any FORMAT key.
    if(rawFormatValues == "./."){
	unresolvedGT=true; 

	observedPL=false;
	observedGL=false;
	haploidCall=false;
    }else{

    if(formatFieldNames.size() != formatFieldValues.size()){
	cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the format field does not have as many fields as the values"<<endl;
	exit(1);
    }

    observedPL=false;
    observedGL=false;
    haploidCall=false;
    // Walk over the FORMAT keys in file order. The order matters: the GL branch
    // below reads haploidCall, which is only set by the GT branch.
    for(unsigned int i=0;i<formatFieldNames.size();i++){
	 // cerr<<"formatFieldNames["<<i<<"] "<<formatFieldNames[i]<<" = "<<formatFieldValues[i]<<endl;
	// GT: genotype string; mapped onto the boolean genotype flags.
	if(formatFieldNames[i] == "GT"){ 
	    indexGenotype     =i; 
	    formatFieldGT=                   formatFieldValues[i]; 
	    bool determinedGenotype=false;
	    //Taken from http://www.broadinstitute.org/gatk/guide/topic?name=intro
	    if(formatFieldGT == "./."){ determinedGenotype=true; unresolvedGT=true;       }

	    // Single-allele genotypes are recorded as haploid calls.
	    if(formatFieldGT == "0"){   determinedGenotype=true; homozygousREF=true;   haploidCall=true;   }
	    if(formatFieldGT == "1"){   determinedGenotype=true; homozygousALT=true;   haploidCall=true;   }

	    if(formatFieldGT == "0/0"){ determinedGenotype=true; homozygousREF=true;      }
	    if(formatFieldGT == "0|0"){ determinedGenotype=true; homozygousREF=true;      }

	    if(formatFieldGT == "0/1"){ determinedGenotype=true; heterozygous=true;       }
	    if(formatFieldGT == "0|1"){ determinedGenotype=true; heterozygous=true;       }
	    if(formatFieldGT == "1|0"){ determinedGenotype=true; heterozygous=true;       }


	    if(formatFieldGT == "1/1"){ determinedGenotype=true; homozygousALT=true;      }
	    if(formatFieldGT == "1|1"){ determinedGenotype=true; homozygousALT=true;      }

	    if(formatFieldGT == "1/2"){ determinedGenotype=true; heterozygousALT=true;    } //has first alt and second alt
	    if(formatFieldGT == "1|2"){ determinedGenotype=true; heterozygousALT=true;    } //has first alt and second alt

	    if(formatFieldGT == "2/1"){ determinedGenotype=true; heterozygousALT=true;    } //has first alt and second alt
	    if(formatFieldGT == "2|1"){ determinedGenotype=true; heterozygousALT=true;    } //has first alt and second alt

	    if(formatFieldGT == "0|2"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref       and second alt
	    if(formatFieldGT == "0/2"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref       and second alt

	    if(formatFieldGT == "2/0"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref       and second alt
	    if(formatFieldGT == "2|0"){ determinedGenotype=true; heterozygous2ndALT=true; } //has ref       and second alt

	    if(formatFieldGT == "2/2"){ determinedGenotype=true; homozygous2ndALT=true; }   //twice the second alt
	    if(formatFieldGT == "2|2"){ determinedGenotype=true; homozygous2ndALT=true; }   //twice the second alt

	    //for more than 3

	    // Fallback: any other pair of values accepted by isPositiveInt(),
	    // separated by '/' or '|', is treated as an unresolved genotype.
	    // The '|' split is only tried when the '/' split did not yield
	    // exactly two pieces.
	    if(!determinedGenotype){

		vector<string> fieldsOfGT  = allTokens(formatFieldGT ,'/');
	    
		if(fieldsOfGT.size() == 2){
		    // int alleleCFirst = destringify<int>  (fieldsOfGT[0]);
		    // int alleleC2nd   = destringify<int>  (fieldsOfGT[1]);
		    if(isPositiveInt(fieldsOfGT[0]) &&
		       isPositiveInt(fieldsOfGT[1])    ){
			determinedGenotype=true; unresolvedGT=true; 
		    }		   
		}else{
		    vector<string> fieldsOfGT  = allTokens(formatFieldGT ,'|');
	    
		    if(fieldsOfGT.size() == 2){
			// int alleleCFirst = destringify<int>  (fieldsOfGT[0]);
			// int alleleC2nd   = destringify<int>  (fieldsOfGT[1]);
			if(isPositiveInt(fieldsOfGT[0]) &&
			   isPositiveInt(fieldsOfGT[1])   ){
			    determinedGenotype=true; unresolvedGT=true; 
			}		   
		    }else{

		    }

		}
	    }
	    // if(formatFieldGT == "0/3" ||
	    //    formatFieldGT == "3/3" ||
	    //    formatFieldGT == "3/3" ||
	       
	       

	    //    ){ determinedGenotype=true; unresolvedGT=true; 

	    if(!determinedGenotype){
		cerr<<"SimpleVCF: unable to determine genotype for line "<<vectorToString(fields,"\t")<<" field=#"<<formatFieldGT<<"#"<<endl;
		exit(1);
	    }
	    continue;
	}

	// GQ: genotype quality; a "." value is stored as 0.0.
	if(formatFieldNames[i] == "GQ"){ 
	    if(formatFieldValues[i] == "."){
		indexGenotypeQual =i; 
		formatFieldGQ=0.0;
	    }else{
		indexGenotypeQual =i; 
		formatFieldGQ=destringify<float>(formatFieldValues[i]);
	    } 
	    continue; }
	// DP: depth, parsed as an int.
	if(formatFieldNames[i] == "DP"){ indexDepth        =i; formatFieldDP=destringify<int>  (formatFieldValues[i]); continue;}

	// GL: comma-separated likelihoods, stored in the PL members after being
	// multiplied by -10 and truncated to int. Mutually exclusive with PL.
	// Unlike the other branches this one has no `continue`; control falls
	// through to the remaining name comparisons, none of which can match "GL".
	if(formatFieldNames[i] == "GL"){ 
	    observedGL=true;
	    if(observedPL){
		cerr<<"SimpleVCF: cannot observed both GL and PL "<<vectorToString(fields,"\t")<<""<<endl;
		exit(1);
	    }

	    indexPL        = i; 
	    formatFieldGL  = formatFieldValues[i];
	    vector<string> glfields = allTokens(formatFieldGL,',');

	    if(glfields.size() == 2){ //haploid calls (e.g. X for a male)
		// Only accepted when a GT of "0" or "1" was seen earlier in
		// this loop (that is the only place haploidCall becomes true).
		if(!haploidCall){
		    cerr<<"SimpleVCF: cannot observed 2 GL fields for a non-haploid record "<<vectorToString(fields,"\t")<<""<<endl;
		    exit(1);
		}
		formatFieldPLHomoRef =  int(-10.0*destringify<double>(glfields[0]));
		formatFieldPLHetero  =  -1000000; //very unlikely
		formatFieldPLHomoAlt =  int(-10.0*destringify<double>(glfields[1]));
		    
	    }else{
		if(glfields.size() == 3){ //biallelic

		    formatFieldPLHomoRef =  int(-10.0*destringify<double>(glfields[0]));
		    formatFieldPLHetero  =  int(-10.0*destringify<double>(glfields[1]));
		    formatFieldPLHomoAlt =  int(-10.0*destringify<double>(glfields[2]));

		}else{
		    if(glfields.size() == 6){ //triallelic
			//according to VCF docs it has the following order AA,AB,BB,AC,BC,CC
			formatFieldPLHomoRef  =  int(-10.0*destringify<double>(glfields[0])); //r-r

			formatFieldPLHetero1  =  int(-10.0*destringify<double>(glfields[1])); //r-a1
			formatFieldPLHomoAlt1 =  int(-10.0*destringify<double>(glfields[2])); //a1-a1

			formatFieldPLHetero2  =  int(-10.0*destringify<double>(glfields[3])); //r-a2

			formatFieldPLHetero12 =  int(-10.0*destringify<double>(glfields[4])); //a1-a2
			formatFieldPLHomoAlt2 =  int(-10.0*destringify<double>(glfields[5])); //a2-a2

		    }else{
			// NOTE(review): the message mentions 3 or 6 entries, but
			// 2 entries are also accepted above for haploid calls.
			cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the GL field does not have 3 or 6 fields"<<endl;
			exit(1);
		    }
		}
	    }
	}


	// PL: comma-separated integer values. Mutually exclusive with GL.
	if(formatFieldNames[i] == "PL"){ 
	    observedPL=true;

	    if(observedGL){
		cerr<<"SimpleVCF: cannot observed both GL and PL "<<vectorToString(fields,"\t")<<""<<endl;
		exit(1);
	    }

	    indexPL        = i; 

	    // A "." PL marks the genotype as unresolved; nothing is parsed.
	    if(formatFieldValues[i] == "."){
		formatFieldPL = formatFieldValues[i];
		unresolvedGT=true; 
		continue;
	    }

	    formatFieldPL  = formatFieldValues[i];
	    vector<string> plfields = allTokens(formatFieldPL,',');

	    if(plfields.size() == 3){ //biallelic
		formatFieldPLHomoRef =  destringify<int>(plfields[0]);
		formatFieldPLHetero  =  destringify<int>(plfields[1]);
		formatFieldPLHomoAlt =  destringify<int>(plfields[2]);

	    }else{
		if(plfields.size() == 6){ //triallelic
		    //according to VCF docs it has the following order AA,AB,BB,AC,BC,CC
		    formatFieldPLHomoRef  =  destringify<int>(plfields[0]); //r-r

		    formatFieldPLHetero1  =  destringify<int>(plfields[1]); //r-a1
		    formatFieldPLHomoAlt1 =  destringify<int>(plfields[2]); //a1-a1

		    formatFieldPLHetero2  =  destringify<int>(plfields[3]); //r-a2

		    formatFieldPLHetero12 =  destringify<int>(plfields[4]); //a1-a2
		    formatFieldPLHomoAlt2 =  destringify<int>(plfields[5]); //a2-a2

		}else{
		    cerr<<"SimpleVCF: for line "<<vectorToString(fields,"\t")<<" the PL field does not have 3 or 6 fields"<<endl;
		    exit(1);
		}
	    }
	    continue;
	}

	//To uncomment the fields to get these fields
	// Keys "A", "C", "G" and "T": comma-separated integers appended to
	// countA / countC / countG / countT respectively.
	if(formatFieldNames[i] == "A"){   
	    vector<string> adfield = allTokens( formatFieldValues[i] ,',');
	    for(unsigned int j=0;j<adfield.size();j++){
		countA.push_back(   destringify<int>( adfield[j]) );
	    }
	    continue;
	}

	if(formatFieldNames[i] == "C"){   
	    vector<string> adfield = allTokens( formatFieldValues[i] ,',');
	    for(unsigned int j=0;j<adfield.size();j++){
		countC.push_back(   destringify<int>( adfield[j]) );
	    }
	    continue;
	}

	if(formatFieldNames[i] == "G"){   
	    vector<string> adfield = allTokens( formatFieldValues[i] ,',');
	    for(unsigned int j=0;j<adfield.size();j++){
		countG.push_back(   destringify<int>( adfield[j]) );
	    }
	    continue;
	}

	if(formatFieldNames[i] == "T"){   
	    vector<string> adfield = allTokens( formatFieldValues[i] ,',');
	    for(unsigned int j=0;j<adfield.size();j++){
		countT.push_back(   destringify<int>( adfield[j]) );
	    }
	    continue;
	}
	    
	// Any other FORMAT key is ignored.


    }
    }
    // cout<<getADforA()<<endl;
    // cout<<getADforC()<<endl;
    // cout<<getADforG()<<endl;
    // cout<<getADforT()<<endl;

    // cerr<<"end"<<endl;

}
예제 #8
0
/**
 * Parses the text of an index table and returns it as an indexData record.
 *
 * Despite its name, `filename` holds the CONTENT of the index table: it is
 * split on '\n' and parsed directly, no file is opened here.
 *
 * Side effect: the global tables likeMatch[0..63] and likeMismatch[0..63] are
 * (re)initialised on every call.
 *
 * Expected layout (fields separated by any whitespace, empty lines skipped):
 *   - the first non-empty line starts with '#'; its number of fields selects
 *     the mode: 2 -> single index, 3 or 5 -> double index;
 *   - single index: <index1> <name> [comment ...]
 *   - double index: <index1> <index2> <name> [comment ...]
 * Indices are upper-cased. Comment tokens are concatenated (without a
 * separator) into namesMap[name]. A duplicated name only triggers a warning.
 *
 * Terminates the program with exit(1) when the header is malformed, a data
 * line starts with whitespace, the number of indices and names disagree, or
 * an index is not accepted by isValidDNA().
 *
 * @param filename  text of the index table (see above)
 * @return the parsed indices, names and maximum index lengths
 */
indexData intern_readIndex(string filename){
    indexData toReturn;
    toReturn.mlindex1=0;
    toReturn.mlindex2=0;
    // Give the flag a defined value: it is read after the parsing loop even
    // when the input contains no non-empty line at all.
    toReturn.isDoubleIndex=false;

    //initialize the values for the likelihood of matches or mismatches
    for(int i=0;i<64;i++){
	if(i == 0)
	    likeMatch[i]    = -3.0; // this is wrong, hope it's never accessed
	else
	    likeMatch[i]    = log1p( -pow(10.0,i/-10.0) )/log(10);

	likeMismatch[i]     = i/-10.0;
#ifdef DEBUG2
	cout<<"qual = "<<i<<endl;
	cout<<likeMatch[i]<<endl;
	cout<<likeMismatch[i]<<endl;
#endif
    }

    vector<string> allLinesIndex = allTokens(filename,'\n');
    bool isFirstLine = true;

    for(unsigned int lineNo=0;lineNo<allLinesIndex.size();lineNo++){
	string line = allLinesIndex[lineNo];
	if(line.empty())
	    continue;
	line+=' '; // sentinel so that the last token is terminated by whitespace

	if(isFirstLine){
	    if(line[0] != '#'){
		cerr << "First line must begin with #"<<endl;
		exit(1);
	    }

	    // Count the whitespace-separated fields of the header.
	    int numberOfFields=0;
	    bool inWS=true;
	    for(unsigned int pos=0;pos<line.length();pos++){
		// isspace() requires a value representable as unsigned char.
		if( isspace(static_cast<unsigned char>(line[pos])) ){
		    inWS=true;
		}else{
		    if(inWS)
			numberOfFields++;
		    inWS=false;
		}
	    }

	    if(numberOfFields==2){
		toReturn.isDoubleIndex=false;
	    }else if(numberOfFields==3 || numberOfFields==5){
		toReturn.isDoubleIndex=true;
	    }else{
		cerr << "Must have 2, 3 or 5 fields"<<endl;
		exit(1);
	    }

	    isFirstLine=false;
	    continue;
	}

	// Data line.
	if( isspace(static_cast<unsigned char>(line[0])) ){
	    cerr<<line<<endl;
	    cerr << "First character cannot be a space"<<endl;
	    exit(1);
	}

	int fieldIndex=0;
	bool inWS=false;
	int tokenStart=0;
	string foundName; // name of the entry described by this line

	for(int pos=0;pos<int(line.length());pos++){

	    if( !isspace(static_cast<unsigned char>(line[pos])) ){
		if(inWS) // first character of a new token
		    tokenStart=pos;
		inWS=false;
		continue;
	    }

	    if(inWS) // still inside a run of whitespace
		continue;
	    inWS=true;

	    // The whitespace at `pos` terminates the token [tokenStart,pos).
	    const int    tokenLen = pos-tokenStart;
	    const string token    = line.substr(tokenStart,tokenLen);

	    const bool isSecondIndex = (fieldIndex==1 &&  toReturn.isDoubleIndex);
	    const bool isName        = (fieldIndex==1 && !toReturn.isDoubleIndex) ||
		                       (fieldIndex==2 &&  toReturn.isDoubleIndex);

	    if(fieldIndex==0){
		//first field, first index
		toReturn.indices1.push_back(toUpperCase(token));
		if(toReturn.mlindex1 < tokenLen){
		    toReturn.mlindex1 = tokenLen;
		}
	    }else if(isSecondIndex){
		toReturn.indices2.push_back(toUpperCase(token));
		if(toReturn.mlindex2 < tokenLen){
		    toReturn.mlindex2 = tokenLen;
		}
	    }else if(isName){
		foundName=token;
		//duplicated names ?
		if(toReturn.namesMap.find(  foundName  ) !=  toReturn.namesMap.end()){
		    cerr<<"Warning: The sequence name is duplicated "<<foundName<<endl;
		}else{
		    toReturn.namesMap[ foundName ] = "";
		}
		toReturn.names.push_back( foundName );
	    }else{
		//everything after the name is a comment
		toReturn.namesMap[ foundName ] +=  token;
	    }

	    fieldIndex++;
	}
    }

    //checking for size
    if(toReturn.isDoubleIndex)
	if((toReturn.indices1.size() != toReturn.indices2.size()) ){
	    cerr << "Size of the fields inconsistent "<<filename<<endl;
	    exit(1);
	}

    if(toReturn.indices1.size() != toReturn.names.size() ){
	cerr << "Size of the fields inconsistent "<<filename<<endl;
	exit(1);
    }

    //checking for valid dna
    for(unsigned int i=0;i<toReturn.indices1.size();i++){
	if(!isValidDNA(toReturn.indices1[i])){
	    cerr << "Index " << toReturn.indices1[i] <<" is not a valid DNA sequence"<<endl;
	    exit(1);
	}
	if(toReturn.isDoubleIndex)
	    if(!isValidDNA(toReturn.indices2[i])){
		cerr << "Index " << toReturn.indices2[i] <<" is not a valid DNA sequence"<<endl;
		exit(1);
	    }
    }
    return toReturn;
}
예제 #9
0
/**
 * Text-mode constructor: opens `file` through the vcfFile stream and reads its
 * header up to (and including) the "#CHROM" line.
 *
 * The columns of the "#CHROM" line from index 9 onwards are appended to
 * populationNames and counted in numPop. Reading stops at the "#CHROM" line,
 * or at the first non-empty line that does not start with '#'.
 *
 * Terminates the program with exit(1) when:
 *   - the file cannot be opened,
 *   - the stream ends before the header scan stops,
 *   - the first line is non-empty and does not start with '#',
 *   - the "#CHROM" line has fewer than 10 tab-separated fields,
 *   - no population column was found.
 *
 * @param file         path of the text file to read
 * @param indelsAhead  stored in readAhead
 */
MultiVCFreader::MultiVCFreader(string file,int indelsAhead) {
    readAhead=indelsAhead;
    numberOfTimesHasDataWasCalled=0;
    svcfToReturn=0;

    vcfFile.open(file.c_str(), ios::in);    // open the streams
    if (!vcfFile.good()) {
        cerr<<"Unable to open the file "<<file<<endl;
        exit(1);
    }

    bool firstLine=true;
    bool haveCaptureCHROM=false;
    numPop=0;

    while(true) {
        // Test the stream in a boolean context instead of assigning it to a
        // bool: the stream-to-bool conversion is explicit since C++11.
        if(!getline(vcfFile,currentline)) {
            cerr<<"ERROR file : "+file+" is probably empty"<<endl;
            exit(1);
        }

        if(firstLine) {
            if(!currentline.empty() && currentline[0] != '#') {
                cerr<<"ERROR first line in "<<file<<"does not begin with #"<<endl;
                exit(1);
            }
            firstLine = false;
        }

        if(currentline.empty())
            continue;               // blank lines are ignored

        if(currentline[0] != '#')
            break;                  // first non-header line: stop scanning

        if(!strBeginsWith(currentline,"#CHROM"))
            continue;               // other header line

        haveCaptureCHROM=true;
        vector<string> tok = allTokens(currentline,'\t');
        if(tok.size() < 10 ) {
            cerr<<"The header line"<<currentline<<" does not contain enough fields for file "<<file<<endl;
            exit(1);
        }

        for(unsigned int i=9; i<tok.size(); i++) {
            numPop++;
            populationNames.push_back(tok[i]);
        }
        break;
    }

    if( numPop == 0 ) {
        cerr<<"No populations have been found for file "<<file<<endl;
        exit(1);
    }

    // Kept for safety; with the check above this can only trigger if numPop
    // was raised without seeing a "#CHROM" line.
    if(!haveCaptureCHROM) {
        cerr<<"The header with #CHROM has not been found in file:"<<file<<endl;
        exit(1);
    }

    needToPopulateQueue = true;
    fullQueue           = false;
    endQueue            = false;
    repoCalledHasData   = false;

    indexInQueueOfIndels=-1;
    indexOfLastIndel=0;
    previouslyFoundIndel=false;

    tabixMode = false;
    textMode  = true;
}
예제 #10
0
bool MultiVCFreader::hasData() {

    if(repoCalledHasData) {
        repoCalledHasData=false;
        return true;
    }

    numberOfTimesHasDataWasCalled++;


    //if first call and queue empty, populate
    if(needToPopulateQueue) {
        //cout<<"hasData()"<<endl;
        bool loop=true;
        int indexQueue=0;
        while(loop) {
            if(getNextLine()) {
#ifdef DEBUG
                cout<<"currentline "<<currentline<<endl;
#endif
                vector<SimpleVCF *> *  svcfvec = new vector<SimpleVCF *>();
                vector<string> fieldTab = allTokens(currentline,'\t');
                CoreVCF * corevcf =  new CoreVCF(fieldTab);
                for(int k=0; k<numPop; k++) {
                    SimpleVCF * svcf = new  SimpleVCF (fieldTab,corevcf,k==0);
                    svcfvec->push_back(svcf);
                }

                //SimpleVCF * svcfvec = new SimpleVCF(currentline);
#ifdef DEBUG
                //cout<<"new1 "<<svcf<<endl;
#endif

                if(queueOfVCFs.size() != 0 ) {
                    flagCpG( queueOfVCFs.back()->at(0) , svcfvec->at(0) );
                }
                // cout<<"Adding "<<*svcf<<endl;
                queueOfVCFs.push_back(svcfvec);

                if(svcfvec->at(0)->containsIndel()) {
                    if(indexInQueueOfIndels == -1)
                        indexInQueueOfIndels=indexQueue;
                }
                indexQueue++;
                if(queueOfVCFs.size() == (readAhead+1)) {
                    loop=false;
                }
            } else {
                loop=false;
            }
        }


        if(queueOfVCFs.size() == (readAhead+1)) { //+1 for CPGs
            fullQueue=true;
        } else {
            endQueue=true;
        }

        needToPopulateQueue=false;
    }

    //if subsequent call, and queue full
    if(fullQueue) {
        if(getNextLine()) {
            // SimpleVCF * svcf = new  SimpleVCF(currentline);

            vector<SimpleVCF *> *  svcfvec = new vector<SimpleVCF *>();
            vector<string> fieldTab = allTokens(currentline,'\t');
            CoreVCF * corevcf =  new CoreVCF(fieldTab);
            for(int k=0; k<numPop; k++) {
                SimpleVCF * svcf = new SimpleVCF (fieldTab,corevcf,k==0);
                svcfvec->push_back(svcf);
            }

#ifdef DEBUG
            //cout<<"new2 "<<*svcf<<endl;
#endif
            // cout<<"size "<<queueOfVCFs.size()<<endl;
            if(queueOfVCFs.size() != 0 ) {
                flagCpG( queueOfVCFs.back()->at(0),svcfvec->at(0));
            }

            queueOfVCFs.push_back(svcfvec);

            if(svcfvec->at(0)->containsIndel()) {
                if(indexInQueueOfIndels == -1)
                    indexInQueueOfIndels=queueOfVCFs.size()-1;
            }

        } else {
            fullQueue=false;
            endQueue=true;
        }

    }

    //if final calls and queue not max size
    if(endQueue) {
        //nothing to do
    }


    bool stillHasData=!(queueOfVCFs.empty());

    // if(!stillHasData){ //getData() should not get called in this case, hence no deallocation
    // 	cout<<"delete1  "<<svcfToReturn<<endl;
    // 	delete svcfToReturn;
    // }

    return stillHasData;
}
예제 #11
0
void readNucSubstitionFreq(const string filename,vector<probSubstition> & subVec){
    igzstream subFP;

    subFP.open(filename.c_str(), ios::in);

    //    unsigned int counterCont=0;
    if (subFP.good()){
	vector<string> fields;
	string line;

	//header
	if ( !getline (subFP,line)){
	    cerr << "Unable to open file "<<filename<<endl;
	    exit(1);
	}
	fields = allTokens(line,'\t');
	
	if(fields.size() != 12){
	    cerr << "line from error profile does not have 12 fields "<<line<<endl;
	    exit(1);
	}


	//probs
	while ( getline (subFP,line)){
	    
	    fields = allTokens(line,'\t');

	    if(fields.size() != 12){
		cerr << "line from error profile does not have 12 fields "<<line<<endl;
		exit(1);
	    }

	    substitutionRates tempFreq;	
	    probSubstition toaddSub;


	    for(unsigned int k=0;k<=9;k+=3){	

		for(unsigned int t=0;t<=2;t++){	
		    tempFreq.s[k+t]=destringify<double>(fields[k+t]);
		}

	    }


	    int indexFirstArray =0;
	    int indexSecondArray=0;

	    for(int nuc1=0;nuc1<4;nuc1++){
		double sumMismatchProb=0.0;
		int indexInArrayMatch=1;
		for(int nuc2=0;nuc2<4;nuc2++){
		    if(nuc1==nuc2){ // prob of error is 0 if both nucleotides are identical
			indexInArrayMatch                       = indexFirstArray;
			toaddSub.s[indexFirstArray++]           = 1.0;		    
		    }else{ //           rely on the substitution frequency
			sumMismatchProb                         += tempFreq.s[indexSecondArray];
			toaddSub.s[indexFirstArray++]            = tempFreq.s[indexSecondArray++];
		    }
		}

		toaddSub.s[indexInArrayMatch] = 1.0 - sumMismatchProb;
	    }

	    // for(int nuc1=0;nuc1<4;nuc1++){
	    // 	for(int nuc2=0;nuc2<4;nuc2++){
	    // 	    cout<<(nuc1*4+nuc2)<<"\t"<<toaddSub.s[nuc1*4+nuc2]<<endl;
	    // 	}	       
	    // }
	    
	    // exit(1);

	    subVec.push_back( toaddSub );
	}	             	              
	subFP.close();
    }else{
	cerr << "Unable to open file "<<filename<<endl;
	exit(1);
    }



}
예제 #12
0
void readIlluminaError(const string errFile,probSubstition & illuminaErrorsProb){

    igzstream errFileSt;

    errFileSt.open(errFile.c_str(), ios::in);

    //    unsigned int counterCont=0;
    if (errFileSt.good()){
	vector<string> fields;
	string line;
	//header
	if ( !getline (errFileSt,line)){
	    cerr << "Unable to open file "<<errFile<<endl;
	    exit(1);
	}
	fields = allTokens(line,'\t');
	
	if(fields.size() != 12){
	    cerr << "line from error profile does not have 12 fields "<<line<<endl;
	    exit(1);
	}

	//raw sums
	if ( !getline (errFileSt,line)){
	    cerr << "Unable to open file "<<errFile<<endl;
	    exit(1);
	}
	
	fields = allTokens(line,'\t');

	if(fields.size() != 12){
	    cerr << "line from error profile does not have 12 fields "<<line<<endl;
	    exit(1);
	}

	//probs
	if ( !getline (errFileSt,line)){
	    cerr << "Unable to open file "<<errFile<<endl;
	    exit(1);
	}
	
	fields = allTokens(line,'\t');

	if(fields.size() != 12){
	    cerr << "line from error profile does not have 12 fields "<<line<<endl;
	    exit(1);
	}
	substitutionRates tempFreq;	
	    
	
	for(unsigned int k=0;k<=9;k+=3){	

	    for(unsigned int t=0;t<=2;t++){	
		tempFreq.s[k+t]=destringify<double>(fields[k+t]);
		//cerr<<freqIlluminaError.s[k+t]<<endl;
	    }

	}


	int indexFirstArray =0;
	int indexSecondArray=0;

	for(int nuc1=0;nuc1<4;nuc1++){
	    for(int nuc2=0;nuc2<4;nuc2++){
		if(nuc1==nuc2) // prob of error is 0 if both nucleotides are identical
		    illuminaErrorsProb.s[indexFirstArray++]=0.0;
		else //           rely on the substitution frequency
		    illuminaErrorsProb.s[indexFirstArray++]=tempFreq.s[indexSecondArray++];
	    }
	}
	
	             	              
	errFileSt.close();
    }else{
	cerr << "Unable to open file "<<errFile<<endl;
	exit(1);
    }



}
예제 #13
0
bool MistarParser::hasData(){

    if(numberOfTimesHasDataWasCalled!=-1){
	//	cout<<"delete"<<endl;
	//cerr<<"del "<<allRecToReturn<<endl;
	//delete(allRecToReturn->vectorAlleles);
	delete(allRecToReturn);
	numberdel++;
    }else{
	numberOfTimesHasDataWasCalled=0;
    }
    
    numberOfTimesHasDataWasCalled++;
    //    string line;
    //if(getline ( *myFilezipped,line)){
    if(getNextLine()){
	numbernew++;
	allRecToReturn                = new AlleleRecords();
	//	cerr<<"new "<<allRecToReturn<<endl;
	//allRecToReturn->vectorAlleles = new vector<SingleAllele>();
	// cout<<"currentline "<<currentline<<endl;
	
	vector<string> fields=allTokens(currentline,'\t');

	if(fields.size() != (numberPopulations+3)){
	    cerr << "Error: MistarParser the following line should have "<<(numberPopulations+3)<<" fields " << currentline <<endl;
	    exit(1);	   
	}
	if(fields[2].length() != 3){
	    cerr << "Error: MistarParser the following line " << currentline <<" does not have 2 comma separated alleles"<<endl;
	    exit(1);	   
	}

	allRecToReturn->chr        =                           fields[0];
	allRecToReturn->coordinate = destringify<unsigned int>(fields[1]);
	allRecToReturn->ref        =                           fields[2][0];
	allRecToReturn->alt        =                           fields[2][2];
	if(allRecToReturn->ref == allRecToReturn->alt){
	    cerr << "Error: MistarParser the following line " << currentline <<" the reference is equal to the alt allele, exiting"<<endl;
	    exit(1);	   
	}

	allRecToReturn->vectorAlleles = new vector<SingleAllele>();
	for(unsigned int i=3;i<fields.size();i++){
	    unsigned int indexComma=0;
	    unsigned int indexColon=0;
	    for(unsigned int k=0;k<fields[i].size();k++){
		if(fields[i][k]==',')
		    indexComma=k;
		if(fields[i][k]==':')
		    indexColon=k;
	    }

	    if(indexComma == 0 || indexColon == 0 ){
		cerr << "Error: MistarParser problem with the following line " << currentline <<" cannot get allele count"<<endl;
		exit(1);	   
	    }
	    
	    SingleAllele sa (destringify<int>( fields[i].substr(0,indexComma)),
			     destringify<int>( fields[i].substr(indexComma+1,indexColon)),
			     destringify<bool>(fields[i].substr(indexColon+1))   );

	    allRecToReturn->vectorAlleles->push_back(sa);
 	}

	if( allRecToReturn->vectorAlleles->size() != numberPopulations){
	    cerr << "Error: MistarParser problem with the following line " << currentline <<" number of allele count read is not "<<numberPopulations<<endl;
		exit(1);	   	    
	}

	return true;

    }else{//if has no data

	if(textMode){
	    myFilezipped->close();
	}

	// else
	//     myFile->close();
	return false;
    }
    return false;
}