// Writes the elements arr[start] .. arr[end-1] to oFile, separated by sep.
//
//   oFile : output stream, written through ifprintf
//   arr   : source strings; only indices in [start, end) are read
//   sep   : text written between two consecutive elements
//   empty : placeholder written when the range [start, end) holds no element
//   start : index of the first element to write (inclusive)
//   end   : index one past the last element to write (exclusive)
void VcfHelper::printArrayJoin(IFILE oFile, const StringArray& arr, const char* sep, const char* empty, int start, int end) {
  if ( start >= end ) {
    // The placeholder used to be ignored here, so an empty range produced an
    // empty field; the 4-argument overload already writes `empty` for an
    // empty array, and this overload now agrees with it.
    if ( empty != NULL ) {
      ifprintf(oFile,"%s",empty);
    }
    return;
  }

  // First element has no leading separator; every later one does.
  ifprintf(oFile,"%s",arr[start].c_str());
  for(int i=start+1; i < end; ++i) {
    ifprintf(oFile,"%s%s",sep,arr[i].c_str());
  }
}
// Writes the pairs (arr1[i], arr2[i]) for i in [start, end) to oFile.
// Each pair is written as arr1[i] + sep2 + arr2[i]; pairs are separated by sep1.
//
//   oFile : output stream, written through ifprintf
//   arr1  : first component of each pair
//   arr2  : second component of each pair; read at the same indices as arr1
//   sep1  : text written between two consecutive pairs
//   sep2  : text written between the two components of one pair
//   empty : placeholder written when the range [start, end) holds no pair
//   start : index of the first pair to write (inclusive)
//   end   : index one past the last pair to write (exclusive)
void VcfHelper::printArrayDoubleJoin(IFILE oFile, const StringArray& arr1, const StringArray& arr2, const char* sep1, const char* sep2, const char* empty, int start, int end) {
  if ( start >= end ) {
    // The placeholder used to be ignored, leaving the field blank when there
    // was nothing to print.
    if ( empty != NULL ) {
      ifprintf(oFile,"%s",empty);
    }
    return;
  }

  // First pair has no leading separator; every later one does.
  ifprintf(oFile,"%s%s%s",arr1[start].c_str(),sep2,arr2[start].c_str());
  for(int i=start+1; i < end; ++i) {
    ifprintf(oFile,"%s%s%s%s",sep1,arr1[i].c_str(),sep2,arr2[i].c_str());
  }
}
void Imputation::FlushPartialVcf(HaplotypeSet &rHap,HaplotypeSet &tHap,HaplotypeSet &PartialDosage, string &filename,int &Index) { string tempFileIndex(outFile),tempFileIndex1(outFile); IFILE vcfdosepartial = ifopen(filename.c_str(), "wb", InputFile::BGZF); for(int hapId=0;hapId<(int)PartialDosage.individualName.size();hapId++) { ifprintf(vcfdosepartial,"\t%s",PartialDosage.individualName[hapId].c_str()); } ifprintf(vcfdosepartial,"\n"); int i=0; for (int index =0; index < rHap.RefTypedTotalCount; index++) { if(rHap.RefTypedIndex[index]==-1) { if(i>=rHap.PrintStartIndex && i <= rHap.PrintEndIndex) { bool majorIsReference=false; if(!rHap.major[i]) majorIsReference=true; if(!tHap.AllMaleTarget) PartialDosage.PrintDosageForVcfOutputForID(vcfdosepartial,i, majorIsReference,rHap.VariantList[i].refAllele); else PartialDosage.PrintDosageForVcfOutputForIDMaleSamples(vcfdosepartial,i, majorIsReference,rHap.VariantList[i].refAllele); ifprintf(vcfdosepartial,"\n"); } i++; } else { if(!tHap.AllMaleTarget) PartialDosage.PrintDosageGWASOnlyForVcfOutputForID (tHap,vcfdosepartial,rHap.RefTypedIndex[index]); else PartialDosage.PrintDosageGWASOnlyForVcfOutputForIDMaleSamples (tHap,vcfdosepartial,rHap.RefTypedIndex[index]); ifprintf(vcfdosepartial,"\n"); } } ifclose(vcfdosepartial); }
void Imputation::PrintInfoFile(HaplotypeSet &rHap,HaplotypeSet &tHap, ImputationStatistics &stats) { cout<<endl<<" Writing summary (.info) files ... "<<endl; IFILE info = ifopen(outFile + ".info", "wb"); ifprintf(info, "SNP\tREF(0)\tALT(1)\tALT_Frq\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose0\tDose1\n"); int i=0; for (int index =0; index < rHap.RefTypedTotalCount; index++) { if(rHap.RefTypedIndex[index]==-1) { if(i>=rHap.PrintStartIndex && i <= rHap.PrintEndIndex) { ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t", RsId? rHap.VariantList[i].rsid.c_str(): rHap.VariantList[i].name.c_str(), rHap.VariantList[i].refAlleleString.c_str(), rHap.VariantList[i].altAlleleString.c_str(), stats.AlleleFrequency(i), stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i), stats.AverageCallScore(i), stats.Rsq(i)); if (!tHap.missing[i]) { ifprintf(info, "Genotyped\t%.3f\t%.3f\t%.5f\t%.5f\t%.5f\n", stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i), stats.LooMajorDose(i), stats.LooMinorDose(i)); } else ifprintf(info, "Imputed\t-\t-\t-\t-\t-\n"); } i++; } else { variant ThisTypedVariant =tHap.TypedOnlyVariantList[rHap.RefTypedIndex[index]]; ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t-\t-\tTyped_Only\t-\t-\t-\t-\t-\n", RsId? ThisTypedVariant.rsid.c_str(): ThisTypedVariant.name.c_str(), ThisTypedVariant.refAlleleString.c_str(), ThisTypedVariant.altAlleleString.c_str(), tHap.AlleleFreq[rHap.RefTypedIndex[index]], tHap.AlleleFreq[rHap.RefTypedIndex[index]] > 0.5 ? 1.0 - tHap.AlleleFreq[rHap.RefTypedIndex[index]] : tHap.AlleleFreq[rHap.RefTypedIndex[index]]); } } ifclose(info); cout<<endl<<" Summary information written to : "<<outFile<<".info"<<endl; }
// Writes every element of arr to oFile, joined by sep.
//
//   oFile : output stream, written through ifprintf
//   arr   : strings to write
//   sep   : text placed between consecutive elements
//   empty : text written instead when arr has no elements
void VcfHelper::printArrayJoin(IFILE oFile, const StringArray& arr, const char* sep, const char* empty) {
  const int count = arr.Length();

  switch ( count ) {
  case 0:
    // Nothing to join: emit the placeholder.
    ifprintf(oFile,"%s",empty);
    break;
  case 1:
    // A single element needs no separator handling.
    ifprintf(oFile,"%s",arr[0].c_str());
    break;
  default:
    // Delegate to the range overload for the whole array.
    printArrayJoin(oFile,arr,sep,empty,0,count);
    break;
  }
}
// Writes a two-column, tab-separated table (MarkerName, ErrorRate) to
// `filename`: one row per marker index in [0, markers), pairing
// markerNames[i] with E[i].
// Returns without writing anything when the file cannot be opened.
void MarkovParameters::WriteErrorRates(StringArray & markerNames, const char * filename)
{
    IFILE output = ifopen(filename, "wb");

    if (output != NULL)
    {
        ifprintf(output, "MarkerName\tErrorRate\n");

        int marker = 0;
        while (marker < markers)
        {
            ifprintf(output, "%s\t%.5g\n", static_cast<const char *>(markerNames[marker]), E[marker]);
            ++marker;
        }

        ifclose(output);
    }
}
// Writes the VCF header to oFile:
//   - one "##key=value" line per meta entry ("<na>" is passed to
//     getMetaValue as its second argument),
//   - the "#CHROM ... INFO" column line,
//   - followed by FORMAT and one column per individual, but only when there
//     is at least one sample and bSiteOnly is not set.
void VcfFile::printVCFHeader(IFILE oFile) {
  int metaIdx = 0;
  while ( metaIdx < getMetaCount() ) {
    ifprintf(oFile,"##%s=%s\n",getMetaKey(metaIdx).c_str(), getMetaValue(metaIdx, "<na>").c_str());
    ++metaIdx;
  }

  ifprintf(oFile,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");

  const bool skipSamples = ( getSampleCount() <= 0 ) || bSiteOnly;
  if ( !skipSamples ) {
    ifprintf(oFile,"\tFORMAT");
    int sampleIdx = 0;
    while ( sampleIdx < getSampleCount() ) {
      ifprintf(oFile,"\t%s",vpVcfInds[sampleIdx]->sIndID.c_str());
      ++sampleIdx;
    }
  }

  ifprintf(oFile,"\n");
}
// Writes the VCF header to oFile for a subset of individuals:
//   - one "##key=value" line per meta entry,
//   - the "#CHROM ... INFO" column line followed by FORMAT (always written),
//   - one sample column per entry of subsetIndices, in the given order; each
//     entry is used as an index into vpVcfInds.
void VcfFile::printVCFHeaderSubset(IFILE oFile, std::vector<int>& subsetIndices) {
  int metaIdx = 0;
  while ( metaIdx < getMetaCount() ) {
    ifprintf(oFile,"##%s=%s\n",getMetaKey(metaIdx).c_str(), getMetaValue(metaIdx, "<na>").c_str());
    ++metaIdx;
  }

  ifprintf(oFile,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
  ifprintf(oFile,"\tFORMAT");

  for(std::vector<int>::const_iterator it = subsetIndices.begin(); it != subsetIndices.end(); ++it) {
    const int indIdx = *it;
    ifprintf(oFile,"\t%s",vpVcfInds[indIdx]->sIndID.c_str());
  }

  ifprintf(oFile,"\n");
}
// Writes a two-column, tab-separated table (Interval, SwitchRate) to
// `filename`: one row per interval index i in [0, markers - 1), labelled
// "<markerNames[i]>-<markerNames[i+1]>" and carrying R[i].
// Returns without writing anything when the file cannot be opened.
void MarkovParameters::WriteCrossoverRates(StringArray & markerNames, const char * filename)
{
    IFILE output = ifopen(filename, "wb");

    if (output != NULL)
    {
        ifprintf(output, "Interval\tSwitchRate\n");

        const int lastInterval = markers - 1;
        for (int left = 0; left < lastInterval; ++left)
        {
            const int right = left + 1;
            ifprintf(output, "%s-%s\t%.5g\n",
                     static_cast<const char *>(markerNames[left]),
                     static_cast<const char *>(markerNames[right]),
                     R[left]);
        }

        ifclose(output);
    }
}
// Writes one SAM record to filePtr as a four-line FASTQ entry and then hands
// the record back to myPool.
//
//   samRec      : record to write; it is released to myPool before returning
//   filePtr     : destination file; when NULL nothing is written
//   readNameExt : suffix appended to the read name on the '@' line (and on
//                 the '+' line when myRNPlus is set)
//
// When the record is flagged reverse and myReverseComp is set, the sequence
// is reverse-complemented and the quality string reversed; otherwise the
// sequence is upper-cased.
//
// NOTE: the working buffers are function-local statics that are reused from
// call to call, so this function is not reentrant.
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr, const char* readNameExt)
{
    static int16_t flag;
    static std::string sequence;
    static String quality;

    if(filePtr == NULL)
    {
        // Nothing can be written, but the record must still go back to the
        // pool: releasing it is this function's job on the normal path, and
        // the early return used to skip it.
        myPool.releaseRecord(&samRec);
        return;
    }

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
    quality = samRec.getQuality();

    if(SamFlag::isReverse(flag) && myReverseComp)
    {
        // It is reverse, so reverse complement the sequence
        BaseUtilities::reverseComplement(sequence);
        // Reverse the quality.
        quality.Reverse();
    }
    else
    {
        // Ensure it is all capitalized.
        // toupper requires a value representable as unsigned char, so go
        // through unsigned char rather than passing a possibly negative char.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
        {
            sequence[i] = (char)toupper(static_cast<unsigned char>(sequence[i]));
        }
    }

    if(myRNPlus)
    {
        // Repeat the read name (with extension) on the '+' line.
        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
    }
    else
    {
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    }

    // Release the record.
    myPool.releaseRecord(&samRec);
}
// Runs the main imputation pass and writes every requested output.
//
// Steps, in the order they appear below:
//   1. build the Markov parameters through createEstimates() (includeGwas is
//      forced to true first);
//   2. open the optional output files (.hapDose/.hapLabel, .dose) and write
//      the header of "<outFile>.dose.vcf[.gz]", which is then closed again;
//   3. loop over haplotype ids inside an OpenMP parallel-for, running the
//      left walk and the imputation over every group of optStructure;
//   4. under `omp critical`, update the statistics and emit the per-sample
//      outputs; VCF dosages are buffered in DosageForVcfPartial and flushed
//      to a numbered ".dose.vcf.part.N" file every maxVcfSample samples;
//   5. close the files, write the .info file and, for VCF output, merge the
//      partial files.
//
//   tHap   : target (GWAS) haplotypes
//   rHap   : reference haplotypes
//   Golden : not referenced anywhere in this function body
void Imputation::performImputation(HaplotypeSet &tHap,HaplotypeSet &rHap, String Golden)
{
    vector<int> optStructure=rHap.optEndPoints;

    // vcfSampleIndex is incremented below but never read.
    int time_prev = time(0),time_load,vcfSampleIndex=0;;

    includeGwas=true;
    MarkovParameters* MP=createEstimates(rHap,tHap,rHap.optEndPoints,1-includeGwas);

    cout<<" ------------------------------------------------------------------------------"<<endl;
    cout<<" MAIN IMPUTATION "<<endl;
    cout<<" ------------------------------------------------------------------------------"<<endl;

    ImputationStatistics stats(rHap.numMarkers );
    IFILE dosages=NULL, hapdose=NULL, haps=NULL,vcfdosepartial=NULL;

    // Buffer holding the dosages of the samples not yet flushed to a partial VCF.
    HaplotypeSet DosageForVcfPartial;
    DosageForVcfPartial.unphasedOutput=unphasedOutput;
    DosageForVcfPartial.TypedOnly=tHap.TypedOnly;
    DosageForVcfPartial.GWASOnlycounter=tHap.GWASOnlycounter;

    if(tHap.TypedOnly)
    {
        printf("\n Calculating Allele Frequency for Typed-Only variants ... ");
        cout<<endl;
        tHap.CalculateGWASOnlyFreq();
    }

    cout << "\n Starting Imputation ...";
    printf("\n\n Setting up Markov Model for Imputation ...");
    cout<<endl<<endl;

    // Haplotype-level outputs are only produced for phased, non-unphased output.
    if (phased && !unphasedOutput)
    {
        hapdose = ifopen(outFile + ".hapDose" + (gzip ? ".gz" : ""), "wb", gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED);
        haps = ifopen(outFile + ".hapLabel" + (gzip ? ".gz" : ""), "wb", gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED);
    }

    // maxVcfSample : number of samples per partial VCF file (capped at numSamples)
    // NumVcfWritten: samples already flushed to earlier partial files
    // NumVcfCreated: samples buffered so far (flushed or not)
    // NovcfParts   : number of the partial file currently being filled
    int maxVcfSample=200,NumVcfWritten=0,NumVcfCreated=0,NovcfParts=1;

    if((maxVcfSample)>=tHap.numSamples)
        maxVcfSample=tHap.numSamples;

    if(vcfOutput)
    {
        // Write the meta lines and the fixed columns of the final VCF; the
        // header line is left without a trailing newline and the file is
        // closed right away.
        vcfdosepartial = ifopen(outFile + ".dose.vcf" + (gzip ? ".gz" : ""), "wb", gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED);
        ifprintf(vcfdosepartial,"##fileformat=VCFv4.1\n");
        time_t t = time(0);
        struct tm * now = localtime( & t );
        ifprintf(vcfdosepartial,"##filedate=%d.%d.%d\n",(now->tm_year + 1900),(now->tm_mon + 1) ,now->tm_mday);
        ifprintf(vcfdosepartial,"##source=Minimac3\n");
        if(GT)
            ifprintf(vcfdosepartial,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
        if(tHap.AllMaleTarget)
        {
            if(DS)
                ifprintf(vcfdosepartial,"##FORMAT=<ID=DS,Number=1,Type=Float,Description=\"Estimated Alternate Allele Dosage (For Male Chr: X) : [P(Alt Allele)]\">\n");
            if(GP)
                ifprintf(vcfdosepartial,"##FORMAT=<ID=GP,Number=2,Type=Float,Description=\"Estimated Posterior Probabilities for Genotypes 0 and 1 (For Male Chr: X) \">\n");
        }
        else
        {
            if(DS)
                ifprintf(vcfdosepartial,"##FORMAT=<ID=DS,Number=1,Type=Float,Description=\"Estimated Alternate Allele Dosage : [P(0/1)+2*P(1/1)]\">\n");
            if(GP)
                ifprintf(vcfdosepartial,"##FORMAT=<ID=GP,Number=3,Type=Float,Description=\"Estimated Posterior Probabilities for Genotypes 0/0, 0/1 and 1/1 \">\n");
        }

        ifprintf(vcfdosepartial,"##INFO=<ID=MAF,Number=1,Type=Float,Description=\"Estimated Alternate Allele Frequency\">\n");
        ifprintf(vcfdosepartial,"##INFO=<ID=R2,Number=1,Type=Float,Description=\"Estimated Imputation Accuracy\">\n");
        ifprintf(vcfdosepartial,"##INFO=<ID=ER2,Number=1,Type=Float,Description=\"Empirical (Leave-One-Out) R-square (available only for genotyped variants)\">\n");
        ifprintf(vcfdosepartial,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
        ifclose(vcfdosepartial);

        // Size the buffer for the first batch of samples.
        if(!tHap.AllMaleTarget)
            DosageForVcfPartial.InitializePartialDosageForVcfOutput((2*maxVcfSample),rHap.numMarkers,format);
        else
            DosageForVcfPartial.InitializePartialDosageForVcfOutputMaleSamples(maxVcfSample<MaxSample?maxVcfSample:MaxSample,rHap.numMarkers,format);
    }

    if(doseOutput)
        dosages = ifopen(outFile + ".dose" + (gzip ? ".gz" : ""), "wb",(gzip ? InputFile::BGZF:InputFile::UNCOMPRESSED) );

    #pragma omp parallel for
    for(int hapId=0;hapId<MaxSample;hapId++)
    {
        // Odd haplotype ids only start an iteration of their own when the
        // chromosome is "X" and all targets are male; otherwise they are
        // handled by the do/while of the preceding even id.
        if (hapId %2==1)
        {
            if(rHap.finChromosome!="X")
                continue;
            else if(!tHap.AllMaleTarget)
                continue;
        }

        vector<float> foldedProb,recomProb,noRecomProb, rightProb,probAlleleNoStandardize(8,0.0),tempDoseHap1;
        vector<bool> tempHap(rHap.numMarkers),tempMissHap(rHap.numMarkers);
        vector<bool> tempDoseAlleleHap1;

        // Per-iteration model, seeded with the shared parameter estimates.
        MarkovModel MM(tHap,rHap,tHap.missing,rHap.major);
        MM.CopyParameters(MP);

        int hapIdIndiv=hapId;

        // Processes haplotype hapId and, unless all targets are male, the
        // following odd haplotype as well.
        do{
            MM.initializeMatrices(tHap,rHap,optStructure,rHap.ReducedStructureInfo);
            printf(" Processing Haplotype %d of %d ...", hapIdIndiv + 1, MaxSample);
            cout<<endl;

            MM.ThisHapId=hapIdIndiv;

            // First pass: groups visited in increasing order (WalkLeft).
            for(int group=1;group<(int)optStructure.size();group++)
            {
                MM.foldProbabilities(foldedProb,group-1,rHap.ReducedStructureInfo[group-1],0,refCount);
                MM.leftNoRecoProb[group-1][0]=foldedProb;

                // Condition on the very first marker when it is observed for
                // this haplotype.
                if(group==1 && !tHap.missing[0])
                    if(!tHap.getMissingScaffoldedHaplotype(hapIdIndiv,0))
                    {
                        Condition(rHap,0,foldedProb,MM.leftNoRecoProb[group-1][0],MM.Error[0],
                                  tHap.getScaffoldedHaplotype(hapIdIndiv,0)? rHap.AlleleFreq[0] : 1-rHap.AlleleFreq[0],
                                  tHap.getScaffoldedHaplotype(hapIdIndiv,0),MM.backgroundError,
                                  foldedProb.size(),rHap.ReducedStructureInfo[0]);
                    }

                MM.WalkLeft(tHap,hapIdIndiv,MM.leftProb[group-1],MM.leftNoRecoProb[group-1],
                            foldedProb,optStructure[group-1],optStructure[group],
                            rHap.ReducedStructureInfo[group-1],rHap.AlleleFreq);

                splitFoldedProb(recomProb,MM.leftProb[group-1][optStructure[group]-optStructure[group-1]],MM.leftNoRecoProb[group-1][optStructure[group]-optStructure[group-1]]);

                MM.unfoldProbabilities(group-1,recomProb,MM.leftNoRecoProb[group-1][optStructure[group]-optStructure[group-1]],foldedProb,0,rHap.ReducedStructureInfo,refCount);
            }

            // Second pass: groups visited in decreasing order (Impute).
            for(int group=optStructure.size()-1;group>0;group--)
            {
                MM.foldProbabilities(foldedProb,group-1,rHap.ReducedStructureInfo[group-1],1,refCount);
                rightProb=foldedProb;
                noRecomProb=foldedProb;

                MM.Impute(tHap,foldedProb,hapIdIndiv,MM.leftProb[group-1],MM.leftNoRecoProb[group-1],rightProb,noRecomProb,MM.junctionLeftProb[group-1],
                          MM.junctionRightProb[group],optStructure[group-1], optStructure[group],rHap.ReducedStructureInfo[group-1],1,rHap.AlleleFreq);

                splitFoldedProb(recomProb,rightProb,noRecomProb);
                MM.unfoldProbabilities(group-1,recomProb,noRecomProb,foldedProb,1,rHap.ReducedStructureInfo,refCount);
            }

            // Copy this haplotype's observed alleles and missing flags for
            // the statistics update.
            for(int jjj=0;jjj<rHap.numMarkers;jjj++)
            {
                tempHap[jjj]=tHap.getScaffoldedHaplotype(hapIdIndiv,jjj);
                tempMissHap[jjj]=tHap.getMissingScaffoldedHaplotype(hapIdIndiv,jjj);
            }

            // Keep the even haplotype's result so that both haplotypes of a
            // sample can be saved together after the loop.
            if(vcfOutput)
            {
                if(hapIdIndiv%2==0)
                {
                    tempDoseHap1= MM.imputedHap;
                    tempDoseAlleleHap1= MM.imputedAlleleNumber;
                }
            }

            #pragma omp critical
            {
                stats.Update(MM.imputedHap, MM.leaveOneOut,tempHap,tempMissHap,rHap.major);
            }

            #pragma omp critical
            if (phased && !unphasedOutput)
            {
                PrintHaplotypeData(rHap, tHap, hapdose, haps, MM.imputedHap, MM.imputedAlleleNumber, hapIdIndiv, tHap.AllMaleTarget?hapId:hapId/2);
            }

            if(tHap.AllMaleTarget)
                break;

            hapIdIndiv++;
        }while(hapIdIndiv<MaxSample && hapIdIndiv%2==1);

        #pragma omp critical
        if(doseOutput)
        {
            PrintDosageData(rHap, tHap, dosages, MM.imputedDose, tHap.AllMaleTarget?hapId:hapId/2);
        }

        // The VCF buffer and its counters are shared between iterations, so
        // all of the bookkeeping below happens inside the critical section.
        #pragma omp critical
        if(vcfOutput)
        {
            printf(" Saving Individual %s for VCF File...\n", tHap.individualName[tHap.AllMaleTarget?hapId:hapId/2].c_str());

            // Slot inside the current batch = samples buffered minus samples
            // already flushed.
            if(!tHap.AllMaleTarget)
                DosageForVcfPartial.SaveDosageForVcfOutputSampleWise(NumVcfCreated-NumVcfWritten,
                        tHap.individualName[tHap.AllMaleTarget?hapId:hapId/2],
                        tempDoseHap1,MM.imputedHap,
                        tempDoseAlleleHap1,MM.imputedAlleleNumber);
            else
                DosageForVcfPartial.SaveDosageForVcfOutputSampleWiseChrX(NumVcfCreated-NumVcfWritten,
                        tHap.individualName[tHap.AllMaleTarget?hapId:hapId/2],
                        MM.imputedHap,
                        MM.imputedAlleleNumber);

            if(DosageForVcfPartial.TypedOnly)
            {
                DosageForVcfPartial.SaveIndexForGWASOnlyForVcfOutput(NumVcfCreated-NumVcfWritten,
                        tHap.AllMaleTarget?hapId:hapId/2);
            }

            NumVcfCreated++;
            vcfSampleIndex++;

            // Flush when the batch is full or when the last sample was saved.
            if(NumVcfCreated%maxVcfSample==0 || NumVcfCreated==(tHap.AllMaleTarget?MaxSample:MaxSample/2))
            {
                // tempFileIndex1 is constructed but never used.
                string PartialVcfFileName(outFile),tempFileIndex1(outFile);
                stringstream strs;
                strs<<(NovcfParts);
                PartialVcfFileName+=(".dose.vcf.part." + (string)(strs.str()) +(gzip ? ".gz" : ""));

                if(!tHap.AllMaleTarget)
                    printf("\n --->>> Saving samples %d-%d in VCF file : %s ...\n\n",
                           (NumVcfWritten)+1,(MaxSample/2<(NumVcfWritten+maxVcfSample)?MaxSample/2:(NumVcfWritten+maxVcfSample)),
                           PartialVcfFileName.c_str());
                else
                    printf("\n --->>> Saving samples %d-%d in VCF file : %s ...\n\n",
                           (NumVcfWritten)+1,(MaxSample<(NumVcfWritten+maxVcfSample)?MaxSample:(NumVcfWritten+maxVcfSample)),
                           PartialVcfFileName.c_str());

                //if(NovcfParts==2)
                // abort();

                FlushPartialVcf(rHap,tHap,DosageForVcfPartial,PartialVcfFileName,NovcfParts);

                // More samples to come: start the next part and re-size the
                // buffer for the remaining samples (at most maxVcfSample).
                if(NumVcfCreated<(tHap.AllMaleTarget?MaxSample:MaxSample/2))
                {
                    NovcfParts++;
                    NumVcfWritten+=maxVcfSample;

                    //int gg=maxVcfSample<(((tHap.AllMaleTarget?MaxSample:MaxSample/2))-NumVcfWritten)?
                    //2*maxVcfSample:2*(((tHap.AllMaleTarget?MaxSample:MaxSample/2))-NumVcfWritten);
                    //
                    //
                    //abort();

                    if(!tHap.AllMaleTarget)
                        DosageForVcfPartial.InitializePartialDosageForVcfOutput(maxVcfSample<(MaxSample/2-NumVcfWritten)?2*maxVcfSample:2*(MaxSample/2-NumVcfWritten),rHap.numMarkers,format);
                    else
                        DosageForVcfPartial.InitializePartialDosageForVcfOutputMaleSamples(maxVcfSample<(MaxSample-NumVcfWritten)?maxVcfSample:(MaxSample-NumVcfWritten),rHap.numMarkers,format);
                }
            }
        }
    }

    cout<<endl<<" Imputation Finished ... "<<endl;

    if (phased && !unphasedOutput)
    {
        ifclose(hapdose);
        ifclose(haps);

        cout<<endl<<" Haplotype Dosage information written to : "<< outFile + ".hapDose" + (gzip ? ".gz" : "")<<endl;
        cout<<endl<<" Haplotype Allele information written to : "<< outFile + ".hapLabel" + (gzip ? ".gz" : "")<<endl;
    }

    if(doseOutput)
    {
        ifclose(dosages);
        cout<<endl<<" Dosage information written to : "<< outFile + ".dose" + (gzip ? ".gz" : "")<<endl;
    }

    PrintInfoFile(rHap,tHap,stats);

    time_load = time(0) - time_prev;
    cout << "\n Time taken for imputation = " << time_load << " seconds."<<endl<<endl;

    // Stitch the header written above and all partial files into the final VCF.
    if(vcfOutput)
        MergeFinalVcfAllVariants(rHap,tHap,stats,NovcfParts);
}
void VerifyBamID::printPerMarkerInfo(const char* filename, int indIdx) { IFILE oFile = ifopen(filename,"wb"); int nMarkers = (int)(pGenotypes->chroms.size()); char base, a1, a2; ifprintf(oFile,"#CHROM\tPOS\tA1\tA2\tAF\tGENO\t#REF\t#ALT\t#OTHERS\tBASES\tQUALS\tMAPQS\n"); for(int i=0; i < nMarkers; ++i) { int counts[3] = {0,0,0}; std::vector<char> bases; std::vector<char> quals; std::vector<char> mqs; ifprintf(oFile,"%s\t%d\t%c\t%c\t%.4lf\t",pGenotypes->chroms[i].c_str(),pGenotypes->positions[i],pGenotypes->refBases[i],pGenotypes->altBases[i],pGenotypes->alleleFrequencies[i]); int geno = pGenotypes->getGenotype(indIdx,i); switch(geno) { case 0: // MISSING ifprintf(oFile,"./."); break; case 1: // HOMREF; ifprintf(oFile,"0/0"); break; case 2: // HET; ifprintf(oFile,"0/1"); break; case 3: // HOMALT; ifprintf(oFile,"1/1"); break; default: Logger::gLogger->error("Unrecognized genotype %d at ind %d, marker %d",indIdx,i); } a1 = pGenotypes->refBases[i]; a2 = pGenotypes->altBases[i]; for(int j=(int)pPile->nBegins[i]; j < (int)pPile->nEnds[i]; ++j) { // obtain b (base), (error), and readgroup info base = pPile->cBases[j]; if ( base == a1 ) { ++counts[0]; } else if ( base == a2 ) { ++counts[1]; } else { ++counts[2]; } bases.push_back(base); quals.push_back(pPile->cQuals[j]); mqs.push_back(((uint8_t)(pPile->cMapQs[j]) > 90) ? '~' : static_cast<char>(pPile->cMapQs[j]+33)); } ifprintf(oFile,"\t%d\t%d\t%d\t%.3lf\t",counts[0],counts[1],counts[2],(counts[0]+counts[1] == 0) ? 0.5 : (double)counts[0]/(double)(counts[0]+counts[1])); ifprintf(oFile,"\t"); for(int j=0; j < (int)bases.size(); ++j) ifprintf(oFile,"%c",bases[j]); ifprintf(oFile,"\t"); for(int j=0; j < (int)quals.size(); ++j) ifprintf(oFile,"%c",quals[j]); ifprintf(oFile,"\t"); for(int j=0; j < (int)mqs.size(); ++j) ifprintf(oFile,"%c",mqs[j]); ifprintf(oFile,"\n"); } }
void Imputation::PrintHaplotypeData(HaplotypeSet &rHap,HaplotypeSet &tHap, IFILE hapdose, IFILE haps, vector<float> &ThisimputedHap,vector<bool> ThisimputedAlleles, int ThisHapId, int ThisSampleId) { char labels[]= {0, 'A', 'C', 'G', 'T', 'D', 'I', 'R'}; printf(" Outputting HAPLO%d of Individual %s for Haplotype File...", tHap.AllMaleTarget?1:(ThisHapId%2+1) ,tHap.individualName[ThisSampleId].c_str()); cout<<endl; ifprintf(hapdose, "%s\tHAPLO%d", tHap.individualName[ThisSampleId].c_str(), tHap.AllMaleTarget?1:(ThisHapId%2+1) ); ifprintf(haps, "%s\tHAPLO%d\t", tHap.individualName[ThisSampleId].c_str(), tHap.AllMaleTarget?1:(ThisHapId%2+1) ); int i=0; for (int index =0; index < rHap.RefTypedTotalCount; index++) { if(rHap.RefTypedIndex[index]==-1) { if(i>=rHap.PrintStartIndex && i <= rHap.PrintEndIndex) { ifprintf(hapdose, "\t%.5f", ThisimputedHap[i]); ifprintf(haps, "%c", labels[(int) (ThisimputedAlleles[i]? rHap.VariantList[i].altAllele :rHap.VariantList[i].refAllele)]); } i++; } else { int MarkerIndex=rHap.RefTypedIndex[index]; bool a1; a1=tHap.GWASOnlyMissingSampleUnscaffolded[ThisHapId][MarkerIndex]; double outAllele1=0.0; if(a1) { outAllele1=tHap.AlleleFreq[MarkerIndex]; a1=round(outAllele1)==1?true:false; } else { a1=tHap.GWASOnlyhaplotypesUnscaffolded[ThisHapId][MarkerIndex]; if(a1) outAllele1=1.0; } // if(!tHap.major[MarkerIndex]) // outAllele1=1-outAllele1; ifprintf(haps, "%c", labels[(int) (a1? tHap.TypedOnlyVariantList[MarkerIndex].altAllele :tHap.TypedOnlyVariantList[MarkerIndex].refAllele)]); ifprintf(hapdose, "\t%.5f",outAllele1); } } ifprintf(hapdose, "\n"); ifprintf(haps, "\n"); }
// Writes the pedigree (.fam) rows for every sample to oFamFile and the
// three-byte header to oBedFile.
//
// Each .fam row holds six tab-separated columns:
//   family ID (the individual ID when no family ID is set), individual ID,
//   father ID ("0" when empty), mother ID ("0" when empty),
//   sex code (0 = UNKNOWN, 1 = MALE, 2 = FEMALE), and the constant "-9".
//
// Throws VcfFileException when a sample carries an unrecognized gender value.
void VcfFile::printBEDHeader(IFILE oBedFile, IFILE oFamFile) {
  for(int i=0; i < getSampleCount(); ++i) {
    // Column 1: family ID, falling back to the individual ID.
    if ( vpVcfInds[i]->sFamID.Length() == 0 ) {
      ifprintf(oFamFile,"%s",vpVcfInds[i]->sIndID.c_str());
    }
    else {
      ifprintf(oFamFile,"%s",vpVcfInds[i]->sFamID.c_str());
    }

    // Column 2: individual ID.
    ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sIndID.c_str());

    // Column 3: father ID. The tab was missing for non-empty IDs, which
    // glued the father ID onto the individual ID and dropped a column.
    if ( vpVcfInds[i]->sFatID.Length() == 0 ) {
      ifprintf(oFamFile,"\t0");
    }
    else {
      ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sFatID.c_str());
    }

    // Column 4: mother ID.
    if ( vpVcfInds[i]->sMotID.Length() == 0 ) {
      ifprintf(oFamFile,"\t0");
    }
    else {
      ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sMotID.c_str());
    }

    // Column 5: sex code.
    switch( vpVcfInds[i]->gender ) {
    case VcfInd::UNKNOWN:
      ifprintf(oFamFile,"\t0");
      break;
    case VcfInd::MALE:
      ifprintf(oFamFile,"\t1");
      break;
    case VcfInd::FEMALE:
      ifprintf(oFamFile,"\t2");
      break;
    default:
      throw VcfFileException("Unrecognized value for gender");
      break;
    }

    // Column 6: constant "-9".
    ifprintf(oFamFile,"\t-9\n");
  }

  // Three leading bytes of the .bed file.
  char magicNumbers[3] = {0x6c,0x1b,0x01};
  oBedFile->ifwrite(magicNumbers, 3);
}
void Imputation::PrintDosageData(HaplotypeSet &rHap,HaplotypeSet &tHap, IFILE dosages, vector<float> &ThisDosage, int ThisSampleId) { printf(" Outputting Individual %s for Dosage file...", tHap.individualName[ThisSampleId].c_str()); cout<<endl; ifprintf(dosages, "%s\tDOSE",tHap.individualName[ThisSampleId].c_str()); int i=0; for (int index =0; index < rHap.RefTypedTotalCount; index++) { if(rHap.RefTypedIndex[index]==-1) { if(i>=rHap.PrintStartIndex && i <= rHap.PrintEndIndex) { ifprintf(dosages, "\t%.3f", ThisDosage[i]); } i++; } else { int MarkerIndex=rHap.RefTypedIndex[index]; bool a1,a2; double outAllele1=0.0,outAllele2=0.0; if(tHap.AllMaleTarget) { a1=tHap.GWASOnlyMissingSampleUnscaffolded[ThisSampleId][MarkerIndex]; if(a1) { outAllele1=tHap.AlleleFreq[MarkerIndex]; a1=round(outAllele1)==1?true:false; } else { a1=tHap.GWASOnlyhaplotypesUnscaffolded[ThisSampleId][MarkerIndex]; if(a1) outAllele1=1.0; } // if(!tHap.major[MarkerIndex]) // outAllele1=1-outAllele1; ifprintf(dosages, "\t%.3f", outAllele1); } else { a1=tHap.GWASOnlyMissingSampleUnscaffolded[2*ThisSampleId][MarkerIndex]; a2=tHap.GWASOnlyMissingSampleUnscaffolded[2*ThisSampleId+1][MarkerIndex]; if(a1 || a2) { outAllele1=tHap.AlleleFreq[MarkerIndex]; outAllele2=outAllele1; a1=round(outAllele1)==1?true:false; a2=a1; } else { a1=tHap.GWASOnlyhaplotypesUnscaffolded[2*ThisSampleId][MarkerIndex]; a2=tHap.GWASOnlyhaplotypesUnscaffolded[2*ThisSampleId+1][MarkerIndex]; if(a1) outAllele1=1.0; if(a2) outAllele2=1.0; } // if(!tHap.major[MarkerIndex]) // { // outAllele1=1-outAllele1; // outAllele2=1-outAllele2; // } ifprintf(dosages, "\t%.3f", outAllele1+outAllele2); } } } ifprintf(dosages,"\n"); }
void VcfMarker::printVCFMarker(IFILE oFile, bool siteOnly) { String line; ifprintf(oFile,"%s",sChrom.c_str()); ifprintf(oFile,"\t%d",nPos); ifprintf(oFile,"\t%s",sID.c_str()); ifprintf(oFile,"\t%s",sRef.c_str()); if ( asAlts.Length() == 1 ) { ifprintf(oFile,"\t%s",asAlts[0].c_str()); } else { ifprintf(oFile,"\t"); VcfHelper::printArrayJoin(oFile, asAlts, ",", "."); } if ( fQual < 0 ) { ifprintf(oFile,"\t."); } else { ifprintf(oFile,"\t%.0f",fQual); } if ( asFilters.Length() == 1 ) { ifprintf(oFile,"\t%s",asFilters[0].c_str()); } else { ifprintf(oFile,"\t"); VcfHelper::printArrayJoin(oFile, asFilters, ";", "PASS"); } ifprintf(oFile,"\t"); VcfHelper::printArrayDoubleJoin(oFile, asInfoKeys, asInfoValues, ";", "=", "."); if ( !siteOnly ) { if ( asSampleValues.Length() > 0 ) { ifprintf(oFile,"\t"); VcfHelper::printArrayJoin(oFile, asFormatKeys, ":", "."); for(int i=0; i < getSampleSize(); ++i) { ifprintf(oFile,"\t"); VcfHelper::printArrayJoin(oFile, asSampleValues, ":", ".", i*asFormatKeys.Length(), (i+1)*asFormatKeys.Length()); } } else if ( vnSampleGenotypes.size() > 0 ) { ifprintf(oFile,"\tGT",line.c_str()); for(int i=0; i < (int)vnSampleGenotypes.size(); ++i) { if ( vnSampleGenotypes[i] == 0xffff ) { ifprintf(oFile,"\t./."); } else { ifprintf(oFile,"\t%d/%d",((vnSampleGenotypes[i] & 0xff00) >> 8),(vnSampleGenotypes[i] & 0xff)); } } } }
void Imputation::MergeFinalVcfAllVariants(HaplotypeSet &rHap,HaplotypeSet &tHap,ImputationStatistics &stats,int MaxIndex) { cout<<" ------------------------------------------------------------------------------"<<endl; cout<<" FINAL VCF MERGE "<<endl; cout<<" ------------------------------------------------------------------------------"<<endl; printf("\n Merging partial VCF files to final output VCF File : %s ",(outFile + ".dose.vcf" + (gzip ? ".gz" : "")).c_str() ); cout<<endl<<endl; IFILE vcfdosepartial = ifopen(outFile + ".dose.vcf" + (gzip ? ".gz" : ""), "a", gzip ?InputFile::BGZF:InputFile::UNCOMPRESSED); vector<IFILE> vcfdosepartialList(MaxIndex); for(int i=1;i<=MaxIndex;i++) { string tempFileIndex(outFile); stringstream strs; strs<<(i); tempFileIndex+=(".dose.vcf.part." + (string)(strs.str())+(gzip ? ".gz" : "")); vcfdosepartialList[i-1] = ifopen(tempFileIndex.c_str(), "r"); } string line; for(int i=1;i<=MaxIndex;i++) { line.clear(); vcfdosepartialList[i-1]->readLine(line); ifprintf(vcfdosepartial,"%s",line.c_str()); } int i=0; for (int index =0; index < rHap.RefTypedTotalCount; index++) { //abort(); if(index%10000==0) { printf(" Merging marker %d of %d [%.1f%%] to VCF File ...", index + 1, rHap.RefTypedTotalCount,100*(double)(index + 1)/(int)rHap.RefTypedTotalCount); cout<<endl; } if(rHap.RefTypedIndex[index]==-1) { if(i>=rHap.PrintStartIndex && i <= rHap.PrintEndIndex) { ifprintf(vcfdosepartial,"\n%s\t%d\t%s\t%s\t%s\t.\tPASS\tMAF=%.5f;R2=%.5f", rHap.VariantList[i].chr.c_str(),rHap.VariantList[i].bp, RsId?rHap.VariantList[i].rsid.c_str():rHap.VariantList[i].name.c_str(),rHap.VariantList[i].refAlleleString.c_str(), rHap.VariantList[i].altAlleleString.c_str(),stats.AlleleFrequency(i) > 0.5 ? 
1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i),stats.Rsq(i)); if(!tHap.missing[i]) ifprintf(vcfdosepartial,";ER2=%.5f",stats.EmpiricalRsq(i)); ifprintf(vcfdosepartial,"\t%s",GT?(DS?(GP?"GT:DS:GP":"GT:DS"):(GP?"GT:GP":"GT")):(DS?(GP?"DS:GP":"DS"):(GP?"GP":""))); for(int j=1;j<=MaxIndex;j++) { string tempFileIndex(outFile); stringstream strs; strs<<(j); tempFileIndex+=(".dose.vcf.part." + (string)(strs.str()) +(gzip ? ".gz" : "")); line.clear(); vcfdosepartialList[j-1]->readLine(line); ifprintf(vcfdosepartial,"%s",line.c_str()); } } i++; } else { variant ThisTypedVariant =tHap.TypedOnlyVariantList[rHap.RefTypedIndex[index]]; ifprintf(vcfdosepartial,"\n%s\t%d\t%s\t%s\t%s\t.\tPASS\t", ThisTypedVariant.chr.c_str(), ThisTypedVariant.bp, RsId? ThisTypedVariant.rsid.c_str():ThisTypedVariant.name.c_str(), ThisTypedVariant.refAlleleString.c_str(), ThisTypedVariant.altAlleleString.c_str()); ifprintf(vcfdosepartial,"GENOTYPED_ONLY;AN=%d;MAF=%.5f", tHap.TotalSample[rHap.RefTypedIndex[index]], tHap.AlleleFreq[rHap.RefTypedIndex[index]]); //cout<<rHap.RefTypedIndex[index]<<" " <<tHap.TotalSample[rHap.RefTypedIndex[index]]<<" " << tHap.AlleleFreq[rHap.RefTypedIndex[index]]/(double)tHap.TotalSample[rHap.RefTypedIndex[index]]<< endl; ifprintf(vcfdosepartial,"\t%s",GT?(DS?(GP?"GT:DS:GP":"GT:DS"):(GP?"GT:GP":"GT")):(DS?(GP?"DS:GP":"DS"):(GP?"GP":""))); for(int j=1;j<=MaxIndex;j++) { string tempFileIndex(outFile); stringstream strs; strs<<(j); tempFileIndex+=(".dose.vcf.part." + (string)(strs.str()) +(gzip ? ".gz" : "")); line.clear(); vcfdosepartialList[j-1]->readLine(line); ifprintf(vcfdosepartial,"%s",line.c_str()); } // ifprintf(vcfdosepartial,"\n"); } } for(int i=1;i<=MaxIndex;i++) { ifclose(vcfdosepartialList[i-1]); string tempFileIndex(outFile); stringstream strs; strs<<(i); tempFileIndex+=(".dose.vcf.part." + (string)(strs.str())+ (gzip ? ".gz" : "")); remove(tempFileIndex.c_str()); } ifclose(vcfdosepartial); printf("\n Merging Finished ..." ); cout<<endl <<endl; }
// Writes this record to filePtr as one VCF line.
//
// The fixed columns (CHROM, POS, ID, REF, ALT, QUAL, FILTER) are written
// tab-terminated, with "." standing in for an empty string value. They are
// followed by the INFO field and, unless siteOnly is set or there are no
// samples, the format/genotype fields, and finally a newline.
//
//   filePtr  : destination file; must not be NULL
//   siteOnly : when true the format and genotype fields are not written
//
// Returns true when every fixed column reported the expected number of
// characters written and the info/genotype writers reported success.
// When filePtr is NULL, myStatus is set to FAIL_ORDER and false is returned.
bool VcfRecord::write(IFILE filePtr, bool siteOnly)
{
    if(filePtr == NULL)
    {
        myStatus.setStatus(StatGenStatus::FAIL_ORDER,
                           "Error writing VCF record before opening the file.");
        return(false);
    }

    int numWritten = 0;
    int numExpected = 0;

    // Writes one tab-terminated column, substituting "." for an empty value,
    // and keeps the written/expected character counts in step.
    auto writeColumn = [&](const char* text, size_t length)
    {
        if(length == 0)
        {
            numWritten += ifprintf(filePtr, ".\t");
            numExpected += 2;
        }
        else
        {
            numWritten += ifprintf(filePtr, "%s\t", text);
            numExpected += static_cast<int>(length) + 1;
        }
    };

    writeColumn(myChrom.c_str(), myChrom.length());

    // The position is always written from its numeric form, so the former
    // "missing position" branch (guarded by a constant false) is gone.
    const std::string strPos = std::to_string((long long int)my1BasedPosNum);
    writeColumn(strPos.c_str(), strPos.length());

    writeColumn(myID.c_str(), myID.length());
    writeColumn(myRef.c_str(), myRef.length());
    writeColumn(myAlt.c_str(), myAlt.length());
    writeColumn(myQual.c_str(), myQual.length());

    const std::string& filterString = myFilter.getString();
    writeColumn(filterString.c_str(), filterString.length());

    // Write the info.
    bool writeSuccess = myInfo.write(filePtr);

    // Only write the format & genotype if we are not just writing siteOnly
    // data and there is at least one sample
    if((!siteOnly) && (myGenotype.getNumSamples() != 0))
    {
        writeSuccess &= myGenotype.write(filePtr);
    }

    // Write the new line.
    numWritten += ifprintf(filePtr, "\n");
    numExpected += 1;

    return((numWritten == numExpected) && writeSuccess);
}
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. 
myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. 
if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr, const std::string& fileNameExt, const char* readNameExt) { static int16_t flag; static std::string sequence; static String quality; static std::string rg; static std::string rgFastqExt; static std::string rgListStr; static std::string fileName; static std::string fq2; if(mySplitRG) { rg = samRec.getString("RG").c_str(); rgFastqExt = rg + fileNameExt; OutFastqMap::iterator it; it = myOutFastqs.find(rgFastqExt); if(it == myOutFastqs.end()) { // New file. fileName = myOutBase.c_str(); if(rg != "") { fileName += '.'; } else { rg = "."; } fileName += rgFastqExt; filePtr = ifopen(fileName.c_str(), "w", myCompression); myOutFastqs[rgFastqExt] = filePtr; if(fileNameExt != mySecondFileNameExt) { // first end. const char* sm = mySamHeader.getRGTagValue("SM", rg.c_str()); if(strcmp(sm, "") == 0){sm = myOutBase.c_str();} rgListStr.clear(); SamHeaderRG* rgPtr = mySamHeader.getRG(rg.c_str()); if((rgPtr == NULL) || (!rgPtr->appendString(rgListStr))) { // No RG info for this record. rgListStr = ".\n"; } fq2 = "."; if(fileNameExt == myFirstFileNameExt) { fq2 = myOutBase.c_str(); if(rg != ".") { fq2 += '.'; fq2 += rg; } fq2 += mySecondFileNameExt; } ifprintf(myFqList, "%s\t%s\t%s\t%s", sm, fileName.c_str(), fq2.c_str(), rgListStr.c_str()); } } else { filePtr = it->second; } } if(filePtr == NULL) { throw(std::runtime_error("Programming ERROR/EXITING: Bam2FastQ filePtr not set.")); return; } flag = samRec.getFlag(); const char* readName = samRec.getReadName(); sequence = samRec.getSequence(); if(myQField.IsEmpty()) { // Read the quality from the quality field quality = samRec.getQuality(); } else { // Read Quality from the specified tag const String* qTagPtr = samRec.getStringTag(myQField.c_str()); if((qTagPtr != NULL) && (qTagPtr->Length() == (int)sequence.length())) { // Use the tag value for quality quality = qTagPtr->c_str(); } else { // Tag was not found, so use the quality field. 
++myNumQualTagErrors; if(myNumQualTagErrors == 1) { std::cerr << "Bam2FastQ: " << myQField.c_str() << " tag was not found/invalid, so using the quality field in records without the tag\n"; } quality = samRec.getQuality(); } } if(SamFlag::isReverse(flag) && myReverseComp) { // It is reverse, so reverse compliment the sequence BaseUtilities::reverseComplement(sequence); // Reverse the quality. quality.Reverse(); } else { // Ensure it is all capitalized. int seqLen = sequence.size(); for (int i = 0; i < seqLen; i++) { sequence[i] = (char)toupper(sequence[i]); } } if(myRNPlus) { ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt, sequence.c_str(), readName, readNameExt, quality.c_str()); } else { ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt, sequence.c_str(), quality.c_str()); } // Release the record. myPool.releaseRecord(&samRec); }
// main function of verifyBamID int execute(int argc, char** argv) { printf("verifyBamID %s -- verify identity and purity of sequence data\n" "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION); VerifyBamIDArgs args; ParameterList pl; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Input Files") LONG_STRINGPARAMETER("vcf",&args.sVcfFile) LONG_STRINGPARAMETER("bam",&args.sBamFile) LONG_STRINGPARAMETER("subset",&args.sSubsetInds) LONG_STRINGPARAMETER("smID",&args.sSMID) LONG_PARAMETER_GROUP("VCF analysis options") LONG_DOUBLEPARAMETER("genoError",&args.genoError) LONG_DOUBLEPARAMETER("minAF",&args.minAF) LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate) LONG_PARAMETER_GROUP("Individuals to compare with chip data") EXCLUSIVE_PARAMETER("site",&args.bSiteOnly) EXCLUSIVE_PARAMETER("self",&args.bSelfOnly) EXCLUSIVE_PARAMETER("best",&args.bFindBest) LONG_PARAMETER_GROUP("Chip-free optimization options") EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone) EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly) EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly) EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull) LONG_PARAMETER_GROUP("With-chip optimization options") EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone) EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly) EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly) EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull) LONG_PARAMETER_GROUP("BAM analysis options") LONG_PARAMETER("ignoreRG",&args.bIgnoreRG) LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair) LONG_PARAMETER("noEOF",&args.bNoEOF) LONG_PARAMETER("precise",&args.bPrecise) LONG_INTPARAMETER("minMapQ",&args.minMapQ) LONG_INTPARAMETER("maxDepth",&args.maxDepth) LONG_INTPARAMETER("minQ",&args.minQ) LONG_INTPARAMETER("maxQ",&args.maxQ) LONG_DOUBLEPARAMETER("grid",&args.grid) LONG_PARAMETER_GROUP("Modeling Reference Bias") LONG_DOUBLEPARAMETER("refRef",&args.pRefRef) LONG_DOUBLEPARAMETER("refHet",&args.pRefHet) 
LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt) LONG_PARAMETER_GROUP("Output options") LONG_STRINGPARAMETER("out",&args.sOutFile) LONG_PARAMETER("verbose",&args.bVerbose) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); pl.Add(new LongParameters("Available Options",longParameters)); pl.Read(argc, argv); pl.Status(); // check the validity of input files if ( args.sVcfFile.IsEmpty() ) { error("--vcf [vcf file] required"); } if ( args.sBamFile.IsEmpty() ) { error("--bam [bam file] is required"); } if ( args.sOutFile.IsEmpty() ) { error("--out [output prefix] is required"); } Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose); if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) { warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF"); args.bSelfOnly = true; } if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) { warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth); } if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) { error("--self must be set for --chip-refBias to work. Skipping.."); } // check timestamp time_t t; time(&t); Logger::gLogger->writeLog("Analysis started on %s",ctime(&t)); // load arguments VerifyBamID vbid(&args); // load input VCF and BAM files Logger::gLogger->writeLog("Opening Input Files"); vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str()); // Check which genotype-free method is used if ( args.bFreeNone ) { // if no genotype-free mode is tested. skip it // do nothing for genotype-free estimation Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture"); } else if ( args.bFreeMixOnly ) { // only mixture is estimated. 
// genotype-free method Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); // scan across multiple readgroups for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::mixLLK mix(&vbid); mix.OptimizeLLK(rg); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1); vbid.mixOut.llk0s[rg+1] = mix.llk0; vbid.mixOut.llk1s[rg+1] = mix.llk1; vbid.mixOut.fMixs[rg+1] = mix.fMix; } //vbid.mixRefHet = 0.5; //vbid.mixRefAlt = 0.00; } else if ( args.bFreeRefBiasOnly ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture"); for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } else if ( args.bFreeFull ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together"); for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) { VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = -3.91; // start with fMix = 0.01 startingPoint[1] = 
0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fMix = VerifyBamID::invLogit(myMinimizer.point[0]); if ( fMix > 0.5 ) fMix = 1.-fMix; double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.fMixs[rg+1] = myFunc.fMix; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } Logger::gLogger->writeLog("calculating depth distribution"); vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut); Logger::gLogger->writeLog("finished calculating depth distribution"); std::vector<int> bestInds(vbid.nRGs+1,-1); std::vector<int> selfInds(vbid.nRGs+1,-1); if ( args.bChipNone ) { // do nothing Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture"); } else if ( args.bChipMixOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; VerifyBamID::ibdLLK ibd(&vbid); for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { double fIBD = ibd.OptimizeLLK(i, rg); Logger::gLogger->writeLog("Comparing with individual %s.. 
Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; vbid.bestOut.llk0s[rg+1] = ibd.llk0; vbid.bestOut.llk1s[rg+1] = ibd.llk1; vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD; maxIBD = ibd.fIBD; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = ibd.llk0; vbid.selfOut.llk1s[rg+1] = ibd.llk1; vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD; } } if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } else if ( args.bChipRefBiasOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture"); if ( args.bSelfOnly ) { for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; 
vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(0,rg,vbid.selfOut); } } else { Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping.."); } } else if ( args.bChipFull ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together"); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = 3.91; // start with fIBD = 0.99 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myFunc.indIdx = i; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Comparing with individual %s.. 
Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg); //Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; maxIBD = fIBD; vbid.bestOut.llk0s[rg+1] = myFunc.llk0; vbid.bestOut.llk1s[rg+1] = myFunc.llk1; vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.bestOut.refHets[rg+1] = myFunc.pRefHet; vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(i, rg, vbid.selfOut); } } //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } // PRINT OUTPUT FILE - ".selfSM" // [SEQ_ID] : SAMPLE ID in the sequence file // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available) // [#SNPS] : Number of markers evaluated // [#READS] : Number of reads evaluated // [AVG_DP] : Mean depth // [FREEMIX] : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable // [FREELK1] : Chip-free log-likelihood at estimated alpha // [FREELK0] : 
Chip-free log-likelihood at 0% contamination // [CHIPIBD] : With-chip estimated alpha (% MIX in 0-1 scale) // [CHIPLK1] : With-chip log-likelihood at estimated alpha // [CHIPLK0] : With-chip log-likelihood at 0% contamination // [DPREF] : Depth at reference site in the chip // [RDPHET] : Relative depth at HET site in the chip // [RDPALT] : Relative depth at HOMALT site in the chip // [FREE_RF] : Pr(Ref|Ref) site estimated without chip data // [FREE_RH] : Pr(Ref|Het) site estimated without chip data // [FREE_RA] : Pr(Ref|Alt) site estimated without chip data // [CHIP_RF] : Pr(Ref|Ref) site estimated with chip data // [CHIP_RH] : Pr(Ref|Het) site estimated with chip data // [CHIP_RA] : Pr(Ref|Alt) site estimated with chip data // [DPREF] : Depth at reference alleles // [RDPHET] : Relative depth at heterozygous alleles // [RDPALT] : Relative depth at hom-alt alleles String selfSMFN = args.sOutFile + ".selfSM"; String bestSMFN = args.sOutFile + ".bestSM"; String selfRGFN = args.sOutFile + ".selfRG"; String bestRGFN = args.sOutFile + ".bestRG"; String dpSMFN = args.sOutFile + ".depthSM"; String dpRGFN = args.sOutFile + ".depthRG"; IFILE selfSMF = ifopen(selfSMFN,"wb"); IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL); IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb")); IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL; IFILE dpSMF = ifopen(dpSMFN,"wb"); IFILE dpRGF = (args.bIgnoreRG ? 
NULL : ifopen(dpRGFN,"wb")); if ( selfSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",selfSMF); } if ( args.bFindBest && ( bestSMF == NULL ) ) { Logger::gLogger->error("Cannot write to %s",bestSMF); } if ( dpSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",dpSMF); } ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); int nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nCumMarkers += vbid.mixOut.depths[i]; ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers); } ifclose(dpSMF); if ( dpRGF != NULL ) { ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { const char* rgID = vbid.pPile->vsRGIDs[rg].c_str(); int nMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; } nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; nCumMarkers += d; ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers); } } ifclose(dpRGF); } const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"}; int nheaders = sizeof(headers)/sizeof(headers[0]); for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfSMF,"\n"); ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? 
vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA"); ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], 
vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfSMF,"\n"); ifclose(selfSMF); if ( bestSMF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestSMF,"\n"); ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA"); ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], 
(double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestSMF,"\n"); ifclose(bestSMF); } if ( selfRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? 
vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], 
(double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfRGF,"\n"); } ifclose(selfRGF); } if ( bestRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? 
vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], 
(double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestRGF,"\n"); } ifclose(bestRGF); } time(&t); Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t)); return 0; }
int main(int argc, char ** argv) { setbuf(stdout, NULL); time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2011 Goncalo Abecasis\n"); #ifdef __VERSION__ printf("VERSION 5.0\n"); #else printf("UNDOCUMENTED RELEASE\n"); #endif int rounds = 5, states = 200, cpus = 0; bool em = false, gzip = false, phased = false; String referenceHaplotypes, referenceSnps; String haplotypes, snps; String prefix("minimac"); String firstMarker, lastMarker; String recombinationRates, errorRates; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Reference Haplotypes") LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes) LONG_STRINGPARAMETER("refSnps", &referenceSnps) LONG_PARAMETER_GROUP("Target Haplotypes") LONG_STRINGPARAMETER("haps", &haplotypes) LONG_STRINGPARAMETER("snps", &snps) LONG_PARAMETER_GROUP("Starting Parameters") LONG_STRINGPARAMETER("rec", &recombinationRates) LONG_STRINGPARAMETER("erate", &errorRates) LONG_PARAMETER_GROUP("Parameter Fitting") LONG_INTPARAMETER("rounds", &rounds) LONG_INTPARAMETER("states", &states) LONG_PARAMETER("em", &em) LONG_PARAMETER_GROUP("Output Files") LONG_STRINGPARAMETER("prefix", &prefix) LONG_PARAMETER("phased", &phased) LONG_PARAMETER("gzip", &gzip) // LONG_PARAMETER_GROUP("Clipping Window") // LONG_STRINGPARAMETER("start", &firstMarker) // LONG_STRINGPARAMETER("stop", &lastMarker) #ifdef _OPENMP LONG_PARAMETER_GROUP("Multi-Threading") LONG_INTPARAMETER("cpus", &cpus) #endif END_LONG_PARAMETERS(); ParameterList pl; pl.Add(new LongParameters("Command Line Options", longParameters)); pl.Read(argc, argv); pl.Status(); #ifdef _OPENMP if (cpus > 0) omp_set_num_threads(cpus); #endif // Read marker list printf("Reading Reference Marker List ...\n"); StringArray refMarkerList; refMarkerList.Read(referenceSnps); // Index markers StringIntHash referenceHash; for (int i = 0; i < refMarkerList.Length(); i++) referenceHash.Add(refMarkerList[i].Trim(), i); printf(" %d Markers in Reference Haplotypes...\n\n", 
refMarkerList.Length()); // Load reference haplotypes printf("Loading reference haplotypes ...\n"); HaplotypeSet reference; reference.markerCount = refMarkerList.Length(); reference.LoadHaplotypes(referenceHaplotypes); printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count); // Read framework marker list printf("Reading Framework Marker List ...\n"); StringArray markerList; markerList.Read(snps); ClipReference(reference, refMarkerList, referenceHash, markerList, firstMarker, lastMarker); // Crossref Marker Names to Reference Panel Positions IntArray markerIndex; markerIndex.Dimension(markerList.Length()); int matches = 0; for (int i = 0; i < markerList.Length(); i++) { markerIndex[i] = referenceHash.Integer(markerList[i].Trim()); if (markerIndex[i] >= 0) matches++; } printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches); if (matches == 0) error("No markers overlap between target and reference\n" "Please check correct reference is being used and markers are named consistently"); printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches); // Check for flips in reference vs. 
target haplotypes int flips = 0; int previous = -1; for (int i = 0; i < markerIndex.Length(); i++) if (markerIndex[i] >= 0) if (markerIndex[i] < previous) { if (flips++ < 10) printf(" -> Marker %s precedes %s in reference, but follows it in target\n", (const char *) refMarkerList[previous], (const char *) markerList[i]); previous = markerIndex[i]; } if (flips > 10) printf(" -> %d Additional Marker Order Changes Not Listed\n", flips - 10); if (flips) printf(" %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips); // Load target haplotypes printf("Loading target haplotypes ...\n"); HaplotypeSet target; target.markerCount = markerList.Length(); target.LoadHaplotypes(haplotypes, true); reference.CalculateFrequencies(); target.CalculateFrequencies(); target.CompareFrequencies(reference, markerIndex, markerList); printf(" %d Target Haplotypes Loaded ...\n\n", target.count); int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker); int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker); if (startIndex < 0 || stopIndex < 0) error("Clipping requested, but no position available for one of the endpoints"); printf("Setting up Markov Model...\n\n"); // Setup Markov Model MarkovParameters mp; mp.Allocate(reference.markerCount); if (rounds > 0) printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n", em ? 
"E-M" : "MCMC", states); // Simple initial estimates of error and recombination rate for (int i = 0; i < reference.markerCount; i++) mp.E[i] = 0.01; for (int i = 0; i < reference.markerCount - 1; i++) mp.R[i] = 0.001; if (mp.ReadErrorRates(errorRates)) printf(" Updated error rates using data in %s ...\n", (const char *) errorRates); if (mp.ReadCrossoverRates(recombinationRates)) printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates); // Parameter estimation loop for (int round = 0; round < rounds; round++) { printf(" Round %d of Parameter Refinement ...\n", round + 1); int iterations = states < reference.count ? states : reference.count; MarkovModel original; original.CopyParameters(mp); #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count - 1); mm.CopyParameters(original); // Reference leave one out (loo) panel char ** reference_loo = new char * [reference.count - 1]; for (int in = 0, out = 0; in < reference.count; in++) if (in != i) reference_loo[out++] = reference.haplotypes[in]; mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq); if (em) mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq); else { #pragma omp critical { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); } } delete [] reference_loo; #pragma omp critical mp += mm; } if (round >= rounds / 2) { int iterations = states < target.count ? 
states : target.count; #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.CopyParameters(original); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[i][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); if (em) mm.CountExpected(padded, reference.haplotypes, reference.freq); else { #pragma omp critical { mm.ProfileModel(padded, reference.haplotypes, reference.freq); } } delete [] padded; #pragma omp critical mp += mm; } } mp.UpdateModel(); double crossovers = 0; for (int i = 0; i < reference.markerCount - 1; i++) crossovers += mp.R[i]; double errors = 0; for (int i = 0; i < reference.markerCount; i++) { double heterozygosity = 1.0 - square(reference.freq[1][i]) - square(reference.freq[2][i]) - square(reference.freq[3][i]) - square(reference.freq[4][i]); errors += mp.E[i] * heterozygosity; } errors /= reference.markerCount + 1e-30; printf(" %.0f mosaic crossovers expected per haplotype\n", crossovers); printf(" %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.); printf(" %.3g errors in mosaic expected per marker\n", errors); } if (rounds > 0) { printf(" Saving estimated parameters for future use ...\n"); mp.WriteParameters(refMarkerList, prefix, gzip); } printf("\n"); // List the major allele at each location reference.ListMajorAlleles(); printf("Generating Draft .info File ...\n\n"); // Output some basic information IFILE info = ifopen(prefix + ".info.draft", "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n"); for (int i = 0, j = 0; i <= stopIndex; i++) if (i >= startIndex) ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n", (const char *) refMarkerList[i], 
reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), reference.freq[reference.major[i]][i], j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-"); else if (j < markerIndex.Length() && i == markerIndex[j]) j++; ifclose(info); printf("Imputing Genotypes ...\n"); IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt"); IFILE hapdose, haps; if (phased) { hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt"); haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt"); } ImputationStatistics stats(reference.markerCount); // Impute each haplotype #pragma omp parallel for for (int i = 0; i < target.count; i++) { if (i != 0 && target.labels[i] == target.labels[i-1]) continue; MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.ClearImputedDose(); mm.CopyParameters(mp); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int j = 0; j < reference.markerCount; j++) padded[j] = 0; int k = i; do { printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count); // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[k][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); mm.Impute(reference.major, padded, reference.haplotypes, reference.freq); #pragma omp critical { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); } #pragma omp critical if (phased) { ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); for (int j = startIndex; j <= stopIndex; j++) { ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]); ifprintf(haps, "%s%c", j % 8 == 0 ? 
" " : "", mm.imputedAlleles[j]); } ifprintf(hapdose, "\n"); ifprintf(haps, "\n"); } k++; } while (k < target.count && target.labels[k] == target.labels[i]); printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]); #pragma omp critical { ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]); for (int j = startIndex; j <= stopIndex; j++) ifprintf(dosages, "\t%.3f", mm.imputedDose[j]); ifprintf(dosages, "\n"); } delete [] padded; } ifclose(dosages); if (phased) { ifclose(hapdose); ifclose(haps); } // Output some basic information info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n"); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Mark genotyped SNPs in padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = 1; for (int i = startIndex; i <= stopIndex; i++) { ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t", (const char *) refMarkerList[i], reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), stats.AlleleFrequency(i), stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i), stats.AverageCallScore(i), stats.Rsq(i)); if (padded[i]) ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i), stats.LooMajorDose(i), stats.LooMinorDose(i)); else ifprintf(info, "-\t-\t-\t-\t-\t-\n"); } ifclose(info); delete [] padded; time_t stop = time(NULL); int seconds = stop - start; printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n", seconds / 3600, (seconds % 3600) / 60, seconds % 60, ctime(&stop)); }
int GapInfo::processFile(const char* inputFileName, const char* outputFileName, const char* refFile, bool detailed, bool checkFirst, bool checkStrand) { // Open the file for reading. SamFile samIn; samIn.OpenForRead(inputFileName); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); SamRecord samRecord; GenomeSequence* refPtr = NULL; if(strcmp(refFile, "") != 0) { refPtr = new GenomeSequence(refFile); } IFILE outFile = ifopen(outputFileName, "w"); // Map for summary. std::map<int, int> gapInfoMap; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { uint16_t samFlags = samRecord.getFlag(); if((!SamFlag::isMapped(samFlags)) || (!SamFlag::isMateMapped(samFlags)) || (!SamFlag::isPaired(samFlags)) || (samFlags & SamFlag::SECONDARY_ALIGNMENT) || (SamFlag::isDuplicate(samFlags)) || (SamFlag::isQCFailure(samFlags))) { // unmapped, mate unmapped, not paired, // not the primary alignment, // duplicate, fails vendor quality check continue; } // No gap info if the chromosome names are different or // are unknown. int32_t refID = samRecord.getReferenceID(); if((refID != samRecord.getMateReferenceID()) || (refID == -1)) { continue; } int32_t readStart = samRecord.get0BasedPosition(); int32_t mateStart = samRecord.get0BasedMatePosition(); // If the mate starts first, then the pair was processed by // the mate. if(mateStart < readStart) { continue; } if((mateStart == readStart) && (SamFlag::isReverse(samFlags))) { // read and mate start at the same position, so // only process the forward strand. continue; } // Process this read pair. int32_t readEnd = samRecord.get0BasedAlignmentEnd(); int32_t gapSize = mateStart - readEnd - 1; if(detailed) { // Output the gap info. ifprintf(outFile, "%s\t%d\t%d", samRecord.getReferenceName(), readEnd+1, gapSize); // Check if it is not the first or if it is not the forward strand. 
if(checkFirst && !SamFlag::isFirstFragment(samFlags)) { ifprintf(outFile, "\tNotFirst"); } if(checkStrand && SamFlag::isReverse(samFlags)) { ifprintf(outFile, "\tReverse"); } ifprintf(outFile, "\n"); } else { // Summary. // Skip reads that are not the forward strand. if(SamFlag::isReverse(samFlags)) { // continue continue; } // Forward. // Check the reference for 'N's. if(refPtr != NULL) { genomeIndex_t chromStartIndex = refPtr->getGenomePosition(samRecord.getReferenceName()); if(chromStartIndex == INVALID_GENOME_INDEX) { // Invalid position, so continue to the next one. continue; } bool skipRead = false; for(int i = readEnd + 1; i < mateStart; i++) { if((*refPtr)[i] == 'N') { // 'N' in the reference, so continue to the next read. skipRead = true; break; } } if(skipRead) { continue; } } // Update the gapInfo. gapInfoMap[gapSize]++; } } if(!detailed) { // Output the summary. ifprintf(outFile, "GapSize\tNumPairs\n"); for(std::map<int,int>::iterator iter = gapInfoMap.begin(); iter != gapInfoMap.end(); iter++) { ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second); } } SamStatus::Status returnStatus = samIn.GetStatus(); if(returnStatus == SamStatus::NO_MORE_RECS) { return(SamStatus::SUCCESS); } return(returnStatus); }