int main(int argc, char ** argv) { printf("glfMultiples -- SNP calls based on .glf or .glz files\n"); printf("(c) 2008-2011 Goncalo Abecasis, Sebastian Zoellner, Yun Li\n\n"); String pedfile; String positionfile; String callfile; String glfAliases; String glfPrefix; String glfSuffix; ParameterList pl; double posterior = 0.50; int mapQuality = 0; int minTotalDepth = 1; int maxTotalDepth = INT_MAX; bool verbose = false; bool mapQualityStrict = false; bool hardFilter = false; bool smartFilter = false; bool softFilter = true; bool robustPrior = true; bool uniformPrior = false; String xLabel("X"), yLabel("Y"), mitoLabel("MT"); int xStart = 2699520, xStop = 154931044; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Pedigree File") LONG_STRINGPARAMETER("ped", &pedfile) LONG_PARAMETER_GROUP("Map Quality Filter") LONG_INTPARAMETER("minMapQuality", &mapQuality) LONG_PARAMETER("strict", &mapQualityStrict) LONG_PARAMETER_GROUP("Total Depth Filter") LONG_INTPARAMETER("minDepth", &minTotalDepth) LONG_INTPARAMETER("maxDepth", &maxTotalDepth) LONG_PARAMETER_GROUP("Position Filter") LONG_STRINGPARAMETER("positionFile", &positionfile) LONG_PARAMETER_GROUP("Chromosome Labels") LONG_STRINGPARAMETER("xChr", &xLabel) LONG_STRINGPARAMETER("yChr", &yLabel) LONG_STRINGPARAMETER("mito", &mitoLabel) LONG_INTPARAMETER("xStart", &xStart) LONG_INTPARAMETER("xStop", &xStop) LONG_PARAMETER_GROUP("Filtering Options") EXCLUSIVE_PARAMETER("hardFilter", &hardFilter) EXCLUSIVE_PARAMETER("smartFilter", &smartFilter) EXCLUSIVE_PARAMETER("softFilter", &softFilter) LONG_PARAMETER_GROUP("Prior Options") EXCLUSIVE_PARAMETER("uniformPrior", &uniformPrior) EXCLUSIVE_PARAMETER("robustPrior", &robustPrior) LONG_PARAMETER_GROUP("Output") LONG_PARAMETER("verbose", &verbose) LONG_PARAMETER_GROUP("Sample Names") LONG_STRINGPARAMETER("glfAliases", &glfAliases) LONG_PARAMETER_GROUP("Prefixes and Suffixes") LONG_STRINGPARAMETER("glfPrefix",&glfPrefix) LONG_STRINGPARAMETER("glfSuffix",&glfSuffix) END_LONG_PARAMETERS(); pl.Add(new StringParameter('b', "Base Call File", callfile)); pl.Add(new DoubleParameter('p', "Posterior Threshold", posterior)); pl.Add(new LongParameters("Additional Options", longParameters)); int argstart = pl.ReadWithTrailer(argc, argv) + 1; pl.Status(); if (posterior < 0.50) error("Posterior threshold for genotype calls (-p option) must be > 0.50."); time_t t; time(&t); printf("Analysis started on %s\n", ctime(&t)); fflush(stdout); int n = argc - argstart; argv += argstart; Pedigree ped; if (!pedfile.IsEmpty()) { ped.pd.AddStringColumn("glfFile"); ped.Load(pedfile); n = ped.count; } else if (n == 0) error("No pedigree file present and no glf files listed at the end of command line\n"); // Prior for finding difference from the reference at a particular site //BgzfFileType::setRequireEofBlock(false); double prior = 0.0; for (int i = 1; i <= 2 * n; i++) prior += 1.0 / i; prior *= 0.001; glfHandler * glf = new glfHandler[n]; bool firstGlf = n; if (ped.count) { bool warn = false; for (int i = n - 1; i > 0; i++) { if (!glf[i].Open(ped[i].strings[0])) { printf("Failed to open genotype likelihood file [%s] for individual %s:%s\n", (const char *) ped[i].strings[0], (const char *) ped[i].famid, (const char *) ped[i].pid); glf[i].OpenStub(); firstGlf = i; } if (warn) printf("\n"); if (firstGlf == n) error("No genotype likelihood files could be opened"); } } else { for (int i = firstGlf = 0; i < n; i++) { String glfName = glfPrefix + String(argv[i]) + glfSuffix; if (!glf[i].Open(glfName)) error("Failed to open genotype likelihood file [%s]\n", glfName.c_str()); } } StringAlias aliases; aliases.ReadFromFile(glfAliases); printf("Calling genotypes for files ...\n"); for (int i = 0; i < n; i++) printf("%s\n", ped.count ? (const char *) ped[i].strings[0] : argv[i]); printf("\n"); baseCalls = fopen(callfile, "wt"); if (baseCalls != NULL) { fprintf(baseCalls, "##fileformat=VCFv4.0\n"); ReportDate(baseCalls); fprintf(baseCalls, "##source=glfMultiples\n"); fprintf(baseCalls, "##minDepth=%d\n", minTotalDepth); fprintf(baseCalls, "##maxDepth=%d\n", maxTotalDepth); fprintf(baseCalls, "##minMapQuality=%d\n", mapQuality); fprintf(baseCalls, "##minPosterior=%.4f\n", posterior); fprintf(baseCalls, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"); fprintf(baseCalls, "##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root Mean Squared Mapping Quality\">\n"); fprintf(baseCalls, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with coverage\">\n"); fprintf(baseCalls, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles (with coverage)\">\n"); fprintf(baseCalls, "##INFO=<ID=AC,Number=.,Type=Integer,Description=\"Alternative allele count (with coverage)\">\n"); fprintf(baseCalls, "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Alternate allele frequency\">\n"); fprintf(baseCalls, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Estimated allele balance between the alleles\">\n"); if ( mapQuality > 0 ) { fprintf(baseCalls, "##FILTER=<ID=mq%d,Description=\"Mapping Quality less than %d\">\n",mapQuality,mapQuality); } if ( minTotalDepth > 1 ) { fprintf(baseCalls, "##FILTER=<ID=dp%d,Description=\"Total Read Depth less than %d\">\n",minTotalDepth,minTotalDepth); } if ( minTotalDepth < INT_MAX ) { fprintf(baseCalls, "##FILTER=<ID=DP%d,Description=\"Total Read Depth greater than %d\">\n",maxTotalDepth,maxTotalDepth); } fprintf(baseCalls, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Most Likely Genotype\">\n"); fprintf(baseCalls, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Call Quality\">\n"); fprintf(baseCalls, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"); fprintf(baseCalls, "##FORMAT=<ID=PL,Number=3,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1\">\n"); fprintf(baseCalls, "##FORMAT=<ID=PL3,Number=6,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1,0/2,1/2,2/2\">\n"); fprintf(baseCalls, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (int i = 0; i < n; i++) fprintf(baseCalls, "\t%s", ped.count ? (const char *) (ped[i].famid + ":" + ped[i].pid) : (const char *) aliases.GetAlias(argv[i])); fprintf(baseCalls, "\n"); } StringArray buffer, tokens; StringHash positions; buffer.Read(positionfile); for (int i = 0; i < buffer.Length(); i++) { tokens.ReplaceTokens(buffer[i], " \t:"); if (tokens.Length() != 2) continue; positions.Add(tokens[0] + ":" + (int(tokens[1].AsInteger() - 1))); } int chromosomeType = 0; while (glf[firstGlf].NextSection()) { for (int i = firstGlf + 1; i < n; i++) { if (glf[i].isStub) continue; glf[i].NextSection(); if (glf[firstGlf].maxPosition != glf[i].maxPosition || glf[firstGlf].label != glf[i].label) { error("Genotype files '%s' and '%s' are not compatible ...\n" " File '%s' has section %s with %d entries ...\n" " File '%s' section %s with %d entries ...\n", ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf], ped.count ? (const char *) ped[i].strings[0] : argv[i], ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf], (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition, ped.count ? (const char *) ped[i].strings[0] : argv[i], (const char *) glf[i].label, glf[i].maxPosition); } } chromosomeType = CT_AUTOSOME; if (ped.count) { if (glf[firstGlf].label == xLabel) chromosomeType = CT_CHRX; if (glf[firstGlf].label == yLabel) chromosomeType = CT_CHRY; if (glf[firstGlf].label == mitoLabel) chromosomeType = CT_MITO; } printf("Processing section %s with %d entries\n", (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition); int refBase = 0; int position = 0; int mapQualityFilter = 0; int depthFilter = 0; int homozygousReference = 0; int transitions = 0; int transversions = 0; int otherPolymorphisms = 0; int sinkFilter = 0; int smartFilterHits = 0; int baseCounts[5] = {0, 0, 0, 0, 0}; String filter; while (true) { if (position > 0) { // Check whether we have reached the end of the current chromosome bool done = true; for (int i = 0; i < n; i++) if (glf[i].data.recordType != 0) done = false; if (done) break; } // Advance to the next position where needed for (int i = 0; i < n; i++) if (glf[i].position == position) glf[i].NextBaseEntry(); // Figure out the current analysis position refBase = glf[0].data.refBase; position = glf[0].position; for (int i = 1; i < n; i++) if (position > glf[i].position) { position = glf[i].position; refBase = glf[i].data.refBase; } // Avoid alignments that extend past the end of the chromosome if (position >= glf[firstGlf].maxPosition) break; baseCounts[(int)refBase]++; // These lines can be uncommented for debugging purposes // for (int i = 0; i < n; i++) // printf("GLF %d : position %d, refBase %d\n", i, position, refBase); // printf("Position: %d, refBase: %d\n", position, refBase); if (positions.Entries()) { filter = glf[firstGlf].label + ":" + position; if (positions.Find(filter) < 0) continue; } if (refBase == 0) continue; // Corrected calculation of root-mean-square Map Quality score // and check if we have at least one sample with good quality data int currentDepth = 0, totalDepth = 0, numCovered = 0; double currentQuality = 0.0, averageMapQuality = 0.0; bool passMapQualityFilter = false; for (int i = 0; i < n; i++) { currentDepth = glf[i].GetDepth(position); if (currentDepth != 0) { totalDepth += currentDepth; numCovered++; // not currently used -- will be "NS" currentQuality = glf[i].GetMapQuality(position); averageMapQuality += currentDepth * currentQuality * currentQuality; if (currentQuality >= mapQuality) passMapQualityFilter = true; } } averageMapQuality = sqrt(averageMapQuality / totalDepth); filter.Clear(); if (!passMapQualityFilter) { if (filter.Length() == 0) mapQualityFilter++; if (hardFilter) continue; filter.catprintf("%smq%d", filter.Length() ? ";" : "", mapQuality); } if (totalDepth < minTotalDepth) { if (filter.Length() == 0) depthFilter++; if (hardFilter) continue; filter.catprintf("%sdp%d", filter.Length() ? ";" : "", minTotalDepth); } if (totalDepth > maxTotalDepth) { if (filter.Length() == 0) depthFilter++; if (hardFilter) continue; filter.catprintf("%sDP%d", filter.Length() ? ";" : "", maxTotalDepth); } // Create convenient aliases for each base unsigned char transition = (((refBase - 1) ^ 2) + 1); unsigned char transvers1 = (((refBase - 1) ^ 3) + 1); unsigned char transvers2 = (((refBase - 1) ^ 1) + 1); int homRef = glf[0].GenotypeIndex(refBase, refBase); // Calculate likelihood assuming every is homozygous for the reference double lRef = log(1.0 - prior); for (int i = 0; i < n; i++) lRef += log(glf[i].GetLikelihoods(position)[homRef]); // Calculate maximum likelihood for a variant if (smartFilter) { double anyVariant = log(prior) + FilteringLikelihood(glf, n, position, refBase); if (exp(lRef - anyVariant) > (1.0 - posterior)/posterior) { smartFilterHits++; continue; } } //fprintf(stderr,"position = %d\n",position); double pTs = uniformPrior ? 1./3. : 2./3.; double pTv = uniformPrior ? 1./3. : 1./6.; // Calculate likelihoods for the most likelily SNP configurations double refTransition = log(prior * pTs) + PolymorphismLikelihood(glf, n, position, refBase, transition); double refTransvers1 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers1); double refTransvers2 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers2); // Calculate likelihoods for less likely SNP configurations double transitiontv1 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers1); double transitiontv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers2); double transvers1tv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transvers1, transvers2); // Calculate the likelihood for unusual configurations where everyone is heterozygous ... double sink = n > 10 ? log(prior * 1e-8) + SinkLikelihood(glf, n, position) : -1e100; double lmax = max( max(max(lRef, refTransition),max(refTransvers1, refTransvers2)), max(max(transitiontv1, transitiontv2), max(transvers1tv2, sink))); double sum = exp(lRef - lmax) + exp(refTransition -lmax) + exp(refTransvers1 - lmax) + exp(refTransvers2 - lmax) + exp(transitiontv1 - lmax) + exp(transitiontv2 - lmax) + exp(transvers1tv2 - lmax) + exp(sink - lmax); if (sum == 0.0) continue; if (exp(lRef - lmax)/sum > 1.0 - prior) { if (filter.Length() == 0) homozygousReference++; if (positions.Entries()) ReportSNP(glf, n, position, refBase, refBase, refBase, filter, totalDepth, averageMapQuality, lRef / sum); continue; } double quality = 1.0 - exp(lRef - lmax) / sum; if (verbose) { DumpDetails(glf, n, position, refBase); printf("%.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", lRef, refTransition, refTransvers1, refTransvers2, transitiontv1, transitiontv2, transvers1tv2); } if (exp(refTransition - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transition, filter, totalDepth, averageMapQuality, quality /* refTransition/sum */); if (filter.Length() == 0) transitions++; } else if (exp(refTransvers1 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transvers1, filter, totalDepth, averageMapQuality, quality /* refTransvers1/sum */); if (filter.Length() == 0) transversions++; } else if (exp(refTransvers2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transvers2, filter, totalDepth, averageMapQuality, quality /* refTransvers2/sum */); if (filter.Length() == 0) transversions++; } else if (exp(transitiontv1 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transition, transvers1, filter, totalDepth, averageMapQuality, quality /* transitiontv1/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(transitiontv2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transition, transvers2, filter, totalDepth, averageMapQuality, quality /* transitiontv2/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(transvers1tv2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transvers1, transvers2, filter, totalDepth, averageMapQuality, quality /* transvers1tv2/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(sink - lmax)/sum > posterior) sinkFilter++; } int actualBases = glf[firstGlf].maxPosition - baseCounts[0]; printf(" Missing bases = %9d (%.3f%%)\n", baseCounts[0], baseCounts[0] * 100. / glf[firstGlf].maxPosition); printf(" Reference bases = %9d (%.3f%%)\n", glf[firstGlf].maxPosition - baseCounts[0], (glf[firstGlf].maxPosition - baseCounts[0]) * 100. / glf[firstGlf].maxPosition); printf(" A/T bases = %9d (%.3f%%, %d A, %d T)\n", baseCounts[1] + baseCounts[4], (baseCounts[1] + baseCounts[4]) * 100. / actualBases, baseCounts[1], baseCounts[4]); printf(" G/C bases = %9d (%.3f%%, %d G, %d C)\n", baseCounts[3] + baseCounts[2], (baseCounts[3] + baseCounts[2]) * 100. / actualBases, baseCounts[3], baseCounts[2]); printf(" Depth Filter = %9d bases (%.3f%%)\n", depthFilter, depthFilter * 100. / actualBases); printf(" Map Quality Filter = %9d bases (%.3f%%)\n", mapQualityFilter, mapQualityFilter * 100. / actualBases); printf(" Non-Polymorphic = %9d bases (%.3f%%)\n", homozygousReference, homozygousReference * 100. / actualBases); printf(" Transitions = %9d bases (%.3f%%)\n", transitions, transitions * 100. / actualBases); printf(" Transversions = %9d bases (%.3f%%)\n", transversions, transversions * 100. / actualBases); printf(" Other Polymorphisms = %9d bases (%.3f%%)\n", otherPolymorphisms, otherPolymorphisms * 100. / actualBases); if (n > 10) printf(" Homology Sink = %9d bases (%.3f%%)\n", sinkFilter, sinkFilter * 100. / actualBases); if (smartFilter) printf(" Smart Filter = %9d bases (%.3f%%)\n", smartFilterHits, smartFilterHits * 100. / actualBases); int noCalls = actualBases - homozygousReference - transitions - transversions - otherPolymorphisms - sinkFilter; printf(" No call = %9d bases (%.3f%%)\n", noCalls, noCalls * 100. / actualBases); fflush(stdout); } if (baseCalls != NULL) fclose(baseCalls); time(&t); printf("\nAnalysis completed on %s\n", ctime(&t)); fflush(stdout); }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
// main function of verifyBamID int execute(int argc, char** argv) { printf("verifyBamID %s -- verify identity and purity of sequence data\n" "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION); VerifyBamIDArgs args; ParameterList pl; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Input Files") LONG_STRINGPARAMETER("vcf",&args.sVcfFile) LONG_STRINGPARAMETER("bam",&args.sBamFile) LONG_STRINGPARAMETER("subset",&args.sSubsetInds) LONG_STRINGPARAMETER("smID",&args.sSMID) LONG_PARAMETER_GROUP("VCF analysis options") LONG_DOUBLEPARAMETER("genoError",&args.genoError) LONG_DOUBLEPARAMETER("minAF",&args.minAF) LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate) LONG_PARAMETER_GROUP("Individuals to compare with chip data") EXCLUSIVE_PARAMETER("site",&args.bSiteOnly) EXCLUSIVE_PARAMETER("self",&args.bSelfOnly) EXCLUSIVE_PARAMETER("best",&args.bFindBest) LONG_PARAMETER_GROUP("Chip-free optimization options") EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone) EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly) EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly) EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull) LONG_PARAMETER_GROUP("With-chip optimization options") EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone) EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly) EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly) EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull) LONG_PARAMETER_GROUP("BAM analysis options") LONG_PARAMETER("ignoreRG",&args.bIgnoreRG) LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair) LONG_PARAMETER("noEOF",&args.bNoEOF) LONG_PARAMETER("precise",&args.bPrecise) LONG_INTPARAMETER("minMapQ",&args.minMapQ) LONG_INTPARAMETER("maxDepth",&args.maxDepth) LONG_INTPARAMETER("minQ",&args.minQ) LONG_INTPARAMETER("maxQ",&args.maxQ) LONG_DOUBLEPARAMETER("grid",&args.grid) LONG_PARAMETER_GROUP("Modeling Reference Bias") LONG_DOUBLEPARAMETER("refRef",&args.pRefRef) LONG_DOUBLEPARAMETER("refHet",&args.pRefHet) LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt) LONG_PARAMETER_GROUP("Output options") LONG_STRINGPARAMETER("out",&args.sOutFile) LONG_PARAMETER("verbose",&args.bVerbose) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); pl.Add(new LongParameters("Available Options",longParameters)); pl.Read(argc, argv); pl.Status(); // check the validity of input files if ( args.sVcfFile.IsEmpty() ) { error("--vcf [vcf file] required"); } if ( args.sBamFile.IsEmpty() ) { error("--bam [bam file] is required"); } if ( args.sOutFile.IsEmpty() ) { error("--out [output prefix] is required"); } Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose); if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) { warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF"); args.bSelfOnly = true; } if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) { warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth); } if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) { error("--self must be set for --chip-refBias to work. Skipping.."); } // check timestamp time_t t; time(&t); Logger::gLogger->writeLog("Analysis started on %s",ctime(&t)); // load arguments VerifyBamID vbid(&args); // load input VCF and BAM files Logger::gLogger->writeLog("Opening Input Files"); vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str()); // Check which genotype-free method is used if ( args.bFreeNone ) { // if no genotype-free mode is tested. skip it // do nothing for genotype-free estimation Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture"); } else if ( args.bFreeMixOnly ) { // only mixture is estimated. // genotype-free method Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); // scan across multiple readgroups for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::mixLLK mix(&vbid); mix.OptimizeLLK(rg); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1); vbid.mixOut.llk0s[rg+1] = mix.llk0; vbid.mixOut.llk1s[rg+1] = mix.llk1; vbid.mixOut.fMixs[rg+1] = mix.fMix; } //vbid.mixRefHet = 0.5; //vbid.mixRefAlt = 0.00; } else if ( args.bFreeRefBiasOnly ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture"); for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } else if ( args.bFreeFull ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together"); for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) { VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = -3.91; // start with fMix = 0.01 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fMix = VerifyBamID::invLogit(myMinimizer.point[0]); if ( fMix > 0.5 ) fMix = 1.-fMix; double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.fMixs[rg+1] = myFunc.fMix; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } Logger::gLogger->writeLog("calculating depth distribution"); vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut); Logger::gLogger->writeLog("finished calculating depth distribution"); std::vector<int> bestInds(vbid.nRGs+1,-1); std::vector<int> selfInds(vbid.nRGs+1,-1); if ( args.bChipNone ) { // do nothing Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture"); } else if ( args.bChipMixOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; VerifyBamID::ibdLLK ibd(&vbid); for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { double fIBD = ibd.OptimizeLLK(i, rg); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; vbid.bestOut.llk0s[rg+1] = ibd.llk0; vbid.bestOut.llk1s[rg+1] = ibd.llk1; vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD; maxIBD = ibd.fIBD; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = ibd.llk0; vbid.selfOut.llk1s[rg+1] = ibd.llk1; vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD; } } if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } else if ( args.bChipRefBiasOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture"); if ( args.bSelfOnly ) { for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(0,rg,vbid.selfOut); } } else { Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping.."); } } else if ( args.bChipFull ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together"); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = 3.91; // start with fIBD = 0.99 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myFunc.indIdx = i; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg); //Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; maxIBD = fIBD; vbid.bestOut.llk0s[rg+1] = myFunc.llk0; vbid.bestOut.llk1s[rg+1] = myFunc.llk1; vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.bestOut.refHets[rg+1] = myFunc.pRefHet; vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(i, rg, vbid.selfOut); } } //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } // PRINT OUTPUT FILE - ".selfSM" // [SEQ_ID] : SAMPLE ID in the sequence file // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available) // [#SNPS] : Number of markers evaluated // [#READS] : Number of reads evaluated // [AVG_DP] : Mean depth // [FREEMIX] : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable // [FREELK1] : Chip-free log-likelihood at estimated alpha // [FREELK0] : Chip-free log-likelihood at 0% contamination // [CHIPIBD] : With-chip estimated alpha (% MIX in 0-1 scale) // [CHIPLK1] : With-chip log-likelihood at estimated alpha // [CHIPLK0] : With-chip log-likelihood at 0% contamination // [DPREF] : Depth at reference site in the chip // [RDPHET] : Relative depth at HET site in the chip // [RDPALT] : Relative depth at HOMALT site in the chip // [FREE_RF] : Pr(Ref|Ref) site estimated without chip data // [FREE_RH] : Pr(Ref|Het) site estimated without chip data // [FREE_RA] : Pr(Ref|Alt) site estimated without chip data // [CHIP_RF] : Pr(Ref|Ref) site estimated with chip data // [CHIP_RH] : Pr(Ref|Het) site estimated with chip data // [CHIP_RA] : Pr(Ref|Alt) site estimated with chip data // [DPREF] : Depth at reference alleles // [RDPHET] : Relative depth at heterozygous alleles // [RDPALT] : Relative depth at hom-alt alleles String selfSMFN = args.sOutFile + ".selfSM"; String bestSMFN = args.sOutFile + ".bestSM"; String selfRGFN = args.sOutFile + ".selfRG"; String bestRGFN = args.sOutFile + ".bestRG"; String dpSMFN = args.sOutFile + ".depthSM"; String dpRGFN = args.sOutFile + ".depthRG"; IFILE selfSMF = ifopen(selfSMFN,"wb"); IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL); IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb")); IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL; IFILE dpSMF = ifopen(dpSMFN,"wb"); IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb")); if ( selfSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",selfSMF); } if ( args.bFindBest && ( bestSMF == NULL ) ) { Logger::gLogger->error("Cannot write to %s",bestSMF); } if ( dpSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",dpSMF); } ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); int nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nCumMarkers += vbid.mixOut.depths[i]; ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers); } ifclose(dpSMF); if ( dpRGF != NULL ) { ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { const char* rgID = vbid.pPile->vsRGIDs[rg].c_str(); int nMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; } nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; nCumMarkers += d; ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers); } } ifclose(dpRGF); } const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"}; int nheaders = sizeof(headers)/sizeof(headers[0]); for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfSMF,"\n"); ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA"); ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfSMF,"\n"); ifclose(selfSMF); if ( bestSMF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestSMF,"\n"); ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA"); ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestSMF,"\n"); ifclose(bestSMF); } if ( selfRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfRGF,"\n"); } ifclose(selfRGF); } if ( bestRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestRGF,"\n"); } ifclose(bestRGF); } time(&t); Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t)); return 0; }
int Validate::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; int maxErrors = -1; int printableErrors = 100; bool so_flag = false; bool so_coord = false; bool so_query = false; bool noeof = false; bool disableStatistics = false; bool verbose = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER("noeof", &noeof) LONG_INTPARAMETER("maxErrors", &maxErrors) LONG_PARAMETER("verbose", &verbose) LONG_INTPARAMETER("printableErrors", &printableErrors) LONG_PARAMETER("disableStatistics", &disableStatistics) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SortOrder") EXCLUSIVE_PARAMETER("so_flag", &so_flag) EXCLUSIVE_PARAMETER("so_coord", &so_coord) EXCLUSIVE_PARAMETER("so_query", &so_query) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Determine the sort type for validation based on the parameters. SamFile::SortedType sortType = SamFile::UNSORTED; if(so_flag) { sortType = SamFile::FLAG; } else if(so_coord) { sortType = SamFile::COORDINATE; } else if(so_query) { sortType = SamFile::QUERY_NAME; } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for validate, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Since we want to accumulate multiple errors, use RETURN rather // than throwing exceptions. SamFile samIn(ErrorHandler::RETURN); // Open the file for reading. if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Set the sorting validation type. samIn.setSortedValidation(sortType); // Set that statistics should be generated. samIn.GenerateStatistics(!disableStatistics); // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Read the sam records. SamRecord samRecord(ErrorHandler::RETURN); // Track the status. SamStatus::Status status = SamStatus::SUCCESS; // Keep reading records until the end of the file is reached. int numValidRecords = 0; int numInvalidRecords = 0; int numErrorRecords = 0; int numRecords = 0; int numReportedErrors = 0; int totalErrorRecords = 0; std::map<SamStatus::Status, uint64_t> errorStats; std::map<SamValidationError::Type, uint64_t> invalidStats; SamValidationErrors invalidSamErrors; // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while( ( (maxErrors < 0) || (totalErrorRecords < maxErrors) ) && ( (samIn.ReadRecord(samHeader, samRecord)) || (SamStatus::isContinuableStatus(samIn.GetStatus())) ) ) { ++numRecords; if(samIn.GetStatus() == SamStatus::SUCCESS) { // Successfully set the record, so check to see if it is valid. // Clear any errors in the list. invalidSamErrors.clear(); if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors)) { // The record is not valid. ++numInvalidRecords; ++totalErrorRecords; if(verbose && (numReportedErrors < printableErrors)) { std::cerr << "Record " << numRecords << std::endl << invalidSamErrors << std::endl; ++numReportedErrors; } // Update the statistics for all validation errors found in this record. invalidSamErrors.resetErrorIter(); const SamValidationError* errorPtr = invalidSamErrors.getNextError(); while(errorPtr != NULL) { ++invalidStats[errorPtr->getType()]; errorPtr = invalidSamErrors.getNextError(); } // If the status is not yet set, set it. if(status == SamStatus::SUCCESS) { status = SamStatus::INVALID; } } else { // Valid record, so increment the counter. ++numValidRecords; } } else { // Error reading the record. ++numErrorRecords; ++totalErrorRecords; if(verbose && (numReportedErrors < printableErrors)) { // report error. std::cerr << "Record " << numRecords << std::endl << samIn.GetStatusMessage() << std::endl << std::endl; ++numReportedErrors; } // Increment the statistics ++errorStats[samIn.GetStatus()]; // If the status is not yet set, set it. if(status == SamStatus::SUCCESS) { status = samIn.GetStatus(); } } } if( (samIn.GetStatus() != SamStatus::NO_MORE_RECS) && (totalErrorRecords < maxErrors) ) { // The last read call had a failure, so report it. // If the number of errors is >= ,maxErrors we don't // want to print any more failures. ++numErrorRecords; ++totalErrorRecords; if(numReportedErrors < printableErrors) { std::cerr << "Record " << numRecords << ": "; std::cerr << std::endl << samIn.GetStatusMessage() << std::endl; } // Increment the statistics ++errorStats[samIn.GetStatus()]; if(status == SamStatus::SUCCESS) { status = samIn.GetStatus(); } } if(totalErrorRecords == maxErrors) { if(maxErrors == 0) { std::cerr << "WARNING file was not read at all due to maxErrors setting, but returning Success.\n"; } else { // Print a note that the entire file was not read. std::cerr << "File was not completely read due to the number of errors.\n"; std::cerr << "Statistics only reflect the part of the file that was read.\n"; } } fprintf(stderr, "\nNumber of records read = %d\n", numRecords); fprintf(stderr, "Number of valid records = %d\n", numValidRecords); std::cerr << std::endl; if(numRecords != numValidRecords) { std::cerr << "Error Counts:\n"; // Loop through the non-validation errors. std::map<SamStatus::Status, uint64_t>::iterator statusIter; for(statusIter = errorStats.begin(); statusIter != errorStats.end(); statusIter++) { std::cerr << "\t" << SamStatus::getStatusString(statusIter->first) << ": " << statusIter->second << std::endl; } std::map<SamValidationError::Type, uint64_t>::iterator invalidIter; for(invalidIter = invalidStats.begin(); invalidIter != invalidStats.end(); invalidIter++) { std::cerr << "\t" << SamValidationError::getTypeString(invalidIter->first) << ": " << invalidIter->second << std::endl; } std::cerr << std::endl; } samIn.PrintStatistics(); fprintf(stderr, "Returning: %d (%s)\n", status, SamStatus::getStatusString(status)); return(status); }
int main(int argc, char ** argv) { ParameterList inputParameters; String filename; int minReadLength = 10; int printableErrors = 20; int maxErrors = -1; String testParam; BaseAsciiMap::SPACE_TYPE myBaseType = BaseAsciiMap::UNKNOWN; // Read the parameters from the command line. bool baseSpace = false; bool colorSpace = false; bool autoDetect = false; bool ignoreErrors = false; bool baseComposition = false; bool avgQual = false; bool quiet = false; bool noeof = false; bool params = false; bool disableSeqIDCheck = false; bool interleaved = false; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("file", &filename) LONG_PARAMETER("baseComposition", &baseComposition) LONG_PARAMETER("avgQual", &avgQual) LONG_PARAMETER("disableSeqIDCheck", &disableSeqIDCheck) LONG_PARAMETER("interleaved", &interleaved) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("quiet", &quiet) LONG_PARAMETER("params", ¶ms) LONG_INTPARAMETER("minReadLen", &minReadLength) LONG_INTPARAMETER("maxErrors", &maxErrors) LONG_PARAMETER_GROUP("Space Type") EXCLUSIVE_PARAMETER("baseSpace", &baseSpace) EXCLUSIVE_PARAMETER("colorSpace", &colorSpace) EXCLUSIVE_PARAMETER("auto", &autoDetect) LONG_PARAMETER_GROUP("Errors") EXCLUSIVE_PARAMETER("ignoreErrors", &ignoreErrors) LONG_SMARTINTPARAMETER("printableErrors", &printableErrors) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER("printBaseComp", &baseComposition) LONG_PARAMETER("disableAllMessages", &quiet) LONG_INTPARAMETER("quitAfterErrorNum", &maxErrors) LONG_PARAMETER_GROUP("Space Type") EXCLUSIVE_PARAMETER("baseSpace", &baseSpace) EXCLUSIVE_PARAMETER("colorSpace", &colorSpace) EXCLUSIVE_PARAMETER("autoDetect", &autoDetect) LONG_PARAMETER_GROUP("Errors") EXCLUSIVE_PARAMETER("ignoreAllErrors", &ignoreErrors) LONG_SMARTINTPARAMETER("maxReportedErrors", &printableErrors) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc, argv); if(ignoreErrors) { // Ignore all errors, so set printableErrors to 0. printableErrors = 0; } // Set the base type based on the passed in parameters. if(baseSpace) { // Base Space myBaseType = BaseAsciiMap::BASE_SPACE; } else if(colorSpace) { myBaseType = BaseAsciiMap::COLOR_SPACE; } else { myBaseType = BaseAsciiMap::UNKNOWN; // Set autoDetect autoDetect = true; } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // DO not print status if set to quiet. if((!quiet) && params) { inputParameters.Status(); } if(filename == "") { if(quiet) { return(-1); } // No filename was specified so print a usage description. std::cout << "ERROR: No filename specified. See below for usage help."; std::cout << std::endl << std::endl; std::cout << " Required Parameters:" << std::endl; std::cout << "\t--file : FastQ filename with path to be prorcessed.\n"; std::cout << std::endl; std::cout << " Optional Parameters:" << std::endl; std::cout << "\t--minReadLen : Minimum allowed read length (Defaults to 10).\n"; std::cout << "\t--maxErrors : Number of errors to allow before quitting\n"; std::cout << "\t reading/validating the file.\n"; std::cout << "\t -1 (default) indicates to not quit until\n"; std::cout << "\t the entire file is read.\n"; std::cout << "\t 0 indicates not to read/validate anything\n"; std::cout << "\t--printableErrors : Maximum number of errors to print before\n"; std::cout << "\t suppressing them (Defaults to 20).\n"; std::cout << "\t Different than maxErrors since \n"; std::cout << "\t printableErrors will continue reading and\n"; std::cout << "\t validating the file until the end, but\n"; std::cout << "\t just doesn't print the errors.\n"; std::cout << "\t--ignoreErrors : Ignore all errors (same as printableErrors = 0)\n"; std::cout << "\t overwrites the printableErrors option.\n"; std::cout << "\t--baseComposition : Print the Base Composition Statistics.\n"; std::cout << "\t--avgQual : Print the average phred quality per cycle & overall average quality.\n"; std::cout << "\t--disableSeqIDCheck : Disable the unique sequence identifier check.\n"; std::cout << "\t Use this option to save memory since the sequence id\n"; std::cout << "\t check uses a lot of memory.\n"; std::cout << "\t--noeof : Disable checking that the eof block is present in gzipped files\n."; std::cout << "\t--interleaved : Validate consequtive reads have the same sequence identifier\n"; std::cout << "\t (only allowed difference is 1/2, but not required) and validate\n"; std::cout << "\t that otherwise reads have unique sequence identifiers.\n"; std::cout << "\t Cannot be used if '--disableSeqIDCheck' is specified.\n"; std::cout << "\t--params : Print the parameter settings.\n"; std::cout << "\t--quiet : Suppresses the display of errors and summary statistics.\n"; std::cout << "\t Does not affect the printing of Base Composition Statistics.\n"; std::cout << "\n Optional Space Options for Raw Sequence (Last one specified is used):\n"; std::cout << "\t--auto : Determine baseSpace/colorSpace from the Raw Sequence in the file (Default).\n"; std::cout << "\t--baseSpace : ACTGN only\n"; std::cout << "\t--colorSpace : 0123. only\n"; std::cout << std::endl; std::cout << " Usage:" << std::endl; std::cout << "\t./fastQValidator --file <fileName> [--minReadLen <minReadLen>] [--maxErrors <numErrors>] [--printableErrors <printableErrors>|--ignoreErrors] [--baseComposition] [--disableSeqIDCheck] [--interleaved] [--quiet] [--baseSpace|--colorSpace|--auto] [--params]\n\n"; std::cout << " Examples:" << std::endl; std::cout << "\t../fastQValidator --file testFile.txt\n"; std::cout << "\t../fastQValidator --file testFile.txt --minReadLen 10 --baseSpace --printableErrors 100\n"; std::cout << "\t./fastQValidator --file test/testFile.txt --minReadLen 10 --colorSpace --ignoreErrors\n"; std::cout << std::endl; return (-1); } FastQFile validator(minReadLength, printableErrors); if(quiet) { validator.disableMessages(); } if(disableSeqIDCheck) { validator.disableSeqIDCheck(); } if(interleaved) { validator.interleaved(); } if(interleaved && disableSeqIDCheck) { if(!quiet) { std::cout << "ERROR: --interleaved and --disableSeqIDCheck cannot both be specified.\n"; } return(-1); } validator.setMaxErrors(maxErrors); FastQStatus::Status status = validator.validateFastQFile(filename, baseComposition, myBaseType, avgQual); if(!quiet) { std::cout << "Returning: " << status << " : " << FastQStatus::getStatusString(status) << std::endl; } return(status); }