Exemple #1
0
int main(int argc, char ** argv)
{
    printf("glfMultiples -- SNP calls based on .glf or .glz files\n");
    printf("(c) 2008-2011 Goncalo Abecasis, Sebastian Zoellner, Yun Li\n\n");

    String pedfile;
    String positionfile;
    String callfile;
    String glfAliases;
    String glfPrefix;
    String glfSuffix;
    ParameterList pl;

    double posterior = 0.50;
    int    mapQuality = 0;
    int    minTotalDepth = 1;
    int    maxTotalDepth = INT_MAX;
    bool   verbose = false;
    bool   mapQualityStrict = false;
    bool   hardFilter = false;
    bool   smartFilter = false;
    bool   softFilter = true;
    bool   robustPrior = true;
    bool   uniformPrior = false;
    String xLabel("X"), yLabel("Y"), mitoLabel("MT");
    int    xStart = 2699520, xStop = 154931044;

    BEGIN_LONG_PARAMETERS(longParameters)
        LONG_PARAMETER_GROUP("Pedigree File")
        LONG_STRINGPARAMETER("ped", &pedfile)
        LONG_PARAMETER_GROUP("Map Quality Filter")
        LONG_INTPARAMETER("minMapQuality", &mapQuality)
        LONG_PARAMETER("strict", &mapQualityStrict)
        LONG_PARAMETER_GROUP("Total Depth Filter")
        LONG_INTPARAMETER("minDepth", &minTotalDepth)
        LONG_INTPARAMETER("maxDepth", &maxTotalDepth)
        LONG_PARAMETER_GROUP("Position Filter")
        LONG_STRINGPARAMETER("positionFile", &positionfile)
        LONG_PARAMETER_GROUP("Chromosome Labels")
        LONG_STRINGPARAMETER("xChr", &xLabel)
        LONG_STRINGPARAMETER("yChr", &yLabel)
        LONG_STRINGPARAMETER("mito", &mitoLabel)
        LONG_INTPARAMETER("xStart", &xStart)
        LONG_INTPARAMETER("xStop", &xStop)
        LONG_PARAMETER_GROUP("Filtering Options")
        EXCLUSIVE_PARAMETER("hardFilter", &hardFilter)
        EXCLUSIVE_PARAMETER("smartFilter", &smartFilter)
        EXCLUSIVE_PARAMETER("softFilter", &softFilter)
        LONG_PARAMETER_GROUP("Prior Options")
        EXCLUSIVE_PARAMETER("uniformPrior", &uniformPrior)
        EXCLUSIVE_PARAMETER("robustPrior", &robustPrior)
        LONG_PARAMETER_GROUP("Output")
        LONG_PARAMETER("verbose", &verbose)
        LONG_PARAMETER_GROUP("Sample Names")
        LONG_STRINGPARAMETER("glfAliases", &glfAliases)
        LONG_PARAMETER_GROUP("Prefixes and Suffixes")
        LONG_STRINGPARAMETER("glfPrefix",&glfPrefix)
        LONG_STRINGPARAMETER("glfSuffix",&glfSuffix)
        END_LONG_PARAMETERS();

    pl.Add(new StringParameter('b', "Base Call File", callfile));
    pl.Add(new DoubleParameter('p', "Posterior Threshold", posterior));
    pl.Add(new LongParameters("Additional Options", longParameters));
    int argstart = pl.ReadWithTrailer(argc, argv) + 1;
    pl.Status();

    if (posterior < 0.50)
        error("Posterior threshold for genotype calls (-p option) must be > 0.50.");

    time_t t;
    time(&t);

    printf("Analysis started on %s\n", ctime(&t));
    fflush(stdout);

    int n = argc - argstart;
    argv += argstart;

    Pedigree ped;
    if (!pedfile.IsEmpty())
    {
        ped.pd.AddStringColumn("glfFile");
        ped.Load(pedfile);

        n = ped.count;
    }
    else
        if (n == 0)
            error("No pedigree file present and no glf files listed at the end of command line\n");

    // Prior for finding difference from the reference at a particular site
    //BgzfFileType::setRequireEofBlock(false);

    double prior = 0.0;
    for (int i = 1; i <= 2 * n; i++)
        prior += 1.0 / i;
    prior *= 0.001;

    glfHandler * glf = new glfHandler[n];

    bool firstGlf = n;
    if (ped.count)
    {
        bool warn = false;

        for (int i = n - 1; i > 0; i++)
        {
          
            if (!glf[i].Open(ped[i].strings[0]))
            {
                printf("Failed to open genotype likelihood file [%s] for individual %s:%s\n",
                       (const char *) ped[i].strings[0],
                       (const char *) ped[i].famid,
                       (const char *) ped[i].pid);

                glf[i].OpenStub();
                firstGlf = i;
            }

            if (warn)
                printf("\n");

            if (firstGlf == n)
                error("No genotype likelihood files could be opened");
        }
    }
    else
    {
        for (int i = firstGlf = 0; i < n; i++)
        {
            String glfName = glfPrefix + String(argv[i]) + glfSuffix;
            if (!glf[i].Open(glfName))
                error("Failed to open genotype likelihood file [%s]\n", glfName.c_str());
        }
    }

    StringAlias aliases;
    aliases.ReadFromFile(glfAliases);

    printf("Calling genotypes for files ...\n");
    for (int i = 0; i < n; i++)
        printf("%s\n", ped.count ? (const char *) ped[i].strings[0] : argv[i]);
    printf("\n");

    baseCalls = fopen(callfile, "wt");

    if (baseCalls != NULL)
    {
        fprintf(baseCalls, "##fileformat=VCFv4.0\n");
        ReportDate(baseCalls);
        fprintf(baseCalls, "##source=glfMultiples\n");
        fprintf(baseCalls, "##minDepth=%d\n", minTotalDepth);
        fprintf(baseCalls, "##maxDepth=%d\n", maxTotalDepth);
        fprintf(baseCalls, "##minMapQuality=%d\n", mapQuality);
        fprintf(baseCalls, "##minPosterior=%.4f\n", posterior);
        fprintf(baseCalls, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n");
        fprintf(baseCalls, "##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root Mean Squared Mapping Quality\">\n");
        fprintf(baseCalls, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with coverage\">\n");
        fprintf(baseCalls, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles (with coverage)\">\n");
        fprintf(baseCalls, "##INFO=<ID=AC,Number=.,Type=Integer,Description=\"Alternative allele count (with coverage)\">\n");
        fprintf(baseCalls, "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Alternate allele frequency\">\n");
        fprintf(baseCalls, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Estimated allele balance between the alleles\">\n");
	if ( mapQuality > 0 ) {
	  fprintf(baseCalls, "##FILTER=<ID=mq%d,Description=\"Mapping Quality less than %d\">\n",mapQuality,mapQuality);
	}
	if ( minTotalDepth > 1 ) {
	  fprintf(baseCalls, "##FILTER=<ID=dp%d,Description=\"Total Read Depth less than %d\">\n",minTotalDepth,minTotalDepth);
	}
	if ( minTotalDepth < INT_MAX ) {
	  fprintf(baseCalls, "##FILTER=<ID=DP%d,Description=\"Total Read Depth greater than %d\">\n",maxTotalDepth,maxTotalDepth);
	}
        fprintf(baseCalls, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Most Likely Genotype\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Call Quality\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=PL,Number=3,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=PL3,Number=6,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1,0/2,1/2,2/2\">\n");
        fprintf(baseCalls, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
        for (int i = 0; i < n; i++)
            fprintf(baseCalls, "\t%s", ped.count ?
                    (const char *) (ped[i].famid + ":" + ped[i].pid) :
                    (const char *) aliases.GetAlias(argv[i]));

        fprintf(baseCalls, "\n");
    }

    StringArray buffer, tokens;
    StringHash  positions;

    buffer.Read(positionfile);

    for (int i = 0; i < buffer.Length(); i++)
    {
        tokens.ReplaceTokens(buffer[i], " \t:");

        if (tokens.Length() != 2) continue;

        positions.Add(tokens[0] + ":" + (int(tokens[1].AsInteger() - 1)));
    }

    int chromosomeType = 0;

    while (glf[firstGlf].NextSection())
    {
        for (int i = firstGlf + 1; i < n; i++)
        {
            if (glf[i].isStub) continue;

            glf[i].NextSection();

            if (glf[firstGlf].maxPosition != glf[i].maxPosition || glf[firstGlf].label != glf[i].label)
            {
                error("Genotype files '%s' and '%s' are not compatible ...\n"
                      "    File '%s' has section %s with %d entries ...\n"
                      "    File '%s' section %s with %d entries ...\n",
                      ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf],
                      ped.count ? (const char *) ped[i].strings[0] : argv[i],
                      ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf],
                      (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition,
                      ped.count ? (const char *) ped[i].strings[0] : argv[i],
                      (const char *) glf[i].label, glf[i].maxPosition);
            }
        }

        chromosomeType = CT_AUTOSOME;

        if (ped.count)
        {
            if (glf[firstGlf].label == xLabel) chromosomeType = CT_CHRX;
            if (glf[firstGlf].label == yLabel) chromosomeType = CT_CHRY;
            if (glf[firstGlf].label == mitoLabel) chromosomeType = CT_MITO;
        }

        printf("Processing section %s with %d entries\n",
               (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition);

        int refBase = 0;
        int position = 0;
        int mapQualityFilter = 0;
        int depthFilter = 0;
        int homozygousReference = 0;
        int transitions = 0;
        int transversions = 0;
        int otherPolymorphisms = 0;
        int sinkFilter = 0;
        int smartFilterHits = 0;
        int baseCounts[5] = {0, 0, 0, 0, 0};

        String filter;

        while (true)
        {
            if (position > 0)
            {
                // Check whether we have reached the end of the current chromosome
                bool done = true;
                for (int i = 0; i < n; i++)
                    if (glf[i].data.recordType != 0)
                        done = false;
                if (done) break;
            }

            // Advance to the next position where needed
            for (int i = 0; i < n; i++)
                if (glf[i].position == position)
                    glf[i].NextBaseEntry();

            // Figure out the current analysis position
            refBase = glf[0].data.refBase;
            position = glf[0].position;
            for (int i = 1; i < n; i++)
                if (position > glf[i].position)
                {
                    position = glf[i].position;
                    refBase = glf[i].data.refBase;
                }

            // Avoid alignments that extend past the end of the chromosome
            if (position >= glf[firstGlf].maxPosition)
                break;

            baseCounts[(int)refBase]++;

            // These lines can be uncommented for debugging purposes
            // for (int i = 0; i < n; i++)
            //   printf("GLF %d : position %d, refBase %d\n", i, position, refBase);
            // printf("Position: %d, refBase: %d\n", position, refBase);

            if (positions.Entries())
            {
                filter = glf[firstGlf].label + ":" + position;

                if (positions.Find(filter) < 0) continue;
            }

            if (refBase == 0) continue;

            // Corrected calculation of root-mean-square Map Quality score 
            // and check if we have at least one sample with good quality data

            int     currentDepth = 0, totalDepth = 0, numCovered = 0;
            double  currentQuality = 0.0, averageMapQuality = 0.0;
            bool    passMapQualityFilter = false;

            for (int i = 0; i < n; i++)
            {
                currentDepth = glf[i].GetDepth(position);
                if (currentDepth != 0)
                {
                    totalDepth += currentDepth;
                    numCovered++;     // not currently used -- will be "NS"
                    currentQuality = glf[i].GetMapQuality(position);
                    averageMapQuality += 
                        currentDepth * currentQuality * currentQuality;
                    if (currentQuality >= mapQuality)
                        passMapQualityFilter = true;
                }
            }
            averageMapQuality = sqrt(averageMapQuality / totalDepth);


            filter.Clear();

            if (!passMapQualityFilter)
            {
                if (filter.Length() == 0) mapQualityFilter++;
                if (hardFilter) continue;
                filter.catprintf("%smq%d", filter.Length() ? ";" : "", mapQuality);
            }

            if (totalDepth < minTotalDepth)
            {
                if (filter.Length() == 0) depthFilter++;
                if (hardFilter) continue;
                filter.catprintf("%sdp%d", filter.Length() ? ";" : "", minTotalDepth);
            }

            if (totalDepth > maxTotalDepth)
            {
                if (filter.Length() == 0) depthFilter++;
                if (hardFilter) continue;
                filter.catprintf("%sDP%d", filter.Length() ? ";" : "", maxTotalDepth);
            }

            // Create convenient aliases for each base
            unsigned char transition = (((refBase - 1) ^ 2) + 1);
            unsigned char transvers1 = (((refBase - 1) ^ 3) + 1);
            unsigned char transvers2 = (((refBase - 1) ^ 1) + 1);

            int homRef = glf[0].GenotypeIndex(refBase, refBase);

            // Calculate likelihood assuming every is homozygous for the reference
            double lRef = log(1.0 - prior);
            for (int i = 0; i < n; i++)
                lRef += log(glf[i].GetLikelihoods(position)[homRef]);

            // Calculate maximum likelihood for a variant
            if (smartFilter)
            {
                double anyVariant = log(prior) + FilteringLikelihood(glf, n, position, refBase);

                if (exp(lRef - anyVariant) > (1.0 - posterior)/posterior)
                {
                    smartFilterHits++;
                    continue;
                }
            }

	    //fprintf(stderr,"position = %d\n",position);

            double pTs = uniformPrior ? 1./3. : 2./3.;
            double pTv = uniformPrior ? 1./3. : 1./6.;

            // Calculate likelihoods for the most likelily SNP configurations
            double refTransition = log(prior * pTs) + PolymorphismLikelihood(glf, n, position, refBase, transition);
            double refTransvers1 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers1);
            double refTransvers2 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers2);

            // Calculate likelihoods for less likely SNP configurations
            double transitiontv1 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers1);
            double transitiontv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers2);
            double transvers1tv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transvers1, transvers2);

            // Calculate the likelihood for unusual configurations where everyone is heterozygous ...
            double sink = n > 10 ? log(prior * 1e-8) + SinkLikelihood(glf, n, position) : -1e100;

            double lmax = max(
                              max(max(lRef, refTransition),max(refTransvers1, refTransvers2)),
                              max(max(transitiontv1, transitiontv2), max(transvers1tv2, sink)));

            double sum = exp(lRef - lmax) + exp(refTransition -lmax) +
                exp(refTransvers1 - lmax) + exp(refTransvers2 - lmax) +
                exp(transitiontv1 - lmax) + exp(transitiontv2 - lmax) +
                exp(transvers1tv2 - lmax) + exp(sink - lmax);

            if (sum == 0.0) continue;

            if (exp(lRef - lmax)/sum > 1.0 - prior)
            {
                if (filter.Length() == 0) homozygousReference++;

                if (positions.Entries())
                    ReportSNP(glf, n, position, refBase, refBase, refBase, filter, totalDepth, averageMapQuality, lRef / sum);

                continue;
            }

            double quality = 1.0 - exp(lRef - lmax) / sum;

            if (verbose)
            {
                DumpDetails(glf, n, position, refBase);

                printf("%.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
                       lRef, refTransition, refTransvers1, refTransvers2,
                       transitiontv1, transitiontv2, transvers1tv2);
            }

            if (exp(refTransition - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transition,
                          filter, totalDepth, averageMapQuality, quality /* refTransition/sum */);
                if (filter.Length() == 0) transitions++;
            }
            else if (exp(refTransvers1 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transvers1,
                          filter, totalDepth, averageMapQuality, quality /* refTransvers1/sum */);
                if (filter.Length() == 0) transversions++;
            }
            else if (exp(refTransvers2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* refTransvers2/sum */);
                if (filter.Length() == 0) transversions++;
            }
            else if (exp(transitiontv1 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transition, transvers1,
                          filter, totalDepth, averageMapQuality, quality /* transitiontv1/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(transitiontv2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transition, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* transitiontv2/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(transvers1tv2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transvers1, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* transvers1tv2/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(sink - lmax)/sum > posterior)
                sinkFilter++;
        }

        int actualBases = glf[firstGlf].maxPosition - baseCounts[0];

        printf("          Missing bases = %9d (%.3f%%)\n",
               baseCounts[0], baseCounts[0] * 100. / glf[firstGlf].maxPosition);
        printf("        Reference bases = %9d (%.3f%%)\n",
               glf[firstGlf].maxPosition - baseCounts[0], (glf[firstGlf].maxPosition - baseCounts[0]) * 100. / glf[firstGlf].maxPosition);

        printf("              A/T bases = %9d (%.3f%%, %d A, %d T)\n",
               baseCounts[1] + baseCounts[4],
               (baseCounts[1] + baseCounts[4]) * 100. / actualBases,
               baseCounts[1], baseCounts[4]);

        printf("              G/C bases = %9d (%.3f%%, %d G, %d C)\n",
               baseCounts[3] + baseCounts[2],
               (baseCounts[3] + baseCounts[2]) * 100. / actualBases,
               baseCounts[3], baseCounts[2]);

        printf("           Depth Filter = %9d bases (%.3f%%)\n",
               depthFilter, depthFilter * 100. / actualBases);

        printf("     Map Quality Filter = %9d bases (%.3f%%)\n",
               mapQualityFilter, mapQualityFilter * 100. / actualBases);

        printf("        Non-Polymorphic = %9d bases (%.3f%%)\n",
               homozygousReference, homozygousReference * 100. / actualBases);

        printf("            Transitions = %9d bases (%.3f%%)\n",
               transitions, transitions * 100. / actualBases);

        printf("          Transversions = %9d bases (%.3f%%)\n",
               transversions, transversions * 100. / actualBases);

        printf("    Other Polymorphisms = %9d bases (%.3f%%)\n",
               otherPolymorphisms, otherPolymorphisms * 100. / actualBases);

        if (n > 10)
            printf("          Homology Sink = %9d bases (%.3f%%)\n",
                   sinkFilter, sinkFilter * 100. / actualBases);

        if (smartFilter)
            printf("           Smart Filter = %9d bases (%.3f%%)\n",
                   smartFilterHits, smartFilterHits * 100. / actualBases);

        int noCalls = actualBases - homozygousReference - transitions - transversions - otherPolymorphisms - sinkFilter;
        printf("                No call = %9d bases (%.3f%%)\n",
               noCalls, noCalls * 100. / actualBases);

        fflush(stdout);
    }

    if (baseCalls != NULL)
        fclose(baseCalls);

    time(&t);
    printf("\nAnalysis completed on %s\n", ctime(&t));
    fflush(stdout);
}
Exemple #2
0
int Convert::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String refFile = "";
    bool lshift = false;
    bool noeof = false;
    bool params = false;

    bool useBases = false;
    bool useEquals = false;
    bool useOrigSeq = false;

    bool recover = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("recover", &recover)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SequenceConversion")
            EXCLUSIVE_PARAMETER("useBases", &useBases)
            EXCLUSIVE_PARAMETER("useEquals", &useEquals)
            EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }
    
    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the ref file was specified.
    // Open the reference.
    GenomeSequence* refPtr = NULL;
    if(refFile != "")
    {
        refPtr = new GenomeSequence(refFile);
    }

    SamRecord::SequenceTranslation translation;
    if((useBases) && (refPtr != NULL))
    {
        translation = SamRecord::BASES;
    }
    else if((useEquals) && (refPtr != NULL))
    {
        translation = SamRecord::EQUAL;
    }
    else
    {
        useOrigSeq = true;
        translation = SamRecord::NONE;
    }
    
    if(params)
    {
        inputParameters.Status();
    }

    // Open the input file for reading.
    SamFile samIn;
    if(recover) samIn.setAttemptRecovery(true);
    samIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);
    samOut.SetWriteSequenceTranslation(translation);
    samOut.SetReference(refPtr);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    // Write the sam header.
    samOut.WriteHeader(samHeader);

    SamRecord samRecord;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    while(1) {
        try {
            // Keep reading records until ReadRecord returns false.
            while(samIn.ReadRecord(samHeader, samRecord))
            {
                // left shift if necessary.
                if(lshift)
                {
                    samRecord.shiftIndelsLeft();
                }

                // Successfully read a record from the file, so write it.
                if(!samOut.WriteRecord(samHeader, samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOut.GetStatusMessage());
                    returnStatus = samOut.GetStatus();
                }
            }
            break;
        } catch (std::runtime_error e) {
            std::cerr << "Caught runtime error: " << e.what() << "\n";
            if(!recover) {
                std::cerr << "Corrupted BAM file detected - consider using --recover option.\n";
                break;
            }
            std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n";
            // XXX need to resync SamFile stream here
            bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH);
            if(rc) {
                std::cerr << "Successful resync - some data lost.\n";
                continue;    // succeeded
            }
            std::cerr << "Failed to re-sync on data stream.\n";
            break;              // failed to resync
        }
    }

    std::cerr << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cerr << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    if(refPtr != NULL)
    {
        delete(refPtr);
    }

    // Since the reads were successful, return the status based
    // on the status of the writes.  If any failed, return
    // their failure status.
    return(returnStatus);
}
Exemple #3
0
// main function of verifyBamID
int execute(int argc, char** argv) {
  printf("verifyBamID %s -- verify identity and purity of sequence data\n"
	 "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION);

  VerifyBamIDArgs args;
  ParameterList pl;

  BEGIN_LONG_PARAMETERS(longParameters)
    LONG_PARAMETER_GROUP("Input Files")
    LONG_STRINGPARAMETER("vcf",&args.sVcfFile)
    LONG_STRINGPARAMETER("bam",&args.sBamFile)
    LONG_STRINGPARAMETER("subset",&args.sSubsetInds)
    LONG_STRINGPARAMETER("smID",&args.sSMID)

    LONG_PARAMETER_GROUP("VCF analysis options")
    LONG_DOUBLEPARAMETER("genoError",&args.genoError)
    LONG_DOUBLEPARAMETER("minAF",&args.minAF)
    LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate)

    LONG_PARAMETER_GROUP("Individuals to compare with chip data")
    EXCLUSIVE_PARAMETER("site",&args.bSiteOnly)
    EXCLUSIVE_PARAMETER("self",&args.bSelfOnly)
    EXCLUSIVE_PARAMETER("best",&args.bFindBest)

    LONG_PARAMETER_GROUP("Chip-free optimization options")
    EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone)
    EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly)
    EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly)
    EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull)

    LONG_PARAMETER_GROUP("With-chip optimization options")
    EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone)
    EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly)
    EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly)
    EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull)

    LONG_PARAMETER_GROUP("BAM analysis options")
    LONG_PARAMETER("ignoreRG",&args.bIgnoreRG)
    LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair)
    LONG_PARAMETER("noEOF",&args.bNoEOF)
    LONG_PARAMETER("precise",&args.bPrecise)
    LONG_INTPARAMETER("minMapQ",&args.minMapQ)
    LONG_INTPARAMETER("maxDepth",&args.maxDepth)
    LONG_INTPARAMETER("minQ",&args.minQ)
    LONG_INTPARAMETER("maxQ",&args.maxQ)
    LONG_DOUBLEPARAMETER("grid",&args.grid)

    LONG_PARAMETER_GROUP("Modeling Reference Bias")
    LONG_DOUBLEPARAMETER("refRef",&args.pRefRef)
    LONG_DOUBLEPARAMETER("refHet",&args.pRefHet)
    LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt)

    LONG_PARAMETER_GROUP("Output options")
    LONG_STRINGPARAMETER("out",&args.sOutFile)
    LONG_PARAMETER("verbose",&args.bVerbose)
    LONG_PHONEHOME(VERSION)
  END_LONG_PARAMETERS();

  pl.Add(new LongParameters("Available Options",longParameters));
  pl.Read(argc, argv);
  pl.Status();

  // check the validity of input files
  if ( args.sVcfFile.IsEmpty() ) {
    error("--vcf [vcf file] required");
  }

  if ( args.sBamFile.IsEmpty() ) {
    error("--bam [bam file] is required");
  }

  if ( args.sOutFile.IsEmpty() ) {
    error("--out [output prefix] is required");
  }
  Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose);

  if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) {
    warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF");
    args.bSelfOnly = true;
  }

  if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) {
    warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth);
  }

  if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) {
    error("--self must be set for --chip-refBias to work. Skipping..");
  }

  // check timestamp
  time_t t;
  time(&t);
  Logger::gLogger->writeLog("Analysis started on %s",ctime(&t));

  // load arguments
  VerifyBamID vbid(&args);

  // load input VCF and BAM files
  Logger::gLogger->writeLog("Opening Input Files");
  vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str());

  // Check which genotype-free method is used
  if ( args.bFreeNone ) {  // if no genotype-free mode is tested. skip it
    // do nothing for genotype-free estimation
    Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture");
  }
  else if ( args.bFreeMixOnly ) { // only mixture is estimated.
    // genotype-free method
    Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);

    // scan across multiple readgroups
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::mixLLK mix(&vbid);
      mix.OptimizeLLK(rg);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1);
      vbid.mixOut.llk0s[rg+1] = mix.llk0;
      vbid.mixOut.llk1s[rg+1] = mix.llk1;
      vbid.mixOut.fMixs[rg+1] = mix.fMix;
    }

    //vbid.mixRefHet = 0.5;
    //vbid.mixRefAlt = 0.00;
  }
  else if ( args.bFreeRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture");
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(2);
      startingPoint[0] = 0;      // pRefHet = 0.5
      startingPoint[1] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(2);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  else if ( args.bFreeFull ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together");
    for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) {
      VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(3);
      startingPoint[0] = -3.91;  // start with fMix = 0.01
      startingPoint[1] = 0;      // pRefHet = 0.5
      startingPoint[2] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(3);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double fMix = VerifyBamID::invLogit(myMinimizer.point[0]);
      if ( fMix > 0.5 ) 
	fMix = 1.-fMix;
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.fMixs[rg+1] = myFunc.fMix;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  Logger::gLogger->writeLog("calculating depth distribution");  
  vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut);

  Logger::gLogger->writeLog("finished calculating depth distribution");  

  std::vector<int> bestInds(vbid.nRGs+1,-1);
  std::vector<int> selfInds(vbid.nRGs+1,-1);

  if ( args.bChipNone ) {
    // do nothing
    Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture");
  }
  else if ( args.bChipMixOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);
    
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;
      VerifyBamID::ibdLLK ibd(&vbid);
      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	double fIBD = ibd.OptimizeLLK(i, rg);
	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  vbid.bestOut.llk0s[rg+1] = ibd.llk0;
	  vbid.bestOut.llk1s[rg+1] = ibd.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD;
	  maxIBD = ibd.fIBD;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = ibd.llk0;
	  vbid.selfOut.llk1s[rg+1] = ibd.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD;
	}
      }

      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }
  else if ( args.bChipRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture");
    if ( args.bSelfOnly ) {
      for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
	VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(2);
	startingPoint[0] = 0;      // pRefHet = 0.5
	startingPoint[1] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;
	myMinimizer.Reset(2);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	//vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

	vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	vbid.calculateDepthByGenotype(0,rg,vbid.selfOut);
      }
    }
    else {
      Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping..");
    }
  }
  else if ( args.bChipFull ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together");
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;

      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(3);
	startingPoint[0] = 3.91;  // start with fIBD = 0.99
	startingPoint[1] = 0;      // pRefHet = 0.5
	startingPoint[2] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;

	myFunc.indIdx = i;
	myMinimizer.Reset(3);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);

	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg);
	//Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  maxIBD = fIBD;
	  vbid.bestOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.bestOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.bestOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	  vbid.calculateDepthByGenotype(i, rg, vbid.selfOut);
	}
      }
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);
      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }

  // PRINT OUTPUT FILE - ".selfSM"
  // [SEQ_ID]  : SAMPLE ID in the sequence file
  // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available)
  // [#SNPS] : Number of markers evaluated
  // [#READS]   : Number of reads evaluated
  // [AVG_DP]   : Mean depth
  // [FREEMIX]  : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable
  // [FREELK1]  : Chip-free log-likelihood at estimated alpha
  // [FREELK0]  : Chip-free log-likelihood at 0% contamination
  // [CHIPIBD]  : With-chip estimated alpha (% MIX in 0-1 scale)
  // [CHIPLK1]  : With-chip log-likelihood at estimated alpha
  // [CHIPLK0]  : With-chip log-likelihood at 0% contamination
  // [DPREF]    : Depth at reference site in the chip
  // [RDPHET]   : Relative depth at HET site in the chip
  // [RDPALT]   : Relative depth at HOMALT site in the chip
  // [FREE_RF]  : Pr(Ref|Ref) site estimated without chip data
  // [FREE_RH]  : Pr(Ref|Het) site estimated without chip data
  // [FREE_RA]  : Pr(Ref|Alt) site estimated without chip data
  // [CHIP_RF]  : Pr(Ref|Ref) site estimated with chip data
  // [CHIP_RH]  : Pr(Ref|Het) site estimated with chip data
  // [CHIP_RA]  : Pr(Ref|Alt) site estimated with chip data
  // [DPREF]    : Depth at reference alleles
  // [RDPHET]   : Relative depth at heterozygous alleles
  // [RDPALT]   : Relative depth at hom-alt alleles

  String selfSMFN = args.sOutFile + ".selfSM";
  String bestSMFN = args.sOutFile + ".bestSM";
  String selfRGFN = args.sOutFile + ".selfRG";
  String bestRGFN = args.sOutFile + ".bestRG";
  String dpSMFN = args.sOutFile + ".depthSM";
  String dpRGFN = args.sOutFile + ".depthRG";

  IFILE selfSMF = ifopen(selfSMFN,"wb");
  IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL);
  IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb"));
  IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL;

  IFILE dpSMF = ifopen(dpSMFN,"wb");
  IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb"));
  if ( selfSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",selfSMF);
  }
  if ( args.bFindBest && ( bestSMF == NULL ) ) {
    Logger::gLogger->error("Cannot write to %s",bestSMF);
  }
  if ( dpSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",dpSMF);
  }

  ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
  int nCumMarkers = 0;
  for(int i=args.maxDepth; i >= 0; --i) {
    nCumMarkers += vbid.mixOut.depths[i];
    ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers);
  }
  ifclose(dpSMF);


  if ( dpRGF != NULL ) {
    ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
    for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      const char* rgID = vbid.pPile->vsRGIDs[rg].c_str();

      int nMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
      }

      nCumMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
	nCumMarkers += d;
	ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers);
      }
    }
    ifclose(dpRGF);
  }

  const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"};
  int nheaders = sizeof(headers)/sizeof(headers[0]);

  for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
  ifprintf(selfSMF,"\n");
  ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
  ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA");
  ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
  if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
  else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else { error("Invalid option in handling bFree"); }

  if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else { error("Invalid option in handling bChip"); }
  ifprintf(selfSMF,"\n");
  ifclose(selfSMF);

  if ( bestSMF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestSMF,"\n");
    ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
    ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA");
    ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
    if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
    else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else { error("Invalid option in handling bFree"); }
    
    if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else { error("Invalid option in handling bChip"); }
    ifprintf(bestSMF,"\n");
    ifclose(bestSMF);
  }

  if ( selfRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(selfRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(selfRGF,"\n");
    }
    ifclose(selfRGF);
  }

  if ( bestRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(bestRGF,"\n");
    }
    ifclose(bestRGF);
  }
  
  time(&t);
  Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t));

  return 0;
}
Exemple #4
0
int Validate::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    int maxErrors = -1;
    int printableErrors = 100;
    bool so_flag = false;
    bool so_coord = false;
    bool so_query = false;
    bool noeof = false;
    bool disableStatistics = false;
    bool verbose = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER("noeof", &noeof)
        LONG_INTPARAMETER("maxErrors", &maxErrors)
        LONG_PARAMETER("verbose", &verbose)
        LONG_INTPARAMETER("printableErrors", &printableErrors)
        LONG_PARAMETER("disableStatistics", &disableStatistics)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SortOrder")
        EXCLUSIVE_PARAMETER("so_flag", &so_flag)
        EXCLUSIVE_PARAMETER("so_coord", &so_coord)
        EXCLUSIVE_PARAMETER("so_query", &so_query)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Determine the sort type for validation based on the parameters.
    SamFile::SortedType sortType = SamFile::UNSORTED;
    if(so_flag)
    {
        sortType = SamFile::FLAG;
    } 
    else if(so_coord)
    {
        sortType = SamFile::COORDINATE;
    }
    else if(so_query)
    {
        sortType = SamFile::QUERY_NAME;
    }
   
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for validate, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Since we want to accumulate multiple errors, use RETURN rather
    // than throwing exceptions.
    SamFile samIn(ErrorHandler::RETURN);
    // Open the file for reading.   
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Set the sorting validation type.
    samIn.setSortedValidation(sortType);

    // Set that statistics should be generated.
    samIn.GenerateStatistics(!disableStatistics);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Read the sam records.
    SamRecord samRecord(ErrorHandler::RETURN);

    // Track the status.
    SamStatus::Status status = SamStatus::SUCCESS;

    // Keep reading records until the end of the file is reached.
    int numValidRecords = 0;
    int numInvalidRecords = 0;
    int numErrorRecords = 0;
    int numRecords = 0;
    int numReportedErrors = 0;
    int totalErrorRecords = 0;

    std::map<SamStatus::Status, uint64_t> errorStats;
    std::map<SamValidationError::Type, uint64_t> invalidStats;

    SamValidationErrors invalidSamErrors;

    // Keep reading records from the file until SamFile::ReadRecord
    // indicates to stop (returns false).
    while( ( (maxErrors < 0) || (totalErrorRecords < maxErrors) ) &&
           ( (samIn.ReadRecord(samHeader, samRecord)) || (SamStatus::isContinuableStatus(samIn.GetStatus())) ) )
    {
        ++numRecords;
        if(samIn.GetStatus() == SamStatus::SUCCESS)
        {
            // Successfully set the record, so check to see if it is valid.
            // Clear any errors in the list.
            invalidSamErrors.clear();
            if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors))
            {
                // The record is not valid.
                ++numInvalidRecords;
                ++totalErrorRecords;
                if(verbose && (numReportedErrors < printableErrors))
                {
                    std::cerr << "Record " << numRecords << std::endl
                              << invalidSamErrors << std::endl;
                    ++numReportedErrors;
                }
                // Update the statistics for all validation errors found in this record.
                invalidSamErrors.resetErrorIter();
                const SamValidationError* errorPtr = invalidSamErrors.getNextError();
                while(errorPtr != NULL)
                {
                    ++invalidStats[errorPtr->getType()];
                    errorPtr = invalidSamErrors.getNextError();
                }

                // If the status is not yet set, set it.
                if(status == SamStatus::SUCCESS)
                {
                    status = SamStatus::INVALID;
                }
            }
            else
            {
                // Valid record, so increment the counter.
                ++numValidRecords;
            }
        }
        else
        {
            // Error reading the record.
            ++numErrorRecords;
            ++totalErrorRecords;
            if(verbose && (numReportedErrors < printableErrors))
            {
                // report error.
                std::cerr << "Record " << numRecords << std::endl
                          << samIn.GetStatusMessage() << std::endl
                          << std::endl;
                ++numReportedErrors;
            }
            // Increment the statistics
            ++errorStats[samIn.GetStatus()];

            // If the status is not yet set, set it.
            if(status == SamStatus::SUCCESS)
            {
                status = samIn.GetStatus();
            }
        }
    }

    if( (samIn.GetStatus() != SamStatus::NO_MORE_RECS) &&
        (totalErrorRecords < maxErrors) )
    {
        // The last read call had a failure, so report it.
        // If the number of errors is >= ,maxErrors we don't
        // want to print any more failures.
        ++numErrorRecords;
        ++totalErrorRecords;
        if(numReportedErrors < printableErrors)
        {
            std::cerr << "Record " << numRecords << ": ";
            std::cerr << std::endl << samIn.GetStatusMessage() << std::endl;
        }

        // Increment the statistics
        ++errorStats[samIn.GetStatus()];

        if(status == SamStatus::SUCCESS)
        {
            status = samIn.GetStatus();
        }
    }

    if(totalErrorRecords == maxErrors)
    {
        if(maxErrors == 0)
        {
            std::cerr << "WARNING file was not read at all due to maxErrors setting, but returning Success.\n";
        }
        else
        {
            // Print a note that the entire file was not read.
            std::cerr << "File was not completely read due to the number of errors.\n";
            std::cerr << "Statistics only reflect the part of the file that was read.\n";
        }
    }

    fprintf(stderr, "\nNumber of records read = %d\n", numRecords);
    fprintf(stderr, "Number of valid records = %d\n", numValidRecords);

    std::cerr << std::endl;
    if(numRecords != numValidRecords)
    {
        std::cerr << "Error Counts:\n";

        // Loop through the non-validation errors.
        std::map<SamStatus::Status, uint64_t>::iterator statusIter;
        for(statusIter = errorStats.begin(); statusIter != errorStats.end(); statusIter++)
        {
            std::cerr << "\t" << SamStatus::getStatusString(statusIter->first) << ": "
                      << statusIter->second << std::endl;
        }

        std::map<SamValidationError::Type, uint64_t>::iterator invalidIter;
        for(invalidIter = invalidStats.begin(); invalidIter != invalidStats.end(); invalidIter++)
        {
            std::cerr << "\t" << SamValidationError::getTypeString(invalidIter->first) << ": "
                      << invalidIter->second << std::endl;
        }

        std::cerr << std::endl;
    }
    samIn.PrintStatistics();

    fprintf(stderr, "Returning: %d (%s)\n", status, SamStatus::getStatusString(status));
    return(status);
}
int main(int argc, char ** argv)
{   
   ParameterList inputParameters;
   String filename;
   int minReadLength = 10;
   int printableErrors = 20;
   int maxErrors = -1;
   String testParam;
   BaseAsciiMap::SPACE_TYPE myBaseType = BaseAsciiMap::UNKNOWN;
   
   // Read the parameters from the command line.
   bool baseSpace = false;
   bool colorSpace = false;
   bool autoDetect = false;
   bool ignoreErrors = false;
   bool baseComposition = false;
   bool avgQual = false;
   bool quiet = false;
   bool noeof = false;
   bool params = false;
   bool disableSeqIDCheck = false;
   bool interleaved = false;

   BEGIN_LONG_PARAMETERS(longParameterList)
      LONG_STRINGPARAMETER("file", &filename)
      LONG_PARAMETER("baseComposition", &baseComposition)
      LONG_PARAMETER("avgQual", &avgQual)
      LONG_PARAMETER("disableSeqIDCheck", &disableSeqIDCheck)
      LONG_PARAMETER("interleaved", &interleaved)
      LONG_PARAMETER("noeof", &noeof)
      LONG_PARAMETER("quiet", &quiet)
      LONG_PARAMETER("params", &params)
      LONG_INTPARAMETER("minReadLen", &minReadLength)
      LONG_INTPARAMETER("maxErrors", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("auto", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("printableErrors", &printableErrors)
   BEGIN_LEGACY_PARAMETERS()
      LONG_PARAMETER("printBaseComp", &baseComposition)       
      LONG_PARAMETER("disableAllMessages", &quiet)
      LONG_INTPARAMETER("quitAfterErrorNum", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("autoDetect", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreAllErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("maxReportedErrors", &printableErrors)
   END_LONG_PARAMETERS();
   
   inputParameters.Add(new LongParameters ("Input Parameters", longParameterList));

   inputParameters.Read(argc, argv);

   if(ignoreErrors)
   {
      // Ignore all errors, so set printableErrors to 0.
      printableErrors = 0;
   }

   // Set the base type based on the passed in parameters.
   if(baseSpace)
   {
      // Base Space
      myBaseType = BaseAsciiMap::BASE_SPACE;
   }
   else if(colorSpace)
   {
      myBaseType = BaseAsciiMap::COLOR_SPACE;
   }
   else
   {
      myBaseType = BaseAsciiMap::UNKNOWN;
      // Set autoDetect
      autoDetect = true;
   }

   // If no eof block is required for a bgzf file, set the bgzf file type to 
   // not look for it.
   if(noeof)
   {
       // Set that the eof block is not required.
       BgzfFileType::setRequireEofBlock(false);
   }

   // DO not print status if set to quiet.
   if((!quiet) && params)
   {
      inputParameters.Status();
   }

   if(filename == "")
   {
      if(quiet)
      {
         return(-1);
      }
      // No filename was specified so print a usage description.
      std::cout << "ERROR: No filename specified.  See below for usage help.";
      std::cout << std::endl << std::endl;

      std::cout << "  Required Parameters:" << std::endl;
      std::cout << "\t--file  :  FastQ filename with path to be prorcessed.\n";
      std::cout << std::endl;

      std::cout << "  Optional Parameters:" << std::endl;
      std::cout << "\t--minReadLen         : Minimum allowed read length (Defaults to 10).\n";
      std::cout << "\t--maxErrors          : Number of errors to allow before quitting\n";
      std::cout << "\t                       reading/validating the file.\n";
      std::cout << "\t                       -1 (default) indicates to not quit until\n";
      std::cout << "\t                       the entire file is read.\n";
      std::cout << "\t                       0 indicates not to read/validate anything\n";
      std::cout << "\t--printableErrors    : Maximum number of errors to print before\n";
      std::cout << "\t                       suppressing them (Defaults to 20).\n";
      std::cout << "\t                       Different than maxErrors since \n";
      std::cout << "\t                       printableErrors will continue reading and\n";
      std::cout << "\t                       validating the file until the end, but\n";
      std::cout << "\t                       just doesn't print the errors.\n";
      std::cout << "\t--ignoreErrors       : Ignore all errors (same as printableErrors = 0)\n";
      std::cout << "\t                       overwrites the printableErrors option.\n";
      std::cout << "\t--baseComposition    : Print the Base Composition Statistics.\n";
      std::cout << "\t--avgQual            : Print the average phred quality per cycle & overall average quality.\n";
      std::cout << "\t--disableSeqIDCheck  : Disable the unique sequence identifier check.\n";
      std::cout << "\t                       Use this option to save memory since the sequence id\n";
      std::cout << "\t                       check uses a lot of memory.\n";
      std::cout << "\t--noeof              : Disable checking that the eof block is present in gzipped files\n.";
      std::cout << "\t--interleaved        : Validate consequtive reads have the same sequence identifier\n";
      std::cout << "\t                       (only allowed difference is 1/2, but not required) and validate\n";
      std::cout << "\t                       that otherwise reads have unique sequence identifiers.\n";
      std::cout << "\t                       Cannot be used if '--disableSeqIDCheck' is specified.\n";
      std::cout << "\t--params             : Print the parameter settings.\n";
      std::cout << "\t--quiet              : Suppresses the display of errors and summary statistics.\n";
      std::cout << "\t                       Does not affect the printing of Base Composition Statistics.\n";

      std::cout << "\n  Optional Space Options for Raw Sequence (Last one specified is used):\n";
      std::cout << "\t--auto       : Determine baseSpace/colorSpace from the Raw Sequence in the file (Default).\n";
      std::cout << "\t--baseSpace  : ACTGN only\n";
      std::cout << "\t--colorSpace : 0123. only\n";
      std::cout << std::endl;

      std::cout << "  Usage:" << std::endl;
      std::cout << "\t./fastQValidator --file <fileName> [--minReadLen <minReadLen>] [--maxErrors <numErrors>] [--printableErrors <printableErrors>|--ignoreErrors] [--baseComposition] [--disableSeqIDCheck] [--interleaved] [--quiet] [--baseSpace|--colorSpace|--auto] [--params]\n\n";
      std::cout << "  Examples:" << std::endl;
      std::cout << "\t../fastQValidator --file testFile.txt\n";
      std::cout << "\t../fastQValidator --file testFile.txt --minReadLen 10 --baseSpace --printableErrors 100\n";
      std::cout << "\t./fastQValidator --file test/testFile.txt --minReadLen 10 --colorSpace --ignoreErrors\n";
      std::cout << std::endl;
      return (-1);
   }
   
   FastQFile validator(minReadLength, printableErrors);
   
   if(quiet)
   {
      validator.disableMessages();
   }

   if(disableSeqIDCheck)
   {
       validator.disableSeqIDCheck();
   }

   if(interleaved)
   {
       validator.interleaved();
   }

   if(interleaved && disableSeqIDCheck)
   {
       if(!quiet)
       {
           std::cout << "ERROR: --interleaved and --disableSeqIDCheck cannot both be specified.\n";
       }
       return(-1);
   }

   validator.setMaxErrors(maxErrors);

   FastQStatus::Status status = validator.validateFastQFile(filename, baseComposition, myBaseType, avgQual);

   if(!quiet)
   {
      std::cout << "Returning: " << status << " : " << FastQStatus::getStatusString(status)
                << std::endl;
   }

   return(status);
}