Beispiel #1
0
// Dump the specified Bam Index file.
int DumpIndex::execute(int argc, char **argv)
{
    // Extract command line arguments.
    static const int UNSPECIFIED_INT = -1;
    String indexFile = "";
    int refID = UNSPECIFIED_INT;
    bool summary = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_INTPARAMETER("refID", &refID)
        LONG_PARAMETER("summary", &summary)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Check to see if the index file was specified, if not, report an error.
    if(indexFile == "")
    {
        usage();
        inputParameters.Status();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --bamIndex" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Read the index.
    BamIndex bamIndex;
    SamStatus status;
    status = bamIndex.readIndex(indexFile);

    if(status != SamStatus::SUCCESS)
    {
        // Failed to read the index, return.
        fprintf(stderr, "%s\n", status.getStatusMessage());
        return(status.getStatus());
    }

    // Print the index file.
    bamIndex.printIndex(refID, summary);

    return(status.getStatus());
}
Beispiel #2
0
int main(int argc, char ** argv)
{
    printf("glfMultiples -- SNP calls based on .glf or .glz files\n");
    printf("(c) 2008-2011 Goncalo Abecasis, Sebastian Zoellner, Yun Li\n\n");

    String pedfile;
    String positionfile;
    String callfile;
    String glfAliases;
    String glfPrefix;
    String glfSuffix;
    ParameterList pl;

    double posterior = 0.50;
    int    mapQuality = 0;
    int    minTotalDepth = 1;
    int    maxTotalDepth = INT_MAX;
    bool   verbose = false;
    bool   mapQualityStrict = false;
    bool   hardFilter = false;
    bool   smartFilter = false;
    bool   softFilter = true;
    bool   robustPrior = true;
    bool   uniformPrior = false;
    String xLabel("X"), yLabel("Y"), mitoLabel("MT");
    int    xStart = 2699520, xStop = 154931044;

    BEGIN_LONG_PARAMETERS(longParameters)
        LONG_PARAMETER_GROUP("Pedigree File")
        LONG_STRINGPARAMETER("ped", &pedfile)
        LONG_PARAMETER_GROUP("Map Quality Filter")
        LONG_INTPARAMETER("minMapQuality", &mapQuality)
        LONG_PARAMETER("strict", &mapQualityStrict)
        LONG_PARAMETER_GROUP("Total Depth Filter")
        LONG_INTPARAMETER("minDepth", &minTotalDepth)
        LONG_INTPARAMETER("maxDepth", &maxTotalDepth)
        LONG_PARAMETER_GROUP("Position Filter")
        LONG_STRINGPARAMETER("positionFile", &positionfile)
        LONG_PARAMETER_GROUP("Chromosome Labels")
        LONG_STRINGPARAMETER("xChr", &xLabel)
        LONG_STRINGPARAMETER("yChr", &yLabel)
        LONG_STRINGPARAMETER("mito", &mitoLabel)
        LONG_INTPARAMETER("xStart", &xStart)
        LONG_INTPARAMETER("xStop", &xStop)
        LONG_PARAMETER_GROUP("Filtering Options")
        EXCLUSIVE_PARAMETER("hardFilter", &hardFilter)
        EXCLUSIVE_PARAMETER("smartFilter", &smartFilter)
        EXCLUSIVE_PARAMETER("softFilter", &softFilter)
        LONG_PARAMETER_GROUP("Prior Options")
        EXCLUSIVE_PARAMETER("uniformPrior", &uniformPrior)
        EXCLUSIVE_PARAMETER("robustPrior", &robustPrior)
        LONG_PARAMETER_GROUP("Output")
        LONG_PARAMETER("verbose", &verbose)
        LONG_PARAMETER_GROUP("Sample Names")
        LONG_STRINGPARAMETER("glfAliases", &glfAliases)
        LONG_PARAMETER_GROUP("Prefixes and Suffixes")
        LONG_STRINGPARAMETER("glfPrefix",&glfPrefix)
        LONG_STRINGPARAMETER("glfSuffix",&glfSuffix)
        END_LONG_PARAMETERS();

    pl.Add(new StringParameter('b', "Base Call File", callfile));
    pl.Add(new DoubleParameter('p', "Posterior Threshold", posterior));
    pl.Add(new LongParameters("Additional Options", longParameters));
    int argstart = pl.ReadWithTrailer(argc, argv) + 1;
    pl.Status();

    if (posterior < 0.50)
        error("Posterior threshold for genotype calls (-p option) must be > 0.50.");

    time_t t;
    time(&t);

    printf("Analysis started on %s\n", ctime(&t));
    fflush(stdout);

    int n = argc - argstart;
    argv += argstart;

    Pedigree ped;
    if (!pedfile.IsEmpty())
    {
        ped.pd.AddStringColumn("glfFile");
        ped.Load(pedfile);

        n = ped.count;
    }
    else
        if (n == 0)
            error("No pedigree file present and no glf files listed at the end of command line\n");

    // Prior for finding difference from the reference at a particular site
    //BgzfFileType::setRequireEofBlock(false);

    double prior = 0.0;
    for (int i = 1; i <= 2 * n; i++)
        prior += 1.0 / i;
    prior *= 0.001;

    glfHandler * glf = new glfHandler[n];

    bool firstGlf = n;
    if (ped.count)
    {
        bool warn = false;

        for (int i = n - 1; i > 0; i++)
        {
          
            if (!glf[i].Open(ped[i].strings[0]))
            {
                printf("Failed to open genotype likelihood file [%s] for individual %s:%s\n",
                       (const char *) ped[i].strings[0],
                       (const char *) ped[i].famid,
                       (const char *) ped[i].pid);

                glf[i].OpenStub();
                firstGlf = i;
            }

            if (warn)
                printf("\n");

            if (firstGlf == n)
                error("No genotype likelihood files could be opened");
        }
    }
    else
    {
        for (int i = firstGlf = 0; i < n; i++)
        {
            String glfName = glfPrefix + String(argv[i]) + glfSuffix;
            if (!glf[i].Open(glfName))
                error("Failed to open genotype likelihood file [%s]\n", glfName.c_str());
        }
    }

    StringAlias aliases;
    aliases.ReadFromFile(glfAliases);

    printf("Calling genotypes for files ...\n");
    for (int i = 0; i < n; i++)
        printf("%s\n", ped.count ? (const char *) ped[i].strings[0] : argv[i]);
    printf("\n");

    baseCalls = fopen(callfile, "wt");

    if (baseCalls != NULL)
    {
        fprintf(baseCalls, "##fileformat=VCFv4.0\n");
        ReportDate(baseCalls);
        fprintf(baseCalls, "##source=glfMultiples\n");
        fprintf(baseCalls, "##minDepth=%d\n", minTotalDepth);
        fprintf(baseCalls, "##maxDepth=%d\n", maxTotalDepth);
        fprintf(baseCalls, "##minMapQuality=%d\n", mapQuality);
        fprintf(baseCalls, "##minPosterior=%.4f\n", posterior);
        fprintf(baseCalls, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n");
        fprintf(baseCalls, "##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root Mean Squared Mapping Quality\">\n");
        fprintf(baseCalls, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with coverage\">\n");
        fprintf(baseCalls, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles (with coverage)\">\n");
        fprintf(baseCalls, "##INFO=<ID=AC,Number=.,Type=Integer,Description=\"Alternative allele count (with coverage)\">\n");
        fprintf(baseCalls, "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Alternate allele frequency\">\n");
        fprintf(baseCalls, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Estimated allele balance between the alleles\">\n");
	if ( mapQuality > 0 ) {
	  fprintf(baseCalls, "##FILTER=<ID=mq%d,Description=\"Mapping Quality less than %d\">\n",mapQuality,mapQuality);
	}
	if ( minTotalDepth > 1 ) {
	  fprintf(baseCalls, "##FILTER=<ID=dp%d,Description=\"Total Read Depth less than %d\">\n",minTotalDepth,minTotalDepth);
	}
	if ( minTotalDepth < INT_MAX ) {
	  fprintf(baseCalls, "##FILTER=<ID=DP%d,Description=\"Total Read Depth greater than %d\">\n",maxTotalDepth,maxTotalDepth);
	}
        fprintf(baseCalls, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Most Likely Genotype\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Call Quality\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=PL,Number=3,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1\">\n");
        fprintf(baseCalls, "##FORMAT=<ID=PL3,Number=6,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1,0/2,1/2,2/2\">\n");
        fprintf(baseCalls, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
        for (int i = 0; i < n; i++)
            fprintf(baseCalls, "\t%s", ped.count ?
                    (const char *) (ped[i].famid + ":" + ped[i].pid) :
                    (const char *) aliases.GetAlias(argv[i]));

        fprintf(baseCalls, "\n");
    }

    StringArray buffer, tokens;
    StringHash  positions;

    buffer.Read(positionfile);

    for (int i = 0; i < buffer.Length(); i++)
    {
        tokens.ReplaceTokens(buffer[i], " \t:");

        if (tokens.Length() != 2) continue;

        positions.Add(tokens[0] + ":" + (int(tokens[1].AsInteger() - 1)));
    }

    int chromosomeType = 0;

    while (glf[firstGlf].NextSection())
    {
        for (int i = firstGlf + 1; i < n; i++)
        {
            if (glf[i].isStub) continue;

            glf[i].NextSection();

            if (glf[firstGlf].maxPosition != glf[i].maxPosition || glf[firstGlf].label != glf[i].label)
            {
                error("Genotype files '%s' and '%s' are not compatible ...\n"
                      "    File '%s' has section %s with %d entries ...\n"
                      "    File '%s' section %s with %d entries ...\n",
                      ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf],
                      ped.count ? (const char *) ped[i].strings[0] : argv[i],
                      ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf],
                      (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition,
                      ped.count ? (const char *) ped[i].strings[0] : argv[i],
                      (const char *) glf[i].label, glf[i].maxPosition);
            }
        }

        chromosomeType = CT_AUTOSOME;

        if (ped.count)
        {
            if (glf[firstGlf].label == xLabel) chromosomeType = CT_CHRX;
            if (glf[firstGlf].label == yLabel) chromosomeType = CT_CHRY;
            if (glf[firstGlf].label == mitoLabel) chromosomeType = CT_MITO;
        }

        printf("Processing section %s with %d entries\n",
               (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition);

        int refBase = 0;
        int position = 0;
        int mapQualityFilter = 0;
        int depthFilter = 0;
        int homozygousReference = 0;
        int transitions = 0;
        int transversions = 0;
        int otherPolymorphisms = 0;
        int sinkFilter = 0;
        int smartFilterHits = 0;
        int baseCounts[5] = {0, 0, 0, 0, 0};

        String filter;

        while (true)
        {
            if (position > 0)
            {
                // Check whether we have reached the end of the current chromosome
                bool done = true;
                for (int i = 0; i < n; i++)
                    if (glf[i].data.recordType != 0)
                        done = false;
                if (done) break;
            }

            // Advance to the next position where needed
            for (int i = 0; i < n; i++)
                if (glf[i].position == position)
                    glf[i].NextBaseEntry();

            // Figure out the current analysis position
            refBase = glf[0].data.refBase;
            position = glf[0].position;
            for (int i = 1; i < n; i++)
                if (position > glf[i].position)
                {
                    position = glf[i].position;
                    refBase = glf[i].data.refBase;
                }

            // Avoid alignments that extend past the end of the chromosome
            if (position >= glf[firstGlf].maxPosition)
                break;

            baseCounts[(int)refBase]++;

            // These lines can be uncommented for debugging purposes
            // for (int i = 0; i < n; i++)
            //   printf("GLF %d : position %d, refBase %d\n", i, position, refBase);
            // printf("Position: %d, refBase: %d\n", position, refBase);

            if (positions.Entries())
            {
                filter = glf[firstGlf].label + ":" + position;

                if (positions.Find(filter) < 0) continue;
            }

            if (refBase == 0) continue;

            // Corrected calculation of root-mean-square Map Quality score 
            // and check if we have at least one sample with good quality data

            int     currentDepth = 0, totalDepth = 0, numCovered = 0;
            double  currentQuality = 0.0, averageMapQuality = 0.0;
            bool    passMapQualityFilter = false;

            for (int i = 0; i < n; i++)
            {
                currentDepth = glf[i].GetDepth(position);
                if (currentDepth != 0)
                {
                    totalDepth += currentDepth;
                    numCovered++;     // not currently used -- will be "NS"
                    currentQuality = glf[i].GetMapQuality(position);
                    averageMapQuality += 
                        currentDepth * currentQuality * currentQuality;
                    if (currentQuality >= mapQuality)
                        passMapQualityFilter = true;
                }
            }
            averageMapQuality = sqrt(averageMapQuality / totalDepth);


            filter.Clear();

            if (!passMapQualityFilter)
            {
                if (filter.Length() == 0) mapQualityFilter++;
                if (hardFilter) continue;
                filter.catprintf("%smq%d", filter.Length() ? ";" : "", mapQuality);
            }

            if (totalDepth < minTotalDepth)
            {
                if (filter.Length() == 0) depthFilter++;
                if (hardFilter) continue;
                filter.catprintf("%sdp%d", filter.Length() ? ";" : "", minTotalDepth);
            }

            if (totalDepth > maxTotalDepth)
            {
                if (filter.Length() == 0) depthFilter++;
                if (hardFilter) continue;
                filter.catprintf("%sDP%d", filter.Length() ? ";" : "", maxTotalDepth);
            }

            // Create convenient aliases for each base
            unsigned char transition = (((refBase - 1) ^ 2) + 1);
            unsigned char transvers1 = (((refBase - 1) ^ 3) + 1);
            unsigned char transvers2 = (((refBase - 1) ^ 1) + 1);

            int homRef = glf[0].GenotypeIndex(refBase, refBase);

            // Calculate likelihood assuming every is homozygous for the reference
            double lRef = log(1.0 - prior);
            for (int i = 0; i < n; i++)
                lRef += log(glf[i].GetLikelihoods(position)[homRef]);

            // Calculate maximum likelihood for a variant
            if (smartFilter)
            {
                double anyVariant = log(prior) + FilteringLikelihood(glf, n, position, refBase);

                if (exp(lRef - anyVariant) > (1.0 - posterior)/posterior)
                {
                    smartFilterHits++;
                    continue;
                }
            }

	    //fprintf(stderr,"position = %d\n",position);

            double pTs = uniformPrior ? 1./3. : 2./3.;
            double pTv = uniformPrior ? 1./3. : 1./6.;

            // Calculate likelihoods for the most likelily SNP configurations
            double refTransition = log(prior * pTs) + PolymorphismLikelihood(glf, n, position, refBase, transition);
            double refTransvers1 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers1);
            double refTransvers2 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers2);

            // Calculate likelihoods for less likely SNP configurations
            double transitiontv1 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers1);
            double transitiontv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers2);
            double transvers1tv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transvers1, transvers2);

            // Calculate the likelihood for unusual configurations where everyone is heterozygous ...
            double sink = n > 10 ? log(prior * 1e-8) + SinkLikelihood(glf, n, position) : -1e100;

            double lmax = max(
                              max(max(lRef, refTransition),max(refTransvers1, refTransvers2)),
                              max(max(transitiontv1, transitiontv2), max(transvers1tv2, sink)));

            double sum = exp(lRef - lmax) + exp(refTransition -lmax) +
                exp(refTransvers1 - lmax) + exp(refTransvers2 - lmax) +
                exp(transitiontv1 - lmax) + exp(transitiontv2 - lmax) +
                exp(transvers1tv2 - lmax) + exp(sink - lmax);

            if (sum == 0.0) continue;

            if (exp(lRef - lmax)/sum > 1.0 - prior)
            {
                if (filter.Length() == 0) homozygousReference++;

                if (positions.Entries())
                    ReportSNP(glf, n, position, refBase, refBase, refBase, filter, totalDepth, averageMapQuality, lRef / sum);

                continue;
            }

            double quality = 1.0 - exp(lRef - lmax) / sum;

            if (verbose)
            {
                DumpDetails(glf, n, position, refBase);

                printf("%.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
                       lRef, refTransition, refTransvers1, refTransvers2,
                       transitiontv1, transitiontv2, transvers1tv2);
            }

            if (exp(refTransition - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transition,
                          filter, totalDepth, averageMapQuality, quality /* refTransition/sum */);
                if (filter.Length() == 0) transitions++;
            }
            else if (exp(refTransvers1 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transvers1,
                          filter, totalDepth, averageMapQuality, quality /* refTransvers1/sum */);
                if (filter.Length() == 0) transversions++;
            }
            else if (exp(refTransvers2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, refBase, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* refTransvers2/sum */);
                if (filter.Length() == 0) transversions++;
            }
            else if (exp(transitiontv1 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transition, transvers1,
                          filter, totalDepth, averageMapQuality, quality /* transitiontv1/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(transitiontv2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transition, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* transitiontv2/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(transvers1tv2 - lmax)/sum > posterior)
            {
                ReportSNP(glf, n, position, refBase, transvers1, transvers2,
                          filter, totalDepth, averageMapQuality, quality /* transvers1tv2/sum */);
                if (filter.Length() == 0) otherPolymorphisms++;
            }
            else if (exp(sink - lmax)/sum > posterior)
                sinkFilter++;
        }

        int actualBases = glf[firstGlf].maxPosition - baseCounts[0];

        printf("          Missing bases = %9d (%.3f%%)\n",
               baseCounts[0], baseCounts[0] * 100. / glf[firstGlf].maxPosition);
        printf("        Reference bases = %9d (%.3f%%)\n",
               glf[firstGlf].maxPosition - baseCounts[0], (glf[firstGlf].maxPosition - baseCounts[0]) * 100. / glf[firstGlf].maxPosition);

        printf("              A/T bases = %9d (%.3f%%, %d A, %d T)\n",
               baseCounts[1] + baseCounts[4],
               (baseCounts[1] + baseCounts[4]) * 100. / actualBases,
               baseCounts[1], baseCounts[4]);

        printf("              G/C bases = %9d (%.3f%%, %d G, %d C)\n",
               baseCounts[3] + baseCounts[2],
               (baseCounts[3] + baseCounts[2]) * 100. / actualBases,
               baseCounts[3], baseCounts[2]);

        printf("           Depth Filter = %9d bases (%.3f%%)\n",
               depthFilter, depthFilter * 100. / actualBases);

        printf("     Map Quality Filter = %9d bases (%.3f%%)\n",
               mapQualityFilter, mapQualityFilter * 100. / actualBases);

        printf("        Non-Polymorphic = %9d bases (%.3f%%)\n",
               homozygousReference, homozygousReference * 100. / actualBases);

        printf("            Transitions = %9d bases (%.3f%%)\n",
               transitions, transitions * 100. / actualBases);

        printf("          Transversions = %9d bases (%.3f%%)\n",
               transversions, transversions * 100. / actualBases);

        printf("    Other Polymorphisms = %9d bases (%.3f%%)\n",
               otherPolymorphisms, otherPolymorphisms * 100. / actualBases);

        if (n > 10)
            printf("          Homology Sink = %9d bases (%.3f%%)\n",
                   sinkFilter, sinkFilter * 100. / actualBases);

        if (smartFilter)
            printf("           Smart Filter = %9d bases (%.3f%%)\n",
                   smartFilterHits, smartFilterHits * 100. / actualBases);

        int noCalls = actualBases - homozygousReference - transitions - transversions - otherPolymorphisms - sinkFilter;
        printf("                No call = %9d bases (%.3f%%)\n",
               noCalls, noCalls * 100. / actualBases);

        fflush(stdout);
    }

    if (baseCalls != NULL)
        fclose(baseCalls);

    time(&t);
    printf("\nAnalysis completed on %s\n", ctime(&t));
    fflush(stdout);
}
int ReadReference::execute(int argc, char **argv)
{
    static const int UNSPECIFIED_INT = -1;
    String refFile = "";
    String refName = "";
    int start = UNSPECIFIED_INT;
    int numBases = UNSPECIFIED_INT;
    int end = UNSPECIFIED_INT;
    bool params = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_INTPARAMETER("start", &start)
        LONG_INTPARAMETER("end", &end)
        LONG_INTPARAMETER("numBases", &numBases)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);
    
    if((refName == "") || (start == UNSPECIFIED_INT) || 
       ((end == UNSPECIFIED_INT) && (numBases == UNSPECIFIED_INT)))
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing Required Parameter\n\n";
        return(-1);
    }
    if((end != UNSPECIFIED_INT) && (numBases != UNSPECIFIED_INT))
    {
        usage();
        inputParameters.Status();
        std::cerr << "Only --end or --numBases can be specified\n\n";
        return(-1);
    }
    else if(numBases != UNSPECIFIED_INT)
    {
        end = start + numBases;
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the reference.
    GenomeSequence reference(refFile);

    uint32_t refStart = 
        reference.getGenomePosition(refName.c_str());

    if(refStart == INVALID_GENOME_INDEX)
    {
        std::cerr << "Reference Name: " << refName.c_str()
                  << " not found in the reference file\n"; 
        return(-1);
    }

    std::string refString;
    
    reference.getString(refString, refStart + start, end - start);
    std::cout << refString << std::endl;
    
    return(0);
}
Beispiel #4
0
int ClipOverlap::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String storeOrig = "";
    bool readName = false;
    bool noRNValidate = false;
    bool stats = false;
    int poolSize = DEFAULT_POOL_SIZE;
    bool unmapped = false;
    bool noeof = false;
    bool params = false;
    String excludeFlags = "0xF0C";

    // TODO, cleanup legacy parameters
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("storeOrig", &storeOrig)
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER ("noRNValidate", &noRNValidate)
        LONG_PARAMETER ("stats", &stats)
        LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly)
        LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters")
        LONG_INTPARAMETER("poolSize", &poolSize)
        LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap)
        LONG_PHONEHOME(VERSION)
        BEGIN_LEGACY_PARAMETERS()
        LONG_PARAMETER ("clipsOnly", &myOverlapsOnly)
        LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the out file was specified, if not, report an error.
    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // Out file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if((storeOrig.Length() != 0) && (storeOrig.Length() != 2))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "--storeOrig tag name must be 2 characters.\n";
        return(-1);
    }

    myOverlapHandler = new OverlapClipLowerBaseQual();
    if(myOverlapHandler == NULL)
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Failed to allocate the overlap handler\n";
        return(-1);
    }

    if(unmapped)
    {
        myOverlapHandler->markAsUnmapped();
    }

    // Setup the overlap handler.
    myOverlapHandler->keepStats(stats);
    if(storeOrig.Length() != 0)
    {
        myOverlapHandler->storeOrigCigar(storeOrig);
    }

    myIntExcludeFlags = excludeFlags.AsInteger();

    if(params)
    {
        inputParameters.Status();
    }

    // For each step process the file.
    // Open the files & read/write the sam header.
    SamStatus::Status runStatus = SamStatus::SUCCESS;
    for(int i = 1; i <= myOverlapHandler->numSteps(); i++)
    {
        // Open the file for reading.
        mySamHeader.resetHeader();
        SamFile samIn(inFile, SamFile::READ, &mySamHeader);
        SamFile* samOutPtr = NULL;
        // Check if writing, if so, open the output file.
        if(i == myOverlapHandler->numSteps())
        {
            samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader);
        }

        if(readName)
        {
            if(!noRNValidate)
            {
                samIn.setSortedValidation(SamFile::QUERY_NAME);
            }
            runStatus = handleSortedByReadName(samIn, samOutPtr);
        }
        else
        {
            // Coordinate sorted, so work with the pools.
            samIn.setSortedValidation(SamFile::COORDINATE);
            myPool.setMaxAllocatedRecs(poolSize);

            // Reset the number of failures
            myNumMateFailures = 0;
            myNumPoolFail = 0;
            myNumPoolFailNoHandle = 0;
            myNumPoolFailHandled = 0;
            myNumOutOfOrder = 0;

            // Run by coordinate
            if(samOutPtr != NULL)
            {
                // Setup the output buffer for writing.
                SamCoordOutput outputBuffer(myPool);
                outputBuffer.setOutputFile(samOutPtr, &mySamHeader);
                runStatus = handleSortedByCoord(samIn, &outputBuffer);

                // Cleanup the output buffer.
                if(!outputBuffer.flushAll())
                {
                    std::cerr << "ERROR: Failed to flush the output buffer\n";
                    runStatus = SamStatus::FAIL_IO;
                }
            }
            else
            {
                runStatus = handleSortedByCoord(samIn, NULL);
            }
        }

        if(runStatus != SamStatus::SUCCESS)
        {
            break;
        }
        // Close the input file, it will be reopened if there are 
        // multiple steps.
        samIn.Close();
        if(samOutPtr != NULL)
        {
            samOutPtr->Close();
            delete samOutPtr;
            samOutPtr = NULL;
        }
    }

    // Done processing.
    // Print Stats
    myOverlapHandler->printStats();

    if(myNumMateFailures != 0)
    {
        std::cerr << "WARNING: did not find expected overlapping mates for "
                  << myNumMateFailures << " records." << std::endl;
    }
    if(myNumPoolFail != 0)
    {
        // Had to skip clipping some records due to running out of
        // memory and not being able to wait for the mate.
        std::cerr << "WARNING: " << myNumPoolFail 
                  << " record pool failures\n";
        if(myNumPoolFailNoHandle != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, skipped handling " 
                      << myNumPoolFailNoHandle << " records." << std::endl;
        }
        if(myNumPoolFailHandled != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, default handled " 
                      << myNumPoolFailHandled << " records." << std::endl;
        }
        if(myNumOutOfOrder != 0)
        {
            std::cerr << "WARNING: Resulting File out of Order by " 
                      << myNumOutOfOrder << " records.\n";
        }
    }

    if(runStatus == SamStatus::SUCCESS)
    {
        if(myNumPoolFail == 0)
        {
            std::cerr << "Completed ClipOverlap Successfully.\n";
        }
        else
        {
            runStatus = SamStatus::NO_MORE_RECS;
            std::cerr << "Completed ClipOverlap with WARNINGS.\n";
        }
    }
    else
    {
        std::cerr << "Failed to complete ClipOverlap.\n";
    }
    return(runStatus);
}
Beispiel #5
0
int Validate::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    int maxErrors = -1;
    int printableErrors = 100;
    bool so_flag = false;
    bool so_coord = false;
    bool so_query = false;
    bool noeof = false;
    bool disableStatistics = false;
    bool verbose = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER("noeof", &noeof)
        LONG_INTPARAMETER("maxErrors", &maxErrors)
        LONG_PARAMETER("verbose", &verbose)
        LONG_INTPARAMETER("printableErrors", &printableErrors)
        LONG_PARAMETER("disableStatistics", &disableStatistics)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SortOrder")
        EXCLUSIVE_PARAMETER("so_flag", &so_flag)
        EXCLUSIVE_PARAMETER("so_coord", &so_coord)
        EXCLUSIVE_PARAMETER("so_query", &so_query)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Determine the sort type for validation based on the parameters.
    SamFile::SortedType sortType = SamFile::UNSORTED;
    if(so_flag)
    {
        sortType = SamFile::FLAG;
    } 
    else if(so_coord)
    {
        sortType = SamFile::COORDINATE;
    }
    else if(so_query)
    {
        sortType = SamFile::QUERY_NAME;
    }
   
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for validate, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Since we want to accumulate multiple errors, use RETURN rather
    // than throwing exceptions.
    SamFile samIn(ErrorHandler::RETURN);
    // Open the file for reading.   
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Set the sorting validation type.
    samIn.setSortedValidation(sortType);

    // Set that statistics should be generated.
    samIn.GenerateStatistics(!disableStatistics);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Read the sam records.
    SamRecord samRecord(ErrorHandler::RETURN);

    // Track the status.
    SamStatus::Status status = SamStatus::SUCCESS;

    // Keep reading records until the end of the file is reached.
    int numValidRecords = 0;
    int numInvalidRecords = 0;
    int numErrorRecords = 0;
    int numRecords = 0;
    int numReportedErrors = 0;
    int totalErrorRecords = 0;

    std::map<SamStatus::Status, uint64_t> errorStats;
    std::map<SamValidationError::Type, uint64_t> invalidStats;

    SamValidationErrors invalidSamErrors;

    // Keep reading records from the file until SamFile::ReadRecord
    // indicates to stop (returns false).
    while( ( (maxErrors < 0) || (totalErrorRecords < maxErrors) ) &&
           ( (samIn.ReadRecord(samHeader, samRecord)) || (SamStatus::isContinuableStatus(samIn.GetStatus())) ) )
    {
        ++numRecords;
        if(samIn.GetStatus() == SamStatus::SUCCESS)
        {
            // Successfully set the record, so check to see if it is valid.
            // Clear any errors in the list.
            invalidSamErrors.clear();
            if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors))
            {
                // The record is not valid.
                ++numInvalidRecords;
                ++totalErrorRecords;
                if(verbose && (numReportedErrors < printableErrors))
                {
                    std::cerr << "Record " << numRecords << std::endl
                              << invalidSamErrors << std::endl;
                    ++numReportedErrors;
                }
                // Update the statistics for all validation errors found in this record.
                invalidSamErrors.resetErrorIter();
                const SamValidationError* errorPtr = invalidSamErrors.getNextError();
                while(errorPtr != NULL)
                {
                    ++invalidStats[errorPtr->getType()];
                    errorPtr = invalidSamErrors.getNextError();
                }

                // If the status is not yet set, set it.
                if(status == SamStatus::SUCCESS)
                {
                    status = SamStatus::INVALID;
                }
            }
            else
            {
                // Valid record, so increment the counter.
                ++numValidRecords;
            }
        }
        else
        {
            // Error reading the record.
            ++numErrorRecords;
            ++totalErrorRecords;
            if(verbose && (numReportedErrors < printableErrors))
            {
                // report error.
                std::cerr << "Record " << numRecords << std::endl
                          << samIn.GetStatusMessage() << std::endl
                          << std::endl;
                ++numReportedErrors;
            }
            // Increment the statistics
            ++errorStats[samIn.GetStatus()];

            // If the status is not yet set, set it.
            if(status == SamStatus::SUCCESS)
            {
                status = samIn.GetStatus();
            }
        }
    }

    if( (samIn.GetStatus() != SamStatus::NO_MORE_RECS) &&
        (totalErrorRecords < maxErrors) )
    {
        // The last read call had a failure, so report it.
        // If the number of errors is >= ,maxErrors we don't
        // want to print any more failures.
        ++numErrorRecords;
        ++totalErrorRecords;
        if(numReportedErrors < printableErrors)
        {
            std::cerr << "Record " << numRecords << ": ";
            std::cerr << std::endl << samIn.GetStatusMessage() << std::endl;
        }

        // Increment the statistics
        ++errorStats[samIn.GetStatus()];

        if(status == SamStatus::SUCCESS)
        {
            status = samIn.GetStatus();
        }
    }

    if(totalErrorRecords == maxErrors)
    {
        if(maxErrors == 0)
        {
            std::cerr << "WARNING file was not read at all due to maxErrors setting, but returning Success.\n";
        }
        else
        {
            // Print a note that the entire file was not read.
            std::cerr << "File was not completely read due to the number of errors.\n";
            std::cerr << "Statistics only reflect the part of the file that was read.\n";
        }
    }

    fprintf(stderr, "\nNumber of records read = %d\n", numRecords);
    fprintf(stderr, "Number of valid records = %d\n", numValidRecords);

    std::cerr << std::endl;
    if(numRecords != numValidRecords)
    {
        std::cerr << "Error Counts:\n";

        // Loop through the non-validation errors.
        std::map<SamStatus::Status, uint64_t>::iterator statusIter;
        for(statusIter = errorStats.begin(); statusIter != errorStats.end(); statusIter++)
        {
            std::cerr << "\t" << SamStatus::getStatusString(statusIter->first) << ": "
                      << statusIter->second << std::endl;
        }

        std::map<SamValidationError::Type, uint64_t>::iterator invalidIter;
        for(invalidIter = invalidStats.begin(); invalidIter != invalidStats.end(); invalidIter++)
        {
            std::cerr << "\t" << SamValidationError::getTypeString(invalidIter->first) << ": "
                      << invalidIter->second << std::endl;
        }

        std::cerr << std::endl;
    }
    samIn.PrintStatistics();

    fprintf(stderr, "Returning: %d (%s)\n", status, SamStatus::getStatusString(status));
    return(status);
}
Beispiel #6
0
int main(int argc, char ** argv)
   {
   setbuf(stdout, NULL);

   time_t start = time(NULL);

   printf("MiniMac - Imputation into phased haplotypes\n"
          "(c) 2011 Goncalo Abecasis\n");
#ifdef __VERSION__
   printf("VERSION 5.0\n");
#else
   printf("UNDOCUMENTED RELEASE\n");
#endif

   int rounds = 5, states = 200, cpus = 0;
   bool em = false, gzip = false, phased = false;

   String referenceHaplotypes, referenceSnps;
   String haplotypes, snps;
   String prefix("minimac");
   String firstMarker, lastMarker;

   String recombinationRates, errorRates;

   BEGIN_LONG_PARAMETERS(longParameters)
      LONG_PARAMETER_GROUP("Reference Haplotypes")
         LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes)
         LONG_STRINGPARAMETER("refSnps", &referenceSnps)
      LONG_PARAMETER_GROUP("Target Haplotypes")
         LONG_STRINGPARAMETER("haps", &haplotypes)
         LONG_STRINGPARAMETER("snps", &snps)
      LONG_PARAMETER_GROUP("Starting Parameters")
         LONG_STRINGPARAMETER("rec", &recombinationRates)
         LONG_STRINGPARAMETER("erate", &errorRates)
      LONG_PARAMETER_GROUP("Parameter Fitting")
         LONG_INTPARAMETER("rounds", &rounds)
         LONG_INTPARAMETER("states", &states)
         LONG_PARAMETER("em", &em)
      LONG_PARAMETER_GROUP("Output Files")
         LONG_STRINGPARAMETER("prefix", &prefix)
         LONG_PARAMETER("phased", &phased)
         LONG_PARAMETER("gzip", &gzip)
//    LONG_PARAMETER_GROUP("Clipping Window")
//      LONG_STRINGPARAMETER("start", &firstMarker)
//      LONG_STRINGPARAMETER("stop", &lastMarker)
#ifdef _OPENMP
      LONG_PARAMETER_GROUP("Multi-Threading")
         LONG_INTPARAMETER("cpus", &cpus)
#endif
   END_LONG_PARAMETERS();

   ParameterList pl;

   pl.Add(new LongParameters("Command Line Options", longParameters));
   pl.Read(argc, argv);
   pl.Status();

#ifdef _OPENMP
   if (cpus > 0)
      omp_set_num_threads(cpus);
#endif

   // Read marker list
   printf("Reading Reference Marker List ...\n");

   StringArray refMarkerList;
   refMarkerList.Read(referenceSnps);

   // Index markers
   StringIntHash referenceHash;
   for (int i = 0; i < refMarkerList.Length(); i++)
      referenceHash.Add(refMarkerList[i].Trim(), i);

   printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

   // Load reference haplotypes
   printf("Loading reference haplotypes ...\n");
   HaplotypeSet reference;

   reference.markerCount = refMarkerList.Length();
   reference.LoadHaplotypes(referenceHaplotypes);

   printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);

   // Read framework marker list
   printf("Reading Framework Marker List ...\n");
   StringArray markerList;
   markerList.Read(snps);

   ClipReference(reference, refMarkerList, referenceHash, markerList,
                 firstMarker, lastMarker);

   // Crossref Marker Names to Reference Panel Positions
   IntArray markerIndex;
   markerIndex.Dimension(markerList.Length());

   int matches = 0;

   for (int i = 0; i < markerList.Length(); i++)
      {
      markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

      if (markerIndex[i] >= 0) matches++;
      }

   printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

   if (matches == 0)
      error("No markers overlap between target and reference\n"
            "Please check correct reference is being used and markers are named consistently");

   printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

   // Check for flips in reference vs. target haplotypes
   int flips = 0;
   int previous = -1;
   for (int i = 0; i < markerIndex.Length(); i++)
      if (markerIndex[i] >= 0)
         if (markerIndex[i] < previous)
            {
            if (flips++ < 10)
               printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                     (const char *) refMarkerList[previous],
                     (const char *) markerList[i]);
            previous = markerIndex[i];
            }
   if (flips > 10)
      printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
   if (flips)
      printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

   // Load target haplotypes
   printf("Loading target haplotypes ...\n");
   HaplotypeSet target;

   target.markerCount = markerList.Length();
   target.LoadHaplotypes(haplotypes, true);

   reference.CalculateFrequencies();
   target.CalculateFrequencies();
   target.CompareFrequencies(reference, markerIndex, markerList);

   printf("  %d Target Haplotypes Loaded ...\n\n", target.count);

   int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker);
   int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker);

   if (startIndex < 0 || stopIndex < 0)
      error("Clipping requested, but no position available for one of the endpoints");

   printf("Setting up Markov Model...\n\n");

   // Setup Markov Model
   MarkovParameters mp;

   mp.Allocate(reference.markerCount);

   if (rounds > 0)
      printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n",
             em ? "E-M" : "MCMC", states);

   // Simple initial estimates of error and recombination rate
   for (int i = 0; i < reference.markerCount; i++)
      mp.E[i] = 0.01;

   for (int i = 0; i < reference.markerCount - 1; i++)
      mp.R[i] = 0.001;

   if (mp.ReadErrorRates(errorRates))
      printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);

   if (mp.ReadCrossoverRates(recombinationRates))
      printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);

   // Parameter estimation loop
   for (int round = 0; round < rounds; round++)
      {
      printf("  Round %d of Parameter Refinement ...\n", round + 1);

      int iterations = states < reference.count ? states : reference.count;

      MarkovModel original;
      original.CopyParameters(mp);

      #pragma omp parallel for
      for (int i = 0; i < iterations; i++)
         {
         MarkovModel mm;

         mm.Allocate(reference.markerCount, reference.count - 1);
         mm.CopyParameters(original);

         // Reference leave one out (loo) panel
         char ** reference_loo = new char * [reference.count - 1];
         for (int in = 0, out = 0; in < reference.count; in++)
            if (in != i)
               reference_loo[out++] = reference.haplotypes[in];

         mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);

         if (em)
            mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
         else
            {
            #pragma omp critical
            { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); }
            }

         delete [] reference_loo;

         #pragma omp critical
         mp += mm;
         }

      if (round >= rounds / 2)
         {
         int iterations = states < target.count ? states : target.count;

         #pragma omp parallel for
         for (int i = 0; i < iterations; i++)
            {
            MarkovModel mm;

            mm.Allocate(reference.markerCount, reference.count);
            mm.CopyParameters(original);

            // Padded version of target haplotype, including missing sites
            char * padded = new char [reference.markerCount];
            for (int k = 0; k < reference.markerCount; k++)
               padded[k] = 0;

            // Copy current haplotype into padded vector
            for (int j = 0; j < target.markerCount; j++)
               if (markerIndex[j] >= 0)
                  padded[markerIndex[j]] = target.haplotypes[i][j];

            mm.WalkLeft(padded, reference.haplotypes, reference.freq);

            if (em)
               mm.CountExpected(padded, reference.haplotypes, reference.freq);
            else
               {
               #pragma omp critical
               { mm.ProfileModel(padded, reference.haplotypes, reference.freq); }
               }

            delete [] padded;

            #pragma omp critical
            mp += mm;
            }
         }

      mp.UpdateModel();

      double crossovers = 0;
      for (int i = 0; i < reference.markerCount - 1; i++)
         crossovers += mp.R[i];

      double errors = 0;
      for (int i = 0; i < reference.markerCount; i++)
         {
         double heterozygosity = 1.0 - square(reference.freq[1][i])
                                     - square(reference.freq[2][i])
                                     - square(reference.freq[3][i])
                                     - square(reference.freq[4][i]);

         errors += mp.E[i] * heterozygosity;
         }
      errors /= reference.markerCount + 1e-30;

      printf("      %.0f mosaic crossovers expected per haplotype\n", crossovers);
      printf("      %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.);
      printf("      %.3g errors in mosaic expected per marker\n", errors);
      }

   if (rounds > 0)
      {
      printf("  Saving estimated parameters for future use ...\n");
      mp.WriteParameters(refMarkerList, prefix, gzip);
      }

   printf("\n");

   // List the major allele at each location
   reference.ListMajorAlleles();

   printf("Generating Draft .info File ...\n\n");

   // Output some basic information
   IFILE info = ifopen(prefix + ".info.draft", "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

   for (int i = 0, j = 0; i <= stopIndex; i++)
      if (i >= startIndex)
         ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
            reference.freq[reference.major[i]][i],
            j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-");
      else
         if (j < markerIndex.Length() && i == markerIndex[j])
            j++;

   ifclose(info);

   printf("Imputing Genotypes ...\n");

   IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt");
   IFILE hapdose, haps;

   if (phased)
      {
      hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt");
      haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt");
      }

   ImputationStatistics stats(reference.markerCount);

   // Impute each haplotype
   #pragma omp parallel for
   for (int i = 0; i < target.count; i++)
      {
      if (i != 0 && target.labels[i] == target.labels[i-1])
         continue;

      MarkovModel mm;

      mm.Allocate(reference.markerCount, reference.count);
      mm.ClearImputedDose();
      mm.CopyParameters(mp);

      // Padded version of target haplotype, including missing sites
      char * padded = new char [reference.markerCount];
      for (int j = 0; j < reference.markerCount; j++)
         padded[j] = 0;

      int k = i;

      do {
         printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);

         // Copy current haplotype into padded vector
         for (int j = 0; j < target.markerCount; j++)
            if (markerIndex[j] >= 0)
               padded[markerIndex[j]] = target.haplotypes[k][j];

         mm.WalkLeft(padded, reference.haplotypes, reference.freq);
         mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

         #pragma omp critical
         { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

         #pragma omp critical
         if (phased)
            {
            ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
               ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
               }
            ifprintf(hapdose, "\n");
            ifprintf(haps, "\n");
            }

         k++;
      } while (k < target.count && target.labels[k] == target.labels[i]);

      printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);

      #pragma omp critical
         {
         ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
         for (int j = startIndex; j <= stopIndex; j++)
            ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
         ifprintf(dosages, "\n");
         }

      delete [] padded;
      }

   ifclose(dosages);

   if (phased)
      {
      ifclose(hapdose);
      ifclose(haps);
      }

   // Output some basic information
   info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n");

   // Padded version of target haplotype, including missing sites
   char * padded = new char [reference.markerCount];
   for (int k = 0; k < reference.markerCount; k++)
      padded[k] = 0;

   // Mark genotyped SNPs in padded vector
   for (int j = 0; j < target.markerCount; j++)
      if (markerIndex[j] >= 0)
          padded[markerIndex[j]] = 1;

   for (int i = startIndex; i <= stopIndex; i++)
      {
      ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i),
            reference.MinorAlleleLabel(i),
            stats.AlleleFrequency(i),
            stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i),
            stats.AverageCallScore(i),
            stats.Rsq(i));

      if (padded[i])
         ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
                  stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i),
                  stats.LooMajorDose(i), stats.LooMinorDose(i));
      else
         ifprintf(info, "-\t-\t-\t-\t-\t-\n");
      }

   ifclose(info);

   delete [] padded;

   time_t stop = time(NULL);
   int seconds = stop - start;

   printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
          seconds / 3600, (seconds % 3600) / 60, seconds % 60,
          ctime(&stop));
   }
Beispiel #7
0
// main function of verifyBamID
int execute(int argc, char** argv) {
  printf("verifyBamID %s -- verify identity and purity of sequence data\n"
	 "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION);

  VerifyBamIDArgs args;
  ParameterList pl;

  BEGIN_LONG_PARAMETERS(longParameters)
    LONG_PARAMETER_GROUP("Input Files")
    LONG_STRINGPARAMETER("vcf",&args.sVcfFile)
    LONG_STRINGPARAMETER("bam",&args.sBamFile)
    LONG_STRINGPARAMETER("subset",&args.sSubsetInds)
    LONG_STRINGPARAMETER("smID",&args.sSMID)

    LONG_PARAMETER_GROUP("VCF analysis options")
    LONG_DOUBLEPARAMETER("genoError",&args.genoError)
    LONG_DOUBLEPARAMETER("minAF",&args.minAF)
    LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate)

    LONG_PARAMETER_GROUP("Individuals to compare with chip data")
    EXCLUSIVE_PARAMETER("site",&args.bSiteOnly)
    EXCLUSIVE_PARAMETER("self",&args.bSelfOnly)
    EXCLUSIVE_PARAMETER("best",&args.bFindBest)

    LONG_PARAMETER_GROUP("Chip-free optimization options")
    EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone)
    EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly)
    EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly)
    EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull)

    LONG_PARAMETER_GROUP("With-chip optimization options")
    EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone)
    EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly)
    EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly)
    EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull)

    LONG_PARAMETER_GROUP("BAM analysis options")
    LONG_PARAMETER("ignoreRG",&args.bIgnoreRG)
    LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair)
    LONG_PARAMETER("noEOF",&args.bNoEOF)
    LONG_PARAMETER("precise",&args.bPrecise)
    LONG_INTPARAMETER("minMapQ",&args.minMapQ)
    LONG_INTPARAMETER("maxDepth",&args.maxDepth)
    LONG_INTPARAMETER("minQ",&args.minQ)
    LONG_INTPARAMETER("maxQ",&args.maxQ)
    LONG_DOUBLEPARAMETER("grid",&args.grid)

    LONG_PARAMETER_GROUP("Modeling Reference Bias")
    LONG_DOUBLEPARAMETER("refRef",&args.pRefRef)
    LONG_DOUBLEPARAMETER("refHet",&args.pRefHet)
    LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt)

    LONG_PARAMETER_GROUP("Output options")
    LONG_STRINGPARAMETER("out",&args.sOutFile)
    LONG_PARAMETER("verbose",&args.bVerbose)
    LONG_PHONEHOME(VERSION)
  END_LONG_PARAMETERS();

  pl.Add(new LongParameters("Available Options",longParameters));
  pl.Read(argc, argv);
  pl.Status();

  // check the validity of input files
  if ( args.sVcfFile.IsEmpty() ) {
    error("--vcf [vcf file] required");
  }

  if ( args.sBamFile.IsEmpty() ) {
    error("--bam [bam file] is required");
  }

  if ( args.sOutFile.IsEmpty() ) {
    error("--out [output prefix] is required");
  }
  Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose);

  if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) {
    warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF");
    args.bSelfOnly = true;
  }

  if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) {
    warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth);
  }

  if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) {
    error("--self must be set for --chip-refBias to work. Skipping..");
  }

  // check timestamp
  time_t t;
  time(&t);
  Logger::gLogger->writeLog("Analysis started on %s",ctime(&t));

  // load arguments
  VerifyBamID vbid(&args);

  // load input VCF and BAM files
  Logger::gLogger->writeLog("Opening Input Files");
  vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str());

  // Check which genotype-free method is used
  if ( args.bFreeNone ) {  // if no genotype-free mode is tested. skip it
    // do nothing for genotype-free estimation
    Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture");
  }
  else if ( args.bFreeMixOnly ) { // only mixture is estimated.
    // genotype-free method
    Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);

    // scan across multiple readgroups
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::mixLLK mix(&vbid);
      mix.OptimizeLLK(rg);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1);
      vbid.mixOut.llk0s[rg+1] = mix.llk0;
      vbid.mixOut.llk1s[rg+1] = mix.llk1;
      vbid.mixOut.fMixs[rg+1] = mix.fMix;
    }

    //vbid.mixRefHet = 0.5;
    //vbid.mixRefAlt = 0.00;
  }
  else if ( args.bFreeRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture");
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(2);
      startingPoint[0] = 0;      // pRefHet = 0.5
      startingPoint[1] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(2);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  else if ( args.bFreeFull ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together");
    for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) {
      VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(3);
      startingPoint[0] = -3.91;  // start with fMix = 0.01
      startingPoint[1] = 0;      // pRefHet = 0.5
      startingPoint[2] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(3);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double fMix = VerifyBamID::invLogit(myMinimizer.point[0]);
      if ( fMix > 0.5 ) 
	fMix = 1.-fMix;
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.fMixs[rg+1] = myFunc.fMix;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  Logger::gLogger->writeLog("calculating depth distribution");  
  vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut);

  Logger::gLogger->writeLog("finished calculating depth distribution");  

  std::vector<int> bestInds(vbid.nRGs+1,-1);
  std::vector<int> selfInds(vbid.nRGs+1,-1);

  if ( args.bChipNone ) {
    // do nothing
    Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture");
  }
  else if ( args.bChipMixOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);
    
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;
      VerifyBamID::ibdLLK ibd(&vbid);
      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	double fIBD = ibd.OptimizeLLK(i, rg);
	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  vbid.bestOut.llk0s[rg+1] = ibd.llk0;
	  vbid.bestOut.llk1s[rg+1] = ibd.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD;
	  maxIBD = ibd.fIBD;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = ibd.llk0;
	  vbid.selfOut.llk1s[rg+1] = ibd.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD;
	}
      }

      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }
  else if ( args.bChipRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture");
    if ( args.bSelfOnly ) {
      for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
	VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(2);
	startingPoint[0] = 0;      // pRefHet = 0.5
	startingPoint[1] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;
	myMinimizer.Reset(2);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	//vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

	vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	vbid.calculateDepthByGenotype(0,rg,vbid.selfOut);
      }
    }
    else {
      Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping..");
    }
  }
  else if ( args.bChipFull ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together");
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;

      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(3);
	startingPoint[0] = 3.91;  // start with fIBD = 0.99
	startingPoint[1] = 0;      // pRefHet = 0.5
	startingPoint[2] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;

	myFunc.indIdx = i;
	myMinimizer.Reset(3);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);

	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg);
	//Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  maxIBD = fIBD;
	  vbid.bestOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.bestOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.bestOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	  vbid.calculateDepthByGenotype(i, rg, vbid.selfOut);
	}
      }
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);
      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }

  // PRINT OUTPUT FILE - ".selfSM"
  // [SEQ_ID]  : SAMPLE ID in the sequence file
  // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available)
  // [#SNPS] : Number of markers evaluated
  // [#READS]   : Number of reads evaluated
  // [AVG_DP]   : Mean depth
  // [FREEMIX]  : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable
  // [FREELK1]  : Chip-free log-likelihood at estimated alpha
  // [FREELK0]  : Chip-free log-likelihood at 0% contamination
  // [CHIPIBD]  : With-chip estimated alpha (% MIX in 0-1 scale)
  // [CHIPLK1]  : With-chip log-likelihood at estimated alpha
  // [CHIPLK0]  : With-chip log-likelihood at 0% contamination
  // [DPREF]    : Depth at reference site in the chip
  // [RDPHET]   : Relative depth at HET site in the chip
  // [RDPALT]   : Relative depth at HOMALT site in the chip
  // [FREE_RF]  : Pr(Ref|Ref) site estimated without chip data
  // [FREE_RH]  : Pr(Ref|Het) site estimated without chip data
  // [FREE_RA]  : Pr(Ref|Alt) site estimated without chip data
  // [CHIP_RF]  : Pr(Ref|Ref) site estimated with chip data
  // [CHIP_RH]  : Pr(Ref|Het) site estimated with chip data
  // [CHIP_RA]  : Pr(Ref|Alt) site estimated with chip data
  // [DPREF]    : Depth at reference alleles
  // [RDPHET]   : Relative depth at heterozygous alleles
  // [RDPALT]   : Relative depth at hom-alt alleles

  String selfSMFN = args.sOutFile + ".selfSM";
  String bestSMFN = args.sOutFile + ".bestSM";
  String selfRGFN = args.sOutFile + ".selfRG";
  String bestRGFN = args.sOutFile + ".bestRG";
  String dpSMFN = args.sOutFile + ".depthSM";
  String dpRGFN = args.sOutFile + ".depthRG";

  IFILE selfSMF = ifopen(selfSMFN,"wb");
  IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL);
  IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb"));
  IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL;

  IFILE dpSMF = ifopen(dpSMFN,"wb");
  IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb"));
  if ( selfSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",selfSMF);
  }
  if ( args.bFindBest && ( bestSMF == NULL ) ) {
    Logger::gLogger->error("Cannot write to %s",bestSMF);
  }
  if ( dpSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",dpSMF);
  }

  ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
  int nCumMarkers = 0;
  for(int i=args.maxDepth; i >= 0; --i) {
    nCumMarkers += vbid.mixOut.depths[i];
    ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers);
  }
  ifclose(dpSMF);


  if ( dpRGF != NULL ) {
    ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
    for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      const char* rgID = vbid.pPile->vsRGIDs[rg].c_str();

      int nMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
      }

      nCumMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
	nCumMarkers += d;
	ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers);
      }
    }
    ifclose(dpRGF);
  }

  const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"};
  int nheaders = sizeof(headers)/sizeof(headers[0]);

  for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
  ifprintf(selfSMF,"\n");
  ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
  ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA");
  ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
  if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
  else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else { error("Invalid option in handling bFree"); }

  if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else { error("Invalid option in handling bChip"); }
  ifprintf(selfSMF,"\n");
  ifclose(selfSMF);

  if ( bestSMF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestSMF,"\n");
    ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
    ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA");
    ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
    if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
    else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else { error("Invalid option in handling bFree"); }
    
    if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else { error("Invalid option in handling bChip"); }
    ifprintf(bestSMF,"\n");
    ifclose(bestSMF);
  }

  if ( selfRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(selfRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(selfRGF,"\n");
    }
    ifclose(selfRGF);
  }

  if ( bestRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(bestRGF,"\n");
    }
    ifclose(bestRGF);
  }
  
  time(&t);
  Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t));

  return 0;
}
Beispiel #8
0
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads (also set the read section).
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Determine how many entries.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Read the next line.
                buffer.ReadLine(fdbSnp);
                // If it does not have at least 2 columns, 
                // continue to the next line.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                tokens.AddTokens(buffer);
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
        
                tokens.Clear();
                buffer.Clear();
            }
        }
        ifclose(fdbSnp);
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be genereated.
            if(qual || phred)
            {
                // Get the quality.
                const char* qual = samRecord.getQuality();
                // Check for no quality ('*').
                if((qual[0] == '*') && (qual[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qual[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.
                            ++index;
                            // Increment the position.
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL))
                        {
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qual[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qual[index])]);
                        ++(phredCount[(int)(qual[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        ifclose(baseQCPtr);
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
int main(int argc, char ** argv)
{   
   ParameterList inputParameters;
   String filename;
   int minReadLength = 10;
   int printableErrors = 20;
   int maxErrors = -1;
   String testParam;
   BaseAsciiMap::SPACE_TYPE myBaseType = BaseAsciiMap::UNKNOWN;
   
   // Read the parameters from the command line.
   bool baseSpace = false;
   bool colorSpace = false;
   bool autoDetect = false;
   bool ignoreErrors = false;
   bool baseComposition = false;
   bool avgQual = false;
   bool quiet = false;
   bool noeof = false;
   bool params = false;
   bool disableSeqIDCheck = false;
   bool interleaved = false;

   BEGIN_LONG_PARAMETERS(longParameterList)
      LONG_STRINGPARAMETER("file", &filename)
      LONG_PARAMETER("baseComposition", &baseComposition)
      LONG_PARAMETER("avgQual", &avgQual)
      LONG_PARAMETER("disableSeqIDCheck", &disableSeqIDCheck)
      LONG_PARAMETER("interleaved", &interleaved)
      LONG_PARAMETER("noeof", &noeof)
      LONG_PARAMETER("quiet", &quiet)
      LONG_PARAMETER("params", &params)
      LONG_INTPARAMETER("minReadLen", &minReadLength)
      LONG_INTPARAMETER("maxErrors", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("auto", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("printableErrors", &printableErrors)
   BEGIN_LEGACY_PARAMETERS()
      LONG_PARAMETER("printBaseComp", &baseComposition)       
      LONG_PARAMETER("disableAllMessages", &quiet)
      LONG_INTPARAMETER("quitAfterErrorNum", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("autoDetect", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreAllErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("maxReportedErrors", &printableErrors)
   END_LONG_PARAMETERS();
   
   inputParameters.Add(new LongParameters ("Input Parameters", longParameterList));

   inputParameters.Read(argc, argv);

   if(ignoreErrors)
   {
      // Ignore all errors, so set printableErrors to 0.
      printableErrors = 0;
   }

   // Set the base type based on the passed in parameters.
   if(baseSpace)
   {
      // Base Space
      myBaseType = BaseAsciiMap::BASE_SPACE;
   }
   else if(colorSpace)
   {
      myBaseType = BaseAsciiMap::COLOR_SPACE;
   }
   else
   {
      myBaseType = BaseAsciiMap::UNKNOWN;
      // Set autoDetect
      autoDetect = true;
   }

   // If no eof block is required for a bgzf file, set the bgzf file type to 
   // not look for it.
   if(noeof)
   {
       // Set that the eof block is not required.
       BgzfFileType::setRequireEofBlock(false);
   }

   // DO not print status if set to quiet.
   if((!quiet) && params)
   {
      inputParameters.Status();
   }

   if(filename == "")
   {
      if(quiet)
      {
         return(-1);
      }
      // No filename was specified so print a usage description.
      std::cout << "ERROR: No filename specified.  See below for usage help.";
      std::cout << std::endl << std::endl;

      std::cout << "  Required Parameters:" << std::endl;
      std::cout << "\t--file  :  FastQ filename with path to be prorcessed.\n";
      std::cout << std::endl;

      std::cout << "  Optional Parameters:" << std::endl;
      std::cout << "\t--minReadLen         : Minimum allowed read length (Defaults to 10).\n";
      std::cout << "\t--maxErrors          : Number of errors to allow before quitting\n";
      std::cout << "\t                       reading/validating the file.\n";
      std::cout << "\t                       -1 (default) indicates to not quit until\n";
      std::cout << "\t                       the entire file is read.\n";
      std::cout << "\t                       0 indicates not to read/validate anything\n";
      std::cout << "\t--printableErrors    : Maximum number of errors to print before\n";
      std::cout << "\t                       suppressing them (Defaults to 20).\n";
      std::cout << "\t                       Different than maxErrors since \n";
      std::cout << "\t                       printableErrors will continue reading and\n";
      std::cout << "\t                       validating the file until the end, but\n";
      std::cout << "\t                       just doesn't print the errors.\n";
      std::cout << "\t--ignoreErrors       : Ignore all errors (same as printableErrors = 0)\n";
      std::cout << "\t                       overwrites the printableErrors option.\n";
      std::cout << "\t--baseComposition    : Print the Base Composition Statistics.\n";
      std::cout << "\t--avgQual            : Print the average phred quality per cycle & overall average quality.\n";
      std::cout << "\t--disableSeqIDCheck  : Disable the unique sequence identifier check.\n";
      std::cout << "\t                       Use this option to save memory since the sequence id\n";
      std::cout << "\t                       check uses a lot of memory.\n";
      std::cout << "\t--noeof              : Disable checking that the eof block is present in gzipped files\n.";
      std::cout << "\t--interleaved        : Validate consequtive reads have the same sequence identifier\n";
      std::cout << "\t                       (only allowed difference is 1/2, but not required) and validate\n";
      std::cout << "\t                       that otherwise reads have unique sequence identifiers.\n";
      std::cout << "\t                       Cannot be used if '--disableSeqIDCheck' is specified.\n";
      std::cout << "\t--params             : Print the parameter settings.\n";
      std::cout << "\t--quiet              : Suppresses the display of errors and summary statistics.\n";
      std::cout << "\t                       Does not affect the printing of Base Composition Statistics.\n";

      std::cout << "\n  Optional Space Options for Raw Sequence (Last one specified is used):\n";
      std::cout << "\t--auto       : Determine baseSpace/colorSpace from the Raw Sequence in the file (Default).\n";
      std::cout << "\t--baseSpace  : ACTGN only\n";
      std::cout << "\t--colorSpace : 0123. only\n";
      std::cout << std::endl;

      std::cout << "  Usage:" << std::endl;
      std::cout << "\t./fastQValidator --file <fileName> [--minReadLen <minReadLen>] [--maxErrors <numErrors>] [--printableErrors <printableErrors>|--ignoreErrors] [--baseComposition] [--disableSeqIDCheck] [--interleaved] [--quiet] [--baseSpace|--colorSpace|--auto] [--params]\n\n";
      std::cout << "  Examples:" << std::endl;
      std::cout << "\t../fastQValidator --file testFile.txt\n";
      std::cout << "\t../fastQValidator --file testFile.txt --minReadLen 10 --baseSpace --printableErrors 100\n";
      std::cout << "\t./fastQValidator --file test/testFile.txt --minReadLen 10 --colorSpace --ignoreErrors\n";
      std::cout << std::endl;
      return (-1);
   }
   
   FastQFile validator(minReadLength, printableErrors);
   
   if(quiet)
   {
      validator.disableMessages();
   }

   if(disableSeqIDCheck)
   {
       validator.disableSeqIDCheck();
   }

   if(interleaved)
   {
       validator.interleaved();
   }

   if(interleaved && disableSeqIDCheck)
   {
       if(!quiet)
       {
           std::cout << "ERROR: --interleaved and --disableSeqIDCheck cannot both be specified.\n";
       }
       return(-1);
   }

   validator.setMaxErrors(maxErrors);

   FastQStatus::Status status = validator.validateFastQFile(filename, baseComposition, myBaseType, avgQual);

   if(!quiet)
   {
      std::cout << "Returning: " << status << " : " << FastQStatus::getStatusString(status)
                << std::endl;
   }

   return(status);
}
Beispiel #10
0
int WriteRegion::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String readName = "";
    String bed = "";
    myStart = UNSPECIFIED_INT;
    myEnd = UNSPECIFIED_INT;
    myPrevStart = UNSPECIFIED_INT;
    myPrevEnd = UNSPECIFIED_INT;
    myRefID = UNSET_REF;
    myRefName.Clear();
    myPrevRefName.Clear();
    myBedRefID = SamReferenceInfo::NO_REF_ID;
    bool lshift = false;
    bool noeof = false;
    bool params = false;
    myWithinReg = false;
    myWroteReg = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Region Parameters")        
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &myRefName)
        LONG_INTPARAMETER("refID", &myRefID)
        LONG_INTPARAMETER("start", &myStart)
        LONG_INTPARAMETER("end", &myEnd)
        LONG_STRINGPARAMETER("bed", &bed)
        LONG_PARAMETER("withinReg", &myWithinReg)
        LONG_STRINGPARAMETER("readName", &readName)
        LONG_PARAMETER_GROUP("Optional Other Parameters")
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    if(myRefID != UNSET_REF && myRefName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefID != UNSET_REF && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refID and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefName.Length() != 0 && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refName and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }

    if(!bed.IsEmpty())
    {
        myBedFile = ifopen(bed, "r");
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.   
    mySamIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Open the bam index file for reading if a region was specified.
    if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL))
    {
        mySamIn.ReadBamIndex(indexFile);
    }

    // Read & write the sam header.
    mySamIn.ReadHeader(mySamHeader);
    samOut.WriteHeader(mySamHeader);

    // Read the sam records.
    SamRecord samRecord;
    // Track the status.
    int numSectionRecords = 0;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
        
    while(getNextSection())
    {
        // Keep reading records until they aren't anymore.
        while(mySamIn.ReadRecord(mySamHeader, samRecord))
        {
            if(!readName.IsEmpty())
            {
                // Check for readname.
                if(strcmp(samRecord.getReadName(), readName.c_str()) != 0)
                {
                    // not a matching read name, so continue to the next record.
                    continue;
                }
            }
            
            // Check to see if the read has already been processed.
            if(myPrevEnd != UNSPECIFIED_INT)
            {
                // Because we already know that the bed was sorted, 
                // we know that the previous section started before
                // this one, so if the previous end is greater than
                // this record's end position we know that it
                // was already written in the previous section.
                // Note: can't be equal to the previous end since
                // the end range was exclusive, while
                // get0BasedAlignmentEnd is inclusive.
                // myPrevEnd is reset by getNextSection when a new
                // chromosome is hit.
                if(samRecord.get0BasedAlignmentEnd() < myPrevEnd)
                {
                    // This record was already written.
                    continue;
                }
            }

            // Shift left if applicable.
            if(lshift)
            {
                samRecord.shiftIndelsLeft();
            }

            // Successfully read a record from the file, so write it.
            samOut.WriteRecord(mySamHeader, samRecord);
            ++numSectionRecords;
        }
        myWroteReg = true;
    }

    if(myBedFile != NULL)
    {
        ifclose(myBedFile);
    }
    std::cerr << "Wrote " << outFile << " with " << numSectionRecords
              << " records.\n";
    return(returnStatus);
}
Beispiel #11
0
int VcfMac::execute(int argc, char **argv)
{
    String inputVcf = "";
    int minAC = -1;
    String sampleSubset = "";
    String filterList = "";
    bool params = false;

    IntervalTree<int> regions;
    std::vector<int> intersection;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("sampleSubset", &sampleSubset)
        LONG_INTPARAMETER("minAC", &minAC)
        LONG_STRINGPARAMETER("filterList", &filterList)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the two input files.
    VcfFileReader inFile;
    VcfHeader header;
    VcfRecord record;

    // Open the file
    if(sampleSubset.IsEmpty())
    {
        inFile.open(inputVcf, header);        
    }
    else
    {
        inFile.open(inputVcf, header, sampleSubset, NULL, NULL);
    }
    
    // Add the discard rule for minor allele count.
    if(minAC >= 0)
    {
        inFile.addDiscardMinMinorAlleleCount(minAC, NULL);
    }
    
    if(!filterList.IsEmpty())
    {
        // Open the filter list.
        IFILE regionFile = ifopen(filterList, "r");
        String regionLine;
        StringArray regionColumn;
        int start;
        int end;
        int intervalVal = 1;
        if(regionFile == NULL)
        {
            std::cerr << "Failed to open " << filterList 
                      << ", so keeping all positions\n";
            filterList.Clear();
        }
        else
        {
            while( regionFile->isOpen() && !regionFile->ifeof())
            {
                // Read the next interval
                regionLine.Clear();
                regionLine.ReadLine(regionFile);
                if(regionLine.IsEmpty())
                {
                    // Nothing on this line, continue to the next.
                    continue;
                }
                regionColumn.ReplaceColumns(regionLine, ' ');
                if(regionColumn.Length() != 2)
                {
                    std::cerr << "Improperly formatted region line: " 
                              << regionLine << "; skipping to the next line.\n";
                    continue;
                }
                // Convert the columns to integers.
                if(!regionColumn[0].AsInteger(start))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, start position "
                              << "(1st column) is not an integer: "
                              << regionColumn[0]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                if(!regionColumn[1].AsInteger(end))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, end position "
                              << "(2nd column) is not an integer: "
                              << regionColumn[1]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                // Add 1-based inclusive intervals.
                regions.add(start,end, intervalVal);
            }
        }
    }


    int numReadRecords = 0;

    while( inFile.readRecord(record))
    {
        if(!filterList.IsEmpty())
        {
            // Check if the region should be kept.
            intersection.clear();
            regions.get_intersecting_intervals(record.get1BasedPosition(), intersection);
            
            if(intersection.empty())
            {
                // not in the interval, so continue to the next record.
                continue;
            }
        }

        ++numReadRecords;

        // Loop through the number of possible alternates.
        unsigned int numAlts = record.getNumAlts();
        int minAlleleCount = -1;
        int curAlleleCount = 0;
        int totalAlleleCount = 0;
        for(unsigned int i = 0; i <= numAlts; i++)
        {
            curAlleleCount = record.getAlleleCount(i);
            if((minAlleleCount == -1) ||
               (curAlleleCount < minAlleleCount))
            {
                minAlleleCount = curAlleleCount;
            }
            totalAlleleCount += curAlleleCount;
        }
        if(totalAlleleCount != 0)
        {
            double maf = (double)minAlleleCount/totalAlleleCount;
            std::cout << record.getIDStr()
                      << "\t" << minAlleleCount
                      << "\t" << maf << "\n";
        }
    }
    
    inFile.close();

    //    std::cerr << "\n\t# Records: " << numReadRecords << "\n";

    // return success.
    return(0);
}