Ejemplo n.º 1
0
// Annotates every sample of every record with its own name in an "SN"
// FORMAT field.
//
// Input is the variant call file named by argv[1], or stdin when no argument
// is given. The header (with the SN FORMAT line added) and each modified
// record are written to stdout. Returns 1 when the input cannot be opened.
int main(int argc, char** argv) {

    typedef map<string, vector<string> > SampleFields;
    typedef map<string, SampleFields> SampleTable;

    VariantCallFile variantFile;

    if (argc <= 1) {
        variantFile.open(std::cin);
    } else {
        string path = argv[1];
        variantFile.open(path);
    }

    if (!variantFile.is_open())
        return 1;

    variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // Declare the new field in this record's FORMAT column.
        var.format.push_back("SN");

        // Overwrite each sample's SN value with the sample's own name.
        SampleTable::iterator sample = var.samples.begin();
        while (sample != var.samples.end()) {
            vector<string>& nameField = sample->second["SN"];
            nameField.clear();
            nameField.push_back(sample->first);
            ++sample;
        }

        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 2
0
// Adds allele-length annotations to the INFO column of every record:
//   length      length(ALT) - length(REF), one value per ALT
//   length.ref  length(REF)
//   length.alt  length(ALT), one value per ALT
//
// Input is the variant call file named by argv[1], or stdin when no argument
// is given; output goes to stdout. Returns 1 when the input cannot be opened.
int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc <= 1) {
        variantFile.open(std::cin);
    } else {
        string path = argv[1];
        variantFile.open(path);
    }

    if (!variantFile.is_open())
        return 1;

    variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">");
    variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">");
    variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">");
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // Signed length difference of each alternate relative to the reference.
        vector<string>& deltas = var.info["length"];
        deltas.clear();
        for (size_t i = 0; i < var.alt.size(); ++i) {
            const string& allele = var.alt[i];
            deltas.push_back(convert((int) allele.size() - (int) var.ref.size()));
        }

        // Reference length; any previous value is replaced.
        vector<string>& refLength = var.info["length.ref"];
        refLength.clear();
        refLength.push_back(convert(var.ref.size()));

        // Absolute length of each alternate.
        vector<string>& altLengths = var.info["length.alt"];
        altLengths.clear();
        for (size_t i = 0; i < var.alt.size(); ++i) {
            altLengths.push_back(convert((int) var.alt[i].size()));
        }

        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 3
0
// Filters the records, alleles and genotypes of a variant call file.
//
// Options (parsed with getopt_long):
//   -f, --info-filter SPEC      filter evaluated with passesFilters() (repeatable)
//   -g, --genotype-filter SPEC  filter applied to sample genotypes (repeatable)
//   -s, --filter-sites          evaluate info filters once per record instead of per allele
//   -t, --tag TAG               tag passing records/alleles and keep the rest,
//                               instead of dropping what fails
//   -a, --allele-tag KEY        with -t, write the tag per allele into INFO field KEY
//   -v, --invert                invert the decision (only used with --filter-sites)
//   -o, --or                    forwarded to passesFilters() as the logical-or flag
//   -r, --region REGION         region passed to setRegion() (repeatable; only
//                               honoured when reading from a named file)
//   -h, --help                  print the usage summary and exit
//
// Input is the single remaining positional argument, or stdin when there is
// not exactly one. Output goes to stdout. Returns 1 if the input cannot be
// opened.
int main(int argc, char** argv) {

    int c;
    bool invert = false;
    bool logicalOr = false;
    bool filterSites = false;
    vector<string> infofilterStrs;
    vector<VariantFilter> infofilters;
    vector<string> genofilterStrs;
    vector<VariantFilter> genofilters;
    string tag = "";
    string filterSpec;
    string alleleTag;
    vector<string> regions;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"filter-sites", no_argument, 0, 's'},
            {"info-filter",  required_argument, 0, 'f'},
            {"genotype-filter",  required_argument, 0, 'g'},
            {"tag", required_argument, 0, 't'},
            {"allele-tag", required_argument, 0, 'a'},
            {"invert", no_argument, 0, 'v'},
            {"or", no_argument, 0, 'o'},
            {"region", required_argument, 0, 'r'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hvsof:g:t:r:a:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            filterSpec += " " + string(optarg);
            infofilterStrs.push_back(string(optarg));
            break;

        case 's':
            filterSites = true;
            break;

        case 'a':
            alleleTag = optarg;
            break;

        case 'g':
            filterSpec += " genotypes filtered with: " + string(optarg);
            genofilterStrs.push_back(string(optarg));
            break;

        case 't':
            tag = optarg;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case 'v':
            invert = true;
            break;

        case 'o':
            logicalOr = true;
            break;

        case 'r':
            regions.push_back(optarg);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // Every filter was appended with a leading " "; strip it. The guard is
    // required: substr(1) on an empty string throws std::out_of_range, which
    // used to abort any run that supplied neither -f nor -g.
    if (!filterSpec.empty()) {
        filterSpec = filterSpec.substr(1);
    }
    // Kept immutable so that both the ##filter line and the allele-tag
    // description below see the full text.
    const string filterDescription = filterSpec;

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) {
        infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes));
    }

    for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) {
        genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes));
    }

    // Rebuild the header, inserting a single ##filter line just before the
    // first line containing "INFO", or before the final header line if no
    // such line exists.
    vector<string> headerlines = split(variantFile.header, "\n");
    variantFile.header.clear();
    bool filterLinePending = !filterDescription.empty();
    for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) {
        const bool lastLine = (l + 1 == headerlines.end());
        if (filterLinePending && (l->find("INFO") != string::npos || lastLine)) {
            variantFile.header += "##filter=\"" + filterDescription + "\"\n";
            filterLinePending = false;
        }
        variantFile.header += *l + (lastLine ? "" : "\n");
    }

    if (!alleleTag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' if not, filters are: " + filterDescription + ".\">");
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);

    // Regions are only applied to named files. When they are not in use the
    // body below must run exactly once; the old loop condition compared an
    // iterator that never advanced and spun forever for stdin + --region.
    const bool useRegions = !inputFilename.empty() && !regions.empty();
    vector<string>::iterator regionItr = regions.begin();

    do {

        if (useRegions) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {

            for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) {
                f->removeFilteredGenotypes(var);
            }

            // No info filters: echo the record, using the untouched input
            // line when no genotype filter could have modified it.
            if (infofilters.empty()) {
                if (genofilters.empty()) {
                    cout << variantFile.line << endl;
                } else {
                    cout << var << endl;
                }
                continue;
            }

            if (filterSites) {
                // One decision for the whole record.
                bool passes = passesFilters(var, infofilters, logicalOr);
                if (invert) {
                    passes = !passes;
                }
                if (passes) {
                    if (!tag.empty()) {
                        if (alleleTag.empty()) {
                            var.addFilter(tag);
                        } else {
                            // every alternate of a passing site gets the tag
                            var.info[alleleTag].assign(var.alt.size(), tag);
                        }
                    }
                    cout << var << endl;
                } else if (!tag.empty()) {
                    // tagging mode keeps failing records, untagged
                    cout << var << endl;
                }
                continue;
            }

            // Per-allele mode: evaluate each alternate on its own.
            vector<string> failingAlts;
            vector<bool> passes;
            bool anyPassing = false;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                const bool ok = passesFilters(var, infofilters, logicalOr, *a);
                if (ok) {
                    anyPassing = true;
                } else {
                    failingAlts.push_back(*a);
                }
                passes.push_back(ok);
            }

            if (tag.empty()) {
                // No tag: drop the failing alternates, and drop the record
                // entirely when none of its alternates pass.
                if (failingAlts.size() < var.alt.size()) {
                    for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) {
                        var.removeAlt(*a);
                    }
                    cout << var << endl;
                }
            } else {
                // Tagging mode: every record is kept.
                if (alleleTag.empty()) {
                    if (anyPassing) {
                        var.addFilter(tag);
                    }
                } else {
                    // one entry per alternate: the tag if it passed, "." if not
                    vector<string>& perAllele = var.info[alleleTag];
                    perAllele.clear();
                    for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) {
                        perAllele.push_back(*p ? tag : string("."));
                    }
                }
                cout << var << endl;
            }
        }

    } while (useRegions && regionItr != regions.end());

    return 0;

}
Ejemplo n.º 4
0
// Labels each record by comparing the genotypes of two samples.
//
// usage: <prog> [-s|--strict] <tag> <germline-sample> <somatic-sample> <vcf file>
// ("-" as the file name reads stdin)
//
// For records that contain both samples, INFO[<tag>] is reset and then set to
// "germline", "somatic" or "reversion" according to the GT comparison below
// (it stays empty when no rule matches). With --strict, a "somatic" call from
// the first rule additionally requires the germline sample's AO value to be 0.
// When both samples carry GQ, INFO[SSC] is set to the minimum of the record
// quality and the two GQ values. Every record is written to stdout.
int main(int argc, char** argv) {

    if (argc < 5) {
        printSummary(argv);
        exit(0);
    }

    bool strict = false;
    int c;

    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"strict",  no_argument, 0, 's'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hs",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 's':
            strict = true;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // Exactly four positionals are required: tag, two sample names, file.
    // This replaces an assert() that ran only after argv[optind] had already
    // been dereferenced and that disappears entirely under NDEBUG.
    if (argc - optind != 4) {
        printSummary(argv);
        exit(1);
    }

    string tag = argv[optind];

    vector<string> samples;
    for (int i = optind+1; i < argc - 1; ++i) {
        samples.push_back(argv[i]);
    }

    string filename = argv[argc-1];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open " << filename << endl;
        return 1;
    }

    Variant var(variantFile);

    // Declare the INFO field that will carry the classification.
    string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples";
    for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) {
        line += " " + *s;
    }
    line += " have different genotypes\">";
    variantFile.addHeaderLine(line);

    variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">");

    // write the new header
    cout << variantFile.header << endl;

    typedef map<string, vector<string> > SampleFields;

    // Annotate and print every record; records lacking either sample are
    // passed through unchanged.
    while (variantFile.getNextVariant(var)) {
        if (var.samples.find(samples.front()) != var.samples.end()
            && var.samples.find(samples.back()) != var.samples.end()) {
            SampleFields& germline = var.samples[samples.front()];
            SampleFields& somatic = var.samples[samples.back()];

            var.info[tag].clear(); // remove previous

            // Look the fields up with find(): operator[] would insert an
            // empty vector and front() on it is undefined behaviour.
            SampleFields::iterator germlineGT = germline.find("GT");
            SampleFields::iterator somaticGT = somatic.find("GT");
            const bool haveGenotypes =
                germlineGT != germline.end() && !germlineGT->second.empty()
                && somaticGT != somatic.end() && !somaticGT->second.empty();

            if (haveGenotypes) {
                map<int, int> gtGermline = decomposeGenotype(germlineGT->second.front());
                map<int, int> gtSomatic  = decomposeGenotype(somaticGT->second.front());

                // Germline alternate observation count; stays 0 when the
                // sample has no AO field.
                int germlineAltCount = 0;
                SampleFields::iterator germlineAO = germline.find("AO");
                if (germlineAO != germline.end() && !germlineAO->second.empty()) {
                    convert(germlineAO->second.front(), germlineAltCount);
                }

                if (gtGermline == gtSomatic) {
                    var.info[tag].push_back("germline");
                } else if ((isHet(gtGermline) && isHomNonRef(gtSomatic)) ||
                           (isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic)))) {
                    if (!strict || germlineAltCount == 0) {
                        var.info[tag].push_back("somatic");
                    }
                } else if (isHom(gtGermline) && isHet(gtSomatic)) {
                    if (var.alt.size() == 1) {
                        var.info[tag].push_back("reversion");
                    } else {
                        var.info[tag].push_back("somatic");
                    }
                }
            }

            SampleFields::iterator germlineGQField = germline.find("GQ");
            SampleFields::iterator somaticGQField = somatic.find("GQ");
            if (germlineGQField != germline.end() && !germlineGQField->second.empty()
                && somaticGQField != somatic.end() && !somaticGQField->second.empty()) {
                double germlineGQ = 0;
                convert(germlineGQField->second.front(), germlineGQ);
                double somaticGQ = 0;
                convert(somaticGQField->second.front(), somaticGQ);
                double somaticScore = min(var.quality, min(germlineGQ, somaticGQ));
                var.info["SSC"].clear();
                var.info["SSC"].push_back(convert(somaticScore));
            }
        }
        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 5
0
// Annotates records with the descriptions of overlapping BED targets.
//
// Options (parsed with getopt_long):
//   -b, --bed FILE      BED file providing the annotations (required)
//   -k, --key KEY       INFO key that receives the annotations (required)
//   -d, --default VALUE value written when no target overlaps the record
//   -h, --help          print the usage summary and exit
//
// Input is the single remaining positional argument, or stdin when there is
// not exactly one. For each record the descriptions of all overlapping
// targets are joined with ':' and appended to INFO[KEY]. Output goes to
// stdout; diagnostics go to stderr. Returns 1 if the input cannot be opened.
int main(int argc, char** argv) {

    string bedFileName;
    string annotationInfoKey;
    string defaultAnnotationValue;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"bed",  required_argument, 0, 'b'},
            {"key",  required_argument, 0, 'k'},
            {"default",  required_argument, 0, 'd'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hb:k:d:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {
            case 'b':
                bedFileName = string(optarg);
                break;

            case 'k':
                annotationInfoKey = string(optarg);
                break;

            case 'd':
                defaultAnnotationValue = string(optarg);
                break;

            case 'h':
                // Stop after printing help, as the other tools do, rather
                // than continuing on to process input.
                printSummary(argv);
                exit(0);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    if (bedFileName.empty()) {
        cerr << "a BED file is required when intersecting" << endl;
        exit(1);
    }

    // Without a key the header line below would read "##INFO=<ID=,..." and
    // records would gain an INFO entry with an empty name.
    if (annotationInfoKey.empty()) {
        cerr << "an INFO key (-k, --key) is required for the annotations" << endl;
        exit(1);
    }

    BedReader bed(bedFileName);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        // stderr, not stdout: stdout carries the VCF output stream
        cerr << "could not open VCF file" << endl;
        return 1;
    }

    string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from "
        + bedFileName + " delimited by ':'\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // Query interval runs from the record position to the last reference base.
        BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
        vector<BedTarget*> overlaps = bed.targetsOverlapping(record);
        if (!overlaps.empty()) {
            vector<string> annotations;
            for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) {
                annotations.push_back((*t)->desc);
            }
            var.info[annotationInfoKey].push_back(join(annotations, ":"));
        } else if (!defaultAnnotationValue.empty()) {
            var.info[annotationInfoKey].push_back(defaultAnnotationValue);
        }
        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 6
0
int main(int argc, char** argv) {

    if (argc != 3) {
        cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
             << "adds statistics to the INFO field of the vcf file describing the" << endl
             << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
             << "genotypes reported in the <other-genotype-tag>.  use this after" << endl
             << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
        return 1;
    }

    string otherGenoTag = argv[1];
    string filename = argv[2];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    vector<string> specs;

    specs.push_back("AA_AA");
    specs.push_back("AA_AR");
    specs.push_back("AA_RR");
    specs.push_back("AA_NN");

    specs.push_back("AR_AA");
    specs.push_back("AR_AR");
    specs.push_back("AR_RR");
    specs.push_back("AR_NN");

    specs.push_back("RR_AA");
    specs.push_back("RR_AR");
    specs.push_back("RR_RR");
    specs.push_back("RR_NN");

    specs.push_back("NN_AA");
    specs.push_back("NN_AR");
    specs.push_back("NN_RR");
    specs.push_back("NN_NN");


    for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
        string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
            + ",Number=1,Type=Integer,Description=\"Number of genotypes with "
            + *spec + " relationship with " + otherGenoTag + "\">";
        variantFile.addHeaderLine(line);
    }

    string line;

    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
        + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
        + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
        + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
        + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
        + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
        + otherGenoTag + " is null.  Cases where GT is null or partly null are excluded.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
        + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of "
        + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    Variant var(variantFile);

    while (variantFile.getNextVariant(var)) {

	//cout << "next: " << var << endl;
        // for each sample, check GT against <other-genotype-tag>
        // tally stats, and append to info
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();

        map<string, int> genotypeComparisonCounts;
        int gtCount = var.samples.size();
        int gtAltCount = 0; // number of alternate-containing genotypes in the first file
        int pdCount = 0; // positive discrepancy count
        int ndCount = 0; // negative discrepancy count
        int nnCount = 0; // null discrepancy count
        int cdCount = 0; // call discrepancy count
        int ccCount = 0; // call concordance count
        int nrdCount = 0; // non-reference discrepancy count
        int nrdNormalizer = 0; // divisor for nrd rate
        int nrsCount = 0; // non-reference sensitivity count
        int nrsNormalizer = 0; // divisor for nrs rate

        for (; s != sEnd; ++s) {
            map<string, vector<string> >& sample = s->second;
            const string& name = s->first;

            // decompose genotypes into counts of strings
            // to facilitate comparison

	    string gtA;
	    if (sample.find("GT") == sample.end()) {
		gtA = "./.";
	    } else {
		gtA = sample["GT"].front();
	    }

	    string gtB;
	    if (sample.find(otherGenoTag) == sample.end()) {
		gtB = "./.";
	    } else {
		gtB = sample[otherGenoTag].front();
	    }


            map<int, int> genotypeA = decomposeGenotype(gtA);
            map<int, int> genotypeB = decomposeGenotype(gtB);

            string gtspecA = genotypeSpec(genotypeA);
            string gtspecB = genotypeSpec(genotypeB);
            //cout << gtA << " " << gtB << endl;
            //cout << gtspecA << " " << gtspecB << endl;
            ++genotypeComparisonCounts[gtspecA + "_" + gtspecB];

            if (hasNonRef(genotypeA)) {
                ++gtAltCount;
            }

            if (genotypeA != genotypeB) {
                if (isNull(genotypeA)) {
                    // TODO handle this somehow, maybe via a different flag?
                    if (!isNull(genotypeB)) {
                        ++nnCount;  // null discrepancy, the second set makes a call, this one does not
                    }
                } else if (hasNonRef(genotypeA)) {
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
                        ++cdCount;  // the calls are discrepant
                    } else { // the other call does not have an alternate
                        ++pdCount;
                        // it is also null
                        if (isNull(genotypeB)) {
                            ++nnCount;
                        }
                    }
                } else { // the current genotype has no non-ref alternate
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
                        ++ndCount;
                    }
                    if (isNull(genotypeB)) {
                        ++nnCount;
                    }
                }
            } else {
                if (!isNull(genotypeA)) {
                    ++ccCount;
                }
            }


            if (!(isNull(genotypeA) || isNull(genotypeB))
                    && !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
                ++nrdNormalizer;
                if (genotypeA != genotypeB) {
                    ++nrdCount;
                }
            }

            if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
                ++nrsNormalizer;
                if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
                    ++nrsCount;
                }
            }

        }

        for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
                g != genotypeComparisonCounts.end(); ++g) {
            stringstream c;
            c << g->second;
            vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first];
            t.clear(); t.push_back(c.str());
        }

        stringstream gtc;
        gtc << gtCount;
        var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str());

        stringstream gtac;
        gtac << gtAltCount;
        var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str());

        stringstream pd;
        pd << pdCount;
        var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str());

        stringstream nd;
        nd << ndCount;
        var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str());

        stringstream nn;
        nn << nnCount;
        var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str());

        stringstream cd;
        cd << cdCount;
        var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str());

        stringstream cc;
        cc << ccCount;
        var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str());

        stringstream nrdc;
        nrdc << nrdCount;
        var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str());

        stringstream nrdn;
        nrdn << nrdNormalizer;
        var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str());

        if (nrdNormalizer > 0) {
            stringstream nrd;
            nrd << (double) nrdCount / (double) nrdNormalizer;
            var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str());
        }

        stringstream nrsc;
        nrsc << nrsCount;
        var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str());

        stringstream nrsn;
        nrsn << nrsNormalizer;
        var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str());

        if (nrsNormalizer > 0) {
            stringstream nrs;
            nrs << (double) nrsCount / (double) nrsNormalizer;
            var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str());
        }

        cout << var << endl;

    }

    return 0;

}
Ejemplo n.º 7
0
// Regenerates the AC, AF, NS and AN INFO fields of every record.
//
// usage: <prog> [<vcf file>]   (no argument, or a lone "-", reads stdin;
//                               -h / --help prints usage and returns 1)
//
// NS is set to the number of entries in the record's sample table, AN to
// countAlleles(), and AC/AF get one value per alternate allele from
// countAlts(). Output goes to stdout. Returns 1 if the input cannot be opened.
int main(int argc, char** argv) {

    // Compare as std::string: `argv[1] == "-h"` compares two pointers and
    // never matches, so help and the "-" stdin marker were both unreachable.
    const string firstArg = (argc > 1) ? argv[1] : "";

    if (firstArg == "-h" || firstArg == "--help") {
        cerr << "usage: " << argv[0] << " <vcf file>" << endl
             << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl;
        return 1;
    }

    VariantCallFile variantFile;
    if (argc == 1 || (argc == 2 && firstArg == "-")) {
        variantFile.open(std::cin);
        if (!variantFile.is_open()) {
            cerr << "vcffixup: could not open stdin" << endl;
            return 1;
        }
    } else {
        string filename = argv[1];
        variantFile.open(filename);
        if (!variantFile.is_open()) {
            cerr << "vcffixup: could not open " << filename << endl;
            return 1;
        }
    }

    Variant var(variantFile);

    // remove header lines we're going to add
    variantFile.removeInfoHeaderLine("AC");
    variantFile.removeInfoHeaderLine("AF");
    variantFile.removeInfoHeaderLine("NS");
    variantFile.removeInfoHeaderLine("AN");

    // and add them back, so as not to duplicate them if they are already there
    variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">");
    variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">");
    variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
    variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");

    // write the new header
    cout << variantFile.header << endl;

    // recompute the four fields for every record and print it
    while (variantFile.getNextVariant(var)) {
        stringstream ns;
        ns << var.samples.size();
        var.info["NS"].clear();
        var.info["NS"].push_back(ns.str());

        var.info["AC"].clear();
        var.info["AF"].clear();
        var.info["AN"].clear();

        int allelecount = countAlleles(var);
        stringstream an;
        an << allelecount;
        var.info["AN"].push_back(an.str());

        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            string& allele = *a;
            // countAlts is given the alternate's index shifted by one
            int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1);
            stringstream ac;
            ac << altcount;
            var.info["AC"].push_back(ac.str());
            stringstream af;
            if (allelecount > 0) {
                af << (double) altcount / (double) allelecount;
            } else {
                // no called alleles: report 0 instead of dividing by zero,
                // which would print "nan"/"inf" into the AF field
                af << 0;
            }
            var.info["AF"].push_back(af.str());
        }
        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 8
0
int main(int argc, char** argv) {

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
             << "adds a tag (BasesToNextVariant) to each variant record which indicates" << endl
	     << "the distance to the nearest variant" << endl;
        return 1;
    }

    string filename = argv[1];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant varA(variantFile);
    Variant varB(variantFile);
    Variant varC(variantFile);

    vector<Variant*> vars;
    vars.push_back(&varA);
    vars.push_back(&varB);
    vars.push_back(&varC);
    
    for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
	variantFile.getNextVariant(**v);
    }

    string tag = "BasesToClosestVariant";
    string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\"" \
	+ "Number of bases to the closest variant in the file.\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    // get the first distances
    if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
	vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
    }

    while (variantFile.getNextVariant(*vars.back())) {

	if (vars.at(1)->sequenceName == vars.at(0)->sequenceName &&
	    vars.at(1)->sequenceName == vars.at(2)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position,
							vars.at(2)->position - vars.at(1)->position)));
	} else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	} else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position));
	} else {
	    // don't add the tag
	}
	cout << *vars.front() << endl;
	// rotate
	Variant* v = vars.at(0);
	vars.at(0) = vars.at(1);
	vars.at(1) = vars.at(2);
	vars.at(2) = v;

    }

    // assign the last distances
    
    if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
	vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	cout << *vars.at(0) << endl;
	
	vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	cout << *vars.at(1) << endl;
    }

    return 0;

}
Ejemplo n.º 9
0
// Locally re-aligns every ALT allele of each VCF record against a window of
// the reference sequence using CSmithWatermanGotoh and prints the resulting
// alignment. With -a/--adjust-vcf the per-allele CIGARs are additionally
// trimmed, stored under the given INFO tag, and the record is printed again
// with REF re-fetched from the reference.
//
// Options:
//   -h, --help                 print the summary and exit
//   -w, --ref-window-size N    reference bases fetched on each side of the variant
//   -s, --alt-window-size N    reference bases flanking the ALT in the query
//   -r, --reference FILE       FASTA reference (required)
//   -m, --match-score X        -x, --mismatch-score X
//   -o, --gap-open-penalty X   -e, --gap-extend-penalty X
//   -z, --entropy-gap-open     enable the entropy gap penalty
//   -R, --repeat-gap-extend X  enable the repeat gap-extension penalty X
//   -a, --adjust-vcf TAG       write adjusted CIGARs to INFO tag TAG
//
// The VCF is read from the single trailing argument if present, else stdin.
int main(int argc, char** argv) {

    string fastaFileName;
    int windowsize = 100;
    int altwindowsize = 50;

    // constants for SmithWaterman algorithm
    float matchScore = 10.0f;
    float mismatchScore = -9.0f;
    float gapOpenPenalty = 15.0f;
    float gapExtendPenalty = 6.66f;

    bool useEntropy = false;
    bool useRepeatGapExtendPenalty = false;
    float repeatGapExtendPenalty = 1;

    bool adjustVcf = false;
    string adjustedTag = "remappedCIGAR";

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"ref-window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {"match-score", required_argument, 0, 'm'},
                {"mismatch-score", required_argument, 0, 'x'},
                {"gap-open-penalty", required_argument, 0, 'o'},
                {"gap-extend-penalty", required_argument, 0, 'e'},
                {"alt-window-size", required_argument, 0, 's'},
                {"entropy-gap-open", no_argument, 0, 'z'},
                // Takes the penalty value: must agree with "R:" in the short
                // option string and with the atof(optarg) in case 'R'.
                {"repeat-gap-extend", required_argument, 0, 'R'},
                {"adjust-vcf", required_argument, 0, 'a'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

        case 'w':
            windowsize = atoi(optarg);
            break;

        case 'a':
            adjustVcf = true;
            adjustedTag = optarg;
            break;

        case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            // exit explicitly, as the sibling tools do, so that asking for
            // help never falls through into processing
            exit(0);
            break;

        case 'm':
            matchScore = atof(optarg);
            break;

        case 'x':
            mismatchScore = atof(optarg);
            break;

        case 'o':
            gapOpenPenalty = atof(optarg);
            break;

        case 'e':
            gapExtendPenalty = atof(optarg);
            break;

        case 's':
            altwindowsize = atoi(optarg);
            break;

        case 'z':
            useEntropy = true;
            break;

        case 'R':
            useRepeatGapExtendPenalty = true;
            repeatGapExtendPenalty = atof(optarg);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference freference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        freference.open(fastaFileName);
    }

    if (adjustVcf) {
        // record the full command line in the description of the new tag
        vector<string> commandline;
        for (int i = 0; i < argc; ++i)
            commandline.push_back(argv[i]);
        variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using " + join(commandline, " ") + "\">");
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // NOTE(review): the original record and the alignment text below are
        // printed even in --adjust-vcf mode (the guards were commented out
        // upstream), so stdout is not a clean VCF. Output kept as-is.
        cout << endl;
        cout << var << endl;

        vector<vector<pair<int, char> > > cigars;   // one trimmed CIGAR per ALT
        vector<int> positionDiffs;                  // alignment start per ALT

        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            cout << endl;

            // try to remap locally: request windowsize bases on either side
            // of REF (arguments presumably are name, 0-based start, length)
            string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());

            // filled in by the aligner
            unsigned int referencePos;
            string cigar;

            string& alternate = *a;

            // query = left flank + ALT + right flank, flanks taken from the
            // reference window
            // NOTE(review): assumes altwindowsize <= windowsize and a full-size
            // reference window; otherwise substr is called with an
            // out-of-range offset.
            string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);

            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
            if (useEntropy) sw.EnableEntropyGapPenalty(1);
            if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
            sw.Align(referencePos, cigar, reference, alternateQuery);

            int altpos = 0;     // cursor into alternateQuery
            int refpos = 0;     // cursor into ref (reference from referencePos on)
            int len;
            string slen;        // digits of the CIGAR length being accumulated
            vector<pair<int, char> > cigarData;

            string ref = reference.substr(referencePos);
            positionDiffs.push_back(referencePos); // TODO this... is borked

            stringstream refss;     // gapped reference row of the alignment
            stringstream altss;     // gapped alternate row of the alignment

            // NOTE(review): without --adjust-vcf the CIGAR is printed twice.
            if (!adjustVcf) cout << cigar << endl;
            cout << cigar << endl;

            // Walk the CIGAR text: digits accumulate in slen, an operation
            // letter consumes them.
            for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
                switch (*c) {
                case 'I':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // an insertion that starts inside the left flank is
                    // recorded as a match
                    if (altpos < altwindowsize) {
                        cigarData.push_back(make_pair(len, 'M'));
                    } else {
                        cigarData.push_back(make_pair(len, *c));
                    }
                    altss << alternateQuery.substr(altpos, len);
                    refss << string(len, '-');
                    altpos += len;
                    break;
                case 'D':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // a deletion that starts inside the left flank is dropped
                    if (altpos >= altwindowsize) {
                        cigarData.push_back(make_pair(len, *c));
                    }
                    refss << ref.substr(refpos, len);
                    altss << string(len, '-');
                    refpos += len;
                    break;
                case 'M':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // split the aligned run base by base into matches (M) and
                    // mismatches (X), extending the previous element when the
                    // kind repeats
                    for (int i = 0; i < len; ++i) {
                        char kind = (ref.at(refpos + i) == alternateQuery.at(altpos + i)) ? 'M' : 'X';
                        if (!cigarData.empty() && cigarData.back().second == kind) {
                            cigarData.back().first++;
                        } else {
                            cigarData.push_back(make_pair(1, kind));
                        }
                    }
                    refss << ref.substr(refpos, len);
                    altss << alternateQuery.substr(altpos, len);
                    refpos += len;
                    altpos += len;
                    break;
                case 'S':
                    len = atoi(slen.c_str());
                    slen.clear();
                    cigarData.push_back(make_pair(len, *c));
                    refss << ref.substr(refpos, len);
                    //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior
                    refpos += len;
                    altpos += len;
                    break;
                default:
                    // any other character is treated as part of a length
                    len = 0;
                    slen += *c;
                    break;
                }
            }

            cout << "ref:\t" << refss.str() << endl;
            cout << "alt:\t" << altss.str() << endl;
            if (adjustVcf) {
                cigars.push_back(cigarData);
            }

        }

        if (adjustVcf) {
            // NOTE(review): assumes at least one ALT and a non-empty CIGAR for
            // each; front()/back() below are otherwise undefined.
            int substart = cigars.front().front().first;
            int subend = cigars.front().back().first;

            // find the shortest leading and trailing match shared by all alleles
            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
                if (c->front().second == 'M' && c->front().first <= substart) {
                    substart = c->front().first;
                    // keep one matching base unless the next element is a mismatch
                    if (c->size() > 1 && c->at(1).second != 'X') {
                        --substart;
                    }
                }
                if (c->back().second == 'M' && c->back().first <= subend) {
                    subend = c->back().first;
                }
            }

            // trim the cigars and get the new reference length
            int reflen = 0;
            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
                c->front().first -= substart;
                c->back().first -= subend;
                int crf = cigarRefLen(*c);
                if (crf > reflen)
                    reflen = crf;
                var.info[adjustedTag].push_back(joinCigar(*c));
            }

            // find the lowest positional difference
            // NOTE(review): pdiff starts at 0 and is only ever lowered, so it
            // changes only if *d + altwindowsize is negative (see the TODO on
            // positionDiffs above).
            int pdiff = 0;
            for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
                if (*d + altwindowsize < pdiff)
                    pdiff = *d + altwindowsize;
            }

            // adjust the variant position
            var.position += pdiff;

            // re-fetch the reference string for the adjusted position and length
            var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);

            cout << var << endl;
        }
    }

    return 0;

}
Ejemplo n.º 10
0
// Annotates every VCF record with Shannon entropy values computed by
// shannon_H(): for the reference window left of, right of and centred on the
// variant (EntropyLeft / EntropyRight / EntropyCenter), for the REF allele
// (EntropyRef) and for each ALT allele (EntropyAlt).
//
// Options:
//   -h, --help                  print the summary and exit
//   -f, --fasta-reference FILE  FASTA reference (required)
//   -w, --window-size N         window length in bp (required, must be > 0)
//
// The VCF is read from the single trailing argument if present, else stdin;
// the annotated VCF is written to stdout.
int main(int argc, char** argv) {

    int c;
    string fastaRef;
    int windowSize = 0;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"fasta-reference",  required_argument, 0, 'f'},
            {"window-size", required_argument, 0, 'w'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hf:w:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            fastaRef = optarg;
            break;

        case 'w':
            windowSize = atoi(optarg);
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // A missing, unparsable (atoi yields 0) or negative window size is rejected:
    // a negative length would otherwise be passed on as a sequence length.
    if (windowSize <= 0) {
        cerr << "a window size must be specified" << endl;
        exit(1);
    }
    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    FastaReference ref;
    ref.open(fastaRef);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // get the ref start and end positions
        int refstart = var.position - 1; // convert to 0-based
        int refend = var.position + var.ref.size() - 1;

        // NOTE(review): near the start of a sequence the left/centre start
        // offsets go negative; how getSubSequence treats that is not visible
        // here - confirm.
        string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize);
        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
        string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize);

        // Never tell shannon_H to read more bytes than the string holds: a
        // returned window shorter than windowSize would otherwise be read
        // out of bounds.
        const int leftLen = min((int) leftseq.size(), windowSize);
        const int rightLen = min((int) rightseq.size(), windowSize);
        const int centerLen = min((int) centerseq.size(), windowSize);

        double entropyLeft = shannon_H((char*) leftseq.c_str(), leftLen);
        double entropyRight = shannon_H((char*) rightseq.c_str(), rightLen);
        double entropyCenter = shannon_H((char*) centerseq.c_str(), centerLen);
        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());

        // replace any values already present in the input
        var.info["EntropyLeft"].clear();
        var.info["EntropyRight"].clear();
        var.info["EntropyCenter"].clear();
        var.info["EntropyRef"].clear();
        var.info["EntropyAlt"].clear();

        var.info["EntropyLeft"].push_back(convert(entropyLeft));
        var.info["EntropyRight"].push_back(convert(entropyRight));
        var.info["EntropyCenter"].push_back(convert(entropyCenter));
        var.info["EntropyRef"].push_back(convert(entropyRef));

        // one value per ALT allele, in ALT order (Number=A)
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
            var.info["EntropyAlt"].push_back(convert(entropyAlt));
        }

        cout << var << endl;
    }

    return 0;

}
Ejemplo n.º 11
0
// Decomposes VCF records into the allelic primitives returned by
// Variant::parsedAlternates() and writes one output record per distinct
// primitive position.
//
// Each output record gets TYPE and LEN INFO values, AF/AC summed from the
// original ALT alleles the primitive came from (when the input has them), and
// a GT per sample rewritten in terms of the new allele indexes and joined
// with '|'.
//
// Options:
//   -h, --help             print the summary
//   -m, --use-mnps         passed to parsedAlternates() as useMNPs
//   -L, --max-length N     single-ALT records with an allele longer than N are
//                          passed through unchanged (default 200)
//   -t, --tag-parsed FLAG  set INFO flag FLAG on every record produced here
//   -k, --keep-info        carry the remaining INFO fields over
//   -g, --keep-geno        carry the remaining per-sample fields over
//
// The VCF is read from the first non-option argument if present, else stdin.
int main(int argc, char** argv) {

    bool includePreviousBaseForIndels = true;
    bool useMNPs = false;
    string parseFlag;
    int maxLength = 200;
    bool keepInfo = false;
    bool keepGeno = false;

    VariantCallFile variantFile;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"use-mnps", no_argument, 0, 'm'},
                {"max-length", required_argument, 0, 'L'},
                {"tag-parsed", required_argument, 0, 't'},
                {"keep-info", no_argument, 0, 'k'},
                {"keep-geno", no_argument, 0, 'g'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hmkgt:L:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'm':
            useMNPs = true;
            break;

	    case 'k':
            keepInfo = true;
            break;

	    case 'g':
            keepGeno = true;
            break;

        // NOTE(review): no exit() follows here, unlike the '?' case; whether
        // printSummary() itself terminates the program is not visible - confirm.
        case 'h':
            printSummary(argv);
            break;

	    case 't':
            parseFlag = optarg;
            break;

        case 'L':
            maxLength = atoi(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    if (optind < argc) {
        string filename = argv[optind];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
    if (!parseFlag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
    }
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {


        // we can't decompose *1* bp events, these are already in simplest form whether SNPs or indels
        // we also don't handle anything larger than maxLength bp
        // (both rules apply to single-ALT records only)
        if (var.alt.size() == 1 
            && (   var.alt.front().size() == 1
                || var.ref.size() == 1
                || var.alt.front().size() > maxLength
                || var.ref.size() > maxLength
                )) {
            // nothing to do
            cout << var << endl;
            continue;
        }

        // for each parsed alternate, get the position
        // build a new vcf record for that position
        // unless we are already at the position !
        // take everything which is unique to that allele (records) and append it to the new record
        // then handle genotypes; determine the mapping between allelic primitives and convert to phased haplotypes
        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indices corresponding to it

        // keyed by allele string; the value is the list of primitives it decomposes into
        map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs);
        set<VariantAllele> alleles;

        // collect unique alleles
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                alleles.insert(*va);
            }
        }

        // count the primitives that actually differ from the reference
        int altcount = 0;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref != a->alt) {
                ++altcount;
            }
        }

        if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP
            cout << var << endl;
            continue;
        }

        // collect variant allele indexed membership
        map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            int index = var.altAlleleIndexes[a->first] + 1; // make non-relative
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                variantAlleleIndexes[va->repr].push_back(index);
            }
        }

        // per-primitive values gathered from the original record
        map<VariantAllele, double> alleleFrequencies;
        map<VariantAllele, int> alleleCounts;
        map<VariantAllele, map<string, string> > alleleInfos;
        map<VariantAllele, map<string, map<string, string> > > alleleGenos;

        // AF: every primitive accumulates the frequency of each original ALT it is part of
        bool hasAf = false;
        if (var.info.find("AF") != var.info.end()) {
            hasAf = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    double freq;
                    try {
                        // .at() throws when AF has fewer entries than there are ALTs
                        convert(var.info["AF"].at(var.altAlleleIndexes[*a]), freq);
                        alleleFrequencies[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // AC: same accumulation as for AF, with integer counts
        bool hasAc = false;
        if (var.info.find("AC") != var.info.end()) {
            hasAc = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    int freq;
                    try {
                        convert(var.info["AC"].at(var.altAlleleIndexes[*a]), freq);
                        alleleCounts[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // remember, per primitive, the value of every INFO key that is either
        // per-ALT (one value per ALT) or site-wide (a single value)
        if (keepInfo) {
            for (map<string, vector<string> >::iterator infoit = var.info.begin();
                 infoit != var.info.end(); ++infoit) {
                string key = infoit->first;
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    vector<VariantAllele>& vars = varAlleles[*a];
                    for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                        string val;
                        vector<string>& vals = var.info[key];
                        if (vals.size() == var.alt.size()) { // allele count for info
                            val = vals.at(var.altAlleleIndexes[*a]);
                        } else if (vals.size() == 1) { // site-wise count
                            val = vals.front();
                        } // don't handle other multiples... how would we do this without going crazy?
                        if (!val.empty()) {
                            alleleInfos[*va][key] = val;
                        }
                    }
                }
            }
        }

        /*
        if (keepGeno) {
            for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin();
                 sampleit != var.samples.end(); ++sampleit) {
                string& sampleName = sampleit->first;
                map<string, vector<string> >& sampleValues = var.samples[sampleName];
                
            }
        }
        */

        // from old allele index to a new series across the unpacked positions
        map<int, map<long unsigned int, int> > unpackedAlleleIndexes;

        // output records, keyed (and therefore later emitted) by position
        map<long unsigned int, Variant> variants;
        //vector<Variant> variants;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref == a->alt) {
                // ref allele
                continue;
            }
            // classify the primitive and compute its length
            // NOTE(review): when the first bases match and the sizes are equal,
            // type stays empty and len stays 0 - confirm this cannot occur.
            string type;
            int len = 0;
            if (a->ref.at(0) == a->alt.at(0)) { // well-behaved indels
                if (a->ref.size() > a->alt.size()) {
                    type = "del";
                    len = a->ref.size() - a->alt.size();
                } else if (a->ref.size() < a->alt.size()) {
                    len = a->alt.size() - a->ref.size();
                    type = "ins";
                }
            } else {
                if (a->ref.size() == a->alt.size()) {
                    len = a->ref.size();
                    if (a->ref.size() == 1) {
                        type = "snp";
                    } else {
                        type = "mnp";
                    }
                } else {
                    len = abs((int) a->ref.size() - (int) a->alt.size());
                    type = "complex";
                }
            }

            // the first primitive seen at a position creates its record
            if (variants.find(a->position) == variants.end()) {
                Variant newvar(variantFile);
                variants[a->position] = newvar;
            }

            Variant& v = variants[a->position]; // guaranteed to exist

            if (!parseFlag.empty()) {
                v.infoFlags[parseFlag] = true;
            }
            v.quality = var.quality;
            v.filter = var.filter;
            v.id = ".";
            //v.format = var.format;
            vector<string> gtonlyformat;
            gtonlyformat.push_back("GT");
            v.format = gtonlyformat;
            v.info["TYPE"].push_back(type);
            v.info["LEN"].push_back(convert(len));
            if (hasAf) {
                v.info["AF"].push_back(convert(alleleFrequencies[*a]));
            }
            if (hasAc) {
                v.info["AC"].push_back(convert(alleleCounts[*a]));
            }
            // now, keep all the other infos if we are asked to
            if (keepInfo) {
                for (map<string, vector<string> >::iterator infoit = var.info.begin();
                     infoit != var.info.end(); ++infoit) {
                    string key = infoit->first;
                    if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous
                        v.info[key].push_back(alleleInfos[*a][key]);
                    }
                }
            }

            v.sequenceName = var.sequenceName;
            v.position = a->position; // ... by definition, this should be == if the variant was found
            // if this primitive has a longer REF than the record so far, pad the
            // ALTs already stored with the extra reference bases and adopt the longer REF
            if (v.ref.size() < a->ref.size()) {
                for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) {
                    *va += a->ref.substr(v.ref.size());
                }
                v.ref = a->ref;
            }
            v.alt.push_back(a->alt);

            // 1-based index of the ALT just appended within the new record
            int alleleIndex = v.alt.size();
            vector<int>& originalIndexes = variantAlleleIndexes[a->repr];
            for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) {
                unpackedAlleleIndexes[*i][v.position] = alleleIndex;
            }
            // add null allele
            unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL;

        }

        // genotypes
        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
            string& sampleName = *s;
            if (var.samples.find(sampleName) == var.samples.end()) {
                continue;
            }
            map<string, vector<string> >& sample = var.samples[sampleName];
            if (sample.find("GT") == sample.end()) {
                continue;
            }
            string& genotype = sample["GT"].front();
            vector<string> genotypeStrs = split(genotype, "|/");
            // parse the allele indexes; anything convert() rejects becomes ALLELE_NULL
            // (this inner `s` shadows the sample iterator of the enclosing loop)
            vector<int> genotypeIndexes;
            for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) {
                int i;
                if (!convert(*s, i)) {
                    genotypeIndexes.push_back(ALLELE_NULL);
                } else {
                    genotypeIndexes.push_back(i);
                }
            }
            // for every output position, the new allele index of each genotype allele;
            // an old index with no entry at a position is default-inserted as 0 by operator[]
            map<long unsigned int, vector<int> > positionIndexes;
            for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) {
                int oldIndex = *g;
                for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                    const long unsigned int& p = v->first;
                    if (oldIndex == 0) { // reference
                        positionIndexes[p].push_back(0);
                    } else {
                        positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]);
                    }
                }
            }
            // write the rebuilt genotype into each output record
            for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                Variant& variant = v->second;
                vector<int>& gtints = positionIndexes[v->first];
                vector<string> gtstrs;
                for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) {
                    if (*i != ALLELE_NULL) {
                        gtstrs.push_back(convert(*i));
                    } else {
                        gtstrs.push_back(".");
                    }
                }
                string genotype = join(gtstrs, "|");
                // if we are keeping the geno info, pull it over here
                if (keepGeno) {
                    variant.format = var.format;
                    variant.samples[sampleName] = var.samples[sampleName];
                }
                // note that this will replace the old geno, but otherwise it is the same
                variant.samples[sampleName]["GT"].clear();
                variant.samples[sampleName]["GT"].push_back(genotype);
            }
        }

        // emit the new records in ascending position order (map iteration order)
        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
        for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
            cout << v->second << endl;
        }
    }

    return 0;

}
Ejemplo n.º 12
0
int main(int argc, char** argv) {

    vector<string> regions;
    bool addTags = false;
    bool addType = false;
    bool lengthFrequency = true;

    // constants for SmithWaterman algorithm
    float matchScore = 10.0f;
    float mismatchScore = -9.0f;
    float gapOpenPenalty = 15.0f;
    float gapExtendPenalty = 6.66f;

    bool useReferenceAlignment = false;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"region", required_argument, 0, 'r'},
                {"add-info", no_argument, 0, 'a'},
                {"add-type", no_argument, 0, 't'},
                {"no-length-frequency", no_argument, 0, 'l'},
                {"match-score", required_argument, 0, 'm'},
                {"mismatch-score", required_argument, 0, 'x'},
                {"gap-open-penalty", required_argument, 0, 'o'},
                {"gap-extend-penalty", required_argument, 0, 'e'},
                //{"length",  no_argument, &printLength, true},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

	    case 'h':
            printSummary(argv);
            exit(0);
            break;
		
	    case 'r':
            regions.push_back(optarg);
            break;
		
	    case 'l':
            lengthFrequency = false;
            break;
		
	    case 'a':
            addTags = true;
            break;

	    case 't':
            addType = true;
            break;

	    case 'm':
            matchScore = atof(optarg);
	        break;

	    case 'x':
            mismatchScore = atof(optarg);
	        break;

	    case 'o':
            gapOpenPenalty = atof(optarg);
	        break;

	    case 'e':
            gapExtendPenalty = atof(optarg);
	        break;
		
	    default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    if (addType && !addTags) {
        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
        cout << variantFile.header << endl;
    }

    if (addTags) {
        variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
        variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
        variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
        cout << variantFile.header << endl;
    }

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    int variantAlleles = 0;
    int uniqueVariantAlleles = 0;
    int variantSites = 0;
    int snps = 0;
    int transitions = 0;
    int transversions = 0;
    int deaminations = 0;
    int aminations = 0;
    int totalinsertions = 0;
    int totaldeletions = 0;
    int insertedbases = 0;
    int deletedbases = 0;
    int totalmnps = 0;
    int totalcomplex = 0;
    int mismatchbases = 0;
    int mnpbases = 0;
    int biallelics = 0;
    int multiallelics = 0;
    map<int, int> insertions;
    map<int, int> deletions;
    map<int, int> mnps;
    map<int, int> complexsubs;

    bool includePreviousBaseForIndels = false;
    bool useMNPs = true;
    bool useEntropy = false;

    AlleleStats biallelicSNPs;

    // todo, add biallelic snp dialog to output and ts/tv for snps and mnps

    do {

        if (!inputFilename.empty() && !regions.empty()) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            ++variantSites;
            if (var.alt.size() > 1) {
                ++multiallelics;
            } else {
                ++biallelics;
            }
            map<string, vector<VariantAllele> > alternates 
	      = var.parsedAlternates(includePreviousBaseForIndels,
				     useMNPs,
				     useEntropy,
				     matchScore,
				     mismatchScore,
				     gapOpenPenalty,
				     gapExtendPenalty);

            map<VariantAllele, vector<string> > uniqueVariants;
	    
            vector<string> cigars;
	    
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                string& alternate = *a;
                if (addTags)
                    var.info["altlen"].push_back(convert(alternate.size()));
                vector<VariantAllele>& vav = alternates[alternate];
                if (vav.size() > 1) {
                    // check that there are actually multiple non-reference alleles
                    int nonRefAlleles = 0;
                    for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
                        if (z->ref != z->alt)
                            ++nonRefAlleles;
                    }
                    if (nonRefAlleles > 1)
                        ++totalcomplex;
                }
                for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
                    uniqueVariants[*v].push_back(alternate);
                }

                if (addTags || addType) {
                    string cigar;
                    pair<int, string> element;
                    for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
                        VariantAllele& va = *v;
                        if (va.ref != va.alt) {
                            if (element.second == "M") {
                                cigar += convert(element.first) + element.second;
                                element.second = ""; element.first = 0;
                            }
                            if (va.ref.size() == va.alt.size()) {
                                cigar += convert(va.ref.size()) + "X";
                            } else if (va.ref.size() > va.alt.size()) {
                                cigar += convert(va.ref.size() - va.alt.size()) + "D";
                            } else {
                                cigar += convert(va.alt.size() - va.ref.size()) + "I";
                            }
                        } else {
                            if (element.second == "M") {
                                element.first += va.ref.size();
                            } else {
                                element = make_pair(va.ref.size(), "M");
                            }
                        }
                    }
                    if (element.second == "M") {
                        cigar += convert(element.first) + element.second;
                    }
                    element.second = ""; element.first = 0;
                    cigars.push_back(cigar);
                }
            }

            if (addTags) {
                var.info["cigar"] = cigars;
                var.info["reflen"].push_back(convert(var.ref.size()));
            } else if (addType) {
                var.info["cigar"] = cigars;
            }

            variantAlleles += var.alt.size();
            map<string, AlleleStats> alleleStats;

            for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
                const VariantAllele& va = v->first;
                vector<string>& alternates = v->second;

                if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
                    alternates.clear();
                }

                if (va.ref != va.alt) {
                    ++uniqueVariantAlleles;
                    if (va.ref.size() == va.alt.size()) {
                        if (va.ref.size() == 1) {
                            ++snps;
                            ++mismatchbases;
                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                ++alleleStats[*a].mismatches;
                            }
                            if (isTransition(va.ref, va.alt)) {
                                ++transitions;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].transitions;
                                }
                            } else {
                                ++transversions;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].transversions;
                                }
                            }
                            if (isAmination(va.ref, va.alt)) {
                                ++aminations;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].aminations;
                                }
                            }
                            if (isDeamination(va.ref, va.alt)) {
                                ++deaminations;
                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                    ++alleleStats[*a].deaminations;
                                }
                            }
                        } else {
                            ++totalmnps;
                            ++mnps[va.alt.size()]; // not entirely correct
                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                alleleStats[*a].mismatches += va.alt.size();
                            }
                            string::const_iterator r = va.ref.begin();
                            for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) {
                                string rstr = string(1, *r);
                                string astr = string(1, *a);
                                if (rstr == astr) {
                                    continue;
                                }
                                if (isTransition(rstr, astr)) {
                                    ++transitions;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].transitions;
                                    }
                                } else {
                                    ++transversions;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].transversions;
                                    }
                                }
                                if (isAmination(rstr, astr)) {
                                    ++aminations;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].aminations;
                                    }
                                }
                                if (isDeamination(rstr, astr)) {
                                    ++deaminations;
                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                                        ++alleleStats[*a].deaminations;
                                    }
                                }
                                ++mismatchbases;
                                ++mnpbases;
                            }
                        }
                    } else if (va.ref.size() > va.alt.size()) {
                        int diff = va.ref.size() - va.alt.size();
                        deletedbases += diff;
                        ++totaldeletions;
                        ++deletions[diff];
                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                            alleleStats[*a].deletedbases += diff;
                            alleleStats[*a].deletions += 1;
                        }
                    } else {
                        int diff = va.alt.size() - va.ref.size();
                        insertedbases += diff;
                        ++totalinsertions;
                        ++insertions[diff];
                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                            alleleStats[*a].insertedbases += diff;
                            alleleStats[*a].insertions += 1;
                        }
                    }
                }
            }
            if (addTags || addType) {
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    string vartype;
                    if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
                        if (alleleStats[*a].mismatches == 1) {
                            vartype = "snp";
                        } else if (alleleStats[*a].mismatches > 1) {
                            vartype = "complex";
                        } else {
                            vartype = "ref";
                        }
                    } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
                        if (alleleStats[*a].insertions == 1) {
                            vartype = "ins";
                        } else {
                            vartype = "del";
                        }
                    } else {
                        vartype = "complex";
                    }
                    if (addTags) {
                        var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
                        var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
                        var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
                        var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
                        var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
                        var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
                        var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
                    }
                    var.info["type"].push_back(vartype);
                }
                cout << var << endl;
            }
            // biallelic SNP case
            if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
                if (isTransition(var.ref, var.alt.front())) {
                    biallelicSNPs.transitions++;
                } else {
                    biallelicSNPs.transversions++;
                }
                biallelicSNPs.mismatches++;
            }
        }

    } while (regionItr != regions.end());


    // find the maximum indel size
    int maxindel = 0;
    for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }
    for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }

    // and maximum mnp
    int maxmnp = 0;
    for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
        if (i->first > maxmnp) {
            maxmnp = i->first;
        }
    }

    // now print the results

    if (!addTags && !addType) {
        cout << "total variant sites:\t" << variantSites << endl
             << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
                            << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
             << "total variant alleles:\t" << variantAlleles << endl
             << "unique variant alleles:\t" << uniqueVariantAlleles << endl
             << endl
             << "snps:\t" << snps << endl
             << "mnps:\t" << totalmnps << endl
             << "indels:\t" << totalinsertions + totaldeletions << endl
             << "complex:\t" << totalcomplex << endl
             << endl
             << "mismatches:\t" << mismatchbases << endl
             << endl
             << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
             << "deamination ratio:\t" << (double) deaminations / aminations << endl
             << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
             << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;

        if (lengthFrequency) {
            cout << endl
                 << "ins/del length frequency distribution" << endl
                 << "length\tins\tdel\tins/del" << endl;
            for (int i = 1; i <= maxindel; ++i) {
                int ins = insertions[i];
                int del = deletions[i];
                cout << i << "\t"
                     << (ins > 0 ? convert(ins) : "" ) << "\t"
                     << (del > 0 ? convert(del) : "") << "\t"
                     << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
                     << endl;
            }
        }

        cout << endl
             << "insertion alleles / deletion alleles:\t"
             << (double) totalinsertions / (double) totaldeletions << endl
             << "inserted bases / deleted bases:\t"
             << (double) insertedbases / (double) deletedbases << endl
             << endl;

        if (lengthFrequency) {
            cout << "mnp length frequency distribution" << endl
                 << "length\tcount" << endl;
            for (int i = 2; i <= maxmnp; ++i) {
                int mnp = mnps[i];
                cout << i << "\t"
                     << (mnp > 0 ? convert(mnp) : "")
                     << endl;
            }
        }

        cout << "total bases in mnps:\t" << mnpbases << endl;

        /*
          cout << "complex event frequency distribution" << endl
          << "length\tcount" << endl;
          for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
          cout << i->first << "\t" << i->second << endl;
          }
        */
    }

    return 0;

}
Ejemplo n.º 13
0
int main(int argc, char** argv) {

    if (argc != 4) {
        cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
             << "annotates genotypes in the first file with genotypes in the second" << endl
             << "adding the genotype as another flag to each sample filed in the first file." << endl
             << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl
             << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl;
        return 1;
    }

    string annotag = argv[1];
    string filenameA = argv[2];
    string filenameB = argv[3];

    if (filenameA == filenameB) {
        cerr << "it won't help to annotate samples with their own genotypes!" << endl;
        return 1;
    }

    VariantCallFile variantFileA;
    if (filenameA == "-") {
        variantFileA.open(std::cin);
    } else {
        variantFileA.open(filenameA);
    }

    VariantCallFile variantFileB;
    if (filenameB == "-") {
        variantFileB.open(std::cin);
    } else {
        variantFileB.open(filenameB);
    }

    if (!variantFileA.is_open() || !variantFileB.is_open()) {
        return 1;
    }

    Variant varA(variantFileA);
    Variant varB(variantFileB);

    // while the first file doesn't match the second positionally,
    // step forward, annotating each genotype record with an empty genotype
    // when the two match, iterate through the genotypes from the first file
    // and get the genotypes reported in the second file
    
    variantFileA.getNextVariant(varA);
    variantFileB.getNextVariant(varB);

    string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if "
        + annotag + " has a called alternate among samples under comparison.\">";
    variantFileA.addHeaderLine(line);
    line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from "
        + annotag + ".\">";
    variantFileA.addHeaderLine(line);

    cout << variantFileA.header << endl;

    do {

        // this is broken.  to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments...
		// if B is NOT done, and is less than A, read new B.
        if (!variantFileB.done()
            && (varB.sequenceName != varA.sequenceName
                || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)
				|| variantFileA.done())
            ) {
            variantFileB.getNextVariant(varB);
        }

		// if A is not done- and A is less than B, read A.  
		// should also read if variant B is done. 
        if (!variantFileA.done()
            && (varA.sequenceName != varB.sequenceName
                || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)
				|| variantFileB.done())
            ) {
            annotateWithBlankGenotypes(varA, annotag);
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
        }

        vector<Variant> varsA;
        vector<Variant> varsB;

        bool hasMultipleAlts = false;

        long int thisPosition = 0;
        string thisSequenceName;
        if (varA.position == varB.position
            && varA.sequenceName == varB.sequenceName) {
            thisPosition = varA.position;
            thisSequenceName = varA.sequenceName;
        }
        while (!variantFileA.done()
               && !variantFileB.done()
               && thisPosition == varA.position
               && thisSequenceName == varA.sequenceName
               && varA.sequenceName == varB.sequenceName
               && varA.position == varB.position) {
            // accumulate all the alts at the current position
            varsA.push_back(varA);
            varsB.push_back(varB);
            if (varA.alt.size() > 1 || varB.alt.size() > 1)
                hasMultipleAlts = true;
            variantFileA.getNextVariant(varA);
            variantFileB.getNextVariant(varB);
        }

        // multiple lines per position
        if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) {

            map<pair<string, string>, Variant> varsAParsed;
            map<pair<string, string>, Variant> varsBParsed;	
            for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
                varsAParsed[make_pair(v->ref, v->alt.front())] = *v;
            }
            for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) {
                varsBParsed[make_pair(v->ref, v->alt.front())] = *v;
            }
	    
            for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) {
                Variant& varA = vs->second;
                annotateWithBlankGenotypes(varA, annotag);
                if (varsBParsed.find(make_pair(varA.ref, varA.alt.front())) != varsBParsed.end()) {
                    Variant& varB = varsBParsed[make_pair(varA.ref, varA.alt.front())]; // TODO cleanup
                    annotateWithGenotypes(varA, varB, annotag);
                    varA.infoFlags[annotag + ".has_variant"] = true;
                }
                cout << varA << endl;
            }

        } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic
            Variant& varA = varsA.front();
            annotateWithBlankGenotypes(varA, annotag);
            Variant& varB = varsB.front();
            annotateWithGenotypes(varA, varB, annotag);
            // XXX TODO, and also allow for records with multiple alts
            // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site
            varA.infoFlags[annotag + ".has_variant"] = true;
            cout << varA << endl;
        } else {
            for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
                Variant& varA = *v;
                annotateWithBlankGenotypes(varA, annotag);
                cout << varA << endl;
            }
        }

    } while (!variantFileA.done() || !variantFileB.done());

    return 0;

}
Ejemplo n.º 14
0
/**
 * Entry point of the per-sample summary statistic tool.
 *
 * For every record read from the input (the single positional argument, or
 * stdin otherwise) the numeric values of one sample field are collected
 * across all samples, reduced with the chosen statistic, and stored in an
 * INFO field before the record is written to stdout.
 *
 * Options:
 *   -f/--field <name>   sample (FORMAT) field to summarise (required)
 *   -i/--info  <name>   INFO field receiving the result (required)
 *   -a/--average        use the mean (default)
 *   -m/--median, -n/--min, -x/--max   alternative statistics
 *   -h/--help           print the usage summary and exit
 *
 * Samples that lack the field, or whose value does not parse as a number
 * (for example the missing value "."), are left out of the statistic. A
 * record with no usable values is written without the INFO field.
 *
 * Returns 0 on success; 1 when a required option is missing, the input
 * cannot be opened, or a sample field holds more than one value.
 */
int main(int argc, char** argv) {

    int c;
    string sampleField;
    string infoField;
    StatType statType = MEAN; 

    // NOTE(review): execution continues after this call unless printSummary
    // itself terminates the program -- confirm against its definition.
    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                {"help", no_argument, 0, 'h'},
                {"field",  required_argument, 0, 'f'},
                {"info",  required_argument, 0, 'i'},
                {"average", no_argument, 0, 'a'},
                {"median", no_argument, 0, 'm'},
                {"min", no_argument, 0, 'n'},
                {"max", no_argument, 0, 'x'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hamnxf:i:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            sampleField = optarg;
            break;

        case 'i':
            infoField = optarg;
            break;
 
        case 'a':
            statType = MEAN;
            break;

        case 'm':
            statType = MEDIAN;
            break;

        case 'n':
            statType = MIN;
            break;

        case 'x':
            statType = MAX;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    if (infoField.empty() || sampleField.empty()) {
        cerr << "Error: both a sample field and an info field are required." << endl;
        return 1;
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // Human-readable name of the statistic, used in the header description.
    string statTypeStr;

    switch (statType) {
    case MEAN:
        statTypeStr = "mean";
        break;
    case MEDIAN:
        statTypeStr = "median";
        break;
    case MIN:
        statTypeStr = "min";
        break;
    case MAX:
        statTypeStr = "max";
        break;
    default:
        cerr << "Error: failure to convert stat type to string" << endl;
        return 1;
        break;
    }

    // The space after "by" was missing, which produced e.g. "generated bymean".
    variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by "+statTypeStr+" of per-sample values of "+sampleField+" \">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        vector<double> vals;
        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
             s != var.samples.end(); ++s) {
            map<string, vector<string> >& sample = s->second;
            map<string, vector<string> >::iterator field = sample.find(sampleField);
            // Skip samples without the field or with an empty value list, so
            // front() below is never called on an empty vector.
            if (field == sample.end() || field->second.empty()) {
                continue;
            }
            if (field->second.size() > 1) {
                cerr << "Error: cannot handle sample fields with multiple values" << endl;
                return 1;
            }
            const string& text = field->second.front();
            // Parse with strtod so that a failed conversion is detectable:
            // parseEnd stays at the start of the text when nothing was read.
            char* parseEnd = 0;
            double val = strtod(text.c_str(), &parseEnd);
            if (parseEnd == text.c_str() || val != val) {
                // not a number (e.g. ".") or a literal NaN: leave it out
                continue;
            }
            vals.push_back(val);
        }

        if (vals.empty()) {
            // Nothing to summarise. min_element/max_element would return the
            // end iterator here, so pass the record through without the tag
            // (dropping any stale value carried in from the input).
            var.info.erase(infoField);
            cout << var << endl;
            continue;
        }

        double result;
        switch (statType) {
        case MEAN:
            result = mean(vals);
            break;
        case MEDIAN:
            result = median(vals);
            break;
        case MIN:
            result = *min_element(vals.begin(), vals.end());
            break;
        case MAX:
            result = *max_element(vals.begin(), vals.end());
            break;
        default:
            cerr << "Error: unrecognized StatType" << endl;
            return 1;
            break;
        }

        // Replace any existing value of the INFO field with the new result.
        var.info[infoField].clear();
        var.info[infoField].push_back(convert(result));

        cout << var << endl;

    }

    return 0;

}