Beispiel #1
0
int main(int argc, char** argv) {

    if (argc != 3) {
        cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
             << "Adds info fields from the second file which are not present in the first vcf file." << endl;
        return 1;
    }

    string filenameA = argv[1];
    string filenameB = argv[2];

    if (filenameA == filenameB) {
        cerr << "it won't help to add info data from the same file!" << endl;
        return 1;
    }

    VariantCallFile variantFileA;
    if (filenameA == "-") {
        variantFileA.open(std::cin);
    } else {
        variantFileA.open(filenameA);
    }

    VariantCallFile variantFileB;
    if (filenameB == "-") {
        variantFileB.open(std::cin);
    } else {
        variantFileB.open(filenameB);
    }

    if (!variantFileA.is_open() || !variantFileB.is_open()) {
        return 1;
    }

    Variant varA(variantFileA);
    Variant varB(variantFileB);

    // while the first file doesn't match the second positionally,
    // step forward, annotating each genotype record with an empty genotype
    // when the two match, iterate through the genotypes from the first file
    // and get the genotypes reported in the second file
    
    variantFileA.getNextVariant(varA);
    variantFileB.getNextVariant(varB);
    
    variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header);
    
    cout << variantFileA.header << endl;

    do {

        while (!variantFileB.done()
               && (varB.sequenceName < varA.sequenceName
                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
            ) {
            variantFileB.getNextVariant(varB);
        }

        while (!variantFileA.done()
               && (varA.sequenceName < varB.sequenceName
                   || (varA.sequenceName == varB.sequenceName && varA.position < varB.position))
            ) {
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
        }

        while (!variantFileB.done()
               && (varB.sequenceName < varA.sequenceName
                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
            ) {
            variantFileB.getNextVariant(varB);
        }

        while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) {
            addInfo(varA, varB);
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
            variantFileB.getNextVariant(varB);
        }
        
    } while (!variantFileA.done() && !variantFileB.done());

    if (!variantFileA.done()) {
        cout << varA << endl;
        while (variantFileA.getNextVariant(varA)) {
            cout << varA << endl;
        }
    }

    return 0;

}
Beispiel #2
0
int main(int argc, char** argv) {

    string vcfFileName;
    string fastaFileName;
    int windowsize = 30;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hw:r:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'w':
            windowsize = atoi(optarg);
            break;

	    case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference reference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required for haplotype allele generation" << endl;
        exit(1);
    }
    reference.open(fastaFileName);

    // pattern
    // when variants are within windowSize from each other, build up local haplotypes
    // establish all the haplotypes which exist within the window using genotypes+allele#+position map
    // generate a haplotype allele string for each unique haplotype
    // for completeness retain phasing information in the genotypes
    // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
    // if the variants are outside of the windowSize, just write out the record

    Variant var(variantFile);
    Variant outputVar(variantFile);

    cout << variantFile.header << endl;

    // get the first distances
    vector<Variant> cluster;

    while (variantFile.getNextVariant(var) || !cluster.empty()) {

        bool haplotypeCluster = false;

        if (variantFile.done()) {
            if (cluster.size() >= 1) {
                haplotypeCluster = true;
            } else {
                cout << cluster.front() << endl;
                cluster.clear();
            }
        } else if (isPhased(var)) {
            if (cluster.empty()
                || cluster.back().sequenceName == var.sequenceName
                && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) {
                cluster.push_back(var);
            } else {
                if (cluster.size() == 1) {
                    cout << cluster.front() << endl;
                    cluster.clear();
                    if (!variantFile.done()) {
                        cluster.push_back(var);
                    }
                } else {
                    haplotypeCluster = true;
                }
            }
        } else { // not phased
            if (cluster.empty()) {
                cout << var << endl;
            } else if (cluster.size() == 1) {
                cout << cluster.front() << endl;
                cout << var << endl;
            } else {
                haplotypeCluster = true;
            }
        }

        // we need to deal with the current cluster, as our next var is outside of bounds
        // process the last cluster if it's more than 1 var
        if (haplotypeCluster) {
            /*            cerr << "cluster: ";
            for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                cerr << " " << v->position;
            }
            cerr << endl;
            */

            // generate haplotype alleles and genotypes!
            // get the reference sequence across the haplotype in question
            string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
                                                                 cluster.front().position - 1,
                                                                 cluster.back().position
                                                                 + cluster.back().ref.size() - cluster.front().position);

            // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
            map<string, vector<vector<int> > > sampleHaplotypes;
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                // build the haplotype using the genotype fields in the variant cluster
                // only build haplotypes for samples with complete information
                string& sampleName = *s;
                vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
		
                bool completeCoverage = true;
                // ensure complete genotype coverage over the haplotype cluster
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    if (v->samples.find(sampleName) == v->samples.end()
                        || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
                        completeCoverage = false;
                        break;
                    }
                }
                if (!completeCoverage) {
                    continue; // skip samples without complete coverage
                }
		
                // what's the ploidy?
                {
                    string& gt = cluster.front().samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
                        vector<int> haplotype;
                        haplotypes.push_back(haplotype);
                    }
                }
		
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    string& gt = v->samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    vector<string>::iterator g = gtspec.begin();
                    for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
                        int j;
                        convert(*g, j);
                        h->push_back(j);
                    }
                }
            }

            set<vector<int> > uniqueHaplotypes;
            for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
                 hs != sampleHaplotypes.end(); ++hs) {
                vector<vector<int> >& haps = hs->second;
                for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
                    uniqueHaplotypes.insert(*h);
                }
            }
	    
            // write new haplotypes
            map<vector<int>, string> haplotypeSeqs;
            map<vector<int>, int> haplotypeIndexes;
            map<int, string> alleles;
	    
            int impossibleHaplotypes = 0;

            // always include the reference haplotype as 0
            // when we come to it in the haplotypes, we'll ignore it
            int alleleIndex = 1;
            for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {

                /*
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) {
                    cerr << *z;
                }
                cerr << endl;
                */

                string haplotype = referenceHaplotype;
                bool isreference = true;
                bool impossibleHaplotype = false;
                int referenceInsertOffset = 0;
                int j = 0; // index into variant cluster
                int lastpos = 0;
                int lastrefend = 0;
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
                    int i = *z;
                    if (i != 0) {
                        isreference = false;
                        Variant& vartoInsert = cluster.at(j);
                        string& alternate = vartoInsert.alleles.at(i);
                        if (vartoInsert.position < lastrefend) {
                            cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
                            impossibleHaplotype = true;
                            break;
                        } else {
                            //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl;
                            //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl;
                            haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
                                              vartoInsert.ref.size(), alternate);
                            if (alternate.size() != vartoInsert.ref.size()) {
                                referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
                            }
                            lastpos = vartoInsert.position;
                            lastrefend = vartoInsert.position + vartoInsert.ref.size();
                        }
                    }
                }
		
                if (impossibleHaplotype) {
                    ++impossibleHaplotypes;
                    haplotypeIndexes[*u] = -1; // indicates impossible haplotype
                    impossibleHaplotype = false;
                } else if (isreference) {
                    alleles[0] = haplotype;
                    haplotypeIndexes[*u] = 0;
                } else {
                    alleles[alleleIndex] = haplotype;
                    haplotypeIndexes[*u] = alleleIndex;
                    ++alleleIndex;
                }
                haplotypeSeqs[*u] = haplotype;
                // if there's not a reference allele, add it
                if (alleles.find(0) == alleles.end()) {
                    alleles[0] = referenceHaplotype;
                    // nb, there is no reference haplotype among
                    // the samples, so we don't have to add it to
                    // the haplotypeIndexes
                }
            }

            outputVar.ref = alleles[0];
            outputVar.alt.clear();
            for (int i = 1; i < alleleIndex; ++i) {
                outputVar.alt.push_back(alleles[i]);
            }
	    
            outputVar.sequenceName = cluster.front().sequenceName;
            outputVar.position = cluster.front().position;
            outputVar.filter = ".";
            outputVar.id = ".";
            outputVar.info = cluster.front().info;
            outputVar.samples.clear();
            outputVar.format = cluster.front().format;
	    
            // now the genotypes
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                string& sampleName = *s;
                vector<string> gt;
                vector<vector<int> > & hs = sampleHaplotypes[sampleName];
                for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
                    int hi = haplotypeIndexes[*h];
                    if (hi != -1) {
                        gt.push_back(convert(hi));
                    } else {
                        // nonexistent or impossible haplotype
                        gt.push_back(".");
                    }
                }
                if (gt.size() != 0) {
                    outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
                }
            }
            if (cluster.size() - impossibleHaplotypes < 2) {
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    cout << *v << endl;
                }
            } else {
                if (!outputVar.alt.empty()) {
                    cout << outputVar << endl;
                } else {
                    cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
                }
            }
            cluster.clear();
            if (!variantFile.done()) cluster.push_back(var);
        }
    }

    exit(0);  // why?
    return 0;

}
int main(int argc, char** argv) {

    if (argc != 4) {
        cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
             << "annotates genotypes in the first file with genotypes in the second" << endl
             << "adding the genotype as another flag to each sample filed in the first file." << endl
             << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl
             << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl;
        return 1;
    }

    string annotag = argv[1];
    string filenameA = argv[2];
    string filenameB = argv[3];

    if (filenameA == filenameB) {
        cerr << "it won't help to annotate samples with their own genotypes!" << endl;
        return 1;
    }

    VariantCallFile variantFileA;
    if (filenameA == "-") {
        variantFileA.open(std::cin);
    } else {
        variantFileA.open(filenameA);
    }

    VariantCallFile variantFileB;
    if (filenameB == "-") {
        variantFileB.open(std::cin);
    } else {
        variantFileB.open(filenameB);
    }

    if (!variantFileA.is_open() || !variantFileB.is_open()) {
        return 1;
    }

    Variant varA(variantFileA);
    Variant varB(variantFileB);

    // while the first file doesn't match the second positionally,
    // step forward, annotating each genotype record with an empty genotype
    // when the two match, iterate through the genotypes from the first file
    // and get the genotypes reported in the second file
    
    variantFileA.getNextVariant(varA);
    variantFileB.getNextVariant(varB);

    string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if "
        + annotag + " has a called alternate among samples under comparison.\">";
    variantFileA.addHeaderLine(line);
    line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from "
        + annotag + ".\">";
    variantFileA.addHeaderLine(line);

    cout << variantFileA.header << endl;

    do {

        // this is broken.  to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments...
		// if B is NOT done, and is less than A, read new B.
        if (!variantFileB.done()
            && (varB.sequenceName != varA.sequenceName
                || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)
				|| variantFileA.done())
            ) {
            variantFileB.getNextVariant(varB);
        }

		// if A is not done- and A is less than B, read A.  
		// should also read if variant B is done. 
        if (!variantFileA.done()
            && (varA.sequenceName != varB.sequenceName
                || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)
				|| variantFileB.done())
            ) {
            annotateWithBlankGenotypes(varA, annotag);
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
        }

        vector<Variant> varsA;
        vector<Variant> varsB;

        bool hasMultipleAlts = false;

        long int thisPosition = 0;
        string thisSequenceName;
        if (varA.position == varB.position
            && varA.sequenceName == varB.sequenceName) {
            thisPosition = varA.position;
            thisSequenceName = varA.sequenceName;
        }
        while (!variantFileA.done()
               && !variantFileB.done()
               && thisPosition == varA.position
               && thisSequenceName == varA.sequenceName
               && varA.sequenceName == varB.sequenceName
               && varA.position == varB.position) {
            // accumulate all the alts at the current position
            varsA.push_back(varA);
            varsB.push_back(varB);
            if (varA.alt.size() > 1 || varB.alt.size() > 1)
                hasMultipleAlts = true;
            variantFileA.getNextVariant(varA);
            variantFileB.getNextVariant(varB);
        }

        // multiple lines per position
        if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) {

            map<pair<string, string>, Variant> varsAParsed;
            map<pair<string, string>, Variant> varsBParsed;	
            for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
                varsAParsed[make_pair(v->ref, v->alt.front())] = *v;
            }
            for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) {
                varsBParsed[make_pair(v->ref, v->alt.front())] = *v;
            }
	    
            for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) {
                Variant& varA = vs->second;
                annotateWithBlankGenotypes(varA, annotag);
                if (varsBParsed.find(make_pair(varA.ref, varA.alt.front())) != varsBParsed.end()) {
                    Variant& varB = varsBParsed[make_pair(varA.ref, varA.alt.front())]; // TODO cleanup
                    annotateWithGenotypes(varA, varB, annotag);
                    varA.infoFlags[annotag + ".has_variant"] = true;
                }
                cout << varA << endl;
            }

        } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic
            Variant& varA = varsA.front();
            annotateWithBlankGenotypes(varA, annotag);
            Variant& varB = varsB.front();
            annotateWithGenotypes(varA, varB, annotag);
            // XXX TODO, and also allow for records with multiple alts
            // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site
            varA.infoFlags[annotag + ".has_variant"] = true;
            cout << varA << endl;
        } else {
            for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
                Variant& varA = *v;
                annotateWithBlankGenotypes(varA, annotag);
                cout << varA << endl;
            }
        }

    } while (!variantFileA.done() || !variantFileB.done());

    return 0;

}