int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc > 1) {
        string filename = argv[1];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        var.format.push_back("SN");
        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
             s != var.samples.end(); ++s) {
            s->second["SN"].clear();
            s->second["SN"].push_back(s->first);
        }
        cout << var << endl;
    }

    return 0;

}
Example #2
0
// Prints the het/hom-alt genotype ratio for every sample in a VCF.
//
// Usage: <prog> <vcf file>   ("-" reads the VCF from stdin)
//
// Output is two tab-separated lines on stdout: the sample names, then the
// ratio hetCount / homNonRefCount for each sample in the same order.
// A sample with no hom-non-ref calls yields a floating point division by
// zero (printed as inf or nan), exactly as before.
// Returns 1 on bad usage or if the input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <vcf file>" << endl
             << "outputs the het/hom ratio for each individual in the file" << endl;
        return 1;
    }

    string filename = argv[1];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }
    if (!variantFile.is_open()) {
        cerr << "could not open " << filename << endl;
        return 1;
    }

    // start every sample named in the header at zero so that samples which
    // never appear in a record are still reported
    map<string, unsigned int> hetCounts;
    map<string, unsigned int> homCounts;
    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
        hetCounts[*s] = 0;
        homCounts[*s] = 0;
    }

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
            const string& name = s->first;
            map<string, vector<string> >& sample = s->second;

            // A sample may carry no GT value for this record. Look it up
            // without inserting and skip it rather than calling front() on
            // an empty vector, which is undefined behaviour.
            map<string, vector<string> >::iterator gtField = sample.find("GT");
            if (gtField == sample.end() || gtField->second.empty()) {
                continue;
            }

            string& gt = gtField->second.front();
            map<int, int> genotype = decomposeGenotype(gt);
            if (isHet(genotype)) {
                ++hetCounts[name];
            } else if (isHomNonRef(genotype)) {
                ++homCounts[name];
            }
        }
    }

    // header row: sample names
    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s;
    }
    cout << endl;
    // value row: het/hom ratio per sample
    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << (double) hetCounts[*s] / (double) homCounts[*s];
    }
    cout << endl;

    return 0;

}
Example #3
0
int main(int argc, char** argv) {

    if (argc < 3) {
        cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl
             << "outputs each record in the vcf file, removing INFO fields not listed on the command line" << endl;
        return 1;
    }

    string filename = argv[1];

    set<string> fieldsToKeep;
    for (int i = 2; i < argc; ++i) {
        fieldsToKeep.insert(argv[i]);
    }

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant var(variantFile);

    vector<string> fieldsToErase;
    vector<string> infoIds = variantFile.infoIds();
    for (vector<string>::iterator i = infoIds.begin(); i != infoIds.end(); ++i) {
        if (!fieldsToKeep.count(*i)) {
            fieldsToErase.push_back(*i);
            variantFile.removeInfoHeaderLine(*i);
        }
    }

    // write the header
    cout << variantFile.header << endl;
 
    // print the records, filtering is done via the setting of varA's output sample names
    while (variantFile.getNextVariant(var)) {
        for (vector<string>::iterator f = fieldsToErase.begin(); f != fieldsToErase.end(); ++f) {
            var.info.erase(*f);
            var.infoFlags.erase(*f);
        }
        cout << var << endl;
    }

    return 0;

}
Example #4
0
// Removes the samples named on the command line from a VCF.
//
// Usage: <prog> <vcf file> [SAMPLE1] [SAMPLE2] ...   ("-" reads from stdin)
//
// The header's sample list is updated and each record is printed with only
// the remaining samples.
// Returns 1 on bad usage or if the input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    if (argc < 3) {
        cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl
             << "outputs each record in the vcf file, removing samples listed on the command line" << endl;
        return 1;
    }

    string filename = argv[1];

    // all arguments after the file name are samples to drop
    vector<string> samplesToRemove(argv + 2, argv + argc);

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant var(variantFile);

    vector<string> samplesToKeep = removeElems(samplesToRemove, variantFile.sampleNames);

    // the header and the record writer must both agree on the reduced list
    variantFile.updateSamples(samplesToKeep);
    var.setOutputSampleNames(samplesToKeep);

    // write the new header
    cout << variantFile.header << endl;

    // the sample filtering itself happens when var is streamed out, via the
    // output sample names set above
    while (variantFile.getNextVariant(var)) {
        cout << var << endl;
    }

    return 0;

}
Example #5
0
// For every record, prints the reference allele followed by the alternate
// alleles grouped by their INFO/AF value, highest frequency first:
//
//     REF -> alt(s) with highest AF -> alt(s) with next AF -> ...
//
// Alternates sharing the same AF are joined with ", ". Input is the VCF named
// by argv[1], or stdin when no argument is given.
// Returns 1 if the input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc > 1) {
        string filename = argv[1];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    //cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        //cout << var << endl;
        map<double, vector<string> > allelesByAf;

        // parse the AF strings into doubles; r starts at 0 so a failed
        // conversion never leaves an indeterminate value in afd
        vector<double> afd;
        vector<string>& afstr = var.info["AF"];
        for (vector<string>::iterator af = afstr.begin(); af != afstr.end(); ++af) {
            double r = 0;
            convert(*af, r);
            afd.push_back(r);
        }

        // Pair each alt with its AF. The AF field may be absent or hold fewer
        // values than there are alts, so stop at whichever list ends first
        // instead of reading past the end of afd.
        vector<double>::iterator af = afd.begin();
        for (vector<string>::iterator a = var.alt.begin();
             a != var.alt.end() && af != afd.end(); ++a, ++af) {
            allelesByAf[*af].push_back(*a);
        }

        cout << var.ref;
        // the map is ordered by ascending AF; walk it backwards for descending
        for (map<double, vector<string> >::reverse_iterator a = allelesByAf.rbegin(); a != allelesByAf.rend(); ++a) {
            cout << " -> " << join(a->second, ", ");
        }
        cout << endl;
    }

    return 0;

}
Example #6
0
// Collapses runs of consecutive records that share sequence name, position,
// REF and ALT, printing only the first record of each run (its original
// input line, unmodified). Input is the VCF named by argv[1], or stdin when
// no argument is given; the header is echoed first.
// Returns 1 if the input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc <= 1) {
        variantFile.open(std::cin);
    } else {
        string filename = argv[1];
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    cout << variantFile.header << endl;

    // identity of the most recently printed record; an empty lastsn means
    // nothing has been printed yet
    string lastsn;
    long int lastpos = 0;
    string lastref;
    vector<string> lastalt;

    // sample columns are never inspected, so skip parsing them
    variantFile.parseSamples = false;
    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        const bool sameAsLast = !lastsn.empty()
            && lastsn == var.sequenceName
            && lastpos == var.position
            && lastref == var.ref
            && lastalt == var.alt;
        if (sameAsLast) {
            continue;
        }

        lastsn = var.sequenceName;
        lastpos = var.position;
        lastref = var.ref;
        lastalt = var.alt;
        cout << var.originalLine << endl;
    }

    return 0;

}
Example #7
0
int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc > 1) {
        string filename = argv[1];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">");
    variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">");
    variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">");
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        vector<string>& lengths = var.info["length"];
        lengths.clear();
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            lengths.push_back(convert((int) a->size() - (int) var.ref.size()));
        }
        vector<string>& lengthsRef = var.info["length.ref"];
        lengthsRef.clear();
        lengthsRef.push_back(convert(var.ref.size()));
        vector<string>& lengthsAlt = var.info["length.alt"];
        lengthsAlt.clear();
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            lengthsAlt.push_back(convert((int) a->size()));
        }
        cout << var << endl;
    }

    return 0;

}
Example #8
0
// Debug dump of Variant::parsedAlternates(): echoes each record, then on the
// following line prints, for every entry of the returned map, the key and the
// VariantAllele values stored under it as " ( key :: v1; v2; ... ) ".
// Input is the VCF named by argv[1], or stdin when no argument is given.
// Returns 1 if the input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    VariantCallFile variantFile;

    if (argc <= 1) {
        variantFile.open(std::cin);
    } else {
        string filename = argv[1];
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // note: the header is written without a trailing newline
    cout << variantFile.header;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        map<string, vector<VariantAllele> > variants = var.parsedAlternates();
        cout << var << endl;
        for (auto& entry : variants) {
            cout << " ( " << entry.first << " :: ";
            for (auto& allele : entry.second) {
                cout << allele << "; ";
            }
            cout << " ) ";
        }
        cout << endl;
    }

    return 0;

}
Example #9
0
// Canonicalizes structural variant records in a VCF.
//
// Options:
//   -r, --reference FILE        FASTA reference passed to canonicalize_sv
//   -i, --insertions FILE       FASTA of insertion sequences (repeatable)
//   -s, --no-replace-sequences  pass replace_sequences = false
//   -h, --help                  print help and exit
// The last command-line argument is taken as the VCF file name.
//
// Every record is printed to stdout whether or not canonicalization
// succeeded; failures are additionally reported on stderr.
// Returns 1 if the VCF cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    string ref_file = "";
    vector<string> insertion_files;
    int max_interval = -1;
    bool replace_sequences = true;

    int c = 0;
    while (true) {
        static struct option long_options[] =
            {
                // --insertions must take an argument, matching "i:" in the
                // short option string: its handler reads optarg, which is
                // null for a no_argument option
                {"insertions", required_argument, 0, 'i'},
                {"help", no_argument, 0, 'h'},
                {"reference", required_argument, 0, 'r'},
                {"no-replace-sequences", no_argument, 0, 's'},
                {0, 0, 0, 0}
            };
        int option_index = 0;

        c = getopt_long (argc, argv, "sr:i:h",
                         long_options, &option_index);
        /* Detect the end of the options. */
        if (c == -1)
            break;
        switch(c){
        case 's':
            replace_sequences = false;
            break;
        case 'r':
            ref_file = optarg;
            break;
        case 'i':
            insertion_files.push_back(optarg);
            break;
        case 'h':
        case '?':
            print_help(argv);
            exit(1);
        default:
            print_help(argv);
            abort();
        }
    }

    // a VCF file name is mandatory; without a positional argument the last
    // argv entry would be an option or an option's value, not a VCF
    if (optind >= argc){
        print_help(argv);
        exit(1);
    }

    VariantCallFile variantFile;
    string filename = argv[argc - 1];
    variantFile.open(filename);
    if (!variantFile.is_open()) {
        return 1;
    }

    // canonicalize_sv takes raw pointers; the readers are kept for the
    // remaining lifetime of the process
    vector<FastaReference*> insertions;
    for (const auto& insertion_file : insertion_files){
        FastaReference* ins = new FastaReference();
        insertions.push_back(ins);
        ins->open(insertion_file);
    }

    FastaReference ref;
    if(!ref_file.empty()){
        ref.open(ref_file);
    }

    cout << variantFile.header << endl;

    Variant var;
    while (variantFile.getNextVariant(var)) {
        bool valid = var.canonicalize_sv(ref, insertions, replace_sequences, max_interval);
        if (!valid){
            cerr << "Variant could not be normalized" << var << endl;
        }
        cout << var << endl;
    }

    return 0;

}
Example #10
0
int main(int argc, char** argv) {

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
             << "adds a tag (BasesToNextVariant) to each variant record which indicates" << endl
	     << "the distance to the nearest variant" << endl;
        return 1;
    }

    string filename = argv[1];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant varA(variantFile);
    Variant varB(variantFile);
    Variant varC(variantFile);

    vector<Variant*> vars;
    vars.push_back(&varA);
    vars.push_back(&varB);
    vars.push_back(&varC);
    
    for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
	variantFile.getNextVariant(**v);
    }

    string tag = "BasesToClosestVariant";
    string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\"" \
	+ "Number of bases to the closest variant in the file.\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    // get the first distances
    if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
	vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
    }

    while (variantFile.getNextVariant(*vars.back())) {

	if (vars.at(1)->sequenceName == vars.at(0)->sequenceName &&
	    vars.at(1)->sequenceName == vars.at(2)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position,
							vars.at(2)->position - vars.at(1)->position)));
	} else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	} else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) {
	    vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position));
	} else {
	    // don't add the tag
	}
	cout << *vars.front() << endl;
	// rotate
	Variant* v = vars.at(0);
	vars.at(0) = vars.at(1);
	vars.at(1) = vars.at(2);
	vars.at(2) = v;

    }

    // assign the last distances
    
    if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
	vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	cout << *vars.at(0) << endl;
	
	vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
	cout << *vars.at(1) << endl;
    }

    return 0;

}
Example #11
0
// Realigns indel-containing VCF records against a FASTA reference and, when
// the realignment checks out, rewrites the record's position, REF and ALT.
//
// Options (see the getopt table below):
//   -r, --reference FILE   FASTA reference (required)
//   -w, --window N         initial alignment window in bp (default 150)
//   -h, --help             print the summary
// The VCF is read from the first non-option argument, or stdin if none.
//
// Per record:
//   1. records whose ALTs all have the same length as REF are echoed as-is;
//   2. an alignment window is chosen (grown for long alleles and for
//      low-entropy flanks) and getAlignment()/stablyLeftAlign() are run;
//   3. the new reference span is derived from the matching bases at both
//      ends of each alignment's CIGAR;
//   4. old and new alleles are substituted into the same reference haplotype
//      and compared; the record is only updated if they are identical.
// Every record is written to stdout exactly once, modified or not.
int main(int argc, char** argv) {

    int window = 150;
    VariantCallFile variantFile;
    string fastaFileName;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"reference", required_argument, 0, 'r'},
                {"window", required_argument, 0, 'w'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hw:r:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'r':
            fastaFileName = optarg;
            break;

	    case 'w':
            // atoi does no error checking; a non-numeric value becomes 0
            window = atoi(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        case 'h':
            // NOTE(review): unlike '?', there is no exit() here; presumably
            // printSummary() terminates the process itself -- confirm
            printSummary(argv);
            break;

        default:
            abort ();
        }
    }

    // first non-option argument is the VCF; otherwise read from stdin
    if (optind < argc) {
        string filename = argv[optind];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference fastaReference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        fastaReference.open(fastaFileName);
    }

    /*
    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
    if (!parseFlag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
    }
    */
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // if there is no indel, there is nothing to realign
        bool hasIndel = false;
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            if (a->size() != var.ref.size()) {
                hasIndel = true;
                break;
            }
        }
        if (!hasIndel) {
            cout << var << endl;
            continue;
        }

        vector<AltAlignment> alignments;
        string ref;

        // determine window size to prevent mismapping with SW algorithm
        // (the window is at least `scale` times the longest allele;
        // note the comparisons below mix size_t and int)
        int currentWindow = window;
        int scale = 2;
        if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
        for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
            if (a->size()*scale > currentWindow) {
                currentWindow = a->size()*scale;
            }
        }

        // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
        // NOTE(review): the start offset passed to getSubSequence can go
        // negative for records near the start of a sequence -- confirm how
        // getSubSequence handles that
        while (currentWindow < 2000) { // limit to one step > than this
            string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
            if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
                entropy(refTarget.substr(refTarget.size()/2)) < 1) {
                currentWindow *= scale;
            } else {
                break;
            }
        }

        // do the alignments
        getAlignment(var, fastaReference, ref, alignments, currentWindow);

        // stably left align the alignments
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
            // NOTE(review): cigarBefore is never read after this copy
            Cigar cigarBefore = a->cigar;
            //cerr << a->seq << endl;
            //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
            long int prev = a->pos;
            stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
            //cerr << "after  : " << a->pos << " " << joinCigar(a->cigar) << endl;
            // NOTE(review): a->pos is not passed to stablyLeftAlign above, so
            // it is unclear whether this condition can ever hold -- confirm
            if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
        }
        //cout << var << endl;

        // transform the mappings
        // chop off leading matching bases
        // find the range of bp in the alleles
        // make the new ref allele
        // make the new alt alleles
        // emit the var

        // newPosition starts at the right edge of the window and
        // newEndPosition at the left edge, so the min()/max() over all
        // alignments below can only move them towards the real span
        long int newPosition = var.position+currentWindow/2;
        long int newEndPosition = var.position-currentWindow/2;
        // check for no-indel case
        // NOTE(review): newLength is never read
        int newLength = var.ref.size();
        bool giveUp = false;
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
            // get the first mismatching position
            // (this iterator shadows the getopt variable `c` declared above)
            Cigar::iterator c = a->cigar.begin();

            // NOTE(review): rp, sp and hitMismatch are never read
            int rp = 0;
            int sp = 0;
            bool hitMismatch = false;

            int matchingBpAtStart = 0;
            int matchingBpAtEnd = 0;
            // will be set to true if the first reference position match is broken by a SNP, not an indel
            bool leadingSNP = false;

            // walk the CIGAR; only the first, second and last elements matter
            while (c != a->cigar.end()) {
                char op = c->second[0];
                if (c == a->cigar.begin()) {
                    if (op != 'M') {
                        cerr << "alignment does not start on matched sequence" << endl;
                        cerr << var << endl;
                        exit(1);
                    }
                    // count identical bases from the left edge of the window
                    int i = 0;
                    for ( ; i < c->first; ++i) {
                        if (ref[i] != a->seq[i]) {
                            leadingSNP = true;
                            break;
                        }
                    }
                    matchingBpAtStart = i;
                }
                if (!leadingSNP && c == (a->cigar.begin()+1)) {
                    // if the first thing we run into is an indel, step back, per VCF spec
                    if (op == 'D' || op == 'I') {
                        --matchingBpAtStart;
                    }
                }
                if (c == (a->cigar.end()-1)) {
                    if (op != 'M') {
                        // soft clip at end
                        // it'll be hard to interpret this
                        // the alignments sometimes generate this
                        // best thing to do is to move on
                        //cerr << "alignment does not end on matched sequence" << endl;
                        //cout << var << endl;
                        //exit(1);
                        giveUp = true;
                        break;
                    }
                    // count identical bases from the right edge of the window
                    int i = 0;
                    for ( ; i < c->first; ++i) {
                        if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
                            break;
                        }
                    }
                    matchingBpAtEnd = i;
                }
                ++c;
            }

            // NOTE(review): altMismatchLength is only referenced by the
            // commented-out debug output below
            int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart;
            int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
            //cerr << "alt mismatch length " << altMismatchLength << endl
            //     << "ref mismatch length " << refMismatchLength << endl;
            long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
            long int newEnd = newStart + refMismatchLength;
            //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
            // keep the union of the mismatching spans across all alignments
            newPosition = min(newStart, newPosition);
            newEndPosition = max(newEnd, newEndPosition);
            //cerr << newPosition << " " << newEndPosition << endl;
            //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
        }

        // the alignment failed for some reason, continue
        if (giveUp) {
            cout << var << endl;
            continue;
        }

        //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
        int newRefSize = newEndPosition - newPosition;
        string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
        // get the number of bp to strip from the alts
        int stripFromStart = currentWindow/2 - (var.position - newPosition);
        int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);

        //cerr << "strip from start " << stripFromStart << endl;
        //cerr << "strip from end " << stripFromEnd << endl;

        // cut each aligned alt sequence down to the new reference span;
        // `l` walks var.alt in step with the alignments
        vector<string> newAlt;
        vector<string>::iterator l = var.alt.begin();
        bool failedAlt = false;
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end();
             ++a, ++l) {
            // NOTE(review): diff is never read
            int diff = newRef.size() - l->size();
            string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
            newAlt.push_back(alt);
            if (alt.empty()) failedAlt = true;
        }

        // check the before/after haplotypes
        // (substitute the old and the new allele into the same stretch of
        // reference; the two resulting sequences must be identical)
        bool brokenRealignment = false;
        if (!newRef.empty() && !failedAlt) {
            int slop = 50; // 50 extra bp!
            int haplotypeStart = min(var.position, newPosition) - slop;
            int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
            string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
                                                                      haplotypeEnd - haplotypeStart);
            vector<string>::iterator o = var.alt.begin();
            vector<string>::iterator n = newAlt.begin();
            for ( ; o != var.alt.end() ; ++o, ++n) {
                // map the haplotypes
                string oldHaplotype = referenceHaplotype;
                string newHaplotype = referenceHaplotype;
                oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
                newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
                if (oldHaplotype != newHaplotype) {
                    cerr << "broken left alignment!" << endl
                         << "old " << oldHaplotype << endl
                         << "new " << newHaplotype << endl;
                    cerr << "was: " << var << endl;
                    brokenRealignment = true;
                }
            }
        }

        // *if* everything is OK, update the variant
        if (!brokenRealignment && !newRef.empty() && !failedAlt) {
            var.ref = newRef;
            var.alt = newAlt;
            var.position = newPosition;
        }

        cout << var << endl;

        // for each parsedalternate, get the position
        // build a new vcf record for that position
        // unless we are already at the position !
        // take everything which is unique to that allele (records) and append it to the new record
        // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it

        

        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
    }

    return 0;

}
Example #12
0
// Verifies that the REF allele of every record matches a FASTA reference.
//
// Options:
//   -f, --fasta-reference FILE  reference to check against (required)
//   -x, --exclude-failures      print the header and only matching records
//   -k, --keep-failures         print the header and only mismatching records
//   -h, --help                  print the summary and exit
// With neither -x nor -k, a "mismatched reference ..." line is printed for
// each failing record and nothing else. The VCF is read from the single
// remaining argument, or from stdin when there is not exactly one.
// Returns 1 if the VCF cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    string fastaRef;
    bool keepFailures = false;
    bool excludeFailures = false;

    if (argc == 1)
        printSummary(argv);

    static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"fasta-reference",  required_argument, 0, 'f'},
            {"exclude-failures",  no_argument, 0, 'x'},
            {"keep-failures",  no_argument, 0, 'k'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };

    for (;;) {
        /* getopt_long stores the option index here. */
        int option_index = 0;

        const int c = getopt_long (argc, argv, "hxkf:",
                                   long_options, &option_index);

        /* -1 marks the end of the options. */
        if (c == -1) {
            break;
        }

        switch (c) {
        case 0:
            /* Options that set a flag need no further handling. */
            if (long_options[option_index].flag == 0) {
                printf ("option %s", long_options[option_index].name);
                if (optarg)
                    printf (" with arg %s", optarg);
                printf ("\n");
            }
            break;

        case 'f':
            fastaRef = optarg;
            break;

        case 'x':
            excludeFailures = true;
            break;

        case 'k':
            keepFailures = true;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    FastaReference ref;
    ref.open(fastaRef);

    // exactly one remaining argument names the VCF; anything else -> stdin
    VariantCallFile variantFile;
    string inputFilename;
    if (optind != argc - 1) {
        variantFile.open(std::cin);
    } else {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // a header is only emitted in the modes that output VCF records
    if (keepFailures || excludeFailures) {
        cout << variantFile.header << endl;
    }

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        int refstart = var.position - 1; // convert to 0-based
        string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size());
        const bool refMatches = (var.ref == matchedRef);

        if (refMatches) {
            if (excludeFailures) {
                cout << var << endl;
            }
        } else if (keepFailures) {
            cout << var << endl;
        } else if (!excludeFailures) {
            cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at "
                 << var.sequenceName << ":" << var.position << endl;
        }
    }

    return 0;

}
Example #13
0
// Entry point for bFst.
//
// Parses the command line (--target, --background, --file, --deltaaf), then
// walks the VCF one record at a time. For every biallelic site where both
// groups have at least two called samples and the observed allele-frequency
// difference is at least deltaaf, it runs 15000 parameter-update iterations
// (the first 5000 are not accumulated) and prints one tab-separated line with
// the observed and averaged values plus two order statistics of the sampled
// Fst values.
//
// Returns 0 on success (also after --help / --version) and 1 on a usage error
// or when the VCF file cannot be opened.
int main(int argc, char** argv) {

  // set the random seed for MCMC
  srand((unsigned)time(NULL));

  // the filename; "NA" means "not given on the command line"
  string filename = "NA";

  // using vcflib; thanks to Erik Garrison
  VariantCallFile variantFile;

  // zero-based sample indices of the target (it) and background (ib) individuals
  map<int, int> it, ib;

  // deltaaf is the difference of allele frequency we bother to look at;
  // -1 means "not given on the command line"
  string deltaaf;
  double daf = -1;

  const struct option longopts[] =
    {
      {"version"   , 0, 0, 'v'},
      {"help"      , 0, 0, 'h'},
      {"file"      , 1, 0, 'f'},
      {"target"    , 1, 0, 't'},
      {"background", 1, 0, 'b'},
      {"deltaaf"   , 1, 0, 'd'},
      {0,0,0,0}
    };

  int optionIndex;
  int iarg = 0;

  // Stop as soon as getopt_long reports the end of the options (-1), so that
  // the default branch below only ever sees genuinely unknown options.
  while ((iarg = getopt_long(argc, argv, "d:t:b:f:hv", longopts, &optionIndex)) != -1)
    {
      switch (iarg)
        {
        case 0:
          break;
        case 'h':
          cerr << endl;

          cerr << "INFO: help: " << endl << endl;

          cerr << "     bFst is a Bayesian approach to Fst.  Importantly bFst account for genotype uncertainty in the model using genotype likelihoods."       << endl;
          cerr << "     For a more detailed description see: Holsinger et al. Molecular Ecology Vol 11, issue 7 2002.  The likelihood function has been "      << endl;
          cerr << "     modified to use genotype likelihoods provided by variant callers. There are five free parameters estimated in the model: each "       << endl;
          cerr << "     subpopulation's allele frequency and Fis (fixation index, within each subpopulation), a free parameter for the total population\'s "  << endl;
          cerr << "     allele frequency, and Fst. "                                                                                      << endl             << endl;

          cerr << "Output : 11 columns :                          " << endl;
          cerr << "     1.  Seqid                                     " << endl;
          cerr << "     2.  Position				     " << endl;
          cerr << "     3.  Observed allele frequency in target.	     " << endl;
          cerr << "     4.  Estimated allele frequency in target.     " << endl;
          cerr << "     5.  Observed allele frequency in background.  " << endl;
          cerr << "     6.  Estimated allele frequency in background. " << endl;
          cerr << "     7.  Observed allele frequency combined. 	     " << endl;
          cerr << "     8.  Estimated allele frequency in combined.   " << endl;
          cerr << "     9.  ML estimate of Fst (mean)		     " << endl;
          cerr << "     10. Lower bound of the 95% credible interval  " << endl;
          cerr << "     11. Upper bound of the 95% credible interval  " << endl << endl;


          cerr << "INFO: usage:  bFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1" << endl;
          cerr << endl;
          cerr << "INFO: required: t,target     -- a zero bases comma separated list of target individuals corrisponding to VCF columns" << endl;
          cerr << "INFO: required: b,background -- a zero bases comma separated list of background individuals corrisponding to VCF columns" << endl;
          cerr << "INFO: required: f,file a     -- a proper formatted VCF file.  the FORMAT field MUST contain \"PL\"" << endl;
          cerr << "INFO: required: d,deltaaf    -- skip sites were the difference in allele frequency is less than deltaaf" << endl;
          cerr << endl;
          printVersion();
          cerr << endl << endl;
          return 0;

        case 'v':
          printVersion();
          return 0;

        case 't':
          // target indices go into `it`, the map the per-site loop reads
          // when it fills `target`
          loadIndices(it, optarg);
          cerr << "INFO: There are " << it.size() << " individuals in the target" << endl;
          break;

        case 'b':
          // background indices go into `ib`, the map the per-site loop reads
          // when it fills `background`
          loadIndices(ib, optarg);
          cerr << "INFO: There are " << ib.size() << " individuals in the background" << endl;
          break;

        case 'f':
          cerr << "INFO: File: " << optarg  <<  endl;
          filename = optarg;
          break;

        case 'd':
          cerr << "INFO: difference in allele frequency : " << optarg << endl;
          deltaaf = optarg;
          daf = atof(deltaaf.c_str());
          break;

        default:
          // Unknown option or missing argument ('?'). getopt_long has already
          // named the offending option on stderr; optarg may be NULL here, so
          // it is deliberately not printed.
          cerr << endl;
          cerr << "FATAL: unknown command line option" << endl << endl;
          cerr << "INFO:  please use bFst --help      " << endl;
          cerr << endl;
          return(1);
        }
    }

  if(daf == -1){
    cerr << endl;
    cerr << "FATAL: did not specify deltaaf" << endl;
    cerr << "INFO:  please use bFst --help      " << endl;
    cerr << endl;
    return(1);
  }

  if(filename == "NA"){
    cerr << endl;
    cerr << "FATAL: did not specify VCF file" << endl;
    cerr << "INFO:  please use bFst --help      " << endl;
    cerr << endl;
    return(1);
  }

  variantFile.open(filename);

  if (!variantFile.is_open()) {
    cerr << endl;
    cerr << "FATAL: could not open VCF file" << endl;
    cerr << "INFO:  please use bFst --help" << endl;
    cerr << endl;
    return(1);
  }

  // Both groups need at least two individuals; without them every site would
  // be skipped below, so stop here instead of silently producing no output.
  if(it.size() < 2){
    cerr << endl;
    cerr << "FATAL: target not specified or less than two indviduals" << endl;
    cerr << "INFO:  please use bFst --help                          " << endl;
    cerr << endl;
    return(1);
  }
  if(ib.size() < 2){
    cerr << endl;
    cerr << "FATAL: background not specified or less than two indviduals"<< endl;
    cerr << "INFO:  please use bFst --help                          " << endl;
    cerr << endl;
    return(1);
  }

  Variant var(variantFile);

  vector<string> samples = variantFile.sampleNames;
  int nsamples = samples.size();

  while (variantFile.getNextVariant(var)) {

    // biallelic sites naturally
    if(var.alt.size() > 1){
      continue;
    }

    vector < map< string, vector<string> > > target, background, total;

    // nsamp is the zero-based sample column, i.e. the index users pass to
    // --target / --background
    for(int nsamp = 0; nsamp < nsamples; nsamp++){

      map<string, vector<string> >& sample = var.samples[ samples[nsamp] ];

      // skip samples without a genotype call (no GT field, or "./.")
      map<string, vector<string> >::iterator gt = sample.find("GT");
      if(gt == sample.end() || gt->second.empty() || gt->second.front() == "./."){
        continue;
      }

      if(it.find(nsamp) != it.end()){
        target.push_back(sample);
        total.push_back(sample);
      }
      if(ib.find(nsamp) != ib.end()){
        background.push_back(sample);
        total.push_back(sample);
      }
    }

    if(target.size() < 2 || background.size() < 2 ){
      continue;
    }

    pop popt, popb, popTotal;

    initPop(popt);
    initPop(popb);
    initPop(popTotal);

    loadPop(target,     popt);
    loadPop(background, popb);
    loadPop(total,  popTotal);

    // skip sites where either frequency is -1 (presumably "not computed" --
    // confirm against loadPop) or where both groups are fixed for one allele
    if(popt.af == -1 || popb.af == -1){
      continue;
    }
    if(popt.af == 1  && popb.af == 1){
      continue;
    }
    if(popt.af == 0 && popb.af  == 0){
      continue;
    }

    // absolute difference computed by hand so the result cannot be truncated
    // by an integer abs() overload
    double afdiff = popt.af - popb.af;
    if(afdiff < 0){
      afdiff = -afdiff;
    }

    if(afdiff < daf){
      continue;
    }

    cerr << "INFO: target has "     << popt.questionable.size() << " questionable genotypes " << endl;
    cerr << "INFO: background has " << popb.questionable.size() << " questionable genotypes " << endl;

    // Parameters- targetAf backgroundAf targetFis backgroundFis totalAf fst
    // (a seventh entry, popTotal.af again, is appended as in the original;
    // only indices 0-5 are updated and accumulated below)
    vector<double> parameters;
    parameters.push_back(popt.af);
    parameters.push_back(popb.af);
    parameters.push_back(popt.fis);
    parameters.push_back(popb.fis);
    parameters.push_back(popTotal.af);
    parameters.push_back(0.1);
    parameters.push_back(popTotal.af);

    double sums [6] = {0};
    double fsts [10000]  ;

    // 15000 iterations; iterations 0-4999 are not accumulated, leaving
    // exactly 10000 retained samples (the size of fsts)
    for(int i = 0; i < 15000; i++){

      // update each of j parameters
      for(int j = 0; j < 6; j++ ){
        updateParameters(popt, popb, parameters, j);
        if(i > 4999){
          sums[j] += parameters[j];
        }
      }
      if(i > 4999){
        fsts[i - 5000] = parameters[5];
      }
      for(vector<int>::iterator itt = popt.questionable.begin(); itt != popt.questionable.end(); itt++){
        updateGenotypes(popt, popb, parameters, (*itt), 0);
      }
      for(vector<int>::iterator itb = popb.questionable.begin(); itb != popb.questionable.end(); itb++){
        updateGenotypes(popt, popb, parameters, (*itb) , 1);
      }
    }

    // presumably cmp orders doubles ascending -- confirm against its definition
    qsort (fsts, sizeof(fsts)/sizeof(fsts[0]), sizeof(fsts[0]), cmp );

    // NOTE(review): entries 500 and 9500 of 10000 sorted samples bracket the
    // central 90%, while the help text calls this a 95% interval -- confirm
    // which is intended.
    double lcredint = fsts[500];
    double hcredint = fsts[9500];

    cout << var.sequenceName << "\t"  << var.position
         << "\t"  << popt.af
         << "\t"  << sums[0]/10000
         << "\t"  << popb.af
         << "\t"  << sums[1]/10000
         << "\t"  << popTotal.af
         << "\t"  << sums[4]/10000
         << "\t"  << sums[5]/10000
         << "\t"  << lcredint
         << "\t"  << hcredint
         << endl;
  }
  return 0;
}
Example #14
0
// Entry point: annotates VCF records with the BED targets they overlap.
//
// Options:
//   -b, --bed FILE      BED file of targets (required)
//   -k, --key KEY       INFO key under which the annotation is stored (required)
//   -d, --default VAL   value stored when a record overlaps no target
//   -h, --help          print the summary
//
// The VCF is read from the single positional argument if present, otherwise
// from stdin. The header (with one added ##INFO line for KEY) and every
// record are written to stdout; diagnostics go to stderr.
int main(int argc, char** argv) {

    string bedFileName;
    string annotationInfoKey;
    string defaultAnnotationValue;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"bed",  required_argument, 0, 'b'},
            {"key",  required_argument, 0, 'k'},
            {"default",  required_argument, 0, 'd'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hb:k:d:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {
            case 'b':
                bedFileName = string(optarg);
                break;

            case 'k':
                annotationInfoKey = string(optarg);
                break;

            case 'd':
                defaultAnnotationValue = string(optarg);
                break;

            case 'h':
                // a help request ends the program successfully instead of
                // falling through to the "BED file is required" error
                printSummary(argv);
                exit(0);
                break;

            case '?':
                /* getopt_long already printed an error message. */
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    if (bedFileName.empty()) {
        cerr << "a BED file is required when intersecting" << endl;
        exit(1);
    }

    // An empty key would yield a header line with an empty INFO ID and
    // INFO entries with an empty name on every annotated record.
    if (annotationInfoKey.empty()) {
        cerr << "an INFO key (-k, --key) is required to name the annotation" << endl;
        exit(1);
    }

    BedReader bed(bedFileName);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        // stderr, not stdout: stdout carries the VCF stream
        cerr << "could not open VCF file" << endl;
        return 1;
    }

    string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from "
        + bedFileName + " delimited by ':'\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // query interval: from the record position to the last reference base
        BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
        vector<BedTarget*> overlaps = bed.targetsOverlapping(record);
        if (!overlaps.empty()) {
            // join the descriptions of all overlapping targets with ':'
            vector<string> annotations;
            for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) {
                annotations.push_back((*t)->desc);
            }
            var.info[annotationInfoKey].push_back(join(annotations, ":"));
        } else if (!defaultAnnotationValue.empty()) {
            var.info[annotationInfoKey].push_back(defaultAnnotationValue);
        }
        cout << var << endl;
    }

    return 0;

}
Example #15
0
int main(int argc, char** argv) {

    if (argc > 1 && (argv[1] == "-h" || argv[1] == "--help")) {
        cerr << "usage: " << argv[0] << " <vcf file>" << endl
             << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl;
        return 1;
    }

    VariantCallFile variantFile;
    if (argc == 1 || (argc == 2 && argv[1] == "-")) {
        variantFile.open(std::cin);
        if (!variantFile.is_open()) {
            cerr << "vcffixup: could not open stdin" << endl;
            return 1;
        }
    } else {
        string filename = argv[1];
        variantFile.open(filename);
        if (!variantFile.is_open()) {
            cerr << "vcffixup: could not open " << filename << endl;
            return 1;
        }
    }

    Variant var(variantFile);

    // remove header lines we're going to add
    variantFile.removeInfoHeaderLine("AC");
    variantFile.removeInfoHeaderLine("AF");
    variantFile.removeInfoHeaderLine("NS");
    variantFile.removeInfoHeaderLine("AN");

    // and add them back, so as not to duplicate them if they are already there
    variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">");
    variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">");
    variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
    variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");

    // write the new header
    cout << variantFile.header << endl;
 
    // print the records, filtering is done via the setting of varA's output sample names
    while (variantFile.getNextVariant(var)) {
        stringstream ns;
        ns << var.samples.size();
        var.info["NS"].clear();
        var.info["NS"].push_back(ns.str());

        var.info["AC"].clear();
        var.info["AF"].clear();
        var.info["AN"].clear();

        int allelecount = countAlleles(var);
        stringstream an;
        an << allelecount;
        var.info["AN"].push_back(an.str());

        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            string& allele = *a;
            int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1);
            stringstream ac;
            ac << altcount;
            var.info["AC"].push_back(ac.str());
            stringstream af;
            af << (double) altcount / (double) allelecount;
            var.info["AF"].push_back(af.str());
        }
        cout << var << endl;
    }

    return 0;

}
Example #16
0
int main(int argc, char** argv) {

  globalOpts.threads = 1   ;
  globalOpts.af      = 0.05;

  // zero based index for the target and background indivudals 
  
  map<int, int> it, ib;
  
    const struct option longopts[] = 
      {
	{"version"   , 0, 0, 'v'},
	{"help"      , 0, 0, 'h'},
        {"file"      , 1, 0, 'f'},
	{"target"    , 1, 0, 't'},
	{"region"    , 1, 0, 'r'},
	{"gen"       , 1, 0, 'g'},
	{"type"      , 1, 0, 'y'},
	{"threads"   , 1, 0, 'x'},
	{"af"        , 1, 0, 'a'},
	{"pos"       , 1, 0, 'p'},
	{0,0,0,0}
      };

    int findex;
    int iarg=0;

    while(iarg != -1)
      {
	iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:p:hv", longopts, &findex);
	
	switch (iarg)
	  {
	  case 'p':
	    {
	      globalOpts.pos = atoi(optarg);
	      break;
	    }

	  case 'a':
	    {
	      globalOpts.af = atof(optarg);
	      break;
	    }
	  case 'x':
	    {
	      globalOpts.threads = atoi(optarg);
	      break;
	    }
	  case 'g':
	    {
	      globalOpts.geneticMapFile = optarg;
	      break;
	    }
	  case 'h':
	    {
	      printHelp();
	      break;
	    }
	  case 'v':
	    {
	      printVersion();
	      break;
	    }
	  case 'y':
	    {
	      globalOpts.type = optarg;
	      break;
	    }
	  case 't':
	    {
	      loadIndices(it, optarg);
	      cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
	      cerr << "INFO: target ids: " << optarg << endl;
	      break;
	    }
	  case 'f':
	    {
	      cerr << "INFO: file: " << optarg  <<  endl;
	      globalOpts.filename = optarg;
	      break;
	    }
	  case 'r':
	    {
	      cerr << "INFO: set seqid region to : " << optarg << endl;
	      globalOpts.region = optarg; 
	      break;
	    default:
	      break;
	    }
	  }
      }

#if defined HAS_OPENMP
  omp_set_num_threads(globalOpts.threads);
#endif
  
    map<string, int> okayGenotypeLikelihoods;
    okayGenotypeLikelihoods["PL"] = 1;
    okayGenotypeLikelihoods["GL"] = 1;
    okayGenotypeLikelihoods["GP"] = 1;
    okayGenotypeLikelihoods["GT"] = 1;
    

    // add an option for dumping

//    for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){
//      cerr << "pos: " << gm->first << " cm: " << gm->second << endl; 
//    }

    if(globalOpts.type.empty()){
      cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl;
      printHelp();
      exit(1);
    }
    if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){
      cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl;
      printHelp();
      exit(1);
    }

    if(globalOpts.filename.empty()){
      cerr << "FATAL: did not specify a file" << endl;
      printHelp();
      exit(1);
    }

    if(it.size() < 2){
      cerr << "FATAL: target option is required -- or -- less than two individuals in target\n";
      printHelp();
      exit(1);
    }

    // using vcflib; thanksErik 

    VariantCallFile variantFile;

    variantFile.open(globalOpts.filename);
    
    if(globalOpts.region.empty()){
      cerr << "FATAL: region required" << endl;
      exit(1);
    }
    if(! variantFile.setRegion(globalOpts.region)){
      cerr <<"FATAL: unable to set region" << endl;
      exit(1);
    }

    if (!variantFile.is_open()) {
      exit(1);
    }
    
    Variant var( variantFile );
    vector<int> target_h, background_h;

    int index   = 0; 
    int indexi  = 0;


    vector<string> samples = variantFile.sampleNames;
    int nsamples = samples.size();

    for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
      
      string sampleName = (*samp);
     
      if(it.find(index) != it.end() ){
	target_h.push_back(indexi);
	indexi++;
      }
      index++;
    }
    
   
    vector<long int> positions;
    
    vector<double> afs;

    string **haplotypes = new string*[target_h.size()];
    for (int i = 0; i < target_h.size(); i++) {
      haplotypes[i] = new string[2];
    }
    

    while (variantFile.getNextVariant(var)) {

      globalOpts.seqid = var.sequenceName;

      if(!var.isPhased()){
	cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl;
	exit(1);
      }

      if(var.alleles.size() > 2){
	continue;
      }

      vector < map< string, vector<string> > > target, background, total;
      
      int sindex = 0;
      
      for(int nsamp = 0; nsamp < nsamples; nsamp++){

	map<string, vector<string> > sample = var.samples[ samples[nsamp]];
	
	if(it.find(sindex) != it.end() ){
	  target.push_back(sample);
	}	
	sindex += 1;
      }
      
      genotype * populationTarget    ;
      
      if(globalOpts.type == "PL"){
	populationTarget     = new pl();
      }
      if(globalOpts.type == "GL"){
	populationTarget     = new gl();
      }
      if(globalOpts.type == "GP"){
	populationTarget     = new gp();
      }
      if(globalOpts.type == "GT"){
	populationTarget     = new gt();
      }

      populationTarget->loadPop(target, var.sequenceName, var.position);
      
      if(populationTarget->af <= globalOpts.af 
	 || populationTarget->nref < 2 
	 || populationTarget->nalt < 2){
	delete populationTarget;
	continue;
      }
      positions.push_back(var.position);
      afs.push_back(populationTarget->af);
      loadPhased(haplotypes, populationTarget, populationTarget->gts.size()); 
    
      populationTarget = NULL;
      delete populationTarget;
    }

    if(!globalOpts.geneticMapFile.empty()){
      cerr << "INFO: loading genetics map" << endl;
      loadGeneticMap(positions.front(), positions.back());
      cerr << "INFO: finished loading genetics map" << endl;
    }

    calc(haplotypes, target_h.size(), afs, positions, 
	 target_h, background_h, globalOpts.seqid);
    clearHaplotypes(haplotypes, target_h.size());

    exit(0);		    

}
// Entry point for vcfallelicprimitives.
//
// Reads a VCF from the first positional argument (or stdin when there is
// none), prints its header with TYPE / LEN (and optionally a user-named flag)
// INFO lines added, and then processes one record at a time:
//   * records that cannot or need not be decomposed are printed unchanged;
//   * otherwise the alternates are parsed into VariantAllele pieces
//     (var.parsedAlternates), one new record is built per distinct piece
//     position, the sample genotypes are rewritten against the new allele
//     numbering, and the new records are printed in position order.
//
// Options: -m/--use-mnps, -L/--max-length N, -t/--tag-parsed FLAG,
// -k/--keep-info, -g/--keep-geno, -h/--help.
int main(int argc, char** argv) {

    bool includePreviousBaseForIndels = true;  // passed to parsedAlternates below
    bool useMNPs = false;                      // -m; passed to parsedAlternates below
    string parseFlag;                          // -t; INFO flag set on every emitted primitive record
    int maxLength = 200;                       // -L; single-alt records longer than this are passed through
    bool keepInfo = false;                     // -k; carry other INFO fields over to the new records
    bool keepGeno = false;                     // -g; carry the other per-sample fields over as well

    VariantCallFile variantFile;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"use-mnps", no_argument, 0, 'm'},
                {"max-length", required_argument, 0, 'L'},
                {"tag-parsed", required_argument, 0, 't'},
                {"keep-info", no_argument, 0, 'k'},
                {"keep-geno", no_argument, 0, 'g'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hmkgt:L:",
                         long_options, &option_index);

        // -1 signals the end of the options
        if (c == -1)
            break;

        switch (c) {

	    case 'm':
            useMNPs = true;
            break;

	    case 'k':
            keepInfo = true;
            break;

	    case 'g':
            keepGeno = true;
            break;

        case 'h':
            // NOTE(review): no exit() after the summary here, unlike the '?'
            // case below -- presumably printSummary terminates the process;
            // confirm.
            printSummary(argv);
            break;

	    case 't':
            parseFlag = optarg;
            break;

        case 'L':
            maxLength = atoi(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // first positional argument is the input file; otherwise read stdin
    if (optind < argc) {
        string filename = argv[optind];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // declare the INFO fields this program adds, then emit the header
    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
    if (!parseFlag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
    }
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {


        // we can't decompose *1* bp events, these are already in simplest-form whether SNPs or indels
        // we also don't handle anything larger than maxLength bp
        // (this shortcut only applies to records with exactly one alternate)
        if (var.alt.size() == 1 
            && (   var.alt.front().size() == 1
                || var.ref.size() == 1
                || var.alt.front().size() > maxLength
                || var.ref.size() > maxLength
                )) {
            // nothing to do
            cout << var << endl;
            continue;
        }

        // for each parsedalternate, get the position
        // build a new vcf record for that position
        // unless we are already at the position !
        // take everything which is unique to that allele (records) and append it to the new record
        // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it

        // keyed by allele string; each value is the list of pieces that allele parses into
        map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs);
        set<VariantAllele> alleles;

        // collect unique alleles
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                alleles.insert(*va);
            }
        }

        // count the distinct pieces that actually differ from the reference
        int altcount = 0;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref != a->alt) {
                ++altcount;
            }
        }

        if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP
            cout << var << endl;
            continue;
        }

        // collect variant allele indexed membership
        map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            int index = var.altAlleleIndexes[a->first] + 1; // make non-relative
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                variantAlleleIndexes[va->repr].push_back(index);
            }
        }

        // per-piece values gathered from the original record
        map<VariantAllele, double> alleleFrequencies;
        map<VariantAllele, int> alleleCounts;
        map<VariantAllele, map<string, string> > alleleInfos;
        map<VariantAllele, map<string, map<string, string> > > alleleGenos;

        // AF: each piece accumulates the AF of every original alternate it came from
        bool hasAf = false;
        if (var.info.find("AF") != var.info.end()) {
            hasAf = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    double freq;
                    try {
                        // .at() throws when AF has fewer entries than there are alternates
                        convert(var.info["AF"].at(var.altAlleleIndexes[*a]), freq);
                        alleleFrequencies[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // AC: same accumulation as AF, with integer counts
        bool hasAc = false;
        if (var.info.find("AC") != var.info.end()) {
            hasAc = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    int freq;
                    try {
                        convert(var.info["AC"].at(var.altAlleleIndexes[*a]), freq);
                        alleleCounts[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // with -k, remember for every piece the value of each INFO key that
        // is either per-alternate or a single site-wide value
        if (keepInfo) {
            for (map<string, vector<string> >::iterator infoit = var.info.begin();
                 infoit != var.info.end(); ++infoit) {
                string key = infoit->first;
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    vector<VariantAllele>& vars = varAlleles[*a];
                    for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                        string val;
                        vector<string>& vals = var.info[key];
                        if (vals.size() == var.alt.size()) { // allele count for info
                            val = vals.at(var.altAlleleIndexes[*a]);
                        } else if (vals.size() == 1) { // site-wise count
                            val = vals.front();
                        } // don't handle other multiples... how would we do this without going crazy?
                        if (!val.empty()) {
                            alleleInfos[*va][key] = val;
                        }
                    }
                }
            }
        }

        /*
        if (keepGeno) {
            for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin();
                 sampleit != var.samples.end(); ++sampleit) {
                string& sampleName = sampleit->first;
                map<string, vector<string> >& sampleValues = var.samples[sampleName];
                
            }
        }
        */

        // from old allele index to a new series across the unpacked positions
        map<int, map<long unsigned int, int> > unpackedAlleleIndexes;

        // new records, keyed (and therefore ordered) by position
        map<long unsigned int, Variant> variants;
        //vector<Variant> variants;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref == a->alt) {
                // ref allele
                continue;
            }
            // classify the piece and compute its length
            string type;
            int len = 0;
            if (a->ref.at(0) == a->alt.at(0)) { // well-behaved indels
                // NOTE(review): when ref and alt share the first base and have
                // equal length, type stays empty and len stays 0 -- confirm
                // parsedAlternates never produces such a piece.
                if (a->ref.size() > a->alt.size()) {
                    type = "del";
                    len = a->ref.size() - a->alt.size();
                } else if (a->ref.size() < a->alt.size()) {
                    len = a->alt.size() - a->ref.size();
                    type = "ins";
                }
            } else {
                if (a->ref.size() == a->alt.size()) {
                    len = a->ref.size();
                    if (a->ref.size() == 1) {
                        type = "snp";
                    } else {
                        type = "mnp";
                    }
                } else {
                    len = abs((int) a->ref.size() - (int) a->alt.size());
                    type = "complex";
                }
            }

            // create the record for this position on first use
            if (variants.find(a->position) == variants.end()) {
                Variant newvar(variantFile);
                variants[a->position] = newvar;
            }

            Variant& v = variants[a->position]; // guaranteed to exist

            if (!parseFlag.empty()) {
                v.infoFlags[parseFlag] = true;
            }
            v.quality = var.quality;
            v.filter = var.filter;
            v.id = ".";
            //v.format = var.format;
            // GT only by default; replaced by the original FORMAT below when -g is given
            vector<string> gtonlyformat;
            gtonlyformat.push_back("GT");
            v.format = gtonlyformat;
            v.info["TYPE"].push_back(type);
            v.info["LEN"].push_back(convert(len));
            if (hasAf) {
                v.info["AF"].push_back(convert(alleleFrequencies[*a]));
            }
            if (hasAc) {
                v.info["AC"].push_back(convert(alleleCounts[*a]));
            }
            if (keepInfo) {
                for (map<string, vector<string> >::iterator infoit = var.info.begin();
                     infoit != var.info.end(); ++infoit) {
                    string key = infoit->first;
                    if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous
                        // operator[] yields an empty string for keys that were
                        // not recorded for this piece above
                        v.info[key].push_back(alleleInfos[*a][key]);
                    }
                }
            }

            // now, keep all the other infos if we are asked to

            v.sequenceName = var.sequenceName;
            v.position = a->position; // ... by definition, this should be == if the variant was found
            // if this piece has a longer reference than the record so far,
            // extend the alternates already stored with the extra reference
            // bases and adopt the longer reference
            if (v.ref.size() < a->ref.size()) {
                for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) {
                    *va += a->ref.substr(v.ref.size());
                }
                v.ref = a->ref;
            }
            v.alt.push_back(a->alt);

            // 1-based index of the alternate just added to the new record
            int alleleIndex = v.alt.size();
            vector<int>& originalIndexes = variantAlleleIndexes[a->repr];
            for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) {
                unpackedAlleleIndexes[*i][v.position] = alleleIndex;
            }
            // add null allele
            unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL;

        }

        // genotypes
        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
            string& sampleName = *s;
            if (var.samples.find(sampleName) == var.samples.end()) {
                continue;
            }
            map<string, vector<string> >& sample = var.samples[sampleName];
            if (sample.find("GT") == sample.end()) {
                continue;
            }
            string& genotype = sample["GT"].front();
            // split on either phase separator; anything that is not an
            // integer becomes ALLELE_NULL
            vector<string> genotypeStrs = split(genotype, "|/");
            vector<int> genotypeIndexes;
            // (this inner `s` shadows the sample iterator for the duration of the loop)
            for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) {
                int i;
                if (!convert(*s, i)) {
                    genotypeIndexes.push_back(ALLELE_NULL);
                } else {
                    genotypeIndexes.push_back(i);
                }
            }
            // for every new position, the new allele index of each original genotype allele
            map<long unsigned int, vector<int> > positionIndexes;
            for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) {
                int oldIndex = *g;
                for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                    const long unsigned int& p = v->first;
                    if (oldIndex == 0) { // reference
                        positionIndexes[p].push_back(0);
                    } else {
                        // operator[] yields 0 when the old allele has no piece
                        // at this position
                        positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]);
                    }
                }
            }
            for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                Variant& variant = v->second;
                vector<int>& gtints = positionIndexes[v->first];
                vector<string> gtstrs;
                for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) {
                    if (*i != ALLELE_NULL) {
                        gtstrs.push_back(convert(*i));
                    } else {
                        gtstrs.push_back(".");
                    }
                }
                // the rewritten genotype is always joined with '|'
                string genotype = join(gtstrs, "|");
                // if we are keeping the geno info, pull it over here
                if (keepGeno) {
                    variant.format = var.format;
                    variant.samples[sampleName] = var.samples[sampleName];
                }
                // note that this will replace the old geno, but otherwise it is the same
                variant.samples[sampleName]["GT"].clear();
                variant.samples[sampleName]["GT"].push_back(genotype);
            }
        }

        // emit the new records in position order
        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
        for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
            cout << v->second << endl;
        }
    }

    return 0;

}
Example #18
0
int main(int argc, char** argv) {

    string nullval = ".";
    bool genotypes = false;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"null-value", required_argument, 0, 'n'},
            {"genotypes", no_argument, 0, 'g'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hn:g",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'n':
	        nullval = optarg;
            break;

        case 'g':
            genotypes = true;
            break;

        case 'h':
            printSummary(argv);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    bool usingstdin = false;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        if (!variantFile.open(std::cin)) {
            if (argc == 1) {
                printSummary(argv);
            } else {
                cerr << "could not open stdin for reading as VCF" << endl;
                exit(1);
            }
        }
        usingstdin = true;
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // obtain all possible field names
    // true means it a bool field flag
    std::map<std::string, bool> keepFields;

    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) {
        if (i->second == FIELD_BOOL) {
            keepFields[i->first] = true;
        } else {
            keepFields[i->first] = false;
        }
    }
    vector<string> formatfields;
    if (genotypes) {
        for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin(); f != variantFile.formatTypes.end(); ++f) {
            formatfields.push_back(f->first);
        }
    }

    // write header
    // defaults
    std::cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER";

    for (std::map<std::string, bool>::iterator i = keepFields.begin(); i != keepFields.end(); ++i) {
        cout << "\t" << i->first;
    }

    if (genotypes) {
        cout << "\t" << "SAMPLE";
        for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
            cout << "\t" << *f;
        }
    }
    std::cout << std::endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        stringstream outputRecord;

        loadInfoSS(outputRecord, keepFields, var, variantFile, nullval, formatfields, genotypes);

        std::cout << outputRecord.str() ;

    }
    return 0;
}
Example #19
0
int main(int argc, char** argv) {

  // set the random seed for MCMC

  srand((unsigned)time(NULL));

  // the filename

  string filename = "NA";

  // set region to scaffold

  string region = "NA"; 

  // using vcflib; thanks to Erik Garrison 

  VariantCallFile variantFile;

  // zero based index for the target and background indivudals 
  
  map<int, int> it, ib;
  
  // deltaaf is the difference of allele frequency we bother to look at 

  // ancestral state is set to zero by default

  string mut = "1";

  int counts = 0;
  
  // phased 

  int phased = 0;

    const struct option longopts[] = 
      {
	{"version"   , 0, 0, 'v'},
	{"help"      , 0, 0, 'h'},
        {"file"      , 1, 0, 'f'},
	{"target"    , 1, 0, 't'},
	{"background", 1, 0, 'b'},
	{"deltaaf"   , 1, 0, 'd'},
	{"region"    , 1, 0, 'r'},
	{"mutation"  , 1, 0, 'm'},
	{"phased"    , 1, 0, 'p'},
	{0,0,0,0}
      };

    int findex;
    int iarg=0;

    while(iarg != -1)
      {
	iarg = getopt_long(argc, argv, "p:m:r:d:t:b:f:hv", longopts, &findex);
	
	switch (iarg)
	  {
	  case 'h':
	    cerr << endl << endl;
	    cerr << "INFO: help" << endl;
	    cerr << "INFO: description:" << endl;
            cerr << "     gl-XPEHH estimates haplotype decay between the target and background populations.  SNVs are integrated                           " << endl;
	    cerr << "     until EHH in the target and background is less than 0.05. The score is the itegrated EHH (target) / integrated EHH (background). " << endl;
	    cerr << "     gl-XPEHH does NOT integrate over genetic distance, as genetic maps are not availible for most non-model organisms. 		   " << endl;
	    cerr << "     gl-XPEHH phases genotypes, imuputes missing genotypes, and changes poor quality genotypes. Phasing is done in a sliding window   " << endl;
	    cerr << "     with a stochastic search, therefore, every time gl-XPEHH is run it will generate slightly different results.                     " << endl;

	    cerr << "Output : 4 columns :     "    << endl;
	    cerr << "     1. seqid            "    << endl;
	    cerr << "     2. position         "    << endl;
	    cerr << "     3. xp-ehh           "    << endl;
	    cerr << "     4. iHS              "    << endl  << endl;

	    cerr << "INFO: gl-XPEHH  --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --ancestral 0        " << endl;
	    cerr << endl;
	    cerr << "INFO: required: r,region     -- a genomice range to calculate gl-XPEHH on in the format : \"seqid:start-end]\" or \"seqid\" " << endl;
	    cerr << "INFO: required: t,target     -- a zero base comma seperated list of target individuals corrisponding to VCF columns        " << endl;
	    cerr << "INFO: required: b,background -- a zero base comma seperated list of background individuals corrisponding to VCF columns    " << endl;
	    cerr << "INFO: required: f,file a     -- proper formatted VCF.  the FORMAT field MUST contain \"PL\" if option phased == 0           " << endl; 
	    cerr << "INFO: optional: m,mutation   -- which state is derived in vcf [0,1] default is 1                                            " << endl;
	    cerr << "INFO: optional: p,phased     -- phasing flag [0,1] 0 = phase vcf, 1 = vcf is already phased                                 " << endl;
	    cerr << endl; 
	    cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl;
	    cerr << endl << endl;
	    return 0;
	  case 'v':
	    cerr << endl << endl;
	    cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] "  << endl;
	    return 0;
	  case 'p':
	    phased = atoi(optarg);
	    cerr << "INFO: setting phase to: " << phased << endl;
	    break;
	  case 'm':
	    mut = optarg;
	    cerr << "INFO: derived state set to " << mut << endl;
	    break;
	  case 't':
	    loadIndices(it, optarg);
	    cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
	    cerr << "INFO: target ids: " << optarg << endl;
	    break;
	  case 'b':
	    loadIndices(ib, optarg);
	    cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
	    cerr << "INFO: background ids: " << optarg << endl;
	    break;
	  case 'f':
	    cerr << "INFO: file: " << optarg  <<  endl;
	    filename = optarg;
	    break;
	  case 'r':
            cerr << "INFO: set seqid region to : " << optarg << endl;
	    region = optarg; 
	    break;
	  default:
	    break;
	  }

      }

    if(filename == "NA"){
      cerr << "FATAL: did not specify a file" << endl;
      cerr << "INFO: please use gl-XPEHH --help" << endl;
      return(1);
    }


    variantFile.open(filename);
    
    if(region == "NA"){
      cerr << "FATAL: did not specify a region"  << endl;
      cerr << "INFO: please use gl-XPEHH --help" << endl;
    }

   if(region != "NA"){
     variantFile.setRegion(region); 
   }
    
    if (!variantFile.is_open()) {
        return 1;
    }
    
    Variant var(variantFile);

    vector<string> samples = variantFile.sampleNames;
    vector<int>    target_h, background_h;

    int index, indexi = 0;

    cerr << "INFO: there are " << samples.size() << " individuals in the VCF" << endl;

    if(samples.size() == 0){
      cerr << "FATAL: too few samples or no VCF header"    << endl;
      cerr << "INFO: please use gl-XPEHH --help"           << endl;
      return(1);
    }

    for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){
     
      if(it.find(index) != it.end() ){
	target_h.push_back(indexi);
	indexi++;
      }
      if(ib.find(index) != ib.end()){
	background_h.push_back(indexi);
	indexi++;
      }
      index++;
    }
    

    list< pop > tdat, bdat, zdat;

    vector<long int> positions;

    string haplotypes [it.size() + ib.size()][2];    
    
    string seqid;

    while (variantFile.getNextVariant(var)) {
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin(); 
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();
        
	// biallelic sites naturally 

	if(var.alt.size() > 1){
	  continue;
	}

	vector < map< string, vector<string> > > target, background, total;
	        
	int sindex = 0;

        for (; s != sEnd; s++) {	  
	  
	  map<string, vector<string> >& sample = s->second;
	  
	  if(it.find(sindex) != it.end() ){
	    target.push_back(sample);
	    total.push_back(sample);	
	  }
	  if(ib.find(sindex) != ib.end()){
	    background.push_back(sample);
	    total.push_back(sample);
	  }  
	  sindex += 1;
	}
	
	seqid = var.sequenceName;

	pop popt, popb, popz;

	initPop(popt);
	initPop(popb);
	initPop(popz);

	loadPop(target,     popt, var.sequenceName, var.position, phased );
	loadPop(background, popb, var.sequenceName, var.position, phased );
	loadPop(total,      popz, var.sequenceName, var.position, phased );

	if(popt.af == -1 || popb.af == -1){
	  continue;
	}
	if(popz.af > 0.95 || popz.af < 0.05){
	  continue;
	}
	if(popt.af == 0 && popb.af == 1){
	  continue;
	}
	if(popt.af == 1 && popb.af == 0){
	  continue;
	}
		
	tdat.push_back(popt);
	bdat.push_back(popb);
	zdat.push_back(popz);
       
	positions.push_back(var.position);
	
	counts += 1;
	if(counts >= 1000){
	  cerr << "INFO: processed " << haplotypes[0][0].size() << " SNPs; current location : " << var.position << endl;
	  counts = 0;
	}

	while(zdat.size() >= 15 && !zdat.empty()){
          if(phased == 0){	    
            localPhase(haplotypes, zdat, (it.size() + ib.size()));
          }
          else{
            loadPhased(haplotypes, zdat, (it.size() + ib.size()));
          }
          while(!zdat.empty()){
            zdat.pop_front();
          }
	}
    }

    if(phased == 0){
      localPhase(haplotypes, zdat, (it.size() + ib.size()));
    }
    else{
      loadPhased(haplotypes, zdat, (it.size() + ib.size()));
    }
    while(!zdat.empty()){
      zdat.pop_front();
    }


    cerr << "INFO: phasing done" << endl;
   
    calc(haplotypes, (it.size() + ib.size()), positions, target_h, background_h,  mut, seqid);

    cerr << "INFO: gl-XPEHH finished" << endl;

    return 0;		    
}
Example #20
0
// Clips a window given as a 0-based start and a length so that it does not
// begin before position 0.  The end of the window stays where it was, so the
// length shrinks by the amount clipped and may come out zero or negative when
// the whole window lies before the start of the sequence.
static void clipWindowToSequenceStart(int& start, int& length) {
    if (start < 0) {
        length += start;
        start = 0;
    }
}

// Entropy (as computed by shannon_H) over exactly the bases held in seq.
// An empty window is reported as 0 instead of being passed to shannon_H.
static double windowEntropy(string& seq) {
    if (seq.empty()) {
        return 0.0;
    }
    return shannon_H((char*) &seq[0], seq.size());
}

// Entry point: annotates every record of a VCF (last positional argument,
// otherwise stdin) with INFO fields EntropyLeft, EntropyCenter, EntropyRight,
// EntropyRef and EntropyAlt, computed from the FASTA reference given with
// -f/--fasta-reference using windows of -w/--window-size bases, and writes the
// resulting VCF to stdout.
int main(int argc, char** argv) {

    int c;
    string fastaRef;
    int windowSize = 0;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"fasta-reference",  required_argument, 0, 'f'},
            {"window-size", required_argument, 0, 'w'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hf:w:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            fastaRef = optarg;
            break;

        case 'w':
            windowSize = atoi(optarg);
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // A negative size used to slip past the "== 0" check and was then used as a
    // sequence length; reject everything that is not a positive count.
    if (windowSize <= 0) {
        cerr << "a positive window size must be specified" << endl;
        exit(1);
    }
    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    FastaReference ref;
    ref.open(fastaRef);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // get the ref start and end positions
        int refstart = var.position - 1; // convert to 0-based
        int refend = var.position + var.ref.size() - 1;

        // The left and center windows start before the variant, so near the
        // beginning of a sequence their start would be negative; clip them.
        int leftStart = refstart - windowSize;
        int leftLength = windowSize;
        clipWindowToSequenceStart(leftStart, leftLength);

        int centerStart = refstart - windowSize/2;
        int centerLength = windowSize;
        clipWindowToSequenceStart(centerStart, centerLength);

        // Windows that were clipped away completely stay empty and are not
        // requested from the reference at all.
        string leftseq;
        if (leftLength > 0) {
            leftseq = ref.getSubSequence(var.sequenceName, leftStart, leftLength);
        }
        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
        string centerseq;
        if (centerLength > 0) {
            centerseq = ref.getSubSequence(var.sequenceName, centerStart, centerLength);
        }

        // Measure only the bases that were actually obtained: passing
        // windowSize as the length read past the end of the buffer whenever a
        // window came back shorter than requested.
        double entropyLeft = windowEntropy(leftseq);
        double entropyRight = windowEntropy(rightseq);
        double entropyCenter = windowEntropy(centerseq);
        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());

        // drop any values already present under these keys before re-adding
        var.info["EntropyLeft"].clear();
        var.info["EntropyRight"].clear();
        var.info["EntropyCenter"].clear();
        var.info["EntropyRef"].clear();
        var.info["EntropyAlt"].clear();

        var.info["EntropyLeft"].push_back(convert(entropyLeft));
        var.info["EntropyRight"].push_back(convert(entropyRight));
        var.info["EntropyCenter"].push_back(convert(entropyCenter));
        var.info["EntropyRef"].push_back(convert(entropyRef));

        // one EntropyAlt value per ALT allele, in ALT order
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
            var.info["EntropyAlt"].push_back(convert(entropyAlt));
        }

        cout << var << endl;
    }

    return 0;

}
Example #21
0
// Entry point: reads the VCF named by the single argument ("-" selects stdin),
// rewrites every sample's GT so that its alleles are sorted in ascending order
// and joined with "/" (i.e. unphased), and writes the result to stdout.
// Missing alleles (".") sort first.  Returns 1 on a usage error or when the
// input cannot be opened, 0 otherwise.
int main(int argc, char** argv) {

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <vcf file>" << endl
             << "unphases and sorts the genotypes in the file" << endl;
        return 1;
    }

    string filename = argv[1];

    VariantCallFile variantFile;

    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();

        for (; s != sEnd; ++s) {
            map<string, vector<string> >& sample = s->second;

            // A sample without a GT value has nothing to sort.  The previous
            // sample["GT"].front() inserted an empty GT entry for such samples
            // and then called front() on the empty vector, which is undefined.
            map<string, vector<string> >::iterator gtField = sample.find("GT");
            if (gtField == sample.end() || gtField->second.empty()) {
                continue;
            }

            string& genotype = gtField->second.front();
            vector<string> gt = split(genotype, "|/");

            // now let's sort the genotype; "." is mapped to -1 so that it can be
            // ordered together with the numeric allele indexes
            vector<int> gti;
            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
                if (*g == ".") {
                    gti.push_back(-1);
                } else {
                    gti.push_back(atoi(g->c_str()));
                }
            }
            std::sort(gti.begin(), gti.end());

            // rebuild the genotype string with the unphased separator
            stringstream gts;
            for (vector<int>::iterator g = gti.begin(); g != gti.end(); ++g) {
                if (g != gti.begin()) {
                    gts << "/";
                }
                if (*g == -1) {
                    gts << ".";
                } else {
                    gts << *g;
                }
            }
            genotype = gts.str();   // written back in place through the reference
        }
        cout << var << endl;
    }
    return 0;

}
Example #22
0
// Entry point: filters or combines the records of a VCF (last positional
// argument, otherwise stdin) and writes the result to stdout.
//
// Modes visible in this function:
//   -b/--bed            print each input line whose span
//                       [position, position + ref length) has overlapping BED
//                       targets (with -v/--invert: has none).  When a BED file
//                       is given this branch is taken instead of the VCF modes.
//   -i/--intersect-vcf  keep only the ALT alleles of each record whose local
//                       haplotype equals one built from nearby records of the
//                       other VCF (with -v: only those that do not).
//   -u/--union-vcf      like -i, but alleles that match are dropped from the
//                       input record and the other VCF's records are written
//                       out as well.
//   -r/--reference      FASTA reference, required for -i and -u.
//   -w/--window-size    padding applied around each record when searching the
//                       other VCF (default 30).
//
// NOTE(review): -c/--contained and -o/--overlapping are parsed, but the flags
// they set are never read in this function.
int main(int argc, char** argv) {

    string bedFileName;
    string vcfFileName;
    string fastaFileName;
    bool intersecting = false;
    bool unioning = false;
    bool invert = false;
    bool contained = true;
    bool overlapping = false;
    int windowsize = 30;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"bed",  required_argument, 0, 'b'},
            {"invert",  no_argument, 0, 'v'},
	    {"intersect-vcf", required_argument, 0, 'i'},
	    {"union-vcf", required_argument, 0, 'u'},
            {"contained",  no_argument, 0, 'c'},
            {"overlapping", no_argument, 0, 'o'},
	    {"window-size", required_argument, 0, 'w'},
	    {"reference", required_argument, 0, 'r'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hvcob:i:u:w:r:",
                         long_options, &option_index);

        // -1 signals that all options have been consumed
        if (c == -1)
            break;

        switch (c) {

	    case 'w':
		windowsize = atoi(optarg);
		break;

            case 'b':
                bedFileName = string(optarg);
                break;

            // -i and -u share vcfFileName; whichever appears last wins
            case 'i':
		intersecting = true;
                vcfFileName = string(optarg);
                break;

            case 'u':
		unioning = true;
                vcfFileName = string(optarg);
                break;

	    case 'r':
		fastaFileName = string(optarg);
		break;

            case 'v':
                invert = true;
                break;

            case 'c':
                contained = true;
                break;

            case 'o':
                overlapping = true;
                break;

            case 'h':
                printSummary(argv);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    // BED mode is selected simply by a BED file name having been given
    bool usingBED = false;
    if (!bedFileName.empty()) {
	usingBED = true;
    }
    
    BedReader bed;
    if (usingBED) {
	bed.open(bedFileName);
    }

    // the primary VCF: a single trailing positional argument, else stdin
    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    if (usingBED) {
	variantFile.parseSamples = false;
    }

    // the second VCF, used by the union / intersect modes
    VariantCallFile otherVariantFile;
    if (!vcfFileName.empty()) {
	otherVariantFile.open(vcfFileName);
	if (!otherVariantFile.is_open()) {
	    cerr << "could not open VCF file " << vcfFileName << endl;
	    exit(1);
	}
    }

    FastaReference reference;
    if (unioning || intersecting) {
	if (fastaFileName.empty()) {
	    cerr << "a reference is required for haplotype-based intersection and unioniong" << endl;
	    exit(1);
	}
	reference.open(fastaFileName);
    }

    if (!unioning && !intersecting) {
	variantFile.parseSamples = false; // faster, as when we are
					  // only bed-intersecting we
					  // can do position-only
					  // output and don't have to
					  // manipulate specific
					  // alleles
    }

    // read the VCF file for union or intersection into an interval tree
    // indexed using some proximity window

    // per sequence name: the tree used for lookups, the owning list of records
    // from the other VCF, and the intervals the tree is built from
    map<string, IntervalTree<Variant*> > variantIntervals;
    map<string, list<Variant> > otherVariants;
    map<string, vector<Interval<Variant*> > > otherVariantIntervals;

    if (unioning || intersecting) {

	Variant ovar(otherVariantFile);
	while (otherVariantFile.getNextVariant(ovar)) {
	    long int left = ovar.position;
	    long int right = left + ovar.ref.size(); // this should be 1-past the end
	    // the record is copied into the list and the interval stores a
	    // pointer to that copy
	    otherVariants[ovar.sequenceName].push_back(ovar);
	    Variant* v = &otherVariants[ovar.sequenceName].back();
	    otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v));
	}
	
	for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
	    variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
	}

    }

    // records of the other VCF that have already been written (union mode)
    set<Variant*> outputVariants;

    // highest position written so far on the current sequence, and that
    // sequence's name
    long unsigned int lastOutputPosition = 0;
    string lastSequenceName;

    cout << variantFile.header;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

	// On a change of sequence in union mode, write the not-yet-written
	// records of the other VCF found between lastOutputPosition and the
	// end of the previous sequence.
	// NOTE(review): lastSequenceName is only advanced inside the
	// `if (unioning)` branch; it is not read in the other modes.
	if (lastSequenceName.empty()) {
	    lastSequenceName = var.sequenceName;
	} else if (lastSequenceName != var.sequenceName) {
	    if (unioning) {
		vector<Interval<Variant*> > previousRecords;
		long int lastSeqLength = reference.sequenceLength(lastSequenceName);
		variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords);
		for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
		    Variant* v = r->value;
		    if (outputVariants.find(v) == outputVariants.end()) {
			outputVariants.insert(v);
			cout << *v << endl;  // does this output everything in correct order?
		    }
		}
		lastSequenceName = var.sequenceName;
		lastOutputPosition = 0;
	    }
	}

	if (usingBED) {
	    // BED mode: the raw input line is echoed when the overlap test
	    // agrees with the (possibly inverted) selection
	    BedTarget record(var.sequenceName, var.position, var.position + var.ref.size(), "");
	    vector<BedTarget*> overlaps = bed.targetsOverlapping(record);

	    if (!invert && !overlaps.empty()) {
		cout << variantFile.line << endl;
	    } else if (invert && overlaps.empty()) {
		cout << variantFile.line << endl;
	    }

	} else if (unioning || intersecting) {

	    // TODO check overlaps with union/intersection
	    // hmm... for unioning, you might need to step through the original VCF records
	    // but the idea is to exclude the haplotype-based duplicates

	    // records of the other VCF found within windowsize of this record
	    vector<Interval<Variant*> > results;

	    variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);

	    // NOTE(review): this local vector shadows the bool `overlapping`
	    // set by -o at the top of main
	    vector<Variant*> overlapping;

	    for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
		overlapping.push_back(r->value);
	    }


	    if (unioning) {

		// unioning strategy

		// write out all the records from the last file
		// between the last one printed out and the first
		// one we're about to print out

		vector<Interval<Variant*> > previousRecords;

		variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords);

		// keyed by position so the records are written in ascending
		// position order
		map<long int, vector<Variant*> > variants;

		for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
		    Variant* v = r->value;
		    if (outputVariants.find(v) == outputVariants.end()) {
			outputVariants.insert(v);
			variants[v->position].push_back(v);
		    }
		}

		for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
		    for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
			cout << **o << endl;
			lastOutputPosition = max(lastOutputPosition, (*o)->position);
		    }
		}

		// TODO find the duplicates for the other file
	    }


	    if (overlapping.empty()) {

		// nothing nearby in the other VCF: the record is unique to this
		// file, so it is written for a union and for an inverted
		// intersection, and dropped for a plain intersection
		if (unioning || (intersecting && invert)) {
		    cout << var << endl;
		    lastOutputPosition = max(lastOutputPosition, var.position);
		}

	    } else {

		// get the min and max of the overlaps

		int haplotypeStart = var.position;
		int haplotypeEnd = var.position + var.ref.size();

		for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
		    haplotypeStart = min((*v)->position, (long unsigned int) haplotypeStart);
		    haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
     		}

		// for everything overlapping and the current variant, construct the local haplotype within the bounds
		// if there is an exact match, the alllele in the current VCF does intersect

		// the "- 1" shifts haplotypeStart to the offset passed to getSubSequence
		string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
		// haplotype string -> records of the other VCF that produce it
		map<string, vector<Variant*> > haplotypes;

		for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
		    Variant& variant = **v;
		    for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a) {
			string haplotype = referenceHaplotype;
			// get the relative start and end coordinates for the variant alternate allele
			int relativeStart = variant.position - haplotypeStart;
			haplotype.replace(relativeStart, variant.ref.size(), *a);
			haplotypes[haplotype].push_back(*v);
		    }
		}

		// determine the non-intersecting alts
		vector<string> altsToRemove;
		for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
		    string haplotype = referenceHaplotype;
		    int relativeStart = var.position - haplotypeStart;
		    haplotype.replace(relativeStart, var.ref.size(), *a);
		    map<string, vector<Variant*> >::iterator h = haplotypes.find(haplotype);
		    // remove: unmatched alts for an intersection, matched alts
		    // for an inverted intersection or a union
		    if ((intersecting && !invert && h == haplotypes.end())
			|| (intersecting && invert && h != haplotypes.end())
			|| (unioning && h != haplotypes.end())) {
			altsToRemove.push_back(*a);
		    }
		}

		// remove the non-overlapping (intersecting) or overlapping (unioning) alts
		for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
		    var.removeAlt(*a);
		}

		if (unioning) {
		    // somehow sort the records and combine them?
		    map<long int, vector<Variant*> > variants;
		    for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
			if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
			    outputVariants.find(*o) == outputVariants.end()) {
			    outputVariants.insert(*o);
			    variants[(*o)->position].push_back(*o);
			}
		    }
		    // add in the current variant, if it has alts left
		    if (!var.alt.empty()) {
			variants[var.position].push_back(&var);
		    }

		    for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
			for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
			    cout << **o << endl;
			    lastOutputPosition = max(lastOutputPosition, (*o)->position);
			}
		    }
		} else {
		    // if any alts remain, output the variant record
		    if (!var.alt.empty()) {
			cout << var << endl;
			lastOutputPosition = max(lastOutputPosition, var.position);
		    }
		}

	    }

	}

    }


    // if unioning, and any variants remain, output them
    // (starts at the last sequence seen in the input and continues through the
    // rest of otherVariants in map key order)
    if (unioning) {
	for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName);
	     chrom != otherVariants.end();
	     ++chrom) {
	    for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) {
		Variant* variant = &*v;
		if (outputVariants.find(variant) == outputVariants.end()) {
		    outputVariants.insert(variant);
		    cout << *variant << endl;
		    // TODO guarantee sorting
		}
	    }
	}
    }

    exit(0);  // why?
    return 0;

}
int main(int argc, char** argv) {

    if (argc != 3) {
        cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
             << "adds statistics to the INFO field of the vcf file describing the" << endl
             << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
             << "genotypes reported in the <other-genotype-tag>.  use this after" << endl
             << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
        return 1;
    }

    string otherGenoTag = argv[1];
    string filename = argv[2];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    vector<string> specs;

    specs.push_back("AA_AA");
    specs.push_back("AA_AR");
    specs.push_back("AA_RR");
    specs.push_back("AA_NN");

    specs.push_back("AR_AA");
    specs.push_back("AR_AR");
    specs.push_back("AR_RR");
    specs.push_back("AR_NN");

    specs.push_back("RR_AA");
    specs.push_back("RR_AR");
    specs.push_back("RR_RR");
    specs.push_back("RR_NN");

    specs.push_back("NN_AA");
    specs.push_back("NN_AR");
    specs.push_back("NN_RR");
    specs.push_back("NN_NN");


    for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
        string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
            + ",Number=1,Type=Integer,Description=\"Number of genotypes with "
            + *spec + " relationship with " + otherGenoTag + "\">";
        variantFile.addHeaderLine(line);
    }

    string line;

    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
        + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
        + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
        + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
        + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
        + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
        + otherGenoTag + " is null.  Cases where GT is null or partly null are excluded.\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
        + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of "
        + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    line = "##INFO=<ID=" + otherGenoTag
        + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to "
        + otherGenoTag + " genotypes,\">";
    variantFile.addHeaderLine(line);

    cout << variantFile.header << endl;

    Variant var(variantFile);

    while (variantFile.getNextVariant(var)) {

	//cout << "next: " << var << endl;
        // for each sample, check GT against <other-genotype-tag>
        // tally stats, and append to info
        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();

        map<string, int> genotypeComparisonCounts;
        int gtCount = var.samples.size();
        int gtAltCount = 0; // number of alternate-containing genotypes in the first file
        int pdCount = 0; // positive discrepancy count
        int ndCount = 0; // negative discrepancy count
        int nnCount = 0; // null discrepancy count
        int cdCount = 0; // call discrepancy count
        int ccCount = 0; // call concordance count
        int nrdCount = 0; // non-reference discrepancy count
        int nrdNormalizer = 0; // divisor for nrd rate
        int nrsCount = 0; // non-reference sensitivity count
        int nrsNormalizer = 0; // divisor for nrs rate

        for (; s != sEnd; ++s) {
            map<string, vector<string> >& sample = s->second;
            const string& name = s->first;

            // decompose genotypes into counts of strings
            // to facilitate comparison

	    string gtA;
	    if (sample.find("GT") == sample.end()) {
		gtA = "./.";
	    } else {
		gtA = sample["GT"].front();
	    }

	    string gtB;
	    if (sample.find(otherGenoTag) == sample.end()) {
		gtB = "./.";
	    } else {
		gtB = sample[otherGenoTag].front();
	    }


            map<int, int> genotypeA = decomposeGenotype(gtA);
            map<int, int> genotypeB = decomposeGenotype(gtB);

            string gtspecA = genotypeSpec(genotypeA);
            string gtspecB = genotypeSpec(genotypeB);
            //cout << gtA << " " << gtB << endl;
            //cout << gtspecA << " " << gtspecB << endl;
            ++genotypeComparisonCounts[gtspecA + "_" + gtspecB];

            if (hasNonRef(genotypeA)) {
                ++gtAltCount;
            }

            if (genotypeA != genotypeB) {
                if (isNull(genotypeA)) {
                    // TODO handle this somehow, maybe via a different flag?
                    if (!isNull(genotypeB)) {
                        ++nnCount;  // null discrepancy, the second set makes a call, this one does not
                    }
                } else if (hasNonRef(genotypeA)) {
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
                        ++cdCount;  // the calls are discrepant
                    } else { // the other call does not have an alternate
                        ++pdCount;
                        // it is also null
                        if (isNull(genotypeB)) {
                            ++nnCount;
                        }
                    }
                } else { // the current genotype has no non-ref alternate
                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
                        ++ndCount;
                    }
                    if (isNull(genotypeB)) {
                        ++nnCount;
                    }
                }
            } else {
                if (!isNull(genotypeA)) {
                    ++ccCount;
                }
            }


            if (!(isNull(genotypeA) || isNull(genotypeB))
                    && !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
                ++nrdNormalizer;
                if (genotypeA != genotypeB) {
                    ++nrdCount;
                }
            }

            if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
                ++nrsNormalizer;
                if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
                    ++nrsCount;
                }
            }

        }

        for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
                g != genotypeComparisonCounts.end(); ++g) {
            stringstream c;
            c << g->second;
            vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first];
            t.clear(); t.push_back(c.str());
        }

        stringstream gtc;
        gtc << gtCount;
        var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str());

        stringstream gtac;
        gtac << gtAltCount;
        var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str());

        stringstream pd;
        pd << pdCount;
        var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str());

        stringstream nd;
        nd << ndCount;
        var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str());

        stringstream nn;
        nn << nnCount;
        var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str());

        stringstream cd;
        cd << cdCount;
        var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str());

        stringstream cc;
        cc << ccCount;
        var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str());

        stringstream nrdc;
        nrdc << nrdCount;
        var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str());

        stringstream nrdn;
        nrdn << nrdNormalizer;
        var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str());

        if (nrdNormalizer > 0) {
            stringstream nrd;
            nrd << (double) nrdCount / (double) nrdNormalizer;
            var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str());
        }

        stringstream nrsc;
        nrsc << nrsCount;
        var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str());

        stringstream nrsn;
        nrsn << nrsNormalizer;
        var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str());

        if (nrsNormalizer > 0) {
            stringstream nrs;
            nrs << (double) nrsCount / (double) nrsNormalizer;
            var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str());
        }

        cout << var << endl;

    }

    return 0;

}
Example #24
0
int main(int argc, char** argv) {

    string nullval;
    bool genotypes = false;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"null-value", required_argument, 0, 'n'},
            {"genotypes", no_argument, 0, 'g'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hn:g",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'n':
	        nullval = optarg;
            break;

        case 'g':
            genotypes = true;
            break;

        case 'h':
            printSummary(argv);
            break;
            
        case '?':
            printSummary(argv);
            exit(1);
            break;
                
        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    bool usingstdin = false;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        if (!variantFile.open(std::cin)) {
            if (argc == 1) {
                printSummary(argv);
            } else {
                cerr << "could not open stdin for reading as VCF" << endl;
                exit(1);
            }
        }
        usingstdin = true;
    }

    if (!variantFile.is_open()) {
        return 1;
    }
    // obtain all possible field names
    vector<string> infofields;
    vector<string> infoflags;

    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) {
        if (i->second == FIELD_BOOL) {
            infoflags.push_back(i->first);
        } else {
            infofields.push_back(i->first);
        }
    }

    vector<string> formatfields;
    if (genotypes) {
        for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin(); f != variantFile.formatTypes.end(); ++f) {
            formatfields.push_back(f->first);
        }
    }

    // write header

    // defaults
    cout << "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER";
    
    // configurable info field
    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
        cout << "\t" << *i;
    }
    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
        cout << "\t" << *i;
    }
    
    if (genotypes) {
        cout << "\t" << "SAMPLE";
        for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
            cout << "\t" << *f;
        }
    }
    cout << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        if (!genotypes) {

            int altindex = 0;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++altindex) {

                string& altallele = *a;

                cout << var.sequenceName << "\t"
                     << var.position << "\t"
                     << var.id << "\t"
                     << var.ref << "\t"
                     << altallele << "\t"
                     << var.quality << "\t"
                     << var.filter;

                for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
                    vector<string> value;
                    string& name = *i;
                    map<string, vector<string> >::iterator f = var.info.find(name);
                    if (f != var.info.end()) {
                        value = f->second;
                        if (value.size() == 1) {
                            cout << "\t" << value.front();
                        } else if (value.size() == var.alt.size()) {
                            cout << "\t" << value.at(altindex);
                        } else {
                            cout << "\t" << nullval; // null
                        }
                    } else {
                        cout << "\t" << nullval; // null
                    }
                }

                for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
                    string value;
                    string& name = *i;
                    map<string, bool>::iterator f = var.infoFlags.find(name);
                    cout << "\t";
                    if (f != var.infoFlags.end()) {
                        cout << 1;
                    } else {
                        cout << 0;
                    }
                }

                cout << endl;

            }
        } else {

            stringstream o;

            // per-genotype output
            o << var.sequenceName << "\t"
              << var.position << "\t"
              << var.id << "\t"
              << var.ref << "\t"
              << join(var.alt, ",") << "\t"
              << var.quality << "\t"
              << var.filter;
            
            for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
                vector<string> value;
                string& name = *i;
                map<string, vector<string> >::iterator f = var.info.find(name);
                if (f != var.info.end()) {
                    value = f->second;
                    if (value.size() == 1) {
                        o << "\t" << value.front();
                    } else if (value.size() == var.alt.size()) {
                        o << "\t" << join(value, ",");
                    } else {
                        o << "\t" << nullval; // null
                    }
                } else {
                    o << "\t" << nullval; // null
                }
            }

            for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
                string value;
                string& name = *i;
                map<string, bool>::iterator f = var.infoFlags.find(name);
                o << "\t";
                if (f != var.infoFlags.end()) {
                    o << 1;
                } else {
                    o << 0;
                }
            }
            
            string siteinfo = o.str();

            for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
                cout << siteinfo;
                const string& sampleName = s->first;
                cout << "\t" << sampleName;
                map<string, vector<string> >& sample = s->second;
                for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
                    if (sample.find(*f) != sample.end()) {
                        cout << "\t" << join(sample[*f], ",");
                    } else {
                        cout << "\t" << nullval;
                    }
                }
                cout << endl;
            }
        }
    }

    return 0;

}
Example #25
0
int main(int argc, char** argv) {

    string vcfFileName;
    string fastaFileName;
    int windowsize = 30;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hw:r:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'w':
            windowsize = atoi(optarg);
            break;

	    case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference reference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required for haplotype allele generation" << endl;
        exit(1);
    }
    reference.open(fastaFileName);

    // pattern
    // when variants are within windowSize from each other, build up local haplotypes
    // establish all the haplotypes which exist within the window using genotypes+allele#+position map
    // generate a haplotype allele string for each unique haplotype
    // for completeness retain phasing information in the genotypes
    // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
    // if the variants are outside of the windowSize, just write out the record

    Variant var(variantFile);
    Variant outputVar(variantFile);

    cout << variantFile.header << endl;

    // get the first distances
    vector<Variant> cluster;

    while (variantFile.getNextVariant(var) || !cluster.empty()) {

        bool haplotypeCluster = false;

        if (variantFile.done()) {
            if (cluster.size() >= 1) {
                haplotypeCluster = true;
            } else {
                cout << cluster.front() << endl;
                cluster.clear();
            }
        } else if (isPhased(var)) {
            if (cluster.empty()
                || cluster.back().sequenceName == var.sequenceName
                && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) {
                cluster.push_back(var);
            } else {
                if (cluster.size() == 1) {
                    cout << cluster.front() << endl;
                    cluster.clear();
                    if (!variantFile.done()) {
                        cluster.push_back(var);
                    }
                } else {
                    haplotypeCluster = true;
                }
            }
        } else { // not phased
            if (cluster.empty()) {
                cout << var << endl;
            } else if (cluster.size() == 1) {
                cout << cluster.front() << endl;
                cout << var << endl;
            } else {
                haplotypeCluster = true;
            }
        }

        // we need to deal with the current cluster, as our next var is outside of bounds
        // process the last cluster if it's more than 1 var
        if (haplotypeCluster) {
            /*            cerr << "cluster: ";
            for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                cerr << " " << v->position;
            }
            cerr << endl;
            */

            // generate haplotype alleles and genotypes!
            // get the reference sequence across the haplotype in question
            string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
                                                                 cluster.front().position - 1,
                                                                 cluster.back().position
                                                                 + cluster.back().ref.size() - cluster.front().position);

            // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
            map<string, vector<vector<int> > > sampleHaplotypes;
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                // build the haplotype using the genotype fields in the variant cluster
                // only build haplotypes for samples with complete information
                string& sampleName = *s;
                vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
		
                bool completeCoverage = true;
                // ensure complete genotype coverage over the haplotype cluster
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    if (v->samples.find(sampleName) == v->samples.end()
                        || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
                        completeCoverage = false;
                        break;
                    }
                }
                if (!completeCoverage) {
                    continue; // skip samples without complete coverage
                }
		
                // what's the ploidy?
                {
                    string& gt = cluster.front().samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
                        vector<int> haplotype;
                        haplotypes.push_back(haplotype);
                    }
                }
		
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    string& gt = v->samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    vector<string>::iterator g = gtspec.begin();
                    for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
                        int j;
                        convert(*g, j);
                        h->push_back(j);
                    }
                }
            }

            set<vector<int> > uniqueHaplotypes;
            for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
                 hs != sampleHaplotypes.end(); ++hs) {
                vector<vector<int> >& haps = hs->second;
                for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
                    uniqueHaplotypes.insert(*h);
                }
            }
	    
            // write new haplotypes
            map<vector<int>, string> haplotypeSeqs;
            map<vector<int>, int> haplotypeIndexes;
            map<int, string> alleles;
	    
            int impossibleHaplotypes = 0;

            // always include the reference haplotype as 0
            // when we come to it in the haplotypes, we'll ignore it
            int alleleIndex = 1;
            for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {

                /*
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) {
                    cerr << *z;
                }
                cerr << endl;
                */

                string haplotype = referenceHaplotype;
                bool isreference = true;
                bool impossibleHaplotype = false;
                int referenceInsertOffset = 0;
                int j = 0; // index into variant cluster
                int lastpos = 0;
                int lastrefend = 0;
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
                    int i = *z;
                    if (i != 0) {
                        isreference = false;
                        Variant& vartoInsert = cluster.at(j);
                        string& alternate = vartoInsert.alleles.at(i);
                        if (vartoInsert.position < lastrefend) {
                            cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
                            impossibleHaplotype = true;
                            break;
                        } else {
                            //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl;
                            //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl;
                            haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
                                              vartoInsert.ref.size(), alternate);
                            if (alternate.size() != vartoInsert.ref.size()) {
                                referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
                            }
                            lastpos = vartoInsert.position;
                            lastrefend = vartoInsert.position + vartoInsert.ref.size();
                        }
                    }
                }
		
                if (impossibleHaplotype) {
                    ++impossibleHaplotypes;
                    haplotypeIndexes[*u] = -1; // indicates impossible haplotype
                    impossibleHaplotype = false;
                } else if (isreference) {
                    alleles[0] = haplotype;
                    haplotypeIndexes[*u] = 0;
                } else {
                    alleles[alleleIndex] = haplotype;
                    haplotypeIndexes[*u] = alleleIndex;
                    ++alleleIndex;
                }
                haplotypeSeqs[*u] = haplotype;
                // if there's not a reference allele, add it
                if (alleles.find(0) == alleles.end()) {
                    alleles[0] = referenceHaplotype;
                    // nb, there is no reference haplotype among
                    // the samples, so we don't have to add it to
                    // the haplotypeIndexes
                }
            }

            outputVar.ref = alleles[0];
            outputVar.alt.clear();
            for (int i = 1; i < alleleIndex; ++i) {
                outputVar.alt.push_back(alleles[i]);
            }
	    
            outputVar.sequenceName = cluster.front().sequenceName;
            outputVar.position = cluster.front().position;
            outputVar.filter = ".";
            outputVar.id = ".";
            outputVar.info = cluster.front().info;
            outputVar.samples.clear();
            outputVar.format = cluster.front().format;
	    
            // now the genotypes
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                string& sampleName = *s;
                vector<string> gt;
                vector<vector<int> > & hs = sampleHaplotypes[sampleName];
                for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
                    int hi = haplotypeIndexes[*h];
                    if (hi != -1) {
                        gt.push_back(convert(hi));
                    } else {
                        // nonexistent or impossible haplotype
                        gt.push_back(".");
                    }
                }
                if (gt.size() != 0) {
                    outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
                }
            }
            if (cluster.size() - impossibleHaplotypes < 2) {
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    cout << *v << endl;
                }
            } else {
                if (!outputVar.alt.empty()) {
                    cout << outputVar << endl;
                } else {
                    cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
                }
            }
            cluster.clear();
            if (!variantFile.done()) cluster.push_back(var);
        }
    }

    exit(0);  // why?
    return 0;

}
Example #26
0
/**
 * Merges INFO annotations from a second VCF file into the records of a first.
 *
 * Usage: <prog> <vcf file> <vcf file>   ("-" reads that file from stdin)
 *
 * Both inputs are walked in lock-step, ordered by (sequenceName, position)
 * using plain string / numeric comparison.  Every record of the first file is
 * written to stdout; records that share sequence name and position with a
 * record of the second file are passed through addInfo() first.
 *
 * Returns 0 on success, 1 on a usage error or when an input cannot be opened.
 */
int main(int argc, char** argv) {

    if (argc != 3) {
        cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
             << "Adds info fields from the second file which are not present in the first vcf file." << endl;
        return 1;
    }

    string filenameA = argv[1];
    string filenameB = argv[2];

    // this also rejects "- -", i.e. reading both inputs from stdin
    if (filenameA == filenameB) {
        cerr << "it won't help to add info data from the same file!" << endl;
        return 1;
    }

    VariantCallFile variantFileA;
    if (filenameA == "-") {
        variantFileA.open(std::cin);
    } else {
        variantFileA.open(filenameA);
    }

    VariantCallFile variantFileB;
    if (filenameB == "-") {
        variantFileB.open(std::cin);
    } else {
        variantFileB.open(filenameB);
    }

    // report which input failed instead of exiting silently
    if (!variantFileA.is_open()) {
        cerr << "could not open " << filenameA << endl;
        return 1;
    }
    if (!variantFileB.is_open()) {
        cerr << "could not open " << filenameB << endl;
        return 1;
    }

    Variant varA(variantFileA);
    Variant varB(variantFileB);

    // prime both cursors with the first record of each file
    variantFileA.getNextVariant(varA);
    variantFileB.getNextVariant(varB);

    // the output header is the first file's header combined with the second's
    // via unionInfoHeaderLines()
    variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header);

    cout << variantFileA.header << endl;

    // Merge loop: each pass advances whichever file is behind until the two
    // cursors sit on the same (sequence, position), then annotates and emits
    // the matching records.
    do {

        // skip records of B that lie before the current record of A
        while (!variantFileB.done()
               && (varB.sequenceName < varA.sequenceName
                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
            ) {
            variantFileB.getNextVariant(varB);
        }

        // records of A that lie before the current record of B have no
        // counterpart: emit them unchanged
        while (!variantFileA.done()
               && (varA.sequenceName < varB.sequenceName
                   || (varA.sequenceName == varB.sequenceName && varA.position < varB.position))
            ) {
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
        }

        // A may have moved past B again; let B catch up before testing for a match
        while (!variantFileB.done()
               && (varB.sequenceName < varA.sequenceName
                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
            ) {
            variantFileB.getNextVariant(varB);
        }

        // same sequence and position: annotate A from B, emit, advance both
        while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) {
            addInfo(varA, varB);
            cout << varA << endl;
            variantFileA.getNextVariant(varA);
            variantFileB.getNextVariant(varB);
        }

    } while (!variantFileA.done() && !variantFileB.done());

    // B is exhausted: flush the pending record of A and everything after it
    if (!variantFileA.done()) {
        cout << varA << endl;
        while (variantFileA.getNextVariant(varA)) {
            cout << varA << endl;
        }
    }

    return 0;

}
Example #27
0
/**
 * Tags each record of a VCF file with the relationship between the genotypes
 * of two samples (first = germline, second = somatic).
 *
 * Positional arguments after the options: <tag> <sample> <sample> <vcf file>
 * ("-" reads the VCF from stdin).  Options: -h/--help, -s/--strict.
 *
 * For every record containing both samples, INFO[<tag>] is reset and then set
 * to "germline", "somatic" or "reversion" (or left empty when no rule
 * applies).  With --strict a "somatic" call additionally requires the germline
 * sample's first AO value to be 0.  When both samples carry GQ, INFO[SSC] is
 * set to min(QUAL, germline GQ, somatic GQ).  All records are written to
 * stdout.
 *
 * Returns 0 on success, 1 when the input cannot be opened.
 */
int main(int argc, char** argv) {

    if (argc < 5) {
        printSummary(argv);
        exit(0);
    }

    bool strict = false;
    int c;

    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"strict",  no_argument, 0, 's'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hs",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 's':
            strict = true;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // tag, two samples and the file name must remain after the options;
    // otherwise argv[optind] below could be NULL
    if (argc - optind < 4) {
        printSummary(argv);
        exit(1);
    }

    string tag = argv[optind];

    // everything between the tag and the trailing file name is a sample name
    vector<string> samples;
    for (int i = optind+1; i < argc - 1; ++i) {
        samples.push_back(argv[i]);
    }

    string filename = argv[argc-1];

    VariantCallFile variantFile;
    if (filename == "-") {
        variantFile.open(std::cin);
    } else {
        variantFile.open(filename);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open " << filename << endl;
        return 1;
    }

    // exactly one germline and one somatic sample are supported
    assert(samples.size() == 2);

    Variant var(variantFile);

    // declare the classification tag in the header
    string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples";
    for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) {
        line += " " + *s;
    }
    line += " have different genotypes\">";
    variantFile.addHeaderLine(line);

    variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">");

    // write the new header
    cout << variantFile.header << endl;

    // annotate and print every record
    while (variantFile.getNextVariant(var)) {
        map<string, map<string, vector<string> > >::iterator germlineItr = var.samples.find(samples.front());
        map<string, map<string, vector<string> > >::iterator somaticItr = var.samples.find(samples.back());
        if (germlineItr != var.samples.end() && somaticItr != var.samples.end()) {
            map<string, vector<string> >& germline = germlineItr->second;
            map<string, vector<string> >& somatic = somaticItr->second;

            var.info[tag].clear(); // remove previous

            // classify only when both samples actually carry a genotype;
            // calling front() on a missing/empty field is undefined behaviour
            map<string, vector<string> >::iterator germlineGT = germline.find("GT");
            map<string, vector<string> >::iterator somaticGT = somatic.find("GT");
            if (germlineGT != germline.end() && !germlineGT->second.empty()
                && somaticGT != somatic.end() && !somaticGT->second.empty()) {

                map<int, int> gtGermline = decomposeGenotype(germlineGT->second.front());
                map<int, int> gtSomatic  = decomposeGenotype(somaticGT->second.front());

                // first AO value of the germline sample; stays 0 when AO is absent
                int germlineAltCount = 0;
                map<string, vector<string> >::iterator germlineAO = germline.find("AO");
                if (germlineAO != germline.end() && !germlineAO->second.empty()) {
                    convert(germlineAO->second.front(), germlineAltCount);
                }

                if (gtGermline == gtSomatic) {
                    var.info[tag].push_back("germline");
                } else if ((isHet(gtGermline) && isHomNonRef(gtSomatic))
                           || (isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic)))) {
                    // strict mode only accepts the call when the germline
                    // sample's AO value is 0
                    if (!strict || germlineAltCount == 0) {
                        var.info[tag].push_back("somatic");
                    }
                } else if (isHom(gtGermline) && isHet(gtSomatic)) {
                    if (var.alt.size() == 1) {
                        var.info[tag].push_back("reversion");
                    } else {
                        var.info[tag].push_back("somatic");
                    }
                }
            }

            // somatic score: the weakest of site quality and the two genotype qualities
            map<string, vector<string> >::iterator germlineGQItr = germline.find("GQ");
            map<string, vector<string> >::iterator somaticGQItr = somatic.find("GQ");
            if (germlineGQItr != germline.end() && !germlineGQItr->second.empty()
                && somaticGQItr != somatic.end() && !somaticGQItr->second.empty()) {
                double germlineGQ = 0.0;
                convert(germlineGQItr->second.front(), germlineGQ);
                double somaticGQ = 0.0;
                convert(somaticGQItr->second.front(), somaticGQ);
                double somaticScore = min(var.quality, min(germlineGQ, somaticGQ));
                var.info["SSC"].clear();
                var.info["SSC"].push_back(convert(somaticScore));
            }
        }
        cout << var << endl;
    }

    return 0;

}
Example #28
0
int main(int argc, char** argv) {

    vector<string> regions;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"region", required_argument, 0, 'r'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hr:",
                         long_options, &option_index);

      /* Detect the end of the options. */
          if (c == -1)
            break;
 
          switch (c)
            {
            case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
              break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
              printf (" with arg %s", optarg);
            printf ("\n");
            break;

          case 'h':
            printSummary(argv);
            exit(0);
            break;

          case 'r':
            regions.push_back(optarg);
            break;
          
          default:
            abort ();
          }
      }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    int variantAlleles = 0;
    int variantSites = 0;
    int snps = 0;
    int transitions = 0;
    int transversions = 0;
    int totalinsertions = 0;
    int totaldeletions = 0;
    int insertedbases = 0;
    int deletedbases = 0;
    int totalmnps = 0;
    int totalcomplex = 0;
    map<int, int> insertions;
    map<int, int> deletions;
    map<int, int> mnps;
    map<int, int> complexsubs;

    do {

        if (!inputFilename.empty() && !regions.empty()) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            ++variantSites;
            map<string, vector<VariantAllele> > alternates = var.parsedAlternates();
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                ++variantAlleles;
                string& alternate = *a;
                if (var.ref.size() == alternate.size()) {
                    if (var.ref.size() == 1) {
                        ++snps;
                        if (isTransition(var.ref, alternate)) {
                            ++transitions;
                        } else {
                            ++transversions;
                        }
                    } else {
                        ++totalmnps;
                        if (alternates[alternate].size() > 1) {
                        } else {
                            VariantAllele& va = alternates[alternate].front();
                            ++mnps[va.alt.size()]; // not entirely correct
                        }
                    }
                } else if (var.ref.size() > alternate.size()) {
                    int diff = var.ref.size() - alternate.size();
                    deletedbases += diff;
                    if (alternates[alternate].size() > 1) {
                        ++totalcomplex;
                        ++complexsubs[-diff];
                    } else {
                        ++totaldeletions;
                        ++deletions[diff];
                    }
                } else {
                    int diff = alternate.size() - var.ref.size();
                    insertedbases += diff;
                    if (alternates[alternate].size() > 1) {
                        ++totalcomplex;
                        ++complexsubs[diff];
                    } else {
                        ++totalinsertions;
                        ++insertions[diff];
                    }
                }
            }
        }

    } while (regionItr != regions.end());

    // find the maximum indel size
    int maxindel = 0;
    for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }
    for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
        if (i->first > maxindel) {
            maxindel = i->first;
        }
    }

    // and maximum mnp
    int maxmnp = 0;
    for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
        if (i->first > maxmnp) {
            maxmnp = i->first;
        }
    }

    // now print the results

    cout << "total variant sites:\t" << variantSites << endl
         << "total variant alleles:\t" << variantAlleles << endl
         << endl
         << "snps:\t" << snps << endl
         << "indels:\t" << totalinsertions + totaldeletions << endl
         << "mnps:\t" << totalmnps << endl
         << "complex:\t" << totalcomplex << endl
         << endl
         << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
         << endl
         << "ins/del length frequency distribution" << endl
         << "length\tins\tdel\tins/del" << endl;
    for (int i = 1; i <= maxindel; ++i) {
        int ins = insertions[i];
        int del = deletions[i];
        cout << i << "\t"
             << (ins > 0 ? convert(ins) : "" ) << "\t"
             << (del > 0 ? convert(del) : "") << "\t"
             << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
             << endl;
    }
    cout << endl
         << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl
         << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl
         << endl
         << "mnp length frequency distribution" << endl
         << "length\tcount" << endl;
    for (int i = 2; i <= maxmnp; ++i) {
        int mnp = mnps[i];
        cout << i << "\t"
             << (mnp > 0 ? convert(mnp) : "")
             << endl;
    }
    cout << endl;

    cout << "complex event frequency distribution" << endl
         << "length\tcount" << endl;
    for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }

    return 0;

}
Example #29
0
/**
 * Filters or tags the records of a VCF file.
 *
 * Options:
 *   -f/--info-filter <spec>      filter evaluated against each record / allele
 *   -g/--genotype-filter <spec>  filter applied to the samples of each record
 *   -s/--filter-sites            evaluate info filters per site, not per allele
 *   -t/--tag <tag>               tag passing records instead of dropping failures
 *   -a/--allele-tag <id>         write the per-allele result to INFO[<id>]
 *   -v/--invert                  invert the result (applied in --filter-sites mode)
 *   -o/--or                      passed to passesFilters() as its logicalOr flag
 *   -r/--region <region>         restrict to a region (repeatable, named file only)
 *   -h/--help
 * A single trailing positional argument names the input file; otherwise stdin
 * is read.
 *
 * Returns 0 on success, 1 when the input cannot be opened.
 */
int main(int argc, char** argv) {

    int c;
    bool invert = false;
    bool logicalOr = false;
    bool filterSites = false;
    vector<string> infofilterStrs;
    vector<VariantFilter> infofilters;
    vector<string> genofilterStrs;
    vector<VariantFilter> genofilters;
    string tag = "";
    string filterSpec;
    string alleleTag;
    vector<string> regions;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"filter-sites", no_argument, 0, 's'},
            {"info-filter",  required_argument, 0, 'f'},
            {"genotype-filter",  required_argument, 0, 'g'},
            {"tag", required_argument, 0, 't'},
            {"allele-tag", required_argument, 0, 'a'},
            {"invert", no_argument, 0, 'v'},
            {"or", no_argument, 0, 'o'},
            {"region", required_argument, 0, 'r'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hvsof:g:t:r:a:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            filterSpec += " " + string(optarg);
            infofilterStrs.push_back(string(optarg));
            break;

        case 's':
            filterSites = true;
            break;

        case 'a':
            alleleTag = optarg;
            break;

        case 'g':
            filterSpec += " genotypes filtered with: " + string(optarg);
            genofilterStrs.push_back(string(optarg));
            break;

        case 't':
            tag = optarg;
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case 'v':
            invert = true;
            break;

        case 'o':
            logicalOr = true;
            break;

        case 'r':
            regions.push_back(optarg);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // strip the leading " "; substr(1) on an empty string would throw
    // std::out_of_range when neither -f nor -g was given
    if (!filterSpec.empty()) {
        filterSpec = filterSpec.substr(1);
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) {
        infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes));
    }

    for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) {
        genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes));
    }

    // Rebuild the header with a ##filter line inserted before the first line
    // mentioning INFO (or before the last line when there is none).  The spec
    // string itself is kept intact because the allele-tag description below
    // quotes it as well.
    bool filterLineAdded = filterSpec.empty();
    vector<string> headerlines = split(variantFile.header, "\n");
    variantFile.header.clear();
    for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) {
        bool lastLine = (l + 1 == headerlines.end());
        if (!filterLineAdded && (l->find("INFO") != string::npos || lastLine)) {
            variantFile.header += "##filter=\"" + filterSpec + "\"\n";
            filterLineAdded = true;
        }
        variantFile.header += *l + (lastLine ? "" : "\n");
    }

    if (!alleleTag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' if not, filters are: " + filterSpec + ".\">");
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);

    vector<string>::iterator regionItr = regions.begin();

    do {

        // with a named input file, each pass of the loop handles one region
        if (!inputFilename.empty() && !regions.empty()) {
            string regionStr = *regionItr++;
            variantFile.setRegion(regionStr);
        }

        while (variantFile.getNextVariant(var)) {
            // genotype filters act on the samples of the record
            for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) {
                f->removeFilteredGenotypes(var);
            }

            if (infofilters.empty()) {
                // nothing to decide per record: echo the raw input line unless
                // genotype filtering may have modified the record
                if (genofilters.empty()) {
                    cout << variantFile.line << endl;
                } else {
                    cout << var << endl;
                }
                continue;
            }

            if (filterSites) {
                // site mode: the record as a whole passes or fails
                bool passes = passesFilters(var, infofilters, logicalOr);
                if (invert) {
                    passes = !passes;
                }
                if (passes) {
                    if (!tag.empty()) {
                        if (alleleTag.empty()) {
                            var.addFilter(tag);
                        } else {
                            // one tag per alternate allele
                            var.info[alleleTag].assign(var.alt.size(), tag);
                        }
                    }
                    cout << var << endl;
                } else if (!tag.empty()) {
                    // when tagging, failing records are kept, just untagged
                    cout << var << endl;
                }
            } else {
                // allele mode: evaluate every alternate allele on its own
                vector<string> failingAlts;
                vector<string> passingAlts;
                vector<bool> passes;
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    if (!passesFilters(var, infofilters, logicalOr, *a)) {
                        failingAlts.push_back(*a);
                        passes.push_back(false);
                    } else {
                        passingAlts.push_back(*a);
                        passes.push_back(true);
                    }
                }
                if (tag.empty()) {
                    // no tag: drop the failing alts, and drop the whole record
                    // when no alt passes
                    if (failingAlts.size() < var.alt.size()) {
                        for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) {
                            var.removeAlt(*a);
                        }
                        cout << var << endl;
                    }
                } else {
                    // tagging: keep every record and mark the passing ones
                    if (alleleTag.empty()) {
                        if (!passingAlts.empty()) {
                            var.addFilter(tag);
                        }
                    } else {
                        var.info[alleleTag].clear();
                        for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) {
                            if (*p) {
                                var.info[alleleTag].push_back(tag);
                            } else {
                                var.info[alleleTag].push_back(".");
                            }
                        }
                    }
                    cout << var << endl;
                }
            }
        }

    // regionItr only advances for a named input file; without the filename
    // check, "-r ..." combined with stdin input would loop here forever
    } while (!inputFilename.empty() && regionItr != regions.end());

    return 0;

}
Example #30
0
/**
 * Locally re-aligns every alternate allele of a VCF file against the
 * reference with Smith-Waterman and reports the resulting alignments.
 *
 * Options (all but -h, -z take a value):
 *   -r/--reference <fasta>       reference sequence (required)
 *   -w/--ref-window-size <n>     reference flank on each side (default 100)
 *   -s/--alt-window-size <n>     flank attached to the alt allele (default 50)
 *   -m, -x, -o, -e               match / mismatch / gap-open / gap-extend scores
 *   -z/--entropy-gap-open        enable the entropy gap penalty
 *   -R/--repeat-gap-extend <p>   enable the repeat gap extension penalty
 *   -a/--adjust-vcf <tag>        store the remapped CIGARs in INFO[<tag>] and
 *                                print the adjusted record
 *   -h/--help
 * A single trailing positional argument names the input file; otherwise stdin
 * is read.
 *
 * NOTE(review): the output interleaves blank lines, CIGAR strings and
 * "ref:"/"alt:" alignment rows with the records, so it is not plain VCF; this
 * looks like debugging output left enabled and is preserved as found.
 *
 * Returns 0 on success; exits with 1 on bad options or unreadable input.
 */
int main(int argc, char** argv) {

    string fastaFileName;
    int windowsize = 100;
    int altwindowsize = 50;

    // constants for SmithWaterman algorithm
    float matchScore = 10.0f;
    float mismatchScore = -9.0f;
    float gapOpenPenalty = 15.0f;
    float gapExtendPenalty = 6.66f;

    bool useEntropy = false;
    bool useRepeatGapExtendPenalty = false;
    float repeatGapExtendPenalty = 1;

    bool adjustVcf = false;
    string adjustedTag = "remappedCIGAR";

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"ref-window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {"match-score", required_argument, 0, 'm'},
                {"mismatch-score", required_argument, 0, 'x'},
                {"gap-open-penalty", required_argument, 0, 'o'},
                {"gap-extend-penalty", required_argument, 0, 'e'},
                {"alt-window-size", required_argument, 0, 's'},
                {"entropy-gap-open", no_argument, 0, 'z'},
                // must take an argument like the short form "R:"; the handler
                // below reads optarg, which is NULL for a no_argument option
                {"repeat-gap-extend", required_argument, 0, 'R'},
                {"adjust-vcf", required_argument, 0, 'a'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

        case 'w':
            windowsize = atoi(optarg);
            break;

        case 'a':
            adjustVcf = true;
            adjustedTag = optarg;
            break;

        case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case 'm':
            matchScore = atof(optarg);
            break;

        case 'x':
            mismatchScore = atof(optarg);
            break;

        case 'o':
            gapOpenPenalty = atof(optarg);
            break;

        case 'e':
            gapExtendPenalty = atof(optarg);
            break;

        case 's':
            altwindowsize = atoi(optarg);
            break;

        case 'z':
            useEntropy = true;
            break;

        case 'R':
            useRepeatGapExtendPenalty = true;
            repeatGapExtendPenalty = atof(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // the alt flanks are cut out of the reference window, so they cannot be
    // larger than it; otherwise the substr() offsets below go negative
    if (windowsize < 0 || altwindowsize < 0 || altwindowsize > windowsize) {
        cerr << "window sizes must be non-negative and the alt window may not exceed the reference window" << endl;
        exit(1);
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference freference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        freference.open(fastaFileName);
    }

    if (adjustVcf) {
        // record the full command line in the description of the new tag
        vector<string> commandline;
        for (int i = 0; i < argc; ++i)
            commandline.push_back(argv[i]);
        variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using " + join(commandline, " ") +"\">");
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        cout << endl;
        cout << var << endl;

        // per-alt results, collected for the optional VCF adjustment below
        vector<vector<pair<int, char> > > cigars;
        vector<int> positionDiffs;
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            cout << endl;

            // try to remap locally

            // reference window: windowsize bases on each side of the ref allele
            string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());

            // passed to sw align
            unsigned int referencePos;
            string cigar;

            string& alternate = *a;

            // query: alt allele framed by altwindowsize reference bases per side
            string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);

            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
            if (useEntropy) sw.EnableEntropyGapPenalty(1);
            if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
            sw.Align(referencePos, cigar, reference, alternateQuery);

            int altpos = 0;
            int refpos = 0;
            int len;
            string slen;
            vector<pair<int, char> > cigarData;

            string ref = reference.substr(referencePos);
            positionDiffs.push_back(referencePos); // TODO this... is borked

            // gapped alignment rows, built while walking the CIGAR
            stringstream refss;
            stringstream altss;

            // NOTE(review): the CIGAR is printed twice when not adjusting and
            // once otherwise -- preserved as found
            if (!adjustVcf) cout << cigar << endl;
            cout << cigar << endl;

            // digits accumulate in slen; each operation letter consumes them
            for (string::iterator op = cigar.begin(); op != cigar.end(); ++op) {
                switch (*op) {
                case 'I':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // an insertion that starts inside the leading flank is
                    // recorded as a match
                    if (altpos < altwindowsize) {
                        cigarData.push_back(make_pair(len, 'M'));
                    } else {
                        cigarData.push_back(make_pair(len, *op));
                    }
                    altss << alternateQuery.substr(altpos, len);
                    refss << string(len, '-');
                    altpos += len;
                    break;
                case 'D':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // a deletion that starts inside the leading flank is not recorded
                    if (altpos >= altwindowsize) {
                        cigarData.push_back(make_pair(len, *op));
                    }
                    refss << ref.substr(refpos, len);
                    altss << string(len, '-');
                    refpos += len;
                    break;
                case 'M':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // split the aligned run into matches (M) and mismatches (X),
                    // extending the previous element when the kind repeats
                    for (int i = 0; i < len; ++i) {
                        char kind = (ref.at(refpos + i) == alternateQuery.at(altpos + i)) ? 'M' : 'X';
                        if (!cigarData.empty() && cigarData.back().second == kind) {
                            cigarData.back().first++;
                        } else {
                            cigarData.push_back(make_pair(1, kind));
                        }
                    }
                    refss << ref.substr(refpos, len);
                    altss << alternateQuery.substr(altpos, len);
                    refpos += len;
                    altpos += len;
                    break;
                case 'S':
                    len = atoi(slen.c_str());
                    slen.clear();
                    cigarData.push_back(make_pair(len, *op));
                    refss << ref.substr(refpos, len);
                    //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior
                    refpos += len;
                    altpos += len;
                    break;
                default:
                    len = 0;
                    slen += *op;
                    break;
                }
            }

            cout << "ref:\t" << refss.str() << endl;
            cout << "alt:\t" << altss.str() << endl;
            if (adjustVcf) {
                cigars.push_back(cigarData);
            }

        }

        // the adjustment reads the first and last element of every CIGAR, so
        // it needs at least one alt and no empty CIGAR
        bool adjustable = adjustVcf && !cigars.empty();
        for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); adjustable && cg != cigars.end(); ++cg) {
            if (cg->empty()) {
                adjustable = false;
            }
        }

        if (adjustable) {
            int substart = cigars.front().front().first;
            int subend = cigars.front().back().first;

            // find the shortest leading and trailing match across all alts;
            // that much flank can be trimmed from every CIGAR
            for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); cg != cigars.end(); ++cg) {
                if (cg->front().second == 'M' && cg->front().first <= substart) {
                    substart = cg->front().first;
                    // keep one matching base unless the next element is a mismatch
                    if (cg->size() > 1 && cg->at(1).second != 'X') {
                        --substart;
                    }
                }
                if (cg->back().second == 'M' && cg->back().first <= subend) {
                    subend = cg->back().first;
                }
            }

            // adjust the cigars and get the new reference length
            int reflen = 0;
            for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); cg != cigars.end(); ++cg) {
                cg->front().first -= substart;
                cg->back().first -= subend;
                int crf = cigarRefLen(*cg);
                if (crf > reflen)
                    reflen = crf;
                var.info[adjustedTag].push_back(joinCigar(*cg));
            }

            // find the lowest positional difference
            // NOTE(review): pdiff starts at 0 and only decreases, while the
            // stored offsets appear to be non-negative, so this probably never
            // changes it (see the TODO where positionDiffs is filled)
            int pdiff = 0;
            for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
                if (*d + altwindowsize < pdiff)
                    pdiff = *d + altwindowsize;
            }

            // adjust the variant position
            var.position += pdiff;

            // adjust the reference string
            var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);

            cout << var << endl;
        }
    }

    return 0;

}