Beispiel #1
0
//******************************************************************************
// ExtractDNA
//******************************************************************************
// Walks every record of _bed and, for each valid feature that spans at least
// one base, fetches the corresponding bases from the FASTA file _dbFile and
// passes the (feature, sequence) pair to ReportDNA().
//
// Skipped with a message on stderr:
//   - features whose chromosome has no entry in the FASTA index,
//   - features whose start or end lies beyond the chromosome length,
//   - zero-length features.
//
// Terminates the process with status 1 when _dbFile cannot be opened.
void Bed2Fa::ExtractDNA() {

    /* Make sure that we can open all of the files successfully */

    // open the fasta database for reading; this stream only serves as an
    // "is the file readable?" probe, the bases are fetched through
    // FastaReference below.
    ifstream faDb(_dbFile.c_str(), ios::in);
    if ( !faDb ) {
        cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
        exit (1);
    }

    // open and memory-map genome file.
    // The reference lives on the stack so it is released when this function
    // returns (it used to be heap-allocated and was never deleted).
    FastaReference fr;
    const bool memmap = true;
    fr.open(_dbFile, memmap);

    BED bed, nullBed;
    string sequence;

    _bed->Open();
    while (_bed->GetNextBed(bed)) {
        if (_bed->_status == BED_VALID) {
            // make sure we are extracting >= 1 bp
            if (bed.zeroLength == false) {

                size_t seqLength = fr.sequenceLength(bed.chrom);
                // seqLength > 0 means chrom was found in index.
                // seqLength == 0 otherwise.
                if (seqLength) {
                    // make sure this feature will not exceed the end of the chromosome.
                    if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
                    {
                        int length = bed.end - bed.start;
                        sequence = fr.getSubSequence(bed.chrom, bed.start, length);
                        ReportDNA(bed, sequence);
                    }
                    else
                    {
                        cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
                            << bed.chrom << " size (" << seqLength << " bp).  Skipping." << endl;
                    }
                }
                else
                {
                    cerr << "WARNING. chromosome (" << bed.chrom <<
                            ") was not found in the FASTA file. Skipping."<< endl;
                }
            }
            // handle zeroLength
            else {
                // NOTE(review): the +1/-1 presumably undoes a coordinate
                // adjustment applied to zero-length records by the BED
                // reader -- confirm against the reader.
                cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
            }
            // reset the record so no field leaks into the next iteration
            bed = nullBed;
        }
    }
    _bed->Close();
}
Beispiel #2
0
// Entry point: collapses runs of neighbouring phased variants into single
// haplotype records.
//
// Options (see the getopt table below):
//   -w, --window-size N   clustering window (default 30)
//   -r, --reference FILE  FASTA reference, required
//   -h, --help            print the usage summary
// A single trailing argument is taken as the VCF file name; otherwise the VCF
// is read from stdin.
//
// Phased variants on the same sequence that satisfy the window test are
// gathered in `cluster`. When the cluster is closed (next variant out of
// window, unphased variant, or end of input) the per-sample phased genotypes
// are turned into haplotype allele strings and one combined record is written.
// Unphased variants and single-variant clusters are written through unchanged.
int main(int argc, char** argv) {

    string fastaFileName;
    int windowsize = 30;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hw:r:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

        case 'w':
            windowsize = atoi(optarg);
            break;

        case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference reference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required for haplotype allele generation" << endl;
        exit(1);
    }
    reference.open(fastaFileName);

    // pattern
    // when variants are within windowSize from each other, build up local haplotypes
    // establish all the haplotypes which exist within the window using genotypes+allele#+position map
    // generate a haplotype allele string for each unique haplotype
    // for completeness retain phasing information in the genotypes
    // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
    // if the variants are outside of the windowSize, just write out the record

    Variant var(variantFile);
    Variant outputVar(variantFile);

    cout << variantFile.header << endl;

    // phased variants collected so far that may be merged into one record
    vector<Variant> cluster;

    // keep looping after the last read so that a pending cluster is flushed
    while (variantFile.getNextVariant(var) || !cluster.empty()) {

        bool haplotypeCluster = false;

        if (variantFile.done()) {
            // End of input: flush whatever is pending. The previous code
            // dereferenced cluster.front() when the cluster was empty; there
            // is nothing to print in that case.
            if (!cluster.empty()) {
                haplotypeCluster = true;
            }
        } else if (isPhased(var)) {
            if (cluster.empty()
                || (cluster.back().sequenceName == var.sequenceName
                    && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize)) {
                cluster.push_back(var);
            } else {
                if (cluster.size() == 1) {
                    // a lone variant cannot form a haplotype: emit it and
                    // start a new cluster with the current variant
                    cout << cluster.front() << endl;
                    cluster.clear();
                    if (!variantFile.done()) {
                        cluster.push_back(var);
                    }
                } else {
                    haplotypeCluster = true;
                }
            }
        } else { // not phased
            if (cluster.empty()) {
                cout << var << endl;
            } else if (cluster.size() == 1) {
                cout << cluster.front() << endl;
                cout << var << endl;
                // the pending variant has been written; without this it
                // would be emitted (or merged) a second time later on
                cluster.clear();
            } else {
                haplotypeCluster = true;
            }
        }

        // we need to deal with the current cluster, as our next var is outside of bounds
        // process the last cluster if it's more than 1 var
        if (haplotypeCluster) {
            /*            cerr << "cluster: ";
            for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                cerr << " " << v->position;
            }
            cerr << endl;
            */

            // generate haplotype alleles and genotypes!
            // get the reference sequence across the haplotype in question
            string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
                                                                 cluster.front().position - 1,
                                                                 cluster.back().position
                                                                 + cluster.back().ref.size() - cluster.front().position);

            // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
            map<string, vector<vector<int> > > sampleHaplotypes;
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                // build the haplotype using the genotype fields in the variant cluster
                // only build haplotypes for samples with complete information
                string& sampleName = *s;
                vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];

                bool completeCoverage = true;
                // ensure complete genotype coverage over the haplotype cluster
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    if (v->samples.find(sampleName) == v->samples.end()
                        || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
                        completeCoverage = false;
                        break;
                    }
                }
                if (!completeCoverage) {
                    continue; // skip samples without complete coverage
                }

                // what's the ploidy? one (initially empty) haplotype per
                // '|'-separated entry of the first variant's genotype
                {
                    string& gt = cluster.front().samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
                        vector<int> haplotype;
                        haplotypes.push_back(haplotype);
                    }
                }

                // append this sample's allele index at every clustered variant
                // to the matching haplotype.
                // NOTE(review): assumes every variant in the cluster has the
                // same ploidy for this sample as the first one -- confirm.
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    string& gt = v->samples[sampleName]["GT"].front();
                    vector<string> gtspec = split(gt, "|");
                    vector<string>::iterator g = gtspec.begin();
                    for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
                        // start from 0 so the value is never indeterminate if
                        // the genotype entry cannot be parsed as an integer
                        int j = 0;
                        convert(*g, j);
                        h->push_back(j);
                    }
                }
            }

            set<vector<int> > uniqueHaplotypes;
            for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
                 hs != sampleHaplotypes.end(); ++hs) {
                vector<vector<int> >& haps = hs->second;
                for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
                    uniqueHaplotypes.insert(*h);
                }
            }

            // write new haplotypes
            map<vector<int>, string> haplotypeSeqs;
            map<vector<int>, int> haplotypeIndexes;
            map<int, string> alleles;

            int impossibleHaplotypes = 0;

            // always include the reference haplotype as 0
            // when we come to it in the haplotypes, we'll ignore it
            int alleleIndex = 1;
            for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {

                /*
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) {
                    cerr << *z;
                }
                cerr << endl;
                */

                string haplotype = referenceHaplotype;
                bool isreference = true;
                bool impossibleHaplotype = false;
                // running length difference introduced by the alleles already
                // substituted into `haplotype`
                int referenceInsertOffset = 0;
                int j = 0; // index into variant cluster
                // position just past the reference span of the last inserted allele
                int lastrefend = 0;
                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
                    int i = *z;
                    if (i != 0) {
                        isreference = false;
                        Variant& vartoInsert = cluster.at(j);
                        string& alternate = vartoInsert.alleles.at(i);
                        if (vartoInsert.position < lastrefend) {
                            cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
                            impossibleHaplotype = true;
                            break;
                        } else {
                            //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl;
                            //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl;
                            haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
                                              vartoInsert.ref.size(), alternate);
                            if (alternate.size() != vartoInsert.ref.size()) {
                                referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
                            }
                            lastrefend = vartoInsert.position + vartoInsert.ref.size();
                        }
                    }
                }

                if (impossibleHaplotype) {
                    ++impossibleHaplotypes;
                    haplotypeIndexes[*u] = -1; // indicates impossible haplotype
                    impossibleHaplotype = false;
                } else if (isreference) {
                    alleles[0] = haplotype;
                    haplotypeIndexes[*u] = 0;
                } else {
                    alleles[alleleIndex] = haplotype;
                    haplotypeIndexes[*u] = alleleIndex;
                    ++alleleIndex;
                }
                haplotypeSeqs[*u] = haplotype;
                // if there's not a reference allele, add it
                if (alleles.find(0) == alleles.end()) {
                    alleles[0] = referenceHaplotype;
                    // nb, there is no reference haplotype among
                    // the samples, so we don't have to add it to
                    // the haplotypeIndexes
                }
            }

            outputVar.ref = alleles[0];
            outputVar.alt.clear();
            for (int i = 1; i < alleleIndex; ++i) {
                outputVar.alt.push_back(alleles[i]);
            }

            outputVar.sequenceName = cluster.front().sequenceName;
            outputVar.position = cluster.front().position;
            outputVar.filter = ".";
            outputVar.id = ".";
            outputVar.info = cluster.front().info;
            outputVar.samples.clear();
            outputVar.format = cluster.front().format;

            // now the genotypes
            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
                string& sampleName = *s;
                vector<string> gt;
                vector<vector<int> > & hs = sampleHaplotypes[sampleName];
                for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
                    int hi = haplotypeIndexes[*h];
                    if (hi != -1) {
                        gt.push_back(convert(hi));
                    } else {
                        // nonexistent or impossible haplotype
                        gt.push_back(".");
                    }
                }
                if (gt.size() != 0) {
                    outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
                }
            }
            if (cluster.size() - impossibleHaplotypes < 2) {
                // not enough to merge: write the clustered records unchanged
                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
                    cout << *v << endl;
                }
            } else {
                if (!outputVar.alt.empty()) {
                    cout << outputVar << endl;
                } else {
                    cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
                }
            }
            cluster.clear();
            if (!variantFile.done()) {
                // `var` is the record that closed the cluster. A phased one
                // seeds the next cluster; an unphased one must not enter a
                // phased cluster, so it is written out directly instead.
                if (isPhased(var)) {
                    cluster.push_back(var);
                } else {
                    cout << var << endl;
                }
            }
        }
    }

    // exit() ends the process without destroying the automatic objects above.
    // NOTE(review): kept from the original ("why?") -- confirm whether
    // skipping those destructors is still required.
    exit(0);  // why?
    return 0;

}
Beispiel #3
0
int main (int argc, char** argv)
{
  std::string command;
  std::string fastaFileName;
  std::string seqname;
  std::string longseqname;
  bool dump = false;
  bool buildIndex = false;  // flag to force index building
  bool printEntropy = false;  // entropy printing
  bool readRegionsFromStdin = false;
  std::string region;
  int c;

  while (true)
  {
    static struct option long_options[] =
    {
        /* These options set a flag. */
        {"help", no_argument, 0, 'h'},
        {"index",  no_argument, 0, 'i'},
        {"entropy", no_argument, 0, 'e'},
        {"region", required_argument, 0, 'r'},
        {"stdin", no_argument, 0, 'c'},
        {0, 0, 0, 0}
    };

    /* getopt_long stores the option index here. */
    int option_index = 0;
    c = getopt_long (argc, argv, "hciedr:", long_options, &option_index);

    /* Detect the end of the options. */
    if (c == -1)
      break;

    switch (c)
    {
      case 0:
        /* If this option set a flag, do nothing else now. */
        if (long_options[option_index].flag != 0)
          break;
        printf ("option %s", long_options[option_index].name);
        if (optarg)
          printf (" with arg %s", optarg);
        printf ("\n");
        break;

      case 'e':
        printEntropy = true;
        break;

      case 'c':
        readRegionsFromStdin = true;
        break;

      case 'i':
        buildIndex = true;
        break;

      case 'r':
        region = optarg;
        break;

        case 'd':
            dump = true;
            break;

      case 'h':
        printSummary();
        exit(0);
        break;

      case '?':
        /* getopt_long already printed an error message. */
        printSummary();
        exit(1);
        break;

      default:
        abort ();
    }
  }

  /* Print any remaining command line arguments (not options). */
  if (optind < argc)
  {
    //cerr << "fasta file: " << argv[optind] << std::endl;
    fastaFileName = argv[optind];
  }
  else
  {
    std::cerr << "Please specify a FASTA file." << std::endl;
    printSummary();
    exit(1);
  }

  if (buildIndex)
  {
    FastaIndex* fai = new FastaIndex();
    //cerr << "generating fasta index file for " << fastaFileName << std::endl;
    fai->indexReference(fastaFileName);
    fai->writeIndexFile((std::string) fastaFileName + fai->indexFileExtension());
  }
  
  std::string sequence;  // holds sequence so we can optionally process it

  FastaReference fr;
  fr.open(fastaFileName);

  if (dump)
  {
    for (vector<std::string>::iterator s = fr.index->sequenceNames.begin(); s != fr.index->sequenceNames.end(); ++s)
    {
      std::cout << *s << "\t" << fr.getSequence(*s) << std::endl;
    }

    return 0;
  }

  if (region != "")
  {
    FastaRegion target(region);
    sequence = fr.getTargetSubSequence(target);
  }

  if (readRegionsFromStdin)
  {
    std::string regionstr;

    while (getline(cin, regionstr))
    {
      FastaRegion target(regionstr);

      if (target.startPos == -1)
      {
        std::cout << fr.getSequence(target.startSeq) << std::endl;
      }
      else
      {
        std::cout << fr.getSubSequence(target.startSeq, target.startPos - 1, target.length()) << std::endl;
      }
    }
  }
  else
  {
    if (sequence != "")
    {
      if (printEntropy)
      {
        if (sequence.size() > 0)
        {
          std::cout << shannon_H((char*) sequence.c_str(), sequence.size()) << std::endl;
        }
        else
        {
          std::cerr << "please specify a region or sequence for which to calculate the shannon entropy" << std::endl;
        }
      }
      else
      {
        // if no statistical processing is requested, just print the sequence
        std::cout << sequence << std::endl;
      }
    }
  }

  return 0;
}
Beispiel #4
0
// Entry point: re-aligns indel-containing VCF records against the reference
// and rewrites them in left-aligned form.
//
// Options (see the getopt table below):
//   -r, --reference FILE  FASTA reference, required
//   -w, --window N        initial alignment window (default 150)
//   -h, --help            print the usage summary
// The first non-option argument is the VCF file; stdin is read when absent.
//
// Records without an indel are written through unchanged. For the others the
// alternates are aligned in a window around the variant, the alignments are
// left-aligned, and a new REF/ALT/POS is derived. The rewrite is only applied
// when the haplotype implied by the new record equals the one implied by the
// original; otherwise the original record is written.
int main(int argc, char** argv) {

    int window = 150;
    VariantCallFile variantFile;
    string fastaFileName;

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"reference", required_argument, 0, 'r'},
                {"window", required_argument, 0, 'w'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hw:r:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

        case 'r':
            fastaFileName = optarg;
            break;

        case 'w':
            window = atoi(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        case 'h':
            printSummary(argv);
            break;

        default:
            abort ();
        }
    }

    if (optind < argc) {
        string filename = argv[optind];
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference fastaReference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        fastaReference.open(fastaFileName);
    }

    /*
    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
    if (!parseFlag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
    }
    */
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // if there is no indel, there is nothing to realign
        bool hasIndel = false;
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            if (a->size() != var.ref.size()) {
                hasIndel = true;
                break;
            }
        }
        if (!hasIndel) {
            cout << var << endl;
            continue;
        }

        vector<AltAlignment> alignments;
        string ref;

        // determine window size to prevent mismapping with SW algorithm:
        // at least `scale` times the longest allele
        int currentWindow = window;
        int scale = 2;
        if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
        for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
            if (a->size()*scale > currentWindow) {
                currentWindow = a->size()*scale;
            }
        }

        // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
        while (currentWindow < 2000) { // limit to one step > than this
            string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
            if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
                entropy(refTarget.substr(refTarget.size()/2)) < 1) {
                currentWindow *= scale;
            } else {
                break;
            }
        }

        // do the alignments
        getAlignment(var, fastaReference, ref, alignments, currentWindow);

        // stably left align the alignments
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
            //cerr << a->seq << endl;
            //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
            long int prev = a->pos;
            stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
            //cerr << "after  : " << a->pos << " " << joinCigar(a->cigar) << endl;
            if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
        }
        //cout << var << endl;

        // transform the mappings
        // chop off leading matching bases
        // find the range of bp in the alleles
        // make the new ref allele
        // make the new alt alleles
        // emit the var

        // start with an inverted range so the min/max below shrink it onto
        // the span actually covered by the mismatching parts of the alignments
        long int newPosition = var.position+currentWindow/2;
        long int newEndPosition = var.position-currentWindow/2;
        bool giveUp = false;
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
            // walk the cigar to find the matching prefix and suffix lengths
            Cigar::iterator cig = a->cigar.begin();

            int matchingBpAtStart = 0;
            int matchingBpAtEnd = 0;
            // will be set to true if the first reference position match is broken by a SNP, not an indel
            bool leadingSNP = false;

            while (cig != a->cigar.end()) {
                char op = cig->second[0];
                if (cig == a->cigar.begin()) {
                    if (op != 'M') {
                        cerr << "alignment does not start on matched sequence" << endl;
                        cerr << var << endl;
                        exit(1);
                    }
                    int i = 0;
                    for ( ; i < cig->first; ++i) {
                        if (ref[i] != a->seq[i]) {
                            leadingSNP = true;
                            break;
                        }
                    }
                    matchingBpAtStart = i;
                }
                if (!leadingSNP && cig == (a->cigar.begin()+1)) {
                    // if the first thing we run into is an indel, step back, per VCF spec
                    if (op == 'D' || op == 'I') {
                        --matchingBpAtStart;
                    }
                }
                if (cig == (a->cigar.end()-1)) {
                    if (op != 'M') {
                        // soft clip at end
                        // it'll be hard to interpret this
                        // the alignments sometimes generate this
                        // best thing to do is to move on
                        //cerr << "alignment does not end on matched sequence" << endl;
                        //cout << var << endl;
                        //exit(1);
                        giveUp = true;
                        break;
                    }
                    int i = 0;
                    for ( ; i < cig->first; ++i) {
                        if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
                            break;
                        }
                    }
                    matchingBpAtEnd = i;
                }
                ++cig;
            }

            int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
            //cerr << "ref mismatch length " << refMismatchLength << endl;
            long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
            long int newEnd = newStart + refMismatchLength;
            //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
            newPosition = min(newStart, newPosition);
            newEndPosition = max(newEnd, newEndPosition);
            //cerr << newPosition << " " << newEndPosition << endl;
            //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
        }

        // the alignment failed for some reason, continue
        if (giveUp) {
            cout << var << endl;
            continue;
        }

        //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
        int newRefSize = newEndPosition - newPosition;
        string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
        // get the number of bp to strip from the alts
        int stripFromStart = currentWindow/2 - (var.position - newPosition);
        int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);

        //cerr << "strip from start " << stripFromStart << endl;
        //cerr << "strip from end " << stripFromEnd << endl;

        vector<string> newAlt;
        bool failedAlt = false;
        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
            // substr() throws std::out_of_range when the start offset lies
            // outside the string; treat that as a failed realignment so the
            // original record is written instead of aborting the run
            if (stripFromStart < 0 || static_cast<size_t>(stripFromStart) > a->seq.size()) {
                newAlt.push_back(string());
                failedAlt = true;
                continue;
            }
            string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
            newAlt.push_back(alt);
            if (alt.empty()) failedAlt = true;
        }
        // the rewrite below replaces var.alt wholesale and the haplotype check
        // walks both lists in lockstep, so they must have the same length
        if (newAlt.size() != var.alt.size()) failedAlt = true;

        // check the before/after haplotypes
        bool brokenRealignment = false;
        if (!newRef.empty() && !failedAlt) {
            int slop = 50; // 50 extra bp!
            int haplotypeStart = min(var.position, newPosition) - slop;
            int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
            string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
                                                                      haplotypeEnd - haplotypeStart);
            vector<string>::iterator o = var.alt.begin();
            vector<string>::iterator n = newAlt.begin();
            for ( ; o != var.alt.end() ; ++o, ++n) {
                // map the haplotypes: substituting the old and the new
                // representation into the same reference stretch must give
                // the same sequence
                string oldHaplotype = referenceHaplotype;
                string newHaplotype = referenceHaplotype;
                oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
                newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
                if (oldHaplotype != newHaplotype) {
                    cerr << "broken left alignment!" << endl
                         << "old " << oldHaplotype << endl
                         << "new " << newHaplotype << endl;
                    cerr << "was: " << var << endl;
                    brokenRealignment = true;
                }
            }
        }

        // *if* everything is OK, update the variant
        if (!brokenRealignment && !newRef.empty() && !failedAlt) {
            var.ref = newRef;
            var.alt = newAlt;
            var.position = newPosition;
        }

        cout << var << endl;

        // for each parsedalternate, get the position
        // build a new vcf record for that position
        // unless we are already at the position !
        // take everything which is unique to that allele (records) and append it to the new record
        // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it



        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
    }

    return 0;

}
Beispiel #5
0
int main(int argc, char** argv) {

    int c;
    string fastaRef;
    bool keepFailures = false;
    bool excludeFailures = false;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"fasta-reference",  required_argument, 0, 'f'},
                {"exclude-failures",  no_argument, 0, 'x'},
                {"keep-failures",  no_argument, 0, 'k'},
                //{"length",  no_argument, &printLength, true},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hxkf:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'f':
            fastaRef = optarg;
            break;

        case 'x':
            excludeFailures = true;
            break;

        case 'k':
            keepFailures = true;
            break;
 
        case 'h':
            printSummary(argv);
            exit(0);
            break;

        case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    FastaReference ref;
    ref.open(fastaRef);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    if (keepFailures || excludeFailures) {
        cout << variantFile.header << endl;
    }

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        int refstart = var.position - 1; // convert to 0-based
        string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size());
        if (var.ref != matchedRef) {
            if (keepFailures) {
                cout << var << endl;
            } else if (!excludeFailures) {
                cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at "
                     << var.sequenceName << ":" << var.position << endl;
            }
        } else if (excludeFailures) {
            cout << var << endl;
        }
    }

    return 0;

}
Beispiel #6
0
/**
 * Entry point: locally realign every ALT allele of every VCF record against the
 * reference with Smith-Waterman and print the resulting alignments.
 *
 * Options:
 *   -w, --ref-window-size N     reference bases fetched on each side of REF (default 100)
 *   -s, --alt-window-size N     flanking bases attached to each ALT query (default 50)
 *   -r, --reference FILE        FASTA reference (required)
 *   -m, --match-score N         default 10
 *   -x, --mismatch-score N      default -9
 *   -o, --gap-open-penalty N    default 15
 *   -e, --gap-extend-penalty N  default 6.66
 *   -z, --entropy-gap-open      enable the aligner's entropy gap penalty
 *   -R, --repeat-gap-extend N   enable the repeat gap extension penalty with value N
 *   -a, --adjust-vcf TAG        also rewrite POS/REF and store per-allele CIGARs in INFO/TAG
 *   -h, --help                  call printSummary()
 *
 * The VCF is read from the positional argument when exactly one remains after
 * option parsing, otherwise from stdin. All output goes to stdout; diagnostics
 * go to stderr. Exits with status 1 on bad options, an unreadable VCF or a
 * missing reference.
 *
 * NOTE(review): the reference window start (POS - 1 - windowsize) is not clamped,
 * so records closer than windowsize to the start of a sequence pass a negative
 * offset to getSubSequence — confirm how FastaReference handles that.
 */
int main(int argc, char** argv) {

    string fastaFileName;
    int windowsize = 100;
    int altwindowsize = 50;

    // constants for SmithWaterman algorithm
    float matchScore = 10.0f;
    float mismatchScore = -9.0f;
    float gapOpenPenalty = 15.0f;
    float gapExtendPenalty = 6.66f;

    bool useEntropy = false;
    bool useRepeatGapExtendPenalty = false;
    float repeatGapExtendPenalty = 1;

    bool adjustVcf = false;
    string adjustedTag = "remappedCIGAR";

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                {"help", no_argument, 0, 'h'},
                {"ref-window-size", required_argument, 0, 'w'},
                {"reference", required_argument, 0, 'r'},
                {"match-score", required_argument, 0, 'm'},
                {"mismatch-score", required_argument, 0, 'x'},
                {"gap-open-penalty", required_argument, 0, 'o'},
                {"gap-extend-penalty", required_argument, 0, 'e'},
                {"alt-window-size", required_argument, 0, 's'},
                {"entropy-gap-open", no_argument, 0, 'z'},
                // must take an argument: the 'R' handler reads optarg, and the
                // short form is declared as "R:" below
                {"repeat-gap-extend", required_argument, 0, 'R'},
                {"adjust-vcf", required_argument, 0, 'a'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c) {

        case 'w':
            windowsize = atoi(optarg);
            break;

        case 'a':
            adjustVcf = true;
            adjustedTag = optarg;
            break;

        case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            break;

        case 'm':
            matchScore = atof(optarg);
            break;

        case 'x':
            mismatchScore = atof(optarg);
            break;

        case 'o':
            gapOpenPenalty = atof(optarg);
            break;

        case 'e':
            gapExtendPenalty = atof(optarg);
            break;

        case 's':
            altwindowsize = atoi(optarg);
            break;

        case 'z':
            useEntropy = true;
            break;

        case 'R':
            useRepeatGapExtendPenalty = true;
            repeatGapExtendPenalty = atof(optarg);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    // The ALT query is built with substr(windowsize - altwindowsize, altwindowsize);
    // a negative window or an alt window wider than the reference window would make
    // that offset wrap around and substr throw std::out_of_range.
    if (windowsize < 0 || altwindowsize < 0 || altwindowsize > windowsize) {
        cerr << "window sizes must be non-negative and the alt window size "
             << "must not exceed the reference window size" << endl;
        exit(1);
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference freference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        freference.open(fastaFileName);
    }

    if (adjustVcf) {
        // record the command line in the description of the new INFO tag
        vector<string> commandline;
        for (int i = 0; i < argc; ++i)
            commandline.push_back(argv[i]);
        variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using " + join(commandline, " ") + "\">");
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // the input record is always echoed, preceded by a blank line
        cout << endl;
        cout << var << endl;

        vector<vector<pair<int, char> > > cigars;  // one run-length CIGAR per ALT (adjust mode only)
        vector<int> positionDiffs;                 // alignment start within the window, per ALT
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            cout << endl;

            // try to remap locally: fetch REF plus windowsize bases on each side
            string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());

            // passed to sw align
            unsigned int referencePos;
            string cigar;

            string& alternate = *a;

            // query = last altwindowsize bases of the left flank + ALT
            //       + first altwindowsize bases of the right flank
            string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);

            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
            if (useEntropy) sw.EnableEntropyGapPenalty(1);
            if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
            sw.Align(referencePos, cigar, reference, alternateQuery);

            int altpos = 0;   // cursor into alternateQuery
            int refpos = 0;   // cursor into ref (the window from referencePos onwards)
            int len;
            string slen;      // digits of the CIGAR length currently being read
            vector<pair<int, char> > cigarData;

            string ref = reference.substr(referencePos);
            positionDiffs.push_back(referencePos); // TODO this... is borked

            stringstream refss;  // gapped reference row of the alignment
            stringstream altss;  // gapped alternate row of the alignment

            // NOTE(review): outside adjust mode the CIGAR is printed twice; this looks
            // like leftover debugging but is kept so the output format does not change.
            if (!adjustVcf) cout << cigar << endl;
            cout << cigar << endl;

            // Walk the CIGAR text: digits accumulate in slen, an operation letter
            // consumes them and advances the cursors.
            for (string::iterator op = cigar.begin(); op != cigar.end(); ++op) {
                switch (*op) {
                case 'I':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // an insertion that starts inside the left flank is recorded as a match
                    if (altpos < altwindowsize) {
                        cigarData.push_back(make_pair(len, 'M'));
                    } else {
                        cigarData.push_back(make_pair(len, *op));
                    }
                    altss << alternateQuery.substr(altpos, len);
                    refss << string(len, '-');
                    altpos += len;
                    break;
                case 'D':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // a deletion that starts inside the left flank is not recorded
                    if (altpos >= altwindowsize) {
                        cigarData.push_back(make_pair(len, *op));
                    }
                    refss << ref.substr(refpos, len);
                    altss << string(len, '-');
                    refpos += len;
                    break;
                case 'M':
                    len = atoi(slen.c_str());
                    slen.clear();
                    // split the aligned run into matches (M) and mismatches (X),
                    // extending the previous element when the operation repeats
                    for (int i = 0; i < len; ++i) {
                        char kind = (ref.at(refpos + i) == alternateQuery.at(altpos + i)) ? 'M' : 'X';
                        if (!cigarData.empty() && cigarData.back().second == kind) {
                            cigarData.back().first++;
                        } else {
                            cigarData.push_back(make_pair(1, kind));
                        }
                    }
                    refss << ref.substr(refpos, len);
                    altss << alternateQuery.substr(altpos, len);
                    refpos += len;
                    altpos += len;
                    break;
                case 'S':
                    len = atoi(slen.c_str());
                    slen.clear();
                    cigarData.push_back(make_pair(len, *op));
                    refss << ref.substr(refpos, len);
                    //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior
                    refpos += len;
                    altpos += len;
                    break;
                default:
                    len = 0;
                    slen += *op;
                    break;
                }
            }

            cout << "ref:\t" << refss.str() << endl;
            cout << "alt:\t" << altss.str() << endl;
            if (adjustVcf) {
                cigars.push_back(cigarData);
            }

        }

        if (adjustVcf) {
            // The trimming below reads front()/back() of every CIGAR, so a record
            // without ALT alleles or with an empty alignment cannot be adjusted.
            bool adjustable = !cigars.empty();
            for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); adjustable && cg != cigars.end(); ++cg) {
                if (cg->empty())
                    adjustable = false;
            }
            if (!adjustable) {
                cerr << "warning: could not remap " << var.sequenceName << ":" << var.position
                     << ", leaving the record unchanged" << endl;
                cout << var << endl;
                continue;
            }

            int substart = cigars.front().front().first;
            int subend = cigars.front().back().first;

            // find the min and max match
            for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); cg != cigars.end(); ++cg) {
                if (cg->front().second == 'M' && cg->front().first <= substart) {
                    substart = cg->front().first;
                    // keep one leading matched base unless the next element is a mismatch
                    if (cg->size() > 1 && cg->at(1).second != 'X') {
                        --substart;
                    }
                }
                if (cg->back().second == 'M' && cg->back().first <= subend) {
                    subend = cg->back().first;
                }
            }

            // adjust the cigars and get the new reference length
            int reflen = 0;
            for (vector<vector<pair<int, char> > >::iterator cg = cigars.begin(); cg != cigars.end(); ++cg) {
                cg->front().first -= substart;
                cg->back().first -= subend;
                int crf = cigarRefLen(*cg);
                if (crf > reflen)
                    reflen = crf;
                var.info[adjustedTag].push_back(joinCigar(*cg));
            }

            // find the lowest positional difference
            int pdiff = 0;
            for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
                if (*d + altwindowsize < pdiff)
                    pdiff = *d + altwindowsize;
            }

            // adjust the variant position
            var.position += pdiff;

            // adjust the reference string
            var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);

            cout << var << endl;
        }
    }

    return 0;

}
Beispiel #7
0
// Entry point: filter a VCF against BED targets, or intersect/union it with a
// second VCF by comparing local haplotypes.
//
// Options:
//   -b, --bed FILE            keep records that overlap a BED target
//   -v, --invert              invert the BED / intersection selection
//   -i, --intersect-vcf FILE  intersect with the records of FILE
//   -u, --union-vcf FILE      union with the records of FILE
//   -w, --window-size N       bases around a record searched for neighbours (default 30)
//   -r, --reference FILE      FASTA reference, required with -i and -u
//   -c, --contained           sets `contained`
//   -o, --overlapping         sets `overlapping`
//   -h, --help                calls printSummary()
//
// The VCF is read from the positional argument when exactly one remains after
// option parsing, otherwise from stdin. The header and the selected records are
// written to stdout. When -b is given the BED branch is taken for every record
// and the -i/-u logic is not reached; when none of -b/-i/-u is given only the
// header is printed.
//
// NOTE(review): the `contained` and `overlapping` flags are parsed but not read
// anywhere else in this function; further down a local vector named
// `overlapping` shadows the flag.
int main(int argc, char** argv) {

    string bedFileName;
    string vcfFileName;
    string fastaFileName;
    bool intersecting = false;
    bool unioning = false;
    bool invert = false;
    bool contained = true;
    bool overlapping = false;
    int windowsize = 30;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"bed",  required_argument, 0, 'b'},
            {"invert",  no_argument, 0, 'v'},
	    {"intersect-vcf", required_argument, 0, 'i'},
	    {"union-vcf", required_argument, 0, 'u'},
            {"contained",  no_argument, 0, 'c'},
            {"overlapping", no_argument, 0, 'o'},
	    {"window-size", required_argument, 0, 'w'},
	    {"reference", required_argument, 0, 'r'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hvcob:i:u:w:r:",
                         long_options, &option_index);

        // -1 means every option has been consumed
        if (c == -1)
            break;

        switch (c) {

	    case 'w':
		windowsize = atoi(optarg);
		break;

            case 'b':
                bedFileName = string(optarg);
                break;

            // -i and -u share vcfFileName; the last one given wins
            case 'i':
		intersecting = true;
                vcfFileName = string(optarg);
                break;

            case 'u':
		unioning = true;
                vcfFileName = string(optarg);
                break;

	    case 'r':
		fastaFileName = string(optarg);
		break;

            case 'v':
                invert = true;
                break;

            case 'c':
                contained = true;
                break;

            case 'o':
                overlapping = true;
                break;

            case 'h':
                printSummary(argv);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    // BED filtering is enabled simply by supplying a BED file
    bool usingBED = false;
    if (!bedFileName.empty()) {
	usingBED = true;
    }
    
    BedReader bed;
    if (usingBED) {
	bed.open(bedFileName);
    }

    // primary VCF: a single trailing argument, or stdin
    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    // BED mode prints the raw input line, so sample columns need not be parsed
    if (usingBED) {
	variantFile.parseSamples = false;
    }

    // secondary VCF for -i / -u
    VariantCallFile otherVariantFile;
    if (!vcfFileName.empty()) {
	otherVariantFile.open(vcfFileName);
	if (!otherVariantFile.is_open()) {
	    cerr << "could not open VCF file " << vcfFileName << endl;
	    exit(1);
	}
    }

    FastaReference reference;
    if (unioning || intersecting) {
	if (fastaFileName.empty()) {
	    cerr << "a reference is required for haplotype-based intersection and unioniong" << endl;
	    exit(1);
	}
	reference.open(fastaFileName);
    }

    if (!unioning && !intersecting) {
	variantFile.parseSamples = false; // faster, as when we are
					  // only bed-intersecting we
					  // can do position-only
					  // output and don't have to
					  // manipulate specific
					  // alleles
    }

    // read the VCF file for union or intersection into an interval tree
    // indexed using some proximity window

    // per sequence name: the interval tree, the owned records, and the raw
    // intervals used to build the tree
    map<string, IntervalTree<Variant*> > variantIntervals;
    map<string, list<Variant> > otherVariants;
    map<string, vector<Interval<Variant*> > > otherVariantIntervals;

    if (unioning || intersecting) {

	Variant ovar(otherVariantFile);
	while (otherVariantFile.getNextVariant(ovar)) {
	    long int left = ovar.position;
	    long int right = left + ovar.ref.size(); // this should be 1-past the end
	    // the record is copied into a std::list and addressed by pointer from
	    // the interval; list elements keep their address as more are appended
	    otherVariants[ovar.sequenceName].push_back(ovar);
	    Variant* v = &otherVariants[ovar.sequenceName].back();
	    otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v));
	}
	
	for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
	    variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
	}

    }

    // records of the secondary VCF that have already been written (union mode)
    set<Variant*> outputVariants;

    // highest position written so far on the current sequence
    long unsigned int lastOutputPosition = 0;
    string lastSequenceName;

    cout << variantFile.header;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

	// On a change of sequence in union mode, flush the not-yet-written
	// secondary records between the last output position and the end of the
	// previous sequence. Outside union mode lastSequenceName keeps the name
	// of the first sequence seen.
	if (lastSequenceName.empty()) {
	    lastSequenceName = var.sequenceName;
	} else if (lastSequenceName != var.sequenceName) {
	    if (unioning) {
		vector<Interval<Variant*> > previousRecords;
		long int lastSeqLength = reference.sequenceLength(lastSequenceName);
		variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords);
		for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
		    Variant* v = r->value;
		    if (outputVariants.find(v) == outputVariants.end()) {
			outputVariants.insert(v);
			cout << *v << endl;  // does this output everything in correct order?
		    }
		}
		lastSequenceName = var.sequenceName;
		lastOutputPosition = 0;
	    }
	}

	if (usingBED) {
	    // BED mode: print the unmodified input line when the record overlaps
	    // a target (or, with --invert, when it overlaps none)
	    BedTarget record(var.sequenceName, var.position, var.position + var.ref.size(), "");
	    vector<BedTarget*> overlaps = bed.targetsOverlapping(record);

	    if (!invert && !overlaps.empty()) {
		cout << variantFile.line << endl;
	    } else if (invert && overlaps.empty()) {
		cout << variantFile.line << endl;
	    }

	} else if (unioning || intersecting) {

	    // TODO check overlaps with union/intersection
	    // hmm... for unioning, you might need to step through the original VCF records
	    // but the idea is to exclude the haplotype-based duplicates

	    vector<Interval<Variant*> > results;

	    // secondary records lying within windowsize of this record
	    variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);

	    // note: this local vector shadows the bool `overlapping` option flag
	    vector<Variant*> overlapping;

	    for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
		overlapping.push_back(r->value);
	    }


	    if (unioning) {

		// unioning strategy

		// write out all the records from the last file
		// between the last one printed out and the first
		// one we're about to print out

		vector<Interval<Variant*> > previousRecords;

		variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords);

		// keyed by position so the records come out in position order
		map<long int, vector<Variant*> > variants;

		for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
		    Variant* v = r->value;
		    if (outputVariants.find(v) == outputVariants.end()) {
			outputVariants.insert(v);
			variants[v->position].push_back(v);
		    }
		}

		for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
		    for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
			cout << **o << endl;
			lastOutputPosition = max(lastOutputPosition, (*o)->position);
		    }
		}

		// TODO find the duplicates for the other file
	    }


	    if (overlapping.empty()) {

		// no neighbour in the other file: the record is unique to this file,
		// so it is kept for a union or an inverted intersection
		if (unioning || (intersecting && invert)) {
		    cout << var << endl;
		    lastOutputPosition = max(lastOutputPosition, var.position);
		}

	    } else {

		// get the min and max of the overlaps

		int haplotypeStart = var.position;
		int haplotypeEnd = var.position + var.ref.size();

		for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
		    haplotypeStart = min((*v)->position, (long unsigned int) haplotypeStart);
		    haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
     		}

		// for everything overlapping and the current variant, construct the local haplotype within the bounds
		// if there is an exact match, the allele in the current VCF does intersect

		string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
		// haplotype sequence -> secondary records that produce it
		map<string, vector<Variant*> > haplotypes;

		for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
		    Variant& variant = **v;
		    for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a) {
			string haplotype = referenceHaplotype;
			// get the relative start and end coordinates for the variant alternate allele
			int relativeStart = variant.position - haplotypeStart;
			haplotype.replace(relativeStart, variant.ref.size(), *a);
			haplotypes[haplotype].push_back(*v);
		    }
		}

		// determine the non-intersecting alts
		vector<string> altsToRemove;
		for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
		    string haplotype = referenceHaplotype;
		    int relativeStart = var.position - haplotypeStart;
		    haplotype.replace(relativeStart, var.ref.size(), *a);
		    map<string, vector<Variant*> >::iterator h = haplotypes.find(haplotype);
		    // drop the alt when: intersecting and absent from the other file,
		    // inverted intersection and present, or unioning and present
		    // (the other file's record carries it in that case)
		    if ((intersecting && !invert && h == haplotypes.end())
			|| (intersecting && invert && h != haplotypes.end())
			|| (unioning && h != haplotypes.end())) {
			altsToRemove.push_back(*a);
		    }
		}

		// remove the non-overlapping (intersecting) or overlapping (unioning) alts
		for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
		    var.removeAlt(*a);
		}

		if (unioning) {
		    // somehow sort the records and combine them?
		    map<long int, vector<Variant*> > variants;
		    for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
			if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
			    outputVariants.find(*o) == outputVariants.end()) {
			    outputVariants.insert(*o);
			    variants[(*o)->position].push_back(*o);
			}
		    }
		    // add in the current variant, if it has alts left
		    if (!var.alt.empty()) {
			variants[var.position].push_back(&var);
		    }

		    for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
			for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
			    cout << **o << endl;
			    lastOutputPosition = max(lastOutputPosition, (*o)->position);
			}
		    }
		} else {
		    // if any alts remain, output the variant record
		    if (!var.alt.empty()) {
			cout << var << endl;
			lastOutputPosition = max(lastOutputPosition, var.position);
		    }
		}

	    }

	}

    }


    // if unioning, and any variants remain, output them
    // (iteration starts at lastSequenceName and continues through the rest of
    // the map in key order)
    if (unioning) {
	for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName);
	     chrom != otherVariants.end();
	     ++chrom) {
	    for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) {
		Variant* variant = &*v;
		if (outputVariants.find(variant) == outputVariants.end()) {
		    outputVariants.insert(variant);
		    cout << *variant << endl;
		    // TODO guarantee sorting
		}
	    }
	}
    }

    // NOTE(review): exit(0) ends the process without running the destructors of
    // the locals above; the return below is never reached.
    exit(0);  // why?
    return 0;

}
Beispiel #8
0
int main(int argc, char** argv) {

    
    string ref_file = "";
    vector<string> insertion_files;
    int max_interval = -1;
    bool replace_sequences = true;

    int c = 0;
    while (true) {
        static struct option long_options[] =
            {
                {"insertions", no_argument, 0, 'i'},
                {"help", no_argument, 0, 'h'},
                {"reference", required_argument, 0, 'r'},
                {"no-replace-sequences", no_argument, 0, 's'},
                {0, 0, 0, 0}
            };
        int option_index = 0;

        c = getopt_long (argc, argv, "sr:i:h",
                         long_options, &option_index);
        if (c == -1)
            break;
        /* Detect the end of the options. */
        switch(c){
        case 's':
            replace_sequences = false;
            break;
        case 'r':
            ref_file = optarg;
            break;
        case 'i':
            insertion_files.push_back(optarg);
            break;
        case 'h':
        case '?':
            print_help(argv);
            exit(1);
        default:
            print_help(argv);
            abort();
        }
    }

    if (argc < 2){
        print_help(argv);
        exit(1);
    }

 

    VariantCallFile variantFile;
    string filename = argv[argc - 1];
    variantFile.open(filename);
    if (!variantFile.is_open()) {
        return 1;
    }

    vector<FastaReference*> insertions;
    if (!insertion_files.empty()){
        for (auto x : insertion_files){
            FastaReference* ins = new FastaReference();
            insertions.push_back(ins);
            ins->open(x);
        }
    }

    FastaReference ref;
    if(!ref_file.empty()){
        ref.open(ref_file);
    }


    cout << variantFile.header << endl;

    Variant var;
    while (variantFile.getNextVariant(var)) {
        bool valid = var.canonicalize_sv(ref, insertions, replace_sequences, max_interval);
        if (!valid){
            cerr << "Variant could not be normalized" << var << endl;
        }
        cout << var << endl;
    }

    return 0;

}
Beispiel #9
0
int main(int argc, char** argv) {

    int c;
    string fastaRef;
    int windowSize = 0;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument,       &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"fasta-reference",  required_argument, 0, 'f'},
            {"window-size", required_argument, 0, 'w'},
            //{"length",  no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hf:w:",
                         long_options, &option_index);

      /* Detect the end of the options. */
          if (c == -1)
            break;
 
          switch (c)
            {
            case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
              break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
              printf (" with arg %s", optarg);
            printf ("\n");
            break;

          case 'f':
            fastaRef = optarg;
            break;

          case 'w':
            windowSize = atoi(optarg);
            break;
 
          case 'h':
            printSummary(argv);
            exit(0);
            break;

          case '?':
            /* getopt_long already printed an error message. */
            printSummary(argv);
            exit(1);
            break;
 
          default:
            abort ();
          }
      }

    if (windowSize == 0) {
        cerr << "a window size must be specified" << endl;
        exit(1);
    }
    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    FastaReference ref;
    ref.open(fastaRef);

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // get the ref start and end positions
        int refstart = var.position - 1; // convert to 0-based
        int refend = var.position + var.ref.size() - 1;
        string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize);
        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
        string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize);
        double entropyLeft = shannon_H((char*) &leftseq[0], windowSize);
        double entropyRight = shannon_H((char*) &rightseq[0], windowSize);
        double entropyCenter = shannon_H((char*) &centerseq[0], windowSize);
        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());

        var.info["EntropyLeft"].clear();
        var.info["EntropyRight"].clear();
        var.info["EntropyCenter"].clear();
        var.info["EntropyRef"].clear();
        var.info["EntropyAlt"].clear();

        var.info["EntropyLeft"].push_back(convert(entropyLeft));
        var.info["EntropyRight"].push_back(convert(entropyRight));
        var.info["EntropyCenter"].push_back(convert(entropyCenter));
        var.info["EntropyRef"].push_back(convert(entropyRef));

        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
            var.info["EntropyAlt"].push_back(convert(entropyAlt));
        }

        cout << var << endl;
    }

    return 0;

}
Beispiel #10
0
int main_construct(int argc, char** argv) {

    if (argc == 2) {
        help_construct(argv);
        return 1;
    }

    // Make a constructor to fill in
    Constructor constructor;

    // We also parse some arguments separately.
    vector<string> fasta_filenames;
    vector<string> vcf_filenames;
    string region;
    bool region_is_chrom = false;

    int c;
    optind = 2; // force optind past command positional argument
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"vcf", required_argument, 0, 'v'},
                {"reference", required_argument, 0, 'r'},
                {"rename", required_argument, 0, 'n'},
                {"alt-paths", no_argument, 0, 'a'},
                {"progress",  no_argument, 0, 'p'},
                {"region-size", required_argument, 0, 'z'},
                {"threads", required_argument, 0, 't'},
                {"region", required_argument, 0, 'R'},
                {"region-is-chrom", no_argument, 0, 'C'},
                {"node-max", required_argument, 0, 'm'},\
                {"flat-alts", no_argument, 0, 'f'},
                {0, 0, 0, 0}
            };

        int option_index = 0;
        c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:as:Cf",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
        case 'v':
            vcf_filenames.push_back(optarg);
            break;

        case 'r':
            fasta_filenames.push_back(optarg);
            break;
            
        case 'n':
            {
                // Parse the rename old=new
                string key_value(optarg);
                auto found = key_value.find('=');
                if (found == string::npos || found == 0 || found + 1 == key_value.size()) {
                    cerr << "error:[vg construct] could not parse rename " << key_value << endl;
                    exit(1);
                }
                // Parse out the two parts
                string vcf_contig = key_value.substr(0, found);
                string fasta_contig = key_value.substr(found + 1);
                // Add the name mapping
                constructor.add_name_mapping(vcf_contig, fasta_contig);
            }
            break;

        case 'a':
            constructor.alt_paths = true;
            break;

        case 'p':
            constructor.show_progress = true;
            break;

        case 'z':
            constructor.vars_per_chunk = atoi(optarg);
            break;

        case 'R':
            region = optarg;
            break;

        case 'C':
            region_is_chrom = true;
            break;

        case 't':
            omp_set_num_threads(atoi(optarg));
            break;

        case 'm':
            constructor.max_node_size = atoi(optarg);
            break;

        case 'f':
            constructor.flat = true;
            break;

        case 'h':
        case '?':
            /* getopt_long already printed an error message. */
            help_construct(argv);
            exit(1);
            break;

        default:
            abort ();

        }
    }
    
    if (constructor.max_node_size == 0) {
        // Make sure we can actually make nodes
        cerr << "error:[vg construct] max node size cannot be 0" << endl;
        exit(1);
    }
    
    if (!region.empty()) {
        // We want to limit to a certain region
        if (!region_is_chrom) {
            // We are allowed to parse the region.
            // Break out sequence name and region bounds
            string seq_name;
            int start_pos = 0, stop_pos = 0;
            parse_region(region,
                         seq_name,
                         start_pos,
                         stop_pos);
                         
            if (start_pos != 0 && stop_pos != 0) {
                // These are 0-based, so if both are nonzero we got a real set of coordinates
                if (constructor.show_progress) {
                    cerr << "Restricting to " << seq_name << " from " << start_pos << " to " << stop_pos << endl;
                }
                constructor.allowed_vcf_names.insert(seq_name);
                // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end
                constructor.allowed_vcf_regions[seq_name] = make_pair(start_pos - 1, stop_pos);
            } else if (start_pos == 0 && stop_pos == 0) {
                // We just got a name
                cerr << "Restricting to " << seq_name << " from 1 to end" << endl;
                constructor.allowed_vcf_names.insert(seq_name);
            } else {
                // This doesn't make sense. Does it have like one coordinate?
                cerr << "error:[vg construct] could not parse " << region << endl;
                exit(1);
            }
         } else {
            // We have been told not to parse the region
            cerr << "Restricting to " << region << " from 1 to end" << endl;
            constructor.allowed_vcf_names.insert(region);
         }
    }

    // This will own all the VCF files
    vector<unique_ptr<vcflib::VariantCallFile>> variant_files;
    for (auto& vcf_filename : vcf_filenames) {
        // Make sure each VCF file exists. Otherwise Tabix++ may exit with a non-
        // helpful message.

        // We can't invoke stat woithout a place for it to write. But all we
        // really want is its return value.
        struct stat temp;
        if(stat(vcf_filename.c_str(), &temp)) {
            cerr << "error:[vg construct] file \"" << vcf_filename << "\" not found" << endl;
            return 1;
        }
        vcflib::VariantCallFile* variant_file = new vcflib::VariantCallFile();
        variant_files.emplace_back(variant_file);
        variant_file->open(vcf_filename);
        if (!variant_file->is_open()) {
            cerr << "error:[vg construct] could not open" << vcf_filename << endl;
            return 1;
        }
    }

    if (fasta_filenames.empty()) {
        cerr << "error:[vg construct] a reference is required for graph construction" << endl;
        return 1;
    }
    vector<unique_ptr<FastaReference>> references;
    for (auto& fasta_filename : fasta_filenames) {
        // Open each FASTA file
        FastaReference* reference = new FastaReference();
        references.emplace_back(reference);
        reference->open(fasta_filename);
    }

    // We need a callback to handle pieces of graph as they are produced.
    auto callback = [&](Graph& big_chunk) {
        // TODO: these chunks may be too big to (de)serialize directly. For now,
        // just serialize them directly anyway.
        #pragma omp critical (cout)
        stream::write(cout, 1, std::function<Graph(uint64_t)>([&](uint64_t chunk_number) -> Graph {
            assert(chunk_number == 0);
            // Just spit out our one chunk
            return big_chunk;
        }));
    };

    // Make vectors of just bare pointers
    vector<vcflib::VariantCallFile*> vcf_pointers;
    for(auto& vcf : variant_files) {
        vcf_pointers.push_back(vcf.get());
    }
    vector<FastaReference*> fasta_pointers;
    for(auto& fasta : references) {
        fasta_pointers.push_back(fasta.get());
    }

    // Construct the graph.
    constructor.construct_graph(fasta_pointers, vcf_pointers, callback);

    // NB: If you worry about "still reachable but possibly lost" warnings in valgrind,
    // this would free all the memory used by protobuf:
    //ShutdownProtobufLibrary();

    return 0;
}
Beispiel #11
0
int main (int argc, char** argv) {

    double snp_mutation_rate = 0.001;
    double indel_mutation_rate = 0.0001;
    double het_rate = 0.5;
    double afs_alpha = 1;
    double indel_alpha = 3;
    double microsatellite_afs_alpha = 1;
    double microsatellite_len_alpha = 1.7;
    double microsatellite_mutation_rate = 0.0001;
    double mnp_ratio = 0.01;
    double tstv_ratio = 2.5;
    double deamination_ratio = 1.8;
    int microsatellite_min_length = 1;
    int indel_max = 1000;
    int ploidy = 1;
    int population_size = 1;
    int sample_id_max_digits = 1;
    int seed = time(NULL);
    string fastaFileName;
    string file_prefix = "";
    string sample_prefix = "";
    bool dry_run = false;
    int repeat_size_max = 20;
    bool uniform_indel_distribution = false;

    double p, lambda, shape, mu, sigma;

    string command_line = argv[0];
    for (int i = 1; i < argc; ++i) {
        command_line += " ";
        command_line += argv[i];
    }

    int c;

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                //{"brief",   no_argument,       &verbose_flag, 0},
                {"help", no_argument, 0, 'h'},
                {"snp-rate",  required_argument, 0, 's'},
                {"mnp-ratio", required_argument, 0, 'M'},
                {"indel-rate",  required_argument, 0, 'i'},
                {"indel-alpha", required_argument, 0, 'z'},
                {"indel-max", required_argument, 0, 'X'},
                {"repeat-size-max", required_argument, 0, 'q'},
                {"microsat-rate",  required_argument, 0, 'm'},
                {"microsat-afs-alpha", required_argument, 0, 't'},
                {"microsat-len-alpha", required_argument, 0, 'j'},
                {"microsat-min-len", required_argument, 0, 'l'},
                {"afs-alpha",  required_argument, 0, 'a'},
                {"ploidy", required_argument, 0, 'p'},
                {"population-size", required_argument, 0, 'n'},
                {"file-prefix", required_argument, 0, 'P'},
                {"sample-prefix", required_argument, 0, 'S'},
                {"random-seed", required_argument, 0, 'g'},
                {"dry-run", no_argument, 0, 'd'},
                {"uniform-indels", no_argument, 0, 'U'},
                {"ts-tv-ratio", required_argument, 0, 'T'},
                {"deamination-ratio", required_argument, 0, 'D'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'd':
            dry_run = true;
            break;

        case 'U':
            uniform_indel_distribution = true;
            break;

        case 'q':
            if (!convert(optarg, repeat_size_max)) {
                cerr << "could not read -q, --repeat-size-max" << endl;
                exit(1);
            }
            break;

        case 's':
            if (!convert(optarg, snp_mutation_rate)) {
                cerr << "could not read -s, --snp-rate" << endl;
                exit(1);
            }
            break;

        case 'i':
            if (!convert(optarg, indel_mutation_rate)) {
                cerr << "could not read -i, --indel-rate" << endl;
                exit(1);
            }
            break;

        case 'a':
            if (!convert(optarg, afs_alpha)) {
                cerr << "could not read -a, --afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'z':
            if (!convert(optarg, indel_alpha)) {
                cerr << "could not read -z, --indel-alpha" << endl;
                exit(1);
            }
            break;

        case 'X':
            if (!convert(optarg, indel_max)) {
                cerr << "could not read -M, --indel-max" << endl;
                exit(1);
            }
            break;
 
        case 'M':
            if (!convert(optarg, mnp_ratio)) {
                cerr << "could not read -m, --mnp-ratio" << endl;
                exit(1);
            }
            break;
 
        case 'm':
            if (!convert(optarg, microsatellite_mutation_rate)) {
                cerr << "could not read -m, --microsat-rate" << endl;
                exit(1);
            }
            break;

        case 'T':
            if (!convert(optarg, tstv_ratio)) {
                cerr << "could not read -T, --ts-tv-ratio" << endl;
                exit(1);
            }
            break;
 
        case 't':
            if (!convert(optarg, microsatellite_afs_alpha)) {
                cerr << "could not read -m, --microsatellite-afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'j':
            if (!convert(optarg, microsatellite_len_alpha)) {
                cerr << "could not read -m, --microsatellite-len-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'l':
            if (!convert(optarg, microsatellite_min_length)) {
                cerr << "could not read -l, --microsat-min-len" << endl;
                exit(1);
            }
            break;
 
        case 'p':
            if (!convert(optarg, ploidy)) {
                cerr << "could not read -p, --ploidy" << endl;
                exit(1);
            }
            break;

        case 'P':
            file_prefix = optarg;
            break;

        case 'S':
            sample_prefix = optarg;
            break;
 
        case 'n':
            if (!convert(optarg, population_size)) {
                cerr << "could not read -n, --population-size" << endl;
                exit(1);
            }
            sample_id_max_digits = strlen(optarg);
            break;

        case 'g':
            if (!convert(optarg, seed)) {
                cerr << "could not read -g, --random-seed" << endl;
                exit(1);
            }
            break;

        case 'h':
            printSummary();
            exit(0);
            break;
 
        case '?':
            /* getopt_long already printed an error message. */
            printSummary();
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    /* Print any remaining command line arguments (not options). */
    if (optind < argc) {
        //cerr << "fasta file: " << argv[optind] << endl;
        fastaFileName = argv[optind];
    } else {
        cerr << "please specify a fasta file" << endl;
        printSummary();
        exit(1);
    }

    init_genrand(seed); // seed mt with current time

    //mt19937 eng(seed);

    int bpPerHaplotypeMean = 1000;
    double bpPerHaplotypeSigma = 200;
    normal_distribution<double> normal(mu, sigma);
     
    //lambda = 7.0;
    //poisson_distribution<int> poisson(lambda);
    //poisson(eng);

    string seqname;
    string sequence;  // holds sequence so we can process it

    FastaReference fr;
    fr.open(fastaFileName);

    string bases = "ATGC";

    vcf::VariantCallFile vcfFile;

    // write the VCF header
    stringstream headerss;
    headerss 
        << "##fileformat=VCFv4.1" << endl
        << "##fileDate=" << dateStr() << endl
        << "##source=mutatrix population genome simulator" << endl
        << "##seed=" << seed << endl
        << "##reference=" << fastaFileName << endl
        << "##phasing=true" << endl
        << "##commandline=" << command_line << endl
        << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl
        << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl
        << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl
        << "##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl
        << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl
        << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl
        << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
        << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";

    vector<string> samples;
    for (int i = 0; i < population_size; ++i) {
        stringstream sampless;
        sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names
        samples.push_back(sampless.str());
        headerss << "\t" << sampless.str();
    }

    // and set up our VCF output file
    string header = headerss.str();
    vcfFile.openForOutput(header);
    cout << vcfFile.header << endl;

    int copies = ploidy * population_size;

    map<string, vector<SampleFastaFile*> > sequencesByRefseq;

    if (!dry_run) {
        for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

            FastaIndexEntry& indexEntry = s->second;
            seqname = indexEntry.name;

            vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
            for (int i = 0; i < population_size; ++i) {
                stringstream sname;
                sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1;
                string samplename = sname.str();
                for (int j = 0; j < ploidy; ++j) {
                    stringstream cname;
                    cname << j;
                    string chromname = cname.str();
                    string fullname = samplename + ":" + seqname + ":" + chromname;
                    string filename = file_prefix + fullname + ".fa";
                    //sequences.push_back(SampleFastaFile(filename, seqname));
                    sequences.push_back(new SampleFastaFile(filename, seqname));
                }
            }
        }
    }



    for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

        FastaIndexEntry& indexEntry = s->second;
        seqname = indexEntry.name;
        sequence = fr.getSequence(s->first);

        vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
        //sequences.resize(copies);
        
        long int pos = 0;
        long int microsatellite_end_pos = 0;
        while (pos < sequence.size()) {

            //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl;

            string ref = sequence.substr(pos, 1); // by default, ref is just the current base

            // skip non-DNA sequence information
            if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) {
                pos += ref.size();
                for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) {
                    (*s)->write(ref);
                }
                continue;
            }

            vector<Allele> alleles;

            // establish if we are in a repeat
            // and what motif is being repeated, how many times

            int len = 1;

            // get reference repeats
            // if we have a repeat, adjust the mutation rate
            // using length and direction-dependent
            // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates"
            // http://www.genetics.org/cgi/content/full/164/2/781#T1

            if (pos > microsatellite_end_pos) {

                map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max);

                string seq;
                int repeat_count = 0;
                // get the "biggest" repeat, the most likely ms allele at this site
                for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) {
                    if (repeat_count < r->second) {
                        repeat_count = r->second;
                        seq = r->first;
                    }
                }
                //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl;

                // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently
                if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) {

                    int microsatellite_length = repeat_count * seq.size();

                    // record end of microsatellite so we don't generate more mutations until we pass it
                    microsatellite_end_pos = pos + microsatellite_length - 1;

                    if (microsatellite_length > microsatellite_min_length
                        //&& genrand_real1() / copies 
                        //    < microsatellite_mutation_rate * repeat_count) {
                        && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) {

                        // establish the relative rate of ins and del events
                        /*
                          long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count);
                          long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count);
                          long double indel_balance = 1;
                          if (repeatMutationInsProbability > repeatMutationDelProbability) {
                          indel_balance = repeatMutationInsProbability / repeatMutationDelProbability;
                          } else {
                          indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability);
                          }
                        */
                        double indel_balance = 0.5;

                        // how many alleles at the site?

                        //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance));
                        int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha);
                        //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl;

                        map<int, bool> allele_lengths;
                        // lengths of the alleles
                        while (allele_lengths.size() < numalleles) {
                            int allele_length;
                            // TODO adjust length so that shorter events are more likely...
                            if (genrand_real1() > indel_balance) {
                                allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            } else {
                                allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            }
                            //cout << allele_length << endl;
                            map<int, bool>::iterator f = allele_lengths.find(allele_length);
                            if (f == allele_lengths.end()) {
                                allele_lengths[allele_length] = true;
                            }
                        }

                        // generate alleles
                        for (map<int, bool>::iterator f = allele_lengths.begin();
                             f != allele_lengths.end(); ++f) {

                            int allele_length = f->first;
                            int c = abs(f->first);
                            string alt = seq;

                            for (int i = 1; i < c; ++i)
                                alt += seq;

                            if (allele_length > 0) {
                                alleles.push_back(Allele(ref, ref + alt, "MICROSAT"));
                            } else {
                                alleles.push_back(Allele(ref + alt, ref, "MICROSAT"));
                            }
                            //cout << pos + 1 << " "  << microsatellite_length << " " << alleles.back() << endl;
                        }
                        //cout << "alleles.size() == " << alleles.size() << endl;
                    }
                }
            }

            // snp case
            if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) {

                // make an alternate allele
                /*
                  string alt = ref;
                  while (alt == ref) {
                  alt = string(1, bases.at(genrand_int32() % 4));
                  }
                */
                string alt = ref;
                if (genrand_real1() > 1 / (1 + tstv_ratio)) {
                    if (ref == "A") {
                        alt = "G";
                    } else if (ref == "G") {
                        alt = "A";
                    } else if (ref == "C") {
                        alt = "T";
                    } else if (ref == "T") {
                        alt = "C";
                    }
                } else {
                    while (alt == ref || isTransition(ref, alt)) {
                        alt = string(1, bases.at(genrand_int32() % 4));
                    }
                }

                if (genrand_real1() < mnp_ratio) {
                    int i = 1;
                    do {
                        ref += sequence.substr(pos + i, 1);
                        alt += sequence.substr(pos + i, 1);
                        ++i;
                        while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) {
                            alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4);
                        }
                    } while (genrand_real1() < mnp_ratio);
                    len = alt.size();
                }
                alleles.push_back(Allele(ref, alt));
            }

            // indel case
            if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) {
                // how many bp?
                if (uniform_indel_distribution) {
                    len = (int) floor(genrand_real1() * indel_max);
                } else {
                    len = (int) floor(zetarandom(indel_alpha));
                }
                // guard against out-of-sequence indels
                if (pos + len < sequence.size() && len <= indel_max) {
                    if (genrand_int32() % 2 == 0) {
                        // deletion
                        alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1)));
                    } else {
                        string alt = ref;
                        // insertion?
                        // insert some random de novo bases
                        while (alt.length() < len + 1) {
                            alt += string(1, bases.at(genrand_int32() % 4));
                        }
                        alleles.push_back(Allele(ref, alt));
                    }
                } else {
                    // fall through
                }
            }

            // no mutation generated
            if (alleles.empty()) {
                for (int i = 0; i < copies; ++i) {
                    if (!dry_run) {
                        sequences.at(i)->write(ref);
                    }
                }
                pos += ref.size();
            } else {

                // TODO randomly distribute all the alleles throughout the population
                // generate allele frequencies for each
                // fun times...

                string genotype;

                vector<bool> alts;
                random_shuffle(alleles.begin(), alleles.end());

                vector<Allele*> population_alleles;
                list<Allele> present_alleles; // filtered for AFS > 0 in the sample
                
                // AFS simulation
                int remaining_copies = copies;
                while (remaining_copies > 0 && !alleles.empty()) {
                    Allele allele = alleles.back();
                    alleles.pop_back();
                    int allele_freq = random_allele_frequency(remaining_copies, afs_alpha);
                    if (allele_freq > 0) {
                        present_alleles.push_back(allele);
                        Allele* allelePtr = &present_alleles.back();
                        for (int i = 0; i < allele_freq; ++i) {
                            population_alleles.push_back(allelePtr);
                        }
                        remaining_copies -= allele_freq;
                    }
                }

                if (present_alleles.empty()) {
                    for (int i = 0; i < copies; ++i) {
                        if (!dry_run) {
                            sequences.at(i)->write(ref);
                        }
                    }
                    pos += ref.size();
                    continue;
                }

                reverse(present_alleles.begin(), present_alleles.end());

                // establish the correct reference sequence and alternate allele set
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    //cout << allele << endl;
                    if (allele.ref.size() > ref.size()) {
                        ref = allele.ref;
                    }
                }

                // reference alleles take up the rest
                Allele reference_allele = Allele(ref, ref);
                for (int i = 0; i < remaining_copies; ++i) {
                    population_alleles.push_back(&reference_allele);
                }

                vector<string> altstrs;
                // now the reference allele is the largest possible, adjust the alt allele strings to reflect this
                // if we have indels, add the base before, set the position back one
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    string alleleStr = ref;
                    if (allele.ref.size() == allele.alt.size()) {
                        alleleStr.replace(0, allele.alt.size(), allele.alt);
                    } else {
                        alleleStr.replace(0, allele.ref.size(), allele.alt);
                    }
                    allele.ref = ref;
                    allele.alt = alleleStr;
                    altstrs.push_back(alleleStr);
                }

                assert(population_alleles.size() == copies);

                // shuffle the alleles around the population
                random_shuffle(population_alleles.begin(), population_alleles.end());

                vcf::Variant var(vcfFile);
                var.sequenceName = seqname;
                var.position = pos + 1;
                var.quality = 99;
                var.id = ".";
                var.filter = ".";
                var.info["NS"].push_back(convert(population_size));
                var.info["NA"].push_back(convert(present_alleles.size()));
                var.format.push_back("GT");
                var.ref = ref;
                var.alt = altstrs;

                // debugging, uncomment to see sequence context
                //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl;

                map<string, int> alleleIndexes;
                alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles?
                int i = 1;
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) {
                    Allele& allele = *a;
                    //cout << allele << " " << i << endl;
                    alleleIndexes[convert(allele)] = i;
                    //cout << allele << " " << i << endl;
                }

                //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) {
                //    cout << a->first << " = " << a->second << endl;
                //}

                int j = 0;
                for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) {
                    string& sample = *s;
                    vector<string> genotype;
                    // XXX hack, maybe this should get stored in another map for easier access?
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl;
                        genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))]));
                    }
                    var.samples[sample]["GT"].push_back(join(genotype, "|"));
                    //cout << var.samples[sample]["GT"].front() << endl;
                }

                // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES
                // LENGTH WITH DELETIONS.
                //
                // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS
                // BETWEEN ONE ALLELIC VARIANT AND ANOTHER.  THIS IS BROKEN!
                //
                // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION
                //
                // now write out our sequence data (FASTA files)
                for (int j = 0; j < population_size; ++j) {
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        Allele* allele = population_alleles.at(l);
                        if (!dry_run) {
                            sequences.at(l)->write(allele->alt);
                        }
                    }
                }

                // tabulate allele frequency, and write some details to the VCF
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {

                    Allele& allele = *a;
                    Allele* allelePtr = &*a;

                    vector<string> genotypes;
                    genotypes.resize(population_size);

                    int allele_freq = 0;

                    // obtain allele frequencies and output FASTA sequence data
                    // for each simulated sample
                    for (int j = 0; j < population_size; ++j) {
                        for (int i = 0; i < ploidy; ++i) {
                            int l = (j * ploidy) + i;
                            if (population_alleles.at(l) == allelePtr) {
                                ++allele_freq;
                            }
                        }
                    }

                    // set up the allele-specific INFO fields in the VCF record
                    var.info["AC"].push_back(convert(allele_freq));

                    int delta = allele.alt.size() - allele.ref.size();
                    if (delta == 0) {
                        if (allele.ref.size() == 1) {
                            var.info["TYPE"].push_back("snp");
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        } else {
                            var.info["TYPE"].push_back("mnp");;
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        }
                    } else if (delta > 0) {
                        var.info["TYPE"].push_back("ins");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    } else {
                        var.info["TYPE"].push_back("del");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    }
                    if (!allele.type.empty()) {
                        var.infoFlags[allele.type] = true;
                    }

                }

                // write the VCF record to stdout
                cout << var << endl;

                int largest_ref = 1; // enforce one pos
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    if (a->ref.size() > largest_ref) {
                        largest_ref = a->ref.size();
                    }
                }

                pos += largest_ref; // step by the size of the last event
            }
        }
    }

    // close, clean up files
    for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != sequencesByRefseq.end(); ++s) {
        vector<SampleFastaFile*>& files = s->second;
        for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) {
            delete *f;
        }
        files.clear();
    }

    return 0;

}
Beispiel #12
0
/**
 * Streams BAM alignments from stdin, realigns candidate reads against a
 * variant graph (DAG) built from the FASTA reference and the input VCF, and
 * writes the (possibly updated) alignments to stdout.
 *
 * Outline of the per-read loop:
 *   1. (Re)build the DAG when the read is on a new reference sequence or has
 *      moved far enough through the current window.
 *   2. If shouldRealign() selects the read and the DAG is not empty, align it
 *      to the graph with gswalign() and flatten the result into reference
 *      space.
 *   3. Keep the new alignment only if it passes the acceptance test below;
 *      otherwise restore the original alignment.
 *   4. Push the alignment into a position-keyed queue and flush queue entries
 *      that are far enough behind the current read.
 *
 * In dry-run mode (params.dry_run) the BAM writer is not opened; the read,
 * the graph mapping, and score/strand/position/cigar are printed to stdout
 * instead.
 *
 * Exits the process with status 1 if stdin/stdout cannot be opened or if no
 * usable VCF file is supplied.
 *
 * @param params  run-time options (reference path, VCF path, window size,
 *                scoring, debug/dry-run flags, ...).
 */
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    int dag_window_size = params.dag_window_size;

    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    // the writer is only opened when we are actually going to emit BAM
    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }
    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // alignments waiting to be written, keyed by position so that they are
    // emitted in sorted order even when realignment moves them
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence
    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        // NB: operator[] inserts an empty name for reference IDs that are not
        // in the BAM header
        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ "
                     << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        // kept so that a rejected or failed realignment can be rolled back
        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to " << currentSeqname << ":"
                     << dag_start_position << "-" << dag_start_position + ref.size()
                     << endl;
                */
                //exit(1);
            } else {

                // check first variant: while it sits at or before the first
                // base of the window, move the window start back one base
                // and re-query
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                // re-query the (possibly shifted) window from its beginning
                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);

                // keep only variants that satisfy both window bounds below
                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " "
                         << b->first->id << " "
                         << b->second.ref_position << " "
                         << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            // explicit parentheses: same grouping the operators already had
            if ((graph->size == 1 && allN(ref)) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N).  Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // the read runs past the end of the DAG window: extend the cached
            // reference so it covers the read through its end position
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm =
                    gswalign(graph,
                             ref_map,
                             read,
                             qualities,
                             params,
                             position,
                             score,
                             flat_cigar,
                             strand,
                             nt_table,
                             mat);

                // Render the mapping *before* releasing it: the dry-run
                // report below needs the text, and gm must not be touched
                // once gssw_graph_mapping_destroy() has been called.
                string mapping_str;
                if (params.dry_run) {
                    mapping_str = graph_mapping_to_string(gm);
                }
                gssw_graph_mapping_destroy(gm);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << mapping_str << endl;
                    cout << score << " " << strand << " "
                         << position << " "
                         << flat_cigar << endl;

                } else {

                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                   */

                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack...  at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    //
                    // NB: `cigar` aliases flat_cigar; edits through either
                    // name affect the same object.  An out-of-range at()
                    // on a very short cigar throws and is handled by the
                    // catch block below (original alignment is restored).
                    Cigar& cigar = flat_cigar;
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip() && cigar.at(1).isIndel())) {
                        // leading indel: shift start left and prepend
                        // flankSize reference bases as a match
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            // drop the soft-clipped bases that preceded the indel
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                       alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) {
                        // trailing indel: append flankSize reference bases
                        // as a match after the end of the alignment
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            // drop the soft-clipped bases that followed the indel
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    // the realigned read may now extend past the window
                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // NOTE(review): because && binds tighter than ||, the
                    // acceptRealignment() check below only applies to reads
                    // that were previously mapped; previously unmapped reads
                    // are accepted unconditionally.  Confirm this is intended.

                    // we accept the new alignment if...
                    if (!was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum 
                                        + stats_before.mismatch_qsum
                                        >
                                        stats_after.softclip_qsum
                                        + stats_after.mismatch_qsum))))
                            // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                            // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: " 
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }

            } catch (...) {
                // best effort: report the failure and fall back to the
                // alignment exactly as it was read from the input
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;

            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            // write out every queued alignment at or before maxOutputPos ...
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            // ... and drop what was written from the queue
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            // queue the current read (optionally only if it was realigned)
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // flush whatever is still queued
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    // release gssw resources
    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }

}
Beispiel #13
0
/**
 * One-off mode: builds a variant DAG over the region params.target and
 * aligns the single read params.read_input against it.
 *
 * The reference sequence for the target is read from params.fasta_reference.
 * If params.vcf_file is non-empty, variants whose reference allele ends at or
 * before target.stopPos are added to the graph.  The resulting graph mapping
 * is printed to stderr.
 *
 * Exits the process with status 1 if a VCF file was requested but cannot be
 * opened.
 *
 * @param params  run-time options (read, reference, VCF, target, scoring,
 *                debug/display flags).
 */
void construct_dag_and_align_single_sequence(Parameters& params) {

    if (params.debug) {
        cout << "read: " << params.read_input << endl;
        //cout << "fastq file:" << params.fastq_file << endl;
        cout << "fasta reference:" << params.fasta_reference << endl;
        cout << "vcf file " << params.vcf_file << endl;
        cout << "target " << params.target << endl;
        cout << endl;
    }

    // get sequence of target
    FastaReference reference;
    reference.open(params.fasta_reference);
    FastaRegion target(params.target);
    string targetSequence = reference.getTargetSubSequence(target);

    // get variants in target
    vector<vcf::Variant> variants;
    vcf::VariantCallFile vcffile;

    if (!params.vcf_file.empty()) {
        // fail loudly instead of querying a file that never opened
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
        vcf::Variant var(vcffile);

        vcffile.setRegion(params.target);
        while (vcffile.getNextVariant(var)) {
            // skip variants whose reference allele runs past the target end
            if (var.position + var.ref.length() <= target.stopPos) {
                variants.push_back(var);
            }
        }
    }

    long offset = max(target.startPos, 1); // start is -1 when coordinates are not specified

    // Declare the target DAG to align against.
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);
    constructDAGProgressive(graph,
                            ref_map,
                            targetSequence,
                            target.startSeq,
                            variants,
                            offset,
                            nt_table,
                            mat,
                            params.flat_input_vcf);

    // NOTE(review): only the header line is printed here; the graph itself
    // is not dumped in this function.
    if (params.display_dag) {
        cout << "DAG generated from input variants:" << endl;
    }


    // run the alignment

    string read = params.read_input;
    // no qualities are supplied in this mode: use a uniform value of 30
    string qualities(read.size(), shortInt2QualityChar(30));
    int score;
    long int position;
    string strand;
    Cigar flat_cigar;
    gssw_graph_mapping* gm = gswalign(graph,
                                      ref_map,
                                      read,
                                      qualities,
                                      params,
                                      position,
                                      score,
                                      flat_cigar,
                                      strand,
                                      nt_table,
                                      mat);
    cerr << graph_mapping_to_string(gm) << endl;
    gssw_graph_mapping_destroy(gm);

    // legacy reporting code, currently disabled
    /*
    cout << score << " " << strand << " "
         << (trace_report.node->position - 1) + trace_report.x << " "
         << trace_report.fcigar
         << " seq:" << trace_report.x << " read:" << trace_report.y
         << " " << trace_report.gcigar << " " << trace_report.fcigar << endl;

    if (params.display_alignment) {
        string refseq;
        for (vector<sn*>::iterator n = trace_report.node_list.begin();
             n != trace_report.node_list.end(); ++n) {
            refseq.append((*n)->sequence);
        }
        refseq = refseq.substr(trace_report.x, read.size());
        cout << refseq << endl;
        if (strand == "+") {
            cout << read << endl;
        } else {
            cout << reverseComplement(read) << endl;
        }
    }
    */

    // release the graph, nucleotide table and score matrix created above
    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);
}
Beispiel #14
0
/**
 * Command-line entry point: reads BAM alignments from stdin, left-realigns
 * every mapped alignment against the supplied FASTA reference, and (unless
 * output is suppressed) writes each alignment to stdout.
 *
 * Options:
 *   -f, --fasta-reference FILE   reference used for realignment (required)
 *   -m, --max-iterations N       iteration limit handed to stablyLeftAlign
 *   -d, --debug                  enable debug output
 *   -s, --suppress-output        do not open or write the output stream
 *   -c, --compressed             clear the "uncompressed output" flag
 *   -h, --help                   print usage and exit
 *
 * @return 0 on success; the process exits with status 1 on usage errors,
 *         a missing reference, or failure to open the input/output streams.
 */
int main(int argc, char** argv) {

    int c;

    FastaReference reference;
    bool has_ref = false;
    bool suppress_output = false;
    bool debug = false;
    bool isuncompressed = true;

    int maxiterations = 50;

    if (argc < 2) {
        printUsage(argv);
        exit(1);
    }

    while (true) {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"debug", no_argument, 0, 'd'},
            {"fasta-reference", required_argument, 0, 'f'},
            {"max-iterations", required_argument, 0, 'm'},
            {"suppress-output", no_argument, 0, 's'},
            {"compressed", no_argument, 0, 'c'},
            {0, 0, 0, 0}
        };

        int option_index = 0;

        c = getopt_long (argc, argv, "hdcsf:m:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c) {

            case 'f':
                reference.open(optarg); // will exit on open failure
                has_ref = true;
                break;

            case 'm': {
                // strtol rather than atoi: atoi silently maps malformed
                // input ("abc", "50x") to 0 or a truncated number.
                char* parse_end = NULL;
                long parsed = strtol(optarg, &parse_end, 10);
                // Reject empty/non-numeric input, trailing garbage, and
                // values that do not survive narrowing to int.
                if (parse_end == optarg || *parse_end != '\0'
                        || parsed != static_cast<long>(static_cast<int>(parsed))) {
                    cerr << "invalid value for --max-iterations: " << optarg << endl;
                    exit(1);
                }
                maxiterations = static_cast<int>(parsed);
                break;
            }

            case 'd':
                debug = true;
                break;

            case 's':
                suppress_output = true;
                break;

            case 'c':
                isuncompressed = false;
                break;

            case 'h':
                printUsage(argv);
                exit(0);
                break;

            case '?':
                // getopt_long reports unknown options / missing arguments as '?'
                printUsage(argv);
                exit(1);
                break;

            default:
                abort();
                break;
        }
    }

    if (!has_ref) {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }


    BAMSINGLEREADER reader;
    if (!reader.Open(STDIN)) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

#ifdef HAVE_BAMTOOLS

    BamWriter writer;

    if (isuncompressed) {
        writer.SetCompressionMode(BamWriter::Uncompressed);
    }

    // the writer is only opened when output is wanted
    if (!suppress_output && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }
#else

    // without bamtools the "uncompressed" flag selects SAM over BAM output
    SeqLib::BamWriter writer(isuncompressed ? SeqLib::SAM : SeqLib::BAM);
    SeqLib::BamHeader hdr = reader.Header();
    if (hdr.isEmpty()) {
      cerr << "could not open header for input" << endl;
      exit(1);
    }
    writer.SetHeader(hdr);

    if (!suppress_output) {
        if (!writer.Open("-")) {
            cerr << "could not open stdout for writing" << endl;
            exit(1);
        }
        // Per the SeqLib BamWriter API the header set above is only emitted
        // by an explicit WriteHeader() call, which must precede any record.
        // NOTE(review): confirm WRITEALIGNMENT does not already write it.
        if (!writer.WriteHeader()) {
            cerr << "could not write header to stdout" << endl;
            exit(1);
        }
    }
#endif

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    REFVEC referenceSequences = reader.GETREFDATA;
    int i = 0;
    for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->REFNAME;
        ++i;
    }

    BAMALIGN alignment;

    while (GETNEXT(reader, alignment)) {

        DEBUG("---------------------------   read    --------------------------" << endl);
        DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl);
        DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl);
        DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl);
        DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl);
        DEBUG("--------------------------- realigned --------------------------" << endl);

        // skip unmapped alignments, as they cannot be left-realigned without CIGAR data
        if (alignment.ISMAPPED) {

            int endpos = alignment.ENDPOSITION;
            int length = endpos - alignment.POSITION + 1;
            // only fetch reference sequence for a non-negative start and a
            // positive span
            if (alignment.POSITION >= 0 && length > 0) {
                if (!stablyLeftAlign(alignment,
                            reference.getSubSequence(
                                referenceIDToName[alignment.REFID],
                                alignment.POSITION,
                                length),
                            maxiterations, debug)) {
                    // report the failure; the alignment is still written below
                    cerr << "unstable realignment of " << alignment.QNAME
                         << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl
                         << alignment.QUERYBASES << endl;
                }
            }

        }

        DEBUG("----------------------------------------------------------------" << endl);
        DEBUG(endl);

        // every alignment (mapped or not) is passed through to the output
        if (!suppress_output)
            WRITEALIGNMENT(writer, alignment);

    }

    reader.Close();
    if (!suppress_output)
        writer.Close();

    return 0;
}