Example #1
0
//
// Main
//
int overlapLongMain(int argc, char** argv)
{
    parseOverlapLongOptions(argc, argv);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setTransitiveTag(true);
    headerRecord.write(*pASQGWriter);

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
    else
        indexPrefix = stripFilename(opt::readsFile);

    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);
    
    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Read the sequence file and write vertex records for each
    // Also store the read names in a vector of strings
    ReadTable reads;
    
    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
    SeqRecord record;
    while(pReader->get(record))
    {
        reads.addRead(record.toSeqItem());
        ASQG::VertexRecord vr(record.id, record.seq.toString());
        vr.write(*pASQGWriter);

        if(reads.getCount() % 100000 == 0)
            printf("Read %zu sequences\n", reads.getCount());
    }

    delete pReader;
    pReader = NULL;

    BWTIndexSet index;
    index.pBWT = pBWT;
    index.pSSA = pSSA;
    index.pReadTable = &reads;

    // Make a prefix for the temporary hits files
    size_t n_reads = reads.getCount();

    omp_set_num_threads(opt::numThreads);

#pragma omp parallel for
    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
    {
        const SeqItem& curr_read = reads.getRead(read_idx);

        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
        SequenceOverlapPairVector sopv = 
            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
                                          opt::seedLength,
                                          opt::minOverlap,
                                          1 - opt::errorRate,
                                          100,
                                          index);

        printf("Found %zu matches\n", sopv.size());
        for(size_t i = 0; i < sopv.size(); ++i)
        {
            std::string match_id = reads.getRead(sopv[i].match_idx).id;

            // We only want to output each edge once so skip this overlap
            // if the matched read has a lexicographically lower ID
            if(curr_read.id > match_id)
                continue;

            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
                                                                sopv[i].overlap.match[0].start,
                                                                sopv[i].overlap.match[0].end,
                                                                match_id.c_str(),
                                                                sopv[i].overlap.getOverlapLength(),
                                                                sopv[i].overlap.getPercentIdentity(),
                                                                sopv[i].overlap.cigar.c_str());

            // Convert to ASQG
            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);
            
            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
            // to ensure the strand matches. The ASQG file wants the coordinate of the original
            // sequencing strand. Flip here if necessary
            if(sopv[i].is_reversed)
                sc2.flip();

            // Convert the SequenceOverlap the ASQG's overlap format
            Overlap ovr(curr_read.id, sc1, match_id,  sc2, sopv[i].is_reversed, -1);

            ASQG::EdgeRecord er(ovr);
            er.setCigarTag(sopv[i].overlap.cigar);
            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());

#pragma omp critical
            {
                er.write(*pASQGWriter);
            }
        }
    }

    // Cleanup
    delete pReader;
    delete pBWT; 
    delete pSSA;
    
    delete pASQGWriter;
    delete pTimer;
    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
Example #2
0
  ExecStatus
  Weights<View>::propagate(Space& home, const ModEventDelta&) {

    ModEvent me = ME_SET_NONE;

    if (!x.assigned()) {
      // Collect the weights of the elements in the unknown set in an array
      int size = elements.size();
      Region r(home);
      int* currentWeights = r.alloc<int>(size);

      UnknownRanges<View> ur(x);
      Iter::Ranges::ToValues<UnknownRanges<View> > urv(ur);
      for (int i=0; i<size; i++) {
        if (!urv() || elements[i]<urv.val()) {
          currentWeights[i] = 0;
        } else {
          assert(elements[i] == urv.val());
          currentWeights[i] = weights[i];
          ++urv;
        }
      }

      // Sort the weights of the unknown elements
      IntLess il;
      Support::quicksort<int>(currentWeights, size, il);

      // The maximum number of elements that can still be added to x
      int delta = static_cast<int>(std::min(x.unknownSize(), x.cardMax() - x.glbSize()));

      // The weight of the elements already in x
      GlbRanges<View> glb(x);
      int glbWeight = weightI<GlbRanges<View> >(elements, weights, glb);

      // Compute the weight of the current lower bound of x, plus at most
      // delta-1 further elements with smallest negative weights. This weight
      // determines which elements in the upper bound cannot possibly be
      // added to x (those whose weight would exceed the capacity even if
      // all other elements are minimal)
      int lowWeight = glbWeight;
      for (int i=0; i<delta-1; i++) {
        if (currentWeights[i] >= 0)
          break;
        lowWeight+=currentWeights[i];
      }

      // Compute the lowest possible weight of x. If there is another element
      // with negative weight left, then add its weight to lowWeight.
      // Otherwise lowWeight is already the lowest possible weight.
      int lowestWeight = lowWeight;
      if (delta>0 && currentWeights[delta-1]<0)
        lowestWeight+=currentWeights[delta-1];

      // If after including the minimal number of required elements,
      // no more element with negative weight is available, then
      // a tighter lower bound can be computed.
      if ( (x.cardMin() - x.glbSize() > 0 &&
            currentWeights[x.cardMin() - x.glbSize() - 1] >= 0) ||
           currentWeights[0] >= 0 ) {
        int lowestPosWeight = glbWeight;
        for (unsigned int i=0; i<x.cardMin() - x.glbSize(); i++) {
          lowestPosWeight += currentWeights[i];
        }
        lowestWeight = std::max(lowestWeight, lowestPosWeight);        
      }

      // Compute the highest possible weight of x as the weight of the lower
      // bound plus the weight of the delta heaviest elements still in the
      // upper bound.
      int highestWeight = glbWeight;
      for (int i=0; i<delta; i++) {
        if (currentWeights[size-i-1]<=0)
          break;
        highestWeight += currentWeights[size-i-1];
      }

      // Prune the weight using the computed bounds
      GECODE_ME_CHECK(y.gq(home, lowestWeight));
      GECODE_ME_CHECK(y.lq(home, highestWeight));

      // Exclude all elements that are too heavy from the set x.
      // Elements are too heavy if their weight alone already
      // exceeds the remaining capacity
      int remainingCapacity = y.max()-lowWeight;

      UnknownRanges<View> ur2(x);
      Iter::Ranges::ToValues<UnknownRanges<View> > urv2(ur2);
      OverweightValues<Iter::Ranges::ToValues<UnknownRanges<View> > >
        ov(remainingCapacity, elements, weights, urv2);
      Iter::Values::ToRanges<OverweightValues<
        Iter::Ranges::ToValues<UnknownRanges<View> > > > ovr(ov);
      me = x.excludeI(home, ovr);
      GECODE_ME_CHECK(me);
    }
    if (x.assigned()) {
      // If x is assigned, just compute its weight and assign y.
      GlbRanges<View> glb(x);
      int w =
        weightI<GlbRanges<View> >(elements, weights, glb);
      GECODE_ME_CHECK(y.eq(home, w));
      return home.ES_SUBSUMED(*this);
    }

    return me_modified(me) ? ES_NOFIX : ES_FIX;
  }