Example #1
0
inline void
onFindImpl(LocalDataHolder<TMatch, TGlobalHolder, TScoreExtension> & lH,
           TSeedId const & seedId,
           TSubjOcc subjOcc)
{
    if (TGlobalHolder::indexIsFM) // positions are reversed
        setSeqOffset(subjOcc,
                     length(lH.gH.subjSeqs[getSeqNo(subjOcc)])
                     - getSeqOffset(subjOcc)
                     - lH.options.seedLength);

    Match m {static_cast<Match::TQId>(lH.seedRefs[seedId]),
             static_cast<Match::TSId>(getSeqNo(subjOcc)),
             static_cast<Match::TPos>(lH.seedRanks[seedId] * lH.options.seedOffset),
             static_cast<Match::TPos>(getSeqOffset(subjOcc))};

    bool discarded = false;
    auto const halfSubjL = lH.options.seedLength /  2;

    if (!sIsTranslated(lH.gH.blastProgram))
    {
        for (unsigned k = 0; k < length(lH.gH.segIntStarts[m.subjId]); ++k)
        {
            // more than half of the seed falls into masked interval
            if (intervalOverlap(m.subjStart,
                                m.subjStart + lH.options.seedLength,
                                lH.gH.segIntStarts[m.subjId][k],
                                lH.gH.segIntEnds[m.subjId][k])
                    >= halfSubjL)
            {
                ++lH.stats.hitsMasked;
                discarded = true;
                break;
            }
        }
    }

     if ((!discarded) && (!seedLooksPromising(lH, m)))
     {
         discarded = true;
         ++lH.stats.hitsFailedPreExtendTest;
     }

    if (!discarded)
        lH.matches.emplace_back(m);
}
Example #2
0
void closest_grouped(ivl_vector_t& vx, ivl_vector_t& vy,
                     std::vector<int>& indices_x, std::vector<int>& indices_y,
                     std::vector<int>& overlap_sizes, std::vector<int>& distance_sizes) {

  ivl_tree_t tree_y(vy) ;

  std::pair<int, ivl_vector_t> min_dist;
  // initiatialize maximum left and right distances to minimize for closest
  int max_end = std::max(vx.back().stop, vy.back().stop) ;

  for (auto const& vx_it : vx) {
    ivl_vector_t closest ;
    ivl_vector_t closest_ivls ;

    min_dist = std::make_pair(max_end, closest_ivls) ;
    tree_y.findClosest(vx_it.start, vx_it.stop, closest, min_dist) ;

    for (auto const& ov_it : closest) {

      auto overlap = intervalOverlap(vx_it, ov_it) ;

      if (overlap > 0) {
        indices_x.push_back(vx_it.value) ;
        indices_y.push_back(ov_it.value) ;
        overlap_sizes.push_back(overlap < 0 ? -overlap : overlap) ;
        distance_sizes.push_back(0);
      } else if (ov_it.start >= vx_it.stop) {
        indices_x.push_back(vx_it.value) ;
        indices_y.push_back(ov_it.value) ;
        overlap_sizes.push_back(0) ;
        distance_sizes.push_back(-(overlap - 1));
      } else {
        indices_x.push_back(vx_it.value) ;
        indices_y.push_back(ov_it.value) ;
        overlap_sizes.push_back(0) ;
        distance_sizes.push_back(overlap - 1);
      }

    }
    closest.clear() ;
  }
}
Example #3
0
void intersect_group(ivl_vector_t vx, ivl_vector_t vy,
                     std::vector<int>& indices_x, std::vector<int>& indices_y,
                     std::vector<int>& overlap_sizes) {

  ivl_tree_t tree_y(vy) ;
  ivl_vector_t overlaps ;

  for (auto it : vx) {

    tree_y.findOverlapping(it.start, it.stop, overlaps) ;

    // store current intervals
    for (auto oit : overlaps) {

      int overlap_size = intervalOverlap(it, oit) ;
      overlap_sizes.push_back(overlap_size) ;

      indices_x.push_back(it.value) ;
      indices_y.push_back(oit.value) ;
    }

    overlaps.clear() ;
  }
}
Example #4
0
bool Box::overlaps(const Box o) const {
    return intervalOverlap(x,X,o.x,o.X) && intervalOverlap(y,Y,o.y,o.Y);
}
void
detectSubReads(gkStore               *gkp,
               workUnit              *w,
               FILE                  *subreadFile,
               bool                   subreadFileVerbose) {

  assert(w->adjLen > 0);
  assert(doCheckSubRead(gkp, w->id) == true);

  map<uint32, uint32>  secondIdx;
  map<uint32, uint32>  numOlaps;

  bool                 largePalindrome = false;
  intervalList<int32>  BAD;
  intervalList<int32>  BADall;

  //  Count the number of overlaps for each b_iid, and remember the last index.  There are supposed to
  //  be at most two overlaps per ID pair, so if we remember the last, and iterate through, we can
  //  get both.

  for (uint32 ii=0; ii<w->adjLen; ii++) {
    secondIdx[w->adj[ii].b_iid] = ii;
    numOlaps [w->adj[ii].b_iid]++;
  }

  //  Scan overlaps.  For any pair of b_iid, with overlaps in opposite directions, compute a 'bad'
  //  interval where a suspected flip occurs.

  for (uint32 ii=0; ii<w->adjLen; ii++) {
    adjOverlap  *aii = w->adj + ii;

    if (numOlaps[w->adj[ii].b_iid] == 1) {
      //  Only one overlap, can't indicate sub read!
      //if ((subreadFile) && (subreadFileVerbose))
      //  fprintf(subreadFile, "oneOverlap                 %u (%u-%u) %u (%u-%u) -- can't indicate subreads\n",
      //          w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend);
      continue;
    }

    //  We should never get more than two overlaps per read pair.
    if (numOlaps[w->adj[ii].b_iid] > 2) {
      fprintf(stderr, "ERROR: more overlaps than expected for pair %u %u.\n",
              w->adj[ii].a_iid, w->adj[ii].b_iid);
      continue;
    }
    assert(numOlaps[w->adj[ii].b_iid] == 2);

    uint32         jj = secondIdx[w->adj[ii].b_iid];
    adjOverlap   *ajj = w->adj + jj;

    assert(jj < w->adjLen);

    if (ii == jj) {
      //  Already did this one!
      //if ((subreadFile) && (subreadFileVerbose))
      //  fprintf(subreadFile, "sameOverlap                %u (%u-%u) %u (%u-%u)\n",
      //          w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend);
      continue;
    }

    //  The two overlaps should be for the same reads.

    assert(w->adj[ii].a_iid == w->adj[jj].a_iid);
    assert(w->adj[ii].b_iid == w->adj[jj].b_iid);

    //  And opposite orientations.

    if (w->adj[ii].flipped == w->adj[jj].flipped) {
      fprintf(stderr, "ERROR: same orient duplicate overlaps for pair %u %u\n",
              w->adj[ii].a_iid, w->adj[ii].b_iid);
      continue;
    }
    assert(w->adj[ii].flipped != w->adj[jj].flipped);


    bool  AcheckSub = (doCheckSubRead(gkp, w->adj[ii].a_iid) == true);
    bool  BcheckSub = (doCheckSubRead(gkp, w->adj[ii].b_iid) == true);

    assert(AcheckSub == true);  //  Otherwise we wouldn't be in this function!

    //  Decide what type of duplicate we have.
    //    Overlap on the A read -=> B read is potentially sub read containing -=> don't use overlaps
    //    Overlap on the B read -=> A read is potentially sub read containing -=> split this read

    uint32 Aoverlap = intervalOverlap(w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[jj].aovlbgn, w->adj[jj].aovlend);
    uint32 Boverlap = intervalOverlap(w->adj[ii].bovlbgn, w->adj[ii].bovlend, w->adj[jj].bovlbgn, w->adj[jj].bovlend);

    //  If there is no overlap anywhere, we're not sure what is going on.  This could be a genomic
    //  repeat.  Leave the overlaps alone.
    //
    if ((Aoverlap == 0) &&
        (Boverlap == 0))
      continue;

    //  Remember if the overlapping ovelap is large - we'll later check if the bad region falls
    //  within here, and if there are enough spanning reads not trim.  We also use this as one more
    //  count of BAD.
    //
    if ((AcheckSub) && (Aoverlap > 1000) &&
        (BcheckSub) && (Boverlap > 1000)) {
      uint32  dist = (w->adj[ii].a_iid > w->adj[ii].b_iid) ? (w->adj[ii].a_iid - w->adj[ii].b_iid) : (w->adj[ii].b_iid - w->adj[ii].a_iid);

      if (subreadFile)
        fprintf(subreadFile, "  II %8u (%6u-%6u) %8u (%6u-%6u)  JJ %8u (%6u-%6u) %8u (%6u-%6u) %s\n",
                w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend,
                w->adj[jj].a_iid, w->adj[jj].aovlbgn, w->adj[jj].aovlend, w->adj[jj].b_iid, w->adj[jj].bovlbgn, w->adj[jj].bovlend,
                (dist > 5) ? " PALINDROME WARNING--FAR-IID--WARNING" : "PALINDROME");

      largePalindrome  = true;
    }

#if 0
    //  Otherwise, if the overlaps overlap on both reads by significant chunks, don't believe
    //  either.  These are possibly both chimeric reads, at least PacBio junction reads.
    //
    //  Or an inverted repeat.
    //
    if ((AcheckSub) && (Aoverlap > 50) &&
        (BcheckSub) && (Boverlap > 50)) {
      if (subreadFile)
        fprintf(subreadFile, "BothOv     %u (%u-%u) %u (%u-%u)  %u (%u-%u) %u (%u-%u)\n",
                w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend,
                w->adj[jj].a_iid, w->adj[jj].aovlbgn, w->adj[jj].aovlend, w->adj[jj].b_iid, w->adj[jj].bovlbgn, w->adj[jj].bovlend);
    }
#endif

#if 0
    //  Stronger overlap in the A reads.  The B read looks like it has subreads, which is perfectly fine
    //  evidence for us.  Unless they span a junction.
    //
    if ((BcheckSub) && (Boverlap < Aoverlap)) {
      if (subreadFile)
        fprintf(subreadFile, "BcheckSub  %u (%u-%u) %u (%u-%u)  %u (%u-%u) %u (%u-%u)\n",
                w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend,
                w->adj[jj].a_iid, w->adj[jj].aovlbgn, w->adj[jj].aovlend, w->adj[jj].b_iid, w->adj[jj].bovlbgn, w->adj[jj].bovlend);
    }
#endif


    //  It looks like A has sub reads if the B read has a strong overlap in overlaps, and the A read does not
    //  have a strong overlap.

    if ((Aoverlap > 250) ||
        (Boverlap < 250))
      //  A strong overlap in the A read, there isn't a sub read junction we can identifiy, OR
      //  A weak overlap in the B read, and we expected the B read to align to both of the A subreads.
      continue;


    //  Decide on a region in the read that is suspected to contain the chimer junction.
    //
    //  In the true case: ii overlap is first on the read; bad region from the end of this overlap
    //  to the start of the jj overlap.
    //
    //  Note that sometimes overlaps extend through the junction.  This will just flip the region
    //  around.  We're expecting to find non-overlapping overlaps, but if we find overlapping ones,
    //  the bad interval is still between the end points.
    //
    //    -------------->                 ------------>
    //                    <---------  vs             <---------
    //
    uint32  badbgn = (w->adj[ii].aovlbgn < w->adj[jj].aovlbgn) ? w->adj[ii].aovlend : w->adj[jj].aovlend;
    uint32  badend = (w->adj[ii].aovlbgn < w->adj[jj].aovlbgn) ? w->adj[jj].aovlbgn : w->adj[ii].aovlbgn;

    if (badbgn > badend) {
      uint32  a = badbgn;
      badbgn = badend;
      badend = a;
    }
    assert(badbgn <= badend);

    if (subreadFile)
      fprintf(subreadFile, "  II %8u (%6u-%6u) %8u (%6u-%6u)  JJ %8u (%6u-%6u) %8u (%6u-%6u)  BAD %6u-%6u size %6u %s\n",
              w->adj[ii].a_iid, w->adj[ii].aovlbgn, w->adj[ii].aovlend, w->adj[ii].b_iid, w->adj[ii].bovlbgn, w->adj[ii].bovlend,
              w->adj[jj].a_iid, w->adj[jj].aovlbgn, w->adj[jj].aovlend, w->adj[jj].b_iid, w->adj[jj].bovlbgn, w->adj[jj].bovlend,
              badbgn, badend, badend - badbgn,
              (badend - badbgn <= SUBREAD_LOOP_MAX_SIZE) ? "(EVIDENCE)" : "(too far)");


    //  A true subread signature will have a small bad interval (10 bases) and largely agree on the
    //  interval.  False signature will have a large size, and not agree.  We only check for size
    //  though.
    //
    if (badend - badbgn <= SUBREAD_LOOP_MAX_SIZE)
      BAD.add(badbgn, badend - badbgn);

    //  Save all plausible pairs.
    //
    if (badend - badbgn <= SUBREAD_LOOP_EXT_SIZE)
      BADall.add(badbgn, badend - badbgn);
  }

  //
  //  Merge all the 'bad' intervals.  Save the merged intervals for later use.
  //

  BAD.merge();
  BADall.merge();



  for (uint32 bb=0; bb<BAD.numberOfIntervals(); bb++) {
    uint32  numSpan = 0;
    uint32  allHits = 0;

    //  Find the BADall interval that corresponds to this one.  This BAD interval must be contained
    //  in a BADall (because it contains all bad intervals, while BAD is just the close stuff).
    //  Once we find it, remember the number of reads for later use.

    for (uint32 aa=0; aa<BADall.numberOfIntervals(); aa++)
      if ((BADall.lo(aa) <= BAD.lo(bb)) && (BAD.hi(bb) <= BADall.hi(aa)))
        allHits += BADall.count(aa);

    assert(allHits != 0);

    //  Count the number of reads that span this region.  If the spanning read is not from a library
    //  that might contain subreads, give it more weight.

    for (uint32 ii=0; ii<w->adjLen; ii++)
      if ((w->adj[ii].aovlbgn + 100 < BAD.lo(bb)) && (BAD.hi(bb) + 100 < w->adj[ii].aovlend))
        numSpan += (doCheckSubRead(gkp, w->adj[ii].a_iid)) ? 1 : 2;

    if (subreadFile)
      fprintf(subreadFile, "AcheckSub region %u ("F_S32"-"F_S32") with %u hits %u bighits - span %u largePalindrome %s\n",
              w->adj[0].a_iid, BAD.lo(bb), BAD.hi(bb), BAD.count(bb), allHits,
              numSpan, largePalindrome ? "true" : "false");

    if (numSpan > 9)
      //  If there are 10 or more spanning read (equivalents) this is not a subread junction.  There
      //  is plenty of evidence it is true.
      continue;

    if (BAD.count(bb) + allHits / 4 + largePalindrome < 3)
      //  If 2 or fewer reads claim this is a sub read junction, skip it.  Evidence is weak.
      continue;

    if (subreadFile)
      fprintf(subreadFile, "CONFIRMED BAD REGION %d-%d\n", BAD.lo(bb), BAD.hi(bb));

    w->blist.push_back(badRegion(w->id, badType_subread, BAD.lo(bb), BAD.hi(bb)));
  }
}