C++ (Cpp) intervalList::merge Examples

Programming Language: C++ (Cpp)

Class/Type: intervalList

Method/Function: merge

Examples at hotexamples.com: 2

C++ (Cpp) intervalList::merge - 2 examples found. These are the top-rated real-world C++ (Cpp) examples of intervalList::merge, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

lo(4)

hi(4)

numberOfIntervals(4)

merge(2)

add(1)

clear(1)

count(1)

ct(1)

depth(1)

Example #1

Show file

File: statsGenerator.C Project: macmanes-lab/wgs-assembler

//  Counts the bases covered by the intervals in 'il' whose letter has an encoding in
//  letterToBits[] (that is, whose table entry is not 0xff).
//
//    il     -- interval list; it is merged IN PLACE before counting, so the caller's
//              list is modified.
//    offset -- per-sequence offsets; offset[s] is subtracted from interval coordinates to
//              get positions within sequence s.  NOTE(review): the scan below reads
//              offset[s + 1] with no upper bound, so this presumably needs a final entry
//              past the last sequence -- confirm with the caller.
//    A      -- sequence cache the bases are fetched from.
//
//  Returns the number of encodable bases inside the merged intervals.
//
uint64
tandemRepeatACGTLength(intervalList<uint64> &il,
                       uint64               *offset,
                       seqCache             *A) {

  //  s -- the sequence
  //  i -- the interval list index

  il.merge();

  uint64 length       = 0;
  uint64 unknown[256] = {0};   //  Per-letter tally of unencodable bases; only used by the debug code below.

  for (uint32 i=0, s=0; i<il.numberOfIntervals(); i++) {

    //  Advance to the sequence containing the start of this interval.  's' is never reset,
    //  so this relies on the intervals being visited in increasing order.
    while ((offset[s + 1]) <= il.lo(i))
      s++;

    char *S = A->getSequenceInCore(s)->sequence();

    //  Convert to coordinates local to sequence s.
    uint64 lo = il.lo(i) - offset[s];
    uint64 hi = il.hi(i) - offset[s];

    for (uint64 j=lo; j < hi; j++) {

      //  Plain 'char' may be signed; indexing a 256-entry table with a negative value is
      //  out of bounds, so go through unsigned char first.
      unsigned char  ch = static_cast<unsigned char>(S[j]);

      if (letterToBits[ch] != 0xff)
        length++;
      else
        unknown[ch]++;
    }
  }

  //fprintf(stderr, "tandemRepeatACGTLength: "uint64FMT"\n", length);
  //for (uint32 i=0; i<256; i++)
  //  if (unknown[i] > 0)
  //    fprintf(stderr, "tandemRepeatACGTLength["uint32FMT"] = "uint64FMT" (%c)\n", i, unknown[i], i);

  return(length);
}

Example #2

Show file

File: bogus.C Project: lhon/canu

int
main(int argc, char **argv) {
  uint32   nucmerNamesLen  = 0;
  uint32   snapperNamesLen = 0;
  char    *nucmerNames[1024];
  char    *snapperNames[1024];
  char    *refName             = 0L;
  char    *outputPrefix        = 0L;

  //  Output intervals must be at least minLength bases long, and be formed from at least minFrags
  //  mappings.
  uint32   minLength  = 0;
  uint32   minFrags   = 0;

  //  Input matches must be at least minIdentity.
  double   minIdentity = 0;

  //  When comparing coords to if two alignments span the same piece of the fragment,
  //  allow (twice) this much slop in the comparison.  E.g., Abgn +- 5 == Bbgn
  //
  int32  alignWobble  = 5;

  //  When constructing the unique coverage map, trim each read by this amount, on each end, to
  //  simulate the minimum overlap length needed by unitigger.  This amount is automagically added
  //  back in on output.
  int32  fragTrim     = 40 / 2;

  //  When testing if a repeat alignment is contained in a non-repeat alignment, the repeat must
  //  start at least this many bases from the end of the non-repeat alignment.  In other words, a
  //  fragment with a repeat in the middle can be uniquely placed (by overlaps) with only 20 bases of
  //  unique sequence on the end.
  int32  uniqEnd      = 40;

  //  Loading jbrowse with all the raw input reads on large(r) genomes either
  //  fails, or takes forever.  This disables the raw read output.
  bool   includeRaw = true;


  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-nucmer") == 0) {
      nucmerNames[nucmerNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-snapper") == 0) {
      snapperNames[snapperNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-reference") == 0) {
      refName = argv[++arg];

    } else if (strcmp(argv[arg], "-output") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-wobble") == 0) {
      alignWobble = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-overlap") == 0) {
      fragTrim = atoi(argv[++arg]) / 2;

    } else if (strcmp(argv[arg], "-noraw") == 0) {
      includeRaw = false;

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minfrags") == 0) {
      minFrags = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minidentity") == 0) {
      minIdentity = atof(argv[++arg]);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;
    }
    arg++;
  }
  if ((nucmerNamesLen == 0) && (snapperNamesLen == 0))
    fprintf(stderr, "ERROR: No input matches supplied (either -nucmer or -snapper).\n"), err++;
  if (refName == 0L)
    fprintf(stderr, "ERROR: No reference sequence supplied (-reference).\n"), err++;
  if (outputPrefix == 0L)
    fprintf(stderr, "ERROR: No output prefix supplied (-output).\n"), err++;
  if (err) {
    exit(1);
  }

  {
    char   outputName[FILENAME_MAX];

    errno = 0;

    sprintf(outputName, "%s.intervals", outputPrefix);
    intervalOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    sprintf(outputName, "%s.gff3", outputPrefix);
    gffOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    fprintf(gffOutput, "##gff-version 3\n");
  }

  //  Load the reference sequence.  ASSUMES it is all on one line!

  loadReferenceSequence(refName, refList, refMap);

  //  Load all the matches into genomeAlignment.  Generate longestAlignment for each fragment.
  //  genomeAlignment::isLognest and genomeAlignment::isRepeat are computed later.

  for (uint32 nn=0; nn<nucmerNamesLen; nn++)
    loadNucmer(nucmerNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  for (uint32 nn=0; nn<snapperNamesLen; nn++)
    loadSnapper(snapperNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  //if (includeRaw)
  //  writeInputsAsGFF3(outputPrefix);

  //  Process the matches fragment by fragment.  Find the longest, count the number
  //  of duplicates of the longest match, label as repeat/unique.

  sort(genome.begin(), genome.end(), byFragmentID);

  processMatches(alignWobble, uniqEnd);

  sort(genome.begin(), genome.end(), byGenomePosition);

  findSpannedMatches(uniqEnd);

  //  Now, throw all the non-spanned repeats into an intervalList, squash them to get intervals,
  //  and report the repeat regions.

  buildIntervals(outputPrefix, fragTrim, includeRaw);

  REPT.merge();
  UNIQ.merge();


  //  Search for exceptions -- one completely contained in the other

  markWeak();

  //  Extend UNIQ to cover gaps in alignments.
  //  Extend REPT to cover gaps in alignments.
  //  Merge adjacent/overlapping UNIQ-UNIQ or REPT-REPT intervals.



  //
  //  Write the output
  //

  for (uint32 ir=0, iu=0; ((ir < REPT.numberOfIntervals()) ||
                           (iu < UNIQ.numberOfIntervals())); ) {

    //  UNIQ regions are offset by fragTrim on each side

    int64  lor = (ir < REPT.numberOfIntervals()) ? REPT.lo(ir)            : 999999999;
    int64  hir = (ir < REPT.numberOfIntervals()) ? REPT.hi(ir)            : 999999999;
    int64  lou = (iu < UNIQ.numberOfIntervals()) ? UNIQ.lo(iu) - fragTrim : 999999999;
    int64  hiu = (iu < UNIQ.numberOfIntervals()) ? UNIQ.hi(iu) + fragTrim : 999999999;

    //  Search the refList for the reference sequence we are in.  We should never span reference
    //  sequences (which isn't tested, as we only know the low coordinate at this time).

    char  *refhdr = NULL;
    int64  refbgn = 0;
    int64  refend = 0;
    int64  refcnt = 0;

    if (lor < lou) {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lor) && (lor <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = REPT.lo(ir) - refList[rr].rschnBgn;
          refend = REPT.hi(ir) - refList[rr].rschnBgn;
          refcnt = REPT.count(ir);
        }

      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d REPT\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tREPT\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (REPTvalid[ir]) ? "" : " weak");

        if (REPTvalid[ir])
          fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, ir, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, REPTvalidParent[ir], refcnt);
      }

      ir++;

    } else {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lou) && (lou <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = UNIQ.lo(iu) - fragTrim - refList[rr].rschnBgn;
          refend = UNIQ.hi(iu) + fragTrim - refList[rr].rschnBgn;
          refcnt = UNIQ.count(iu);
        }

      //  Not sure why some data sets (long pacbio for example) trigger this.
#if 0
      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d UNIQ\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
#endif
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tUNIQ\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (UNIQvalid[iu]) ? "" : " separation");

        if (UNIQvalid[iu])
          fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, iu, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, UNIQvalidParent[iu], refcnt);
      }

      iu++;
    }
  }

  fclose(gffOutput);
  fclose(intervalOutput);

  //  See CVS version 1.3 for writing rept/uniq fasta

  return(0);
}