//  Count the bases covered by a set of intervals whose letterToBits[] entry is valid.
//
//  il      -- intervals in global coordinates.  NOTE: merged in place (il.merge()) before
//             counting, so the caller's list is modified.
//  offset  -- offset[s] is subtracted from an interval's coordinates to obtain positions
//             within sequence s; offset[s+1] is consulted to decide which sequence an
//             interval begins in.  Presumably this needs one more entry than there are
//             sequences, acting as a sentinel -- TODO confirm against callers.
//  A       -- sequence cache; the bases of sequence s come from getSequenceInCore(s)->sequence().
//
//  Returns the number of covered positions for which letterToBits[] is not 0xff.
//
//  Only il.lo(i) is used to pick the sequence, so each interval is treated as lying entirely
//  within a single sequence.
//
uint64
tandemRepeatACGTLength(intervalList<uint64> &il,
                       uint64               *offset,
                       seqCache             *A) {

  //  s -- the sequence
  //  i -- the interval list index

  il.merge();

  uint64 length = 0;
  uint64 unknown[256] = {0};   //  Per-letter tally of rejected bases; only read by the debug dump below.

  for (uint32 i=0, s=0; i<il.numberOfIntervals(); i++) {

    //  Advance to the sequence containing the start of this interval.  s is never reset, so
    //  this relies on the intervals being visited in increasing coordinate order.
    while ((offset[s + 1]) <= il.lo(i))
      s++;

    char *S = A->getSequenceInCore(s)->sequence();

    uint64 lo = il.lo(i) - offset[s];
    uint64 hi = il.hi(i) - offset[s];

    for (uint64 j=lo; j < hi; j++) {
      //  Plain char may be signed; convert before indexing so that bytes >= 0x80 cannot
      //  produce a negative index into letterToBits[] or unknown[].
      const unsigned char base = static_cast<unsigned char>(S[j]);

      if (letterToBits[base] != 0xff)
        length++;
      else
        unknown[base]++;
    }
  }

  //fprintf(stderr, "tandemRepeatACGTLength: "uint64FMT"\n", length);
  //for (uint32 i=0; i<256; i++)
  //  if (unknown[i] > 0)
  //    fprintf(stderr, "tandemRepeatACGTLength["uint32FMT"] = "uint64FMT" (%c)\n", i, unknown[i], i);

  return(length);
}
Exemple #2
0
//  Build the coverage profile of a unitig from the positions of its reads.
//
//  tig       -- the unitig; every entry of tig->ufpath contributes one interval.
//  coverage  -- output; cleared, then filled by coverage.depth() from the per-read intervals.
//
//  When DUMP_READ_COVERAGE is defined, the resulting intervals are also written to
//  "<tig id>.coverage" in the current directory, one "lo hi depth" line per interval.
//
void
findUnitigCoverage(Unitig               *tig,
                   intervalList<uint32> &coverage) {
  intervalList<uint32>  rawcoverage;

  for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
    const ufNode &frg = tig->ufpath[fi];   //  Reference; no need to copy the node.

    //  A read may be stored with bgn > end; either way, add (low coordinate, length).
    if (frg.position.bgn < frg.position.end)
      rawcoverage.add(frg.position.bgn, frg.position.end - frg.position.bgn);
    else
      rawcoverage.add(frg.position.end, frg.position.bgn - frg.position.end);
  }

  coverage.clear();
  coverage.depth(rawcoverage);

#ifdef DUMP_READ_COVERAGE
  char  fn[FILENAME_MAX];
  snprintf(fn, FILENAME_MAX, "%08u.coverage", tig->id());
  FILE *F = fopen(fn, "w");

  //  The dump is a debugging aid; if the file can't be created, skip it instead of
  //  writing through a NULL FILE pointer.
  if (F == NULL) {
    fprintf(stderr, "findUnitigCoverage()-- failed to open '%s' for writing; coverage not dumped.\n", fn);
    return;
  }

  for (uint32 ii=0; ii<coverage.numberOfIntervals(); ii++)
    fprintf(F, "%u %u %u\n", coverage.lo(ii), coverage.hi(ii), coverage.depth(ii));

  fclose(F);
#endif
}
Exemple #3
0
int
main(int argc, char **argv) {
  uint32   nucmerNamesLen  = 0;
  uint32   snapperNamesLen = 0;
  char    *nucmerNames[1024];
  char    *snapperNames[1024];
  char    *refName             = 0L;
  char    *outputPrefix        = 0L;

  //  Output intervals must be at least minLength bases long, and be formed from at least minFrags
  //  mappings.
  uint32   minLength  = 0;
  uint32   minFrags   = 0;

  //  Input matches must be at least minIdentity.
  double   minIdentity = 0;

  //  When comparing coords to if two alignments span the same piece of the fragment,
  //  allow (twice) this much slop in the comparison.  E.g., Abgn +- 5 == Bbgn
  //
  int32  alignWobble  = 5;

  //  When constructing the unique coverage map, trim each read by this amount, on each end, to
  //  simulate the minimum overlap length needed by unitigger.  This amount is automagically added
  //  back in on output.
  int32  fragTrim     = 40 / 2;

  //  When testing if a repeat alignment is contained in a non-repeat alignment, the repeat must
  //  start at least this many bases from the end of the non-repeat alignment.  In other words, a
  //  fragment with a repeat in the middle can be uniquely placed (by overlaps) with only 20 bases of
  //  unique sequence on the end.
  int32  uniqEnd      = 40;

  //  Loading jbrowse with all the raw input reads on large(r) genomes either
  //  fails, or takes forever.  This disables the raw read output.
  bool   includeRaw = true;


  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-nucmer") == 0) {
      nucmerNames[nucmerNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-snapper") == 0) {
      snapperNames[snapperNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-reference") == 0) {
      refName = argv[++arg];

    } else if (strcmp(argv[arg], "-output") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-wobble") == 0) {
      alignWobble = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-overlap") == 0) {
      fragTrim = atoi(argv[++arg]) / 2;

    } else if (strcmp(argv[arg], "-noraw") == 0) {
      includeRaw = false;

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minfrags") == 0) {
      minFrags = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minidentity") == 0) {
      minIdentity = atof(argv[++arg]);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;
    }
    arg++;
  }
  if ((nucmerNamesLen == 0) && (snapperNamesLen == 0))
    fprintf(stderr, "ERROR: No input matches supplied (either -nucmer or -snapper).\n"), err++;
  if (refName == 0L)
    fprintf(stderr, "ERROR: No reference sequence supplied (-reference).\n"), err++;
  if (outputPrefix == 0L)
    fprintf(stderr, "ERROR: No output prefix supplied (-output).\n"), err++;
  if (err) {
    exit(1);
  }

  {
    char   outputName[FILENAME_MAX];

    errno = 0;

    sprintf(outputName, "%s.intervals", outputPrefix);
    intervalOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    sprintf(outputName, "%s.gff3", outputPrefix);
    gffOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    fprintf(gffOutput, "##gff-version 3\n");
  }

  //  Load the reference sequence.  ASSUMES it is all on one line!

  loadReferenceSequence(refName, refList, refMap);

  //  Load all the matches into genomeAlignment.  Generate longestAlignment for each fragment.
  //  genomeAlignment::isLognest and genomeAlignment::isRepeat are computed later.

  for (uint32 nn=0; nn<nucmerNamesLen; nn++)
    loadNucmer(nucmerNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  for (uint32 nn=0; nn<snapperNamesLen; nn++)
    loadSnapper(snapperNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  //if (includeRaw)
  //  writeInputsAsGFF3(outputPrefix);

  //  Process the matches fragment by fragment.  Find the longest, count the number
  //  of duplicates of the longest match, label as repeat/unique.

  sort(genome.begin(), genome.end(), byFragmentID);

  processMatches(alignWobble, uniqEnd);

  sort(genome.begin(), genome.end(), byGenomePosition);

  findSpannedMatches(uniqEnd);

  //  Now, throw all the non-spanned repeats into an intervalList, squash them to get intervals,
  //  and report the repeat regions.

  buildIntervals(outputPrefix, fragTrim, includeRaw);

  REPT.merge();
  UNIQ.merge();


  //  Search for exceptions -- one completely contained in the other

  markWeak();

  //  Extend UNIQ to cover gaps in alignments.
  //  Extend REPT to cover gaps in alignments.
  //  Merge adjacent/overlapping UNIQ-UNIQ or REPT-REPT intervals.



  //
  //  Write the output
  //

  for (uint32 ir=0, iu=0; ((ir < REPT.numberOfIntervals()) ||
                           (iu < UNIQ.numberOfIntervals())); ) {

    //  UNIQ regions are offset by fragTrim on each side

    int64  lor = (ir < REPT.numberOfIntervals()) ? REPT.lo(ir)            : 999999999;
    int64  hir = (ir < REPT.numberOfIntervals()) ? REPT.hi(ir)            : 999999999;
    int64  lou = (iu < UNIQ.numberOfIntervals()) ? UNIQ.lo(iu) - fragTrim : 999999999;
    int64  hiu = (iu < UNIQ.numberOfIntervals()) ? UNIQ.hi(iu) + fragTrim : 999999999;

    //  Search the refList for the reference sequence we are in.  We should never span reference
    //  sequences (which isn't tested, as we only know the low coordinate at this time).

    char  *refhdr = NULL;
    int64  refbgn = 0;
    int64  refend = 0;
    int64  refcnt = 0;

    if (lor < lou) {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lor) && (lor <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = REPT.lo(ir) - refList[rr].rschnBgn;
          refend = REPT.hi(ir) - refList[rr].rschnBgn;
          refcnt = REPT.count(ir);
        }

      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d REPT\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tREPT\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (REPTvalid[ir]) ? "" : " weak");

        if (REPTvalid[ir])
          fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, ir, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, REPTvalidParent[ir], refcnt);
      }

      ir++;

    } else {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lou) && (lou <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = UNIQ.lo(iu) - fragTrim - refList[rr].rschnBgn;
          refend = UNIQ.hi(iu) + fragTrim - refList[rr].rschnBgn;
          refcnt = UNIQ.count(iu);
        }

      //  Not sure why some data sets (long pacbio for example) trigger this.
#if 0
      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d UNIQ\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
#endif
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tUNIQ\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (UNIQvalid[iu]) ? "" : " separation");

        if (UNIQvalid[iu])
          fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, iu, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, UNIQvalidParent[iu], refcnt);
      }

      iu++;
    }
  }

  fclose(gffOutput);
  fclose(intervalOutput);

  //  See CVS version 1.3 for writing rept/uniq fasta

  return(0);
}
Exemple #4
0
void
markWeak(void) {
  REPTvalid = new bool [REPT.numberOfIntervals()];
  UNIQvalid = new bool [UNIQ.numberOfIntervals()];

  REPTvalidParent = new int32 [REPT.numberOfIntervals()];
  UNIQvalidParent = new int32 [UNIQ.numberOfIntervals()];

  for (uint32 i=0; i<REPT.numberOfIntervals(); i++) {
    REPTvalid[i] = true;
    REPTvalidParent[i] = 0;
  }
  for (uint32 i=0; i<UNIQ.numberOfIntervals(); i++) {
    UNIQvalid[i] = true;
    UNIQvalidParent[i] = 0;
  }

  int32   UNIQexceptions=0;
  int32   REPTexceptions=0;

  for (uint32 ir=0; ir<REPT.numberOfIntervals(); ir++) {
    for (uint32 iu=0; iu<UNIQ.numberOfIntervals(); iu++) {
      if ((REPT.lo(ir) <= UNIQ.lo(iu)) &&
          (UNIQ.hi(iu) <= REPT.hi(ir))) {
        //fprintf(stderr, "EXCEPTION:  UNIQ %ld,%ld len=%ld ct=%d contained in REPT %ld,%ld len=%ld ct=%d\n",
        //        UNIQ.lo(iu), UNIQ.hi(iu), UNIQ.hi(iu) - UNIQ.lo(iu), UNIQ.count(iu),
        //        REPT.lo(ir), REPT.hi(ir), REPT.hi(ir) - REPT.lo(ir), REPT.count(ir));
        UNIQvalid[iu] = false;
        UNIQvalidParent[iu] = ir;
        UNIQexceptions++;
      }
      if ((UNIQ.lo(iu) <= REPT.lo(ir)) &&
          (REPT.hi(ir) <= UNIQ.hi(iu))) {
        //fprintf(stderr, "EXCEPTION:  REPT %ld,%ld len=%ld  ct=%d contained in UNIQ %ld,%ld len=%ld ct=%d\n",
        //        REPT.lo(ir), REPT.hi(ir), REPT.hi(ir) - REPT.lo(ir), REPT.count(ir),
        //        UNIQ.lo(iu), UNIQ.hi(iu), UNIQ.hi(iu) - UNIQ.lo(iu), UNIQ.count(iu));
        REPTvalid[ir] = false;
        REPTvalidParent[ir] = iu;
        REPTexceptions++;
      }
    }
  }

  fprintf(stderr, "Found %d REPT intervals, and %d REPT weak intervals.\n", REPT.numberOfIntervals() - REPTexceptions, REPTexceptions);
  fprintf(stderr, "Found %d UNIQ intervals, and %d UNIQ weak intervals.\n", UNIQ.numberOfIntervals() - UNIQexceptions, UNIQexceptions);
}
Exemple #5
0
static
void
buildIntervals(char *outputPrefix, int32 fragTrim, bool includeRaw) {

  for (uint32 i=0; i<genome.size(); i++) {
    int32  frgIID = genome[i].frgIID;
    int32  bgnOff = longest[frgIID].rptBgn;
    int32  endOff = longest[frgIID].frgLen - longest[frgIID].rptEnd;
    int32  frgOff = (genome[i].isReverse) ? endOff : bgnOff;
    int32  len    = genome[i].chnEnd - genome[i].chnBgn;
    char  *refhdr = refList[genome[i].genIID].rsrefName;

    if (genome[i].isSpanned == true) {
#ifdef REPORT_INTERVALS_IN_GFF
      fprintf(gffOutput, "\n");
      fprintf(gffOutput, "#SPAN frgIID %8d frg %8d,%8d rev %c gen %8d,%8d\n",
              frgIID,
              genome[i].frgBgn, genome[i].frgEnd,
              genome[i].isReverse ? 'r' : 'f',
              genome[i].genBgn, genome[i].genEnd);
#endif
      if (includeRaw)
        fprintf(gffOutput, "%s\t.\tbogus_span_input\t%d\t%d\t.\t%c\t.\tID=SPAN%08d-frag%08d;Name=%s;Note=%d-%d\n",
                refhdr,                                //  reference name
                genome[i].genBgn, genome[i].genEnd,    //  reference position
                (genome[i].isReverse) ? '-' : '+',     //  strand
                i,                                     //  ID - match id
                genome[i].frgIID,                      //  ID - frag id
                IIDname[genome[i].frgIID].c_str(),     //  Name - actual sequence name
                genome[i].frgBgn, genome[i].frgEnd);   //  Note - position on frag
      continue;
    }

    if (genome[i].isRepeat == true) {
      REPT.add(genome[i].chnBgn, len);
#ifdef REPORT_INTERVALS_IN_GFF
      fprintf(gffOutput, "\n");
      fprintf(gffOutput, "#REPT frgIID %8d frg %8d,%8d rev %c gen %8d,%8d\n",
              frgIID,
              genome[i].frgBgn, genome[i].frgEnd,
              genome[i].isReverse ? 'r' : 'f',
              genome[i].genBgn, genome[i].genEnd);
#endif
      if (includeRaw)
        fprintf(gffOutput, "%s\t.\tbogus_rept_input\t%d\t%d\t.\t%c\t.\tID=REPT%08d-frag%08d;Name=%s;Note=%d-%d\n",
                refhdr,                                //  reference name
                genome[i].genBgn, genome[i].genEnd,    //  reference position
                (genome[i].isReverse) ? '-' : '+',     //  strand
                i,                                     //  ID - match id
                genome[i].frgIID,                      //  ID - frag id
                IIDname[genome[i].frgIID].c_str(),     //  Name - actual sequence name
                genome[i].frgBgn, genome[i].frgEnd);   //  Note - position on frag
      continue;
    }

    //  Allow the fragment to extend into the repeat region.  We've added that region
    //  as a separate repeat alignment.  If something else spans it, the alignment will be removed,
    //  and we'll get a good overlap.
    bgnOff = 0;
    endOff = 0;

    len -= bgnOff;
    len -= endOff;
    len -= fragTrim * 2;

    if (len <= 0)
      continue;

    UNIQ.add(genome[i].chnBgn + frgOff + fragTrim,
             len);

#ifdef REPORT_INTERVALS_IN_GFF
    fprintf(gffOutput, "\n");
    fprintf(gffOutput, "#UNIQ frgIID %8d frg %8d,%8d rev %c gen %8d,%8d mod %8d,%8d\n",
            frgIID,
            genome[i].frgBgn, genome[i].frgEnd,
            genome[i].isReverse ? 'r' : 'f',
            genome[i].genBgn, genome[i].genEnd,
            genome[i].genBgn + frgOff + fragTrim,
            genome[i].genBgn + frgOff + fragTrim + len);
#endif
    if (includeRaw)
      fprintf(gffOutput, "%s\t.\tbogus_uniq_input\t%d\t%d\t.\t%c\t.\tID=UNIQ%08d-frag%08d;Name=%s;Note=%d-%d\n",
              refhdr,                                //  reference name
              genome[i].genBgn, genome[i].genEnd,    //  reference position
              (genome[i].isReverse) ? '-' : '+',     //  strand
              i,                                     //  ID - match id
              genome[i].frgIID,                      //  ID - frag id
              IIDname[genome[i].frgIID].c_str(),     //  Name - actual sequence name
              genome[i].frgBgn, genome[i].frgEnd);   //  Note - position on frag
  }
}