Ejemplo n.º 1
0
int matchFileWithLicenses(MonkState* state, const File* file, const Licenses* licenses, const MatchCallbacks* callbacks) {
  GArray* matches = findAllMatchesBetween(file, licenses,
          MAX_ALLOWED_DIFF_LENGTH, MIN_ADJACENT_MATCHES, MAX_LEADING_DIFF);
  int result = processMatches(state, file, matches, callbacks);

  // we are done: free memory
  match_array_free(matches);

  return result;
}
Ejemplo n.º 2
0
Archivo: bogus.C Proyecto: lhon/canu
int
main(int argc, char **argv) {
  uint32   nucmerNamesLen  = 0;
  uint32   snapperNamesLen = 0;
  char    *nucmerNames[1024];
  char    *snapperNames[1024];
  char    *refName             = 0L;
  char    *outputPrefix        = 0L;

  //  Output intervals must be at least minLength bases long, and be formed from at least minFrags
  //  mappings.
  uint32   minLength  = 0;
  uint32   minFrags   = 0;

  //  Input matches must be at least minIdentity.
  double   minIdentity = 0;

  //  When comparing coords to if two alignments span the same piece of the fragment,
  //  allow (twice) this much slop in the comparison.  E.g., Abgn +- 5 == Bbgn
  //
  int32  alignWobble  = 5;

  //  When constructing the unique coverage map, trim each read by this amount, on each end, to
  //  simulate the minimum overlap length needed by unitigger.  This amount is automagically added
  //  back in on output.
  int32  fragTrim     = 40 / 2;

  //  When testing if a repeat alignment is contained in a non-repeat alignment, the repeat must
  //  start at least this many bases from the end of the non-repeat alignment.  In other words, a
  //  fragment with a repeat in the middle can be uniquely placed (by overlaps) with only 20 bases of
  //  unique sequence on the end.
  int32  uniqEnd      = 40;

  //  Loading jbrowse with all the raw input reads on large(r) genomes either
  //  fails, or takes forever.  This disables the raw read output.
  bool   includeRaw = true;


  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-nucmer") == 0) {
      nucmerNames[nucmerNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-snapper") == 0) {
      snapperNames[snapperNamesLen++] = argv[++arg];

    } else if (strcmp(argv[arg], "-reference") == 0) {
      refName = argv[++arg];

    } else if (strcmp(argv[arg], "-output") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-wobble") == 0) {
      alignWobble = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-overlap") == 0) {
      fragTrim = atoi(argv[++arg]) / 2;

    } else if (strcmp(argv[arg], "-noraw") == 0) {
      includeRaw = false;

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minfrags") == 0) {
      minFrags = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minidentity") == 0) {
      minIdentity = atof(argv[++arg]);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;
    }
    arg++;
  }
  if ((nucmerNamesLen == 0) && (snapperNamesLen == 0))
    fprintf(stderr, "ERROR: No input matches supplied (either -nucmer or -snapper).\n"), err++;
  if (refName == 0L)
    fprintf(stderr, "ERROR: No reference sequence supplied (-reference).\n"), err++;
  if (outputPrefix == 0L)
    fprintf(stderr, "ERROR: No output prefix supplied (-output).\n"), err++;
  if (err) {
    exit(1);
  }

  {
    char   outputName[FILENAME_MAX];

    errno = 0;

    sprintf(outputName, "%s.intervals", outputPrefix);
    intervalOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    sprintf(outputName, "%s.gff3", outputPrefix);
    gffOutput = fopen(outputName, "w");

    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n",
              outputName, strerror(errno)), exit(1);

    fprintf(gffOutput, "##gff-version 3\n");
  }

  //  Load the reference sequence.  ASSUMES it is all on one line!

  loadReferenceSequence(refName, refList, refMap);

  //  Load all the matches into genomeAlignment.  Generate longestAlignment for each fragment.
  //  genomeAlignment::isLognest and genomeAlignment::isRepeat are computed later.

  for (uint32 nn=0; nn<nucmerNamesLen; nn++)
    loadNucmer(nucmerNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  for (uint32 nn=0; nn<snapperNamesLen; nn++)
    loadSnapper(snapperNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity);

  //if (includeRaw)
  //  writeInputsAsGFF3(outputPrefix);

  //  Process the matches fragment by fragment.  Find the longest, count the number
  //  of duplicates of the longest match, label as repeat/unique.

  sort(genome.begin(), genome.end(), byFragmentID);

  processMatches(alignWobble, uniqEnd);

  sort(genome.begin(), genome.end(), byGenomePosition);

  findSpannedMatches(uniqEnd);

  //  Now, throw all the non-spanned repeats into an intervalList, squash them to get intervals,
  //  and report the repeat regions.

  buildIntervals(outputPrefix, fragTrim, includeRaw);

  REPT.merge();
  UNIQ.merge();


  //  Search for exceptions -- one completely contained in the other

  markWeak();

  //  Extend UNIQ to cover gaps in alignments.
  //  Extend REPT to cover gaps in alignments.
  //  Merge adjacent/overlapping UNIQ-UNIQ or REPT-REPT intervals.



  //
  //  Write the output
  //

  for (uint32 ir=0, iu=0; ((ir < REPT.numberOfIntervals()) ||
                           (iu < UNIQ.numberOfIntervals())); ) {

    //  UNIQ regions are offset by fragTrim on each side

    int64  lor = (ir < REPT.numberOfIntervals()) ? REPT.lo(ir)            : 999999999;
    int64  hir = (ir < REPT.numberOfIntervals()) ? REPT.hi(ir)            : 999999999;
    int64  lou = (iu < UNIQ.numberOfIntervals()) ? UNIQ.lo(iu) - fragTrim : 999999999;
    int64  hiu = (iu < UNIQ.numberOfIntervals()) ? UNIQ.hi(iu) + fragTrim : 999999999;

    //  Search the refList for the reference sequence we are in.  We should never span reference
    //  sequences (which isn't tested, as we only know the low coordinate at this time).

    char  *refhdr = NULL;
    int64  refbgn = 0;
    int64  refend = 0;
    int64  refcnt = 0;

    if (lor < lou) {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lor) && (lor <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = REPT.lo(ir) - refList[rr].rschnBgn;
          refend = REPT.hi(ir) - refList[rr].rschnBgn;
          refcnt = REPT.count(ir);
        }

      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d REPT\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tREPT\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (REPTvalid[ir]) ? "" : " weak");

        if (REPTvalid[ir])
          fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, ir, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, REPTvalidParent[ir], refcnt);
      }

      ir++;

    } else {
      for (uint32 rr=0; rr<refList.size(); rr++)
        if ((refList[rr].rschnBgn <= lou) && (lou <= refList[rr].rschnEnd)) {
          refhdr = refList[rr].rsrefName;
          refbgn = UNIQ.lo(iu) - fragTrim - refList[rr].rschnBgn;
          refend = UNIQ.hi(iu) + fragTrim - refList[rr].rschnBgn;
          refcnt = UNIQ.count(iu);
        }

      //  Not sure why some data sets (long pacbio for example) trigger this.
#if 0
      if (refcnt == 0) {
        fprintf(stderr, "DIDN'T FIND REGION.\n");
        for (uint32 rr=0; rr<refList.size(); rr++) {
          fprintf(stderr, "  %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d UNIQ\n",
                  rr,
                  refList[rr].rschnBgn,
                  ir, lor, hir,
                  iu, lou, hiu,
                  refList[rr].rschnEnd);
        }
      }
#endif
      //assert(refcnt != 0);

      if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) {
        fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tUNIQ\t"F_S64"%s\n",
                refhdr, refbgn, refend, refcnt, (UNIQvalid[iu]) ? "" : " separation");

        if (UNIQvalid[iu])
          fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=UNIQ%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, iu, refcnt);
        else
          fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=REPT%04d;fragCount="F_S64"\n",
                  refhdr, refbgn, refend, UNIQvalidParent[iu], refcnt);
      }

      iu++;
    }
  }

  fclose(gffOutput);
  fclose(intervalOutput);

  //  See CVS version 1.3 for writing rept/uniq fasta

  return(0);
}