//  Count the bases covered by the intervals in 'il' for which letterToBits[]
//  holds an encoding (any value other than 0xff).
//
//    il      -- intervals to count over.  Merged in place (il.merge()) before
//               counting, so the caller's list is modified.
//    offset  -- offset[s] is subtracted from interval coordinates to obtain
//               positions within sequence s.  offset[s+1] is read while
//               searching for the sequence containing an interval.
//               NOTE(review): presumably the array carries one extra trailing
//               entry (the total length) so the search terminates; confirm
//               against the caller.
//    A       -- cache the sequences are fetched from.
//
//  Returns the number of encoded bases.
//
//  Only the low coordinate of an interval is used to pick the sequence, so an
//  interval is assumed to lie entirely within one sequence.
//
uint64
tandemRepeatACGTLength(intervalList<uint64> &il,
                       uint64               *offset,
                       seqCache             *A) {

  //  s -- the sequence
  //  i -- the interval list index

  il.merge();

  uint64  length       = 0;
  uint64  unknown[256] = {0};   //  Per-letter tally of bases that were NOT counted (debugging aid).

  for (uint32 i=0, s=0; i<il.numberOfIntervals(); i++) {

    //  Advance to the sequence that contains the start of this interval.  's' is
    //  never reset; it only moves forward from one interval to the next.
    while ((offset[s + 1]) <= il.lo(i))
      s++;

    char   *S  = A->getSequenceInCore(s)->sequence();
    uint64  lo = il.lo(i) - offset[s];
    uint64  hi = il.hi(i) - offset[s];

    for (uint64 j=lo; j < hi; j++) {
      //  Index the 256-entry tables with an unsigned value.  Plain 'char' may be
      //  signed, in which case a byte >= 0x80 would become a negative index and
      //  read (letterToBits) or write (unknown) outside the table.
      unsigned char  letter = (unsigned char)S[j];

      if (letterToBits[letter] != 0xff)
        length++;
      else
        unknown[letter]++;
    }
  }

  //fprintf(stderr, "tandemRepeatACGTLength: "uint64FMT"\n", length);
  //for (uint32 i=0; i<256; i++)
  //  if (unknown[i] > 0)
  //    fprintf(stderr, "tandemRepeatACGTLength["uint32FMT"] = "uint64FMT" (%c)\n", i, unknown[i], i);

  return(length);
}
int main(int argc, char **argv) { uint32 nucmerNamesLen = 0; uint32 snapperNamesLen = 0; char *nucmerNames[1024]; char *snapperNames[1024]; char *refName = 0L; char *outputPrefix = 0L; // Output intervals must be at least minLength bases long, and be formed from at least minFrags // mappings. uint32 minLength = 0; uint32 minFrags = 0; // Input matches must be at least minIdentity. double minIdentity = 0; // When comparing coords to if two alignments span the same piece of the fragment, // allow (twice) this much slop in the comparison. E.g., Abgn +- 5 == Bbgn // int32 alignWobble = 5; // When constructing the unique coverage map, trim each read by this amount, on each end, to // simulate the minimum overlap length needed by unitigger. This amount is automagically added // back in on output. int32 fragTrim = 40 / 2; // When testing if a repeat alignment is contained in a non-repeat alignment, the repeat must // start at least this many bases from the end of the non-repeat alignment. In other words, a // fragment with a repeat in the middle can be uniquely placed (by overlaps) with only 20 bases of // unique sequence on the end. int32 uniqEnd = 40; // Loading jbrowse with all the raw input reads on large(r) genomes either // fails, or takes forever. This disables the raw read output. 
bool includeRaw = true; int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-nucmer") == 0) { nucmerNames[nucmerNamesLen++] = argv[++arg]; } else if (strcmp(argv[arg], "-snapper") == 0) { snapperNames[snapperNamesLen++] = argv[++arg]; } else if (strcmp(argv[arg], "-reference") == 0) { refName = argv[++arg]; } else if (strcmp(argv[arg], "-output") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-wobble") == 0) { alignWobble = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-overlap") == 0) { fragTrim = atoi(argv[++arg]) / 2; } else if (strcmp(argv[arg], "-noraw") == 0) { includeRaw = false; } else if (strcmp(argv[arg], "-minlength") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minfrags") == 0) { minFrags = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minidentity") == 0) { minIdentity = atof(argv[++arg]); } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } arg++; } if ((nucmerNamesLen == 0) && (snapperNamesLen == 0)) fprintf(stderr, "ERROR: No input matches supplied (either -nucmer or -snapper).\n"), err++; if (refName == 0L) fprintf(stderr, "ERROR: No reference sequence supplied (-reference).\n"), err++; if (outputPrefix == 0L) fprintf(stderr, "ERROR: No output prefix supplied (-output).\n"), err++; if (err) { exit(1); } { char outputName[FILENAME_MAX]; errno = 0; sprintf(outputName, "%s.intervals", outputPrefix); intervalOutput = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); sprintf(outputName, "%s.gff3", outputPrefix); gffOutput = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); fprintf(gffOutput, "##gff-version 3\n"); } // Load the reference sequence. ASSUMES it is all on one line! loadReferenceSequence(refName, refList, refMap); // Load all the matches into genomeAlignment. 
Generate longestAlignment for each fragment. // genomeAlignment::isLognest and genomeAlignment::isRepeat are computed later. for (uint32 nn=0; nn<nucmerNamesLen; nn++) loadNucmer(nucmerNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity); for (uint32 nn=0; nn<snapperNamesLen; nn++) loadSnapper(snapperNames[nn], genome, IIDmap, IIDname, refList, refMap, minIdentity); //if (includeRaw) // writeInputsAsGFF3(outputPrefix); // Process the matches fragment by fragment. Find the longest, count the number // of duplicates of the longest match, label as repeat/unique. sort(genome.begin(), genome.end(), byFragmentID); processMatches(alignWobble, uniqEnd); sort(genome.begin(), genome.end(), byGenomePosition); findSpannedMatches(uniqEnd); // Now, throw all the non-spanned repeats into an intervalList, squash them to get intervals, // and report the repeat regions. buildIntervals(outputPrefix, fragTrim, includeRaw); REPT.merge(); UNIQ.merge(); // Search for exceptions -- one completely contained in the other markWeak(); // Extend UNIQ to cover gaps in alignments. // Extend REPT to cover gaps in alignments. // Merge adjacent/overlapping UNIQ-UNIQ or REPT-REPT intervals. // // Write the output // for (uint32 ir=0, iu=0; ((ir < REPT.numberOfIntervals()) || (iu < UNIQ.numberOfIntervals())); ) { // UNIQ regions are offset by fragTrim on each side int64 lor = (ir < REPT.numberOfIntervals()) ? REPT.lo(ir) : 999999999; int64 hir = (ir < REPT.numberOfIntervals()) ? REPT.hi(ir) : 999999999; int64 lou = (iu < UNIQ.numberOfIntervals()) ? UNIQ.lo(iu) - fragTrim : 999999999; int64 hiu = (iu < UNIQ.numberOfIntervals()) ? UNIQ.hi(iu) + fragTrim : 999999999; // Search the refList for the reference sequence we are in. We should never span reference // sequences (which isn't tested, as we only know the low coordinate at this time). 
char *refhdr = NULL; int64 refbgn = 0; int64 refend = 0; int64 refcnt = 0; if (lor < lou) { for (uint32 rr=0; rr<refList.size(); rr++) if ((refList[rr].rschnBgn <= lor) && (lor <= refList[rr].rschnEnd)) { refhdr = refList[rr].rsrefName; refbgn = REPT.lo(ir) - refList[rr].rschnBgn; refend = REPT.hi(ir) - refList[rr].rschnBgn; refcnt = REPT.count(ir); } if (refcnt == 0) { fprintf(stderr, "DIDN'T FIND REGION.\n"); for (uint32 rr=0; rr<refList.size(); rr++) { fprintf(stderr, " %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d REPT\n", rr, refList[rr].rschnBgn, ir, lor, hir, iu, lou, hiu, refList[rr].rschnEnd); } } //assert(refcnt != 0); if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) { fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tREPT\t"F_S64"%s\n", refhdr, refbgn, refend, refcnt, (REPTvalid[ir]) ? "" : " weak"); if (REPTvalid[ir]) fprintf(gffOutput, "%s\t.\tbogus_rept_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=REPT%04d;fragCount="F_S64"\n", refhdr, refbgn, refend, ir, refcnt); else fprintf(gffOutput, "%s\t.\tbogus_weak_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=UNIQ%04d;fragCount="F_S64"\n", refhdr, refbgn, refend, REPTvalidParent[ir], refcnt); } ir++; } else { for (uint32 rr=0; rr<refList.size(); rr++) if ((refList[rr].rschnBgn <= lou) && (lou <= refList[rr].rschnEnd)) { refhdr = refList[rr].rsrefName; refbgn = UNIQ.lo(iu) - fragTrim - refList[rr].rschnBgn; refend = UNIQ.hi(iu) + fragTrim - refList[rr].rschnBgn; refcnt = UNIQ.count(iu); } // Not sure why some data sets (long pacbio for example) trigger this. 
#if 0 if (refcnt == 0) { fprintf(stderr, "DIDN'T FIND REGION.\n"); for (uint32 rr=0; rr<refList.size(); rr++) { fprintf(stderr, " %3u rschnBgn %6d REPT %5d %ld-%ld UNIQ %5d %ld-%ld rschnEnd %6d UNIQ\n", rr, refList[rr].rschnBgn, ir, lor, hir, iu, lou, hiu, refList[rr].rschnEnd); } } #endif //assert(refcnt != 0); if ((refcnt > 0) && (minFrags <= refcnt) && (minLength <= refend - refbgn)) { fprintf(intervalOutput, "%s\t%8"F_S64P"\t%8"F_S64P"\tUNIQ\t"F_S64"%s\n", refhdr, refbgn, refend, refcnt, (UNIQvalid[iu]) ? "" : " separation"); if (UNIQvalid[iu]) fprintf(gffOutput, "%s\t.\tbogus_uniq_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tID=UNIQ%04d;fragCount="F_S64"\n", refhdr, refbgn, refend, iu, refcnt); else fprintf(gffOutput, "%s\t.\tbogus_sepr_interval\t"F_S64"\t"F_S64"\t.\t.\t.\tParent=REPT%04d;fragCount="F_S64"\n", refhdr, refbgn, refend, UNIQvalidParent[iu], refcnt); } iu++; } } fclose(gffOutput); fclose(intervalOutput); // See CVS version 1.3 for writing rept/uniq fasta return(0); }