void mappedLengths(atacFile &AF, atacMatchList &matches, seqCache *A, seqCache *B, char *prefix) { histogram h1(100, 1000000); histogram h2(100, 1000000); // For the coverage to work correctly, we need to either have one // intervalList per input sequence, or build a table of the chained // sequence positions. // uint64 *offset1 = buildOffset(AF.fastaA()); uint64 *offset2 = buildOffset(AF.fastaB()); intervalList<uint64> intervalA; intervalList<uint64> intervalB; for (uint32 m=0; m<matches.numberOfMatches(); m++) { intervalA.add(offset1[matches[m]->iid1] + (uint64)matches[m]->pos1, (uint64)matches[m]->len1); intervalB.add(offset2[matches[m]->iid2] + (uint64)matches[m]->pos2, (uint64)matches[m]->len2); h1.add(matches[m]->len1); h2.add(matches[m]->len2); } fprintf(stdout, "numberOfItems "uint64FMT"\n", (uint64)matches.numberOfMatches()); fprintf(stdout, "matchLength %s "uint64FMT" %s "uint64FMT" # Sum of lengths of sequence in matches\n", AF.labelA(), (uint64)intervalA.sumOfLengths(), AF.labelB(), (uint64)intervalB.sumOfLengths()); h1.show("AmatchLength"); h2.show("BmatchLength"); h1.dump(prefix, "AmatchLength"); h1.plot(prefix, "AmatchLength"); h2.dump(prefix, "BmatchLength"); h2.plot(prefix, "BmatchLength"); intervalA.merge(); intervalB.merge(); fprintf(stdout, "coveredLength %s "uint64FMT" %s "uint64FMT" # sequence covered by a match, including N\n", AF.labelA(), (uint64)intervalA.sumOfLengths(), AF.labelB(), (uint64)intervalB.sumOfLengths()); fprintf(stdout, "coveredLength %s "uint64FMT" %s "uint64FMT" # sequence covered by a match, ACGT only (new)\n", AF.labelA(), tandemRepeatACGTLength(intervalA, offset1, A), AF.labelB(), tandemRepeatACGTLength(intervalB, offset2, B)); delete [] offset1; delete [] offset2; }
// ****************** template <class Type> void MyVector<Type>::scatter() { buildOffset(); scatter(_offset); }
void tandemRepeatStats(atacFileStream &featuresA, atacFileStream &featuresB, atacFile &AF, seqCache *A, seqCache *B) { intervalList<uint64> ifa, ifb; intervalList<uint64> ima, imb; intervalList<uint64> mma, mmb; atacMatchList &matches = *AF.matches(); uint64 *offset1 = buildOffset(A); uint64 *offset2 = buildOffset(B); // ifa, ifb are intervalLists, storing the intervals labeled as // tandem repeats. They are using the offset[] to encode the // entire sequence as one consecutive string. // atacFeature *f = 0L; while ((f = featuresA.nextFeature("tr")) != 0L) ifa.add(offset1[f->iid] + f->pos, f->len); while ((f = featuresB.nextFeature("tr")) != 0L) ifb.add(offset2[f->iid] + f->pos, f->len); // ima, imb, like if?, encode the matches in one string. // for (uint32 m=0; m<matches.numberOfMatches(); m++) ima.add(offset1[matches[m]->iid1] + (uint64)matches[m]->pos1, (uint64)matches[m]->len1); for (uint32 m=0; m<matches.numberOfMatches(); m++) imb.add(offset2[matches[m]->iid2] + (uint64)matches[m]->pos2, (uint64)matches[m]->len2); fprintf(stdout, "\nTANDEM REPEATS in %s\n", AF.labelA()); fprintf(stdout, "numberOfItems "uint64FMT"\n", (uint64)ifa.numberOfIntervals()); fprintf(stdout, "totalLength "uint64FMT" # sum of lengths of all features\n", ifa.sumOfLengths()); ifa.merge(); fprintf(stdout, "numberOfItems "uint64FMT" # after merging overlapping regions\n", (uint64)ifa.numberOfIntervals()); fprintf(stdout, "coveredLength "uint64FMT" # sequence covered by a feature, including N\n", ifa.sumOfLengths()); fprintf(stdout, "coveredLength "uint64FMT" # sequence covered by a feature, ACGT only\n", tandemRepeatACGTLength(ifa, offset1, A)); mma.intersect(ifa, ima); fprintf(stdout, "numberOfItems "uint64FMT" # after merging overlapping regions, only in matches\n", (uint64)mma.numberOfIntervals()); fprintf(stdout, "inMatches "uint64FMT" # sequence covered by a feature and in a match, including N\n", mma.sumOfLengths()); fprintf(stdout, "inMatches "uint64FMT" # sequence covered by a feature and in a match, ACGT only\n", tandemRepeatACGTLength(mma, offset1, A)); fprintf(stdout, "\nTANDEM REPEATS in %s\n", AF.labelB()); fprintf(stdout, "numberOfItems "uint64FMT"\n", (uint64)ifb.numberOfIntervals()); fprintf(stdout, "totalLength "uint64FMT" # sum of lengths of all features\n", ifb.sumOfLengths()); ifb.merge(); fprintf(stdout, "numberOfItems "uint64FMT" # after merging overlapping regions\n", (uint64)ifb.numberOfIntervals()); fprintf(stdout, "coveredLength "uint64FMT" # sequence covered by a feature, including N\n", ifb.sumOfLengths()); fprintf(stdout, "coveredLength "uint64FMT" # sequence covered by a feature, ACGT only\n", tandemRepeatACGTLength(ifb, offset2, B)); mmb.intersect(ifb, imb); fprintf(stdout, "numberOfItems "uint64FMT" # after merging overlapping regions, only in matches\n", (uint64)mmb.numberOfIntervals()); fprintf(stdout, "inMatches "uint64FMT" # sequence covered by a feature and in a match, including N\n", mmb.sumOfLengths()); fprintf(stdout, "inMatches "uint64FMT" # sequence covered by a feature and in a match, ACGT only\n", tandemRepeatACGTLength(mmb, offset2, B)); delete [] offset1; delete [] offset2; }