/*!
 * \brief   l_dnaUnionByAset()
 *
 * \param[in]    da1, da2    the two number arrays to be combined
 * \return  dad with the union of the set of numbers, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See sarrayUnionByAset() for the approach.
 *      (2) Here, the key in building the sorted tree is the number itself.
 *      (3) Operations using an underlying tree are O(nlogn), which is
 *          typically less efficient than hashing, which is O(n).
 *      (4) The join is performed on a copy of da1, so da1 itself is
 *          not extended.  The copy is destroyed before returning.
 *      (5) The returned dna is a new object; callers release it with
 *          l_dnaDestroy().
 * </pre>
 */
L_DNA *
l_dnaUnionByAset(L_DNA  *da1,
                 L_DNA  *da2)
{
L_DNA  *da3, *dad;

    PROCNAME("l_dnaUnionByAset");

    if (!da1)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (!da2)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* Join into a private copy.  Fail here, with a single clear
         * message, if the copy cannot be made, rather than passing
         * NULL on to the join and duplicate-removal steps. */
    if ((da3 = l_dnaCopy(da1)) == NULL)
        return (L_DNA *)ERROR_PTR("da3 not made", procName, NULL);

        /* NOTE(review): the status returned by l_dnaJoin() is left
         * unchecked on purpose, as before.  It may report an error when
         * da2 holds no numbers, and in that case the union is still
         * well defined (the unique values of da1) -- confirm before
         * adding a check here. */
    l_dnaJoin(da3, da2, 0, -1);

        /* Eliminate duplicates */
    dad = l_dnaRemoveDupsByAset(da3);
    l_dnaDestroy(&da3);
    return dad;
}
l_int32 main(int argc, char **argv) { L_ASET *set; L_DNA *da1, *da2, *da3, *da4, *da5, *da6, *da7, *da8, *dav, *dac; L_DNAHASH *dahash; NUMA *nav, *nac; PTA *pta1, *pta2, *pta3; SARRAY *sa1, *sa2, *sa3, *sa4; lept_mkdir("lept/hash"); #if 1 /* Test string hashing with aset */ fprintf(stderr, "Set results with string hashing:\n"); sa1 = BuildShortStrings(3, 0); sa2 = BuildShortStrings(3, 1); fprintf(stderr, " size with unique strings: %d\n", sarrayGetCount(sa1)); fprintf(stderr, " size with dups: %d\n", sarrayGetCount(sa2)); startTimer(); set = l_asetCreateFromSarray(sa2); fprintf(stderr, " time to make set: %5.3f sec\n", stopTimer()); fprintf(stderr, " size of set without dups: %d\n", l_asetSize(set)); l_asetDestroy(&set); startTimer(); sa3 = sarrayRemoveDupsByAset(sa2); fprintf(stderr, " time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", sarrayGetCount(sa3)); startTimer(); sa4 = sarrayIntersectionByAset(sa1, sa2); fprintf(stderr, " time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", sarrayGetCount(sa4)); sarrayDestroy(&sa3); sarrayDestroy(&sa4); /* Test sarray set operations with dna hash. * We use the same hash function as is used with aset. 
*/ fprintf(stderr, "\nDna hash results for sarray:\n"); fprintf(stderr, " size with unique strings: %d\n", sarrayGetCount(sa1)); fprintf(stderr, " size with dups: %d\n", sarrayGetCount(sa2)); startTimer(); dahash = l_dnaHashCreateFromSarray(sa2); fprintf(stderr, " time to make hashmap: %5.3f sec\n", stopTimer()); fprintf(stderr, " entries in hashmap with dups: %d\n", l_dnaHashGetTotalCount(dahash)); l_dnaHashDestroy(&dahash); startTimer(); sarrayRemoveDupsByHash(sa2, &sa3, NULL); fprintf(stderr, " time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", sarrayGetCount(sa3)); startTimer(); sa4 = sarrayIntersectionByHash(sa1, sa2); fprintf(stderr, " time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", sarrayGetCount(sa4)); sarrayDestroy(&sa3); sarrayDestroy(&sa4); sarrayDestroy(&sa1); sarrayDestroy(&sa2); #endif #if 1 /* Test point hashing with aset. * Enter all points within a 1500 x 1500 image in pta1, and include * 450,000 duplicates in pta2. With this pt hashing function, * there are no hash collisions among any of the 400 million pixel * locations in a 20000 x 20000 image. */ pta1 = BuildPointSet(1500, 1500, 0); pta2 = BuildPointSet(1500, 1500, 1); fprintf(stderr, "\nSet results for pta:\n"); fprintf(stderr, " pta1 size with unique points: %d\n", ptaGetCount(pta1)); fprintf(stderr, " pta2 size with dups: %d\n", ptaGetCount(pta2)); startTimer(); pta3 = ptaRemoveDupsByAset(pta2); fprintf(stderr, " Time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByAset(pta1, pta2); fprintf(stderr, " Time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); #endif #if 1 /* Test pta set operations with dna hash, using the same pt hashing * function. 
Although there are no collisions in 20K x 20K images, * the dna hash implementation works properly even if there are some. */ pta1 = BuildPointSet(1500, 1500, 0); pta2 = BuildPointSet(1500, 1500, 1); fprintf(stderr, "\nDna hash results for pta:\n"); fprintf(stderr, " pta1 size with unique points: %d\n", ptaGetCount(pta1)); fprintf(stderr, " pta2 size with dups: %d\n", ptaGetCount(pta2)); startTimer(); ptaRemoveDupsByHash(pta2, &pta3, NULL); fprintf(stderr, " Time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByHash(pta1, pta2); fprintf(stderr, " Time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); #endif /* Test dna set and histo operations using dna hash */ #if 1 fprintf(stderr, "\nDna hash results for dna:\n"); da1 = l_dnaMakeSequence(0.0, 0.125, 8000); da2 = l_dnaMakeSequence(300.0, 0.125, 8000); da3 = l_dnaMakeSequence(600.0, 0.125, 8000); da4 = l_dnaMakeSequence(900.0, 0.125, 8000); da5 = l_dnaMakeSequence(1200.0, 0.125, 8000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); l_dnaJoin(da1, da4, 0, -1); l_dnaJoin(da1, da5, 0, -1); l_dnaRemoveDupsByHash(da1, &da6, &dahash); l_dnaHashDestroy(&dahash); fprintf(stderr, " dna size with dups = %d\n", l_dnaGetCount(da1)); fprintf(stderr, " dna size of unique numbers = %d\n", l_dnaGetCount(da6)); l_dnaMakeHistoByHash(da1, &dahash, &dav, &dac); nav = l_dnaConvertToNuma(dav); nac = l_dnaConvertToNuma(dac); fprintf(stderr, " dna number of histo points = %d\n", l_dnaGetCount(dac)); gplotSimpleXY1(nav, nac, GPLOT_IMPULSES, GPLOT_PNG, "/tmp/lept/hash/histo", "Histo"); da7 = l_dnaIntersectionByHash(da2, da3); fprintf(stderr, " dna number of points: da2 = %d, da3 = %d\n", l_dnaGetCount(da2), l_dnaGetCount(da3)); fprintf(stderr, " dna number of da2/da3 intersection points = %d\n", 
l_dnaGetCount(da7)); l_fileDisplay("/tmp/lept/hash/histo.png", 700, 100, 1.0); l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&dac); l_dnaDestroy(&dav); l_dnaHashDestroy(&dahash); numaDestroy(&nav); numaDestroy(&nac); #endif #if 1 da1 = l_dnaMakeSequence(0, 3, 10000); da2 = l_dnaMakeSequence(0, 5, 10000); da3 = l_dnaMakeSequence(0, 7, 10000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); fprintf(stderr, "\nDna results using set:\n"); fprintf(stderr, " da1 count: %d\n", l_dnaGetCount(da1)); set = l_asetCreateFromDna(da1); fprintf(stderr, " da1 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da4 = l_dnaUnionByAset(da2, da3); fprintf(stderr, " da4 count: %d\n", l_dnaGetCount(da4)); set = l_asetCreateFromDna(da4); fprintf(stderr, " da4 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da5 = l_dnaIntersectionByAset(da1, da2); fprintf(stderr, " da5 count: %d\n", l_dnaGetCount(da5)); set = l_asetCreateFromDna(da5); fprintf(stderr, " da5 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da6 = l_dnaMakeSequence(100000, 11, 5000); l_dnaJoin(da6, da1, 0, -1); fprintf(stderr, " da6 count: %d\n", l_dnaGetCount(da6)); set = l_asetCreateFromDna(da6); fprintf(stderr, " da6 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da7 = l_dnaIntersectionByAset(da6, da3); fprintf(stderr, " da7 count: %d\n", l_dnaGetCount(da7)); set = l_asetCreateFromDna(da7); fprintf(stderr, " da7 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da8 = l_dnaRemoveDupsByAset(da1); fprintf(stderr, " da8 count: %d\n\n", l_dnaGetCount(da8)); l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&da8); #endif return 0; }