l_int32 main(int argc, char **argv) { L_ASET *s; PIX *pix; setLeptDebugOK(1); pix = pixRead("weasel8.240c.png"); /* Build the set from all the pixels. */ s = BuildSet(pix, 1, FALSE); TestSetIterator(s, FALSE); l_asetDestroy(&s); /* Ditto, but just with a few pixels */ s = BuildSet(pix, 10, TRUE); TestSetIterator(s, TRUE); l_asetDestroy(&s); pixDestroy(&pix); pix = pixRead("marge.jpg"); startTimer(); s = BuildSet(pix, 1, FALSE); fprintf(stderr, "Time (250K pixels): %7.3f sec\n", stopTimer()); TestSetIterator(s, FALSE); l_asetDestroy(&s); pixDestroy(&pix); return 0; }
/*!
 *  ptaRemoveDupsByAset()
 *
 *      Input:  ptas (coordinates are read as integer values)
 *      Return: ptad (with duplicates removed), or null on error
 *
 *  Notes:
 *      (1) Each point is reduced to a 64-bit key with l_hashPtToUint64().
 *          A point is copied to the output only the first time its key
 *          is seen, so the output keeps the input order.
 *      (2) This is slower than ptaRemoveDupsByHash(), mostly because
 *          of the nlogn sort to build up the rbtree.  Do not use for
 *          large numbers of points (say, > 1M).
 */
PTA *
ptaRemoveDupsByAset(PTA  *ptas)
{
l_int32   idx, npts, xi, yi;
l_uint64  ptkey;
L_ASET   *seen;
PTA      *ptad;
RB_TYPE   probe;

    PROCNAME("ptaRemoveDupsByAset");

    if (!ptas)
        return (PTA *)ERROR_PTR("ptas not defined", procName, NULL);

    seen = l_asetCreate(L_UINT_TYPE);
    npts = ptaGetCount(ptas);
    ptad = ptaCreate(npts);
    for (idx = 0; idx < npts; idx++) {
        ptaGetIPt(ptas, idx, &xi, &yi);
        l_hashPtToUint64(xi, yi, &ptkey);
        probe.utype = ptkey;

            /* Already emitted a point with this key */
        if (l_asetFind(seen, probe) != NULL)
            continue;

        ptaAddPt(ptad, xi, yi);
        l_asetInsert(seen, probe);
    }

    l_asetDestroy(&seen);
    return ptad;
}
/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas
 * \return  sad with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHash() for large string arrays.
 *      (2) Each string is represented in the set by a 64-bit hash
 *          computed with l_hashStringToUint64().
 *      (3) For every input string, look up its hash in the set.  If it
 *          is absent, copy the string to the output sarray and insert
 *          the hash.  The output therefore holds the first occurrence
 *          of each string, in input order.
 * </pre>
 */
SARRAY *
sarrayRemoveDupsByAset(SARRAY  *sas)
{
char     *text;
l_int32   idx, nstr;
l_uint64  strkey;
L_ASET   *seen;
RB_TYPE   probe;
SARRAY   *sad;

    PROCNAME("sarrayRemoveDupsByAset");

    if (!sas)
        return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);

    seen = l_asetCreate(L_UINT_TYPE);
    sad = sarrayCreate(0);
    nstr = sarrayGetCount(sas);
    for (idx = 0; idx < nstr; idx++) {
            /* Borrowed pointer; the string stays owned by sas */
        text = sarrayGetString(sas, idx, L_NOCOPY);
        l_hashStringToUint64(text, &strkey);
        probe.utype = strkey;

        if (l_asetFind(seen, probe) != NULL)
            continue;

        sarrayAddString(sad, text, L_COPY);
        l_asetInsert(seen, probe);
    }

    l_asetDestroy(&seen);
    return sad;
}
/*!
 * \brief   l_dnaRemoveDupsByAset()
 *
 * \param[in]    das
 * \return  dad with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The number itself is the set key; no hashing is involved.
 *      (2) A value is appended to the output only the first time it is
 *          encountered, so the output keeps the input order.
 * </pre>
 */
L_DNA *
l_dnaRemoveDupsByAset(L_DNA  *das)
{
l_int32    idx, nvals;
l_float64  number;
L_ASET    *seen;
L_DNA     *dad;
RB_TYPE    probe;

    PROCNAME("l_dnaRemoveDupsByAset");

    if (!das)
        return (L_DNA *)ERROR_PTR("das not defined", procName, NULL);

    seen = l_asetCreate(L_FLOAT_TYPE);
    dad = l_dnaCreate(0);
    nvals = l_dnaGetCount(das);
    for (idx = 0; idx < nvals; idx++) {
        l_dnaGetDValue(das, idx, &number);
        probe.ftype = number;

        if (l_asetFind(seen, probe) != NULL)
            continue;

        l_dnaAddNumber(dad, number);
        l_asetInsert(seen, probe);
    }

    l_asetDestroy(&seen);
    return dad;
}
/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the intersection of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: put the larger sarray into a set (set1), built
 *          with l_asetCreateFromSarray().  Then run through the smaller
 *          sarray, hashing each string with l_hashStringToUint64():
 *          if the hash is in set1 but not yet in a second set (set2),
 *          copy the string to the output sarray and insert the hash
 *          into set2.  The second set is required to make sure only
 *          one instance of each string is put into the output sarray.
 *      (2) The lookups assume set1 is keyed by the same 64-bit string
 *          hash; l_asetCreateFromSarray() is not shown here.
 *      (3) When the two arrays have equal size, sa2 is treated as the
 *          smaller one and sa1 goes into set1.
 *      (4) Neither input array is modified or destroyed.
 * </pre>
 */
SARRAY *
sarrayIntersectionByAset(SARRAY  *sa1,
                         SARRAY  *sa2)
{
char     *text;
l_int32   count1, count2, idx, nsmall;
l_uint64  strkey;
L_ASET   *in_big, *emitted;
RB_TYPE   probe;
SARRAY   *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Put the elements of the biggest array into a set.
         * sa_small and sa_big only alias the inputs; do not destroy. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sa_small = sa1;
        sa_big = sa2;
    } else {
        sa_small = sa2;
        sa_big = sa1;
    }
    in_big = l_asetCreateFromSarray(sa_big);

        /* Build up the intersection of strings */
    sad = sarrayCreate(0);
    nsmall = sarrayGetCount(sa_small);
    emitted = l_asetCreate(L_UINT_TYPE);
    for (idx = 0; idx < nsmall; idx++) {
        text = sarrayGetString(sa_small, idx, L_NOCOPY);
        l_hashStringToUint64(text, &strkey);
        probe.utype = strkey;

            /* Skip strings not in the big array, and repeats */
        if (!l_asetFind(in_big, probe) || l_asetFind(emitted, probe))
            continue;

        sarrayAddString(sad, text, L_COPY);
        l_asetInsert(emitted, probe);
    }

    l_asetDestroy(&in_big);
    l_asetDestroy(&emitted);
    return sad;
}
/*!
 *  ptaIntersectionByAset()
 *
 *      Input:  pta1, pta2
 *      Return: ptad (intersection of the point sets), or null on error
 *
 *  Notes:
 *      (1) The larger pta is put into a set (set1) with
 *          l_asetCreateFromPta().  Each point of the smaller pta is
 *          then added to the output if its key is in set1 and has not
 *          already been output; a second set tracks what was output.
 *      (2) The key is a 64-bit hash from the (x,y) pair, computed with
 *          l_hashPtToUint64().
 *      (3) When the two arrays have equal size, pta2 is treated as the
 *          smaller one and pta1 goes into set1.
 *      (4) This is slower than ptaIntersectionByHash(), mostly because
 *          of the nlogn sort to build up the rbtree.  Do not use for
 *          large numbers of points (say, > 1M).
 */
PTA *
ptaIntersectionByAset(PTA  *pta1,
                      PTA  *pta2)
{
l_int32   count1, count2, idx, nsmall, xi, yi;
l_uint64  ptkey;
L_ASET   *in_big, *emitted;
RB_TYPE   probe;
PTA      *pta_small, *pta_big, *ptad;

    PROCNAME("ptaIntersectionByAset");

    if (!pta1)
        return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL);
    if (!pta2)
        return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL);

        /* Put the elements of the biggest array into a set.
         * pta_small and pta_big only alias the inputs; do not destroy. */
    count1 = ptaGetCount(pta1);
    count2 = ptaGetCount(pta2);
    if (count1 < count2) {
        pta_small = pta1;
        pta_big = pta2;
    } else {
        pta_small = pta2;
        pta_big = pta1;
    }
    in_big = l_asetCreateFromPta(pta_big);

        /* Build up the intersection of points */
    ptad = ptaCreate(0);
    nsmall = ptaGetCount(pta_small);
    emitted = l_asetCreate(L_UINT_TYPE);
    for (idx = 0; idx < nsmall; idx++) {
        ptaGetIPt(pta_small, idx, &xi, &yi);
        l_hashPtToUint64(xi, yi, &ptkey);
        probe.utype = ptkey;

            /* Skip points not in the big array, and repeats */
        if (!l_asetFind(in_big, probe) || l_asetFind(emitted, probe))
            continue;

        ptaAddPt(ptad, xi, yi);
        l_asetInsert(emitted, probe);
    }

    l_asetDestroy(&in_big);
    l_asetDestroy(&emitted);
    return ptad;
}
/*!
 * \brief   l_dnaIntersectionByAset()
 *
 * \param[in]    da1, da2
 * \return  dad with the intersection of the two arrays, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See sarrayIntersectionByAset() for the approach: the larger
 *          array goes into a set, the smaller one is scanned, and a
 *          second set prevents a value from being output twice.
 *      (2) Here, the key in building the sorted tree is the number itself.
 *      (3) When the two arrays have equal size, da2 is treated as the
 *          smaller one and da1 goes into the first set.
 *      (4) Operations using an underlying tree are O(nlogn), which is
 *          typically less efficient than hashing, which is O(n).
 * </pre>
 */
L_DNA *
l_dnaIntersectionByAset(L_DNA  *da1,
                        L_DNA  *da2)
{
l_int32    count1, count2, idx, nsmall;
l_float64  number;
L_ASET    *in_big, *emitted;
RB_TYPE    probe;
L_DNA     *da_small, *da_big, *dad;

    PROCNAME("l_dnaIntersectionByAset");

    if (!da1)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (!da2)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* Put the elements of the largest array into a set.
         * da_small and da_big only alias the inputs; do not destroy. */
    count1 = l_dnaGetCount(da1);
    count2 = l_dnaGetCount(da2);
    if (count1 < count2) {
        da_small = da1;
        da_big = da2;
    } else {
        da_small = da2;
        da_big = da1;
    }
    in_big = l_asetCreateFromDna(da_big);

        /* Build up the intersection of floats */
    dad = l_dnaCreate(0);
    nsmall = l_dnaGetCount(da_small);
    emitted = l_asetCreate(L_FLOAT_TYPE);
    for (idx = 0; idx < nsmall; idx++) {
        l_dnaGetDValue(da_small, idx, &number);
        probe.ftype = number;

            /* Skip values not in the big array, and repeats */
        if (!l_asetFind(in_big, probe) || l_asetFind(emitted, probe))
            continue;

        l_dnaAddNumber(dad, number);
        l_asetInsert(emitted, probe);
    }

    l_asetDestroy(&in_big);
    l_asetDestroy(&emitted);
    return dad;
}
l_int32 main(int argc, char **argv) { L_ASET *set; L_DNA *da1, *da2, *da3, *da4, *da5, *da6, *da7, *da8, *dav, *dac; L_DNAHASH *dahash; NUMA *nav, *nac; PTA *pta1, *pta2, *pta3; SARRAY *sa1, *sa2, *sa3, *sa4; lept_mkdir("lept/hash"); #if 1 /* Test string hashing with aset */ fprintf(stderr, "Set results with string hashing:\n"); sa1 = BuildShortStrings(3, 0); sa2 = BuildShortStrings(3, 1); fprintf(stderr, " size with unique strings: %d\n", sarrayGetCount(sa1)); fprintf(stderr, " size with dups: %d\n", sarrayGetCount(sa2)); startTimer(); set = l_asetCreateFromSarray(sa2); fprintf(stderr, " time to make set: %5.3f sec\n", stopTimer()); fprintf(stderr, " size of set without dups: %d\n", l_asetSize(set)); l_asetDestroy(&set); startTimer(); sa3 = sarrayRemoveDupsByAset(sa2); fprintf(stderr, " time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", sarrayGetCount(sa3)); startTimer(); sa4 = sarrayIntersectionByAset(sa1, sa2); fprintf(stderr, " time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", sarrayGetCount(sa4)); sarrayDestroy(&sa3); sarrayDestroy(&sa4); /* Test sarray set operations with dna hash. * We use the same hash function as is used with aset. 
*/ fprintf(stderr, "\nDna hash results for sarray:\n"); fprintf(stderr, " size with unique strings: %d\n", sarrayGetCount(sa1)); fprintf(stderr, " size with dups: %d\n", sarrayGetCount(sa2)); startTimer(); dahash = l_dnaHashCreateFromSarray(sa2); fprintf(stderr, " time to make hashmap: %5.3f sec\n", stopTimer()); fprintf(stderr, " entries in hashmap with dups: %d\n", l_dnaHashGetTotalCount(dahash)); l_dnaHashDestroy(&dahash); startTimer(); sarrayRemoveDupsByHash(sa2, &sa3, NULL); fprintf(stderr, " time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", sarrayGetCount(sa3)); startTimer(); sa4 = sarrayIntersectionByHash(sa1, sa2); fprintf(stderr, " time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", sarrayGetCount(sa4)); sarrayDestroy(&sa3); sarrayDestroy(&sa4); sarrayDestroy(&sa1); sarrayDestroy(&sa2); #endif #if 1 /* Test point hashing with aset. * Enter all points within a 1500 x 1500 image in pta1, and include * 450,000 duplicates in pta2. With this pt hashing function, * there are no hash collisions among any of the 400 million pixel * locations in a 20000 x 20000 image. */ pta1 = BuildPointSet(1500, 1500, 0); pta2 = BuildPointSet(1500, 1500, 1); fprintf(stderr, "\nSet results for pta:\n"); fprintf(stderr, " pta1 size with unique points: %d\n", ptaGetCount(pta1)); fprintf(stderr, " pta2 size with dups: %d\n", ptaGetCount(pta2)); startTimer(); pta3 = ptaRemoveDupsByAset(pta2); fprintf(stderr, " Time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByAset(pta1, pta2); fprintf(stderr, " Time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); #endif #if 1 /* Test pta set operations with dna hash, using the same pt hashing * function. 
Although there are no collisions in 20K x 20K images, * the dna hash implementation works properly even if there are some. */ pta1 = BuildPointSet(1500, 1500, 0); pta2 = BuildPointSet(1500, 1500, 1); fprintf(stderr, "\nDna hash results for pta:\n"); fprintf(stderr, " pta1 size with unique points: %d\n", ptaGetCount(pta1)); fprintf(stderr, " pta2 size with dups: %d\n", ptaGetCount(pta2)); startTimer(); ptaRemoveDupsByHash(pta2, &pta3, NULL); fprintf(stderr, " Time to remove dups: %5.3f sec\n", stopTimer()); fprintf(stderr, " size without dups = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByHash(pta1, pta2); fprintf(stderr, " Time to intersect: %5.3f sec\n", stopTimer()); fprintf(stderr, " intersection size = %d\n", ptaGetCount(pta3)); ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); #endif /* Test dna set and histo operations using dna hash */ #if 1 fprintf(stderr, "\nDna hash results for dna:\n"); da1 = l_dnaMakeSequence(0.0, 0.125, 8000); da2 = l_dnaMakeSequence(300.0, 0.125, 8000); da3 = l_dnaMakeSequence(600.0, 0.125, 8000); da4 = l_dnaMakeSequence(900.0, 0.125, 8000); da5 = l_dnaMakeSequence(1200.0, 0.125, 8000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); l_dnaJoin(da1, da4, 0, -1); l_dnaJoin(da1, da5, 0, -1); l_dnaRemoveDupsByHash(da1, &da6, &dahash); l_dnaHashDestroy(&dahash); fprintf(stderr, " dna size with dups = %d\n", l_dnaGetCount(da1)); fprintf(stderr, " dna size of unique numbers = %d\n", l_dnaGetCount(da6)); l_dnaMakeHistoByHash(da1, &dahash, &dav, &dac); nav = l_dnaConvertToNuma(dav); nac = l_dnaConvertToNuma(dac); fprintf(stderr, " dna number of histo points = %d\n", l_dnaGetCount(dac)); gplotSimpleXY1(nav, nac, GPLOT_IMPULSES, GPLOT_PNG, "/tmp/lept/hash/histo", "Histo"); da7 = l_dnaIntersectionByHash(da2, da3); fprintf(stderr, " dna number of points: da2 = %d, da3 = %d\n", l_dnaGetCount(da2), l_dnaGetCount(da3)); fprintf(stderr, " dna number of da2/da3 intersection points = %d\n", 
l_dnaGetCount(da7)); l_fileDisplay("/tmp/lept/hash/histo.png", 700, 100, 1.0); l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&dac); l_dnaDestroy(&dav); l_dnaHashDestroy(&dahash); numaDestroy(&nav); numaDestroy(&nac); #endif #if 1 da1 = l_dnaMakeSequence(0, 3, 10000); da2 = l_dnaMakeSequence(0, 5, 10000); da3 = l_dnaMakeSequence(0, 7, 10000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); fprintf(stderr, "\nDna results using set:\n"); fprintf(stderr, " da1 count: %d\n", l_dnaGetCount(da1)); set = l_asetCreateFromDna(da1); fprintf(stderr, " da1 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da4 = l_dnaUnionByAset(da2, da3); fprintf(stderr, " da4 count: %d\n", l_dnaGetCount(da4)); set = l_asetCreateFromDna(da4); fprintf(stderr, " da4 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da5 = l_dnaIntersectionByAset(da1, da2); fprintf(stderr, " da5 count: %d\n", l_dnaGetCount(da5)); set = l_asetCreateFromDna(da5); fprintf(stderr, " da5 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da6 = l_dnaMakeSequence(100000, 11, 5000); l_dnaJoin(da6, da1, 0, -1); fprintf(stderr, " da6 count: %d\n", l_dnaGetCount(da6)); set = l_asetCreateFromDna(da6); fprintf(stderr, " da6 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da7 = l_dnaIntersectionByAset(da6, da3); fprintf(stderr, " da7 count: %d\n", l_dnaGetCount(da7)); set = l_asetCreateFromDna(da7); fprintf(stderr, " da7 set size: %d\n\n", l_asetSize(set)); l_asetDestroy(&set); da8 = l_dnaRemoveDupsByAset(da1); fprintf(stderr, " da8 count: %d\n\n", l_dnaGetCount(da8)); l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&da8); #endif return 0; }
/*!
 * \brief   pixGetSortedNeighborValues()
 *
 * \param[in]    pixs     8, 16 or 32 bpp, with pixels labeled by c.c.
 * \param[in]    x, y     location of pixel
 * \param[in]    conn     4 or 8 connected neighbors
 * \param[out]   pneigh   array of integers, to be filled with
 *                        the values of the neighbors, if any
 * \param[out]   pnvals   the number of unique neighbor values found
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The returned %neigh array is the unique set of neighboring
 *          pixel values, of size nvals, sorted from smallest to largest.
 *          The value 0, which represents background pixels that do
 *          not belong to any set of connected components, is discarded.
 *      (2) If there are no neighbors, this returns %neigh = NULL; otherwise,
 *          the caller must free the array.
 *      (3) For a valid c.c. labeling, with either 4 or 8 connectivity,
 *          the maximum number of unique neighbor values is 4.  The
 *          function does not rely on that: the scratch buffer holds 8
 *          values (one per possible neighbor) and writes to it are
 *          bounded, so an image that is not a valid labeling cannot
 *          overrun it.
 *      (4) On any error, %neigh is NULL and %nvals is 0.
 * </pre>
 */
l_int32
pixGetSortedNeighborValues(PIX       *pixs,
                           l_int32    x,
                           l_int32    y,
                           l_int32    conn,
                           l_int32  **pneigh,
                           l_int32   *pnvals)
{
l_int32       i, npt, index, maxvals;
l_int32       neigh[8];  /* a pixel has at most 8 neighbors */
l_int32      *array;
l_uint32      val;
l_float32     fx, fy;
L_ASET       *aset;
L_ASET_NODE  *node;
PTA          *pta;
RB_TYPE       key;

    PROCNAME("pixGetSortedNeighborValues");

    if (pneigh) *pneigh = NULL;
    if (pnvals) *pnvals = 0;
    if (!pneigh || !pnvals)
        return ERROR_INT("&neigh and &nvals not both defined", procName, 1);
    if (!pixs || pixGetDepth(pixs) < 8)
        return ERROR_INT("pixs not defined or depth < 8", procName, 1);

        /* Identify the locations of nearest neighbor pixels */
    if ((pta = ptaGetNeighborPixLocs(pixs, x, y, conn)) == NULL)
        return ERROR_INT("pta of neighbors not made", procName, 1);

        /* Find the pixel values and insert into a set as keys */
    if ((aset = l_asetCreate(L_UINT_TYPE)) == NULL) {
        ptaDestroy(&pta);
        return ERROR_INT("aset not made", procName, 1);
    }
    npt = ptaGetCount(pta);
    for (i = 0; i < npt; i++) {
        ptaGetPt(pta, i, &fx, &fy);
        pixGetPixel(pixs, (l_int32)fx, (l_int32)fy, &val);
        key.utype = val;
        l_asetInsert(aset, key);
    }

        /* Extract the set keys and put them into the %neigh array.
         * Omit the value 0, which indicates the pixel doesn't
         * belong to one of the sets of connected components.
         * Never write past the end of the scratch buffer. */
    maxvals = (l_int32)(sizeof(neigh) / sizeof(neigh[0]));
    node = l_asetGetFirst(aset);
    index = 0;
    while (node && index < maxvals) {
        val = node->key.utype;
        if (val > 0)
            neigh[index++] = (l_int32)val;
        node = l_asetGetNext(node);
    }

        /* Hand the values to the caller in a heap array */
    if (index > 0) {
        array = (l_int32 *)LEPT_CALLOC(index, sizeof(l_int32));
        if (!array) {
            ptaDestroy(&pta);
            l_asetDestroy(&aset);
            return ERROR_INT("neigh array not made", procName, 1);
        }
        for (i = 0; i < index; i++)
            array[i] = neigh[i];
        *pneigh = array;
        *pnvals = index;
    }

    ptaDestroy(&pta);
    l_asetDestroy(&aset);
    return 0;
}