/*! * ptaRemoveDupsByHash() * * Input: ptas (assumed to be integer values) * &ptad (<return> unique set of pts; duplicates removed) * &dahash (<optional return> dnahash used for lookup) * Return: 0 if OK, 1 on error * * Notes: * (1) Generates a pta with unique values. * (2) The dnahash is built up with ptad to assure uniqueness. * It can be used to find if a point is in the set: * ptaFindPtByHash(ptad, dahash, x, y, &index) * (3) The hash of the (x,y) location is simple and fast. It scales * up with the number of buckets to insure a fairly random * bucket selection for adjacent points. * (4) A Dna is used rather than a Numa because we need accurate * representation of 32-bit integers that are indices into ptas. * Integer --> float --> integer conversion makes errors for * integers larger than 10M. * (5) This is faster than ptaRemoveDupsByAset(), because the * bucket lookup is O(n), although there is a double-loop * lookup within the dna in each bucket. */ l_int32 ptaRemoveDupsByHash(PTA *ptas, PTA **pptad, L_DNAHASH **pdahash) { l_int32 i, n, index, items, x, y; l_uint32 nsize; l_uint64 key; l_float64 val; PTA *ptad; L_DNAHASH *dahash; PROCNAME("ptaRemoveDupsByHash"); if (pdahash) *pdahash = NULL; if (!pptad) return ERROR_INT("&ptad not defined", procName, 1); *pptad = NULL; if (!ptas) return ERROR_INT("ptas not defined", procName, 1); n = ptaGetCount(ptas); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ dahash = l_dnaHashCreate(nsize, 8); ptad = ptaCreate(n); *pptad = ptad; for (i = 0, items = 0; i < n; i++) { ptaGetIPt(ptas, i, &x, &y); ptaFindPtByHash(ptad, dahash, x, y, &index); if (index < 0) { /* not found */ l_hashPtToUint64Fast(nsize, x, y, &key); l_dnaHashAdd(dahash, key, (l_float64)items); ptaAddPt(ptad, x, y); items++; } } if (pdahash) *pdahash = dahash; else l_dnaHashDestroy(&dahash); return 0; }
/*! * \brief sarrayRemoveDupsByHash() * * \param[in] sas * \param[out] psad unique set of strings; duplicates removed * \param[out] pdahash [optional] dnahash used for lookup * \return 0 if OK, 1 on error * * <pre> * Notes: * (1) Generates a sarray with unique values. * (2) The dnahash is built up with sad to assure uniqueness. * It can be used to find if a string is in the set: * sarrayFindValByHash(sad, dahash, str, \&index) * (3) The hash of the string location is simple and fast. It scales * up with the number of buckets to insure a fairly random * bucket selection input strings. * (4) This is faster than sarrayRemoveDupsByAset(), because the * bucket lookup is O(n), although there is a double-loop * lookup within the dna in each bucket. * </pre> */ l_int32 sarrayRemoveDupsByHash(SARRAY *sas, SARRAY **psad, L_DNAHASH **pdahash) { char *str; l_int32 i, n, index, items; l_uint32 nsize; l_uint64 key; SARRAY *sad; L_DNAHASH *dahash; PROCNAME("sarrayRemoveDupsByHash"); if (pdahash) *pdahash = NULL; if (!psad) return ERROR_INT("&sad not defined", procName, 1); *psad = NULL; if (!sas) return ERROR_INT("sas not defined", procName, 1); n = sarrayGetCount(sas); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ dahash = l_dnaHashCreate(nsize, 8); sad = sarrayCreate(n); *psad = sad; for (i = 0, items = 0; i < n; i++) { str = sarrayGetString(sas, i, L_NOCOPY); sarrayFindStringByHash(sad, dahash, str, &index); if (index < 0) { /* not found */ l_hashStringToUint64(str, &key); l_dnaHashAdd(dahash, key, (l_float64)items); sarrayAddString(sad, str, L_COPY); items++; } } if (pdahash) *pdahash = dahash; else l_dnaHashDestroy(&dahash); return 0; }