Beispiel #1
0
/*!
 * \brief   ptaRemoveDupsByHash()
 *
 * \param[in]    ptas     assumed to hold integer-valued points
 * \param[out]   pptad    unique set of pts; duplicates removed
 * \param[out]   pdahash  [optional] dnahash used for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a pta with unique values.  Points are visited in
 *          index order and a point is appended to ptad only the first
 *          time it is seen.
 *      (2) The dnahash is built up together with ptad to assure
 *          uniqueness: each new point is entered under its hash key,
 *          with its index in ptad as the stored value.  It can be used
 *          afterwards to find if a point is in the set:
 *              ptaFindPtByHash(ptad, dahash, x, y, \&index)
 *      (3) The key is computed by l_hashPtToUint64Fast() from the (x,y)
 *          location and the number of buckets; the intent is a fairly
 *          random bucket selection for adjacent points.
 *      (4) A Dna is used rather than a Numa because we need an accurate
 *          representation of the 32-bit integer indices into ptad.
 *          The indices are stored as l_float64, which holds every
 *          32-bit integer exactly; an integer --> single precision
 *          float --> integer round trip is not exact for large values.
 *      (5) This is intended to be faster than ptaRemoveDupsByAset():
 *          each point requires one bucket lookup, followed by a search
 *          of the dna within that bucket.
 *      (6) If &dahash is requested, ownership of the hash passes to the
 *          caller; otherwise it is destroyed before returning.  On
 *          error, both *pptad and *pdahash (if requested) are NULL.
 * </pre>
 */
l_int32
ptaRemoveDupsByHash(PTA         *ptas,
                    PTA        **pptad,
                    L_DNAHASH  **pdahash)
{
l_int32     i, n, index, items, x, y;
l_uint32    nsize;
l_uint64    key;
PTA        *ptad;
L_DNAHASH  *dahash;

    PROCNAME("ptaRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!pptad)
        return ERROR_INT("&ptad not defined", procName, 1);
    *pptad = NULL;
    if (!ptas)
        return ERROR_INT("ptas not defined", procName, 1);

    n = ptaGetCount(ptas);
    nsize = 0;  /* defined value even if the prime search fails */
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */

        /* Do not report success if either container cannot be made;
         * nothing is handed to the caller until both exist. */
    if ((dahash = l_dnaHashCreate(nsize, 8)) == NULL)
        return ERROR_INT("dahash not made", procName, 1);
    if ((ptad = ptaCreate(n)) == NULL) {
        l_dnaHashDestroy(&dahash);
        return ERROR_INT("ptad not made", procName, 1);
    }
    *pptad = ptad;

    for (i = 0, items = 0; i < n; i++) {
        ptaGetIPt(ptas, i, &x, &y);
        ptaFindPtByHash(ptad, dahash, x, y, &index);
        if (index < 0) {  /* not found */
                /* 'items' is the index this point will have in ptad */
            l_hashPtToUint64Fast(nsize, x, y, &key);
            l_dnaHashAdd(dahash, key, (l_float64)items);
            ptaAddPt(ptad, x, y);
            items++;
        }
    }

    if (pdahash)
        *pdahash = dahash;  /* caller takes ownership */
    else
        l_dnaHashDestroy(&dahash);
    return 0;
}
/*!
 * \brief   sarrayRemoveDupsByHash()
 *
 * \param[in]    sas
 * \param[out]   psad     unique set of strings; duplicates removed
 * \param[out]   pdahash  [optional] dnahash used for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a sarray with unique values.  Strings are visited
 *          in index order and a copy of a string is appended to sad
 *          only the first time it is seen.
 *      (2) The dnahash is built up together with sad to assure
 *          uniqueness: each new string is entered under its hash key,
 *          with its index in sad as the stored value.  It can be used
 *          afterwards to find if a string is in the set:
 *              sarrayFindStringByHash(sad, dahash, str, \&index)
 *      (3) The key is computed from the string alone by
 *          l_hashStringToUint64(); the number of buckets is not an
 *          input to the key.
 *      (4) This is intended to be faster than sarrayRemoveDupsByAset():
 *          each string requires one bucket lookup, followed by a search
 *          of the dna within that bucket.
 *      (5) If &dahash is requested, ownership of the hash passes to the
 *          caller; otherwise it is destroyed before returning.  On
 *          error, both *psad and *pdahash (if requested) are NULL.
 * </pre>
 */
l_int32
sarrayRemoveDupsByHash(SARRAY      *sas,
                       SARRAY     **psad,
                       L_DNAHASH  **pdahash)
{
char       *str;
l_int32     i, n, index, items;
l_uint32    nsize;
l_uint64    key;
SARRAY     *sad;
L_DNAHASH  *dahash;

    PROCNAME("sarrayRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!psad)
        return ERROR_INT("&sad not defined", procName, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", procName, 1);

    n = sarrayGetCount(sas);
    nsize = 0;  /* defined value even if the prime search fails */
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */

        /* Do not report success if either container cannot be made;
         * nothing is handed to the caller until both exist. */
    if ((dahash = l_dnaHashCreate(nsize, 8)) == NULL)
        return ERROR_INT("dahash not made", procName, 1);
    if ((sad = sarrayCreate(n)) == NULL) {
        l_dnaHashDestroy(&dahash);
        return ERROR_INT("sad not made", procName, 1);
    }
    *psad = sad;

    for (i = 0, items = 0; i < n; i++) {
            /* Borrowed pointer; sas keeps ownership of the string */
        str = sarrayGetString(sas, i, L_NOCOPY);
        sarrayFindStringByHash(sad, dahash, str, &index);
        if (index < 0) {  /* not found */
                /* 'items' is the index this string will have in sad */
            l_hashStringToUint64(str, &key);
            l_dnaHashAdd(dahash, key, (l_float64)items);
            sarrayAddString(sad, str, L_COPY);
            items++;
        }
    }

    if (pdahash)
        *pdahash = dahash;  /* caller takes ownership */
    else
        l_dnaHashDestroy(&dahash);
    return 0;
}