Example #1
0
/*!
 * \brief   l_dnaIntersectionByHash()
 *
 * \param[in]    da1, da2
 * \return  dad intersection of the number arrays, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This uses the same method for building the intersection set
 *          as ptaIntersectionByHash() and sarrayIntersectionByHash().
 * </pre>
 */
L_DNA *
l_dnaIntersectionByHash(L_DNA  *da1,
                        L_DNA  *da2)
{
l_int32     n1, n2, nsmall, nbuckets, i, index1, index2;
l_uint32    nsize2;
l_uint64    key;
l_float64   val;
L_DNAHASH  *dahash1, *dahash2;
L_DNA      *da_small, *da_big, *dad;

    PROCNAME("l_dnaIntersectionByHash");

    if (!da1)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (!da2)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* Put the elements of the biggest array into a dnahash */
    n1 = l_dnaGetCount(da1);
    n2 = l_dnaGetCount(da2);
    da_small = (n1 < n2) ? da1 : da2;   /* do not destroy da_small */
    da_big = (n1 < n2) ? da2 : da1;   /* do not destroy da_big */
    dahash1 = l_dnaHashCreateFromDna(da_big);

        /* Build up the intersection of numbers.  Add to %dad
         * if the number is in da_big (using dahash1) but hasn't
         * yet been seen in the traversal of da_small (using dahash2). */
    dad = l_dnaCreate(0);
    nsmall = l_dnaGetCount(da_small);
    findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
    dahash2 = l_dnaHashCreate(nsize2, 0);
    nbuckets = l_dnaHashGetCount(dahash2);
    for (i = 0; i < nsmall; i++) {
        l_dnaGetDValue(da_small, i, &val);
        l_dnaFindValByHash(da_big, dahash1, val, &index1);
        if (index1 >= 0) {  /* found */
            l_dnaFindValByHash(da_small, dahash2, val, &index2);
            if (index2 == -1) {  /* not found */
                l_dnaAddNumber(dad, val);
                l_hashFloat64ToUint64(nbuckets, val, &key);
                l_dnaHashAdd(dahash2, key, (l_float64)i);
            }
        }
    }

    l_dnaHashDestroy(&dahash1);
    l_dnaHashDestroy(&dahash2);
    return dad;
}
Example #2
0
/*!
 * \brief   recogGetClassIndex()
 *
 * \param[in]    recog with LUT's pre-computed
 * \param[in]    val integer value; can be up to 3 bytes for UTF-8
 * \param[in]    text text from which %val was derived; used if not found
 * \param[out]   pindex index into dna_tochar
 * \return  0 if found; 1 if not found and added; 2 on error.
 *
 * <pre>
 * Notes:
 *      (1) This is used during training.  There is one entry in
 *          recog->dna_tochar (integer value, e.g., ascii) and
 *          one in recog->sa_text (e.g, ascii letter in a string)
 *          for each character class.
 *      (2) This searches the dna character array for %val.  If it is
 *          not found, the template represents a character class not
 *          already seen: it increments setsize (the number of character
 *          classes) by 1, and augments both the index (dna_tochar)
 *          and text (sa_text) arrays.
 *      (3) Returns the index in &index, except on error.
 *      (4) Caller must check the function return value.
 * </pre>
 */
l_int32
recogGetClassIndex(L_RECOG  *recog,
                   l_int32   val,
                   char     *text,
                   l_int32  *pindex)
{
l_int32  i, n, ival;

    PROCNAME("recogGetClassIndex");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 2);
    *pindex = -1;
    if (!recog)
        return ERROR_INT("recog not defined", procName, 2);
    if (!text)
        return ERROR_INT("text not defined", procName, 2);

        /* Search existing characters */
    n = l_dnaGetCount(recog->dna_tochar);
    for (i = 0; i < n; i++) {
        l_dnaGetIValue(recog->dna_tochar, i, &ival);
        if (val == ival) {  /* found */
            *pindex = i;
            return 0;
        }
    }

       /* If not found... */
    l_dnaAddNumber(recog->dna_tochar, val);
    sarrayAddString(recog->sa_text, text, L_COPY);
    recog->setsize++;
    *pindex = n;
    return 1;
}
Example #3
0
/*!
 * \brief   l_dnaRemoveDupsByAset()
 *
 * \param[in]    das
 * \return  dad with duplicates removed, or NULL on error
 */
L_DNA *
l_dnaRemoveDupsByAset(L_DNA  *das)
{
l_int32    i, n;
l_float64  val;
L_DNA     *dad;
L_ASET    *set;
RB_TYPE    key;

    PROCNAME("l_dnaRemoveDupsByAset");

    if (!das)
        return (L_DNA *)ERROR_PTR("das not defined", procName, NULL);

    set = l_asetCreate(L_FLOAT_TYPE);
    dad = l_dnaCreate(0);
    n = l_dnaGetCount(das);
    for (i = 0; i < n; i++) {
        l_dnaGetDValue(das, i, &val);
        key.ftype = val;
        if (!l_asetFind(set, key)) {
            l_dnaAddNumber(dad, val);
            l_asetInsert(set, key);
        }
    }

    l_asetDestroy(&set);
    return dad;
}
Example #4
0
/*!
 *  l_dnaJoin()
 *
 *      Input:  dad  (dest dma; add to this one)
 *              das  (<optional> source dna; add from this one)
 *              istart  (starting index in das)
 *              iend  (ending index in das; use -1 to cat all)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) istart < 0 is taken to mean 'read from the start' (istart = 0)
 *      (2) iend < 0 means 'read to the end'
 *      (3) if das == NULL, this is a no-op
 */
l_int32
l_dnaJoin(L_DNA   *dad,
          L_DNA   *das,
          l_int32  istart,
          l_int32  iend)
{
l_int32    n, i;
l_float64  val;

    PROCNAME("l_dnaJoin");

    if (!dad)
        return ERROR_INT("dad not defined", procName, 1);
    if (!das)
        return 0;

    if (istart < 0)
        istart = 0;
    n = l_dnaGetCount(das);
    if (iend < 0 || iend >= n)
        iend = n - 1;
    if (istart > iend)
        return ERROR_INT("istart > iend; nothing to add", procName, 1);

    for (i = istart; i <= iend; i++) {
        l_dnaGetDValue(das, i, &val);
        l_dnaAddNumber(dad, val);
    }

    return 0;
}
Example #5
0
/*!
 *  l_dnaWriteStream()
 *
 *      Input:  stream, da
 *      Return: 0 if OK, 1 on error
 */
l_int32
l_dnaWriteStream(FILE   *fp,
                 L_DNA  *da)
{
l_int32    i, n;
l_float64  startx, delx;

    PROCNAME("l_dnaWriteStream");

    if (!fp)
        return ERROR_INT("stream not defined", procName, 1);
    if (!da)
        return ERROR_INT("da not defined", procName, 1);

    n = l_dnaGetCount(da);
    fprintf(fp, "\nL_Dna Version %d\n", DNA_VERSION_NUMBER);
    fprintf(fp, "Number of numbers = %d\n", n);
    for (i = 0; i < n; i++)
        fprintf(fp, "  [%d] = %lf\n", i, da->array[i]);
    fprintf(fp, "\n");

        /* Optional data */
    l_dnaGetParameters(da, &startx, &delx);
    if (startx != 0.0 || delx != 1.0)
        fprintf(fp, "startx = %lf, delx = %lf\n", startx, delx);

    return 0;
}
Example #6
0
/*!
 * \brief   l_dnaHashCreateFromDna()
 *
 * \param[in]    da
 * \return  dahash if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The values stored in the %dahash are indices into %da;
 *          %dahash has no use without %da.
 * </pre>
 */
L_DNAHASH *
l_dnaHashCreateFromDna(L_DNA  *da)
{
l_int32     i, n;
l_uint32    nsize;
l_uint64    key;
l_float64   val;
L_DNAHASH  *dahash;

    PROCNAME("l_dnaHashCreateFromDna");

    if (!da)
        return (L_DNAHASH *)ERROR_PTR("da not defined", procName, NULL);

    n = l_dnaGetCount(da);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */

    dahash = l_dnaHashCreate(nsize, 8);
    for (i = 0; i < n; i++) {
        l_dnaGetDValue(da, i, &val);
        l_hashFloat64ToUint64(nsize, val, &key);
        l_dnaHashAdd(dahash, key, (l_float64)i);
    }

    return dahash;
}
Example #7
0
/*!
 * \brief   l_dnaIntersectionByAset()
 *
 * \param[in]    da1, da2
 * \return  dad with the intersection of the two arrays, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See sarrayIntersection() for the approach.
 *      (2) Here, the key in building the sorted tree is the number itself.
 *      (3) Operations using an underlying tree are O(nlogn), which is
 *          typically less efficient than hashing, which is O(n).
 * </pre>
 */
L_DNA *
l_dnaIntersectionByAset(L_DNA  *da1,
                        L_DNA  *da2)
{
l_int32    n1, n2, i, n;
l_float64  val;
L_ASET    *set1, *set2;
RB_TYPE    key;
L_DNA     *da_small, *da_big, *dad;

    PROCNAME("l_dnaIntersectionByAset");

    if (!da1)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (!da2)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* Put the elements of the largest array into a set */
    n1 = l_dnaGetCount(da1);
    n2 = l_dnaGetCount(da2);
    da_small = (n1 < n2) ? da1 : da2;   /* do not destroy da_small */
    da_big = (n1 < n2) ? da2 : da1;   /* do not destroy da_big */
    set1 = l_asetCreateFromDna(da_big);

        /* Build up the intersection of floats */
    dad = l_dnaCreate(0);
    n = l_dnaGetCount(da_small);
    set2 = l_asetCreate(L_FLOAT_TYPE);
    for (i = 0; i < n; i++) {
        l_dnaGetDValue(da_small, i, &val);
        key.ftype = val;
        if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
            l_dnaAddNumber(dad, val);
            l_asetInsert(set2, key);
        }
    }

    l_asetDestroy(&set1);
    l_asetDestroy(&set2);
    return dad;
}
Example #8
0
/*!
 *  l_dnaaGetDnaCount()
 *
 *      Input:  daa
 *              index (of l_dna in daa)
 *      Return: count of numbers in the referenced l_dna, or 0 on error.
 */
l_int32
l_dnaaGetDnaCount(L_DNAA   *daa,
                    l_int32  index)
{
    PROCNAME("l_dnaaGetDnaCount");

    if (!daa)
        return ERROR_INT("daa not defined", procName, 0);
    if (index < 0 || index >= daa->n)
        return ERROR_INT("invalid index into daa", procName, 0);
    return l_dnaGetCount(daa->dna[index]);
}
Example #9
0
/*!
 * \brief   l_dnaMakeHistoByHash()
 *
 * \param[in]    das
 * \param[out]   pdahash hash map: val --> index
 * \param[out]   pdav array of values: index --> val
 * \param[out]   pdac histo array of counts: index --> count
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates and returns a dna of occurrences (histogram),
 *          an aligned dna of values, and an associated hashmap.
 *          The hashmap takes %dav and a value, and points into the
 *          histogram in %dac.
 *      (2) The dna of values, %dav, is aligned with the histogram %dac,
 *          and is needed for fast lookup.  It is a hash set, because
 *          the values are unique.
 *      (3) Lookup is simple:
 *              l_dnaFindValByHash(dav, dahash, val, &index);
 *              if (index >= 0)
 *                  l_dnaGetIValue(dac, index, &icount);
 *              else
 *                  icount = 0;
 * </pre>
 */
l_ok
l_dnaMakeHistoByHash(L_DNA       *das,
                     L_DNAHASH  **pdahash,
                     L_DNA      **pdav,
                     L_DNA      **pdac)
{
l_int32     i, n, nitems, index, count;
l_uint32    nsize;
l_uint64    key;
l_float64   val;
L_DNA      *dac, *dav;
L_DNAHASH  *dahash;

    PROCNAME("l_dnaMakeHistoByHash");

    if (pdahash) *pdahash = NULL;
    if (pdac) *pdac = NULL;
    if (pdav) *pdav = NULL;
    if (!pdahash || !pdac || !pdav)
        return ERROR_INT("&dahash, &dac, &dav not all defined", procName, 1);
    if (!das)
        return ERROR_INT("das not defined", procName, 1);
    if ((n = l_dnaGetCount(das)) == 0)
        return ERROR_INT("no data in das", procName, 1);

    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
    dahash = l_dnaHashCreate(nsize, 8);
    dac = l_dnaCreate(n);  /* histogram */
    dav = l_dnaCreate(n);  /* the values */
    for (i = 0, nitems = 0; i < n; i++) {
        l_dnaGetDValue(das, i, &val);
            /* Is this value already stored in dav? */
        l_dnaFindValByHash(dav, dahash, val, &index);
        if (index >= 0) {  /* found */
            l_dnaGetIValue(dac, (l_float64)index, &count);
            l_dnaSetValue(dac, (l_float64)index, count + 1);
        } else {  /* not found */
            l_hashFloat64ToUint64(nsize, val, &key);
            l_dnaHashAdd(dahash, key, (l_float64)nitems);
            l_dnaAddNumber(dav, val);
            l_dnaAddNumber(dac, 1);
            nitems++;
        }
    }

    *pdahash = dahash;
    *pdac = dac;
    *pdav = dav;
    return 0;
}
Example #10
0
/*!
 *  l_dnaAddNumber()
 *
 *      Input:  da
 *              val  (float or int to be added; stored as a float)
 *      Return: 0 if OK, 1 on error
 */
l_int32
l_dnaAddNumber(L_DNA     *da,
               l_float64  val)
{
l_int32  n;

    PROCNAME("l_dnaAddNumber");

    if (!da)
        return ERROR_INT("da not defined", procName, 1);

    n = l_dnaGetCount(da);
    if (n >= da->nalloc)
        l_dnaExtendArray(da);
    da->array[n] = val;
    da->n++;
    return 0;
}
Example #11
0
/*!
 *  l_dnaReplaceNumber()
 *
 *      Input:  da
 *              index (element to be replaced)
 *              val (new value to replace old one)
 *      Return: 0 if OK, 1 on error
 */
l_int32
l_dnaReplaceNumber(L_DNA     *da,
                   l_int32    index,
                   l_float64  val)
{
l_int32  n;

    PROCNAME("l_dnaReplaceNumber");

    if (!da)
        return ERROR_INT("da not defined", procName, 1);
    n = l_dnaGetCount(da);
    if (index < 0 || index >= n)
        return ERROR_INT("index not in {0...n - 1}", procName, 1);

    da->array[index] = val;
    return 0;
}
Example #12
0
/*!
 *  l_dnaRemoveNumber()
 *
 *      Input:  da
 *              index (element to be removed)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This shifts da[i] --> da[i - 1] for all i > index.
 *      (2) It should not be used repeatedly on large arrays,
 *          because the function is O(n).
 */
l_int32
l_dnaRemoveNumber(L_DNA   *da,
                  l_int32  index)
{
l_int32  i, n;

    PROCNAME("l_dnaRemoveNumber");

    if (!da)
        return ERROR_INT("da not defined", procName, 1);
    n = l_dnaGetCount(da);
    if (index < 0 || index >= n)
        return ERROR_INT("index not in {0...n - 1}", procName, 1);

    for (i = index + 1; i < n; i++)
        da->array[i - 1] = da->array[i];
    da->n--;
    return 0;
}
Example #13
0
/*!
 * \brief   l_dnaRemoveDupsByHash()
 *
 * \param[in]    das
 * \param[out]   pdad hash set
 * \param[out]   pdahash [optional] dnahash used for lookup
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a dna with unique values.
 *      (2) The dnahash is built up with dad to assure uniqueness.
 *          It can be used to find if an element is in the set:
 *              l_dnaFindValByHash(dad, dahash, val, &index)
 * </pre>
 */
l_ok
l_dnaRemoveDupsByHash(L_DNA       *das,
                      L_DNA      **pdad,
                      L_DNAHASH  **pdahash)
{
l_int32     i, n, index, items;
l_uint32    nsize;
l_uint64    key;
l_float64   val;
L_DNA      *dad;
L_DNAHASH  *dahash;

    PROCNAME("l_dnaRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!pdad)
        return ERROR_INT("&dad not defined", procName, 1);
    *pdad = NULL;
    if (!das)
        return ERROR_INT("das not defined", procName, 1);

    n = l_dnaGetCount(das);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
    dahash = l_dnaHashCreate(nsize, 8);
    dad = l_dnaCreate(n);
    *pdad = dad;
    for (i = 0, items = 0; i < n; i++) {
        l_dnaGetDValue(das, i, &val);
        l_dnaFindValByHash(dad, dahash, val, &index);
        if (index < 0) {  /* not found */
            l_hashFloat64ToUint64(nsize, val, &key);
            l_dnaHashAdd(dahash, key, (l_float64)items);
            l_dnaAddNumber(dad, val);
            items++;
        }
    }

    if (pdahash)
        *pdahash = dahash;
    else
        l_dnaHashDestroy(&dahash);
    return 0;
}
Example #14
0
/*!
 * \brief   l_dnaHashGetTotalCount()
 *
 * \param[in]    dahash
 * \return  n number of numbers in all dna, or 0 on error
 */
l_int32
l_dnaHashGetTotalCount(L_DNAHASH  *dahash)
{
l_int32  i, n;
L_DNA   *da;

    PROCNAME("l_dnaHashGetTotalCount");

    if (!dahash)
        return ERROR_INT("dahash not defined", procName, 0);

    for (i = 0, n = 0; i < dahash->nbuckets; i++) {
        da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
        if (da)
            n += l_dnaGetCount(da);
    }

    return n;
}
Example #15
0
/*!
 *  l_dnaMakeDelta()
 *
 *      Input:  das (input l_dna)
 *      Return: dad (of difference values val[i+1] - val[i]),
 *                   or null on error
 */
L_DNA *
l_dnaMakeDelta(L_DNA  *das)
{
l_int32  i, n, prev, cur;
L_DNA   *dad;

    PROCNAME("l_dnaMakeDelta");

    if (!das)
        return (L_DNA *)ERROR_PTR("das not defined", procName, NULL);
    n = l_dnaGetCount(das);
    dad = l_dnaCreate(n - 1);
    prev = 0;
    for (i = 1; i < n; i++) {
        l_dnaGetIValue(das, i, &cur);
        l_dnaAddNumber(dad, cur - prev);
        prev = cur;
    }
    return dad;
}
Example #16
0
/*!
 *  l_dnaConvertToNuma()
 *
 *      Input:  da
 *      Return: na, or null on error
 */
NUMA *
l_dnaConvertToNuma(L_DNA  *da)
{
l_int32    i, n;
l_float64  val;
NUMA      *na;

    PROCNAME("l_dnaConvertToNuma");

    if (!da)
        return (NUMA *)ERROR_PTR("da not defined", procName, NULL);

    n = l_dnaGetCount(da);
    na = numaCreate(n);
    for (i = 0; i < n; i++) {
        l_dnaGetDValue(da, i, &val);
        numaAddNumber(na, val);
    }
    return na;
}
Example #17
0
/*!
 *  l_dnaaGetNumberCount()
 *
 *      Input:  daa
 *      Return: count (total number of numbers in the l_dnaa),
 *                     or 0 if no numbers or on error
 */
l_int32
l_dnaaGetNumberCount(L_DNAA  *daa)
{
L_DNA   *da;
l_int32  n, sum, i;

    PROCNAME("l_dnaaGetNumberCount");

    if (!daa)
        return ERROR_INT("daa not defined", procName, 0);

    n = l_dnaaGetCount(daa);
    for (sum = 0, i = 0; i < n; i++) {
        da = l_dnaaGetDna(daa, i, L_CLONE);
        sum += l_dnaGetCount(da);
        l_dnaDestroy(&da);
    }

    return sum;
}
Example #18
0
/*!
 *  l_dnaGetIArray()
 *
 *      Input:  da
 *      Return: a copy of the bare internal array, integerized
 *              by rounding, or null on error
 *  Notes:
 *      (1) A copy of the array is made, because we need to
 *          generate an integer array from the bare double array.
 *          The caller is responsible for freeing the array.
 *      (2) The array size is determined by the number of stored numbers,
 *          not by the size of the allocated array in the l_dna.
 *      (3) This function is provided to simplify calculations
 *          using the bare internal array, rather than continually
 *          calling accessors on the l_dna.  It is typically used
 *          on an array of size 256.
 */
l_int32 *
l_dnaGetIArray(L_DNA  *da)
{
l_int32   i, n, ival;
l_int32  *array;

    PROCNAME("l_dnaGetIArray");

    if (!da)
        return (l_int32 *)ERROR_PTR("da not defined", procName, NULL);

    n = l_dnaGetCount(da);
    if ((array = (l_int32 *)CALLOC(n, sizeof(l_int32))) == NULL)
        return (l_int32 *)ERROR_PTR("array not made", procName, NULL);
    for (i = 0; i < n; i++) {
        l_dnaGetIValue(da, i, &ival);
        array[i] = ival;
    }

    return array;
}
Example #19
0
/*!
 *  ptaFindPtByHash()
 *
 *      Input:  pta
 *              dahash (built from pta)
 *              x, y  (arbitrary points)
 *              &index (<return> index into pta if (x,y) is in pta;
 *                      -1 otherwise)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Fast lookup in dnaHash associated with a pta, to see if a
 *          random point (x,y) is already stored in the hash table.
 */
l_int32
ptaFindPtByHash(PTA        *pta,
                L_DNAHASH  *dahash,
                l_int32     x,
                l_int32     y,
                l_int32    *pindex)
{
l_int32   i, nbuckets, nvals, index, xi, yi;
l_uint64  key;
L_DNA    *da;

    PROCNAME("ptaFindPtByHash");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = -1;
    if (!pta)
        return ERROR_INT("pta not defined", procName, 1);
    if (!dahash)
        return ERROR_INT("dahash not defined", procName, 1);

    nbuckets = l_dnaHashGetCount(dahash);
    l_hashPtToUint64Fast(nbuckets, x, y, &key);
    da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
    if (!da) return 0;

        /* Run through the da, looking for this point */
    nvals = l_dnaGetCount(da);
    for (i = 0; i < nvals; i++) {
        l_dnaGetIValue(da, i, &index);
        ptaGetIPt(pta, index, &xi, &yi);
        if (x == xi && y == yi) {
            *pindex = index;
            return 0;
        }
    }

    return 0;
}
Example #20
0
/*!
 * \brief   l_dnaFindValByHash()
 *
 * \param[in]    da
 * \param[in]    dahash containing indices into %da
 * \param[in]    val  searching for this number in %da
 * \param[out]   pindex index into da if found; -1 otherwise
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Algo: hash %val into a key; hash the key to get the dna
 *                in %dahash (that holds indices into %da); traverse
 *                the dna of indices looking for %val in %da.
 * </pre>
 */
l_ok
l_dnaFindValByHash(L_DNA      *da,
                   L_DNAHASH  *dahash,
                   l_float64   val,
                   l_int32    *pindex)
{
l_int32    i, nbuckets, nvals, indexval;
l_float64  vali;
l_uint64   key;
L_DNA     *da1;

    PROCNAME("l_dnaFindValByHash");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = -1;
    if (!da)
        return ERROR_INT("da not defined", procName, 1);
    if (!dahash)
        return ERROR_INT("dahash not defined", procName, 1);

    nbuckets = l_dnaHashGetCount(dahash);
    l_hashFloat64ToUint64(nbuckets, val, &key);
    da1 = l_dnaHashGetDna(dahash, key, L_NOCOPY);
    if (!da1) return 0;

        /* Run through da1, looking for this %val */
    nvals = l_dnaGetCount(da1);
    for (i = 0; i < nvals; i++) {
        l_dnaGetIValue(da1, i, &indexval);
        l_dnaGetDValue(da, indexval, &vali);
        if (val == vali) {
            *pindex = indexval;
            return 0;
        }
    }

    return 0;
}
/*!
 * \brief   sarrayFindStringByHash()
 *
 * \param[in]    sa
 * \param[in]    dahash built from sa
 * \param[in]    str  arbitrary string
 * \param[out]   pindex index into %sa if %str is in %sa;
 *              -1 otherwise
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Fast lookup in dnaHash associated with a sarray, to see if a
 *          random string %str is already stored in the hash table.
 * </pre>
 */
l_int32
sarrayFindStringByHash(SARRAY      *sa,
                       L_DNAHASH   *dahash,
                       const char  *str,
                       l_int32     *pindex)
{
char     *stri;
l_int32   i, nvals, index;
l_uint64  key;
L_DNA    *da;

    PROCNAME("sarrayFindStringByHash");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = -1;
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!dahash)
        return ERROR_INT("dahash not defined", procName, 1);

    l_hashStringToUint64(str, &key);
    da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
    if (!da) return 0;

        /* Run through the da, looking for this string */
    nvals = l_dnaGetCount(da);
    for (i = 0; i < nvals; i++) {
        l_dnaGetIValue(da, i, &index);
        stri = sarrayGetString(sa, index, L_NOCOPY);
        if (!strcmp(str, stri)) {  /* duplicate */
            *pindex = index;
            return 0;
        }
    }

    return 0;
}
Example #22
0
/*!
 * \brief   l_asetCreateFromDna()
 *
 * \param[in]    da source dna
 * \return  set using the doubles in %da as keys
 */
L_ASET *
l_asetCreateFromDna(L_DNA  *da)
{
l_int32    i, n;
l_float64  val;
L_ASET    *set;
RB_TYPE    key;

    PROCNAME("l_asetCreateFromDna");

    if (!da)
        return (L_ASET *)ERROR_PTR("da not defined", procName, NULL);

    set = l_asetCreate(L_FLOAT_TYPE);
    n = l_dnaGetCount(da);
    for (i = 0; i < n; i++) {
        l_dnaGetDValue(da, i, &val);
        key.ftype = val;
        l_asetInsert(set, key);
    }

    return set;
}
Example #23
0
/*!
 *  l_dnaInsertNumber()
 *
 *      Input:  da
 *              index (location in da to insert new value)
 *              val  (float64 or integer to be added)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This shifts da[i] --> da[i + 1] for all i >= index,
 *          and then inserts val as da[index].
 *      (2) It should not be used repeatedly on large arrays,
 *          because the function is O(n).
 *
 */
l_int32
l_dnaInsertNumber(L_DNA      *da,
                  l_int32    index,
                  l_float64  val)
{
l_int32  i, n;

    PROCNAME("l_dnaInsertNumber");

    if (!da)
        return ERROR_INT("da not defined", procName, 1);
    n = l_dnaGetCount(da);
    if (index < 0 || index > n)
        return ERROR_INT("index not in {0...n}", procName, 1);

    if (n >= da->nalloc)
        l_dnaExtendArray(da);
    for (i = n; i > index; i--)
        da->array[i] = da->array[i - 1];
    da->array[index] = val;
    da->n++;
    return 0;
}
Example #24
0
/*!
 *  l_dnaGetDArray()
 *
 *      Input:  da
 *              copyflag (L_NOCOPY or L_COPY)
 *      Return: either the bare internal array or a copy of it,
 *              or null on error
 *
 *  Notes:
 *      (1) If copyflag == L_COPY, it makes a copy which the caller
 *          is responsible for freeing.  Otherwise, it operates
 *          directly on the bare array of the l_dna.
 *      (2) Very important: for L_NOCOPY, any writes to the array
 *          will be in the l_dna.  Do not write beyond the size of
 *          the count field, because it will not be accessible
 *          from the l_dna!  If necessary, be sure to set the count
 *          field to a larger number (such as the alloc size)
 *          BEFORE calling this function.  Creating with l_dnaMakeConstant()
 *          is another way to insure full initialization.
 */
l_float64 *
l_dnaGetDArray(L_DNA   *da,
               l_int32  copyflag)
{
l_int32     i, n;
l_float64  *array;

    PROCNAME("l_dnaGetDArray");

    if (!da)
        return (l_float64 *)ERROR_PTR("da not defined", procName, NULL);

    if (copyflag == L_NOCOPY) {
        array = da->array;
    } else {  /* copyflag == L_COPY */
        n = l_dnaGetCount(da);
        if ((array = (l_float64 *)CALLOC(n, sizeof(l_float64))) == NULL)
            return (l_float64 *)ERROR_PTR("array not made", procName, NULL);
        for (i = 0; i < n; i++)
            array[i] = da->array[i];
    }

    return array;
}
Example #25
0
l_int32 main(int    argc,
             char **argv)
{
    L_ASET     *set;
    L_DNA      *da1, *da2, *da3, *da4, *da5, *da6, *da7, *da8, *dav, *dac;
    L_DNAHASH  *dahash;
    NUMA       *nav, *nac;
    PTA        *pta1, *pta2, *pta3;
    SARRAY     *sa1, *sa2, *sa3, *sa4;

    lept_mkdir("lept/hash");

#if 1
    /* Test string hashing with aset */
    fprintf(stderr, "Set results with string hashing:\n");
    sa1 = BuildShortStrings(3, 0);
    sa2 = BuildShortStrings(3, 1);
    fprintf(stderr, "  size with unique strings: %d\n", sarrayGetCount(sa1));
    fprintf(stderr, "  size with dups: %d\n", sarrayGetCount(sa2));
    startTimer();
    set = l_asetCreateFromSarray(sa2);
    fprintf(stderr, "  time to make set: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  size of set without dups: %d\n", l_asetSize(set));
    l_asetDestroy(&set);
    startTimer();
    sa3 = sarrayRemoveDupsByAset(sa2);
    fprintf(stderr, "  time to remove dups: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  size without dups = %d\n", sarrayGetCount(sa3));
    startTimer();
    sa4 = sarrayIntersectionByAset(sa1, sa2);
    fprintf(stderr, "  time to intersect: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  intersection size = %d\n", sarrayGetCount(sa4));
    sarrayDestroy(&sa3);
    sarrayDestroy(&sa4);

    /* Test sarray set operations with dna hash.
     * We use the same hash function as is used with aset. */
    fprintf(stderr, "\nDna hash results for sarray:\n");
    fprintf(stderr, "  size with unique strings: %d\n", sarrayGetCount(sa1));
    fprintf(stderr, "  size with dups: %d\n", sarrayGetCount(sa2));
    startTimer();
    dahash = l_dnaHashCreateFromSarray(sa2);
    fprintf(stderr, "  time to make hashmap: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  entries in hashmap with dups: %d\n",
            l_dnaHashGetTotalCount(dahash));
    l_dnaHashDestroy(&dahash);
    startTimer();
    sarrayRemoveDupsByHash(sa2, &sa3, NULL);
    fprintf(stderr, "  time to remove dups: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  size without dups = %d\n", sarrayGetCount(sa3));
    startTimer();
    sa4 = sarrayIntersectionByHash(sa1, sa2);
    fprintf(stderr, "  time to intersect: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  intersection size = %d\n", sarrayGetCount(sa4));
    sarrayDestroy(&sa3);
    sarrayDestroy(&sa4);
    sarrayDestroy(&sa1);
    sarrayDestroy(&sa2);
#endif

#if 1
    /* Test point hashing with aset.
     * Enter all points within a 1500 x 1500 image in pta1, and include
     * 450,000 duplicates in pta2.  With this pt hashing function,
     * there are no hash collisions among any of the 400 million pixel
     * locations in a 20000 x 20000 image. */
    pta1 = BuildPointSet(1500, 1500, 0);
    pta2 = BuildPointSet(1500, 1500, 1);
    fprintf(stderr, "\nSet results for pta:\n");
    fprintf(stderr, "  pta1 size with unique points: %d\n", ptaGetCount(pta1));
    fprintf(stderr, "  pta2 size with dups: %d\n", ptaGetCount(pta2));
    startTimer();
    pta3 = ptaRemoveDupsByAset(pta2);
    fprintf(stderr, "  Time to remove dups: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  size without dups = %d\n", ptaGetCount(pta3));
    ptaDestroy(&pta3);

    startTimer();
    pta3 = ptaIntersectionByAset(pta1, pta2);
    fprintf(stderr, "  Time to intersect: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  intersection size = %d\n", ptaGetCount(pta3));
    ptaDestroy(&pta1);
    ptaDestroy(&pta2);
    ptaDestroy(&pta3);
#endif

#if 1
    /* Test pta set operations with dna hash, using the same pt hashing
     * function.  Although there are no collisions in 20K x 20K images,
     * the dna hash implementation works properly even if there are some. */
    pta1 = BuildPointSet(1500, 1500, 0);
    pta2 = BuildPointSet(1500, 1500, 1);
    fprintf(stderr, "\nDna hash results for pta:\n");
    fprintf(stderr, "  pta1 size with unique points: %d\n", ptaGetCount(pta1));
    fprintf(stderr, "  pta2 size with dups: %d\n", ptaGetCount(pta2));
    startTimer();
    ptaRemoveDupsByHash(pta2, &pta3, NULL);
    fprintf(stderr, "  Time to remove dups: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  size without dups = %d\n", ptaGetCount(pta3));
    ptaDestroy(&pta3);

    startTimer();
    pta3 = ptaIntersectionByHash(pta1, pta2);
    fprintf(stderr, "  Time to intersect: %5.3f sec\n", stopTimer());
    fprintf(stderr, "  intersection size = %d\n", ptaGetCount(pta3));
    ptaDestroy(&pta1);
    ptaDestroy(&pta2);
    ptaDestroy(&pta3);
#endif

    /* Test dna set and histo operations using dna hash */
#if 1
    fprintf(stderr, "\nDna hash results for dna:\n");
    da1 = l_dnaMakeSequence(0.0, 0.125, 8000);
    da2 = l_dnaMakeSequence(300.0, 0.125, 8000);
    da3 = l_dnaMakeSequence(600.0, 0.125, 8000);
    da4 = l_dnaMakeSequence(900.0, 0.125, 8000);
    da5 = l_dnaMakeSequence(1200.0, 0.125, 8000);
    l_dnaJoin(da1, da2, 0, -1);
    l_dnaJoin(da1, da3, 0, -1);
    l_dnaJoin(da1, da4, 0, -1);
    l_dnaJoin(da1, da5, 0, -1);
    l_dnaRemoveDupsByHash(da1, &da6, &dahash);
    l_dnaHashDestroy(&dahash);
    fprintf(stderr, "  dna size with dups = %d\n", l_dnaGetCount(da1));
    fprintf(stderr, "  dna size of unique numbers = %d\n", l_dnaGetCount(da6));
    l_dnaMakeHistoByHash(da1, &dahash, &dav, &dac);
    nav = l_dnaConvertToNuma(dav);
    nac = l_dnaConvertToNuma(dac);
    fprintf(stderr, "  dna number of histo points = %d\n", l_dnaGetCount(dac));
    gplotSimpleXY1(nav, nac, GPLOT_IMPULSES, GPLOT_PNG,
                   "/tmp/lept/hash/histo", "Histo");
    da7 = l_dnaIntersectionByHash(da2, da3);
    fprintf(stderr, "  dna number of points: da2 = %d, da3 = %d\n",
            l_dnaGetCount(da2), l_dnaGetCount(da3));
    fprintf(stderr, "  dna number of da2/da3 intersection points = %d\n",
            l_dnaGetCount(da7));
    l_fileDisplay("/tmp/lept/hash/histo.png", 700, 100, 1.0);
    l_dnaDestroy(&da1);
    l_dnaDestroy(&da2);
    l_dnaDestroy(&da3);
    l_dnaDestroy(&da4);
    l_dnaDestroy(&da5);
    l_dnaDestroy(&da6);
    l_dnaDestroy(&da7);
    l_dnaDestroy(&dac);
    l_dnaDestroy(&dav);
    l_dnaHashDestroy(&dahash);
    numaDestroy(&nav);
    numaDestroy(&nac);
#endif

#if 1
    da1 = l_dnaMakeSequence(0, 3, 10000);
    da2 = l_dnaMakeSequence(0, 5, 10000);
    da3 = l_dnaMakeSequence(0, 7, 10000);
    l_dnaJoin(da1, da2, 0, -1);
    l_dnaJoin(da1, da3, 0, -1);

    fprintf(stderr, "\nDna results using set:\n");
    fprintf(stderr, "  da1 count: %d\n", l_dnaGetCount(da1));
    set = l_asetCreateFromDna(da1);
    fprintf(stderr, "  da1 set size: %d\n\n", l_asetSize(set));
    l_asetDestroy(&set);

    da4 = l_dnaUnionByAset(da2, da3);
    fprintf(stderr, "  da4 count: %d\n", l_dnaGetCount(da4));
    set = l_asetCreateFromDna(da4);
    fprintf(stderr, "  da4 set size: %d\n\n", l_asetSize(set));
    l_asetDestroy(&set);

    da5 = l_dnaIntersectionByAset(da1, da2);
    fprintf(stderr, "  da5 count: %d\n", l_dnaGetCount(da5));
    set = l_asetCreateFromDna(da5);
    fprintf(stderr, "  da5 set size: %d\n\n", l_asetSize(set));
    l_asetDestroy(&set);

    da6 = l_dnaMakeSequence(100000, 11, 5000);
    l_dnaJoin(da6, da1, 0, -1);
    fprintf(stderr, "  da6 count: %d\n", l_dnaGetCount(da6));
    set = l_asetCreateFromDna(da6);
    fprintf(stderr, "  da6 set size: %d\n\n", l_asetSize(set));
    l_asetDestroy(&set);

    da7 = l_dnaIntersectionByAset(da6, da3);
    fprintf(stderr, "  da7 count: %d\n", l_dnaGetCount(da7));
    set = l_asetCreateFromDna(da7);
    fprintf(stderr, "  da7 set size: %d\n\n", l_asetSize(set));
    l_asetDestroy(&set);

    da8 = l_dnaRemoveDupsByAset(da1);
    fprintf(stderr, "  da8 count: %d\n\n", l_dnaGetCount(da8));

    l_dnaDestroy(&da1);
    l_dnaDestroy(&da2);
    l_dnaDestroy(&da3);
    l_dnaDestroy(&da4);
    l_dnaDestroy(&da5);
    l_dnaDestroy(&da6);
    l_dnaDestroy(&da7);
    l_dnaDestroy(&da8);
#endif

    return 0;
}
Example #26
0
int main(int    argc,
         char **argv)
{
char        *str;
l_uint8     *data1, *data2;
l_int32      i, n, same1, same2;
size_t       size1, size2, slice, total, start, end;
FILE        *fp;
L_DNA       *da;
SARRAY      *sa;
L_BYTEA     *lba1, *lba2, *lba3, *lba4, *lba5;
static char  mainName[] = "byteatest";

    if (argc != 1)
        return ERROR_INT("syntax: byteatest", mainName, 1);

    lept_mkdir("bytea");

        /* Test basic init, join and split */
    lba1 = l_byteaInitFromFile("feyn.tif");
    lba2 = l_byteaInitFromFile("test24.jpg");
    size1 = l_byteaGetSize(lba1);
    size2 = l_byteaGetSize(lba2);
    l_byteaJoin(lba1, &lba2);
    lba3 = l_byteaInitFromMem(lba1->data, size1);
    lba4 = l_byteaInitFromMem(lba1->data + size1, size2);

        /* Split by hand */
    l_binaryWrite("/tmp/bytea/junk1.dat", "w", lba3->data, lba3->size);
    l_binaryWrite("/tmp/bytea/junk2.dat", "w", lba4->data, lba4->size);
    filesAreIdentical("feyn.tif", "/tmp/bytea/junk1.dat", &same1);
    filesAreIdentical("test24.jpg", "/tmp/bytea/junk2.dat", &same2);
    if (same1 && same2)
        fprintf(stderr, "OK for join file\n");
    else
        fprintf(stderr, "Error: files are different!\n");

        /* Split by function */
    l_byteaSplit(lba1, size1, &lba5);
    l_binaryWrite("/tmp/bytea/junk3.dat", "w", lba1->data, lba1->size);
    l_binaryWrite("/tmp/bytea/junk4.dat", "w", lba5->data, lba5->size);
    filesAreIdentical("feyn.tif", "/tmp/bytea/junk3.dat", &same1);
    filesAreIdentical("test24.jpg", "/tmp/bytea/junk4.dat", &same2);
    if (same1 && same2)
        fprintf(stderr, "OK for split file\n");
    else
        fprintf(stderr, "Error: files are different!\n");
    l_byteaDestroy(&lba1);
    l_byteaDestroy(&lba2);
    l_byteaDestroy(&lba3);
    l_byteaDestroy(&lba4);
    l_byteaDestroy(&lba5);

        /* Test appending with strings */
    data1 = l_binaryRead("kernel_reg.c", &size1);
    sa = sarrayCreateLinesFromString((char *)data1, 1);
    lba1 = l_byteaCreate(0);
    n = sarrayGetCount(sa);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_byteaAppendString(lba1, str);
        l_byteaAppendString(lba1, (char *)"\n");
    }
    data2 = l_byteaGetData(lba1, &size2);
    l_binaryWrite("/tmp/bytea/junk5.dat", "w", data2, size2);
    filesAreIdentical("kernel_reg.c", "/tmp/bytea/junk5.dat", &same1);
    if (same1)
        fprintf(stderr, "OK for appended string data\n");
    else
        fprintf(stderr, "Error: appended string data is different!\n");
    lept_free(data1);
    sarrayDestroy(&sa);
    l_byteaDestroy(&lba1);

        /* Test appending with binary data */
    slice = 1000;
    total = nbytesInFile("breviar-a38.jp2");
    lba1 = l_byteaCreate(100);
    n = 1 + total / slice;
    fprintf(stderr, "******************************************************\n");
    fprintf(stderr, "* Testing error checking: ignore two reported errors *\n");
    for (i = 0, start = 0; i <= n; i++, start += slice) {
         data1 = l_binaryReadSelect("breviar-a38.jp2", start, slice, &size1);
         l_byteaAppendData(lba1, data1, size1);
         lept_free(data1);
    }
    fprintf(stderr, "******************************************************\n");
    data2 = l_byteaGetData(lba1, &size2);
    l_binaryWrite("/tmp/bytea/junk6.dat", "w", data2, size2);
    filesAreIdentical("breviar-a38.jp2", "/tmp/bytea/junk6.dat", &same1);
    if (same1)
        fprintf(stderr, "OK for appended binary data\n");
    else
        fprintf(stderr, "Error: appended binary data is different!\n");
    l_byteaDestroy(&lba1);

        /* Test search */
    convertToPdf("test24.jpg", L_JPEG_ENCODE, 0, "/tmp/bytea/junk7.pdf",
                 0, 0, 100, NULL, NULL, 0);
    lba1 = l_byteaInitFromFile("/tmp/bytea/junk7.pdf");
    l_byteaFindEachSequence(lba1, (l_uint8 *)" 0 obj\n", 7, &da);
    /* l_dnaWriteStream(stderr, da); */
    n = l_dnaGetCount(da);
    if (n == 6)
        fprintf(stderr, "OK for search: found 6 instances\n");
    else
        fprintf(stderr, "Error in search: found %d instances, not 6\n", n);
    l_byteaDestroy(&lba1);
    l_dnaDestroy(&da);

        /* Test write to file */
    lba1 = l_byteaInitFromFile("feyn.tif");
    fp = lept_fopen("/tmp/bytea/junk8.dat", "wb");
    size1 = l_byteaGetSize(lba1);
    for (start = 0; start < size1; start += 1000) {
         end = L_MIN(start + 1000 - 1, size1 - 1);
         l_byteaWriteStream(fp, lba1, start, end);
    }
    lept_fclose(fp);
    filesAreIdentical("feyn.tif", "/tmp/bytea/junk8.dat", &same1);
    if (same1)
        fprintf(stderr, "OK for written binary data\n");
    else
        fprintf(stderr, "Error: written binary data is different!\n");
    l_byteaDestroy(&lba1);

    return 0;
}