/*!
 * \brief   l_dnaMakeHistoByHash()
 *
 * \param[in]    das
 * \param[out]   pdahash    hash map: val --> index
 * \param[out]   pdav       array of values: index --> val
 * \param[out]   pdac       histo array of counts: index --> count
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates and returns a dna of occurrences (histogram),
 *          an aligned dna of values, and an associated hashmap.
 *          The hashmap takes %dav and a value, and points into the
 *          histogram in %dac.
 *      (2) The dna of values, %dav, is aligned with the histogram %dac,
 *          and is needed for fast lookup.  It is a hash set, because
 *          the values are unique.
 *      (3) Lookup is simple:
 *              l_dnaFindValByHash(dav, dahash, val, &index);
 *              if (index >= 0)
 *                  l_dnaGetIValue(dac, index, &icount);
 *              else
 *                  icount = 0;
 *      (4) Each non-null output pointer is cleared on entry, and the
 *          outputs are only assigned on success.  All three output
 *          pointers are required, and an empty %das is an error.
 *      (5) If any of the three containers cannot be made, whatever was
 *          made is destroyed and 1 is returned.
 * </pre>
 */
l_ok
l_dnaMakeHistoByHash(L_DNA       *das,
                     L_DNAHASH  **pdahash,
                     L_DNA      **pdav,
                     L_DNA      **pdac)
{
    l_int32     i, n, nitems, index, count;
    l_uint32    nsize;
    l_uint64    key;
    l_float64   val;
    L_DNA      *dac, *dav;
    L_DNAHASH  *dahash;

    PROCNAME("l_dnaMakeHistoByHash");

    if (pdahash) *pdahash = NULL;
    if (pdac) *pdac = NULL;
    if (pdav) *pdav = NULL;
    if (!pdahash || !pdac || !pdav)
        return ERROR_INT("&dahash, &dac, &dav not all defined", procName, 1);
    if (!das)
        return ERROR_INT("das not defined", procName, 1);
    if ((n = l_dnaGetCount(das)) == 0)
        return ERROR_INT("no data in das", procName, 1);

    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
    dahash = l_dnaHashCreate(nsize, 8);
    dac = l_dnaCreate(n);  /* histogram */
    dav = l_dnaCreate(n);  /* the values */
    if (!dahash || !dac || !dav) {
            /* Do not report success with partially built outputs */
        l_dnaHashDestroy(&dahash);
        l_dnaDestroy(&dac);
        l_dnaDestroy(&dav);
        return ERROR_INT("dahash, dac, dav not all made", procName, 1);
    }

    for (i = 0, nitems = 0; i < n; i++) {
        l_dnaGetDValue(das, i, &val);

            /* Is this value already stored in dav? */
        l_dnaFindValByHash(dav, dahash, val, &index);
        if (index >= 0) {  /* found: bump the count at that index */
            l_dnaGetIValue(dac, index, &count);
            l_dnaSetValue(dac, index, count + 1);
        } else {  /* not found: it becomes entry number nitems */
            l_hashFloat64ToUint64(nsize, val, &key);
            l_dnaHashAdd(dahash, key, (l_float64)nitems);
            l_dnaAddNumber(dav, val);
            l_dnaAddNumber(dac, 1);
            nitems++;
        }
    }

    *pdahash = dahash;
    *pdac = dac;
    *pdav = dav;
    return 0;
}
/*!
 * \brief   recogGetClassIndex()
 *
 * \param[in]    recog     with LUT's pre-computed
 * \param[in]    val       integer value; can be up to 3 bytes for UTF-8
 * \param[in]    text      text from which %val was derived; used if not found
 * \param[out]   pindex    index into dna_tochar
 * \return  0 if found; 1 if not found and added; 2 on error.
 *
 * <pre>
 * Notes:
 *      (1) Used during training.  Each character class has one entry
 *          in recog->dna_tochar (its integer value) and one entry in
 *          recog->sa_text (its text string).
 *      (2) recog->dna_tochar is scanned linearly for %val.  On a miss,
 *          %val is appended to dna_tochar, a copy of %text is appended
 *          to sa_text, and recog->setsize is incremented.
 *      (3) *pindex is set to -1 on error; otherwise it holds the index
 *          of the existing or newly added class.
 *      (4) Caller must check the function return value.
 * </pre>
 */
l_int32
recogGetClassIndex(L_RECOG  *recog,
                   l_int32   val,
                   char     *text,
                   l_int32  *pindex)
{
    l_int32  k, nclasses, stored;

    PROCNAME("recogGetClassIndex");

    if (pindex == NULL)
        return ERROR_INT("&index not defined", procName, 2);
    *pindex = -1;
    if (recog == NULL)
        return ERROR_INT("recog not defined", procName, 2);
    if (text == NULL)
        return ERROR_INT("text not defined", procName, 2);

        /* Look for an existing class with this value */
    nclasses = l_dnaGetCount(recog->dna_tochar);
    k = 0;
    while (k < nclasses) {
        l_dnaGetIValue(recog->dna_tochar, k, &stored);
        if (stored == val) {
            *pindex = k;
            return 0;
        }
        k++;
    }

        /* New class: it takes the next free index, which is nclasses */
    l_dnaAddNumber(recog->dna_tochar, val);
    sarrayAddString(recog->sa_text, text, L_COPY);
    recog->setsize++;
    *pindex = nclasses;
    return 1;
}
/*!
 * \brief   l_dnaRemoveDupsByAset()
 *
 * \param[in]    das
 * \return  dad with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Values are emitted in the order of their first occurrence
 *          in %das; a temporary set keyed on the float value records
 *          what has already been emitted.
 * </pre>
 */
L_DNA *
l_dnaRemoveDupsByAset(L_DNA  *das)
{
    l_int32    k, count;
    l_float64  num;
    L_DNA     *dad;
    L_ASET    *seen;
    RB_TYPE    key;

    PROCNAME("l_dnaRemoveDupsByAset");

    if (das == NULL)
        return (L_DNA *)ERROR_PTR("das not defined", procName, NULL);

    seen = l_asetCreate(L_FLOAT_TYPE);
    dad = l_dnaCreate(0);
    count = l_dnaGetCount(das);
    for (k = 0; k < count; k++) {
        l_dnaGetDValue(das, k, &num);
        key.ftype = num;
        if (l_asetFind(seen, key))
            continue;  /* already emitted */
        l_dnaAddNumber(dad, num);
        l_asetInsert(seen, key);
    }

    l_asetDestroy(&seen);
    return dad;
}
/*!
 *  l_dnaCreateFromDArray()
 *
 *      Input:  darray (float)
 *              size (of the array)
 *              copyflag (L_INSERT or L_COPY)
 *      Return: da, or null on error
 *
 *  Notes:
 *      (1) With L_INSERT, ownership of the input array is transferred
 *          to the returned l_dna, and all @size elements are considered
 *          to be valid.
 *      (2) With L_COPY, the @size values are appended one at a time to
 *          a new l_dna; the input array is not modified.
 *      (3) If null is returned, the input array has not been taken
 *          over, regardless of @copyflag.
 */
L_DNA *
l_dnaCreateFromDArray(l_float64  *darray,
                      l_int32     size,
                      l_int32     copyflag)
{
    l_int32  i;
    L_DNA   *da;

    PROCNAME("l_dnaCreateFromDArray");

    if (!darray)
        return (L_DNA *)ERROR_PTR("darray not defined", procName, NULL);
    if (size <= 0)
        return (L_DNA *)ERROR_PTR("size must be > 0", procName, NULL);
    if (copyflag != L_INSERT && copyflag != L_COPY)
        return (L_DNA *)ERROR_PTR("invalid copyflag", procName, NULL);

        /* da->array is dereferenced below, so the allocation must be checked */
    if ((da = l_dnaCreate(size)) == NULL)
        return (L_DNA *)ERROR_PTR("da not made", procName, NULL);

    if (copyflag == L_INSERT) {
            /* Swap the freshly allocated buffer for the caller's array */
        if (da->array) FREE(da->array);
        da->array = darray;
        da->n = size;
    } else {  /* just copy the contents */
        for (i = 0; i < size; i++)
            l_dnaAddNumber(da, darray[i]);
    }

    return da;
}
/*! * l_dnaReadStream() * * Input: stream * Return: da, or null on error */ L_DNA * l_dnaReadStream(FILE *fp) { l_int32 i, n, index, ret, version; l_float64 val, startx, delx; L_DNA *da; PROCNAME("l_dnaReadStream"); if (!fp) return (L_DNA *)ERROR_PTR("stream not defined", procName, NULL); ret = fscanf(fp, "\nL_Dna Version %d\n", &version); if (ret != 1) return (L_DNA *)ERROR_PTR("not a l_dna file", procName, NULL); if (version != DNA_VERSION_NUMBER) return (L_DNA *)ERROR_PTR("invalid l_dna version", procName, NULL); if (fscanf(fp, "Number of numbers = %d\n", &n) != 1) return (L_DNA *)ERROR_PTR("invalid number of numbers", procName, NULL); if ((da = l_dnaCreate(n)) == NULL) return (L_DNA *)ERROR_PTR("da not made", procName, NULL); for (i = 0; i < n; i++) { if (fscanf(fp, " [%d] = %lf\n", &index, &val) != 2) return (L_DNA *)ERROR_PTR("bad input data", procName, NULL); l_dnaAddNumber(da, val); } /* Optional data */ if (fscanf(fp, "startx = %lf, delx = %lf\n", &startx, &delx) == 2) l_dnaSetParameters(da, startx, delx); return da; }
/*!
 *  l_dnaJoin()
 *
 *      Input:  dad  (dest dna; add to this one)
 *              das  (<optional> source dna; add from this one)
 *              istart  (starting index in das)
 *              iend  (ending index in das; use -1 to cat all)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) istart < 0 is taken to mean 'read from the start' (istart = 0)
 *      (2) iend < 0 means 'read to the end'; an iend at or beyond the
 *          end of das is also clipped to the last element.
 *      (3) if das == NULL, this is a no-op
 *      (4) After clipping, istart > iend is an error.  This includes
 *          the case of an empty das.
 */
l_int32
l_dnaJoin(L_DNA   *dad,
          L_DNA   *das,
          l_int32  istart,
          l_int32  iend)
{
    l_int32    count, first, last, k;
    l_float64  num;

    PROCNAME("l_dnaJoin");

    if (dad == NULL)
        return ERROR_INT("dad not defined", procName, 1);
    if (das == NULL)
        return 0;

        /* Clip the requested range to what das holds */
    first = (istart < 0) ? 0 : istart;
    count = l_dnaGetCount(das);
    last = (iend < 0 || iend >= count) ? count - 1 : iend;
    if (first > last)
        return ERROR_INT("istart > iend; nothing to add", procName, 1);

    k = first;
    while (k <= last) {
        l_dnaGetDValue(das, k, &num);
        l_dnaAddNumber(dad, num);
        k++;
    }

    return 0;
}
/*!
 * \brief   l_dnaIntersectionByHash()
 *
 * \param[in]    da1, da2
 * \return  dad intersection of the number arrays, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This uses the same method for building the intersection set
 *          as ptaIntersectionByHash() and sarrayIntersectionByHash().
 *      (2) The larger array is hashed up front.  The smaller array is
 *          then traversed in order; a value is appended to %dad when it
 *          is found in the larger array and has not been appended
 *          before.  A second hash, built during the traversal, records
 *          the values already appended.
 *      (3) When the two arrays have the same count, %da2 is the one
 *          that is traversed.
 * </pre>
 */
L_DNA *
l_dnaIntersectionByHash(L_DNA  *da1,
                        L_DNA  *da2)
{
    l_int32     count1, count2, nsmall, nbuckets, k, inbig, inseen;
    l_uint32    primesize;
    l_uint64    hkey;
    l_float64   num;
    L_DNAHASH  *bighash, *seenhash;
    L_DNA      *da_small, *da_big, *dad;

    PROCNAME("l_dnaIntersectionByHash");

    if (da1 == NULL)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (da2 == NULL)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* Hash the bigger array.  Both da_small and da_big are
         * borrowed handles and must not be destroyed here. */
    count1 = l_dnaGetCount(da1);
    count2 = l_dnaGetCount(da2);
    if (count1 < count2) {
        da_small = da1;
        da_big = da2;
    } else {
        da_small = da2;
        da_big = da1;
    }
    bighash = l_dnaHashCreateFromDna(da_big);

        /* Walk the smaller array, keeping each number that is in
         * da_big (via bighash) and that has not yet been kept
         * (via seenhash). */
    dad = l_dnaCreate(0);
    nsmall = l_dnaGetCount(da_small);
    findNextLargerPrime(nsmall / 20, &primesize);  /* buckets in hash table */
    seenhash = l_dnaHashCreate(primesize, 0);
    nbuckets = l_dnaHashGetCount(seenhash);
    for (k = 0; k < nsmall; k++) {
        l_dnaGetDValue(da_small, k, &num);
        l_dnaFindValByHash(da_big, bighash, num, &inbig);
        if (inbig < 0)
            continue;  /* not in the big array */
        l_dnaFindValByHash(da_small, seenhash, num, &inseen);
        if (inseen != -1)
            continue;  /* already in the output */
        l_dnaAddNumber(dad, num);
        l_hashFloat64ToUint64(nbuckets, num, &hkey);
        l_dnaHashAdd(seenhash, hkey, (l_float64)k);
    }

    l_dnaHashDestroy(&bighash);
    l_dnaHashDestroy(&seenhash);
    return dad;
}
/*!
 * \brief   l_dnaRemoveDupsByHash()
 *
 * \param[in]    das
 * \param[out]   pdad      hash set
 * \param[out]   pdahash   [optional] dnahash used for lookup
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a dna with unique values, in order of first
 *          occurrence in %das.
 *      (2) The dnahash is built up with dad to assure uniqueness.
 *          It can be used to find if an element is in the set:
 *              l_dnaFindValByHash(dad, dahash, val, &index)
 *      (3) If %pdahash is NULL, the dnahash is destroyed before
 *          returning; otherwise it is handed to the caller.
 * </pre>
 */
l_ok
l_dnaRemoveDupsByHash(L_DNA       *das,
                      L_DNA      **pdad,
                      L_DNAHASH  **pdahash)
{
    l_int32     k, total, found, nunique;
    l_uint32    nbuckets;
    l_uint64    hkey;
    l_float64   num;
    L_DNA      *uniq;
    L_DNAHASH  *table;

    PROCNAME("l_dnaRemoveDupsByHash");

    if (pdahash != NULL)
        *pdahash = NULL;
    if (pdad == NULL)
        return ERROR_INT("&dad not defined", procName, 1);
    *pdad = NULL;
    if (das == NULL)
        return ERROR_INT("das not defined", procName, 1);

    total = l_dnaGetCount(das);
    findNextLargerPrime(total / 20, &nbuckets);  /* buckets in hash table */
    table = l_dnaHashCreate(nbuckets, 8);
    uniq = l_dnaCreate(total);
    *pdad = uniq;

    nunique = 0;
    for (k = 0; k < total; k++) {
        l_dnaGetDValue(das, k, &num);
        l_dnaFindValByHash(uniq, table, num, &found);
        if (found >= 0)
            continue;  /* duplicate */

            /* First sighting: its index in uniq is nunique */
        l_hashFloat64ToUint64(nbuckets, num, &hkey);
        l_dnaHashAdd(table, hkey, (l_float64)nunique);
        l_dnaAddNumber(uniq, num);
        nunique++;
    }

    if (pdahash == NULL)
        l_dnaHashDestroy(&table);
    else
        *pdahash = table;
    return 0;
}
/*!
 *  l_dnaMakeDelta()
 *
 *      Input:  das (input l_dna)
 *      Return: dad (of difference values val[i+1] - val[i]),
 *                   or null on error
 *
 *  Notes:
 *      (1) Values are read with l_dnaGetIValue(), so the differences
 *          are differences of l_int32 values.
 *      (2) For an input with n values, n - 1 differences are added
 *          to dad.  With fewer than 2 input values, no differences
 *          are added.
 */
L_DNA *
l_dnaMakeDelta(L_DNA  *das)
{
    l_int32  i, n, prev, cur;
    L_DNA   *dad;

    PROCNAME("l_dnaMakeDelta");

    if (!das)
        return (L_DNA *)ERROR_PTR("das not defined", procName, NULL);

    n = l_dnaGetCount(das);
    dad = l_dnaCreate(n - 1);

        /* Seed with val[0], so that the first difference is
         * val[1] - val[0] and not val[1] - 0. */
    prev = 0;
    if (n > 0)
        l_dnaGetIValue(das, 0, &prev);
    for (i = 1; i < n; i++) {
        l_dnaGetIValue(das, i, &cur);
        l_dnaAddNumber(dad, cur - prev);
        prev = cur;
    }

    return dad;
}
/*!
 *  l_dnaCreateFromIArray()
 *
 *      Input:  iarray (integer)
 *              size (of the array)
 *      Return: da, or null on error
 *
 *  Notes:
 *      (1) An int array cannot be inserted directly into a l_dna,
 *          which holds doubles.  Each integer is therefore appended
 *          as a number to a new l_dna, and the caller keeps ownership
 *          of the input array.
 */
L_DNA *
l_dnaCreateFromIArray(l_int32  *iarray,
                      l_int32   size)
{
    l_int32  k;
    L_DNA   *da;

    PROCNAME("l_dnaCreateFromIArray");

    if (iarray == NULL)
        return (L_DNA *)ERROR_PTR("iarray not defined", procName, NULL);
    if (size <= 0)
        return (L_DNA *)ERROR_PTR("size must be > 0", procName, NULL);

    da = l_dnaCreate(size);
    k = 0;
    while (k < size) {
        l_dnaAddNumber(da, iarray[k]);
        k++;
    }

    return da;
}
/*!
 *  numaConvertToDna
 *
 *      Input:  na
 *      Return: da, or null on error
 *
 *  Notes:
 *      (1) Each value is read from na as a l_float32 and appended,
 *          in order, to a new l_dna.
 */
L_DNA *
numaConvertToDna(NUMA  *na)
{
    l_int32    k, count;
    l_float32  fval;
    L_DNA     *da;

    PROCNAME("numaConvertToDna");

    if (na == NULL)
        return (L_DNA *)ERROR_PTR("na not defined", procName, NULL);

    count = numaGetCount(na);
    da = l_dnaCreate(count);
    k = 0;
    while (k < count) {
        numaGetFValue(na, k, &fval);
        l_dnaAddNumber(da, fval);
        k++;
    }

    return da;
}
/*!
 * \brief   l_dnaIntersectionByAset()
 *
 * \param[in]    da1, da2
 * \return  dad with the intersection of the two arrays, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See sarrayIntersection() for the approach.
 *      (2) Here, the key in building the sorted tree is the number itself.
 *      (3) Operations using an underlying tree are O(nlogn), which is
 *          typically less efficient than hashing, which is O(n).
 *      (4) The larger array is loaded into a set, and the smaller one
 *          is traversed in order.  A second set records the values
 *          already output, so that each appears only once in %dad.
 * </pre>
 */
L_DNA *
l_dnaIntersectionByAset(L_DNA  *da1,
                        L_DNA  *da2)
{
    l_int32    count1, count2, k, nsmall;
    l_float64  num;
    L_ASET    *bigset, *seen;
    RB_TYPE    key;
    L_DNA     *da_small, *da_big, *dad;

    PROCNAME("l_dnaIntersectionByAset");

    if (da1 == NULL)
        return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
    if (da2 == NULL)
        return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);

        /* The largest array goes into a set.  Both da_small and
         * da_big are borrowed handles; do not destroy them. */
    count1 = l_dnaGetCount(da1);
    count2 = l_dnaGetCount(da2);
    if (count1 < count2) {
        da_small = da1;
        da_big = da2;
    } else {
        da_small = da2;
        da_big = da1;
    }
    bigset = l_asetCreateFromDna(da_big);

        /* Build up the intersection of floats */
    dad = l_dnaCreate(0);
    nsmall = l_dnaGetCount(da_small);
    seen = l_asetCreate(L_FLOAT_TYPE);
    for (k = 0; k < nsmall; k++) {
        l_dnaGetDValue(da_small, k, &num);
        key.ftype = num;
        if (!l_asetFind(bigset, key))
            continue;  /* not common to both */
        if (l_asetFind(seen, key))
            continue;  /* already output */
        l_dnaAddNumber(dad, num);
        l_asetInsert(seen, key);
    }

    l_asetDestroy(&bigset);
    l_asetDestroy(&seen);
    return dad;
}
/*!
 *  l_dnaCopy()
 *
 *      Input:  da
 *      Return: copy of l_dna, or null on error
 *
 *  Notes:
 *      (1) The copy is created with the same allocation size (nalloc)
 *          as the source, takes its startx and delx fields, and then
 *          receives the n stored values in order.
 */
L_DNA *
l_dnaCopy(L_DNA  *da)
{
    l_int32  k;
    L_DNA   *dup;

    PROCNAME("l_dnaCopy");

    if (da == NULL)
        return (L_DNA *)ERROR_PTR("da not defined", procName, NULL);

    dup = l_dnaCreate(da->nalloc);
    if (dup == NULL)
        return (L_DNA *)ERROR_PTR("dac not made", procName, NULL);

    dup->startx = da->startx;
    dup->delx = da->delx;
    k = 0;
    while (k < da->n) {
        l_dnaAddNumber(dup, da->array[k]);
        k++;
    }

    return dup;
}
/*!
 *  l_dnaMakeSequence()
 *
 *      Input:  startval
 *              increment
 *              size (of sequence)
 *      Return: l_dna of sequence of evenly spaced values, or null on error
 *
 *  Notes:
 *      (1) Element i is computed directly as startval + i * increment,
 *          not by repeated addition.
 */
L_DNA *
l_dnaMakeSequence(l_float64  startval,
                  l_float64  increment,
                  l_int32    size)
{
    l_int32    k;
    l_float64  term;
    L_DNA     *da;

    PROCNAME("l_dnaMakeSequence");

    da = l_dnaCreate(size);
    if (da == NULL)
        return (L_DNA *)ERROR_PTR("da not made", procName, NULL);

    k = 0;
    while (k < size) {
        term = startval + k * increment;
        l_dnaAddNumber(da, term);
        k++;
    }

    return da;
}
/*!
 *  l_dnaaAddNumber()
 *
 *      Input:  daa
 *              index (of l_dna within l_dnaa)
 *              val  (number to be added; stored as a double)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Adds to an existing l_dna only; an index outside
 *          [0 ... count - 1] is an error.
 *      (2) The l_dna is obtained as a clone, and that clone handle
 *          is destroyed before returning.
 */
l_int32
l_dnaaAddNumber(L_DNAA    *daa,
                l_int32    index,
                l_float64  val)
{
    l_int32  count;
    L_DNA   *target;

    PROCNAME("l_dnaaAddNumber");

    if (daa == NULL)
        return ERROR_INT("daa not defined", procName, 1);
    count = l_dnaaGetCount(daa);
    if (!(index >= 0 && index < count))
        return ERROR_INT("invalid index in daa", procName, 1);

    target = l_dnaaGetDna(daa, index, L_CLONE);
    l_dnaAddNumber(target, val);
    l_dnaDestroy(&target);
    return 0;
}
/*!
 * \brief   l_dnaHashAdd()
 *
 * \param[in]    dahash
 * \param[in]    key      key to be hashed into a bucket number
 * \param[in]    value    float value to be appended to the specific dna
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The bucket is key modulo the number of buckets.
 *      (2) The dna for a bucket is made on first use, with the
 *          initial size stored in the dnahash.
 * </pre>
 */
l_ok
l_dnaHashAdd(L_DNAHASH  *dahash,
             l_uint64    key,
             l_float64   value)
{
    l_int32  slot;
    L_DNA   *fresh;

    PROCNAME("l_dnaHashAdd");

    if (dahash == NULL)
        return ERROR_INT("dahash not defined", procName, 1);

    slot = key % dahash->nbuckets;
    if (dahash->dna[slot] == NULL) {
            /* First value for this bucket */
        fresh = l_dnaCreate(dahash->initsize);
        if (fresh == NULL)
            return ERROR_INT("da not made", procName, 1);
        dahash->dna[slot] = fresh;
    }
    l_dnaAddNumber(dahash->dna[slot], value);
    return 0;
}
main(int argc, char **argv) { l_int32 i, nbins, ival; l_float64 pi, angle, val, sum; L_DNA *da1, *da2, *da3, *da4, *da5; L_DNAA *daa1, *daa2; GPLOT *gplot; NUMA *na, *nahisto, *nax; L_REGPARAMS *rp; if (regTestSetup(argc, argv, &rp)) return 1; pi = 3.1415926535; da1 = l_dnaCreate(50); for (i = 0; i < 5000; i++) { angle = 0.02293 * i * pi; val = 999. * sin(angle); l_dnaAddNumber(da1, val); } /* Conversion to Numa; I/O for Dna */ na = l_dnaConvertToNuma(da1); da2 = numaConvertToDna(na); l_dnaWrite("/tmp/dna1.da", da1); l_dnaWrite("/tmp/dna2.da", da2); da3 = l_dnaRead("/tmp/dna2.da"); l_dnaWrite("/tmp/dna3.da", da3); regTestCheckFile(rp, "/tmp/dna1.da"); /* 0 */ regTestCheckFile(rp, "/tmp/dna2.da"); /* 1 */ regTestCheckFile(rp, "/tmp/dna3.da"); /* 2 */ regTestCompareFiles(rp, 1, 2); /* 3 */ /* I/O for Dnaa */ daa1 = l_dnaaCreate(3); l_dnaaAddDna(daa1, da1, L_INSERT); l_dnaaAddDna(daa1, da2, L_INSERT); l_dnaaAddDna(daa1, da3, L_INSERT); l_dnaaWrite("/tmp/dnaa1.daa", daa1); daa2 = l_dnaaRead("/tmp/dnaa1.daa"); l_dnaaWrite("/tmp/dnaa2.daa", daa2); regTestCheckFile(rp, "/tmp/dnaa1.daa"); /* 4 */ regTestCheckFile(rp, "/tmp/dnaa2.daa"); /* 5 */ regTestCompareFiles(rp, 4, 5); /* 6 */ l_dnaaDestroy(&daa1); l_dnaaDestroy(&daa2); /* Just for fun -- is the numa ok? 
*/ nahisto = numaMakeHistogramClipped(na, 12, 2000); nbins = numaGetCount(nahisto); nax = numaMakeSequence(0, 1, nbins); gplot = gplotCreate("/tmp/historoot", GPLOT_PNG, "Histo example", "i", "histo[i]"); gplotAddPlot(gplot, nax, nahisto, GPLOT_LINES, "sine"); gplotMakeOutput(gplot); #ifndef _WIN32 sleep(1); #else Sleep(1000); #endif /* _WIN32 */ regTestCheckFile(rp, "/tmp/historoot.png"); /* 7 */ gplotDestroy(&gplot); numaDestroy(&na); numaDestroy(&nax); numaDestroy(&nahisto); /* Handling precision of int32 in double */ da4 = l_dnaCreate(25); for (i = 0; i < 1000; i++) l_dnaAddNumber(da4, 1928374 * i); l_dnaWrite("/tmp/dna4.da", da4); da5 = l_dnaRead("/tmp/dna4.da"); sum = 0; for (i = 0; i < 1000; i++) { l_dnaGetIValue(da5, i, &ival); sum += L_ABS(ival - i * 1928374); /* we better be adding 0 each time */ } regTestCompareValues(rp, sum, 0.0, 0.0); /* 8 */ l_dnaDestroy(&da4); l_dnaDestroy(&da5); return regTestCleanup(rp); }