/*!
 * \brief   l_asetCreateFromSarray()
 *
 * \param[in]    sa    array of strings
 * \return  set whose keys are the 64-bit hashes of the strings in %sa,
 *          or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Every string in %sa is hashed with l_hashStringToUint64()
 *          and the resulting value is inserted into the set as an
 *          unsigned key.  The strings themselves are not stored.
 * </pre>
 */
L_ASET *
l_asetCreateFromSarray(SARRAY  *sa)
{
    char     *text;
    l_int32   idx, count;
    l_uint64  hashval;
    L_ASET   *aset;
    RB_TYPE   rbkey;

    PROCNAME("l_asetCreateFromSarray");

    if (sa == NULL)
        return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL);

    aset = l_asetCreate(L_UINT_TYPE);
    count = sarrayGetCount(sa);
    idx = 0;
    while (idx < count) {
            /* Borrow the string (no copy); only its hash is kept */
        text = sarrayGetString(sa, idx, L_NOCOPY);
        l_hashStringToUint64(text, &hashval);
        rbkey.utype = hashval;
        l_asetInsert(aset, rbkey);
        idx++;
    }

    return aset;
}
/*!
 * \brief   l_dnaHashCreateFromSarray()
 *
 * \param[in]    sa    array of strings
 * \return  dahash, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Each string in %sa is hashed to a 64-bit key, and the index
 *          of the string in %sa is stored in the hash table as the
 *          value.  Storing the index enables operations that check
 *          for duplicates.
 *      (2) The number of buckets is the prime returned by
 *          findNextLargerPrime() for n / 20, aiming at roughly
 *          20 entries per bucket.
 * </pre>
 */
L_DNAHASH *
l_dnaHashCreateFromSarray(SARRAY  *sa)
{
    char       *str;
    l_int32     i, n;
    l_uint32    nsize;
    l_uint64    key;
    L_DNAHASH  *dahash;

    PROCNAME("l_dnaHashCreateFromSarray");

    if (!sa)
        return (L_DNAHASH *)ERROR_PTR("sa not defined", procName, NULL);

        /* Build up dnaHash of indices, hashed by a 64-bit key that
         * should randomize the lower bits used in bucket selection.
         * Having about 20 pts in each bucket is roughly optimal. */
    n = sarrayGetCount(sa);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */

        /* Do not try to populate a table that could not be made */
    if ((dahash = l_dnaHashCreate(nsize, 8)) == NULL)
        return (L_DNAHASH *)ERROR_PTR("dahash not made", procName, NULL);

        /* Add each string, using the hash as key and the index into %sa
         * as the value. */
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64(str, &key);
        l_dnaHashAdd(dahash, key, (l_float64)i);
    }

    return dahash;
}
/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas    input array of strings
 * \return  sad  new array with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Per the original notes, this is O(nlogn) and considerably
 *          slower than sarrayRemoveDupsByHash() for large string arrays.
 *      (2) The key for each string is its 64-bit hash; two strings are
 *          treated as duplicates when their hashes are equal.
 *      (3) The set is built incrementally.  For each input string the
 *          key is first looked up; only when it is absent is a copy of
 *          the string appended to the output and the key inserted.
 *          The first occurrence of each string is therefore the one
 *          that is kept, in input order.
 * </pre>
 */
SARRAY *
sarrayRemoveDupsByAset(SARRAY  *sas)
{
    char     *text;
    l_int32   idx, count;
    l_uint64  hashval;
    L_ASET   *seen;
    RB_TYPE   rbkey;
    SARRAY   *sad;

    PROCNAME("sarrayRemoveDupsByAset");

    if (sas == NULL)
        return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);

    seen = l_asetCreate(L_UINT_TYPE);
    sad = sarrayCreate(0);
    count = sarrayGetCount(sas);
    for (idx = 0; idx < count; idx++) {
        text = sarrayGetString(sas, idx, L_NOCOPY);
        l_hashStringToUint64(text, &hashval);
        rbkey.utype = hashval;
        if (l_asetFind(seen, rbkey))
            continue;  /* key already present: skip the duplicate */

        sarrayAddString(sad, text, L_COPY);
        l_asetInsert(seen, rbkey);
    }

    l_asetDestroy(&seen);
    return sad;
}
/*
 * Build all possible strings of lowercase roman alphabet characters,
 * up to a maximum length of %nchars (at most 5).
 *
 *   nchars    3, 4 or 5: maximum string length to generate
 *   add_dups  if nonzero, strings of length 3 and longer whose final
 *             letter is one of the first four letters are added twice
 *
 * Strings are appended in prefix order: each string is added before
 * any of the longer strings that extend it.  When add_dups is 0 and
 * nchars is 5, the hash of every 5-character string with the prefix
 * selected by (i, j, k, l) == (17, 12, 4, 21) is printed to stderr.
 */
static SARRAY *
BuildShortStrings(l_int32  nchars,  /* 3, 4 or 5 */
                  l_int32  add_dups)
{
    char      buf[64];
    l_int32   i, j, k, l, m;
    l_int32   c1, c2, c3, c4, c5;  /* character codes for each position */
    l_int32   show_hash;
    l_uint64  hash;
    SARRAY   *sa;

    sa = sarrayCreate(1000);
    for (i = 0; i < 26; i++) {
        c1 = i + 0x61;
        sprintf(buf, "%c", c1);
        sarrayAddString(sa, buf, L_COPY);

        for (j = 0; j < 26; j++) {
            c2 = j + 0x61;
            sprintf(buf, "%c%c", c1, c2);
            sarrayAddString(sa, buf, L_COPY);

            for (k = 0; k < 26; k++) {
                c3 = k + 0x61;
                sprintf(buf, "%c%c%c", c1, c2, c3);
                sarrayAddString(sa, buf, L_COPY);
                if (add_dups && k < 4)  /* add redundant strings */
                    sarrayAddString(sa, buf, L_COPY);
                if (nchars <= 3)
                    continue;  /* no 4-character strings requested */

                for (l = 0; l < 26; l++) {
                    c4 = l + 0x61;
                    sprintf(buf, "%c%c%c%c", c1, c2, c3, c4);
                    sarrayAddString(sa, buf, L_COPY);
                    if (add_dups && l < 4)  /* add redundant strings */
                        sarrayAddString(sa, buf, L_COPY);
                    if (nchars <= 4)
                        continue;  /* no 5-character strings requested */

                        /* The debug-print condition does not depend
                         * on the innermost index */
                    show_hash = !add_dups && i == 17 && j == 12 &&
                                k == 4 && l == 21;
                    for (m = 0; m < 26; m++) {
                        c5 = m + 0x61;
                        sprintf(buf, "%c%c%c%c%c", c1, c2, c3, c4, c5);
                        sarrayAddString(sa, buf, L_COPY);
                        if (show_hash) {
                            l_hashStringToUint64(buf, &hash);
                            fprintf(stderr, "  %llx\n", hash);
                        }
                        if (add_dups && m < 4)  /* add redundant */
                            sarrayAddString(sa, buf, L_COPY);
                    }
                }
            }
        }
    }

    return sa;
}
/*!
 * \brief   sarrayIntersectionByHash()
 *
 * \param[in]    sa1, sa2    input arrays of strings
 * \return  sad  intersection of the strings, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Per the original notes, this is faster than
 *          sarrayIntersectionByAset().
 *      (2) The larger input is indexed in a dnahash.  The smaller input
 *          is then traversed; a string is copied to the output if it
 *          is found in the larger input and has not yet been emitted.
 *          A second dnahash, holding indices into the smaller input,
 *          records which strings have already been emitted, so each
 *          string appears at most once in the output.
 *      (3) When the two inputs have the same count, %sa1 is treated as
 *          the larger one.
 * </pre>
 */
SARRAY *
sarrayIntersectionByHash(SARRAY  *sa1,
                         SARRAY  *sa2)
{
    char       *text;
    l_int32     count1, count2, nsmall, idx, bigpos, seenpos;
    l_uint32    nbuckets;
    l_uint64    hashkey;
    L_DNAHASH  *bighash, *seenhash;
    SARRAY     *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByHash");

    if (sa1 == NULL)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (sa2 == NULL)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Decide which input is larger; both pointers are borrowed
         * and must not be destroyed here. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sa_small = sa1;
        sa_big = sa2;
    } else {
        sa_small = sa2;
        sa_big = sa1;
    }

        /* Index the larger input */
    bighash = l_dnaHashCreateFromSarray(sa_big);

        /* Build up the intersection.  A string is added to %sad if it
         * is in sa_big (checked with bighash) and has not yet been
         * seen in the traversal of sa_small (checked with seenhash). */
    sad = sarrayCreate(0);
    nsmall = sarrayGetCount(sa_small);
    findNextLargerPrime(nsmall / 20, &nbuckets);  /* buckets in table */
    seenhash = l_dnaHashCreate(nbuckets, 0);
    for (idx = 0; idx < nsmall; idx++) {
        text = sarrayGetString(sa_small, idx, L_NOCOPY);

        sarrayFindStringByHash(sa_big, bighash, text, &bigpos);
        if (bigpos < 0)
            continue;  /* not in the larger input */

        sarrayFindStringByHash(sa_small, seenhash, text, &seenpos);
        if (seenpos != -1)
            continue;  /* already emitted */

        sarrayAddString(sad, text, L_COPY);
        l_hashStringToUint64(text, &hashkey);
        l_dnaHashAdd(seenhash, hashkey, (l_float64)idx);
    }

    l_dnaHashDestroy(&bighash);
    l_dnaHashDestroy(&seenhash);
    return sad;
}
/*!
 * \brief   sarrayRemoveDupsByHash()
 *
 * \param[in]    sas       input array of strings
 * \param[out]   psad      unique set of strings; duplicates removed
 * \param[out]   pdahash   [optional] dnahash used for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a sarray with unique values, keeping the first
 *          occurrence of each string in input order.
 *      (2) The dnahash is built up together with sad to assure
 *          uniqueness; the value stored for each key is the index of
 *          the string in sad.  It can be used afterwards to find if
 *          a string is in the set:
 *              sarrayFindStringByHash(sad, dahash, str, \&index)
 *      (3) If %pdahash is NULL the dnahash is destroyed before
 *          returning; otherwise it is handed to the caller.
 *      (4) Per the original notes, this is faster than
 *          sarrayRemoveDupsByAset(), although there is a double-loop
 *          lookup within the dna in each bucket.
 * </pre>
 */
l_int32
sarrayRemoveDupsByHash(SARRAY      *sas,
                       SARRAY     **psad,
                       L_DNAHASH  **pdahash)
{
    char       *text;
    l_int32     idx, count, found, nunique;
    l_uint32    nbuckets;
    l_uint64    hashkey;
    SARRAY     *sad;
    L_DNAHASH  *dahash;

    PROCNAME("sarrayRemoveDupsByHash");

        /* Clear the outputs before any validation can fail */
    if (pdahash != NULL)
        *pdahash = NULL;
    if (psad == NULL)
        return ERROR_INT("&sad not defined", procName, 1);
    *psad = NULL;
    if (sas == NULL)
        return ERROR_INT("sas not defined", procName, 1);

    count = sarrayGetCount(sas);
    findNextLargerPrime(count / 20, &nbuckets);  /* buckets in table */
    dahash = l_dnaHashCreate(nbuckets, 8);
    sad = sarrayCreate(count);
    *psad = sad;

    nunique = 0;
    for (idx = 0; idx < count; idx++) {
        text = sarrayGetString(sas, idx, L_NOCOPY);
        sarrayFindStringByHash(sad, dahash, text, &found);
        if (found >= 0)
            continue;  /* already in sad */

            /* New string: record its position in sad, then append it */
        l_hashStringToUint64(text, &hashkey);
        l_dnaHashAdd(dahash, hashkey, (l_float64)nunique);
        sarrayAddString(sad, text, L_COPY);
        nunique++;
    }

    if (pdahash == NULL) {
        l_dnaHashDestroy(&dahash);
        return 0;
    }
    *pdahash = dahash;
    return 0;
}
/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1, sa2    input arrays of strings
 * \return  sad  with the intersection of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: put the larger sarray into a set, using the string
 *          hashes as the key values.  Then run through the smaller
 *          sarray, building an output sarray and a second set from the
 *          strings in the smaller array: if a string is in the first
 *          set but not in the second, add the string to the output
 *          sarray and hash it into the second set.  The second set is
 *          required to make sure only one instance of each string is
 *          put into the output sarray.
 *      (2) Strings are compared through their 64-bit hashes only.
 *      (3) When the two inputs have the same count, %sa1 is treated as
 *          the larger one.
 * </pre>
 */
SARRAY *
sarrayIntersectionByAset(SARRAY  *sa1,
                         SARRAY  *sa2)
{
    char     *text;
    l_int32   count1, count2, idx, nsmall;
    l_uint64  hashval;
    L_ASET   *bigset, *seenset;
    RB_TYPE   rbkey;
    SARRAY   *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByAset");

    if (sa1 == NULL)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (sa2 == NULL)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Put the elements of the biggest array into a set.  Both
         * sa_small and sa_big are borrowed; do not destroy them. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sa_small = sa1;
        sa_big = sa2;
    } else {
        sa_small = sa2;
        sa_big = sa1;
    }
    bigset = l_asetCreateFromSarray(sa_big);

        /* Build up the intersection of strings */
    sad = sarrayCreate(0);
    nsmall = sarrayGetCount(sa_small);
    seenset = l_asetCreate(L_UINT_TYPE);
    for (idx = 0; idx < nsmall; idx++) {
        text = sarrayGetString(sa_small, idx, L_NOCOPY);
        l_hashStringToUint64(text, &hashval);
        rbkey.utype = hashval;
        if (!l_asetFind(bigset, rbkey) || l_asetFind(seenset, rbkey))
            continue;  /* not common, or already emitted */

        sarrayAddString(sad, text, L_COPY);
        l_asetInsert(seenset, rbkey);
    }

    l_asetDestroy(&bigset);
    l_asetDestroy(&seenset);
    return sad;
}
/*!
 * \brief   sarrayFindStringByHash()
 *
 * \param[in]    sa
 * \param[in]    dahash    built from sa
 * \param[in]    str       arbitrary string
 * \param[out]   pindex    index into %sa if %str is in %sa;
 *                         -1 otherwise
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Fast lookup in dnaHash associated with a sarray, to see if a
 *          random string %str is already stored in the hash table.
 *      (2) The hash of %str selects a dna of candidate indices into
 *          %sa; each candidate string is compared with %str using
 *          strcmp(), so a hash collision alone does not give a match.
 *      (3) A string that is not found is not an error: the function
 *          returns 0 with *pindex == -1.
 *      (4) It is an error for any of %sa, %dahash, %str or %pindex
 *          to be NULL.
 * </pre>
 */
l_int32
sarrayFindStringByHash(SARRAY      *sa,
                       L_DNAHASH   *dahash,
                       const char  *str,
                       l_int32     *pindex)
{
    char     *stri;
    l_int32   i, nvals, index;
    l_uint64  key;
    L_DNA    *da;

    PROCNAME("sarrayFindStringByHash");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = -1;
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!dahash)
        return ERROR_INT("dahash not defined", procName, 1);
    if (!str)  /* strcmp() below must never see a NULL pointer */
        return ERROR_INT("str not defined", procName, 1);

    l_hashStringToUint64(str, &key);
    da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
    if (!da) return 0;  /* nothing stored under this key: not found */

        /* Run through the da, looking for this string */
    nvals = l_dnaGetCount(da);
    for (i = 0; i < nvals; i++) {
        l_dnaGetIValue(da, i, &index);
        stri = sarrayGetString(sa, index, L_NOCOPY);
            /* Skip any candidate whose string could not be fetched,
             * e.g. if %dahash holds an index that is not valid in %sa */
        if (stri && !strcmp(str, stri)) {  /* duplicate */
            *pindex = index;
            return 0;
        }
    }

    return 0;
}