/*!
 *  sarraySelectBySubstring()
 *
 *      Input:  sain (input sarray)
 *              substr (<optional> substring for matching; can be NULL)
 *      Return: saout (output sarray, filtered with substring) or null on error
 *
 *  Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each string.
 *      (2) If substr == NULL, or if sain is empty, returns a copy of
 *          the sarray.
 *      (3) Matching strings are added to saout with L_COPY; they are
 *          read from sain with L_NOCOPY.
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
char    *str;
l_int32  n, i, offset, found;
size_t   sublen;
SARRAY  *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    n = sarrayGetCount(sain);
    if (!substr || n == 0)
        return sarrayCopy(sain);

    if ((saout = sarrayCreate(n)) == NULL)
        return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);

        /* The pattern length does not change; compute it once */
    sublen = strlen(substr);
    for (i = 0; i < n; i++) {
        if ((str = sarrayGetString(sain, i, L_NOCOPY)) == NULL)
            continue;  /* never hand a null pointer to strlen() */

            /* Don't rely on the callee to set these on every path */
        offset = 0;
        found = 0;
        arrayFindSequence((l_uint8 *)str, strlen(str),
                          (l_uint8 *)substr, sublen, &offset, &found);
        if (found)
            sarrayAddString(saout, str, L_COPY);
    }

    return saout;
}
/*!
 *  sarraySort()
 *
 *      Input:  saout (output sarray; can be NULL or equal to sain)
 *              sain (input sarray)
 *              sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
 *      Return: saout (output sarray, sorted by ascii value), or null on error
 *
 *  Notes:
 *      (1) Set saout = sain for in-place; otherwise, set saout = NULL,
 *          in which case a sorted copy of sain is returned.
 *          Any other value of saout is an error.
 *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
 *          Slow but simple.
 *          NOTE(review): unlike the K&R version, the inner loop here
 *          does not stop at the first in-order pair; it always walks
 *          the whole gap chain, so the cost is likely higher than
 *          O(n logn).
 *      (3) If sortorder is neither L_SORT_INCREASING nor
 *          L_SORT_DECREASING, no swaps are made and the strings
 *          are left in their input order.
 */
SARRAY *
sarraySort(SARRAY  *saout,
           SARRAY  *sain,
           l_int32  sortorder)
{
char   **array;
char    *tmp;
l_int32  n, i, j, gap;

    PROCNAME("sarraySort");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

        /* Make saout if necessary; otherwise do in-place */
    if (!saout) {
        if ((saout = sarrayCopy(sain)) == NULL)
            return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);
    } else if (sain != saout) {
        return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
    }
    array = saout->array;  /* operate directly on the array */
    n = sarrayGetCount(saout);

        /* Shell sort: only the string pointers are exchanged */
    for (gap = n / 2; gap > 0; gap = gap / 2) {
        for (i = gap; i < n; i++) {
            for (j = i - gap; j >= 0; j -= gap) {
                if ((sortorder == L_SORT_INCREASING &&
                     stringCompareLexical(array[j], array[j + gap])) ||
                    (sortorder == L_SORT_DECREASING &&
                     stringCompareLexical(array[j + gap], array[j]))) {
                    tmp = array[j];
                    array[j] = array[j + gap];
                    array[j + gap] = tmp;
                }
            }
        }
    }

    return saout;
}
/*!
 * \brief   sarrayUnionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the union of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Duplicates are removed from the concatenation of the two arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (3) Algorithm: Concatenate the two sarrays.  Then build a set,
 *          using hashed strings as keys.  As the set is built, first do
 *          a find; if not found, add the key to the set and add the string
 *          to the output sarray.  This is O(nlogn).
 *      (4) Neither input is modified: the join is made into a copy
 *          of sa1, and that temporary is destroyed before returning.
 * </pre>
 */
SARRAY *
sarrayUnionByAset(SARRAY  *sa1,
                  SARRAY  *sa2)
{
SARRAY  *sa3, *sad;

    PROCNAME("sarrayUnionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Join into a temporary, so that sa1 is left untouched */
    if ((sa3 = sarrayCopy(sa1)) == NULL)
        return (SARRAY *)ERROR_PTR("sa3 not made", procName, NULL);
    if (sarrayJoin(sa3, sa2)) {
            /* A failed join would silently give a result built
             * from sa1 only; report it instead. */
        sarrayDestroy(&sa3);
        return (SARRAY *)ERROR_PTR("join failed for sa3", procName, NULL);
    }

        /* Eliminate duplicates */
    sad = sarrayRemoveDupsByAset(sa3);
    sarrayDestroy(&sa3);
    return sad;
}
/*! * \brief jbWordsInTextlines() * * \param[in] dirin directory of input pages * \param[in] reduction 1 for full res; 2 for half-res * \param[in] maxwidth of word mask components, to be kept * \param[in] maxheight of word mask components, to be kept * \param[in] thresh on correlation; 0.80 is reasonable * \param[in] weight for handling thick text; 0.6 is reasonable * \param[out] pnatl numa with textline index for each component * \param[in] firstpage 0-based * \param[in] npages use 0 for all pages in dirin * \return classer for the set of pages * * <pre> * Notes: * (1) This is a high-level function. See prog/jbwords for example * of usage. * (2) Typically, words can be found reasonably well at a resolution * of about 150 ppi. For highest accuracy, you should use 300 ppi. * Assuming that the input images are 300 ppi, use reduction = 1 * for finding words at full res, and reduction = 2 for finding * them at 150 ppi. * </pre> */ JBCLASSER * jbWordsInTextlines(const char *dirin, l_int32 reduction, l_int32 maxwidth, l_int32 maxheight, l_float32 thresh, l_float32 weight, NUMA **pnatl, l_int32 firstpage, l_int32 npages) { char *fname; l_int32 nfiles, i, w, h; BOXA *boxa; JBCLASSER *classer; NUMA *nai, *natl; PIX *pix; PIXA *pixa; SARRAY *safiles; PROCNAME("jbWordsInTextlines"); if (!pnatl) return (JBCLASSER *)ERROR_PTR("&natl not defined", procName, NULL); *pnatl = NULL; if (!dirin) return (JBCLASSER *)ERROR_PTR("dirin not defined", procName, NULL); if (reduction != 1 && reduction != 2) return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", procName, NULL); safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); nfiles = sarrayGetCount(safiles); /* Classify components */ classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight); classer->safiles = sarrayCopy(safiles); natl = numaCreate(0); *pnatl = natl; for (i = 0; i < nfiles; i++) { fname = sarrayGetString(safiles, i, L_NOCOPY); if ((pix = pixRead(fname)) == NULL) { L_WARNING("image 
file %d not read\n", procName, i); continue; } pixGetDimensions(pix, &w, &h, NULL); if (reduction == 1) { classer->w = w; classer->h = h; } else { /* reduction == 2 */ classer->w = w / 2; classer->h = h / 2; } pixGetWordsInTextlines(pix, reduction, JB_WORDS_MIN_WIDTH, JB_WORDS_MIN_HEIGHT, maxwidth, maxheight, &boxa, &pixa, &nai); jbAddPageComponents(classer, pix, boxa, pixa); numaJoin(natl, nai, 0, -1); pixDestroy(&pix); numaDestroy(&nai); boxaDestroy(&boxa); pixaDestroy(&pixa); } sarrayDestroy(&safiles); return classer; }