/*!
 *  sarraySelectBySubstring()
 *
 *      Input:  sain (input sarray)
 *              substr (<optional> substring for matching; can be NULL)
 *      Return: saout (output sarray, filtered with substring) or null on error
 *
 *  Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each filename.
 *      (2) If substr == NULL, returns a copy of the sarray.
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
char    *str;
l_int32  n, i, offset, found;
SARRAY  *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    n = sarrayGetCount(sain);
    if (!substr || n == 0)
        return sarrayCopy(sain);

    saout = sarrayCreate(n);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sain, i, L_NOCOPY);
        arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
                          strlen(substr), &offset, &found);
        if (found)
            sarrayAddString(saout, str, L_COPY);
    }

    return saout;
}
/*!
 *  sarraySort()
 *
 *      Input:  saout (output sarray; can be NULL or equal to sain)
 *              sain (input sarray)
 *              sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
 *      Return: saout (output sarray, sorted by ascii value), or null on error
 *
 *  Notes:
 *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.
 *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
 *          Slow but simple O(n logn) sort.
 */
SARRAY *
sarraySort(SARRAY  *saout,
           SARRAY  *sain,
           l_int32  sortorder)
{
char   **array;
char    *tmp;
l_int32  n, i, j, gap;

    PROCNAME("sarraySort");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

        /* Make saout if necessary; otherwise do in-place */
    if (!saout)
        saout = sarrayCopy(sain);
    else if (sain != saout)
        return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
    array = saout->array;  /* operate directly on the array */
    n = sarrayGetCount(saout);

        /* Shell sort */
    for (gap = n/2; gap > 0; gap = gap / 2) {
        for (i = gap; i < n; i++) {
            for (j = i - gap; j >= 0; j -= gap) {
                if ((sortorder == L_SORT_INCREASING &&
                     stringCompareLexical(array[j], array[j + gap])) ||
                    (sortorder == L_SORT_DECREASING &&
                     stringCompareLexical(array[j + gap], array[j])))
                {
                    tmp = array[j];
                    array[j] = array[j + gap];
                    array[j + gap] = tmp;
                }
            }
        }
    }

    return saout;
}
/*!
 * \brief   sarrayUnionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the union of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Duplicates are removed from the concatenation of the two arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (2) Algorithm: Concatenate the two sarrays.  Then build a set,
 *          using hashed strings as keys.  As the set is built, first do
 *          a find; if not found, add the key to the set and add the string
 *          to the output sarray.  This is O(nlogn).
 * </pre>
 */
SARRAY *
sarrayUnionByAset(SARRAY  *sa1,
                  SARRAY  *sa2)
{
SARRAY  *sa3, *sad;

    PROCNAME("sarrayUnionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Join */
    sa3 = sarrayCopy(sa1);
    sarrayJoin(sa3, sa2);

        /* Eliminate duplicates */
    sad = sarrayRemoveDupsByAset(sa3);
    sarrayDestroy(&sa3);
    return sad;
}
/*!
 * \brief   jbWordsInTextlines()
 *
 * \param[in]    dirin directory of input pages
 * \param[in]    reduction 1 for full res; 2 for half-res
 * \param[in]    maxwidth of word mask components, to be kept
 * \param[in]    maxheight of word mask components, to be kept
 * \param[in]    thresh on correlation; 0.80 is reasonable
 * \param[in]    weight for handling thick text; 0.6 is reasonable
 * \param[out]   pnatl numa with textline index for each component
 * \param[in]    firstpage 0-based
 * \param[in]    npages use 0 for all pages in dirin
 * \return  classer for the set of pages
 *
 * <pre>
 * Notes:
 *      (1) This is a high-level function.  See prog/jbwords for example
 *          of usage.
 *      (2) Typically, words can be found reasonably well at a resolution
 *          of about 150 ppi.  For highest accuracy, you should use 300 ppi.
 *          Assuming that the input images are 300 ppi, use reduction = 1
 *          for finding words at full res, and reduction = 2 for finding
 *          them at 150 ppi.
 * </pre>
 */
JBCLASSER *
jbWordsInTextlines(const char  *dirin,
                   l_int32      reduction,
                   l_int32      maxwidth,
                   l_int32      maxheight,
                   l_float32    thresh,
                   l_float32    weight,
                   NUMA       **pnatl,
                   l_int32      firstpage,
                   l_int32      npages)
{
char       *fname;
l_int32     nfiles, i, w, h;
BOXA       *boxa;
JBCLASSER  *classer;
NUMA       *nai, *natl;
PIX        *pix;
PIXA       *pixa;
SARRAY     *safiles;

    PROCNAME("jbWordsInTextlines");

    if (!pnatl)
        return (JBCLASSER *)ERROR_PTR("&natl not defined", procName, NULL);
    *pnatl = NULL;
    if (!dirin)
        return (JBCLASSER *)ERROR_PTR("dirin not defined", procName, NULL);
    if (reduction != 1 && reduction != 2)
        return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", procName, NULL);

    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
    nfiles = sarrayGetCount(safiles);

        /* Classify components */
    classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight);
    classer->safiles = sarrayCopy(safiles);
    natl = numaCreate(0);
    *pnatl = natl;
    for (i = 0; i < nfiles; i++) {
        fname = sarrayGetString(safiles, i, L_NOCOPY);
        if ((pix = pixRead(fname)) == NULL) {
            L_WARNING("image file %d not read\n", procName, i);
            continue;
        }
        pixGetDimensions(pix, &w, &h, NULL);
        if (reduction == 1) {
            classer->w = w;
            classer->h = h;
        } else {  /* reduction == 2 */
            classer->w = w / 2;
            classer->h = h / 2;
        }
        pixGetWordsInTextlines(pix, reduction, JB_WORDS_MIN_WIDTH,
                               JB_WORDS_MIN_HEIGHT, maxwidth, maxheight,
                               &boxa, &pixa, &nai);
        jbAddPageComponents(classer, pix, boxa, pixa);
        numaJoin(natl, nai, 0, -1);
        pixDestroy(&pix);
        numaDestroy(&nai);
        boxaDestroy(&boxa);
        pixaDestroy(&pixa);
    }

    sarrayDestroy(&safiles);
    return classer;
}