示例#1
0
/*
 *  captureProtoSignature()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search; never a comment line)
 *              stop (index of line on which pattern is completed)
 *              charindex (char index of completing ')' character)
 *      Return: cleanstr (prototype string), or NULL on error
 *
 *  Notes:
 *      (1) Return all characters, ending with a ';' after the ')'
 */
static char *
captureProtoSignature(SARRAY  *sa,
                      l_int32  start,
                      l_int32  stop,
                      l_int32  charindex)
{
char    *str, *newstr, *protostr, *cleanstr;
SARRAY  *sap;
l_int32  i;

    PROCNAME("captureProtoSignature");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

    sap = sarrayCreate(0);
    for (i = start; i < stop; i++) {
        str = sarrayGetString(sa, i, L_COPY);
        sarrayAddString(sap, str, L_INSERT);
    }
    str = sarrayGetString(sa, stop, L_COPY);
    str[charindex + 1] = '\0';
    newstr = stringJoin(str, ";");
    sarrayAddString(sap, newstr, L_INSERT);
    LEPT_FREE(str);
    protostr = sarrayToString(sap, 2);
    sarrayDestroy(&sap);
    cleanstr = cleanProtoSignature(protostr);
    LEPT_FREE(protostr);

    return cleanstr;
}
/*!
 * \brief   l_asetCreateFromSarray()
 *
 * \param[in]    sa
 * \return  set using a string hash into a uint32 as the key
 */
L_ASET *
l_asetCreateFromSarray(SARRAY  *sa)
{
char     *str;
l_int32   i, n;
l_uint64  hash;
L_ASET   *set;
RB_TYPE   key;

    PROCNAME("l_asetCreateFromSarray");

    if (!sa)
        return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL);

    set = l_asetCreate(L_UINT_TYPE);
    n = sarrayGetCount(sa);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64(str, &hash);
        key.utype = hash;
        l_asetInsert(set, key);
    }

    return set;
}
示例#3
0
/*!
 * \brief   recogAddCharstrLabels()
 *
 * \param[in]    recog
 * \return  0 if OK, 1 on error
 */
static l_int32
recogAddCharstrLabels(L_RECOG  *recog)
{
char    *text;
l_int32  i, j, n1, n2;
PIX     *pix;
PIXA    *pixa;
PIXAA   *paa;

    PROCNAME("recogAddCharstrLabels");

    if (!recog)
        return ERROR_INT("recog not defined", procName, 1);

        /* Add the labels to each unscaled pix */
    paa = recog->pixaa_u;
    n1 = pixaaGetCount(paa, NULL);
    for (i = 0; i < n1; i++) {
        pixa = pixaaGetPixa(paa, i, L_CLONE);
        text = sarrayGetString(recog->sa_text, i, L_NOCOPY);
        n2 = pixaGetCount(pixa);
        for (j = 0; j < n2; j++) {
             pix = pixaGetPix(pixa, j, L_CLONE);
             pixSetText(pix, text);
             pixDestroy(&pix);
        }
        pixaDestroy(&pixa);
    }

    return 0;
}
/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas
 * \return  sad with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHash() for large string arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (3) Build a set, using hashed strings as keys.  As the set is
 *          built, first do a find; if not found, add the key to the
 *          set and add the string to the output sarray.
 * </pre>
 */
SARRAY *
sarrayRemoveDupsByAset(SARRAY  *sas)
{
char     *str;
l_int32   i, n;
l_uint64  hash;
L_ASET   *set;
RB_TYPE   key;
SARRAY   *sad;

    PROCNAME("sarrayRemoveDupsByAset");

    if (!sas)
        return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);

    set = l_asetCreate(L_UINT_TYPE);
    sad = sarrayCreate(0);
    n = sarrayGetCount(sas);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sas, i, L_NOCOPY);
        l_hashStringToUint64(str, &hash);
        key.utype = hash;
        if (!l_asetFind(set, key)) {
            sarrayAddString(sad, str, L_COPY);
            l_asetInsert(set, key);
        }
    }

    l_asetDestroy(&set);
    return sad;
}
/*!
 * \brief   sarraySortByIndex()
 *
 * \param[in]    sain
 * \param[in]    naindex na that maps from the new sarray to the input sarray
 * \return  saout sorted, or NULL on error
 */
SARRAY *
sarraySortByIndex(SARRAY  *sain,
                  NUMA    *naindex)
{
char    *str;
l_int32  i, n, index;
SARRAY  *saout;

    PROCNAME("sarraySortByIndex");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
    if (!naindex)
        return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL);

    n = sarrayGetCount(sain);
    saout = sarrayCreate(n);
    for (i = 0; i < n; i++) {
        numaGetIValue(naindex, i, &index);
        str = sarrayGetString(sain, index, L_COPY);
        sarrayAddString(saout, str, L_INSERT);
    }

    return saout;
}
示例#6
0
/*!
 *  recogAddAllSamples()
 *
 *      Input:  recog
 *              paa (pixaa from previously trained recog)
 *              debug
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This is used with the serialization routine recogRead(),
 *          where each pixa in the pixaa represents a set of characters
 *          in a different class.  Two different pixa may represent
 *          characters with the same label.  Before calling this
 *          function, we verify that the number of character classes,
 *          given by the setsize field in recog, equals the number of
 *          pixa in the paa.  The character labels for each set are
 *          in the sa_text field.
 */
static l_int32
recogAddAllSamples(L_RECOG  *recog,
                   PIXAA    *paa,
                   l_int32   debug)
{
char    *text;
l_int32  i, j, nc, ns;
PIX     *pix;
PIXA    *pixa;

    PROCNAME("recogAddAllSamples");

    if (!recog)
        return ERROR_INT("recog not defined", procName, 1);
    if (!paa)
        return ERROR_INT("paa not defined", procName, 1);

    nc = pixaaGetCount(paa, NULL);
    for (i = 0; i < nc; i++) {
        pixa = pixaaGetPixa(paa, i, L_CLONE);
        ns = pixaGetCount(pixa);
        text = sarrayGetString(recog->sa_text, i, L_NOCOPY);
        for (j = 0; j < ns; j++) {
            pix = pixaGetPix(pixa, j, L_CLONE);
            if (debug) {
                fprintf(stderr, "pix[%d,%d]: text = %s\n", i, j, text);
            }
            pixaaAddPix(recog->pixaa_u, i, pix, NULL, L_INSERT);
        }
        pixaDestroy(&pixa);
    }

    recogTrainingFinished(recog, debug);
    return 0;
}
/*!
 *  sarrayAppendRange()
 *
 *      Input:  sa1  (to be added to)
 *              sa2  (append specified range of strings in sa2 to sa1)
 *              start (index of first string of sa2 to append)
 *              end (index of last string of sa2 to append)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Copies of the strings in sarray2 are added to sarray1.
 *      (2) The [start ... end] range is truncated if necessary.
 */
l_int32
sarrayAppendRange(SARRAY  *sa1,
                  SARRAY  *sa2,
		  l_int32  start,
		  l_int32  end)
{
char    *str;
l_int32  n, i;

    PROCNAME("sarrayAppendRange");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);
    if (start < 0)
        start = 0;
    n = sarrayGetCount(sa2);
    if (end >= n)
        end = n - 1;
    if (start > end)
        return ERROR_INT("start > end", procName, 1);

    for (i = start; i <= end; i++) {
        str = sarrayGetString(sa2, i, L_NOCOPY);
        sarrayAddString(sa1, str, L_COPY);
    }

    return 0;
}
示例#8
0
/*
 *  getNextNonDoubleSlashLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first uncommented line after
 *                     the start line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive '//' lines, beginning at 'start'
 *      (2) If all lines to the end start with '//', return next = -1
 */
static l_int32
getNextNonDoubleSlashLine(SARRAY  *sa,
                          l_int32  start,
                          l_int32 *pnext)
{
char    *str;
l_int32  i, n, len;

    PROCNAME("getNextNonDoubleSlashLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Init for situation where this line and all following
         * start with '//' */
    *pnext = -1;

    n = sarrayGetCount(sa);
    for (i = start; i < n; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        len = strlen(str);
        if (len < 2 || str[0] != '/' || str[1] != '/') {
            *pnext = i;
            return 0;
        }
    }

    return 0;
}
示例#9
0
/*
 *  getNextNonBlankLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first nonblank line after
 *                     the start line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive blank lines, beginning at 'start'
 *      (2) A blank line has only whitespace characters (' ', '\t', '\n', '\r')
 *      (3) If all lines to the end are blank, return next = -1
 */
static l_int32
getNextNonBlankLine(SARRAY  *sa,
                    l_int32  start,
                    l_int32 *pnext)
{
char    *str;
l_int32  i, j, n, len;

    PROCNAME("getNextNonBlankLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Init for situation where this line and all following are blank */
    *pnext = -1;

    n = sarrayGetCount(sa);
    for (i = start; i < n; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        len = strlen(str);
        for (j = 0; j < len; j++) {
            if (str[j] != ' ' && str[j] != '\t'
                && str[j] != '\n' && str[j] != '\r') {  /* non-blank */
                *pnext = i;
                return 0;
            }
        }
    }

    return 0;
}
示例#10
0
/*!
 *  pixReadIndexed()
 *
 *      Input:  sarray (of full pathnames)
 *              index (into pathname array)
 *      Return: pix if OK; null if not found
 *
 *  Notes:
 *      (1) This function is useful for selecting image files from a
 *          directory, where the integer @index is embedded into
 *          the file name.
 *      (2) This is typically done by generating the sarray using
 *          getNumberedPathnamesInDirectory(), so that the @index
 *          pathname would have the number @index in it.  The size
 *          of the sarray should be the largest number (plus 1) appearing
 *          in the file names, respecting the constraints in the
 *          call to getNumberedPathnamesInDirectory().
 *      (3) Consequently, for some indices into the sarray, there may
 *          be no pathnames in the directory containing that number.
 *          By convention, we place empty C strings ("") in those
 *          locations in the sarray, and it is not an error if such
 *          a string is encountered and no pix is returned.
 *          Therefore, the caller must verify that a pix is returned.
 *      (4) See convertSegmentedPagesToPS() in src/psio1.c for an
 *          example of usage.
 */
PIX *
pixReadIndexed(SARRAY  *sa,
               l_int32  index)
{
char    *fname;
l_int32  n;
PIX     *pix;

    PROCNAME("pixReadIndexed");

    if (!sa)
        return (PIX *)ERROR_PTR("sa not defined", procName, NULL);
    n = sarrayGetCount(sa);
    if (index < 0 || index >= n)
        return (PIX *)ERROR_PTR("index out of bounds", procName, NULL);

    fname = sarrayGetString(sa, index, L_NOCOPY);
    if (fname[0] == '\0')
        return NULL;

    if ((pix = pixRead(fname)) == NULL) {
        L_ERROR("pix not read from file %s\n", procName, fname);
        return NULL;
    }

    return pix;
}
示例#11
0
/*
 *  getNextNonCommentLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first uncommented line after
 *                     the start line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive comment lines, beginning at 'start'
 *      (2) If all lines to the end are '#' comments, return next = -1
 */
static l_int32
getNextNonCommentLine(SARRAY  *sa,
                      l_int32  start,
                      l_int32 *pnext)
{
char    *str;
l_int32  i, n;

    PROCNAME("getNextNonCommentLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Init for situation where this line and all following are comments */
    *pnext = -1;

    n = sarrayGetCount(sa);
    for (i = start; i < n; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        if (str[0] != '#') {
            *pnext = i;
            return 0;
        }
    }

    return 0;
}
/*!
 *  sarraySelectBySubstring()
 *
 *      Input:  sain (input sarray)
 *              substr (<optional> substring for matching; can be NULL)
 *      Return: saout (output sarray, filtered with substring) or null on error
 *
 *  Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each filename.
 *      (2) If substr == NULL, returns a copy of the sarray.
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
char    *str;
l_int32  n, i, offset, found;
SARRAY  *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    n = sarrayGetCount(sain);
    if (!substr || n == 0)
        return sarrayCopy(sain);

    saout = sarrayCreate(n);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sain, i, L_NOCOPY);
        arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
                          strlen(substr), &offset, &found);
        if (found)
            sarrayAddString(saout, str, L_COPY);
    }

    return saout;
}
示例#13
0
/*!
 *  pixaReadFilesSA()
 *
 *      Input:  sarray (full pathnames for all files)
 *      Return: pixa, or null on error
 */
PIXA *
pixaReadFilesSA(SARRAY  *sa)
{
char    *str;
l_int32  i, n;
PIX     *pix;
PIXA    *pixa;

    PROCNAME("pixaReadFilesSA");

    if (!sa)
        return (PIXA *)ERROR_PTR("sa not defined", procName, NULL);

    n = sarrayGetCount(sa);
    pixa = pixaCreate(n);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        if ((pix = pixRead(str)) == NULL) {
            L_WARNING("pix not read from file %s\n", procName, str);
            continue;
        }
        pixaAddPix(pixa, pix, L_INSERT);
    }

    return pixa;
}
示例#14
0
/*
 *  sarrayConvertFilesToPS()
 *
 *      Input:  sarray (of full path names)
 *              res (typ. 300 or 600 ppi)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesToPS()
 */
l_int32
sarrayConvertFilesToPS(SARRAY      *sa,
                       l_int32      res,
                       const char  *fileout)
{
char    *fname;
l_int32  i, nfiles, index, firstfile, ret, format;

    PROCNAME("sarrayConvertFilesToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);
    if (res <= 0) {
        L_INFO("setting res to 300 ppi", procName);
        res = 300;
    }
    if (res < 10 || res > 4000)
        L_WARNING("res is typically in the range 300-600 ppi", procName);

    nfiles = sarrayGetCount(sa);
    firstfile = TRUE;
    for (i = 0, index = 0; i < nfiles; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        ret = pixReadHeader(fname, &format, NULL, NULL, NULL, NULL, NULL);
        if (ret) continue;
        if (format == IFF_UNKNOWN)
            continue;

        writeImageCompressedToPSFile(fname, fileout, res, &firstfile, &index);
    }

    return 0;
}
/*!
 * \brief   l_dnaHashCreateFromSarray()
 *
 * \param[in]    sa
 * \return  dahash, or NULL on error
 */
L_DNAHASH *
l_dnaHashCreateFromSarray(SARRAY  *sa)
{
char       *str;
l_int32     i, n;
l_uint32    nsize;
l_uint64    key;
L_DNAHASH  *dahash;

        /* Build up dnaHash of indices, hashed by a 64-bit key that
         * should randomize the lower bits used in bucket selection.
         * Having about 20 pts in each bucket is roughly optimal. */
    n = sarrayGetCount(sa);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
/*    fprintf(stderr, "Prime used: %d\n", nsize); */

        /* Add each string, using the hash as key and the index into %sa
         * as the value.  Storing the index enables operations that check
         * for duplicates.  */
    dahash = l_dnaHashCreate(nsize, 8);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64(str, &key);
        l_dnaHashAdd(dahash, key, (l_float64)i);
    }

    return dahash;
}
示例#16
0
/*
 *  skipToMatchingBrace()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (index of starting line with left bracket to search)
 *              lbindex (starting char index for left bracket)
 *              &stop (index of line with the matching right bracket)
 *              &rbindex (char index of matching right bracket)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) If the matching right brace is not found, returns
 *          stop = -1.  This shouldn't happen.
 */
static l_int32
skipToMatchingBrace(SARRAY   *sa,
                    l_int32   start,
                    l_int32   lbindex,
                    l_int32  *pstop,
                    l_int32  *prbindex)
{
char    *str;
l_int32  i, j, jstart, n, sumbrace, found, instring, nchars;

    PROCNAME("skipToMatchingBrace");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pstop)
        return ERROR_INT("&stop not defined", procName, 1);
    if (!prbindex)
        return ERROR_INT("&rbindex not defined", procName, 1);

    instring = 0;  /* init to FALSE; toggle on double quotes */
    *pstop = -1;
    n = sarrayGetCount(sa);
    sumbrace = 1;
    found = FALSE;
    for (i = start; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        jstart = 0;
        if (i == start)
            jstart = lbindex + 1;
        nchars = strlen(str);
        for (j = jstart; j < nchars; j++) {
                /* Toggle the instring state every time you encounter
                 * a double quote that is NOT escaped. */
            if (j == jstart && str[j] == '\"')
                instring = 1 - instring;
            if (j > jstart && str[j] == '\"' && str[j-1] != '\\')
                instring = 1 - instring;
                /* Record the braces if they are neither a literal character
                 * nor within a string. */
            if (str[j] == '{' && str[j+1] != '\'' && !instring) {
                sumbrace++;
            } else if (str[j] == '}' && str[j+1] != '\'' && !instring) {
                sumbrace--;
                if (sumbrace == 0) {
                    found = TRUE;
                    *prbindex = j;
                    break;
                }
            }
        }
        if (found) {
            *pstop = i;
            return 0;
        }
    }

    return ERROR_INT("matching right brace not found", procName, 1);
}
示例#17
0
/*
 *  parseForProtos()
 *
 *      Input:  filein (output of cpp)
 *              prestring (<optional> string that prefaces each decl;
 *                        use NULL to omit)
 *      Return: parsestr (string of function prototypes), or NULL on error
 *
 *  Notes:
 *      (1) We parse the output of cpp:
 *              cpp -ansi <filein> 
 *          Three plans were attempted, with success on the third. 
 *      (2) Plan 1.  A cursory examination of the cpp output indicated that
 *          every function was preceeded by a cpp comment statement.
 *          So we just need to look at statements beginning after comments.
 *          Unfortunately, this is NOT the case.  Some functions start
 *          without cpp comment lines, typically when there are no
 *          comments in the source that immediately precede the function.
 *      (3) Plan 2.  Consider the keywords in the language that start
 *          parts of the cpp file.  Some, like 'typedef', 'enum',
 *          'union' and 'struct', are followed after a while by '{',
 *          and eventually end with '}, plus an optional token and a
 *          final ';'  Others, like 'extern' and 'static', are never
 *          the beginnings of global function definitions.   Function
 *          prototypes have one or more sets of '(' followed eventually
 *          by a ')', and end with ';'.  But function definitions have
 *          tokens, followed by '(', more tokens, ')' and then
 *          immediately a '{'.  We would generate a prototype from this
 *          by adding a ';' to all tokens up to the ')'.  So we use
 *          these special tokens to decide what we are parsing.  And
 *          whenever a function definition is found and the prototype
 *          extracted, we skip through the rest of the function
 *          past the corresponding '}'.  This token ends a line, and
 *          is often on a line of its own.  But as it turns out,
 *          the only keyword we need to consider is 'static'.
 *      (4) Plan 3.  Consider the parentheses and braces for various
 *          declarations.  A struct, enum, or union has a pair of
 *          braces followed by a semicolon.  They cannot have parentheses
 *          before the left brace, but a struct can have lots of parentheses
 *          within the brace set.  A function prototype has no braces.
 *          A function declaration can have sets of left and right
 *          parentheses, but these are followed by a left brace.
 *          So plan 3 looks at the way parentheses and braces are
 *          organized.  Once the beginning of a function definition
 *          is found, the prototype is extracted and we search for
 *          the ending right brace.
 *      (5) To find the ending right brace, it is necessary to do some
 *          careful parsing.  For example, in this file, we have
 *          left and right braces as characters, and these must not
 *          be counted.  Somewhat more tricky, the file fhmtauto.c
 *          generates code, and includes a right brace in a string.
 *          So we must not include braces that are in strings.  But how
 *          do we know if something is inside a string?  Keep state,
 *          starting with not-inside, and every time you hit a double quote
 *          that is not escaped, toggle the condition.  Any brace
 *          found in the state of being within a string is ignored.
 *      (6) When a prototype is extracted, it is put in a canonical
 *          form (i.e., cleaned up).  Finally, we check that it is
 *          not static and save it.  (If static, it is ignored).
 *      (7) The @prestring for unix is NULL; it is included here so that
 *          you can use Microsoft's declaration for importing or
 *          exporting to a dll.  See environ.h for examples of use.
 *          Here, we set: @prestring = "LEPT_DLL ".  Note in particular
 *          the space character that will separate 'LEPT_DLL' from
 *          the standard unix prototype that follows.
 */
char *
parseForProtos(const char *filein,
               const char *prestring)
{
char    *strdata, *str, *newstr, *parsestr, *secondword;
l_int32  nbytes, start, next, stop, charindex, found;
SARRAY  *sa, *saout, *satest;

    PROCNAME("parseForProtos");

    if (!filein)
        return (char *)ERROR_PTR("filein not defined", procName, NULL);

        /* Read in the cpp output into memory, one string for each
         * line in the file, omitting blank lines.  */
    strdata = (char *)arrayRead(filein, &nbytes);
    sa = sarrayCreateLinesFromString(strdata, 0);

    saout = sarrayCreate(0);
    next = 0;
    while (1) {  /* repeat after each non-static prototype is extracted */
        searchForProtoSignature(sa, next, &start, &stop, &charindex, &found);
        if (!found)
            break;
/*        fprintf(stderr, "  start = %d, stop = %d, charindex = %d\n",
                start, stop, charindex); */
        str = captureProtoSignature(sa, start, stop, charindex);

            /* Make sure it is not static.  Note that 'extern' has
             * been prepended to the prototype, so the 'static'
             * keyword, if it exists, would be the second word. */
        satest = sarrayCreateWordsFromString(str);
        secondword = sarrayGetString(satest, 1, 0);
        if (strcmp(secondword, "static")) {  /* not static */
            if (prestring) {  /* prepend it to the prototype */
                newstr = stringJoin(prestring, str);
                sarrayAddString(saout, newstr, L_INSERT);
                FREE(str);
            }
            else
                sarrayAddString(saout, str, L_INSERT);
        }
        else
            FREE(str);
        sarrayDestroy(&satest);

        skipToEndOfFunction(sa, stop, charindex, &next);
        if (next == -1) break;
    }

        /* Flatten into a string with newlines between prototypes */
    parsestr = sarrayToString(saout, 1);
    FREE(strdata);
    sarrayDestroy(&sa);
    sarrayDestroy(&saout);

    return parsestr;
}
示例#18
0
int main(int    argc,
         char **argv)
{
char        *dirin, *dirout, *infile, *outfile, *tail;
l_int32      i, nfiles, border, x, y, w, h, xb, yb, wb, hb;
BOX         *box1, *box2;
BOXA        *boxa1, *boxa2;
PIX         *pixs, *pixt1, *pixd;
SARRAY      *safiles;
static char  mainName[] = "croptext";

    if (argc != 4)
        return ERROR_INT("Syntax: croptext dirin border dirout", mainName, 1);
    dirin = argv[1];
    border = atoi(argv[2]);
    dirout = argv[3];

    setLeptDebugOK(1);
    safiles = getSortedPathnamesInDirectory(dirin, NULL, 0, 0);
    nfiles = sarrayGetCount(safiles);

    for (i = 0; i < nfiles; i++) {
        infile = sarrayGetString(safiles, i, L_NOCOPY);
        splitPathAtDirectory(infile, NULL, &tail);
        outfile = genPathname(dirout, tail);
        pixs = pixRead(infile);
        pixt1 = pixMorphSequence(pixs, "r11 + c10.40 + o5.5 + x4", 0);
        boxa1 = pixConnComp(pixt1, NULL, 8);
        if (boxaGetCount(boxa1) == 0) {
            fprintf(stderr, "Warning: no components on page %s\n", tail);
            continue;
        }
        boxa2 = boxaSort(boxa1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL);
        box1 = boxaGetBox(boxa2, 0, L_CLONE);
        boxGetGeometry(box1, &x, &y, &w, &h);
        xb = L_MAX(0, x - border);
        yb = L_MAX(0, y - border);
        wb = w + 2 * border;
        hb = h + 2 * border;
        box2 = boxCreate(xb, yb, wb, hb);
        pixd = pixClipRectangle(pixs, box2, NULL);
        pixWrite(outfile, pixd, IFF_TIFF_G4);

        pixDestroy(&pixs);
        pixDestroy(&pixt1);
        pixDestroy(&pixd);
        boxaDestroy(&boxa1);
        boxaDestroy(&boxa2);
    }

    return 0;
}
示例#19
0
/*
 *  getOffsetForCharacter()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index in sa to search; never a comment line)
 *              tchar (we are searching for the first instance of this)
 *              &soffset (<return> offset in strings from start index)
 *              &boffset (<return> offset in bytes within string in which
 *                        the character is first found)
 *              &toffset (<return> offset in total bytes from beginning of
 *                        string indexed by 'start' to the location where
 *                        the character is first found)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) We are searching for the first instance of 'tchar', starting
 *          at the beginning of the string indexed by start.
 *      (2) If the character is not found, soffset is returned as -1,
 *          and the other offsets are set to very large numbers.  The
 *          caller must check the value of soffset.
 *      (3) This is only used in contexts where it is not necessary to
 *          consider if the character is inside a string.
 */
static l_int32
getOffsetForCharacter(SARRAY   *sa,
                      l_int32   start,
                      char      tchar,
                      l_int32  *psoffset,
                      l_int32  *pboffset,
                      l_int32  *ptoffset)
{
char    *str;
l_int32  i, j, n, nchars, totchars, found;

    PROCNAME("getOffsetForCharacter");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!psoffset)
        return ERROR_INT("&soffset not defined", procName, 1);
    if (!pboffset)
        return ERROR_INT("&boffset not defined", procName, 1);
    if (!ptoffset)
        return ERROR_INT("&toffset not defined", procName, 1);

    *psoffset = -1;  /* init to not found */
    *pboffset = 100000000;
    *ptoffset = 100000000;

    n = sarrayGetCount(sa);
    found = FALSE;
    totchars = 0;
    for (i = start; i < n; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        nchars = strlen(str);
        for (j = 0; j < nchars; j++) {
            if (str[j] == tchar) {
                found = TRUE;
                break;
            }
        }
        if (found)
            break;
        totchars += nchars;
    }

    if (found) {
        *psoffset = i - start;
        *pboffset = j;
        *ptoffset = totchars + j;
    }

    return 0;
}
示例#20
0
/*
 *  cleanProtoSignature()
 *
 *      Input:  instr (input prototype string)
 *      Return: cleanstr (clean prototype string), or NULL on error
 *
 *  Notes:
 *      (1) Adds 'extern' at beginning and regularizes spaces
 *          between tokens.
 */
static char *
cleanProtoSignature(char *instr)
{
char    *str, *cleanstr;
char     buf[L_BUF_SIZE];
char     externstring[] = "extern";
l_int32  i, j, nwords, nchars, index, len;
SARRAY  *sa, *saout;

    PROCNAME("cleanProtoSignature");

    if (!instr)
        return (char *)ERROR_PTR("instr not defined", procName, NULL);

    sa = sarrayCreateWordsFromString(instr);
    nwords = sarrayGetCount(sa);
    saout = sarrayCreate(0);
    sarrayAddString(saout, externstring, 1);
    for (i = 0; i < nwords; i++) {
        str = sarrayGetString(sa, i, 0);
        nchars = strlen(str);
        index = 0;
        for (j = 0; j < nchars; j++) {
            if (index > L_BUF_SIZE - 6)
                return (char *)ERROR_PTR("token too large", procName, NULL);
            if (str[j] == '(') {
                buf[index++] = ' ';
                buf[index++] = '(';
                buf[index++] = ' ';
            }
            else if (str[j] == ')') {
                buf[index++] = ' ';
                buf[index++] = ')';
            }
            else 
                buf[index++] = str[j];
        }
        buf[index] = '\0';
        sarrayAddString(saout, buf, 1);
    }

        /* Flatten to a prototype string with spaces added after
         * each word, and remove the last space */
    cleanstr = sarrayToString(saout, 2);
    len = strlen(cleanstr);
    cleanstr[len - 1] = '\0';

    sarrayDestroy(&sa);
    sarrayDestroy(&saout);
    return cleanstr;
}
示例#21
0
/*!
 *  gplotGenDataFiles()
 *
 *      Input:  gplot
 *      Return: 0 if OK, 1 on error
 */
l_int32
gplotGenDataFiles(GPLOT *gplot) {
    char *plotdata, *dataname;
    l_int32 i, nplots;
    FILE *fp;

    PROCNAME("gplotGenDataFiles");

    if (!gplot)
        return ERROR_INT("gplot not defined", procName, 1);

    nplots = sarrayGetCount(gplot->datanames);
    for (i = 0; i < nplots; i++) {
        plotdata = sarrayGetString(gplot->plotdata, i, L_NOCOPY);
        dataname = sarrayGetString(gplot->datanames, i, L_NOCOPY);
        if ((fp = fopenWriteStream(dataname, "w")) == NULL)
            return ERROR_INT("datafile stream not opened", procName, 1);
        fwrite(plotdata, 1, strlen(plotdata), fp);
        fclose(fp);
    }

    return 0;
}
/*!
 * \brief   sarrayIntersectionByHash()
 *
 * \param[in]    sa1, sa2
 * \return  sad intersection of the strings, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is faster than sarrayIntersectionByAset(), because the
 *          bucket lookup is O(n).
 * </pre>
 */
SARRAY *
sarrayIntersectionByHash(SARRAY  *sa1,
                         SARRAY  *sa2)
{
char       *str;
l_int32     n1, n2, nsmall, i, index1, index2;
l_uint32    nsize2;
l_uint64    key;
L_DNAHASH  *dahash1, *dahash2;
SARRAY     *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByHash");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Put the elements of the biggest sarray into a dnahash */
    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */
    sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */
    dahash1 = l_dnaHashCreateFromSarray(sa_big);

        /* Build up the intersection of strings.  Add to %sad
         * if the string is in sa_big (using dahash1) but hasn't
         * yet been seen in the traversal of sa_small (using dahash2). */
    sad = sarrayCreate(0);
    nsmall = sarrayGetCount(sa_small);
    findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
    dahash2 = l_dnaHashCreate(nsize2, 0);
    for (i = 0; i < nsmall; i++) {
        str = sarrayGetString(sa_small, i, L_NOCOPY);
        sarrayFindStringByHash(sa_big, dahash1, str, &index1);
        if (index1 >= 0) {
            sarrayFindStringByHash(sa_small, dahash2, str, &index2);
            if (index2 == -1) {
                sarrayAddString(sad, str, L_COPY);
                l_hashStringToUint64(str, &key);
                l_dnaHashAdd(dahash2, key, (l_float64)i);
            }
        }
    }

    l_dnaHashDestroy(&dahash1);
    l_dnaHashDestroy(&dahash2);
    return sad;
}
/*!
 * \brief   l_getIndexFromFile()
 *
 * \param[in]    filename
 * \param[out]   pindex found index
 * \return  0 if found, 1 on error.
 */
static l_int32
l_getIndexFromFile(const char  *filename,
                   l_int32     *pindex)
{
char     buf[256];
char    *word;
FILE    *fp;
l_int32  notfound, format;
SARRAY  *sa;

    PROCNAME("l_getIndexFromFile");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = 0;
    if (!filename)
        return ERROR_INT("filename not defined", procName, 1);

        /* Open the stream, read lines until you find one with more
         * than a newline, and grab the first word. */
    if ((fp = fopenReadStream(filename)) == NULL)
        return ERROR_INT("stream not opened", procName, 1);
    do {
        if ((fgets(buf, sizeof(buf), fp)) == NULL) {
            fclose(fp);
            return ERROR_INT("fgets read fail", procName, 1);
        }
    } while (buf[0] == '\n');
    fclose(fp);
    sa = sarrayCreateWordsFromString(buf);
    word = sarrayGetString(sa, 0, L_NOCOPY);

        /* Find the index associated with the word.  If it is not
         * found, test to see if the file is a compressed pix. */
    notfound = l_getIndexFromStructname(word, pindex);
    sarrayDestroy(&sa);
    if (notfound) {  /* maybe a Pix */
        if (findFileFormat(filename, &format) == 0) {
            l_getIndexFromStructname("Pix", pindex);
        } else {
            return ERROR_INT("no file type identified", procName, 1);
        }
    }

    return 0;
}
示例#24
0
/*
 *  sarrayConvertFilesFittedToPS()
 *
 *      Input:  sarray (of full path names)
 *              xpts, ypts (desired size in printer points; use 0 for default)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesFittedToPS()
 */
l_int32
sarrayConvertFilesFittedToPS(SARRAY      *sa,
                             l_float32    xpts,
                             l_float32    ypts,
                             const char  *fileout)
{
char    *fname;
l_int32  ret, i, w, h, nfiles, index, firstfile, format, res;

    PROCNAME("sarrayConvertFilesFittedToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);
    if (xpts <= 0.0) {
        L_INFO("setting xpts to 612.0", procName);
        xpts = 612.0;
    }
    if (ypts <= 0.0) {
        L_INFO("setting ypts to 792.0", procName);
        ypts = 792.0;
    }
    if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
        L_WARNING("xpts,ypts are typically in the range 500-800", procName);

    nfiles = sarrayGetCount(sa);
    firstfile = TRUE;
    for (i = 0, index = 0; i < nfiles; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        ret = pixReadHeader(fname, &format, &w, &h, NULL, NULL, NULL);
        if (ret) continue;
        if (format == IFF_UNKNOWN)
            continue;

            /* Be sure the entire image is wrapped */
        if (xpts * h < ypts * w)
            res = (l_int32)((l_float32)w * 72.0 / xpts);
        else
            res = (l_int32)((l_float32)h * 72.0 / ypts);

        writeImageCompressedToPSFile(fname, fileout, res, &firstfile, &index);
    }

    return 0;
}
/*!
 * \brief   strcodeCreateFromFile()
 *
 * \param[in]    filein containing filenames of serialized data
 * \param[in]    fileno integer that labels the two output files
 * \param[in]    outdir [optional] if null, files are made in /tmp/lept/auto
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The %filein has one filename on each line.
 *          Comment lines begin with "#".
 *      (2) The output is 2 files:
 *             autogen.\<fileno\>.c
 *             autogen.\<fileno\>.h
 * </pre>
 */
l_int32
strcodeCreateFromFile(const char  *filein,
                      l_int32      fileno,
                      const char  *outdir)
{
char        *fname;
const char  *type;
l_uint8     *data;
size_t       nbytes;
l_int32      i, n, index;
SARRAY      *sa;
L_STRCODE   *strcode;

    PROCNAME("strcodeCreateFromFile");

    if (!filein)
        return ERROR_INT("filein not defined", procName, 1);

    if ((data = l_binaryRead(filein, &nbytes)) == NULL)
        return ERROR_INT("data not read from file", procName, 1);
    sa = sarrayCreateLinesFromString((char *)data, 0);
    LEPT_FREE(data);
    if (!sa)
        return ERROR_INT("sa not made", procName, 1);
    if ((n = sarrayGetCount(sa)) == 0) {
        sarrayDestroy(&sa);
        return ERROR_INT("no filenames in the file", procName, 1);
    }

    strcode = strcodeCreate(fileno);

    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (fname[0] == '#') continue;
        if (l_getIndexFromFile(fname, &index)) {
            L_ERROR("File %s has no recognizable type\n", procName, fname);
        } else {
            type = l_assoc[index].type;
            L_INFO("File %s is type %s\n", procName, fname, type);
            strcodeGenerate(strcode, fname, type);
        }
    }
    strcodeFinalize(&strcode, outdir);
    return 0;
}
示例#26
0
/*!
 * \brief   recogGetClassString()
 *
 * \param[in]    recog
 * \param[in]    index into array of char types
 * \param[out]   pcharstr string representation;
 *                        returns an empty string on error
 * \return  0 if found, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Extracts a copy of the string from sa_text, which
 *          the caller must free.
 *      (2) Caller must check the function return value.
 * </pre>
 */
l_int32
recogGetClassString(L_RECOG  *recog,
                    l_int32   index,
                    char    **pcharstr)
{
    PROCNAME("recogGetClassString");

    if (!pcharstr)
        return ERROR_INT("&charstr not defined", procName, 1);
    *pcharstr = stringNew("");
    if (!recog)
        return ERROR_INT("recog not defined", procName, 2);

    if (index < 0 || index >= recog->setsize)
        return ERROR_INT("invalid index", procName, 1);
    LEPT_FREE(*pcharstr);
    *pcharstr = sarrayGetString(recog->sa_text, index, L_COPY);
    return 0;
}
示例#27
0
/*!
 * \brief   recogAddAllSamples()
 *
 * \param[in]    precog  addr of recog
 * \param[in]    paa     pixaa from previously trained recog
 * \param[in]    debug
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) On error, the input recog is destroyed.
 *      (2) This is used with the serialization routine recogRead(),
 *          where each pixa in the pixaa represents a set of characters
 *          in a different class.  Before calling this function, we have
 *          verified that the number of character classes, given by the
 *          setsize field in %recog, equals the number of pixa in the paa.
 *          The character labels for each set are in the sa_text field.
 * </pre>
 */
static l_int32
recogAddAllSamples(L_RECOG  **precog,
                   PIXAA     *paa,
                   l_int32    debug)
{
char     *text;
l_int32   i, j, nc, ns;
PIX      *pix;
PIXA     *pixa, *pixa1;
L_RECOG  *recog;

    PROCNAME("recogAddAllSamples");

    if (!precog)
        return ERROR_INT("&recog not defined", procName, 1);
    if ((recog = *precog) == NULL)
        return ERROR_INT("recog not defined", procName, 1);
    if (!paa) {
        recogDestroy(&recog);
        return ERROR_INT("paa not defined", procName, 1);
    }

    nc = pixaaGetCount(paa, NULL);
    for (i = 0; i < nc; i++) {
        pixa = pixaaGetPixa(paa, i, L_CLONE);
        ns = pixaGetCount(pixa);
        text = sarrayGetString(recog->sa_text, i, L_NOCOPY);
        pixa1 = pixaCreate(ns);
        pixaaAddPixa(recog->pixaa_u, pixa1, L_INSERT);
        for (j = 0; j < ns; j++) {
            pix = pixaGetPix(pixa, j, L_CLONE);
            if (debug) fprintf(stderr, "pix[%d,%d]: text = %s\n", i, j, text);
            pixaaAddPix(recog->pixaa_u, i, pix, NULL, L_INSERT);
        }
        pixaDestroy(&pixa);
    }

    recogTrainingFinished(&recog, 0, -1, -1.0);  /* For second parameter,
                                             see comment in recogRead() */
    if (!recog)
        return ERROR_INT("bad templates; recog destroyed", procName, 1);
    return 0;
}
/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the intersection of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: put the smaller sarray into a set, using the string
 *          hashes as the key values.  Then run through the larger sarray,
 *          building an output sarray and a second set from the strings
 *          in the larger array: if a string is in the first set but
 *          not in the second, add the string to the output sarray and hash
 *          it into the second set.  The second set is required to make
 *          sure only one instance of each string is put into the output sarray.
 *          This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays.
 * </pre>
 */
SARRAY *
sarrayIntersectionByAset(SARRAY  *sa1,
                         SARRAY  *sa2)
{
char     *str;
l_int32   n1, n2, i, n;
l_uint64  hash;
L_ASET   *set1, *set2;
RB_TYPE   key;
SARRAY   *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Put the elements of the biggest array into a set */
    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    sa_small = (n1 < n2) ? sa1 : sa2;   /* do not destroy sa_small */
    sa_big = (n1 < n2) ? sa2 : sa1;   /* do not destroy sa_big */
    set1 = l_asetCreateFromSarray(sa_big);

        /* Build up the intersection of strings */
    sad = sarrayCreate(0);
    n = sarrayGetCount(sa_small);
    set2 = l_asetCreate(L_UINT_TYPE);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa_small, i, L_NOCOPY);
        l_hashStringToUint64(str, &hash);
        key.utype = hash;
        if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
            sarrayAddString(sad, str, L_COPY);
            l_asetInsert(set2, key);
        }
    }

    l_asetDestroy(&set1);
    l_asetDestroy(&set2);
    return sad;
}
/*!
 * \brief   sarrayRemoveDupsByHash()
 *
 * \param[in]    sas
 * \param[out]   psad unique set of strings; duplicates removed
 * \param[out]   pdahash [optional] dnahash used for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a sarray with unique values.
 *      (2) The dnahash is built up with sad to assure uniqueness.
 *          It can be used to find if a string is in the set:
 *              sarrayFindValByHash(sad, dahash, str, \&index)
 *      (3) The hash of the string location is simple and fast.  It scales
 *          up with the number of buckets to insure a fairly random
 *          bucket selection input strings.
 *      (4) This is faster than sarrayRemoveDupsByAset(), because the
 *          bucket lookup is O(n), although there is a double-loop
 *          lookup within the dna in each bucket.
 * </pre>
 */
l_int32
sarrayRemoveDupsByHash(SARRAY      *sas,
                       SARRAY     **psad,
                       L_DNAHASH  **pdahash)
{
char       *str;
l_int32     i, n, index, items;
l_uint32    nsize;
l_uint64    key;
SARRAY     *sad;
L_DNAHASH  *dahash;

    PROCNAME("sarrayRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!psad)
        return ERROR_INT("&sad not defined", procName, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", procName, 1);

    n = sarrayGetCount(sas);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
    dahash = l_dnaHashCreate(nsize, 8);
    sad = sarrayCreate(n);
    *psad = sad;
    for (i = 0, items = 0; i < n; i++) {
        str = sarrayGetString(sas, i, L_NOCOPY);
        sarrayFindStringByHash(sad, dahash, str, &index);
        if (index < 0) {  /* not found */
            l_hashStringToUint64(str, &key);
            l_dnaHashAdd(dahash, key, (l_float64)items);
            sarrayAddString(sad, str, L_COPY);
            items++;
        }
    }

    if (pdahash)
        *pdahash = dahash;
    else
        l_dnaHashDestroy(&dahash);
    return 0;
}
/*!
 *  getSortedPathnamesInDirectory()
 *
 *      Input:  directory name
 *              substr (<optional> substring filter on filenames; can be NULL)
 *              firstpage (0-based)
 *              npages (use 0 for all to the end)
 *      Return: sarray of sorted pathnames, or NULL on error
 *
 *  Notes:
 *      (1) If 'substr' is not NULL, only filenames that contain
 *          the substring can be returned.  If 'substr' is NULL,
 *          none of the filenames are filtered out.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order.
 *          The full pathnames are returned for the requested sequence.
 *          If no files are found after filtering, returns an empty sarray.
 */
SARRAY *
getSortedPathnamesInDirectory(const char  *dirname,
                              const char  *substr,
                              l_int32      firstpage,
                              l_int32      npages)
{
char    *fname, *fullname;
l_int32  i, nfiles, lastpage;
SARRAY  *sa, *safiles, *saout;

    PROCNAME("getSortedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    if ((sa = getFilenamesInDirectory(dirname)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    safiles = sarraySelectBySubstring(sa, substr);
    sarrayDestroy(&sa);
    nfiles = sarrayGetCount(safiles);
    if (nfiles == 0) {
        L_WARNING("no files found", procName);
        return safiles;
    }

    sarraySort(safiles, safiles, L_SORT_INCREASING);

    firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1);
    if (npages == 0)
        npages = nfiles - firstpage;
    lastpage = L_MIN(firstpage + npages - 1, nfiles - 1);

    saout = sarrayCreate(lastpage - firstpage + 1);
    for (i = firstpage; i <= lastpage; i++) {
        fname = sarrayGetString(safiles, i, L_NOCOPY);
        fullname = genPathname(dirname, fname);
        sarrayAddString(saout, fullname, L_INSERT);
    }

    sarrayDestroy(&safiles);
    return saout;
}