/*
 *  captureProtoSignature()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (index of the first line of the signature; never a
 *                     comment line)
 *              stop (index of the line on which the signature is completed)
 *              charindex (char index of the completing ')' on line 'stop')
 *      Return: cleanstr (prototype string), or NULL on error
 *
 *  Notes:
 *      (1) Lines in [start, stop) are taken whole.  Line 'stop' is cut
 *          immediately after the ')' and a ';' is appended to it.
 *      (2) The collected lines are flattened with sarrayToString() and
 *          the result is handed to cleanProtoSignature(), whose return
 *          value is what the caller receives (and owns).
 */
static char *
captureProtoSignature(SARRAY  *sa,
                      l_int32  start,
                      l_int32  stop,
                      l_int32  charindex)
{
    char    *lastline, *terminated, *joined, *result;
    l_int32  line;
    SARRAY  *pieces;

    PROCNAME("captureProtoSignature");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

        /* Whole lines that precede the one holding the closing ')' */
    pieces = sarrayCreate(0);
    line = start;
    while (line < stop) {
        sarrayAddString(pieces, sarrayGetString(sa, line, L_COPY), L_INSERT);
        line++;
    }

        /* Last line: truncate just past the ')' and terminate with ';' */
    lastline = sarrayGetString(sa, stop, L_COPY);
    lastline[charindex + 1] = '\0';
    terminated = stringJoin(lastline, ";");
    LEPT_FREE(lastline);
    sarrayAddString(pieces, terminated, L_INSERT);

        /* Flatten, then put into canonical form */
    joined = sarrayToString(pieces, 2);
    sarrayDestroy(&pieces);
    result = cleanProtoSignature(joined);
    LEPT_FREE(joined);
    return result;
}
/*!
 * \brief   l_asetCreateFromSarray()
 *
 * \param[in]    sa   array of strings
 * \return  set whose keys are the 64-bit hashes of the strings in %sa,
 *          or NULL if %sa is not defined
 *
 * <pre>
 * Notes:
 *      (1) Each string is hashed with l_hashStringToUint64() and the
 *          hash is inserted as an L_UINT_TYPE key.
 *      (2) The strings themselves are not stored in the set.
 * </pre>
 */
L_ASET *
l_asetCreateFromSarray(SARRAY  *sa)
{
    l_int32   idx, count;
    l_uint64  hashval;
    L_ASET   *result;
    RB_TYPE   setkey;

    PROCNAME("l_asetCreateFromSarray");

    if (!sa)
        return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL);

    result = l_asetCreate(L_UINT_TYPE);
    count = sarrayGetCount(sa);
    idx = 0;
    while (idx < count) {
        l_hashStringToUint64(sarrayGetString(sa, idx, L_NOCOPY), &hashval);
        setkey.utype = hashval;
        l_asetInsert(result, setkey);
        idx++;
    }

    return result;
}
/*!
 * \brief   recogAddCharstrLabels()
 *
 * \param[in]    recog
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) For every class i in recog->pixaa_u, the i-th string of
 *          recog->sa_text is set as the text field of each pix in
 *          that class.
 * </pre>
 */
static l_int32
recogAddCharstrLabels(L_RECOG  *recog)
{
    char    *label;
    l_int32  iclass, isamp, nclasses, nsamples;
    PIX     *pixc;
    PIXA    *pixac;
    PIXAA   *paau;

    PROCNAME("recogAddCharstrLabels");

    if (!recog)
        return ERROR_INT("recog not defined", procName, 1);

        /* Label every unscaled template with the string of its class */
    paau = recog->pixaa_u;
    nclasses = pixaaGetCount(paau, NULL);
    for (iclass = 0; iclass < nclasses; iclass++) {
        pixac = pixaaGetPixa(paau, iclass, L_CLONE);
        label = sarrayGetString(recog->sa_text, iclass, L_NOCOPY);
        nsamples = pixaGetCount(pixac);
        isamp = 0;
        while (isamp < nsamples) {
            pixc = pixaGetPix(pixac, isamp, L_CLONE);
            pixSetText(pixc, label);
            pixDestroy(&pixc);
            isamp++;
        }
        pixaDestroy(&pixac);
    }

    return 0;
}
/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas
 * \return  sad   with duplicates removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHash() for large string arrays.
 *      (2) Each string is identified by its 64-bit hash.
 *      (3) The input is traversed once.  A string whose hash is not yet
 *          in the set is copied to the output and its hash is added to
 *          the set; a string whose hash is already present is skipped.
 *          The first occurrence of each string is therefore the one
 *          that is kept, in input order.
 * </pre>
 */
SARRAY *
sarrayRemoveDupsByAset(SARRAY  *sas)
{
    char     *item;
    l_int32   idx, count;
    l_uint64  hashval;
    L_ASET   *seen;
    RB_TYPE   setkey;
    SARRAY   *unique;

    PROCNAME("sarrayRemoveDupsByAset");

    if (!sas)
        return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);

    seen = l_asetCreate(L_UINT_TYPE);
    unique = sarrayCreate(0);
    count = sarrayGetCount(sas);
    for (idx = 0; idx < count; idx++) {
        item = sarrayGetString(sas, idx, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        setkey.utype = hashval;
        if (l_asetFind(seen, setkey))
            continue;  /* already emitted */
        sarrayAddString(unique, item, L_COPY);
        l_asetInsert(seen, setkey);
    }

    l_asetDestroy(&seen);
    return unique;
}
/*!
 * \brief   sarraySortByIndex()
 *
 * \param[in]    sain
 * \param[in]    naindex   na that maps from the new sarray to the input sarray
 * \return  saout   sorted, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Output string i is a copy of input string naindex[i].
 *      (2) The number of output entries attempted equals the number
 *          of strings in %sain.
 * </pre>
 */
SARRAY *
sarraySortByIndex(SARRAY  *sain,
                  NUMA    *naindex)
{
    l_int32  dest, count, src;
    SARRAY  *sorted;

    PROCNAME("sarraySortByIndex");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
    if (!naindex)
        return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL);

    count = sarrayGetCount(sain);
    sorted = sarrayCreate(count);
    dest = 0;
    while (dest < count) {
        numaGetIValue(naindex, dest, &src);
        sarrayAddString(sorted, sarrayGetString(sain, src, L_COPY), L_INSERT);
        dest++;
    }

    return sorted;
}
/*!
 *  recogAddAllSamples()
 *
 *      Input:  recog
 *              paa (pixaa from previously trained recog)
 *              debug
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) This is used with the serialization routine recogRead(),
 *          where each pixa in the pixaa represents a set of characters
 *          in a different class.  Two different pixa may represent
 *          characters with the same label.  Before calling this
 *          function, we verify that the number of character classes,
 *          given by the setsize field in recog, equals the number of
 *          pixa in the paa.  The character labels for each set are
 *          in the sa_text field.
 *      (2) Every pix of class i in paa is added (as a clone) to class i
 *          of recog->pixaa_u; then recogTrainingFinished() is called.
 *      (3) With debug != 0, the label of each added pix is written
 *          to stderr.
 */
static l_int32
recogAddAllSamples(L_RECOG  *recog,
                   PIXAA    *paa,
                   l_int32   debug)
{
    char    *label;
    l_int32  iclass, isamp, nclasses, nsamples;
    PIX     *pixc;
    PIXA    *pixac;

    PROCNAME("recogAddAllSamples");

    if (!recog)
        return ERROR_INT("recog not defined", procName, 1);
    if (!paa)
        return ERROR_INT("paa not defined", procName, 1);

    nclasses = pixaaGetCount(paa, NULL);
    for (iclass = 0; iclass < nclasses; iclass++) {
        pixac = pixaaGetPixa(paa, iclass, L_CLONE);
        nsamples = pixaGetCount(pixac);
        label = sarrayGetString(recog->sa_text, iclass, L_NOCOPY);
        isamp = 0;
        while (isamp < nsamples) {
            pixc = pixaGetPix(pixac, isamp, L_CLONE);
            if (debug)
                fprintf(stderr, "pix[%d,%d]: text = %s\n",
                        iclass, isamp, label);
                /* The clone is handed over to pixaa_u */
            pixaaAddPix(recog->pixaa_u, iclass, pixc, NULL, L_INSERT);
            isamp++;
        }
        pixaDestroy(&pixac);
    }

    recogTrainingFinished(recog, debug);
    return 0;
}
/*!
 *  sarrayAppendRange()
 *
 *      Input:  sa1 (to be added to)
 *              sa2 (append specified range of strings in sa2 to sa1)
 *              start (index of first string of sa2 to append)
 *              end (index of last string of sa2 to append)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Copies of the strings in sa2 are added to sa1.
 *      (2) The [start ... end] range is truncated if necessary:
 *          a negative start becomes 0, and an end at or beyond the
 *          count of sa2 becomes the last valid index.
 *      (3) It is an error if start > end after truncation.
 */
l_int32
sarrayAppendRange(SARRAY  *sa1,
                  SARRAY  *sa2,
                  l_int32  start,
                  l_int32  end)
{
    l_int32  count, first, last, k;

    PROCNAME("sarrayAppendRange");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

    first = (start < 0) ? 0 : start;
    count = sarrayGetCount(sa2);
    last = (end >= count) ? count - 1 : end;
    if (first > last)
        return ERROR_INT("start > end", procName, 1);

    for (k = first; k <= last; k++)
        sarrayAddString(sa1, sarrayGetString(sa2, k, L_NOCOPY), L_COPY);

    return 0;
}
/*
 *  getNextNonDoubleSlashLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first line, at or after 'start',
 *                     that does not begin with '//')
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive '//' lines, beginning at 'start'.
 *      (2) If all lines to the end start with '//', return next = -1.
 */
static l_int32
getNextNonDoubleSlashLine(SARRAY   *sa,
                          l_int32   start,
                          l_int32  *pnext)
{
    char    *line;
    l_int32  idx, count, linelen;

    PROCNAME("getNextNonDoubleSlashLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default answer: nothing but '//' lines from here to the end */
    *pnext = -1;

    count = sarrayGetCount(sa);
    idx = start;
    while (idx < count) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        linelen = strlen(line);
        if (!(linelen >= 2 && line[0] == '/' && line[1] == '/')) {
            *pnext = idx;
            return 0;
        }
        idx++;
    }

    return 0;
}
/*
 *  getNextNonBlankLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first nonblank line at or after
 *                     the start line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive blank lines, beginning at 'start'.
 *      (2) A blank line has only whitespace characters
 *          (' ', '\t', '\n', '\r'); an empty line is also blank.
 *      (3) If all lines to the end are blank, return next = -1.
 */
static l_int32
getNextNonBlankLine(SARRAY   *sa,
                    l_int32   start,
                    l_int32  *pnext)
{
    char    *line, *p;
    l_int32  idx, count;

    PROCNAME("getNextNonBlankLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default answer: only blank lines from here to the end */
    *pnext = -1;

    count = sarrayGetCount(sa);
    for (idx = start; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        for (p = line; *p != '\0'; p++) {
            if (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
                continue;
            *pnext = idx;  /* first non-whitespace character found */
            return 0;
        }
    }

    return 0;
}
/*!
 *  pixReadIndexed()
 *
 *      Input:  sarray (of full pathnames)
 *              index (into pathname array)
 *      Return: pix if OK; null if not found
 *
 *  Notes:
 *      (1) This function is useful for selecting image files from a
 *          directory, where the integer @index is embedded into
 *          the file name.
 *      (2) This is typically done by generating the sarray using
 *          getNumberedPathnamesInDirectory(), so that the @index
 *          pathname would have the number @index in it.  The size
 *          of the sarray should be the largest number (plus 1) appearing
 *          in the file names, respecting the constraints in the
 *          call to getNumberedPathnamesInDirectory().
 *      (3) Consequently, for some indices into the sarray, there may
 *          be no pathnames in the directory containing that number.
 *          By convention, we place empty C strings ("") in those
 *          locations in the sarray, and it is not an error if such
 *          a string is encountered and no pix is returned.
 *          Therefore, the caller must verify that a pix is returned.
 *      (4) An index outside the sarray, or a file that cannot be read,
 *          is reported as an error and NULL is returned.
 *      (5) See convertSegmentedPagesToPS() in src/psio1.c for an
 *          example of usage.
 */
PIX *
pixReadIndexed(SARRAY  *sa,
               l_int32  index)
{
    char    *path;
    l_int32  count;
    PIX     *pixr;

    PROCNAME("pixReadIndexed");

    if (!sa)
        return (PIX *)ERROR_PTR("sa not defined", procName, NULL);
    count = sarrayGetCount(sa);
    if (!(index >= 0 && index < count))
        return (PIX *)ERROR_PTR("index out of bounds", procName, NULL);

    path = sarrayGetString(sa, index, L_NOCOPY);
    if (path[0] == '\0')  /* placeholder entry: silently no pix */
        return NULL;

    pixr = pixRead(path);
    if (!pixr)
        L_ERROR("pix not read from file %s\n", procName, path);
    return pixr;
}
/*
 *  getNextNonCommentLine()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index to search)
 *              &next (<return> index of first line, at or after 'start',
 *                     that does not begin with '#')
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Skips over all consecutive comment lines, beginning at 'start'.
 *          A comment line is one whose first character is '#'.
 *      (2) If all lines to the end are '#' comments, return next = -1.
 */
static l_int32
getNextNonCommentLine(SARRAY   *sa,
                      l_int32   start,
                      l_int32  *pnext)
{
    char    *line;
    l_int32  idx, count;

    PROCNAME("getNextNonCommentLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default answer: only comment lines from here to the end */
    *pnext = -1;

    count = sarrayGetCount(sa);
    idx = start;
    while (idx < count) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        if (line[0] == '#') {
            idx++;
            continue;
        }
        *pnext = idx;
        return 0;
    }

    return 0;
}
/*!
 *  sarraySelectBySubstring()
 *
 *      Input:  sain (input sarray)
 *              substr (<optional> substring for matching; can be NULL)
 *      Return: saout (output sarray, filtered with substring) or null on error
 *
 *  Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each filename.
 *      (2) If substr == NULL, or if sain is empty, returns a copy of
 *          the sarray.
 *      (3) Selected strings are copied into saout, in input order.
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
    char    *candidate;
    l_int32  count, k, where, matched;
    size_t   sublen;
    SARRAY  *selected;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    count = sarrayGetCount(sain);
    if (count == 0 || !substr)
        return sarrayCopy(sain);

    sublen = strlen(substr);  /* same for every candidate */
    selected = sarrayCreate(count);
    for (k = 0; k < count; k++) {
        candidate = sarrayGetString(sain, k, L_NOCOPY);
        arrayFindSequence((l_uint8 *)candidate, strlen(candidate),
                          (l_uint8 *)substr, sublen, &where, &matched);
        if (matched)
            sarrayAddString(selected, candidate, L_COPY);
    }

    return selected;
}
/*!
 *  pixaReadFilesSA()
 *
 *      Input:  sarray (full pathnames for all files)
 *      Return: pixa, or null on error
 *
 *  Notes:
 *      (1) A file that cannot be read as a pix produces a warning and
 *          is skipped; the returned pixa can therefore hold fewer
 *          images than there are pathnames.
 */
PIXA *
pixaReadFilesSA(SARRAY  *sa)
{
    char    *path;
    l_int32  idx, count;
    PIX     *pixr;
    PIXA    *pixar;

    PROCNAME("pixaReadFilesSA");

    if (!sa)
        return (PIXA *)ERROR_PTR("sa not defined", procName, NULL);

    count = sarrayGetCount(sa);
    pixar = pixaCreate(count);
    for (idx = 0; idx < count; idx++) {
        path = sarrayGetString(sa, idx, L_NOCOPY);
        pixr = pixRead(path);
        if (pixr != NULL)
            pixaAddPix(pixar, pixr, L_INSERT);
        else
            L_WARNING("pix not read from file %s\n", procName, path);
    }

    return pixar;
}
/*
 *  sarrayConvertFilesToPS()
 *
 *      Input:  sarray (of full path names)
 *              res (typ. 300 or 600 ppi)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesToPS()
 *      (2) A non-positive res is replaced by 300.  A res outside
 *          [10 ... 4000] only generates a warning.
 *      (3) Files whose header cannot be read, or whose format is
 *          unknown, are skipped without error.
 */
l_int32
sarrayConvertFilesToPS(SARRAY      *sa,
                       l_int32      res,
                       const char  *fileout)
{
    char    *path;
    l_int32  k, count, pageindex, isfirst, format;

    PROCNAME("sarrayConvertFilesToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);

    if (res <= 0) {
        L_INFO("setting res to 300 ppi", procName);
        res = 300;
    }
    if (!(res >= 10 && res <= 4000))
        L_WARNING("res is typically in the range 300-600 ppi", procName);

    count = sarrayGetCount(sa);
    isfirst = TRUE;
    pageindex = 0;
    for (k = 0; k < count; k++) {
        path = sarrayGetString(sa, k, L_NOCOPY);
            /* Only files with a readable header and known format
             * are written */
        if (pixReadHeader(path, &format, NULL, NULL, NULL, NULL, NULL) == 0 &&
            format != IFF_UNKNOWN) {
            writeImageCompressedToPSFile(path, fileout, res,
                                         &isfirst, &pageindex);
        }
    }

    return 0;
}
/*!
 * \brief   l_dnaHashCreateFromSarray()
 *
 * \param[in]    sa   array of strings
 * \return  dahash, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Each string is hashed to a 64-bit key, and the index of the
 *          string in %sa is stored as the value.  Storing the index
 *          enables operations that check for duplicates.
 *      (2) The number of buckets is the next prime larger than n / 20,
 *          where n is the number of strings; about 20 entries per
 *          bucket is taken to be roughly optimal.
 *      (3) Returns NULL if %sa is not defined or if the hash cannot
 *          be created.
 * </pre>
 */
L_DNAHASH *
l_dnaHashCreateFromSarray(SARRAY  *sa)
{
    char       *str;
    l_int32     i, n;
    l_uint32    nsize;
    l_uint64    key;
    L_DNAHASH  *dahash;

    PROCNAME("l_dnaHashCreateFromSarray");

    if (!sa)
        return (L_DNAHASH *)ERROR_PTR("sa not defined", procName, NULL);

        /* Build up dnaHash of indices, hashed by a 64-bit key that
         * should randomize the lower bits used in bucket selection.
         * Having about 20 pts in each bucket is roughly optimal. */
    n = sarrayGetCount(sa);
    findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */

        /* Add each string, using the hash as key and the index into %sa
         * as the value. */
    if ((dahash = l_dnaHashCreate(nsize, 8)) == NULL)
        return (L_DNAHASH *)ERROR_PTR("dahash not made", procName, NULL);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        l_hashStringToUint64(str, &key);
        l_dnaHashAdd(dahash, key, (l_float64)i);
    }

    return dahash;
}
/*
 *  skipToMatchingBrace()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (index of starting line with left brace to search)
 *              lbindex (starting char index for left brace)
 *              &stop (<return> index of line with the matching right brace)
 *              &rbindex (<return> char index of matching right brace)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) If the matching right brace is not found, returns
 *          stop = -1 and an error.  This shouldn't happen.
 *          In that case rbindex is not written.
 *      (2) The scan starts one character past 'lbindex' on line 'start',
 *          with a brace depth of 1, and ends when the depth reaches 0.
 *      (3) A brace is not counted if it is inside a string, or if the
 *          character following it is a single quote (i.e., it is
 *          taken to be a character literal).
 *      (4) The in-string state persists across lines.  It is toggled
 *          by each double quote that is either the first character
 *          scanned on a line or is not preceded by a backslash.
 */
static l_int32
skipToMatchingBrace(SARRAY   *sa,
                    l_int32   start,
                    l_int32   lbindex,
                    l_int32  *pstop,
                    l_int32  *prbindex)
{
    char    *line;
    l_int32  row, col, firstcol, nrows, depth, inquotes, linelen;

    PROCNAME("skipToMatchingBrace");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pstop)
        return ERROR_INT("&stop not defined", procName, 1);
    if (!prbindex)
        return ERROR_INT("&rbindex not defined", procName, 1);

    *pstop = -1;
    inquotes = 0;  /* not within a string at the start */
    depth = 1;     /* the left brace at (start, lbindex) */
    nrows = sarrayGetCount(sa);
    for (row = start; row < nrows; row++) {
        line = sarrayGetString(sa, row, L_NOCOPY);
        firstcol = (row == start) ? lbindex + 1 : 0;
        linelen = strlen(line);
        for (col = firstcol; col < linelen; col++) {
                /* Toggle on every double quote that is not escaped.
                 * The first scanned character has no predecessor
                 * to inspect. */
            if (line[col] == '\"' &&
                (col == firstcol || line[col - 1] != '\\'))
                inquotes = 1 - inquotes;

                /* Ignore braces within strings and brace literals */
            if (inquotes || line[col + 1] == '\'')
                continue;
            if (line[col] == '{') {
                depth++;
            } else if (line[col] == '}') {
                depth--;
                if (depth == 0) {
                    *prbindex = col;
                    *pstop = row;
                    return 0;
                }
            }
        }
    }

    return ERROR_INT("matching right brace not found", procName, 1);
}
/* * parseForProtos() * * Input: filein (output of cpp) * prestring (<optional> string that prefaces each decl; * use NULL to omit) * Return: parsestr (string of function prototypes), or NULL on error * * Notes: * (1) We parse the output of cpp: * cpp -ansi <filein> * Three plans were attempted, with success on the third. * (2) Plan 1. A cursory examination of the cpp output indicated that * every function was preceeded by a cpp comment statement. * So we just need to look at statements beginning after comments. * Unfortunately, this is NOT the case. Some functions start * without cpp comment lines, typically when there are no * comments in the source that immediately precede the function. * (3) Plan 2. Consider the keywords in the language that start * parts of the cpp file. Some, like 'typedef', 'enum', * 'union' and 'struct', are followed after a while by '{', * and eventually end with '}, plus an optional token and a * final ';' Others, like 'extern' and 'static', are never * the beginnings of global function definitions. Function * prototypes have one or more sets of '(' followed eventually * by a ')', and end with ';'. But function definitions have * tokens, followed by '(', more tokens, ')' and then * immediately a '{'. We would generate a prototype from this * by adding a ';' to all tokens up to the ')'. So we use * these special tokens to decide what we are parsing. And * whenever a function definition is found and the prototype * extracted, we skip through the rest of the function * past the corresponding '}'. This token ends a line, and * is often on a line of its own. But as it turns out, * the only keyword we need to consider is 'static'. * (4) Plan 3. Consider the parentheses and braces for various * declarations. A struct, enum, or union has a pair of * braces followed by a semicolon. They cannot have parentheses * before the left brace, but a struct can have lots of parentheses * within the brace set. A function prototype has no braces. 
* A function declaration can have sets of left and right * parentheses, but these are followed by a left brace. * So plan 3 looks at the way parentheses and braces are * organized. Once the beginning of a function definition * is found, the prototype is extracted and we search for * the ending right brace. * (5) To find the ending right brace, it is necessary to do some * careful parsing. For example, in this file, we have * left and right braces as characters, and these must not * be counted. Somewhat more tricky, the file fhmtauto.c * generates code, and includes a right brace in a string. * So we must not include braces that are in strings. But how * do we know if something is inside a string? Keep state, * starting with not-inside, and every time you hit a double quote * that is not escaped, toggle the condition. Any brace * found in the state of being within a string is ignored. * (6) When a prototype is extracted, it is put in a canonical * form (i.e., cleaned up). Finally, we check that it is * not static and save it. (If static, it is ignored). * (7) The @prestring for unix is NULL; it is included here so that * you can use Microsoft's declaration for importing or * exporting to a dll. See environ.h for examples of use. * Here, we set: @prestring = "LEPT_DLL ". Note in particular * the space character that will separate 'LEPT_DLL' from * the standard unix prototype that follows. */ char * parseForProtos(const char *filein, const char *prestring) { char *strdata, *str, *newstr, *parsestr, *secondword; l_int32 nbytes, start, next, stop, charindex, found; SARRAY *sa, *saout, *satest; PROCNAME("parseForProtos"); if (!filein) return (char *)ERROR_PTR("filein not defined", procName, NULL); /* Read in the cpp output into memory, one string for each * line in the file, omitting blank lines. 
*/ strdata = (char *)arrayRead(filein, &nbytes); sa = sarrayCreateLinesFromString(strdata, 0); saout = sarrayCreate(0); next = 0; while (1) { /* repeat after each non-static prototype is extracted */ searchForProtoSignature(sa, next, &start, &stop, &charindex, &found); if (!found) break; /* fprintf(stderr, " start = %d, stop = %d, charindex = %d\n", start, stop, charindex); */ str = captureProtoSignature(sa, start, stop, charindex); /* Make sure it is not static. Note that 'extern' has * been prepended to the prototype, so the 'static' * keyword, if it exists, would be the second word. */ satest = sarrayCreateWordsFromString(str); secondword = sarrayGetString(satest, 1, 0); if (strcmp(secondword, "static")) { /* not static */ if (prestring) { /* prepend it to the prototype */ newstr = stringJoin(prestring, str); sarrayAddString(saout, newstr, L_INSERT); FREE(str); } else sarrayAddString(saout, str, L_INSERT); } else FREE(str); sarrayDestroy(&satest); skipToEndOfFunction(sa, stop, charindex, &next); if (next == -1) break; } /* Flatten into a string with newlines between prototypes */ parsestr = sarrayToString(saout, 1); FREE(strdata); sarrayDestroy(&sa); sarrayDestroy(&saout); return parsestr; }
int main(int argc, char **argv) { char *dirin, *dirout, *infile, *outfile, *tail; l_int32 i, nfiles, border, x, y, w, h, xb, yb, wb, hb; BOX *box1, *box2; BOXA *boxa1, *boxa2; PIX *pixs, *pixt1, *pixd; SARRAY *safiles; static char mainName[] = "croptext"; if (argc != 4) return ERROR_INT("Syntax: croptext dirin border dirout", mainName, 1); dirin = argv[1]; border = atoi(argv[2]); dirout = argv[3]; setLeptDebugOK(1); safiles = getSortedPathnamesInDirectory(dirin, NULL, 0, 0); nfiles = sarrayGetCount(safiles); for (i = 0; i < nfiles; i++) { infile = sarrayGetString(safiles, i, L_NOCOPY); splitPathAtDirectory(infile, NULL, &tail); outfile = genPathname(dirout, tail); pixs = pixRead(infile); pixt1 = pixMorphSequence(pixs, "r11 + c10.40 + o5.5 + x4", 0); boxa1 = pixConnComp(pixt1, NULL, 8); if (boxaGetCount(boxa1) == 0) { fprintf(stderr, "Warning: no components on page %s\n", tail); continue; } boxa2 = boxaSort(boxa1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL); box1 = boxaGetBox(boxa2, 0, L_CLONE); boxGetGeometry(box1, &x, &y, &w, &h); xb = L_MAX(0, x - border); yb = L_MAX(0, y - border); wb = w + 2 * border; hb = h + 2 * border; box2 = boxCreate(xb, yb, wb, hb); pixd = pixClipRectangle(pixs, box2, NULL); pixWrite(outfile, pixd, IFF_TIFF_G4); pixDestroy(&pixs); pixDestroy(&pixt1); pixDestroy(&pixd); boxaDestroy(&boxa1); boxaDestroy(&boxa2); } return 0; }
/*
 *  getOffsetForCharacter()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (starting index in sa to search; never a comment line)
 *              tchar (we are searching for the first instance of this)
 *              &soffset (<return> offset in strings from start index)
 *              &boffset (<return> offset in bytes within string in which
 *                        the character is first found)
 *              &toffset (<return> offset in total bytes from beginning of
 *                        string indexed by 'start' to the location where
 *                        the character is first found)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) We are searching for the first instance of 'tchar', starting
 *          at the beginning of the string indexed by start.
 *      (2) If the character is not found, soffset is returned as -1,
 *          and the other offsets are set to very large numbers.  The
 *          caller must check the value of soffset.
 *      (3) This is only used in contexts where it is not necessary to
 *          consider if the character is inside a string.
 *      (4) The total offset counts only the characters of the strings
 *          themselves; nothing is added for line separators.
 */
static l_int32
getOffsetForCharacter(SARRAY   *sa,
                      l_int32   start,
                      char      tchar,
                      l_int32  *psoffset,
                      l_int32  *pboffset,
                      l_int32  *ptoffset)
{
    char    *line;
    l_int32  row, col, nrows, linelen, preceding;

    PROCNAME("getOffsetForCharacter");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!psoffset)
        return ERROR_INT("&soffset not defined", procName, 1);
    if (!pboffset)
        return ERROR_INT("&boffset not defined", procName, 1);
    if (!ptoffset)
        return ERROR_INT("&toffset not defined", procName, 1);

        /* Values reported when the character is not found */
    *psoffset = -1;
    *pboffset = 100000000;
    *ptoffset = 100000000;

    nrows = sarrayGetCount(sa);
    preceding = 0;  /* chars in the lines already scanned */
    for (row = start; row < nrows; row++) {
        line = sarrayGetString(sa, row, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen", procName, 1);
        linelen = strlen(line);
        col = 0;
        while (col < linelen) {
            if (line[col] == tchar) {
                *psoffset = row - start;
                *pboffset = col;
                *ptoffset = preceding + col;
                return 0;
            }
            col++;
        }
        preceding += linelen;
    }

    return 0;
}
/* * cleanProtoSignature() * * Input: instr (input prototype string) * Return: cleanstr (clean prototype string), or NULL on error * * Notes: * (1) Adds 'extern' at beginning and regularizes spaces * between tokens. */ static char * cleanProtoSignature(char *instr) { char *str, *cleanstr; char buf[L_BUF_SIZE]; char externstring[] = "extern"; l_int32 i, j, nwords, nchars, index, len; SARRAY *sa, *saout; PROCNAME("cleanProtoSignature"); if (!instr) return (char *)ERROR_PTR("instr not defined", procName, NULL); sa = sarrayCreateWordsFromString(instr); nwords = sarrayGetCount(sa); saout = sarrayCreate(0); sarrayAddString(saout, externstring, 1); for (i = 0; i < nwords; i++) { str = sarrayGetString(sa, i, 0); nchars = strlen(str); index = 0; for (j = 0; j < nchars; j++) { if (index > L_BUF_SIZE - 6) return (char *)ERROR_PTR("token too large", procName, NULL); if (str[j] == '(') { buf[index++] = ' '; buf[index++] = '('; buf[index++] = ' '; } else if (str[j] == ')') { buf[index++] = ' '; buf[index++] = ')'; } else buf[index++] = str[j]; } buf[index] = '\0'; sarrayAddString(saout, buf, 1); } /* Flatten to a prototype string with spaces added after * each word, and remove the last space */ cleanstr = sarrayToString(saout, 2); len = strlen(cleanstr); cleanstr[len - 1] = '\0'; sarrayDestroy(&sa); sarrayDestroy(&saout); return cleanstr; }
/*!
 *  gplotGenDataFiles()
 *
 *      Input:  gplot
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) For each entry in gplot->datanames, the corresponding string
 *          in gplot->plotdata is written to a file of that name.
 *      (2) This stops with an error at the first file that cannot be
 *          opened for writing; files written before that remain.
 */
l_int32
gplotGenDataFiles(GPLOT  *gplot)
{
    char    *contents, *path;
    l_int32  k, count;
    FILE    *stream;

    PROCNAME("gplotGenDataFiles");

    if (!gplot)
        return ERROR_INT("gplot not defined", procName, 1);

    count = sarrayGetCount(gplot->datanames);
    k = 0;
    while (k < count) {
        contents = sarrayGetString(gplot->plotdata, k, L_NOCOPY);
        path = sarrayGetString(gplot->datanames, k, L_NOCOPY);
        stream = fopenWriteStream(path, "w");
        if (stream == NULL)
            return ERROR_INT("datafile stream not opened", procName, 1);
        fwrite(contents, 1, strlen(contents), stream);
        fclose(stream);
        k++;
    }

    return 0;
}
/*!
 * \brief   sarrayIntersectionByHash()
 *
 * \param[in]    sa1, sa2
 * \return  sad   intersection of the strings, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is intended as the faster alternative to
 *          sarrayIntersectionByAset(), using hashed bucket lookup.
 *      (2) The larger of the two arrays is put into a dnahash.  The
 *          smaller array is then traversed; a string is copied to the
 *          output if it is found in the larger array and has not
 *          already been seen in this traversal.  A second dnahash
 *          records the strings already emitted.
 *      (3) When the two arrays have the same size, %sa1 is treated as
 *          the larger one.
 *      (4) Each output string appears once, in the order of its first
 *          occurrence in the smaller array.
 * </pre>
 */
SARRAY *
sarrayIntersectionByHash(SARRAY  *sa1,
                         SARRAY  *sa2)
{
    char       *item;
    l_int32     count1, count2, nshort, k, inlong, inseen;
    l_uint32    nbuckets;
    l_uint64    hashval;
    L_DNAHASH  *hashlong, *hashseen;
    SARRAY     *sashort, *salong, *result;

    PROCNAME("sarrayIntersectionByHash");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Hash the larger array.  Both sashort and salong are
         * borrowed handles to the inputs and must not be destroyed. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sashort = sa1;
        salong = sa2;
    } else {
        sashort = sa2;
        salong = sa1;
    }
    hashlong = l_dnaHashCreateFromSarray(salong);

        /* Walk the smaller array, keeping each string that is in
         * salong (via hashlong) and was not emitted before
         * (via hashseen). */
    result = sarrayCreate(0);
    nshort = sarrayGetCount(sashort);
    findNextLargerPrime(nshort / 20, &nbuckets);  /* buckets in hash table */
    hashseen = l_dnaHashCreate(nbuckets, 0);
    for (k = 0; k < nshort; k++) {
        item = sarrayGetString(sashort, k, L_NOCOPY);
        sarrayFindStringByHash(salong, hashlong, item, &inlong);
        if (inlong < 0)
            continue;
        sarrayFindStringByHash(sashort, hashseen, item, &inseen);
        if (inseen != -1)
            continue;
        sarrayAddString(result, item, L_COPY);
        l_hashStringToUint64(item, &hashval);
        l_dnaHashAdd(hashseen, hashval, (l_float64)k);
    }

    l_dnaHashDestroy(&hashlong);
    l_dnaHashDestroy(&hashseen);
    return result;
}
/*!
 * \brief   l_getIndexFromFile()
 *
 * \param[in]    filename
 * \param[out]   pindex    found index; set to 0 before the search
 * \return  0 if found, 1 on error.
 *
 * <pre>
 * Notes:
 *      (1) The first line of the file that does not begin with a
 *          newline is read (up to 255 characters), and its first
 *          word is looked up with l_getIndexFromStructname().
 *      (2) If that lookup fails, and findFileFormat() succeeds on the
 *          file, the index for "Pix" is returned instead.
 *      (3) It is an error if the file cannot be opened, if no
 *          suitable line can be read, or if neither lookup succeeds.
 * </pre>
 */
static l_int32
l_getIndexFromFile(const char  *filename,
                   l_int32     *pindex)
{
    char     linebuf[256];
    char    *firstword;
    FILE    *stream;
    l_int32  failed, format;
    SARRAY  *words;

    PROCNAME("l_getIndexFromFile");

    if (!pindex)
        return ERROR_INT("&index not defined", procName, 1);
    *pindex = 0;
    if (!filename)
        return ERROR_INT("filename not defined", procName, 1);

        /* Read lines until one holds more than a newline */
    stream = fopenReadStream(filename);
    if (stream == NULL)
        return ERROR_INT("stream not opened", procName, 1);
    for (;;) {
        if (fgets(linebuf, sizeof(linebuf), stream) == NULL) {
            fclose(stream);
            return ERROR_INT("fgets read fail", procName, 1);
        }
        if (linebuf[0] != '\n')
            break;
    }
    fclose(stream);

        /* Look up the first word of that line */
    words = sarrayCreateWordsFromString(linebuf);
    firstword = sarrayGetString(words, 0, L_NOCOPY);
    failed = l_getIndexFromStructname(firstword, pindex);
    sarrayDestroy(&words);
    if (!failed)
        return 0;

        /* Not a known struct name; it may be a compressed pix */
    if (findFileFormat(filename, &format) != 0)
        return ERROR_INT("no file type identified", procName, 1);
    l_getIndexFromStructname("Pix", pindex);
    return 0;
}
/*
 *  sarrayConvertFilesFittedToPS()
 *
 *      Input:  sarray (of full path names)
 *              xpts, ypts (desired size in printer points; use 0 for default)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesFittedToPS()
 *      (2) Non-positive xpts and ypts are replaced by 612.0 and 792.0.
 *          Values outside [100 ... 2000] only generate a warning.
 *      (3) For each image, the resolution is chosen from whichever
 *          dimension is the limiting one, so that the entire image
 *          fits within xpts by ypts.
 *      (4) Files whose header cannot be read, or whose format is
 *          unknown, are skipped without error.
 */
l_int32
sarrayConvertFilesFittedToPS(SARRAY      *sa,
                             l_float32    xpts,
                             l_float32    ypts,
                             const char  *fileout)
{
    char    *path;
    l_int32  k, wpix, hpix, count, pageindex, isfirst, format, res;

    PROCNAME("sarrayConvertFilesFittedToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);

    if (xpts <= 0.0) {
        L_INFO("setting xpts to 612.0", procName);
        xpts = 612.0;
    }
    if (ypts <= 0.0) {
        L_INFO("setting ypts to 792.0", procName);
        ypts = 792.0;
    }
    if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
        L_WARNING("xpts,ypts are typically in the range 500-800", procName);

    count = sarrayGetCount(sa);
    isfirst = TRUE;
    pageindex = 0;
    for (k = 0; k < count; k++) {
        path = sarrayGetString(sa, k, L_NOCOPY);
        if (pixReadHeader(path, &format, &wpix, &hpix, NULL, NULL, NULL) != 0)
            continue;
        if (format == IFF_UNKNOWN)
            continue;

            /* Be sure the entire image is wrapped */
        if (xpts * hpix < ypts * wpix)
            res = (l_int32)((l_float32)wpix * 72.0 / xpts);
        else
            res = (l_int32)((l_float32)hpix * 72.0 / ypts);

        writeImageCompressedToPSFile(path, fileout, res,
                                     &isfirst, &pageindex);
    }

    return 0;
}
/*!
 * \brief   strcodeCreateFromFile()
 *
 * \param[in]    filein   containing filenames of serialized data
 * \param[in]    fileno   integer that labels the two output files
 * \param[in]    outdir   [optional] if null, files are made in /tmp/lept/auto
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The %filein has one filename on each line.
 *          Comment lines begin with "#".
 *      (2) The output is 2 files:
 *             autogen.\<fileno\>.c
 *             autogen.\<fileno\>.h
 *      (3) A listed file whose type cannot be identified is reported
 *          and skipped; it does not cause this function to fail.
 *      (4) It is an error if %filein cannot be read or has no lines.
 * </pre>
 */
l_int32
strcodeCreateFromFile(const char  *filein,
                      l_int32      fileno,
                      const char  *outdir)
{
    char        *fname;
    const char  *type;
    l_uint8     *data;
    size_t       nbytes;
    l_int32      i, n, index;
    SARRAY      *sa;
    L_STRCODE   *strcode;

    PROCNAME("strcodeCreateFromFile");

    if (!filein)
        return ERROR_INT("filein not defined", procName, 1);

    if ((data = l_binaryRead(filein, &nbytes)) == NULL)
        return ERROR_INT("data not read from file", procName, 1);
    sa = sarrayCreateLinesFromString((char *)data, 0);
    LEPT_FREE(data);
    if (!sa)
        return ERROR_INT("sa not made", procName, 1);
    if ((n = sarrayGetCount(sa)) == 0) {
        sarrayDestroy(&sa);
        return ERROR_INT("no filenames in the file", procName, 1);
    }

    strcode = strcodeCreate(fileno);

    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (fname[0] == '#') continue;  /* comment line */
        if (l_getIndexFromFile(fname, &index)) {
            L_ERROR("File %s has no recognizable type\n", procName, fname);
        } else {
            type = l_assoc[index].type;
            L_INFO("File %s is type %s\n", procName, fname, type);
            strcodeGenerate(strcode, fname, type);
        }
    }
    strcodeFinalize(&strcode, outdir);

        /* The list of filenames is no longer needed */
    sarrayDestroy(&sa);
    return 0;
}
/*!
 * \brief   recogGetClassString()
 *
 * \param[in]    recog
 * \param[in]    index      into array of char types
 * \param[out]   pcharstr   string representation;
 *                          returns an empty string on error
 * \return  0 if found; nonzero on error (2 if %recog is not defined,
 *          1 for all other errors)
 *
 * <pre>
 * Notes:
 *      (1) Extracts a copy of the string from sa_text, which
 *          the caller must free.
 *      (2) Caller must check the function return value.
 *      (3) Whenever %pcharstr is defined, the empty placeholder string
 *          is only replaced after the copy from sa_text has been
 *          obtained, so a failed lookup leaves the placeholder in
 *          *pcharstr rather than a NULL pointer.
 * </pre>
 */
l_int32
recogGetClassString(L_RECOG  *recog,
                    l_int32   index,
                    char    **pcharstr)
{
char  *str;

    PROCNAME("recogGetClassString");

    if (!pcharstr)
        return ERROR_INT("&charstr not defined", procName, 1);
    *pcharstr = stringNew("");  /* placeholder returned on every error path */
    if (!recog)
        return ERROR_INT("recog not defined", procName, 2);

    if (index < 0 || index >= recog->setsize)
        return ERROR_INT("invalid index", procName, 1);

        /* Fetch first; swap out the placeholder only on success */
    if ((str = sarrayGetString(recog->sa_text, index, L_COPY)) == NULL)
        return ERROR_INT("class string not found", procName, 1);
    LEPT_FREE(*pcharstr);
    *pcharstr = str;
    return 0;
}
/*!
 * \brief   recogAddAllSamples()
 *
 * \param[in]    precog   addr of recog
 * \param[in]    paa      pixaa from previously trained recog
 * \param[in]    debug
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) On error after the recog has been validated, the input recog
 *          is destroyed and *precog is set to NULL, so the caller is
 *          never left holding a pointer to a destroyed recog.
 *      (2) This is used with the serialization routine recogRead(),
 *          where each pixa in the pixaa represents a set of characters
 *          in a different class.  Before calling this function, we have
 *          verified that the number of character classes, given by the
 *          setsize field in %recog, equals the number of pixa in the paa.
 *          The character labels for each set are in the sa_text field.
 * </pre>
 */
static l_int32
recogAddAllSamples(L_RECOG  **precog,
                   PIXAA     *paa,
                   l_int32    debug)
{
char     *text;
l_int32   i, j, nc, ns;
PIX      *pix;
PIXA     *pixa, *pixa1;
L_RECOG  *recog;

    PROCNAME("recogAddAllSamples");

    if (!precog)
        return ERROR_INT("&recog not defined", procName, 1);
    if ((recog = *precog) == NULL)
        return ERROR_INT("recog not defined", procName, 1);
    if (!paa) {
        recogDestroy(&recog);
        *precog = NULL;  /* do not leave the caller with a dangling handle */
        return ERROR_INT("paa not defined", procName, 1);
    }

    nc = pixaaGetCount(paa, NULL);
    for (i = 0; i < nc; i++) {
        pixa = pixaaGetPixa(paa, i, L_CLONE);
        ns = pixaGetCount(pixa);
        text = sarrayGetString(recog->sa_text, i, L_NOCOPY);

            /* Append an empty pixa for class i, then fill it */
        pixa1 = pixaCreate(ns);
        pixaaAddPixa(recog->pixaa_u, pixa1, L_INSERT);
        for (j = 0; j < ns; j++) {
            pix = pixaGetPix(pixa, j, L_CLONE);
            if (debug) fprintf(stderr, "pix[%d,%d]: text = %s\n", i, j, text);
            pixaaAddPix(recog->pixaa_u, i, pix, NULL, L_INSERT);
        }
        pixaDestroy(&pixa);
    }

        /* For second parameter, see comment in recogRead().
         * recogTrainingFinished() may destroy the recog and null the
         * local pointer; propagate that result to the caller's handle. */
    recogTrainingFinished(&recog, 0, -1, -1.0);
    *precog = recog;
    if (!recog)
        return ERROR_INT("bad templates; recog destroyed", procName, 1);
    return 0;
}
/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1, sa2
 * \return  sad with the intersection of the string set, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: the larger sarray is put into a set, using the
 *          64-bit string hashes as the key values.  The smaller sarray
 *          is then traversed in order: a string whose hash is in the
 *          first set, and not yet in a second "already emitted" set,
 *          is copied to the output and its hash is inserted into the
 *          second set.  The second set guarantees that each string
 *          appears only once in the output.
 *      (2) When both inputs have the same count, %sa1 is the one put
 *          into the set and %sa2 is the one traversed.
 *      (3) Membership is decided on the hash alone; the strings
 *          themselves are not compared.
 *      (4) Neither input is modified or destroyed.
 * </pre>
 */
SARRAY *
sarrayIntersectionByAset(SARRAY  *sa1,
                         SARRAY  *sa2)
{
char      *item;
l_int32    count1, count2, k, nsmall;
l_uint64   hashval;
L_ASET    *lookup, *emitted;
RB_TYPE    key;
SARRAY    *smaller, *larger, *result;

    PROCNAME("sarrayIntersectionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Decide which input is traversed and which is hashed into
         * the lookup set.  Both are borrowed; do not destroy them. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        smaller = sa1;
        larger = sa2;
    } else {
        smaller = sa2;
        larger = sa1;
    }
    lookup = l_asetCreateFromSarray(larger);

        /* Build up the intersection of strings */
    result = sarrayCreate(0);
    nsmall = sarrayGetCount(smaller);
    emitted = l_asetCreate(L_UINT_TYPE);
    for (k = 0; k < nsmall; k++) {
        item = sarrayGetString(smaller, k, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        key.utype = hashval;
        if (!l_asetFind(lookup, key))
            continue;  /* not in the other array */
        if (l_asetFind(emitted, key))
            continue;  /* already in the output */
        sarrayAddString(result, item, L_COPY);
        l_asetInsert(emitted, key);
    }

    l_asetDestroy(&lookup);
    l_asetDestroy(&emitted);
    return result;
}
/*!
 * \brief   sarrayRemoveDupsByHash()
 *
 * \param[in]    sas
 * \param[out]   psad      unique set of strings; duplicates removed
 * \param[out]   pdahash   [optional] dnahash used for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Generates a sarray with unique values, keeping the first
 *          occurrence of each string in its original order.
 *      (2) The dnahash is built up together with sad to assure
 *          uniqueness: for every string added to sad, its 64-bit hash
 *          is stored in the dnahash along with the string's index
 *          in sad.  Lookup is done with sarrayFindStringByHash().
 *      (3) The number of buckets is the next prime larger than
 *          (number of input strings) / 20.
 *      (4) If %pdahash is not requested, the dnahash is destroyed
 *          before returning; otherwise ownership passes to the caller.
 * </pre>
 */
l_int32
sarrayRemoveDupsByHash(SARRAY      *sas,
                       SARRAY     **psad,
                       L_DNAHASH  **pdahash)
{
char       *item;
l_int32     k, total, found, nunique;
l_uint32    nbuckets;
l_uint64    hashval;
SARRAY     *uniq;
L_DNAHASH  *table;

    PROCNAME("sarrayRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!psad)
        return ERROR_INT("&sad not defined", procName, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", procName, 1);

    total = sarrayGetCount(sas);
    findNextLargerPrime(total / 20, &nbuckets);  /* buckets in hash table */
    table = l_dnaHashCreate(nbuckets, 8);
    uniq = sarrayCreate(total);
    *psad = uniq;

    nunique = 0;
    for (k = 0; k < total; k++) {
        item = sarrayGetString(sas, k, L_NOCOPY);
        sarrayFindStringByHash(uniq, table, item, &found);
        if (found >= 0)
            continue;  /* already present in the output */

            /* First occurrence: record hash -> output index, then copy */
        l_hashStringToUint64(item, &hashval);
        l_dnaHashAdd(table, hashval, (l_float64)nunique);
        sarrayAddString(uniq, item, L_COPY);
        nunique++;
    }

    if (!pdahash)
        l_dnaHashDestroy(&table);
    else
        *pdahash = table;
    return 0;
}
/*!
 *  getSortedPathnamesInDirectory()
 *
 *      Input:  directory name
 *              substr (<optional> substring filter on filenames; can be NULL)
 *              firstpage (0-based)
 *              npages (use 0 for all to the end)
 *      Return: sarray of sorted pathnames, or NULL on error
 *
 *  Notes:
 *      (1) If 'substr' is not NULL, only filenames that contain
 *          the substring can be returned.  If 'substr' is NULL,
 *          none of the filenames are filtered out.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order.
 *          The full pathnames are returned for the requested sequence.
 *          If no files are found after filtering, a warning is issued
 *          and the (empty) filtered sarray is returned.
 *      (3) 'firstpage' is clamped to [0 ... nfiles - 1], and the last
 *          page returned never exceeds nfiles - 1.
 */
SARRAY *
getSortedPathnamesInDirectory(const char  *dirname,
                              const char  *substr,
                              l_int32      firstpage,
                              l_int32      npages)
{
char    *name, *path;
l_int32  k, count, last;
SARRAY  *saall, *safiltered, *sapaths;

    PROCNAME("getSortedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    saall = getFilenamesInDirectory(dirname);
    if (saall == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

        /* Keep only the names that pass the substring filter */
    safiltered = sarraySelectBySubstring(saall, substr);
    sarrayDestroy(&saall);
    count = sarrayGetCount(safiltered);
    if (count == 0) {
        L_WARNING("no files found", procName);
        return safiltered;
    }

        /* In-place lexical sort */
    sarraySort(safiltered, safiltered, L_SORT_INCREASING);

        /* Clamp the requested page range to what is available */
    firstpage = L_MAX(firstpage, 0);
    firstpage = L_MIN(firstpage, count - 1);
    if (npages == 0)
        npages = count - firstpage;
    last = L_MIN(firstpage + npages - 1, count - 1);

        /* Prepend the directory to each selected filename */
    sapaths = sarrayCreate(last - firstpage + 1);
    for (k = firstpage; k <= last; k++) {
        name = sarrayGetString(safiltered, k, L_NOCOPY);
        path = genPathname(dirname, name);
        sarrayAddString(sapaths, path, L_INSERT);
    }

    sarrayDestroy(&safiltered);
    return sapaths;
}