/*!
 * \brief   sarrayIntersectionByHash()
 *
 * \param[in]    sa1, sa2   string arrays to intersect
 * \return  sad   strings common to both inputs, each appearing once,
 *                or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The larger array is indexed by a dnahash built with
 *          l_dnaHashCreateFromSarray().  When both arrays have the
 *          same count, sa1 is treated as the larger one.
 *      (2) The smaller array is then traversed; every string that is
 *          also found in the larger array is copied to the output the
 *          first time it is met.
 *      (3) A second dnahash, keyed by the 64-bit string hash and sized
 *          from the smaller array, remembers which strings have
 *          already been emitted.
 *      (4) Output strings are copies, in order of first occurrence in
 *          the smaller array.
 *      (5) Hash-based alternative to sarrayIntersectionByAset().
 * </pre>
 */
SARRAY *
sarrayIntersectionByHash(SARRAY  *sa1,
                         SARRAY  *sa2)
{
    char       *item;
    l_int32     count1, count2, nsmall, pos, in_big, in_seen;
    l_uint32    nbuckets;
    l_uint64    hashval;
    L_DNAHASH  *hash_big, *hash_seen;
    SARRAY     *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByHash");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Decide which input is traversed and which is indexed.
         * Both pointers are borrowed; neither array is destroyed here. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sa_small = sa1;
        sa_big = sa2;
        nsmall = count1;
    } else {
        sa_small = sa2;
        sa_big = sa1;
        nsmall = count2;
    }

        /* Index the larger array */
    hash_big = l_dnaHashCreateFromSarray(sa_big);

        /* Table of strings already emitted; buckets sized from nsmall */
    sad = sarrayCreate(0);
    findNextLargerPrime(nsmall / 20, &nbuckets);
    hash_seen = l_dnaHashCreate(nbuckets, 0);

    for (pos = 0; pos < nsmall; pos++) {
        item = sarrayGetString(sa_small, pos, L_NOCOPY);

            /* Skip strings that are not in the larger array */
        sarrayFindStringByHash(sa_big, hash_big, item, &in_big);
        if (in_big < 0)
            continue;

            /* Skip strings that were already copied to the output */
        sarrayFindStringByHash(sa_small, hash_seen, item, &in_seen);
        if (in_seen != -1)
            continue;

        sarrayAddString(sad, item, L_COPY);
        l_hashStringToUint64(item, &hashval);
        l_dnaHashAdd(hash_seen, hashval, (l_float64)pos);
    }

    l_dnaHashDestroy(&hash_big);
    l_dnaHashDestroy(&hash_seen);
    return sad;
}
/*!
 *  sarrayAppendRange()
 *
 *      Input:  sa1  (destination; receives the appended strings)
 *              sa2  (source of the strings)
 *              start  (index in sa2 of the first string to append)
 *              end  (index in sa2 of the last string to append)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) sa1 receives copies of the selected strings.
 *      (2) The range is clipped: a negative start becomes 0, and an
 *          end at or beyond the count of sa2 becomes the last index.
 *      (3) It is an error if, after clipping, start > end.
 */
l_int32
sarrayAppendRange(SARRAY  *sa1,
                  SARRAY  *sa2,
                  l_int32  start,
                  l_int32  end)
{
    char    *item;
    l_int32  first, last, count, k;

    PROCNAME("sarrayAppendRange");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

        /* Clip the requested range to what sa2 actually holds */
    first = (start < 0) ? 0 : start;
    count = sarrayGetCount(sa2);
    last = (end >= count) ? count - 1 : end;
    if (first > last)
        return ERROR_INT("start > end", procName, 1);

    k = first;
    while (k <= last) {
        item = sarrayGetString(sa2, k, L_NOCOPY);
        sarrayAddString(sa1, item, L_COPY);
        k++;
    }

    return 0;
}
/*!
 *  jbCorrelation()
 *
 *      Input:  dirin  (directory of input images)
 *              thresh  (typically ~0.8)
 *              weight  (typically ~0.6)
 *              components  (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS)
 *              rootname  (for output files)
 *              firstpage  (0-based)
 *              npages  (use 0 for all pages in dirin)
 *              renderflag  (1 to render from templates; 0 to skip)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) The images must be 1 bpp.  If they are not, you can convert
 *          them using convertFilesTo1bpp().
 *      (2) See prog/jbcorrelation for generating more output (e.g.,
 *          for debugging)
 *      (3) An error is returned, after releasing whatever was already
 *          allocated, if the list of input files, the classifier or
 *          the saved data cannot be made.
 *      (4) When rendering, page i is written as PNG to the file
 *          "<rootname>.<i as 5 digits>".
 */
l_int32
jbCorrelation(const char  *dirin,
              l_float32    thresh,
              l_float32    weight,
              l_int32      components,
              const char  *rootname,
              l_int32      firstpage,
              l_int32      npages,
              l_int32      renderflag)
{
    char        filename[L_BUF_SIZE];
    l_int32     nfiles, i, numpages;
    JBDATA     *data;
    JBCLASSER  *classer;
    PIX        *pix;
    PIXA       *pixa;
    SARRAY     *safiles;

    PROCNAME("jbCorrelation");

    if (!dirin)
        return ERROR_INT("dirin not defined", procName, 1);
    if (!rootname)
        return ERROR_INT("rootname not defined", procName, 1);
    if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
        components != JB_WORDS)
        return ERROR_INT("components invalid", procName, 1);

    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
    if (!safiles)
        return ERROR_INT("safiles not made", procName, 1);
    nfiles = sarrayGetCount(safiles);

        /* Classify components */
    classer = jbCorrelationInit(components, 0, 0, thresh, weight);
    if (!classer) {
        sarrayDestroy(&safiles);
        return ERROR_INT("classer not made", procName, 1);
    }
    jbAddPages(classer, safiles);

        /* Save data */
    data = jbDataSave(classer);
    if (!data) {
        sarrayDestroy(&safiles);
        jbClasserDestroy(&classer);
        return ERROR_INT("data not made", procName, 1);
    }
    jbDataWrite(rootname, data);

        /* Optionally, render pages using class templates */
    if (renderflag) {
        pixa = jbDataRender(data, FALSE);
        numpages = pixaGetCount(pixa);
        if (numpages != nfiles)
            fprintf(stderr, "numpages = %d, nfiles = %d, not equal!\n",
                    numpages, nfiles);
        for (i = 0; i < numpages; i++) {
            pix = pixaGetPix(pixa, i, L_CLONE);
            snprintf(filename, L_BUF_SIZE, "%s.%05d", rootname, i);
            fprintf(stderr, "filename: %s\n", filename);
            pixWrite(filename, pix, IFF_PNG);
            pixDestroy(&pix);
        }
        pixaDestroy(&pixa);
    }

    sarrayDestroy(&safiles);
    jbClasserDestroy(&classer);
    jbDataDestroy(&data);
    return 0;
}
/*!
 * \brief   sarraySortByIndex()
 *
 * \param[in]    sain      input string array
 * \param[in]    naindex   for each output position, the index in sain
 *                         of the string to place there
 * \return  saout   reordered copy of the strings, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The number of output slots visited equals the count of
 *          sain; slot i receives a copy of sain[naindex[i]].
 * </pre>
 */
SARRAY *
sarraySortByIndex(SARRAY  *sain,
                  NUMA    *naindex)
{
    char    *copy;
    l_int32  count, pos, src;
    SARRAY  *sorted;

    PROCNAME("sarraySortByIndex");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
    if (!naindex)
        return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL);

    count = sarrayGetCount(sain);
    sorted = sarrayCreate(count);
    pos = 0;
    while (pos < count) {
            /* Look up the source index, then hand a fresh copy of that
             * string over to the output array. */
        numaGetIValue(naindex, pos, &src);
        copy = sarrayGetString(sain, src, L_COPY);
        sarrayAddString(sorted, copy, L_INSERT);
        pos++;
    }

    return sorted;
}
/*
 *  sarrayConvertFilesToPS()
 *
 *      Input:  sarray  (of full path names)
 *              res  (typ. 300 or 600 ppi)
 *              fileout  (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesToPS()
 *      (2) A non-positive res is replaced by 300 ppi; a warning is
 *          issued if res is outside [10 ... 4000].
 *      (3) Files whose header cannot be read, or whose format is
 *          IFF_UNKNOWN, are silently skipped.
 *      (4) The info and warning messages end with a newline, in
 *          agreement with the other message calls in this code.
 */
l_int32
sarrayConvertFilesToPS(SARRAY      *sa,
                       l_int32      res,
                       const char  *fileout)
{
    char    *fname;
    l_int32  i, nfiles, index, firstfile, ret, format;

    PROCNAME("sarrayConvertFilesToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);
    if (res <= 0) {
        L_INFO("setting res to 300 ppi\n", procName);
        res = 300;
    }
    if (res < 10 || res > 4000)
        L_WARNING("res is typically in the range 300-600 ppi\n", procName);

    nfiles = sarrayGetCount(sa);
    firstfile = TRUE;  /* passed by address to the writer for each file */
    for (i = 0, index = 0; i < nfiles; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        ret = pixReadHeader(fname, &format, NULL, NULL, NULL, NULL, NULL);
        if (ret) continue;  /* header not readable */
        if (format == IFF_UNKNOWN)
            continue;

        writeImageCompressedToPSFile(fname, fileout, res, &firstfile, &index);
    }

    return 0;
}
/*!
 *  pixReadIndexed()
 *
 *      Input:  sarray  (of full pathnames)
 *              index  (into pathname array)
 *      Return: pix if OK; null if not found
 *
 *  Notes:
 *      (1) Use this to pick image files out of a directory when the
 *          integer @index is embedded in the file name.
 *      (2) The sarray is typically made with
 *          getNumberedPathnamesInDirectory(), so that the pathname at
 *          position @index has the number @index in it.  The size of
 *          the sarray should then be the largest number (plus 1)
 *          appearing in the file names, subject to the constraints
 *          given in the call to getNumberedPathnamesInDirectory().
 *      (3) Some positions may therefore have no corresponding file.
 *          By convention those positions hold the empty C string (""),
 *          and meeting one is not an error: null is returned without
 *          any message.  The caller must check that a pix is returned.
 *      (4) An out-of-bounds index, or a non-empty pathname that cannot
 *          be read, is reported as an error and null is returned.
 *      (5) See convertSegmentedPagesToPS() in src/psio1.c for an
 *          example of usage.
 */
PIX *
pixReadIndexed(SARRAY  *sa,
               l_int32  index)
{
    char    *path;
    l_int32  count;
    PIX     *pix;

    PROCNAME("pixReadIndexed");

    if (!sa)
        return (PIX *)ERROR_PTR("sa not defined", procName, NULL);
    count = sarrayGetCount(sa);
    if (index < 0 || index >= count)
        return (PIX *)ERROR_PTR("index out of bounds", procName, NULL);

    path = sarrayGetString(sa, index, L_NOCOPY);
    if (path[0] == '\0')  /* placeholder: no file for this index */
        return NULL;

    pix = pixRead(path);
    if (!pix)
        L_ERROR("pix not read from file %s\n", procName, path);
    return pix;
}
/*!
 *  pixaReadFilesSA()
 *
 *      Input:  sarray  (full pathnames for all files)
 *      Return: pixa, or null on error
 *
 *  Notes:
 *      (1) A file that cannot be read produces a warning and is
 *          skipped, so the pixa may hold fewer images than there
 *          are pathnames.
 */
PIXA *
pixaReadFilesSA(SARRAY  *sa)
{
    char    *path;
    l_int32  count, k;
    PIX     *pix;
    PIXA    *pixa;

    PROCNAME("pixaReadFilesSA");

    if (!sa)
        return (PIXA *)ERROR_PTR("sa not defined", procName, NULL);

    count = sarrayGetCount(sa);
    pixa = pixaCreate(count);
    for (k = 0; k < count; k++) {
        path = sarrayGetString(sa, k, L_NOCOPY);
        pix = pixRead(path);
        if (pix != NULL)
            pixaAddPix(pixa, pix, L_INSERT);
        else
            L_WARNING("pix not read from file %s\n", procName, path);
    }

    return pixa;
}
/*
 *  getNextNonCommentLine()
 *
 *      Input:  sa  (output from cpp, by line)
 *              start  (index of the first line to examine)
 *              &next  (<return> index of the first line, at or after
 *                      start, that is not a '#' comment line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) A comment line is one whose first character is '#'.
 *          Consecutive comment lines are skipped, beginning at 'start'.
 *      (2) If every line from 'start' to the end is a comment,
 *          next = -1 is returned.
 */
static l_int32
getNextNonCommentLine(SARRAY   *sa,
                      l_int32   start,
                      l_int32  *pnext)
{
    char    *line;
    l_int32  idx, nlines;

    PROCNAME("getNextNonCommentLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default result: nothing but comments from here on */
    *pnext = -1;

    nlines = sarrayGetCount(sa);
    idx = start;
    while (idx < nlines) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen",
                             procName, 1);
        if (line[0] != '#') {
            *pnext = idx;
            break;
        }
        idx++;
    }

    return 0;
}
/*!
 *  sarrayAddString()
 *
 *      Input:  sarray
 *              string  (string to be added)
 *              copyflag  (L_INSERT, L_COPY)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Legacy usage decrees that we always use 0 to insert a string
 *          directly and 1 to insert a copy of the string.  The
 *          enums for L_INSERT and L_COPY agree with this convention,
 *          and will not change in the future.
 *      (2) See usage comments at the top of this file.
 *      (3) With L_INSERT the sarray stores the caller's pointer itself;
 *          with L_COPY it stores the result of stringNew().
 *      (4) On any error the sarray is left with its count unchanged.
 *          In particular, if the pointer array is full and cannot be
 *          extended, or if the copy cannot be made, nothing is stored.
 */
l_int32
sarrayAddString(SARRAY  *sa,
                char    *string,
                l_int32  copyflag)
{
    char    *entry;
    l_int32  n;

    PROCNAME("sarrayAddString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!string)
        return ERROR_INT("string not defined", procName, 1);
    if (copyflag != L_INSERT && copyflag != L_COPY)
        return ERROR_INT("invalid copyflag", procName, 1);

        /* Make room before storing; writing to array[n] after a failed
         * extension would be out of bounds.
         * NOTE(review): assumes sarrayExtendArray() returns nonzero on
         * failure, like the other functions in this file -- confirm. */
    n = sarrayGetCount(sa);
    if (n >= sa->nalloc && sarrayExtendArray(sa))
        return ERROR_INT("extension failed", procName, 1);

    entry = (copyflag == L_INSERT) ? string : stringNew(string);
    if (!entry)  /* only possible when the copy could not be made */
        return ERROR_INT("string copy not made", procName, 1);

    sa->array[n] = entry;
    sa->n++;
    return 0;
}
/*!
 * \brief   l_dnaHashCreateFromSarray()
 *
 * \param[in]    sa   string array to index
 * \return  dahash   the table returned by l_dnaHashCreate(), populated
 *                   with one entry per string
 *
 * <pre>
 * Notes:
 *      (1) Each string is hashed to a 64-bit key by
 *          l_hashStringToUint64().  The key is meant to randomize the
 *          lower bits that select the bucket.
 *      (2) The value stored with each key is the index of the string
 *          in %sa.  Keeping the index enables operations that check
 *          for duplicates.
 *      (3) The number of buckets is the prime returned by
 *          findNextLargerPrime() for n / 20; about 20 entries per
 *          bucket is taken to be roughly optimal.
 *      (4) %sa is not checked for NULL here, and the result of
 *          l_dnaHashCreate() is returned without being checked.
 * </pre>
 */
L_DNAHASH *
l_dnaHashCreateFromSarray(SARRAY  *sa)
{
    char       *item;
    l_int32     count, pos;
    l_uint32    nbuckets;
    l_uint64    hashval;
    L_DNAHASH  *table;

        /* Size the table from the number of strings */
    count = sarrayGetCount(sa);
    findNextLargerPrime(count / 20, &nbuckets);
    table = l_dnaHashCreate(nbuckets, 8);

        /* Enter (hash of string) --> (index of string) for every string */
    pos = 0;
    while (pos < count) {
        item = sarrayGetString(sa, pos, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        l_dnaHashAdd(table, hashval, (l_float64)pos);
        pos++;
    }

    return table;
}
/*
 *  getNextNonBlankLine()
 *
 *      Input:  sa  (output from cpp, by line)
 *              start  (index of the first line to examine)
 *              &next  (<return> index of the first line, at or after
 *                      start, that is not blank)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Consecutive blank lines are skipped, beginning at 'start'.
 *      (2) A line is blank if it is empty or holds nothing but the
 *          whitespace characters ' ', '\t', '\n' and '\r'.
 *      (3) If every line from 'start' to the end is blank,
 *          next = -1 is returned.
 */
static l_int32
getNextNonBlankLine(SARRAY   *sa,
                    l_int32   start,
                    l_int32  *pnext)
{
    char    *line, *p;
    l_int32  idx, nlines;

    PROCNAME("getNextNonBlankLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default result: nothing but blank lines from here on */
    *pnext = -1;

    nlines = sarrayGetCount(sa);
    for (idx = start; idx < nlines; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen",
                             procName, 1);

            /* Walk the line; the first non-whitespace character
             * settles the answer. */
        for (p = line; *p != '\0'; p++) {
            switch (*p) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                continue;
            default:
                *pnext = idx;
                return 0;
            }
        }
    }

    return 0;
}
/*!
 * \brief   l_asetCreateFromSarray()
 *
 * \param[in]    sa   string array
 * \return  set   of type L_UINT_TYPE, holding one key per string, or
 *                NULL if %sa is not defined
 *
 * <pre>
 * Notes:
 *      (1) The key for each string is the 64-bit value produced by
 *          l_hashStringToUint64(), stored in the utype field.
 *      (2) Every string in %sa is passed to l_asetInsert(); no
 *          separate check for duplicates is made here.
 * </pre>
 */
L_ASET *
l_asetCreateFromSarray(SARRAY  *sa)
{
    char      *item;
    l_int32    count, pos;
    l_uint64   hashval;
    L_ASET    *set;
    RB_TYPE    key;

    PROCNAME("l_asetCreateFromSarray");

    if (!sa)
        return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL);

    set = l_asetCreate(L_UINT_TYPE);
    count = sarrayGetCount(sa);
    pos = 0;
    while (pos < count) {
        item = sarrayGetString(sa, pos, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        key.utype = hashval;
        l_asetInsert(set, key);
        pos++;
    }

    return set;
}
/*
 *  getNextNonDoubleSlashLine()
 *
 *      Input:  sa  (output from cpp, by line)
 *              start  (index of the first line to examine)
 *              &next  (<return> index of the first line, at or after
 *                      start, that does not begin with "//")
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Consecutive lines beginning with "//" are skipped,
 *          starting at 'start'.  Lines shorter than two characters
 *          never count as "//" lines.
 *      (2) If every line from 'start' to the end begins with "//",
 *          next = -1 is returned.
 */
static l_int32
getNextNonDoubleSlashLine(SARRAY   *sa,
                          l_int32   start,
                          l_int32  *pnext)
{
    char    *line;
    l_int32  idx, nlines, is_slash_line;

    PROCNAME("getNextNonDoubleSlashLine");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&pnext not defined", procName, 1);

        /* Default result: nothing but "//" lines from here on */
    *pnext = -1;

    nlines = sarrayGetCount(sa);
    idx = start;
    while (idx < nlines) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen",
                             procName, 1);

            /* The short-circuit keeps the second test from reading
             * beyond the terminator of an empty line. */
        is_slash_line = (line[0] == '/' && line[1] == '/');
        if (!is_slash_line) {
            *pnext = idx;
            break;
        }
        idx++;
    }

    return 0;
}
/*!
 * \brief   sarrayRemoveDupsByAset()
 *
 * \param[in]    sas   input string array
 * \return  sad   copies of the strings of %sas with duplicates removed,
 *                or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHash() for large string arrays.
 *      (2) Each string is represented in the set by its 64-bit hash.
 *      (3) The input is traversed once.  For each string the set is
 *          searched for its key; if the key is absent, the string is
 *          copied to the output and the key is put in the set.  The
 *          first instance of each string is therefore the one kept,
 *          and the order of first instances is preserved.
 * </pre>
 */
SARRAY *
sarrayRemoveDupsByAset(SARRAY  *sas)
{
    char      *item;
    l_int32    count, pos;
    l_uint64   hashval;
    L_ASET    *seen;
    RB_TYPE    key;
    SARRAY    *sad;

    PROCNAME("sarrayRemoveDupsByAset");

    if (!sas)
        return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL);

    seen = l_asetCreate(L_UINT_TYPE);
    sad = sarrayCreate(0);
    count = sarrayGetCount(sas);
    for (pos = 0; pos < count; pos++) {
        item = sarrayGetString(sas, pos, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        key.utype = hashval;
        if (l_asetFind(seen, key))
            continue;  /* already in the output */
        sarrayAddString(sad, item, L_COPY);
        l_asetInsert(seen, key);
    }

    l_asetDestroy(&seen);
    return sad;
}
/* * convertSegmentedPagesToPS() * * Input: pagedir (input page image directory) * pagestr (<optional> substring filter on page filenames; * can be NULL) * page_numpre (number of characters in page name before number) * maskdir (input mask image directory) * maskstr (<optional> substring filter on mask filenames; * can be NULL) * mask_numpre (number of characters in mask name before number) * numpost (number of characters in names after number) * maxnum (only consider page numbers up to this value) * textscale (scale of text output relative to pixs) * imagescale (scale of image output relative to pixs) * threshold (for binarization; typ. about 190; 0 for default) * fileout (output ps file) * Return: 0 if OK, 1 on error * * Notes: * (1) This generates a PS file for all page image and mask files in two * specified directories and that contain the page numbers as * specified below. The two directories can be the same, in which * case the page and mask files are differentiated by the two * substrings for string matches. * (2) The page images are taken in lexicographic order. * Mask images whose numbers match the page images are used to * segment the page images. Page images without a matching * mask image are scaled, thresholded and rendered entirely as text. * (3) Each PS page is generated as a compressed representation of * the page image, where the part of the image under the mask * is suitably scaled and compressed as DCT (i.e., jpeg), and * the remaining part of the page is suitably scaled, thresholded, * compressed as G4 (i.e., tiff g4), and rendered by painting * black through the resulting text mask. * (4) The scaling is typically 2x down for the DCT component * (@imagescale = 0.5) and 2x up for the G4 component * (@textscale = 2.0). * (5) The resolution is automatically set to fit to a * letter-size (8.5 x 11 inch) page. * (6) Both the DCT and the G4 encoding are PostScript level 2. 
* (7) It is assumed that the page number is contained within * the basename (the filename without directory or extension). * @page_numpre is the number of characters in the page basename * preceding the actual page number; @mask_numpre is likewise for * the mask basename; @numpost is the number of characters * following the page number. For example, for mask name * mask_006.tif, mask_numpre = 5 ("mask_). * (8) To render a page as is -- that is, with no thresholding * of any pixels -- use a mask in the mask directory that is * full size with all pixels set to 1. If the page is 1 bpp, * it is not necessary to have a mask. */ l_int32 convertSegmentedPagesToPS(const char *pagedir, const char *pagestr, l_int32 page_numpre, const char *maskdir, const char *maskstr, l_int32 mask_numpre, l_int32 numpost, l_int32 maxnum, l_float32 textscale, l_float32 imagescale, l_int32 threshold, const char *fileout) { l_int32 pageno, i, npages; PIX *pixs, *pixm; SARRAY *sapage, *samask; PROCNAME("convertSegmentedPagesToPS"); if (!pagedir) return ERROR_INT("pagedir not defined", procName, 1); if (!maskdir) return ERROR_INT("maskdir not defined", procName, 1); if (!fileout) return ERROR_INT("fileout not defined", procName, 1); if (threshold <= 0) { L_INFO("setting threshold to 190\n", procName); threshold = 190; } /* Get numbered full pathnames; max size of sarray is maxnum */ sapage = getNumberedPathnamesInDirectory(pagedir, pagestr, page_numpre, numpost, maxnum); samask = getNumberedPathnamesInDirectory(maskdir, maskstr, mask_numpre, numpost, maxnum); sarrayPadToSameSize(sapage, samask, (char *)""); if ((npages = sarrayGetCount(sapage)) == 0) { sarrayDestroy(&sapage); sarrayDestroy(&samask); return ERROR_INT("no matching pages found", procName, 1); } /* Generate the PS file */ pageno = 1; for (i = 0; i < npages; i++) { if ((pixs = pixReadIndexed(sapage, i)) == NULL) continue; pixm = pixReadIndexed(samask, i); pixWriteSegmentedPageToPS(pixs, pixm, textscale, imagescale, threshold, pageno, 
fileout); pixDestroy(&pixs); pixDestroy(&pixm); pageno++; } sarrayDestroy(&sapage); sarrayDestroy(&samask); return 0; }
/*!
 *  sarraySelectBySubstring()
 *
 *      Input:  sain  (input sarray)
 *              substr  (<optional> substring for matching; can be NULL)
 *      Return: saout  (output sarray, filtered with substring), or
 *                     null on error
 *
 *  Notes:
 *      (1) The output holds copies of those strings in sain that
 *          contain substr.  The match may occur anywhere within the
 *          string, which is why strncmp() cannot be used.
 *      (2) If substr == NULL, or if sain is empty, a copy of sain
 *          is returned.
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
    char    *item;
    l_int32  count, pos, offset, found;
    size_t   sublen;
    SARRAY  *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    count = sarrayGetCount(sain);
    if (!substr || count == 0)
        return sarrayCopy(sain);

    saout = sarrayCreate(count);
    sublen = strlen(substr);  /* same for every string tested */
    pos = 0;
    while (pos < count) {
        item = sarrayGetString(sain, pos, L_NOCOPY);
        arrayFindSequence((l_uint8 *)item, strlen(item),
                          (l_uint8 *)substr, sublen, &offset, &found);
        if (found)
            sarrayAddString(saout, item, L_COPY);
        pos++;
    }

    return saout;
}
/*
 *  skipToMatchingBrace()
 *
 *      Input:  sa  (output from cpp, by line)
 *              start  (index of the line holding the left brace)
 *              lbindex  (char index of the left brace in that line)
 *              &stop  (<return> index of the line holding the matching
 *                      right brace)
 *              &rbindex  (<return> char index of the matching right
 *                         brace in that line)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Scanning begins at the character after the left brace and
 *          keeps a running count of unmatched left braces, starting
 *          at 1.  The match is the right brace that brings the count
 *          to 0.
 *      (2) A brace is ignored if it lies inside a double-quoted string
 *          or if the character after it is a single quote (i.e., it is
 *          a character literal).  The in-string state is toggled by
 *          each double quote that is not preceded by a backslash, and
 *          it carries over from one line to the next.
 *      (3) If the matching right brace is not found, stop = -1 and
 *          rbindex is not written; this is returned as an error.
 *          It shouldn't happen.
 */
static l_int32
skipToMatchingBrace(SARRAY   *sa,
                    l_int32   start,
                    l_int32   lbindex,
                    l_int32  *pstop,
                    l_int32  *prbindex)
{
    char     c;
    char    *line;
    l_int32  row, col, firstcol, nrows, ncols, depth, instring;

    PROCNAME("skipToMatchingBrace");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pstop)
        return ERROR_INT("&stop not defined", procName, 1);
    if (!prbindex)
        return ERROR_INT("&rbindex not defined", procName, 1);

    *pstop = -1;
    instring = 0;  /* not inside a string at the outset */
    depth = 1;     /* the left brace at (start, lbindex) */
    nrows = sarrayGetCount(sa);
    for (row = start; row < nrows; row++) {
        line = sarrayGetString(sa, row, L_NOCOPY);
        firstcol = (row == start) ? lbindex + 1 : 0;
        ncols = strlen(line);
        for (col = firstcol; col < ncols; col++) {
            c = line[col];

                /* An unescaped double quote flips the in-string state.
                 * The first character scanned on a line has no
                 * predecessor to test, so it always flips. */
            if (c == '\"') {
                if (col == firstcol || line[col - 1] != '\\')
                    instring = 1 - instring;
                continue;
            }

                /* Nothing is counted inside a string, nor when the
                 * next character is a single quote.  On the last
                 * character of the line, the next character is the
                 * terminator. */
            if (instring || line[col + 1] == '\'')
                continue;

            if (c == '{') {
                depth++;
            } else if (c == '}') {
                depth--;
                if (depth == 0) {
                    *prbindex = col;
                    *pstop = row;
                    return 0;
                }
            }
        }
    }

    return ERROR_INT("matching right brace not found", procName, 1);
}
/*!
 * \brief   sarrayIntersectionByAset()
 *
 * \param[in]    sa1, sa2   string arrays to intersect
 * \return  sad   strings common to both inputs, each appearing once,
 *                or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Algorithm: the larger sarray is put into a first set, with
 *          the 64-bit string hashes as keys.  The smaller sarray is
 *          then traversed while a second set is built from it: when
 *          the key of a string is in the first set but not yet in the
 *          second, the string is copied to the output and its key is
 *          put into the second set.
 *      (2) The second set is what guarantees that only one instance of
 *          each string reaches the output.
 *      (3) When both arrays have the same count, sa1 is treated as the
 *          larger one.
 *      (4) Output strings are in order of first occurrence in the
 *          smaller array.
 * </pre>
 */
SARRAY *
sarrayIntersectionByAset(SARRAY  *sa1,
                         SARRAY  *sa2)
{
    char      *item;
    l_int32    count1, count2, nsmall, pos;
    l_uint64   hashval;
    L_ASET    *set_big, *set_seen;
    RB_TYPE    key;
    SARRAY    *sa_small, *sa_big, *sad;

    PROCNAME("sarrayIntersectionByAset");

    if (!sa1)
        return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL);
    if (!sa2)
        return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL);

        /* Decide which input is traversed and which goes into the set.
         * Both pointers are borrowed; neither array is destroyed here. */
    count1 = sarrayGetCount(sa1);
    count2 = sarrayGetCount(sa2);
    if (count1 < count2) {
        sa_small = sa1;
        sa_big = sa2;
    } else {
        sa_small = sa2;
        sa_big = sa1;
    }
    set_big = l_asetCreateFromSarray(sa_big);

        /* Collect each common string the first time it is met */
    sad = sarrayCreate(0);
    nsmall = sarrayGetCount(sa_small);
    set_seen = l_asetCreate(L_UINT_TYPE);
    for (pos = 0; pos < nsmall; pos++) {
        item = sarrayGetString(sa_small, pos, L_NOCOPY);
        l_hashStringToUint64(item, &hashval);
        key.utype = hashval;
        if (!l_asetFind(set_big, key))
            continue;  /* not in the larger array */
        if (l_asetFind(set_seen, key))
            continue;  /* already in the output */
        sarrayAddString(sad, item, L_COPY);
        l_asetInsert(set_seen, key);
    }

    l_asetDestroy(&set_big);
    l_asetDestroy(&set_seen);
    return sad;
}
/*
 *  getOffsetForCharacter()
 *
 *      Input:  sa  (output from cpp, by line)
 *              start  (index in sa of the first string to search;
 *                      never a comment line)
 *              tchar  (the character to look for)
 *              &soffset  (<return> number of strings, counted from
 *                         'start', to the string holding tchar)
 *              &boffset  (<return> byte offset of tchar within the
 *                         string holding it)
 *              &toffset  (<return> total byte offset of tchar, counted
 *                         from the beginning of the string at 'start')
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) The search is for the first instance of 'tchar', beginning
 *          with the first character of the string indexed by start.
 *      (2) toffset is the sum of the lengths of the strings before
 *          the one holding tchar, plus boffset.
 *      (3) If the character is not found, soffset is returned as -1
 *          and the other two offsets as 100000000.  The caller must
 *          check the value of soffset.
 *      (4) This is only used where it does not matter whether the
 *          character lies inside a string.
 */
static l_int32
getOffsetForCharacter(SARRAY   *sa,
                      l_int32   start,
                      char      tchar,
                      l_int32  *psoffset,
                      l_int32  *pboffset,
                      l_int32  *ptoffset)
{
    char    *line;
    l_int32  row, col, nrows, ncols, preceding;

    PROCNAME("getOffsetForCharacter");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!psoffset)
        return ERROR_INT("&soffset not defined", procName, 1);
    if (!pboffset)
        return ERROR_INT("&boffset not defined", procName, 1);
    if (!ptoffset)
        return ERROR_INT("&toffset not defined", procName, 1);

        /* Values returned when the character is not found */
    *psoffset = -1;
    *pboffset = 100000000;
    *ptoffset = 100000000;

    nrows = sarrayGetCount(sa);
    preceding = 0;  /* bytes in the strings already searched */
    for (row = start; row < nrows; row++) {
        line = sarrayGetString(sa, row, L_NOCOPY);
        if (line == NULL)
            return ERROR_INT("str not returned; shouldn't happen",
                             procName, 1);
        ncols = strlen(line);
        for (col = 0; col < ncols; col++) {
            if (line[col] != tchar)
                continue;
            *psoffset = row - start;
            *pboffset = col;
            *ptoffset = preceding + col;
            return 0;
        }
        preceding += ncols;
    }

    return 0;
}
int main(int argc, char **argv) { char *dirin, *dirout, *infile, *outfile, *tail; l_int32 i, nfiles, border, x, y, w, h, xb, yb, wb, hb; BOX *box1, *box2; BOXA *boxa1, *boxa2; PIX *pixs, *pixt1, *pixd; SARRAY *safiles; static char mainName[] = "croptext"; if (argc != 4) return ERROR_INT("Syntax: croptext dirin border dirout", mainName, 1); dirin = argv[1]; border = atoi(argv[2]); dirout = argv[3]; setLeptDebugOK(1); safiles = getSortedPathnamesInDirectory(dirin, NULL, 0, 0); nfiles = sarrayGetCount(safiles); for (i = 0; i < nfiles; i++) { infile = sarrayGetString(safiles, i, L_NOCOPY); splitPathAtDirectory(infile, NULL, &tail); outfile = genPathname(dirout, tail); pixs = pixRead(infile); pixt1 = pixMorphSequence(pixs, "r11 + c10.40 + o5.5 + x4", 0); boxa1 = pixConnComp(pixt1, NULL, 8); if (boxaGetCount(boxa1) == 0) { fprintf(stderr, "Warning: no components on page %s\n", tail); continue; } boxa2 = boxaSort(boxa1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL); box1 = boxaGetBox(boxa2, 0, L_CLONE); boxGetGeometry(box1, &x, &y, &w, &h); xb = L_MAX(0, x - border); yb = L_MAX(0, y - border); wb = w + 2 * border; hb = h + 2 * border; box2 = boxCreate(xb, yb, wb, hb); pixd = pixClipRectangle(pixs, box2, NULL); pixWrite(outfile, pixd, IFF_TIFF_G4); pixDestroy(&pixs); pixDestroy(&pixt1); pixDestroy(&pixd); boxaDestroy(&boxa1); boxaDestroy(&boxa2); } return 0; }
/* * cleanProtoSignature() * * Input: instr (input prototype string) * Return: cleanstr (clean prototype string), or NULL on error * * Notes: * (1) Adds 'extern' at beginning and regularizes spaces * between tokens. */ static char * cleanProtoSignature(char *instr) { char *str, *cleanstr; char buf[L_BUF_SIZE]; char externstring[] = "extern"; l_int32 i, j, nwords, nchars, index, len; SARRAY *sa, *saout; PROCNAME("cleanProtoSignature"); if (!instr) return (char *)ERROR_PTR("instr not defined", procName, NULL); sa = sarrayCreateWordsFromString(instr); nwords = sarrayGetCount(sa); saout = sarrayCreate(0); sarrayAddString(saout, externstring, 1); for (i = 0; i < nwords; i++) { str = sarrayGetString(sa, i, 0); nchars = strlen(str); index = 0; for (j = 0; j < nchars; j++) { if (index > L_BUF_SIZE - 6) return (char *)ERROR_PTR("token too large", procName, NULL); if (str[j] == '(') { buf[index++] = ' '; buf[index++] = '('; buf[index++] = ' '; } else if (str[j] == ')') { buf[index++] = ' '; buf[index++] = ')'; } else buf[index++] = str[j]; } buf[index] = '\0'; sarrayAddString(saout, buf, 1); } /* Flatten to a prototype string with spaces added after * each word, and remove the last space */ cleanstr = sarrayToString(saout, 2); len = strlen(cleanstr); cleanstr[len - 1] = '\0'; sarrayDestroy(&sa); sarrayDestroy(&saout); return cleanstr; }
/*
 *  sarrayConvertFilesFittedToPS()
 *
 *      Input:  sarray (of full path names)
 *              xpts, ypts (desired size in printer points; use 0 for default)
 *              fileout (output ps file)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) See convertFilesFittedToPS()
 *      (2) A non-positive xpts is replaced by 612.0 and a non-positive
 *          ypts by 792.0.  A warning is issued if either value then
 *          lies outside [100, 2000]; processing continues regardless.
 *      (3) A file whose header cannot be read, or whose format is
 *          IFF_UNKNOWN, is skipped without any message from this
 *          function.
 *      (4) For each remaining file the resolution is w * 72 / xpts when
 *          xpts * h < ypts * w, and h * 72 / ypts otherwise, truncated
 *          to an integer.  Taking the resolution as pixels per inch,
 *          this makes the more constraining dimension span exactly the
 *          requested number of points.
 *      (5) Once the arguments are accepted the function returns 0; the
 *          result of writing each file is not inspected.
 */
l_int32
sarrayConvertFilesFittedToPS(SARRAY      *sa,
                             l_float32    xpts,
                             l_float32    ypts,
                             const char  *fileout)
{
char    *path;
l_int32  k, count, imgw, imgh, fmt, resolution, pageindex, isfirst;

    PROCNAME("sarrayConvertFilesFittedToPS");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", procName, 1);

    if (xpts <= 0.0) {
        L_INFO("setting xpts to 612.0", procName);
        xpts = 612.0;
    }
    if (ypts <= 0.0) {
        L_INFO("setting ypts to 792.0", procName);
        ypts = 792.0;
    }
    if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
        L_WARNING("xpts,ypts are typically in the range 500-800", procName);

    count = sarrayGetCount(sa);
    isfirst = TRUE;
    pageindex = 0;
    for (k = 0; k < count; k++) {
        path = sarrayGetString(sa, k, L_NOCOPY);
        if (pixReadHeader(path, &fmt, &imgw, &imgh, NULL, NULL, NULL) ||
            fmt == IFF_UNKNOWN)
            continue;

            /* Be sure the entire image is wrapped */
        resolution = (xpts * imgh < ypts * imgw)
                         ? (l_int32)((l_float32)imgw * 72.0 / xpts)
                         : (l_int32)((l_float32)imgh * 72.0 / ypts);

        writeImageCompressedToPSFile(path, fileout, resolution,
                                     &isfirst, &pageindex);
    }

    return 0;
}
/*!
 * \brief   strcodeCreateFromFile()
 *
 * \param[in]    filein   containing filenames of serialized data
 * \param[in]    fileno   integer that labels the two output files
 * \param[in]    outdir   [optional] if null, files are made in /tmp/lept/auto
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The %filein has one filename on each line.
 *          Comment lines begin with "#".
 *      (2) The output is 2 files:
 *             autogen.\<fileno\>.c
 *             autogen.\<fileno\>.h
 *      (3) A listed file whose type is not recognized is reported with
 *          L_ERROR and skipped; it does not make the function fail.
 *      (4) The list of filenames is held in a temporary sarray that is
 *          destroyed before returning, on the success path as well as
 *          on the error paths.
 * </pre>
 */
l_int32
strcodeCreateFromFile(const char  *filein,
                      l_int32      fileno,
                      const char  *outdir)
{
char        *fname;
const char  *type;
l_uint8     *data;
size_t       nbytes;
l_int32      i, n, index;
SARRAY      *sa;
L_STRCODE   *strcode;

    PROCNAME("strcodeCreateFromFile");

    if (!filein)
        return ERROR_INT("filein not defined", procName, 1);

        /* Read the whole list and split it into lines; the raw
         * buffer is not needed once the sarray has been made. */
    if ((data = l_binaryRead(filein, &nbytes)) == NULL)
        return ERROR_INT("data not read from file", procName, 1);
    sa = sarrayCreateLinesFromString((char *)data, 0);
    LEPT_FREE(data);
    if (!sa)
        return ERROR_INT("sa not made", procName, 1);
    if ((n = sarrayGetCount(sa)) == 0) {
        sarrayDestroy(&sa);
        return ERROR_INT("no filenames in the file", procName, 1);
    }

    strcode = strcodeCreate(fileno);

    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (fname[0] == '#') continue;  /* comment line */
        if (l_getIndexFromFile(fname, &index)) {
            L_ERROR("File %s has no recognizable type\n", procName, fname);
        } else {
            type = l_assoc[index].type;
            L_INFO("File %s is type %s\n", procName, fname, type);
            strcodeGenerate(strcode, fname, type);
        }
    }
    strcodeFinalize(&strcode, outdir);

        /* fname pointed into sa (L_NOCOPY), so sa can only go now */
    sarrayDestroy(&sa);
    return 0;
}
/*!
 * \brief   sarrayRemoveDupsByHash()
 *
 * \param[in]    sas       input array, possibly with repeated strings
 * \param[out]   psad      new array holding each distinct string once
 * \param[out]   pdahash   [optional] dnahash that was built for lookup
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Strings are taken from %sas in order.  A string is copied to
 *          the output only when sarrayFindStringByHash() reports that
 *          it is not already there, so the output holds the distinct
 *          strings in order of first appearance.
 *      (2) The dnahash grows together with the output: for each string
 *          added, its 64-bit hash is stored as the key and its index
 *          in the output array as the value.  If %pdahash is given, the
 *          caller receives the hash and owns it; it can then be used
 *          with the output array for later lookups:
 *              sarrayFindStringByHash(sad, dahash, str, \&index)
 *      (3) The number of buckets is obtained from findNextLargerPrime()
 *          applied to n / 20, where n is the number of input strings.
 *      (4) Both output pointers are set to NULL before the arguments
 *          are validated.
 * </pre>
 */
l_int32
sarrayRemoveDupsByHash(SARRAY      *sas,
                       SARRAY     **psad,
                       L_DNAHASH  **pdahash)
{
char       *s;
l_int32     k, total, pos, nunique;
l_uint32    nbuckets;
l_uint64    hashkey;
SARRAY     *sad;
L_DNAHASH  *dahash;

    PROCNAME("sarrayRemoveDupsByHash");

    if (pdahash) *pdahash = NULL;
    if (!psad)
        return ERROR_INT("&sad not defined", procName, 1);
    *psad = NULL;
    if (!sas)
        return ERROR_INT("sas not defined", procName, 1);

    total = sarrayGetCount(sas);
    findNextLargerPrime(total / 20, &nbuckets);  /* buckets in hash table */
    dahash = l_dnaHashCreate(nbuckets, 8);
    sad = sarrayCreate(total);
    *psad = sad;

    nunique = 0;  /* strings placed in sad so far */
    for (k = 0; k < total; k++) {
        s = sarrayGetString(sas, k, L_NOCOPY);
        sarrayFindStringByHash(sad, dahash, s, &pos);
        if (pos >= 0)
            continue;  /* already in the output */

            /* New string: record its output index under its hash */
        l_hashStringToUint64(s, &hashkey);
        l_dnaHashAdd(dahash, hashkey, (l_float64)nunique);
        sarrayAddString(sad, s, L_COPY);
        nunique++;
    }

        /* Hand the hash to the caller if wanted; otherwise drop it */
    if (!pdahash)
        l_dnaHashDestroy(&dahash);
    else
        *pdahash = dahash;
    return 0;
}
/*!
 *  getSortedPathnamesInDirectory()
 *
 *      Input:  directory name
 *              substr (<optional> substring filter on filenames; can be NULL)
 *              firstpage (0-based)
 *              npages (use 0 for all to the end)
 *      Return: sarray of sorted pathnames, or NULL on error
 *
 *  Notes:
 *      (1) The filenames in the directory are first filtered with
 *          sarraySelectBySubstring(); with a NULL 'substr' nothing is
 *          filtered out.
 *      (2) If no filenames remain, a warning is issued and the filtered
 *          array is returned as it is.
 *      (3) Otherwise the filenames are sorted in place in increasing
 *          lexical order, and the window [firstpage ... lastpage] of
 *          the sorted list is selected:
 *            * firstpage is clamped to [0, nfiles - 1]
 *            * npages == 0 selects everything from firstpage to the end
 *            * lastpage is firstpage + npages - 1, limited to nfiles - 1
 *      (4) Each selected filename is joined to the directory with
 *          genPathname(), and the resulting string is inserted into the
 *          output array, which takes ownership of it (L_INSERT).
 */
SARRAY *
getSortedPathnamesInDirectory(const char  *dirname,
                              const char  *substr,
                              l_int32      firstpage,
                              l_int32      npages)
{
char    *tail, *path;
l_int32  k, count, first, last;
SARRAY  *saall, *safiles, *saout;

    PROCNAME("getSortedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    saall = getFilenamesInDirectory(dirname);
    if (saall == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    safiles = sarraySelectBySubstring(saall, substr);
    sarrayDestroy(&saall);
    count = sarrayGetCount(safiles);
    if (count == 0) {
        L_WARNING("no files found", procName);
        return safiles;
    }

    sarraySort(safiles, safiles, L_SORT_INCREASING);

        /* Clamp the first index into the valid range */
    first = firstpage;
    if (first < 0)
        first = 0;
    if (first > count - 1)
        first = count - 1;

        /* Resolve the last index of the window */
    if (npages == 0)
        npages = count - first;
    last = first + npages - 1;
    if (last > count - 1)
        last = count - 1;

    saout = sarrayCreate(last - first + 1);
    for (k = first; k <= last; k++) {
        tail = sarrayGetString(safiles, k, L_NOCOPY);
        path = genPathname(dirname, tail);
        sarrayAddString(saout, path, L_INSERT);
    }

    sarrayDestroy(&safiles);
    return saout;
}
/*!
 *  sarraySort()
 *
 *      Input:  saout (output sarray; can be NULL or equal to sain)
 *              sain (input sarray)
 *              sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
 *      Return: saout (output sarray, sorted by ascii value), or null on error
 *
 *  Notes:
 *      (1) Set saout = sain to sort in place.  Set saout = NULL to leave
 *          sain untouched and get a sorted copy made with sarrayCopy().
 *          Any other saout is an error.
 *      (2) Shell sort, modified from K&R, 2nd edition, p.62.  The gap
 *          starts at n / 2 and is halved until it reaches 0.
 *      (3) Only the string pointers in the array are exchanged; the
 *          strings themselves are not copied or changed.
 *      (4) Two entries a gap apart are exchanged when
 *          stringCompareLexical() returns nonzero for them, taken in
 *          array order for increasing sort and in reversed order for
 *          decreasing sort.  The inner scan always continues down to
 *          the start of the array; it does not stop at the first pair
 *          that is left in place.
 *      (5) With any other value of sortorder nothing is exchanged, and
 *          the array is returned in its original order.
 */
SARRAY *
sarraySort(SARRAY  *saout,
           SARRAY  *sain,
           l_int32  sortorder)
{
char   **strs;
char    *held;
l_int32  count, step, hi, lo, exchange;

    PROCNAME("sarraySort");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

        /* Either work on a fresh copy or in place; nothing else */
    if (saout == NULL)
        saout = sarrayCopy(sain);
    else if (saout != sain)
        return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
    strs = saout->array;  /* operate directly on the array */
    count = sarrayGetCount(saout);

        /* Shell sort */
    for (step = count / 2; step > 0; step /= 2) {
        for (hi = step; hi < count; hi++) {
            for (lo = hi - step; lo >= 0; lo -= step) {
                if (sortorder == L_SORT_INCREASING)
                    exchange = stringCompareLexical(strs[lo],
                                                    strs[lo + step]);
                else if (sortorder == L_SORT_DECREASING)
                    exchange = stringCompareLexical(strs[lo + step],
                                                    strs[lo]);
                else
                    exchange = 0;
                if (!exchange)
                    continue;

                held = strs[lo];
                strs[lo] = strs[lo + step];
                strs[lo + step] = held;
            }
        }
    }

    return saout;
}
/*
 *  skipToSemicolon()
 *
 *      Input:  sa (output from cpp, by line)
 *              start (index of starting line to search)
 *              charindex (char index in the starting line; the search
 *                         begins at the character just after it)
 *              &next (<return> index of line containing the next ';')
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) On the starting line only the characters after 'charindex'
 *          are examined; every later line is examined from its
 *          beginning.
 *      (2) If no semicolon is found, next is left at -1 and an error
 *          is returned.  This shouldn't happen.
 *      (3) This is only used in contexts where the semicolon is
 *          not within a string.
 */
static l_int32
skipToSemicolon(SARRAY   *sa,
                l_int32   start,
                l_int32   charindex,
                l_int32  *pnext)
{
char    *line;
l_int32  row, col, nrows, len;

    PROCNAME("skipToSemicolon");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pnext)
        return ERROR_INT("&next not defined", procName, 1);
    *pnext = -1;

    nrows = sarrayGetCount(sa);
    for (row = start; row < nrows; row++) {
        line = sarrayGetString(sa, row, L_NOCOPY);
        len = strlen(line);
        col = (row == start) ? charindex + 1 : 0;
        for (; col < len; col++) {
            if (line[col] == ';') {
                *pnext = row;
                return 0;
            }
        }
    }

    return ERROR_INT("semicolon not found", procName, 1);
}
/*!
 *  sarrayConcatenate()
 *
 *      Input:  sa1 (to be added to)
 *              sa2 (append to sa1)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) Every string of sa2 is appended, in order, to the end of sa1.
 *          The strings are copied (L_COPY), so sa2 is left unchanged
 *          and the two arrays share no storage afterwards.
 *      (2) The number of strings to append is read from sa2 once, before
 *          anything is added, so the loop bound does not move while
 *          sa1 grows.
 */
l_int32
sarrayConcatenate(SARRAY  *sa1,
                  SARRAY  *sa2)
{
l_int32  k, count;

    PROCNAME("sarrayConcatenate");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

    count = sarrayGetCount(sa2);
    k = 0;
    while (k < count) {
            /* Borrow the string from sa2; sa1 stores its own copy */
        sarrayAddString(sa1, sarrayGetString(sa2, k, L_NOCOPY), L_COPY);
        k++;
    }

    return 0;
}
PIXA *MakeBootnum2(void) { char *fname; l_int32 i, n, w, h; BOX *box; PIX *pix; PIXA *pixa; L_RECOG *recog; SARRAY *sa; /* Phase 1: generate recog from the digit data */ recog = recogCreate(20, 32, L_USE_ALL, 120, 1); sa = getSortedPathnamesInDirectory("recog/bootnums", "png", 0, 0); n = sarrayGetCount(sa); for (i = 0; i < n; i++) { /* Read each pix: grayscale, multi-character, labelled */ fname = sarrayGetString(sa, i, L_NOCOPY); if ((pix = pixRead(fname)) == NULL) { fprintf(stderr, "Can't read %s\n", fname); continue; } /* Convert to a set of 1 bpp, single character, labelled */ pixGetDimensions(pix, &w, &h, NULL); box = boxCreate(0, 0, w, h); recogTrainLabelled(recog, pix, box, NULL, 1, 0); pixDestroy(&pix); boxDestroy(&box); } recogTrainingFinished(recog, 1); sarrayDestroy(&sa); /* Phase 2: generate pixa consisting of 1 bpp, single character pix */ recogWritePixa("/tmp/lept/recog/digits/bootnum2.pa", recog); pixa = pixaRead("/tmp/lept/recog/digits/bootnum2.pa"); recogDestroy(&recog); return pixa; }
/*!
 *  gplotGenDataFiles()
 *
 *      Input:  gplot
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) For each entry i of gplot->datanames, the i-th string of
 *          gplot->plotdata is written, without its terminating nul, to
 *          a file opened for writing under the i-th data name.
 *      (2) The number of files is taken from gplot->datanames.
 *      (3) Processing stops with an error at the first file that cannot
 *          be opened; files already written are left in place.  The
 *          results of the write and the close are not checked.
 */
l_int32
gplotGenDataFiles(GPLOT  *gplot)
{
char    *text, *path;
l_int32  k, count;
FILE    *stream;

    PROCNAME("gplotGenDataFiles");

    if (!gplot)
        return ERROR_INT("gplot not defined", procName, 1);

    count = sarrayGetCount(gplot->datanames);
    for (k = 0; k < count; k++) {
        text = sarrayGetString(gplot->plotdata, k, L_NOCOPY);
        path = sarrayGetString(gplot->datanames, k, L_NOCOPY);
        stream = fopenWriteStream(path, "w");
        if (stream == NULL)
            return ERROR_INT("datafile stream not opened", procName, 1);
        fwrite(text, 1, strlen(text), stream);
        fclose(stream);
    }

    return 0;
}