/*!
 * \brief   jbCorrelation()
 *
 * \param[in]    dirin        directory of input images
 * \param[in]    thresh       typically ~0.8
 * \param[in]    weight       typically ~0.6
 * \param[in]    components   JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS
 * \param[in]    rootname     for output files
 * \param[in]    firstpage    0-based
 * \param[in]    npages       use 0 for all pages in dirin
 * \param[in]    renderflag   1 to render from templates; 0 to skip
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The images must be 1 bpp.  If they are not, you can convert
 *          them using convertFilesTo1bpp().
 *      (2) See prog/jbcorrelation for generating more output (e.g.,
 *          for debugging)
 *      (3) An error is returned if the list of input files, the classer,
 *          or the saved data cannot be made, if the pages cannot be
 *          added, if the data cannot be written, or (when rendering is
 *          requested) if the pages cannot be rendered.
 *      (4) Rendered pages are written as PNG to files named
 *          rootname.NNNNN, where NNNNN is the 0-based, zero-padded
 *          index of the rendered page.
 * </pre>
 */
l_int32
jbCorrelation(const char  *dirin,
              l_float32    thresh,
              l_float32    weight,
              l_int32      components,
              const char  *rootname,
              l_int32      firstpage,
              l_int32      npages,
              l_int32      renderflag)
{
char        filename[L_BUF_SIZE];
l_int32     nfiles, i, numpages;
l_int32     ret = 1;  /* pessimistic; cleared only after everything worked */
JBDATA     *data;
JBCLASSER  *classer;
PIX        *pix;
PIXA       *pixa;
SARRAY     *safiles;

    PROCNAME("jbCorrelation");

    if (!dirin)
        return ERROR_INT("dirin not defined", procName, 1);
    if (!rootname)
        return ERROR_INT("rootname not defined", procName, 1);
    if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
        components != JB_WORDS)
        return ERROR_INT("components invalid", procName, 1);

    safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
    if (!safiles)
        return ERROR_INT("safiles not made", procName, 1);
    nfiles = sarrayGetCount(safiles);

        /* Classify components */
    classer = jbCorrelationInit(components, 0, 0, thresh, weight);
    if (!classer) {
        ret = ERROR_INT("classer not made", procName, 1);
        goto cleanup_files;
    }
    if (jbAddPages(classer, safiles)) {
        ret = ERROR_INT("pages not added", procName, 1);
        goto cleanup_classer;
    }

        /* Save data */
    data = jbDataSave(classer);
    if (!data) {
        ret = ERROR_INT("data not made", procName, 1);
        goto cleanup_classer;
    }
    if (jbDataWrite(rootname, data)) {
        ret = ERROR_INT("data not written", procName, 1);
        goto cleanup_data;
    }

        /* Optionally, render pages using class templates */
    if (renderflag) {
        pixa = jbDataRender(data, FALSE);
        if (!pixa) {
            ret = ERROR_INT("pages not rendered", procName, 1);
            goto cleanup_data;
        }
        numpages = pixaGetCount(pixa);
        if (numpages != nfiles)
            fprintf(stderr, "numpages = %d, nfiles = %d, not equal!\n",
                    numpages, nfiles);
        for (i = 0; i < numpages; i++) {
            pix = pixaGetPix(pixa, i, L_CLONE);
            snprintf(filename, L_BUF_SIZE, "%s.%05d", rootname, i);
            fprintf(stderr, "filename: %s\n", filename);
            pixWrite(filename, pix, IFF_PNG);
            pixDestroy(&pix);
        }
        pixaDestroy(&pixa);
    }
    ret = 0;

        /* Staged cleanup: each label releases only what exists so far */
cleanup_data:
    jbDataDestroy(&data);
cleanup_classer:
    jbClasserDestroy(&classer);
cleanup_files:
    sarrayDestroy(&safiles);
    return ret;
}
/*! * \brief jbWordsInTextlines() * * \param[in] dirin directory of input pages * \param[in] reduction 1 for full res; 2 for half-res * \param[in] maxwidth of word mask components, to be kept * \param[in] maxheight of word mask components, to be kept * \param[in] thresh on correlation; 0.80 is reasonable * \param[in] weight for handling thick text; 0.6 is reasonable * \param[out] pnatl numa with textline index for each component * \param[in] firstpage 0-based * \param[in] npages use 0 for all pages in dirin * \return classer for the set of pages * * <pre> * Notes: * (1) This is a high-level function. See prog/jbwords for example * of usage. * (2) Typically, words can be found reasonably well at a resolution * of about 150 ppi. For highest accuracy, you should use 300 ppi. * Assuming that the input images are 300 ppi, use reduction = 1 * for finding words at full res, and reduction = 2 for finding * them at 150 ppi. * </pre> */ JBCLASSER * jbWordsInTextlines(const char *dirin, l_int32 reduction, l_int32 maxwidth, l_int32 maxheight, l_float32 thresh, l_float32 weight, NUMA **pnatl, l_int32 firstpage, l_int32 npages) { char *fname; l_int32 nfiles, i, w, h; BOXA *boxa; JBCLASSER *classer; NUMA *nai, *natl; PIX *pix; PIXA *pixa; SARRAY *safiles; PROCNAME("jbWordsInTextlines"); if (!pnatl) return (JBCLASSER *)ERROR_PTR("&natl not defined", procName, NULL); *pnatl = NULL; if (!dirin) return (JBCLASSER *)ERROR_PTR("dirin not defined", procName, NULL); if (reduction != 1 && reduction != 2) return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", procName, NULL); safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); nfiles = sarrayGetCount(safiles); /* Classify components */ classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight); classer->safiles = sarrayCopy(safiles); natl = numaCreate(0); *pnatl = natl; for (i = 0; i < nfiles; i++) { fname = sarrayGetString(safiles, i, L_NOCOPY); if ((pix = pixRead(fname)) == NULL) { L_WARNING("image 
file %d not read\n", procName, i); continue; } pixGetDimensions(pix, &w, &h, NULL); if (reduction == 1) { classer->w = w; classer->h = h; } else { /* reduction == 2 */ classer->w = w / 2; classer->h = h / 2; } pixGetWordsInTextlines(pix, reduction, JB_WORDS_MIN_WIDTH, JB_WORDS_MIN_HEIGHT, maxwidth, maxheight, &boxa, &pixa, &nai); jbAddPageComponents(classer, pix, boxa, pixa); numaJoin(natl, nai, 0, -1); pixDestroy(&pix); numaDestroy(&nai); boxaDestroy(&boxa); pixaDestroy(&pixa); } sarrayDestroy(&safiles); return classer; }
int main(int argc, char **argv) { char filename[BUF_SIZE]; char *dirin, *rootname, *fname; l_int32 i, firstpage, npages, nfiles; l_float32 thresh, weight; JBDATA *data; JBCLASSER *classer; SARRAY *safiles; PIX *pix, *pixt; PIXA *pixa, *pixadb; static char mainName[] = "jbcorrelation"; if (argc != 5 && argc != 7) return ERROR_INT(" Syntax: jbcorrelation dirin thresh weight " "rootname [firstpage, npages]", mainName, 1); dirin = argv[1]; thresh = atof(argv[2]); weight = atof(argv[3]); rootname = argv[4]; if (argc == 5) { firstpage = 0; npages = 0; } else { firstpage = atoi(argv[5]); npages = atoi(argv[6]); } #if 0 /*--------------------------------------------------------------*/ jbCorrelation(dirin, thresh, weight, COMPONENTS, rootname, firstpage, npages, 1); /*--------------------------------------------------------------*/ #else /*--------------------------------------------------------------*/ safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); nfiles = sarrayGetCount(safiles); sarrayWriteStream(stderr, safiles); /* Classify components on requested pages */ startTimer(); classer = jbCorrelationInit(COMPONENTS, 0, 0, thresh, weight); jbAddPages(classer, safiles); fprintf(stderr, "Time to generate classes: %6.3f sec\n", stopTimer()); /* Save and write out the result */ data = jbDataSave(classer); jbDataWrite(rootname, data); fprintf(stderr, "Number of classes: %d\n", classer->nclass); /* Render the pages from the classifier data. * Use debugflag == FALSE to omit outlines of each component. 
*/ pixa = jbDataRender(data, FALSE); /* Write the pages out */ npages = pixaGetCount(pixa); if (npages != nfiles) fprintf(stderr, "npages = %d, nfiles = %d, not equal!\n", npages, nfiles); for (i = 0; i < npages; i++) { pix = pixaGetPix(pixa, i, L_CLONE); snprintf(filename, BUF_SIZE, "%s.%05d", rootname, i); fprintf(stderr, "filename: %s\n", filename); pixWrite(filename, pix, IFF_PNG); pixDestroy(&pix); } #if DISPLAY_DIFFERENCE fname = sarrayGetString(safiles, 0, 0); pixt = pixRead(fname); pix = pixaGetPix(pixa, 0, L_CLONE); pixXor(pixt, pixt, pix); pixWrite("junk_output_diff", pixt, IFF_PNG); pixDestroy(&pix); pixDestroy(&pixt); #endif /* DISPLAY_DIFFERENCE */ #if DEBUG_TEST_DATA_IO { JBDATA *newdata; PIX *newpix; PIXA *newpixa; l_int32 same, iofail; /* Read the data back in and render the pages */ newdata = jbDataRead(rootname); newpixa = jbDataRender(newdata, FALSE); iofail = FALSE; for (i = 0; i < npages; i++) { pix = pixaGetPix(pixa, i, L_CLONE); newpix = pixaGetPix(newpixa, i, L_CLONE); pixEqual(pix, newpix, &same); if (!same) { iofail = TRUE; fprintf(stderr, "pix on page %d are unequal!\n", i); } pixDestroy(&pix); pixDestroy(&newpix); } if (iofail) fprintf(stderr, "read/write for jbdata fails\n"); else fprintf(stderr, "read/write for jbdata succeeds\n"); jbDataDestroy(&newdata); pixaDestroy(&newpixa); } #endif /* DEBUG_TEST_DATA_IO */ #if RENDER_DEBUG /* Use debugflag == TRUE to see outlines of each component. 
*/ pixadb = jbDataRender(data, TRUE); /* Write the debug pages out */ npages = pixaGetCount(pixadb); for (i = 0; i < npages; i++) { pix = pixaGetPix(pixadb, i, L_CLONE); snprintf(filename, BUF_SIZE, "%s.db.%05d", rootname, i); fprintf(stderr, "filename: %s\n", filename); pixWrite(filename, pix, IFF_PNG); pixDestroy(&pix); } pixaDestroy(&pixadb); #endif /* RENDER_DEBUG */ #if DISPLAY_ALL_INSTANCES /* display all instances, organized by template */ pix = pixaaDisplayByPixa(classer->pixaa, X_SPACING, Y_SPACING, MAX_OUTPUT_WIDTH); pixWrite("output_instances", pix, IFF_PNG); pixDestroy(&pix); #endif /* DISPLAY_ALL_INSTANCES */ pixaDestroy(&pixa); sarrayDestroy(&safiles); jbClasserDestroy(&classer); jbDataDestroy(&data); /*--------------------------------------------------------------*/ #endif return 0; }