/*!
 * \brief   pixColorSegment()
 *
 * \param[in]    pixs         32 bpp; 24-bit color
 * \param[in]    maxdist      max euclidean dist to existing cluster
 * \param[in]    maxcolors    max number of colors allowed in first pass
 * \param[in]    selsize      linear size of sel for closing to remove noise
 * \param[in]    finalcolors  max number of final colors allowed after 4th pass
 * \param[in]    debugflag    1 for debug output; 0 otherwise
 * \return  pixd 8 bit with colormap, or NULL on error
 *
 * <pre>
 * The segmentation is a pipeline of four phases, run in this order:
 *
 *   Phase 1:  pixColorSegmentCluster()
 *     Greedy clustering in raster order.  Every pixel either joins an
 *     existing cluster or becomes the representative of a new one.
 *     The result is held in a colormapped image; three auxiliary
 *     arrays hold the representative colors for fast lookup, and the
 *     average color of each cluster is computed.
 *
 *   Phase 2:  pixAssignToNearestColor()
 *     Non-greedy refinement.  Each pixel is reassigned to the nearest
 *     cluster average, and the number of pixels that land in each
 *     cluster is recorded.
 *
 *   Phase 3:  pixColorSegmentClean()
 *     Noise removal.  Taking the clusters from largest to smallest,
 *     a morphological closing eliminates small components that sit
 *     inside larger ones.
 *
 *   Phase 4:  pixColorSegmentRemoveColors()
 *     Only the 'finalcolors' most populated colors are kept.  Unused
 *     colors are dropped from the colormap, and the pixels that lost
 *     their color are reassigned to the nearest remaining cluster,
 *     using the original pixel values.
 *
 * Notes:
 *      (1) The aim is a small palette.  'finalcolors' would typically
 *          be somewhere between 3 and 6.  'maxcolors' bounds the number
 *          of colors produced by the first phase; it should be larger
 *          than 'finalcolors', perhaps twice as large.  If the first
 *          phase produces more than 'maxcolors' with the given
 *          'maxdist', the distance is repeatedly increased by a
 *          multiplicative factor until the bound is met, so the
 *          implicit relation between 'maxdist' and 'maxcolors' is
 *          adjusted programmatically.
 *      (2) Very rough starting values for a target 'finalcolors':
 *
 *                finalcolors    maxcolors    maxdist
 *                -----------    ---------    -------
 *                     3             6          100
 *                     4             8           90
 *                     5            10           75
 *                     6            12           60
 *
 *          For a fixed 'finalcolors', too large a 'maxcolors' gives a
 *          noisy result; too small a value gives a relatively poor
 *          assignment of colors.
 *      (3) With 'debugflag' set, the intermediate image after each of
 *          the first three phases is written with pixWriteDebug() to
 *          /tmp/lept/segment/colorseg{1,2,3}.png.
 * </pre>
 */
PIX *
pixColorSegment(PIX     *pixs,
                l_int32  maxdist,
                l_int32  maxcolors,
                l_int32  selsize,
                l_int32  finalcolors,
                l_int32  debugflag)
{
    l_int32  *counts;   /* per-color pixel counts: filled in phase 2, */
                        /* read in phase 3                            */
    PIX      *pixseg;   /* colormapped result, refined in place       */

    PROCNAME("pixColorSegment");

    if (pixs == NULL)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (pixGetDepth(pixs) != 32)
        return (PIX *)ERROR_PTR("must be rgb color", procName, NULL);

        /* Phase 1: greedy clustering gives the initial segmentation */
    pixseg = pixColorSegmentCluster(pixs, maxdist, maxcolors, debugflag);
    if (pixseg == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    if (debugflag != 0) {
            /* The directory made here is reused by the later debug writes */
        lept_mkdir("lept/segment");
        pixWriteDebug("/tmp/lept/segment/colorseg1.png", pixseg, IFF_PNG);
    }

        /* Phase 2: reassign every pixel to its nearest cluster average.
         * 256 counters are allocated; presumably one for each possible
         * index of the 8 bpp colormapped result. */
    counts = (l_int32 *)LEPT_CALLOC(256, sizeof(l_int32));
    if (counts == NULL) {
        pixDestroy(&pixseg);
        return (PIX *)ERROR_PTR("countarray not made", procName, NULL);
    }
    pixAssignToNearestColor(pixseg, pixs, NULL, LEVEL_IN_OCTCUBE, counts);
    if (debugflag != 0)
        pixWriteDebug("/tmp/lept/segment/colorseg2.png", pixseg, IFF_PNG);

        /* Phase 3: close each color separately to remove noise.  The
         * counts are not needed after this phase. */
    pixColorSegmentClean(pixseg, selsize, counts);
    LEPT_FREE(counts);
    if (debugflag != 0)
        pixWriteDebug("/tmp/lept/segment/colorseg3.png", pixseg, IFF_PNG);

        /* Phase 4: drop the sparsely populated colors and reassign
         * their pixels to the colors that remain */
    pixColorSegmentRemoveColors(pixseg, pixs, finalcolors);
    return pixseg;
}
/*
 *  pixDebugFlipDetect()
 *
 *      Input:  filename (path of the debug image to be written)
 *              pixs (image that was the input to pix*Detect)
 *              pixhm (hit-miss result from ascenders or descenders)
 *              enable (1 to produce the output; 0 makes this a no-op)
 *      Return: void
 *
 *  Writes a PNG of a colormapped conversion of pixs on which the
 *  dilated locations in pixhm are painted, so that the counted
 *  locations can be inspected visually.
 */
static void
pixDebugFlipDetect(const char  *filename,
                   PIX         *pixs,
                   PIX         *pixhm,
                   l_int32      enable)
{
    if (enable) {
            /* Display with red dot at counted locations: the hit-miss
             * result is dilated ("d5.5") to make each location visible,
             * and is then used as the mask for painting (255, 0, 0). */
        PIX  *pixcmap = pixConvert1To4Cmap(pixs);
        PIX  *pixdots = pixMorphSequence(pixhm, "d5.5", 0);

        pixSetMaskedCmap(pixcmap, pixdots, 0, 0, 255, 0, 0);
        pixWriteDebug(filename, pixcmap, IFF_PNG);

        pixDestroy(&pixdots);
        pixDestroy(&pixcmap);
    }
}
/*!
 * \brief   pixFindBaselines()
 *
 * \param[in]    pixs     1 bpp, 300 ppi
 * \param[out]   ppta     [optional] pairs of pts corresponding to
 *                        approx. ends of each text line
 * \param[in]    pixadb   for debug output; use NULL to skip
 * \return  na of baseline y values, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Input binary image must have text lines already aligned
 *          horizontally.  This can be done by either rotating the
 *          image with pixDeskew(), or, if a projective transform
 *          is required, by doing pixDeskewLocal() first.
 *      (2) Input null for &pta if you don't want this returned.
 *          The pta will come in pairs of points (left and right end
 *          of each baseline).
 *      (3) Caution: this will not work properly on text with multiple
 *          columns, where the lines are not aligned between columns.
 *          If there are multiple columns, they should be extracted
 *          separately before finding the baselines.
 *      (4) This function constructs different types of output
 *          for baselines; namely, a set of raster line values and
 *          a set of end points of each baseline.
 *      (5) This function was designed to handle short and long text lines
 *          without using dangerous thresholds on the peak heights.  It does
 *          this by combining the differential signal with a morphological
 *          analysis of the locations of the text lines.  One can also
 *          combine this data to normalize the peak heights, by weighting
 *          the differential signal in the region of each baseline
 *          by the inverse of the width of the text line found there.
 *      (6) NULL is also returned, with an info message rather than an
 *          error, when no connected components remain after the
 *          morphological filtering.  In every case where NULL is
 *          returned, *ppta (if requested) is NULL.
 *      (7) When %pixadb is given, intermediate images and plots are
 *          written under /tmp/lept/baseline/ and added to %pixadb.
 * </pre>
 */
NUMA *
pixFindBaselines(PIX   *pixs,
                 PTA  **ppta,
                 PIXA  *pixadb)
{
    l_int32    h, i, j, nbox, val1, val2, ndiff, bx, by, bw, bh;
    l_int32    imaxloc, peakthresh, zerothresh, inpeak;
    l_int32    mintosearch, max, maxloc, nloc, locval;
    l_int32   *array;
    l_float32  maxval;
    BOXA      *boxa1, *boxa2, *boxa3;
    GPLOT     *gplot;
    NUMA      *nasum, *nadiff, *naloc, *naval;
    PIX       *pix1, *pix2;
    PTA       *pta;

    PROCNAME("pixFindBaselines");

    if (ppta) *ppta = NULL;
    if (!pixs || pixGetDepth(pixs) != 1)
        return (NUMA *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);

        /* Close up the text characters, removing noise */
    pix1 = pixMorphSequence(pixs, "c25.1 + e15.1", 0);

        /* Save a 4x reduced copy of the filtered image for debug display */
    if (pixadb) pixaAddPix(pixadb, pixScale(pix1, 0.25, 0.25), L_INSERT);

        /* Save the difference of adjacent row sums.
         * The high positive-going peaks are the baselines */
    if ((nasum = pixCountPixelsByRow(pix1, NULL)) == NULL) {
        pixDestroy(&pix1);
        return (NUMA *)ERROR_PTR("nasum not made", procName, NULL);
    }
    h = pixGetHeight(pixs);
    nadiff = numaCreate(h);
    numaGetIValue(nasum, 0, &val2);
    for (i = 0; i < h - 1; i++) {
        val1 = val2;
        numaGetIValue(nasum, i + 1, &val2);
        numaAddNumber(nadiff, val1 - val2);
    }
    numaDestroy(&nasum);

    if (pixadb) {  /* show the difference signal */
        lept_mkdir("lept/baseline");
        gplotSimple1(nadiff, GPLOT_PNG, "/tmp/lept/baseline/diff", "Diff Sig");
        pix2 = pixRead("/tmp/lept/baseline/diff.png");
        pixaAddPix(pixadb, pix2, L_INSERT);
    }

        /* Use the zeroes of the profile to locate each baseline. */
    maxval = 0.0;
    array = numaGetIArray(nadiff);
    ndiff = numaGetCount(nadiff);
    numaGetMax(nadiff, &maxval, &imaxloc);
    numaDestroy(&nadiff);

        /* The array is indexed directly in the loop below, so a failed
         * allocation must not be allowed through.  An empty difference
         * signal is not treated as a failure: the loop does not run. */
    if (!array && ndiff > 0) {
        pixDestroy(&pix1);
        return (NUMA *)ERROR_PTR("array not made", procName, NULL);
    }

        /* Use this to begin locating a new peak: */
    peakthresh = (l_int32)maxval / PEAK_THRESHOLD_RATIO;
        /* Use this to begin a region between peaks: */
    zerothresh = (l_int32)maxval / ZERO_THRESHOLD_RATIO;

    naloc = numaCreate(0);
    naval = numaCreate(0);
    inpeak = FALSE;
        /* These are always set on the transition into a peak before
         * they are read; they are given values here so that they are
         * never indeterminate. */
    mintosearch = 0;
    max = 0;
    maxloc = 0;
    for (i = 0; i < ndiff; i++) {
        if (inpeak == FALSE) {
            if (array[i] > peakthresh) {  /* transition to in-peak */
                inpeak = TRUE;
                mintosearch = i + MIN_DIST_IN_PEAK; /* accept no zeros
                                               * between i and mintosearch */
                max = array[i];
                maxloc = i;
            }
        } else {  /* inpeak == TRUE; look for max */
            if (array[i] > max) {
                    /* A higher value restarts the no-exit window */
                max = array[i];
                maxloc = i;
                mintosearch = i + MIN_DIST_IN_PEAK;
            } else if (i > mintosearch && array[i] <= zerothresh) {  /* leave */
                inpeak = FALSE;
                numaAddNumber(naval, max);
                numaAddNumber(naloc, maxloc);
            }
        }
    }
    LEPT_FREE(array);

        /* If array[ndiff-1] is max, eg. no descenders, baseline at bottom */
    if (inpeak) {
        numaAddNumber(naval, max);
        numaAddNumber(naloc, maxloc);
    }

    if (pixadb) {  /* show the raster locations for the peaks */
        gplot = gplotCreate("/tmp/lept/baseline/loc", GPLOT_PNG, "Peak locs",
                            "rasterline", "height");
        gplotAddPlot(gplot, naloc, naval, GPLOT_POINTS, "locs");
        gplotMakeOutput(gplot);
        gplotDestroy(&gplot);
        pix2 = pixRead("/tmp/lept/baseline/loc.png");
        pixaAddPix(pixadb, pix2, L_INSERT);
    }
    numaDestroy(&naval);

        /* Generate an approximate profile of text line width.
         * First, filter the boxes of text, where there may be
         * more than one box for a given textline. */
    pix2 = pixMorphSequence(pix1, "r11 + c20.1 + o30.1 +c1.3", 0);
    if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
    boxa1 = pixConnComp(pix2, NULL, 4);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    if (boxaGetCount(boxa1) == 0) {
        numaDestroy(&naloc);
        boxaDestroy(&boxa1);
        L_INFO("no compnents after filtering\n", procName);
        return NULL;
    }

        /* The boxes were found on an image reduced by "r11", so they
         * are scaled up by 4 before being compared with the peak
         * locations, and sorted from top to bottom. */
    boxa2 = boxaTransform(boxa1, 0, 0, 4., 4.);
    boxa3 = boxaSort(boxa2, L_SORT_BY_Y, L_SORT_INCREASING, NULL);
    boxaDestroy(&boxa1);
    boxaDestroy(&boxa2);

        /* Optionally, find the baseline segments */
    pta = NULL;
    if (ppta) {
        pta = ptaCreate(0);
        *ppta = pta;
    }
    if (pta) {
        nloc = numaGetCount(naloc);
        nbox = boxaGetCount(boxa3);
        for (i = 0; i < nbox; i++) {
            boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh);
                /* Use the first peak within 25 rows of the bottom of
                 * the box; its left and right edges give the ends. */
            for (j = 0; j < nloc; j++) {
                numaGetIValue(naloc, j, &locval);
                if (L_ABS(locval - (by + bh)) > 25)
                    continue;
                ptaAddPt(pta, bx, locval);
                ptaAddPt(pta, bx + bw, locval);
                break;
            }
        }
    }
    boxaDestroy(&boxa3);

    if (pixadb && pta) {  /* display baselines */
        l_int32  npts, x1, y1, x2, y2;
        pix1 = pixConvertTo32(pixs);
        npts = ptaGetCount(pta);
        for (i = 0; i < npts; i += 2) {
            ptaGetIPt(pta, i, &x1, &y1);
            ptaGetIPt(pta, i + 1, &x2, &y2);
            pixRenderLineArb(pix1, x1, y1, x2, y2, 2, 255, 0, 0);
        }
        pixWriteDebug("/tmp/lept/baseline/baselines.png", pix1, IFF_PNG);
        pixaAddPix(pixadb, pixScale(pix1, 0.25, 0.25), L_INSERT);
        pixDestroy(&pix1);
    }

    return naloc;
}
/*!
 * \brief   pixUpDownDetectGeneralDwa()
 *
 * \param[in]    pixs       1 bpp, deskewed, English text
 * \param[out]   pconf      confidence that text is rightside-up
 * \param[in]    mincount   threshold that the larger of the up and down
 *                          counts must exceed for a confidence to be
 *                          computed; use 0 for default
 * \param[in]    npixels    number of pixels removed from each side of word box
 * \param[in]    debug      1 for debug output; 0 otherwise
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See the notes in pixUpDownDetectGeneral() for usage.
 *      (2) *pconf is set to 0.0 on error, and also when neither count
 *          exceeds %mincount.
 *      (3) A negative %npixels is treated as 0, in which case no word
 *          mask is used.
 *      (4) The debug directory /tmp/lept/orient is only requested when
 *          %debug is set.
 * </pre>
 */
l_int32
pixUpDownDetectGeneralDwa(PIX        *pixs,
                          l_float32  *pconf,
                          l_int32     mincount,
                          l_int32     npixels,
                          l_int32     debug)
{
    char       flipsel1[] = "flipsel1";
    char       flipsel2[] = "flipsel2";
    char       flipsel3[] = "flipsel3";
    char       flipsel4[] = "flipsel4";
    l_int32    countup, countdown, nmax;
    l_float32  nup, ndown;
    PIX       *pixt, *pix0, *pix1, *pix2, *pix3, *pixm;

    PROCNAME("pixUpDownDetectGeneralDwa");

    if (!pconf)
        return ERROR_INT("&conf not defined", procName, 1);
    *pconf = 0.0;
    if (!pixs || pixGetDepth(pixs) != 1)
        return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
    if (mincount == 0)
        mincount = DEFAULT_MIN_UP_DOWN_COUNT;
    if (npixels < 0)
        npixels = 0;

        /* The directory is used only for the debug image written below */
    if (debug)
        lept_mkdir("lept/orient");

        /* One of many reasonable pre-filtering sequences: (1, 8) and (30, 1).
         * This closes holes in x-height characters and joins them at
         * the x-height.  There is more noise in the descender detection
         * from this, but it works fairly well. */
    pixt = pixMorphSequenceDwa(pixs, "c1.8 + c30.1", 0);

        /* Be sure to add the border before the flip DWA operations! */
    pix0 = pixAddBorderGeneral(pixt, ADDED_BORDER, ADDED_BORDER,
                               ADDED_BORDER, ADDED_BORDER, 0);
    pixDestroy(&pixt);
    if (!pix0)
        return ERROR_INT("pix0 not made", procName, 1);

        /* Optionally, make a mask of the word bounding boxes, shortening
         * each of them by a fixed amount at each end. */
    pixm = NULL;
    if (npixels > 0) {
        l_int32  i, nbox, x, y, w, h;
        BOX     *box;
        BOXA    *boxa;
        pix1 = pixMorphSequenceDwa(pix0, "o10.1", 0);
        boxa = pixConnComp(pix1, NULL, 8);
        pixm = pixCreateTemplate(pix1);
        pixDestroy(&pix1);
        nbox = boxaGetCount(boxa);
        for (i = 0; i < nbox; i++) {
            box = boxaGetBox(boxa, i, L_CLONE);
            boxGetGeometry(box, &x, &y, &w, &h);
                /* Boxes too narrow to survive the shortening are
                 * left out of the mask.  The kept region extends
                 * 6 pixels above and 7 below the box. */
            if (w > 2 * npixels)
                pixRasterop(pixm, x + npixels, y - 6, w - 2 * npixels, h + 13,
                            PIX_SET, NULL, 0, 0);
            boxDestroy(&box);
        }
        boxaDestroy(&boxa);
    }

        /* The counts start at 0 so that they have a defined value even
         * if one of the operations below fails to produce an image. */
    countup = 0;
    countdown = 0;

        /* Find the ascenders and optionally filter with pixm.
         * For an explanation of the procedure used for counting the result
         * of the HMT, see comments in pixUpDownDetectGeneral().  */
    pix1 = pixFlipFHMTGen(NULL, pix0, flipsel1);
    pix2 = pixFlipFHMTGen(NULL, pix0, flipsel2);
    pixOr(pix1, pix1, pix2);
    if (pixm)
        pixAnd(pix1, pix1, pixm);
    pix3 = pixReduceRankBinaryCascade(pix1, 1, 1, 0, 0);
    pixCountPixels(pix3, &countup, NULL);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);

        /* Find the descenders and optionally filter with pixm. */
    pix1 = pixFlipFHMTGen(NULL, pix0, flipsel3);
    pix2 = pixFlipFHMTGen(NULL, pix0, flipsel4);
    pixOr(pix1, pix1, pix2);
    if (pixm)
        pixAnd(pix1, pix1, pixm);
    pix3 = pixReduceRankBinaryCascade(pix1, 1, 1, 0, 0);
    pixCountPixels(pix3, &countdown, NULL);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);

        /* Evaluate statistically, generating a confidence that is
         * related to the probability with a gaussian distribution.
         * Requiring nmax > 0 keeps the denominator nonzero even when
         * the caller supplies a negative mincount. */
    nup = (l_float32)(countup);
    ndown = (l_float32)(countdown);
    nmax = L_MAX(countup, countdown);
    if (nmax > mincount && nmax > 0)
        *pconf = 2. * ((nup - ndown) / sqrt(nup + ndown));

    if (debug) {
        if (pixm) pixWriteDebug("/tmp/lept/orient/pixm2.png", pixm, IFF_PNG);
        fprintf(stderr, "nup = %7.3f, ndown = %7.3f, conf = %7.3f\n",
                nup, ndown, *pconf);
        if (*pconf > DEFAULT_MIN_UP_DOWN_CONF)
            fprintf(stderr, "Text is rightside-up\n");
        if (*pconf < -DEFAULT_MIN_UP_DOWN_CONF)
            fprintf(stderr, "Text is upside-down\n");
    }

    pixDestroy(&pix0);
    pixDestroy(&pixm);
    return 0;
}