// Transforms the given coords forward to normalized space using the // full transformation sequence defined by the block rotation, the // predecessors, deepest first, and finally this. void DENORM::NormTransform(const TPOINT& pt, TPOINT* transformed) const { FCOORD src_pt(pt.x, pt.y); FCOORD float_result; NormTransform(src_pt, &float_result); transformed->x = IntCastRounded(float_result.x()); transformed->y = IntCastRounded(float_result.y()); }
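// IntCastRounded appears in nearly every snippet below. A minimal sketch of
// a round-to-nearest integer cast consistent with how it is used here
// (Tesseract defines the real helper elsewhere; this version is purely
// illustrative):
inline int IntCastRoundedSketch(double x) {
  return x >= 0.0 ? static_cast<int>(x + 0.5) : -static_cast<int>(-x + 0.5);
}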
// Draws the outline in the given colour, normalized using the given denorm, // making use of sub-pixel accurate information if available. void C_OUTLINE::plot_normed(const DENORM& denorm, ScrollView::Color colour, ScrollView* window) const { window->Pen(colour); if (stepcount == 0) { window->Rectangle(box.left(), box.top(), box.right(), box.bottom()); return; } const DENORM* root_denorm = denorm.RootDenorm(); ICOORD pos = start; // current position FCOORD f_pos = sub_pixel_pos_at_index(pos, 0); FCOORD pos_normed; denorm.NormTransform(root_denorm, f_pos, &pos_normed); window->SetCursor(IntCastRounded(pos_normed.x()), IntCastRounded(pos_normed.y())); for (int s = 0; s < stepcount; pos += step(s++)) { int edge_weight = edge_strength_at_index(s); if (edge_weight == 0) { // This point has conflicting gradient and step direction, so ignore it. continue; } FCOORD f_pos = sub_pixel_pos_at_index(pos, s); FCOORD pos_normed; denorm.NormTransform(root_denorm, f_pos, &pos_normed); window->DrawTo(IntCastRounded(pos_normed.x()), IntCastRounded(pos_normed.y())); } }
// Transforms the given coords all the way back to source image space using // the full transformation sequence defined by this and its predecessors // recursively, shallowest first, and finally any block re_rotation. void DENORM::DenormTransform(const TPOINT& pt, TPOINT* original) const { FCOORD src_pt(pt.x, pt.y); FCOORD float_result; DenormTransform(src_pt, &float_result); original->x = IntCastRounded(float_result.x()); original->y = IntCastRounded(float_result.y()); }
// Computes the DENORMS for bl(baseline) and cn(character) normalization // during feature extraction. The input denorm describes the current state // of the blob, which is usually a baseline-normalized word. // The transforms are set up as follows: // Baseline Normalized (bl) Output: // We center the grapheme by aligning the x-coordinate of its centroid with // x=128 and leaving the already-baseline-normalized y as-is. // // Character Normalized (cn) Output: // We align the grapheme's centroid at the origin and scale it // asymmetrically in x and y so that the 2nd moments are a standard value // (51.2) i.e. the result is vaguely square. // If classify_nonlinear_norm is true: // A non-linear normalization is set up that attempts to evenly distribute // edges across x and y. // // Some of the fields of fx_info are also set up: // Length: Total length of outline. // Rx: Rounded y second moment. (Reversed by convention.) // Ry: Rounded x second moment. // Xmean: Rounded x center of mass of the blob. // Ymean: Rounded y center of mass of the blob. void Classify::SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm, DENORM* bl_denorm, DENORM* cn_denorm, INT_FX_RESULT_STRUCT* fx_info) { // Compute 1st and 2nd moments of the original outline. FCOORD center, second_moments; int length = blob.ComputeMoments(&center, &second_moments); if (fx_info != nullptr) { fx_info->Length = length; fx_info->Rx = IntCastRounded(second_moments.y()); fx_info->Ry = IntCastRounded(second_moments.x()); fx_info->Xmean = IntCastRounded(center.x()); fx_info->Ymean = IntCastRounded(center.y()); } // Set up the denorm for baseline normalization. bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f, 1.0f, 1.0f, 128.0f, 128.0f); // Set up the denorm for character normalization. if (nonlinear_norm) { GenericVector<GenericVector<int> > x_coords; GenericVector<GenericVector<int> > y_coords; TBOX box; blob.GetPreciseBoundingBox(&box); box.pad(1, 1); blob.GetEdgeCoords(box, &x_coords, &y_coords); cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX, 0.0f, 0.0f, x_coords, y_coords); } else { cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), center.y(), 51.2f / second_moments.x(), 51.2f / second_moments.y(), 128.0f, 128.0f); } }
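// A minimal sketch of the linear character-normalization mapping configured
// above (the struct and function names are illustrative, not Tesseract API):
// a point is shifted to the blob centroid, scaled so that each second moment
// becomes 51.2, and re-centred at (128, 128).
struct Point2f { float x, y; };

static Point2f CnNormalizeSketch(const Point2f& p, const Point2f& center,
                                 const Point2f& second_moments) {
  Point2f out;
  out.x = (p.x - center.x) * (51.2f / second_moments.x) + 128.0f;
  out.y = (p.y - center.y) * (51.2f / second_moments.y) + 128.0f;
  return out;
}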
// Normalize a blob using blob transformations. Less accurate, but // more accurately copies the old way. void DENORM::LocalNormBlob(TBLOB* blob) const { TBOX blob_box = blob->bounding_box(); ICOORD translation(-IntCastRounded(x_origin_), -IntCastRounded(y_origin_)); blob->Move(translation); if (y_scale_ != 1.0f) blob->Scale(y_scale_); if (rotation_ != NULL) blob->Rotate(*rotation_); translation.set_x(IntCastRounded(final_xshift_)); translation.set_y(IntCastRounded(final_yshift_)); blob->Move(translation); }
// Draws the features in the given window. void WordFeature::Draw(const GenericVector<WordFeature>& features, ScrollView* window) { for (int f = 0; f < features.size(); ++f) { FCOORD pos(features[f].x_, features[f].y_); FCOORD dir; dir.from_direction(features[f].dir_); dir *= 8.0f; window->SetCursor(IntCastRounded(pos.x() - dir.x()), IntCastRounded(pos.y() - dir.y())); window->DrawTo(IntCastRounded(pos.x() + dir.x()), IntCastRounded(pos.y() + dir.y())); } }
// Computes and returns the absolute error of the given perp_disp from the // given linespacing model. double BaselineBlock::SpacingModelError(double perp_disp, double line_spacing, double line_offset) { // Round to the nearest multiple of line_spacing + line offset. int multiple = IntCastRounded((perp_disp - line_offset) / line_spacing); double model_y = line_spacing * multiple + line_offset; return fabs(perp_disp - model_y); }
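// Worked example of the rounding above, with hypothetical numbers:
//   line_spacing = 20, line_offset = 3, perp_disp = 44
//   multiple = IntCastRounded((44 - 3) / 20) = IntCastRounded(2.05) = 2
//   model_y  = 20 * 2 + 3 = 43
//   returned error = |44 - 43| = 1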
// Helper to compute edge offsets for all the blobs on the list. // See coutln.h for an explanation of edge offsets. void BLOBNBOX::ComputeEdgeOffsets(Pix* thresholds, Pix* grey, BLOBNBOX_LIST* blobs) { int grey_height = 0; int thr_height = 0; int scale_factor = 1; if (thresholds != NULL && grey != NULL) { grey_height = pixGetHeight(grey); thr_height = pixGetHeight(thresholds); scale_factor = IntCastRounded(static_cast<double>(grey_height) / thr_height); } BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (blob->cblob() != NULL) { // Get the threshold that applies to this blob. l_uint32 threshold = 128; if (thresholds != NULL && grey != NULL) { const TBOX& box = blob->cblob()->bounding_box(); // Transform the coordinates if required. TPOINT pt((box.left() + box.right()) / 2, (box.top() + box.bottom()) / 2); pixGetPixel(thresholds, pt.x / scale_factor, thr_height - 1 - pt.y / scale_factor, &threshold); } blob->cblob()->ComputeEdgeOffsets(threshold, grey); } } }
void DENORM::LocalNormTransform(const FCOORD& pt, FCOORD* transformed) const { FCOORD translated(pt.x() - x_origin_, pt.y() - y_origin_); if (x_map_ != NULL && y_map_ != NULL) { int x = ClipToRange(IntCastRounded(translated.x()), 0, x_map_->size()-1); translated.set_x((*x_map_)[x]); int y = ClipToRange(IntCastRounded(translated.y()), 0, y_map_->size()-1); translated.set_y((*y_map_)[y]); } else { translated.set_x(translated.x() * x_scale_); translated.set_y(translated.y() * y_scale_); if (rotation_ != NULL) translated.rotate(*rotation_); } transformed->set_x(translated.x() + final_xshift_); transformed->set_y(translated.y() + final_yshift_); }
bool LSTM::DeSerialize(TFile* fp) { if (fp->FReadEndian(&na_, sizeof(na_), 1) != 1) return false; if (type_ == NT_LSTM_SOFTMAX) { nf_ = no_; } else if (type_ == NT_LSTM_SOFTMAX_ENCODED) { nf_ = IntCastRounded(ceil(log2(no_))); } else { nf_ = 0; } is_2d_ = false; for (int w = 0; w < WT_COUNT; ++w) { if (w == GFS && !Is2D()) continue; if (!gate_weights_[w].DeSerialize(IsTraining(), fp)) return false; if (w == CI) { ns_ = gate_weights_[CI].NumOutputs(); is_2d_ = na_ - nf_ == ni_ + 2 * ns_; } } delete softmax_; if (type_ == NT_LSTM_SOFTMAX || type_ == NT_LSTM_SOFTMAX_ENCODED) { softmax_ = static_cast<FullyConnected*>(Network::CreateFromFile(fp)); if (softmax_ == nullptr) return false; } else { softmax_ = nullptr; } return true; }
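// For the NT_LSTM_SOFTMAX_ENCODED case above, nf_ is the number of bits
// needed to binary-encode the class count, e.g. no_ = 111 classes gives
// nf_ = ceil(log2(111)) = 7.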
// Accumulates counts for junk. Counts only whether the junk was correctly // rejected or not. bool ErrorCounter::AccumulateJunk(bool debug, const GenericVector<UnicharRating>& results, TrainingSample* sample) { // For junk we accept no answer, or an explicit shape answer matching the // class id of the sample. int num_results = results.size(); int font_id = sample->font_id(); int unichar_id = sample->class_id(); int percent = 0; if (num_results > 0) percent = IntCastRounded(results[0].rating * 100); if (num_results > 0 && results[0].unichar_id != unichar_id) { // This is a junk error. ++font_counts_[font_id].n[CT_ACCEPTED_JUNK]; sample->set_is_error(true); // It counts as an error for boosting too so sum the weight. scaled_error_ += sample->weight(); bad_score_hist_.add(percent, 1); return debug; } else { // Correctly rejected. ++font_counts_[font_id].n[CT_REJECTED_JUNK]; sample->set_is_error(false); ok_score_hist_.add(percent, 1); } return false; }
static void HistogramWeight(double weight, STATS* histogram) { int bucket = kHistogramBuckets - 1; if (weight != 0.0) { double logval = -log2(fabs(weight)); bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1); } histogram->add(bucket, 1); }
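// Worked example of the bucketing above, taking kHistogramBuckets = 16
// purely for illustration:
//   weight = 0.25   -> -log2(0.25)  = 2.0   -> bucket 2
//   weight = -0.001 -> -log2(0.001) ~= 9.97 -> bucket 10
//   weight = 0.0    -> bucket 15 (the last bucket is reserved for zeros)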
// Normalize a blob using blob transformations. Less accurate, but // more accurately copies the old way. void DENORM::LocalNormBlob(TBLOB* blob) const { TBOX blob_box = blob->bounding_box(); float x_center = (blob_box.left() + blob_box.right()) / 2.0f; ICOORD translation(-IntCastRounded(x_origin_), -IntCastRounded(YOriginAtOrigX(x_center))); blob->Move(translation); // Note that the old way of scaling only allowed for a single // scale factor. float scale = YScaleAtOrigX(x_center); if (scale != 1.0f) blob->Scale(scale); if (rotation_ != NULL) blob->Rotate(*rotation_); translation.set_x(IntCastRounded(final_xshift_)); translation.set_y(IntCastRounded(final_yshift_)); blob->Move(translation); }
// Sets up displacement_modes_ with the top few modes of the perpendicular // distance of each blob from the given direction vector, after rounding. void BaselineRow::SetupBlobDisplacements(const FCOORD& direction) { // Set of perpendicular displacements of the blob bottoms from the required // baseline direction. GenericVector<double> perp_blob_dists; displacement_modes_.truncate(0); // Gather the skew-corrected position of every blob. double min_dist = MAX_FLOAT32; double max_dist = -MAX_FLOAT32; BLOBNBOX_IT blob_it(blobs_); bool debug = false; for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); const TBOX& box = blob->bounding_box(); #ifdef kDebugYCoord if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) debug = true; #endif FCOORD blob_pos((box.left() + box.right()) / 2.0f, blob->baseline_position()); double offset = direction * blob_pos; perp_blob_dists.push_back(offset); if (debug) { tprintf("Displacement %g for blob at:", offset); box.print(); } UpdateRange(offset, &min_dist, &max_dist); } // Set up a histogram using disp_quant_factor_ as the bucket size. STATS dist_stats(IntCastRounded(min_dist / disp_quant_factor_), IntCastRounded(max_dist / disp_quant_factor_) + 1); for (int i = 0; i < perp_blob_dists.size(); ++i) { dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1); } GenericVector<KDPairInc<float, int> > scaled_modes; dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes); if (debug) { for (int i = 0; i < scaled_modes.size(); ++i) { tprintf("Top mode = %g * %d\n", scaled_modes[i].key * disp_quant_factor_, scaled_modes[i].data); } } for (int i = 0; i < scaled_modes.size(); ++i) displacement_modes_.push_back(disp_quant_factor_ * scaled_modes[i].key); }
// Helper to compute an offset index feature. In this context an offset // feature with a dir of +/-1 is a feature of a similar direction, // but shifted perpendicular to the direction of the feature. An offset // feature with a dir of +/-2 is a feature at the same position, but rotated // by +/- one [compact] quantum. Returns the index of the generated offset // feature, or -1 if it doesn't exist. Dir should be in // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction. // A dir of 0 is an identity transformation. // Both input and output are from the index (sparse) feature space, not // the mapped/compact feature space, but the offset feature is the minimum // distance moved from the input to guarantee that it maps to the next // available quantum in the mapped/compact space. int IntFeatureMap::ComputeOffsetFeature(int index_feature, int dir) const { INT_FEATURE_STRUCT f = InverseIndexFeature(index_feature); ASSERT_HOST(IndexFeature(f) == index_feature); if (dir == 0) { return index_feature; } else if (dir == 1 || dir == -1) { FCOORD feature_dir = FeatureDirection(f.Theta); FCOORD rotation90(0.0f, 1.0f); feature_dir.rotate(rotation90); // Find the nearest existing feature. for (int m = 1; m < kMaxOffsetDist; ++m) { double x_pos = f.X + feature_dir.x() * (m * dir); double y_pos = f.Y + feature_dir.y() * (m * dir); int x = IntCastRounded(x_pos); int y = IntCastRounded(y_pos); if (x >= 0 && x <= MAX_UINT8 && y >= 0 && y <= MAX_UINT8) { INT_FEATURE_STRUCT offset_f; offset_f.X = x; offset_f.Y = y; offset_f.Theta = f.Theta; int offset_index = IndexFeature(offset_f); if (offset_index != index_feature && offset_index >= 0) return offset_index; // Found one. } else { return -1; // Hit the edge of feature space. } } } else if (dir == 2 || dir == -2) { // Find the nearest existing index_feature. for (int m = 1; m < kMaxOffsetDist; ++m) { int theta = f.Theta + m * dir / 2; INT_FEATURE_STRUCT offset_f; offset_f.X = f.X; offset_f.Y = f.Y; offset_f.Theta = Modulo(theta, 256); int offset_index = IndexFeature(offset_f); if (offset_index != index_feature && offset_index >= 0) return offset_index; // Found one. } } return -1; // Nothing within the max distance. }
// Fills in the x-height range accepted by the given unichar_id, given its // bounding box in the usual baseline-normalized coordinates, with some // initial crude x-height estimate (such as word size) and this denoting the // transformation that was used. Returns false, and an empty range if the // bottom is a mis-fit. Returns true and empty [0, 0] range if the bottom // fits, but the top is impossible. bool DENORM::XHeightRange(int unichar_id, const UNICHARSET& unicharset, const TBOX& bbox, inT16* min_xht, inT16* max_xht) const { // Clip the top and bottom to the limit of normalized feature space. int top = ClipToRange<int>(bbox.top(), 0, kBlnCellHeight - 1); int bottom = ClipToRange<int>(bbox.bottom(), 0, kBlnCellHeight - 1); // A tolerance of yscale corresponds to 1 pixel in the image. double tolerance = y_scale(); int min_bottom, max_bottom, min_top, max_top; unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top); // Default returns indicate a mis-fit. *min_xht = 0; *max_xht = 0; // Chars with a misfitting bottom might be sub/superscript/dropcap, or might // just be wrongly classified. Return an empty range so they have to be // good to be considered. if (bottom < min_bottom - tolerance || bottom > max_bottom + tolerance) { return false; } // To help very high cap/xheight ratio fonts accept the correct x-height, // and to allow the large caps in small caps to accept the xheight of the // small caps, add kBlnBaselineOffset to chars with a maximum max. if (max_top == kBlnCellHeight - 1) max_top += kBlnBaselineOffset; int height = top - kBlnBaselineOffset; double min_height = min_top - kBlnBaselineOffset - tolerance; double max_height = max_top - kBlnBaselineOffset + tolerance; if (min_height <= 0.0) { if (height <= 0 || max_height > 0) *max_xht = MAX_INT16; // Anything will do. } else if (height > 0) { int result = IntCastRounded(height * kBlnXHeight / y_scale() / min_height); *max_xht = static_cast<inT16>(ClipToRange(result, 0, MAX_INT16)); } if (max_height > 0.0 && height > 0) { int result = IntCastRounded(height * kBlnXHeight / y_scale() / max_height); *min_xht = static_cast<inT16>(ClipToRange(result, 0, MAX_INT16)); } return true; }
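// Worked example with purely illustrative numbers (taking
// kBlnBaselineOffset = 64, kBlnXHeight = 128 and y_scale() = 1.0, so
// tolerance = 1): a blob top at 176 gives height = 176 - 64 = 112. If the
// unichar expects min_top = 160 and max_top = 200, then
//   min_height = 160 - 64 - 1 = 95,   max_height = 200 - 64 + 1 = 137,
//   max_xht = IntCastRounded(112 * 128 / 95)  = 151,
//   min_xht = IntCastRounded(112 * 128 / 137) = 105.
// The taller the expected top, the smaller the x-height needed to explain
// the observed height, which is why max_xht comes from min_height and
// min_xht from max_height.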
// Given an initial estimate of line spacing (m_in) and the positions of each // baseline, computes the line spacing of the block more accurately in m_out, // and the corresponding intercept in c_out, and the number of spacings seen // in index_delta. Returns the error of fit to the line spacing model. // Uses a simple linear regression, but optimizes the offset using the median. double BaselineBlock::FitLineSpacingModel( const GenericVector<double>& positions, double m_in, double* m_out, double* c_out, int* index_delta) { if (m_in == 0.0f || positions.size() < 2) { *m_out = m_in; *c_out = 0.0; if (index_delta != NULL) *index_delta = 0; return 0.0; } GenericVector<double> offsets; // Get the offset (remainder) linespacing for each line and choose the median. for (int i = 0; i < positions.size(); ++i) offsets.push_back(fmod(positions[i], m_in)); // Get the median offset. double median_offset = MedianOfCircularValues(m_in, &offsets); // Now fit a line to quantized line number and offset. LLSQ llsq; int min_index = MAX_INT32; int max_index = -MAX_INT32; for (int i = 0; i < positions.size(); ++i) { double y_pos = positions[i]; int row_index = IntCastRounded((y_pos - median_offset) / m_in); UpdateRange(row_index, &min_index, &max_index); llsq.add(row_index, y_pos); } // Get the refined line spacing. *m_out = llsq.m(); // Use the median offset rather than the mean. offsets.truncate(0); for (int i = 0; i < positions.size(); ++i) offsets.push_back(fmod(positions[i], *m_out)); // Get the median offset. if (debug_level_ > 2) { for (int i = 0; i < offsets.size(); ++i) tprintf("%d: %g\n", i, offsets[i]); } *c_out = MedianOfCircularValues(*m_out, &offsets); if (debug_level_ > 1) { tprintf("Median offset = %g, compared to mean of %g.\n", *c_out, llsq.c(*m_out)); } // Index_delta is the number of hypothesized line gaps present. if (index_delta != NULL) *index_delta = max_index - min_index; // Use the regression model's intercept to compute the error, as it may be // a full line-spacing in disagreement with the median. double rms_error = llsq.rms(*m_out, llsq.c(*m_out)); if (debug_level_ > 1) { tprintf("Linespacing of y=%g x + %g improved to %g x + %g, rms=%g\n", m_in, median_offset, *m_out, *c_out, rms_error); } return rms_error; }
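// Worked example with hypothetical positions {30.5, 50.2, 90.7} and
// m_in = 20: the remainders fmod(position, 20) are {10.5, 10.2, 10.7}, so
// the median offset is 10.5. The quantized row indices
// IntCastRounded((y - 10.5) / 20) are then {1, 2, 4}, and the regression
// fits y = m * index + c through (1, 30.5), (2, 50.2), (4, 90.7), giving a
// refined spacing m close to 20.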
// Helper computes median xheight in the image. static double MedianXHeight(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); STATS xheights(0, block_it.data()->bounding_box().height()); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { ROW_IT row_it(block_it.data()->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { xheights.add(IntCastRounded(row_it.data()->x_height()), 1); } } return xheights.median(); }
// Reorganize the blob lists with a different definition of small, medium // and large, compared to the original definition. // Height is still the primary filter key, but medium width blobs of small // height become small, and very wide blobs of small height stay noise, along // with small dot-shaped blobs. void TO_BLOCK::ReSetAndReFilterBlobs() { int min_height = IntCastRounded(kMinMediumSizeRatio * line_size); int max_height = IntCastRounded(kMaxMediumSizeRatio * line_size); BLOBNBOX_LIST noise_list; BLOBNBOX_LIST small_list; BLOBNBOX_LIST medium_list; BLOBNBOX_LIST large_list; SizeFilterBlobs(min_height, max_height, &blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &large_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &small_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &noise_blobs, &noise_list, &small_list, &medium_list, &large_list); BLOBNBOX_IT blob_it(&blobs); blob_it.add_list_after(&medium_list); blob_it.set_to_list(&large_blobs); blob_it.add_list_after(&large_list); blob_it.set_to_list(&small_blobs); blob_it.add_list_after(&small_list); blob_it.set_to_list(&noise_blobs); blob_it.add_list_after(&noise_list); }
// Gets anything and everything with a non-NULL pointer, prescaled to a // given target_height (if 0, then the original image height), and aligned. // Also returns (if not NULL) the width and height of the scaled image. // The return value is the scale factor that was applied to the image to // achieve the target_height. float ImageData::PreScale(int target_height, Pix** pix, int* scaled_width, int* scaled_height, GenericVector<TBOX>* boxes) const { int input_width = 0; int input_height = 0; Pix* src_pix = GetPix(); ASSERT_HOST(src_pix != NULL); input_width = pixGetWidth(src_pix); input_height = pixGetHeight(src_pix); if (target_height == 0) target_height = input_height; float im_factor = static_cast<float>(target_height) / input_height; if (scaled_width != NULL) *scaled_width = IntCastRounded(im_factor * input_width); if (scaled_height != NULL) *scaled_height = target_height; if (pix != NULL) { // Get the scaled image. pixDestroy(pix); *pix = pixScale(src_pix, im_factor, im_factor); if (*pix == NULL) { tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", input_width, input_height, im_factor); } if (scaled_width != NULL) *scaled_width = pixGetWidth(*pix); if (scaled_height != NULL) *scaled_height = pixGetHeight(*pix); } pixDestroy(&src_pix); if (boxes != NULL) { // Get the boxes. boxes->truncate(0); for (int b = 0; b < boxes_.size(); ++b) { TBOX box = boxes_[b]; box.scale(im_factor); boxes->push_back(box); } if (boxes->empty()) { // Make a single box for the whole image. TBOX box(0, 0, im_factor * input_width, target_height); boxes->push_back(box); } } return im_factor; }
// Computes and returns the noise_density IntGrid, at the same gridsize as // this, by summing the number of small elements in a 3x3 neighbourhood of // each grid cell. good_grid is filled with blobs that are considered most // likely good text, and this is filled with small and medium blobs that are // more likely non-text. // The photo_map is used to bias the decision towards non-text, rather than // supplying a definite decision. IntGrid* CCNonTextDetect::ComputeNoiseDensity(bool debug, Pix* photo_map, BlobGrid* good_grid) { IntGrid* noise_counts = CountCellElements(); IntGrid* noise_density = noise_counts->NeighbourhoodSum(); IntGrid* good_counts = good_grid->CountCellElements(); // Now increase noise density in photo areas, to bias the decision and // minimize hallucinated text on image, but trim the noise_density where // there are good blobs and the original count is low in non-photo areas, // indicating that most of the result came from neighbouring cells. int height = pixGetHeight(photo_map); int photo_offset = IntCastRounded(max_noise_count_ * kPhotoOffsetFraction); for (int y = 0; y < gridheight(); ++y) { for (int x = 0; x < gridwidth(); ++x) { int noise = noise_density->GridCellValue(x, y); if (max_noise_count_ < noise + photo_offset && noise <= max_noise_count_) { // Test for photo. int left = x * gridsize(); int right = left + gridsize(); int bottom = height - y * gridsize(); int top = bottom - gridsize(); if (ImageFind::BoundsWithinRect(photo_map, &left, &top, &right, &bottom)) { noise_density->SetGridCell(x, y, noise + photo_offset); } } if (debug && noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0) { tprintf("At %d, %d, noise = %d, good=%d, orig=%d, thr=%d\n", x * gridsize(), y * gridsize(), noise_density->GridCellValue(x, y), good_counts->GridCellValue(x, y), noise_counts->GridCellValue(x, y), max_noise_count_); } if (noise > max_noise_count_ && good_counts->GridCellValue(x, y) > 0 && noise_counts->GridCellValue(x, y) * kOriginalNoiseMultiple <= max_noise_count_) { noise_density->SetGridCell(x, y, 0); } } } delete noise_counts; delete good_counts; return noise_density; }
// Helper returns the mean direction vector from the given stats. Use the // mean direction from dirs if there is information available, otherwise, use // the fit_vector from point_diffs. static FCOORD MeanDirectionVector(const LLSQ& point_diffs, const LLSQ& dirs, const FCOORD& start_pt, const FCOORD& end_pt) { FCOORD fit_vector; if (dirs.count() > 0) { // There were directions, so use them. To avoid wrap-around problems, we // have 2 accumulators in dirs: x for normal directions and y for // directions offset by 128. We will use the one with the least variance. FCOORD mean_pt = dirs.mean_point(); double mean_dir = 0.0; if (dirs.x_variance() <= dirs.y_variance()) { mean_dir = mean_pt.x(); } else { mean_dir = mean_pt.y() + 128; } fit_vector.from_direction(Modulo(IntCastRounded(mean_dir), 256)); } else { // There were no directions, so we rely on the vector_fit to the points. // Since the vector_fit is 180 degrees ambiguous, we align with the // supplied feature_dir by making the scalar product non-negative. FCOORD feature_dir(end_pt - start_pt); fit_vector = point_diffs.vector_fit(); if (fit_vector.x() == 0.0f && fit_vector.y() == 0.0f) { // There was only a single point. Use feature_dir directly. fit_vector = feature_dir; } else { // Sometimes the least mean squares fit is wrong, due to the small sample // of points and scaling. Use a 90 degree rotated vector if that matches // feature_dir better. FCOORD fit_vector2 = !fit_vector; // The fit_vector is 180 degrees ambiguous, so resolve the ambiguity by // insisting that the scalar product with the feature_dir should be +ve. if (fit_vector % feature_dir < 0.0) fit_vector = -fit_vector; if (fit_vector2 % feature_dir < 0.0) fit_vector2 = -fit_vector2; // Even though fit_vector2 has a higher mean squared error, it might be // a better fit, so use it if the dot product with feature_dir is bigger. if (fit_vector2 % feature_dir > fit_vector % feature_dir) fit_vector = fit_vector2; } } return fit_vector; }
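// Illustrative example of the wrap-around trick described above: raw
// directions 254 and 2 straddle the 255->0 wrap, so their plain mean (128)
// is wrong, but the accumulator that stores them offset by 128 holds 126 and
// 130, whose mean is 128; adding the offset back and taking Modulo(256, 256)
// yields 0, the correct circular mean. That accumulator also has much the
// lower variance, so it is the one selected.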
// Converts a float network to an int network. Each set of input weights that // corresponds to a single output weight is converted independently: // Compute the max absolute value of the weight set. // Scale so the max absolute value becomes MAX_INT8. // Round to integer. // Store a multiplicative scale factor (as a double) that will reproduce // the original value, subject to rounding errors. void WeightMatrix::ConvertToInt() { wi_.ResizeNoInit(wf_.dim1(), wf_.dim2()); scales_.init_to_size(wi_.dim1(), 0.0); int dim2 = wi_.dim2(); for (int t = 0; t < wi_.dim1(); ++t) { double* f_line = wf_[t]; inT8* i_line = wi_[t]; double max_abs = 0.0; for (int f = 0; f < dim2; ++f) { double abs_val = fabs(f_line[f]); if (abs_val > max_abs) max_abs = abs_val; } double scale = max_abs / MAX_INT8; scales_[t] = scale; if (scale == 0.0) scale = 1.0; for (int f = 0; f < dim2; ++f) { i_line[f] = IntCastRounded(f_line[f] / scale); } } wf_.Resize(1, 1, 0.0); int_mode_ = true; }
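// Illustrative sketch (not the Tesseract implementation) of the per-row
// quantization described above, with the MAX_INT8 limit written as 127.
// The returned scale restores the original weights: in[i] ~= out[i] * scale,
// up to rounding error.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

static double QuantizeRowSketch(const std::vector<double>& in,
                                std::vector<int8_t>* out) {
  double max_abs = 0.0;
  for (double v : in) max_abs = std::max(max_abs, std::fabs(v));
  double scale = max_abs / 127.0;
  double divisor = scale == 0.0 ? 1.0 : scale;  // Avoid dividing by zero.
  out->clear();
  for (double v : in)
    out->push_back(static_cast<int8_t>(std::lround(v / divisor)));
  return scale;
}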
// Helper computes one or more features corresponding to the given points. // Emitted features are on the line defined by: // start_pt + lambda * (end_pt - start_pt) for scalar lambda. // Features are spaced at feature_length intervals. static int ComputeFeatures(const FCOORD& start_pt, const FCOORD& end_pt, double feature_length, GenericVector<INT_FEATURE_STRUCT>* features) { FCOORD feature_vector(end_pt - start_pt); if (feature_vector.x() == 0.0f && feature_vector.y() == 0.0f) return 0; // Compute theta for the feature based on its direction. uint8_t theta = feature_vector.to_direction(); // Compute the number of features and lambda_step. double target_length = feature_vector.length(); int num_features = IntCastRounded(target_length / feature_length); if (num_features == 0) return 0; // Divide the length evenly into num_features pieces. double lambda_step = 1.0 / num_features; double lambda = lambda_step / 2.0; for (int f = 0; f < num_features; ++f, lambda += lambda_step) { FCOORD feature_pt(start_pt); feature_pt += feature_vector * lambda; INT_FEATURE_STRUCT feature(feature_pt, theta); features->push_back(feature); } return num_features; }
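// Worked example with hypothetical numbers: start_pt = (0, 0),
// end_pt = (10, 0) and feature_length = 3 give
// num_features = IntCastRounded(10 / 3) = 3 and lambda_step = 1/3, so
// features are emitted at the centre of each third of the segment, at
// lambda = 1/6, 1/2 and 5/6, i.e. x = 1.67, 5.0 and 8.33, all sharing the
// same theta.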
// Adds the unichars of the given shape_id to the vector of results. Any // unichar_id that is already present just has the fonts added to the // font set for that result without adding a new entry in the vector. // NOTE: it is assumed that the results are given to this function in order // of decreasing rating. // The unichar_map vector indicates the index of the results entry containing // each unichar, or -1 if the unichar is not yet included in results. void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating, GenericVector<int>* unichar_map, GenericVector<UnicharRating>* results) const { if (shape_rating.joined) { AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results); } if (shape_rating.broken) { AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results); } const Shape& shape = GetShape(shape_rating.shape_id); for (int u = 0; u < shape.size(); ++u) { int result_index = AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results); for (int f = 0; f < shape[u].font_ids.size(); ++f) { (*results)[result_index].fonts.push_back( ScoredFont(shape[u].font_ids[f], IntCastRounded(shape_rating.rating * MAX_INT16))); } } }
// Gets anything and everything with a non-NULL pointer, prescaled to a // given target_height (if 0, then the original image height), and aligned. // Also returns (if not NULL) the width and height of the scaled image. void ImageData::PreScale(int target_height, Pix** pix, int* scaled_width, int* scaled_height, GenericVector<TBOX>* boxes) const { int input_width = 0; int input_height = 0; Pix* src_pix = GetPix(); ASSERT_HOST(src_pix != NULL); input_width = pixGetWidth(src_pix); input_height = pixGetHeight(src_pix); if (target_height == 0) target_height = input_height; float im_factor = static_cast<float>(target_height) / input_height; if (scaled_width != NULL) *scaled_width = IntCastRounded(im_factor * input_width); if (scaled_height != NULL) *scaled_height = target_height; if (pix != NULL) { // Get the scaled image. pixDestroy(pix); *pix = pixScale(src_pix, im_factor, im_factor); if (scaled_width != NULL) *scaled_width = pixGetWidth(*pix); if (scaled_height != NULL) *scaled_height = pixGetHeight(*pix); } pixDestroy(&src_pix); if (boxes != NULL) { // Get the boxes. boxes->truncate(0); for (int b = 0; b < boxes_.size(); ++b) { TBOX box = boxes_[b]; box.scale(im_factor); boxes->push_back(box); } } }
// Converts an angle in radians (from ICOORD::angle or FCOORD::angle) to a // standard feature direction as an unsigned angle in 256ths of a circle // measured anticlockwise from (-1, 0). uint8_t FCOORD::binary_angle_plus_pi(double radians) { return Modulo(IntCastRounded((radians + M_PI) * 128.0 / M_PI), 256); }
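// Worked example: radians = 0 maps to (0 + pi) * 128 / pi = 128, i.e. half
// a turn anticlockwise from the (-1, 0) reference, which is the (1, 0)
// direction; radians = pi/2 maps to 192; and radians = pi gives 256, which
// Modulo(..., 256) wraps back to 0.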
/** * Sets up auto page segmentation, determines the orientation, and corrects it. * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to * facilitate testing. * photo_mask_pix is a pointer to a NULL pointer that will be filled on return * with the leptonica photo mask, which must be pixDestroyed by the caller. * to_blocks is an empty list that will be filled with (usually a single) * block that is used during layout analysis. This ugly API is required * because of the possibility of a unlv zone file. * TODO(rays) clean this up. * See AutoPageSeg for other arguments. * The returned ColumnFinder must be deleted after use. */ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) { int vertical_x = 0; int vertical_y = 1; TabVector_LIST v_lines; TabVector_LIST h_lines; ICOORD bleft(0, 0); ASSERT_HOST(pix_binary_ != NULL); if (tessedit_dump_pageseg_images) { pixa_debug_.AddPix(pix_binary_, "PageSegInput"); } // Leptonica is used to find the rule/separator lines in the input. LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_, &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines); if (tessedit_dump_pageseg_images) { pixa_debug_.AddPix(pix_binary_, "NoLines"); } // Leptonica is used to find a mask of the photo regions in the input. *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_); if (tessedit_dump_pageseg_images) { pixa_debug_.AddPix(pix_binary_, "NoImages"); } if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear(); // The rest of the algorithm uses the usual connected components. textord_.find_components(pix_binary_, blocks, to_blocks); TO_BLOCK_IT to_block_it(to_blocks); // There must be exactly one input block. // TODO(rays) handle new textline finding with a UNLV zone file. ASSERT_HOST(to_blocks->singleton()); TO_BLOCK* to_block = to_block_it.data(); TBOX blkbox = to_block->block->bounding_box(); ColumnFinder* finder = NULL; int estimated_resolution = source_resolution_; if (source_resolution_ == kMinCredibleResolution) { // Try to estimate resolution from typical body text size. int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor); if (res > estimated_resolution && res < kMaxCredibleResolution) { estimated_resolution = res; tprintf("Estimating resolution as %d\n", estimated_resolution); } } if (to_block->line_size >= 2) { finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(), blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model, textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x, vertical_y); finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block); if (equ_detect_) { equ_detect_->LabelSpecialText(to_block); } BLOBNBOX_CLIST osd_blobs; // osd_orientation is the number of 90 degree rotations to make the // characters upright. (See osdetect.h for precise definition.) // We want the text lines horizontal, (vertical text indicates vertical // textlines) which may conflict (eg vertically written CJK). 
int osd_orientation = 0; bool vertical_text = textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT; if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) { vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block, &osd_blobs); } if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) { GenericVector<int> osd_scripts; if (osd_tess != this) { // We are running osd as part of layout analysis, so constrain the // scripts to those allowed by *this. AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts); for (int s = 0; s < sub_langs_.size(); ++s) { AddAllScriptsConverted(sub_langs_[s]->unicharset, osd_tess->unicharset, &osd_scripts); } } os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess); if (pageseg_mode == PSM_OSD_ONLY) { delete finder; return NULL; } osd_orientation = osr->best_result.orientation_id; double osd_score = osr->orientations[osd_orientation]; double osd_margin = min_orientation_margin * 2; for (int i = 0; i < 4; ++i) { if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) { osd_margin = osd_score - osr->orientations[i]; } } int best_script_id = osr->best_result.script_id; const char* best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id); bool cjk = best_script_id == osd_tess->unicharset.han_sid() || best_script_id == osd_tess->unicharset.hiragana_sid() || best_script_id == osd_tess->unicharset.katakana_sid() || strcmp("Japanese", best_script_str) == 0 || strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0; if (cjk) { finder->set_cjk_script(true); } if (osd_margin < min_orientation_margin) { // The margin is weak. if (!cjk && !vertical_text && osd_orientation == 2) { // upside down latin text is improbable with such a weak margin. tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: " "Don't rotate.\n", osd_margin); osd_orientation = 0; } else { tprintf( "OSD: Weak margin (%.2f) for %d blob text block, " "but using orientation anyway: %d\n", osd_margin, osd_blobs.length(), osd_orientation); } } } osd_blobs.shallow_clear(); finder->CorrectOrientation(to_block, vertical_text, osd_orientation); } return finder; }
// Evaluate the vector in terms of coverage of its length by good-looking // box edges. A good-looking box is one where its nearest neighbour on the // inside is nearer than half the distance to its nearest neighbour on the // outside of the putative column. Bad boxes are removed from the line. // A second pass then further filters boxes by requiring that the gutter // width be a minimum fraction of the mean gutter along the line. void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) { bool debug = false; needs_evaluation_ = false; int length = endpt_.y() - startpt_.y(); if (length == 0 || boxes_.empty()) { percent_score_ = 0; Print("Zero length in evaluate"); return; } // Compute the mean box height. BLOBNBOX_C_IT it(&boxes_); int mean_height = 0; int height_count = 0; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* bbox = it.data(); const TBOX& box = bbox->bounding_box(); int height = box.height(); mean_height += height; ++height_count; } if (height_count > 0) mean_height /= height_count; int max_gutter = kGutterMultiple * mean_height; if (IsRagged()) { // Ragged edges face a tougher test in that the gap must always be within // the height of the blob. max_gutter = kGutterToNeighbourRatio * mean_height; } STATS gutters(0, max_gutter + 1); // Evaluate the boxes for their goodness, calculating the coverage as we go. // Remove boxes that are not good and shorten the list to the first and // last good boxes. int num_deleted_boxes = 0; bool text_on_image = false; int good_length = 0; const TBOX* prev_good_box = NULL; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* bbox = it.data(); const TBOX& box = bbox->bounding_box(); int mid_y = (box.top() + box.bottom()) / 2; if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) { if (!debug) { tprintf("After already deleting %d boxes, ", num_deleted_boxes); Print("Starting evaluation"); } debug = true; } // A good box is one where the nearest neighbour on the inside is closer // than half the distance to the nearest neighbour on the outside // (of the putative column). bool left = IsLeftTab(); int tab_x = XAtY(mid_y); int gutter_width; int neighbour_gap; finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width, &neighbour_gap); if (debug) { tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n", box.left(), box.bottom(), box.right(), box.top(), gutter_width, neighbour_gap); } // Now we can make the test. if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) { // A good box contributes its height to the good_length. good_length += box.top() - box.bottom(); gutters.add(gutter_width, 1); // Two good boxes together contribute the gap between them // to the good_length as well, as long as the gap is not // too big. if (prev_good_box != NULL) { int vertical_gap = box.bottom() - prev_good_box->top(); double size1 = sqrt(static_cast<double>(prev_good_box->area())); double size2 = sqrt(static_cast<double>(box.area())); if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2)) good_length += vertical_gap; if (debug) { tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n", vertical_gap, kMaxFillinMultiple * MIN(size1, size2), good_length); } } else { // Adjust the start to the first good box. SetYStart(box.bottom()); } prev_good_box = &box; if (bbox->flow() == BTFT_TEXT_ON_IMAGE) text_on_image = true; } else { // Get rid of boxes that are not good.
if (debug) { tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n", box.left(), box.bottom(), box.right(), box.top(), gutter_width, neighbour_gap); } it.extract(); ++num_deleted_boxes; } } if (debug) { Print("Evaluating:"); } // If there are any good boxes, do it again, except this time get rid of // boxes that have a gutter that is a small fraction of the mean gutter. // This filters out ends that run into a coincidental gap in the text. int search_top = endpt_.y(); int search_bottom = startpt_.y(); int median_gutter = IntCastRounded(gutters.median()); if (gutters.get_total() > 0) { prev_good_box = NULL; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* bbox = it.data(); const TBOX& box = bbox->bounding_box(); int mid_y = (box.top() + box.bottom()) / 2; // A good box is one where the gutter width is at least some constant // fraction of the mean gutter width. bool left = IsLeftTab(); int tab_x = XAtY(mid_y); int max_gutter = kGutterMultiple * mean_height; if (IsRagged()) { // Ragged edges face a tougher test in that the gap must always be // within the height of the blob. max_gutter = kGutterToNeighbourRatio * mean_height; } int gutter_width; int neighbour_gap; finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width, &neighbour_gap); // Now we can make the test. if (gutter_width >= median_gutter * kMinGutterFraction) { if (prev_good_box == NULL) { // Adjust the start to the first good box. SetYStart(box.bottom()); search_bottom = box.top(); } prev_good_box = &box; search_top = box.bottom(); } else { // Get rid of boxes that are not good. if (debug) { tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n", box.left(), box.bottom(), box.right(), box.top(), gutter_width, median_gutter); } it.extract(); ++num_deleted_boxes; } } } // If there has been a good box, adjust the end. if (prev_good_box != NULL) { SetYEnd(prev_good_box->top()); // Compute the percentage of the vector that is occupied by good boxes. int length = endpt_.y() - startpt_.y(); percent_score_ = 100 * good_length / length; if (num_deleted_boxes > 0) { needs_refit_ = true; FitAndEvaluateIfNeeded(vertical, finder); if (boxes_.empty()) return; } // Test the gutter over the whole vector, instead of just at the boxes. int required_shift; if (search_bottom > search_top) { search_bottom = startpt_.y(); search_top = endpt_.y(); } double min_gutter_width = kLineCountReciprocal / boxes_.length(); min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter; min_gutter_width *= mean_height; int max_gutter_width = IntCastRounded(min_gutter_width) + 1; if (median_gutter > max_gutter_width) max_gutter_width = median_gutter; int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, text_on_image, max_gutter_width, &required_shift); if (gutter_width < min_gutter_width) { if (debug) { tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n", gutter_width, min_gutter_width); } boxes_.shallow_clear(); percent_score_ = 0; } else if (debug) { tprintf("Final gutter %d, vs limit of %g, required shift = %d\n", gutter_width, min_gutter_width, required_shift); } } else { // There are no good boxes left, so score is 0. percent_score_ = 0; } if (debug) { Print("Evaluation complete:"); } }
WordFeature::WordFeature(const FCOORD& fcoord, uinT8 dir) : x_(IntCastRounded(fcoord.x())), y_(ClipToRange(IntCastRounded(fcoord.y()), 0, MAX_UINT8)), dir_(dir) { }