void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) { // Prepare all the blobs. GenericVector<BlobData> blobs; for (int w = 0; w < words.size(); ++w) { if (words[w].word->ratings != NULL && words[w].word->ratings->get(0, 0) == NULL) { for (int b = 0; b < words[w].word->chopped_word->NumBlobs(); ++b) { blobs.push_back(BlobData(b, this, *words[w].word)); } for (int s = 0; s < words[w].lang_words.size(); ++s) { const WERD_RES& word = words[w].lang_words[s]; for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) { blobs.push_back(BlobData(b, sub_langs_[s], word)); } } } } // Pre-classify all the blobs. if (tessedit_parallelize > 1) { #pragma omp parallel for num_threads(10) for (int b = 0; b < blobs.size(); ++b) { *blobs[b].choices = blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL); } } else { // TODO(AMD) parallelize this. for (int b = 0; b < blobs.size(); ++b) { *blobs[b].choices = blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL); } } }
// Given an initial estimate of line spacing (m_in) and the positions of each // baseline, computes the line spacing of the block more accurately in m_out, // and the corresponding intercept in c_out, and the number of spacings seen // in index_delta. Returns the error of fit to the line spacing model. // Uses a simple linear regression, but optimized the offset using the median. double BaselineBlock::FitLineSpacingModel( const GenericVector<double>& positions, double m_in, double* m_out, double* c_out, int* index_delta) { if (m_in == 0.0f || positions.size() < 2) { *m_out = m_in; *c_out = 0.0; if (index_delta != NULL) *index_delta = 0; return 0.0; } GenericVector<double> offsets; // Get the offset (remainder) linespacing for each line and choose the median. for (int i = 0; i < positions.size(); ++i) offsets.push_back(fmod(positions[i], m_in)); // Get the median offset. double median_offset = MedianOfCircularValues(m_in, &offsets); // Now fit a line to quantized line number and offset. LLSQ llsq; int min_index = MAX_INT32; int max_index = -MAX_INT32; for (int i = 0; i < positions.size(); ++i) { double y_pos = positions[i]; int row_index = IntCastRounded((y_pos - median_offset) / m_in); UpdateRange(row_index, &min_index, &max_index); llsq.add(row_index, y_pos); } // Get the refined line spacing. *m_out = llsq.m(); // Use the median offset rather than the mean. offsets.truncate(0); for (int i = 0; i < positions.size(); ++i) offsets.push_back(fmod(positions[i], *m_out)); // Get the median offset. if (debug_level_ > 2) { for (int i = 0; i < offsets.size(); ++i) tprintf("%d: %g\n", i, offsets[i]); } *c_out = MedianOfCircularValues(*m_out, &offsets); if (debug_level_ > 1) { tprintf("Median offset = %g, compared to mean of %g.\n", *c_out, llsq.c(*m_out)); } // Index_delta is the number of hypothesized line gaps present. 
if (index_delta != NULL) *index_delta = max_index - min_index; // Use the regression model's intercept to compute the error, as it may be // a full line-spacing in disagreement with the median. double rms_error = llsq.rms(*m_out, llsq.c(*m_out)); if (debug_level_ > 1) { tprintf("Linespacing of y=%g x + %g improved to %g x + %g, rms=%g\n", m_in, median_offset, *m_out, *c_out, rms_error); } return rms_error; }
// Parse a string of the form [~]<lang>[+[~]<lang>]*. // Langs with no prefix get appended to to_load, provided they // are not in there already. // Langs with ~ prefix get appended to not_to_load, provided they are not in // there already. void Tesseract::ParseLanguageString(const char* lang_str, GenericVector<STRING>* to_load, GenericVector<STRING>* not_to_load) { STRING remains(lang_str); while (remains.length() > 0) { // Find the start of the lang code and which vector to add to. const char* start = remains.string(); while (*start == '+') ++start; GenericVector<STRING>* target = to_load; if (*start == '~') { target = not_to_load; ++start; } // Find the index of the end of the lang code in string start. int end = strlen(start); const char* plus = strchr(start, '+'); if (plus != NULL && plus - start < end) end = plus - start; STRING lang_code(start); lang_code.truncate_at(end); STRING next(start + end); remains = next; // Check whether lang_code is already in the target vector and add. if (!IsStrInList(lang_code, *target)) { if (tessdata_manager_debug_level) tprintf("Adding language '%s' to list\n", lang_code.string()); target->push_back(lang_code); } } }
// Computes an estimate of the line spacing of the block from the median // of the spacings between adjacent overlapping textlines. void BaselineBlock::EstimateLineSpacing() { GenericVector<float> spacings; for (int r = 0; r < rows_.size(); ++r) { BaselineRow* row = rows_[r]; // Exclude silly lines. if (fabs(row->BaselineAngle()) > M_PI * 0.25) continue; // Find the first row after row that overlaps it significantly. const TBOX& row_box = row->bounding_box(); int r2; for (r2 = r + 1; r2 < rows_.size() && !row_box.major_x_overlap(rows_[r2]->bounding_box()); ++r2); if (r2 < rows_.size()) { BaselineRow* row2 = rows_[r2]; // Exclude silly lines. if (fabs(row2->BaselineAngle()) > M_PI * 0.25) continue; float spacing = row->SpaceBetween(*row2); spacings.push_back(spacing); } } // If we have at least one value, use it, otherwise leave the previous // value unchanged. if (!spacings.empty()) { line_spacing_ = spacings[spacings.choose_nth_item(spacings.size() / 2)]; if (debug_level_ > 1) tprintf("Estimate of linespacing = %g\n", line_spacing_); } }
// Fits straight line baselines and computes the skew angle from the // median angle. Returns true if a good angle is found. // If use_box_bottoms is false, baseline positions are formed by // considering the outlines of the blobs. bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) { if (non_text_block_) return false; GenericVector<double> angles; for (int r = 0; r < rows_.size(); ++r) { BaselineRow* row = rows_[r]; if (row->FitBaseline(use_box_bottoms)) { double angle = row->BaselineAngle(); angles.push_back(angle); } if (debug_level_ > 1) row->Print(); } if (!angles.empty()) { skew_angle_ = MedianOfCircularValues(M_PI, &angles); good_skew_angle_ = true; } else { skew_angle_ = 0.0f; good_skew_angle_ = false; } if (debug_level_ > 0) { tprintf("Initial block skew angle = %g, good = %d\n", skew_angle_, good_skew_angle_); } return good_skew_angle_; }
// C wrapper around TessBaseAPI::Init that accepts the variable names and
// values as parallel C arrays of vars_vec_size entries. The arrays are copied
// into STRING vectors before the call; if either array pointer is null, both
// vectors are passed empty. Returns whatever Init returns.
TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language,
                                        TessOcrEngineMode mode, char** configs, int configs_size,
                                        char** vars_vec, char** vars_values, size_t vars_vec_size,
                                        BOOL set_only_non_debug_params) {
  GenericVector<STRING> var_names;
  GenericVector<STRING> var_values;
  // Only read the arrays when both of them are present.
  const bool have_vars = vars_vec != nullptr && vars_values != nullptr;
  const size_t var_count = have_vars ? vars_vec_size : 0;
  for (size_t idx = 0; idx < var_count; ++idx) {
    var_names.push_back(STRING(vars_vec[idx]));
    var_values.push_back(STRING(vars_values[idx]));
  }
  return handle->Init(datapath, language, mode, configs, configs_size,
                      &var_names, &var_values, set_only_non_debug_params);
}
// Applies the box file based on the image name fname, and resegments // the words in the block_list (page), with: // blob-mode: one blob per line in the box file, words as input. // word/line-mode: one blob per space-delimited unit after the #, and one word // per line in the box file. (See comment above for box file format.) // If find_segmentation is true, (word/line mode) then the classifier is used // to re-segment words/lines to match the space-delimited truth string for // each box. In this case, the input box may be for a word or even a whole // text line, and the output words will contain multiple blobs corresponding // to the space-delimited input string. // With find_segmentation false, no classifier is needed, but the chopper // can still be used to correctly segment touching characters with the help // of the input boxes. // In the returned PAGE_RES, the WERD_RES are setup as they would be returned // from normal classification, ie. with a word, chopped_word, rebuild_word, // seam_array, denorm, box_word, and best_state, but NO best_choice or // raw_choice, as they would require a UNICHARSET, which we aim to avoid. // Instead, the correct_text member of WERD_RES is set, and this may be later // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords // is not required before calling ApplyBoxTraining. PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, bool find_segmentation, BLOCK_LIST *block_list) { GenericVector<TBOX> boxes; GenericVector<STRING> texts, full_texts; if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts, NULL)) { return NULL; // Can't do it. } int box_count = boxes.size(); int box_failures = 0; // Add an empty everything to the end. boxes.push_back(TBOX()); texts.push_back(STRING()); full_texts.push_back(STRING()); // In word mode, we use the boxes to make a word for each box, but // in blob mode we use the existing words and maximally chop them first. PAGE_RES* page_res = find_segmentation ? 
NULL : SetupApplyBoxes(boxes, block_list); clear_any_old_text(block_list); for (int i = 0; i < boxes.size() - 1; i++) { bool foundit = false; if (page_res != NULL) { if (i == 0) { foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], full_texts[i].string()); } else { foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], boxes[i + 1], full_texts[i].string()); } } else { foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], texts[i].string()); } if (!foundit) { box_failures++; ReportFailedBox(i, boxes[i], texts[i].string(), "FAILURE! Couldn't find a matching blob"); } } if (page_res == NULL) { // In word/line mode, we now maximally chop all the words and resegment // them with the classifier. page_res = SetupApplyBoxes(boxes, block_list); ReSegmentByClassification(page_res); } if (applybox_debug > 0) { tprintf("APPLY_BOXES:\n"); tprintf(" Boxes read from boxfile: %6d\n", box_count); if (box_failures > 0) tprintf(" Boxes failed resegmentation: %6d\n", box_failures); } TidyUp(page_res); return page_res; }
// Splits txt into its lines and returns them as separately allocated,
// null-terminated strings.
// A newline ends a line unless it is the very last character of txt. Each
// returned line except the last keeps its terminating newline, so joining the
// returned strings in order reproduces txt exactly.
// If txt contains no such newline, the single result is strDeepCpy(txt).
// Otherwise every returned string is allocated here with new[].
// NOTE(review): the caller presumably owns and must release the returned
// strings - confirm against the callers and against how strDeepCpy allocates.
GenericVector<char*> M_Utils::lineSplit(const char* txt) {
  const int txtlen = static_cast<int>(strlen(txt));
  GenericVector<char*> res;
  int line_start = 0;  // Index of the first character of the current line.
  // A newline in the final position does not start a new line, so the scan
  // stops one character short of the end.
  for (int i = 0; i < txtlen - 1; ++i) {
    if (txt[i] != '\n')
      continue;
    const int line_len = i - line_start + 1;  // Includes the newline.
    char* ln = new char[line_len + 1];  // +1 for null terminator
    memcpy(ln, txt + line_start, line_len);
    ln[line_len] = '\0';
    res.push_back(ln);
    // The next line begins after the newline that was just copied.
    line_start = i + 1;
  }
  if (res.empty()) {
    // No split point found: deep copy the whole string.
    res.push_back(strDeepCpy(txt));
    return res;
  }
  // Now just need to add the last line, which runs to the end of txt.
  const int tail_len = txtlen - line_start;
  char* tail = new char[tail_len + 1];
  memcpy(tail, txt + line_start, tail_len);
  tail[tail_len] = '\0';
  res.push_back(tail);
  return res;
}
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Constructor to build a WERD_CHOICE from the given string, which is
 * encoded using the given unicharset.
 * If the string cannot be encoded, the WERD_CHOICE is initialized with
 * init(8) and marked as bad with make_bad().
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset)
    : unicharset_(&unicharset) {
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  const bool encoded = unicharset.encode_string(src_string, true, &encoding,
                                                &lengths, NULL);
  if (!encoded) {
    // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
    return;
  }
  // Terminate the lengths so that they can be read as a C string.
  lengths.push_back('\0');
  const STRING src_lengths(&lengths[0]);
  this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
}
void ResultIterator::CalculateTextlineOrder( bool paragraph_is_ltr, const LTRResultIterator &resit, GenericVector<StrongScriptDirection> *dirs_arg, GenericVectorEqEq<int> *word_indices) const { GenericVector<StrongScriptDirection> dirs; GenericVector<StrongScriptDirection> *directions; directions = (dirs_arg != NULL) ? dirs_arg : &dirs; directions->truncate(0); // A LTRResultIterator goes strictly left-to-right word order. LTRResultIterator ltr_it(resit); ltr_it.RestartRow(); if (ltr_it.Empty(RIL_WORD)) return; do { directions->push_back(ltr_it.WordDirection()); } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE)); word_indices->truncate(0); CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices); }