Пример #1
0
void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
  // Prepare all the blobs.
  GenericVector<BlobData> blobs;
  for (int w = 0; w < words.size(); ++w) {
    if (words[w].word->ratings != NULL &&
        words[w].word->ratings->get(0, 0) == NULL) {
      for (int b = 0; b < words[w].word->chopped_word->NumBlobs(); ++b) {
        blobs.push_back(BlobData(b, this, *words[w].word));
      }
      for (int s = 0; s < words[w].lang_words.size(); ++s) {
        const WERD_RES& word = words[w].lang_words[s];
        for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
          blobs.push_back(BlobData(b, sub_langs_[s], word));
        }
      }
    }
  }
  // Pre-classify all the blobs.
  if (tessedit_parallelize > 1) {
    #pragma omp parallel for num_threads(10)
    for (int b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
    }
  } else {
    // TODO(AMD) parallelize this.
    for (int b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
    }
  }
}
Пример #2
0
// Given an initial estimate of line spacing (m_in) and the positions of each
// baseline, computes the line spacing of the block more accurately in m_out,
// and the corresponding intercept in c_out, and the number of spacings seen
// in index_delta. Returns the error of fit to the line spacing model.
// Uses a simple linear regression, but optimized the offset using the median.
double BaselineBlock::FitLineSpacingModel(
    const GenericVector<double>& positions, double m_in,
    double* m_out, double* c_out, int* index_delta) {
  if (m_in == 0.0f || positions.size() < 2) {
    *m_out = m_in;
    *c_out = 0.0;
    if (index_delta != NULL) *index_delta = 0;
    return 0.0;
  }
  GenericVector<double> offsets;
  // Get the offset (remainder) linespacing for each line and choose the median.
  for (int i = 0; i < positions.size(); ++i)
    offsets.push_back(fmod(positions[i], m_in));
  // Get the median offset.
  double median_offset = MedianOfCircularValues(m_in, &offsets);
  // Now fit a line to quantized line number and offset.
  LLSQ llsq;
  int min_index = MAX_INT32;
  int max_index = -MAX_INT32;
  for (int i = 0; i < positions.size(); ++i) {
    double y_pos = positions[i];
    int row_index = IntCastRounded((y_pos - median_offset) / m_in);
    UpdateRange(row_index, &min_index, &max_index);
    llsq.add(row_index, y_pos);
  }
  // Get the refined line spacing.
  *m_out = llsq.m();
  // Use the median offset rather than the mean.
  offsets.truncate(0);
  for (int i = 0; i < positions.size(); ++i)
    offsets.push_back(fmod(positions[i], *m_out));
  // Get the median offset.
  if (debug_level_ > 2) {
    for (int i = 0; i < offsets.size(); ++i)
      tprintf("%d: %g\n", i, offsets[i]);
  }
  *c_out = MedianOfCircularValues(*m_out, &offsets);
  if (debug_level_ > 1) {
    tprintf("Median offset = %g, compared to mean of %g.\n",
            *c_out, llsq.c(*m_out));
  }
  // Index_delta is the number of hypothesized line gaps present.
  if (index_delta != NULL)
    *index_delta = max_index - min_index;
  // Use the regression model's intercept to compute the error, as it may be
  // a full line-spacing in disagreement with the median.
  double rms_error = llsq.rms(*m_out, llsq.c(*m_out));
  if (debug_level_ > 1) {
    tprintf("Linespacing of y=%g x + %g improved to %g x + %g, rms=%g\n",
            m_in, median_offset, *m_out, *c_out, rms_error);
  }
  return rms_error;
}
Пример #3
0
// Parse a string of the form [~]<lang>[+[~]<lang>]*.
// Langs with no prefix get appended to to_load, provided they
// are not in there already.
// Langs with ~ prefix get appended to not_to_load, provided they are not in
// there already.
void Tesseract::ParseLanguageString(const char* lang_str,
                                    GenericVector<STRING>* to_load,
                                    GenericVector<STRING>* not_to_load) {
  STRING remains(lang_str);
  while (remains.length() > 0) {
    // Find the start of the lang code and which vector to add to.
    const char* start = remains.string();
    while (*start == '+')
      ++start;
    GenericVector<STRING>* target = to_load;
    if (*start == '~') {
      target = not_to_load;
      ++start;
    }
    // Find the index of the end of the lang code in string start.
    int end = strlen(start);
    const char* plus = strchr(start, '+');
    if (plus != NULL && plus - start < end)
      end = plus - start;
    STRING lang_code(start);
    lang_code.truncate_at(end);
    STRING next(start + end);
    remains = next;
    // Check whether lang_code is already in the target vector and add.
    if (!IsStrInList(lang_code, *target)) {
      if (tessdata_manager_debug_level)
        tprintf("Adding language '%s' to list\n", lang_code.string());
      target->push_back(lang_code);
    }
  }
}
Пример #4
0
// Computes an estimate of the line spacing of the block from the median
// of the spacings between adjacent overlapping textlines.
void BaselineBlock::EstimateLineSpacing() {
  GenericVector<float> spacings;
  for (int r = 0; r < rows_.size(); ++r) {
    BaselineRow* row = rows_[r];
    // Exclude silly lines.
    if (fabs(row->BaselineAngle()) > M_PI * 0.25) continue;
    // Find the first row after row that overlaps it significantly.
    const TBOX& row_box = row->bounding_box();
    int r2;
    for (r2 = r + 1; r2 < rows_.size() &&
         !row_box.major_x_overlap(rows_[r2]->bounding_box());
         ++r2);
    if (r2 < rows_.size()) {
      BaselineRow* row2 = rows_[r2];
      // Exclude silly lines.
      if (fabs(row2->BaselineAngle()) > M_PI * 0.25) continue;
      float spacing = row->SpaceBetween(*row2);
      spacings.push_back(spacing);
    }
  }
  // If we have at least one value, use it, otherwise leave the previous
  // value unchanged.
  if (!spacings.empty()) {
    line_spacing_ = spacings[spacings.choose_nth_item(spacings.size() / 2)];
    if (debug_level_ > 1)
      tprintf("Estimate of linespacing = %g\n", line_spacing_);
  }
}
Пример #5
0
// Fits straight line baselines and computes the skew angle from the
// median angle. Returns true if a good angle is found.
// If use_box_bottoms is false, baseline positions are formed by
// considering the outlines of the blobs.
bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {
  if (non_text_block_) return false;
  GenericVector<double> angles;
  for (int r = 0; r < rows_.size(); ++r) {
    BaselineRow* row = rows_[r];
    if (row->FitBaseline(use_box_bottoms)) {
      double angle = row->BaselineAngle();
      angles.push_back(angle);
    }
    if (debug_level_ > 1)
      row->Print();
  }

  if (!angles.empty()) {
    skew_angle_ = MedianOfCircularValues(M_PI, &angles);
    good_skew_angle_ = true;
  } else {
    skew_angle_ = 0.0f;
    good_skew_angle_ = false;
  }
  if (debug_level_ > 0) {
    tprintf("Initial block skew angle = %g, good = %d\n",
            skew_angle_, good_skew_angle_);
  }
  return good_skew_angle_;
}
Пример #6
0
TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language,
    TessOcrEngineMode mode, char** configs, int configs_size,
    char** vars_vec, char** vars_values, size_t vars_vec_size,
    BOOL set_only_non_debug_params)
{
    GenericVector<STRING> varNames;
    GenericVector<STRING> varValues;
    if (vars_vec != nullptr && vars_values != nullptr) {
        for (size_t i = 0; i < vars_vec_size; i++) {
            varNames.push_back(STRING(vars_vec[i]));
            varValues.push_back(STRING(vars_values[i]));
        }
    }

    return handle->Init(datapath, language, mode, configs, configs_size, &varNames, &varValues, set_only_non_debug_params);
}
Пример #7
0
// Applies the box file based on the image name fname, and resegments
// the words in the block_list (page), with:
// blob-mode: one blob per line in the box file, words as input.
// word/line-mode: one blob per space-delimited unit after the #, and one word
// per line in the box file. (See comment above for box file format.)
// If find_segmentation is true, (word/line mode) then the classifier is used
// to re-segment words/lines to match the space-delimited truth string for
// each box. In this case, the input box may be for a word or even a whole
// text line, and the output words will contain multiple blobs corresponding
// to the space-delimited input string.
// With find_segmentation false, no classifier is needed, but the chopper
// can still be used to correctly segment touching characters with the help
// of the input boxes.
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
// from normal classification, ie. with a word, chopped_word, rebuild_word,
// seam_array, denorm, box_word, and best_state, but NO best_choice or
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
// Instead, the correct_text member of WERD_RES is set, and this may be later
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
// is not required before calling ApplyBoxTraining.
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
                                bool find_segmentation,
                                BLOCK_LIST *block_list) {
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;
  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
                    NULL)) {
    return NULL;  // Can't do it.
  }

  int box_count = boxes.size();
  int box_failures = 0;
  // Add an empty everything to the end.
  boxes.push_back(TBOX());
  texts.push_back(STRING());
  full_texts.push_back(STRING());

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES* page_res = find_segmentation ?
      NULL : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < boxes.size() - 1; i++) {
    bool foundit = false;
    if (page_res != NULL) {
      if (i == 0) {
        foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
                                   full_texts[i].string());
      } else {
        foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
                                   boxes[i + 1], full_texts[i].string());
      }
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                 texts[i].string());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].string(),
                      "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == NULL) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
    if (box_failures > 0)
      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
  }
  TidyUp(page_res);
  return page_res;
}
Пример #8
0
GenericVector<char*> M_Utils::lineSplit(const char* txt) {
  int txtlen = (int)strlen(txt);
  // pass 1: find split points
  GenericVector<int> splitpoints;
  for(int i = 0; i < txtlen; i++) {
    if(txt[i] == '\n' && (i < (txtlen-1)))
      splitpoints.push_back(i);
  }
  // pass 2: iterate split points to do all the splitting
  int prevsplit = 0;
  GenericVector<char*> res;
  if(splitpoints.empty()) {
    // deep copy the string
    char* newstr = strDeepCpy(txt);
    res.push_back(newstr);
    return res;
  }
  for(int i = 0; i < splitpoints.length(); i++) {
    int split = splitpoints[i];
    int newstrsize = split-prevsplit;
    char* ln = new char[newstrsize+2]; // +1 for null terminator and +1 for newline
    for(int i = 0; i < newstrsize; i++)
      ln[i] = txt[prevsplit+i];
    ln[newstrsize] = '\n';
    ln[newstrsize+1] = '\0'; // null terminator
    res.push_back(ln);
    splitpoints.clear();
    prevsplit = split;
  }
  // now just need to add the last line
  int lastsplit = prevsplit;
  int newstrsize = txtlen - prevsplit;
  char* ln = new char[newstrsize+1];
  for(int i = 0; i < newstrsize; i++)
    ln[i] = txt[prevsplit+i];
  ln[newstrsize] = '\0';
  res.push_back(ln);
  return res;
}
Пример #9
0
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Constructor to build a WERD_CHOICE from the given string.
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset)
    : unicharset_(&unicharset){
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
    lengths.push_back('\0');
    STRING src_lengths = &lengths[0];
    this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
  } else {  // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
  }
}
Пример #10
0
void ResultIterator::CalculateTextlineOrder(
    bool paragraph_is_ltr,
    const LTRResultIterator &resit,
    GenericVector<StrongScriptDirection> *dirs_arg,
    GenericVectorEqEq<int> *word_indices) const {
  GenericVector<StrongScriptDirection> dirs;
  GenericVector<StrongScriptDirection> *directions;
  directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
  directions->truncate(0);

  // A LTRResultIterator goes strictly left-to-right word order.
  LTRResultIterator ltr_it(resit);
  ltr_it.RestartRow();
  if (ltr_it.Empty(RIL_WORD)) return;
  do {
    directions->push_back(ltr_it.WordDirection());
  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));

  word_indices->truncate(0);
  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
}