Beispiel #1
0
// Remove outlines that are a tiny fraction in either width or height
// of the word height.
void Textord::clean_small_noise_from_words(ROW *row) {
  WERD_IT word_it(row->word_list());
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD* word = word_it.data();
    int min_size = static_cast<int>(
      textord_noise_hfract * word->bounding_box().height() + 0.5);
    C_BLOB_IT blob_it(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB* blob = blob_it.data();
      C_OUTLINE_IT out_it(blob->out_list());
      for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
        C_OUTLINE* outline = out_it.data();
        outline->RemoveSmallRecursive(min_size, &out_it);
      }
      if (blob->out_list()->empty()) {
        delete blob_it.extract();
      }
    }
    if (word->cblob_list()->empty()) {
      if (!word_it.at_last()) {
        // The next word is no longer a fuzzy non space if it was before,
        // since the word before is about to be deleted.
        WERD* next_word = word_it.data_relative(1);
        if (next_word->flag(W_FUZZY_NON)) {
          next_word->set_flag(W_FUZZY_NON, false);
        }
      }
      delete word_it.extract();
    }
  }
}
Beispiel #2
0
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
  TO_ROW_IT to_row_it(rows);
  TO_ROW* row = to_row_it.data();
  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
  // to create the word.
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(row->blob_list());
  for (;!box_it.empty(); box_it.forward()) {
    BLOBNBOX* bblob= box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL) {
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    } else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
      delete bblob;
    }
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
                          static_cast<inT16>(row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word_it.add_after_then_move(word);
  ROW_IT row_it(real_rows);
  row_it.add_after_then_move(real_row);
}
Beispiel #3
0
//yangjing01 modified : 
bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows)
{
  TO_ROW_IT to_row_it(rows);
  ROW_IT row_it(real_rows);
  //to_real_row is the real row information of single row or single char mode
  TO_ROW* real_to_row = NULL;
  float row_max_height = 0.0;
  for (to_row_it.mark_cycle_pt();
    !to_row_it.cycled_list(); to_row_it.forward()){
    TO_ROW* row = to_row_it.data();
    float row_min_y = row->min_y();
    float row_max_y = row->max_y();
    float row_height = abs(row_max_y - row_min_y);
    if (real_to_row == NULL
      || row_height > row_max_height
      || fabs(row_height - row_max_height) < 1.0f){
      row_max_height = row_height;
      real_to_row = row;
    }
  }

  if (real_to_row == NULL){
    return false;
  }

  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(real_to_row->blob_list());
  for (; !box_it.empty(); box_it.forward()){
    BLOBNBOX* bblob = box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL){
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    }
    else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
    }
    delete bblob;
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size),
    static_cast<inT16>(real_to_row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word->set_flag(W_DONT_CHOP, one_blob);
  word_it.add_after_then_move(word);
  row_it.add_after_then_move(real_row);

  return true;
}
void match_current_words(WERD_RES_LIST &words, ROW *row) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    if ((!word->part_of_combo) && (word->outword == NULL))
      classify_word_pass2(word, row);
  }
}
Beispiel #5
0
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
      for (word_it.mark_cycle_pt();
           !word_it.cycled_list(); word_it.forward()) {
        word_it.data()->set_text("");
      }
    }
  }
}
Beispiel #6
0
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                    BLOCK* block) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;
  // Since we are not using PAGE_RES to iterate over words, we need to update
  // prev_word_best_choice_ before calling classify_word_pass2().
  prev_word_best_choice_ = NULL;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if ((!word->part_of_combo) && (word->box_word == NULL)) {
      classify_word_and_language(&Tesseract::classify_word_pass2,
                                 block, row, word);
    }
    prev_word_best_choice_ = word->best_choice;
  }
}
void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
    Word &word = AddWord();
    size_t index = 0;
    for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter)); 
        factor_it && (index < factorOrder.size()); 
        ++factor_it, ++index) {
      word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
    }
    if (index != factorOrder.size()) {
      TRACE_ERR( "[ERROR] Malformed input: '" << *word_it << "'" <<  std::endl
                 << "In '" << phraseString << "'" << endl
                 << "  Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl
                 << "  but instead received input with " << index << " factor(s).\n");
      abort();
    }
  }
}
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
  PBLOB_IT blob_it;
  inT16 word_length;
  inT16 score = 0;
  inT16 i;
  inT16 offset;
  const char *chs;
  float small_limit = bln_x_height * fixsp_small_outlines_size;

  if (!fixsp_fp_eval)
    return (eval_word_spacing (word_res_list));

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    word_length = word->reject_map.length ();
    chs = word->best_choice->string ().string ();
    if ((word->done ||
      word->tess_accepted) ||
      (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
      (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
      (word->best_choice->permuter () == USER_DAWG_PERM) ||
    (safe_dict_word (chs) > 0)) {
      blob_it.set_to_list (word->outword->blob_list ());
      for (i = 0, offset = 0; i < word_length;
           offset += word->best_choice->lengths()[i++], blob_it.forward ()) {
        if ((chs[offset] == ' ') ||
          (blob_noise_score (blob_it.data ()) < small_limit))
          score -= 1;            //penalise possibly erroneous non-space

        else if (word->reject_map[i].accepted ())
          score++;
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}
Beispiel #9
0
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
  inT16 word_length;
  inT16 score = 0;
  inT16 i;
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.
    word_length = word->reject_map.length();
    if (word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0) {
      TBLOB* blob = word->rebuild_word->blobs;
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      for (i = 0; i < word->best_choice->length() && blob != NULL;
           ++i, blob = blob->next) {
        if (word->best_choice->unichar_id(i) == space ||
            blob_noise_score(blob) < small_limit) {
          score -= 1;  // penalise possibly erroneous non-space
        } else if (word->reject_map[i].accepted()) {
          score++;
        }
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}
Beispiel #10
0
/**
 * break_noisiest_blob_word()
 * Find the word with the blob which looks like the worst noise.
 * Break the word into two, deleting the noise blob.
 */
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
  int worst_blob_index = -1;     // Noisiest blob of noisiest wd
  int blob_index;                // of wds noisiest blob
  float noise_score;             // of wds noisiest blob
  WERD_RES *word_res;
  C_BLOB_IT blob_it;
  C_BLOB_IT rej_cblob_it;
  C_BLOB_LIST new_blob_list;
  C_BLOB_IT new_blob_it;
  C_BLOB_IT new_rej_cblob_it;
  WERD *new_word;
  inT16 start_of_noise_blob;
  inT16 i;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    blob_index = worst_noise_blob(word_it.data(), &noise_score);
    if (blob_index > -1 && worst_noise_score > noise_score) {
      worst_noise_score = noise_score;
      worst_blob_index = blob_index;
      worst_word_it = word_it;
    }
  }
  if (worst_blob_index < 0) {
    words.clear();          // signal termination
    return;
  }

  /* Now split the worst_word_it */

  word_res = worst_word_it.data();

  /* Move blobs before noise blob to a new bloblist */

  new_blob_it.set_to_list(&new_blob_list);
  blob_it.set_to_list(word_res->word->cblob_list());
  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
    new_blob_it.add_after_then_move(blob_it.extract());
  }
  start_of_noise_blob = blob_it.data()->bounding_box().left();
  delete blob_it.extract();     // throw out noise blob

  new_word = new WERD(&new_blob_list, word_res->word);
  new_word->set_flag(W_EOL, FALSE);
  word_res->word->set_flag(W_BOL, FALSE);
  word_res->word->set_blanks(1);  // After break

  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
  for (;
       (!rej_cblob_it.empty() &&
        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
       rej_cblob_it.forward()) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
  }

  WERD_RES* new_word_res = new WERD_RES(new_word);
  new_word_res->combination = TRUE;
  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
}
Beispiel #11
0
/**
 * @name transform_to_next_perm()
 * Examines the current word list to find the smallest word gap size. Then walks
 * the word list closing any gaps of this size by either inserted new
 * combination words, or extending existing ones.
 *
 * The routine COULD be limited to stop it building words longer than N blobs.
 *
 * If there are no more gaps then it DELETES the entire list and returns the
 * empty list to cause termination.
 */
void transform_to_next_perm(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT prev_word_it(&words);
  WERD_RES *word;
  WERD_RES *prev_word;
  WERD_RES *combo;
  WERD *copy_word;
  inT16 prev_right = -MAX_INT16;
  TBOX box;
  inT16 gap;
  inT16 min_gap = MAX_INT16;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (!word->part_of_combo) {
      box = word->word->bounding_box();
      if (prev_right > -MAX_INT16) {
        gap = box.left() - prev_right;
        if (gap < min_gap)
          min_gap = gap;
      }
      prev_right = box.right();
    }
  }
  if (min_gap < MAX_INT16) {
    prev_right = -MAX_INT16;        // back to start
    word_it.set_to_list(&words);
    // Note: we can't use cycle_pt due to inserted combos at start of list.
    for (; (prev_right == -MAX_INT16) || !word_it.at_first();
         word_it.forward()) {
      word = word_it.data();
      if (!word->part_of_combo) {
        box = word->word->bounding_box();
        if (prev_right > -MAX_INT16) {
          gap = box.left() - prev_right;
          if (gap <= min_gap) {
            prev_word = prev_word_it.data();
            if (prev_word->combination) {
              combo = prev_word;
            } else {
              /* Make a new combination and insert before
               * the first word being joined. */
              copy_word = new WERD;
              *copy_word = *(prev_word->word);
              // deep copy
              combo = new WERD_RES(copy_word);
              combo->combination = TRUE;
              combo->x_height = prev_word->x_height;
              prev_word->part_of_combo = TRUE;
              prev_word_it.add_before_then_move(combo);
            }
            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
            if (word->combination) {
              combo->word->join_on(word->word);
              // Move blobs to combo
              // old combo no longer needed
              delete word_it.extract();
            } else {
              // Copy current wd to combo
              combo->copy_on(word);
              word->part_of_combo = TRUE;
            }
            combo->done = FALSE;
            combo->ClearResults();
          } else {
            prev_word_it = word_it;  // catch up
          }
        }
        prev_right = box.right();
      }
    }
  } else {
    words.clear();  // signal termination
  }
}