/**
 * dawg_permute_and_select
 *
 * Recursively explore all the possible character combinations in
 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
 * dawgs in the dawgs_ vector in parallel and discard invalid words.
 *
 * Allocate and return a WERD_CHOICE with the best valid word found.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *best_choice = new WERD_CHOICE();
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.length() == 0) return best_choice;
  DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
  DawgInfoVector *constraints =  new DawgInfoVector[char_choices.length() + 1];
  init_active_dawgs(&(active_dawgs[0]));
  init_constraints(&(constraints[0]));
  DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
                     &(active_dawgs[1]), &(constraints[1]),
                     (segment_penalty_dict_case_bad /
                      segment_penalty_dict_case_ok));
  WERD_CHOICE word(MAX_WERD_LENGTH);
  copy_hyphen_info(&word);
  // Discard rating and certainty of the hyphen base (if any).
  word.set_rating(0.0);
  word.set_certainty(0.0);
  if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
    delete[] active_dawgs;
    delete[] constraints;
    return best_choice;  // the word is too long to permute
  }
  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
                  char_choices, 0, NULL, &word, certainties,
                  &rating_limit, best_choice, &dawg_args);
  delete[] active_dawgs;
  delete[] constraints;
  return best_choice;
}
Exemple #2
0
/**
 * append_choices
 *
 * Checks to see whether or not the next choice is worth appending to
 * the word being generated. If so then keeps going deeper into the word.
 *
 * This function assumes that Dict::go_deeper_fxn_ is set.
 */
void Dict::append_choices(
    const char *debug,
    const BLOB_CHOICE_LIST_VECTOR &char_choices,
    const BLOB_CHOICE &blob_choice,
    int char_choice_index,
    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    WERD_CHOICE *word,
    float certainties[],
    float *limit,
    WERD_CHOICE *best_choice,
    int *attempts_left,
    void *more_args) {
  int word_ending =
    (char_choice_index == char_choices.length() - 1) ? true : false;

  // Deal with fragments.
  CHAR_FRAGMENT_INFO char_frag_info;
  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
                           blob_choice.certainty(), prev_char_frag_info, debug,
                           word_ending, &char_frag_info)) {
    return;  // blob_choice must be an invalid fragment
  }
  // Search the next letter if this character is a fragment.
  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
    permute_choices(debug, char_choices, char_choice_index + 1,
                    &char_frag_info, word, certainties, limit,
                    best_choice, attempts_left, more_args);
    return;
  }

  // Add the next unichar.
  float old_rating = word->rating();
  float old_certainty = word->certainty();
  uint8_t old_permuter = word->permuter();
  certainties[word->length()] = char_frag_info.certainty;
  word->append_unichar_id_space_allocated(
      char_frag_info.unichar_id, char_frag_info.num_fragments,
      char_frag_info.rating, char_frag_info.certainty);

  // Explore the next unichar.
  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
                          &char_frag_info, word_ending, word, certainties,
                          limit, best_choice, attempts_left, more_args);

  // Remove the unichar we added to explore other choices in it's place.
  word->remove_last_unichar_id();
  word->set_rating(old_rating);
  word->set_certainty(old_certainty);
  word->set_permuter(old_permuter);
}
Exemple #3
0
/**
 * dawg_permute_and_select
 *
 * Recursively explore all the possible character combinations in
 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
 * dawgs in the dawgs_ vector in parallel and discard invalid words.
 *
 * Allocate and return a WERD_CHOICE with the best valid word found.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
    return best_choice;
  DawgPositionVector *active_dawgs =
      new DawgPositionVector[char_choices.length() + 1];
  init_active_dawgs(&(active_dawgs[0]), true);
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);

  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
      char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
      &attempts_left, &dawg_args);
  delete[] active_dawgs;
  return best_choice;
}
Exemple #4
0
/**
 * @name go_deeper_dawg_fxn
 *
 * If the choice being composed so far could be a dictionary word
 * keep exploring choices.
 */
void Dict::go_deeper_dawg_fxn(
    const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
    int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
    WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
  DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args);
  word_ending = (char_choice_index == char_choices.size()-1);
  int word_index = word->length() - 1;
  if (best_choice->rating() < *limit) return;
  // Look up char in DAWG

  // If the current unichar is an ngram first try calling
  // letter_is_okay() for each unigram it contains separately.
  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
  bool checked_unigrams = false;
  if (getUnicharset().get_isngram(orig_uch_id)) {
    if (dawg_debug_level) {
      tprintf("checking unigrams in an ngram %s\n",
              getUnicharset().debug_str(orig_uch_id).string());
    }
    int num_unigrams = 0;
    word->remove_last_unichar_id();
    GenericVector<UNICHAR_ID> encoding;
    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
    // Since the string came out of the unicharset, failure is impossible.
    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
                                              nullptr));
    bool unigrams_ok = true;
    // Construct DawgArgs that reflect the current state.
    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
    DawgPositionVector unigram_updated_dawgs;
    DawgArgs unigram_dawg_args(&unigram_active_dawgs,
                               &unigram_updated_dawgs,
                               more_args->permuter);
    // Check unigrams in the ngram with letter_is_okay().
    for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
      UNICHAR_ID uch_id = encoding[i];
      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
      ++num_unigrams;
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
      unigrams_ok = (this->*letter_is_okay_)(
          &unigram_dawg_args,
          word->unichar_id(word_index+num_unigrams-1),
          word_ending && i == encoding.size() - 1);
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
      if (dawg_debug_level) {
        tprintf("unigram %s is %s\n",
                getUnicharset().debug_str(uch_id).string(),
                unigrams_ok ? "OK" : "not OK");
      }
    }
    // Restore the word and copy the updated dawg state if needed.
    while (num_unigrams-- > 0) word->remove_last_unichar_id();
    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
    if (unigrams_ok) {
      checked_unigrams = true;
      more_args->permuter = unigram_dawg_args.permuter;
      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
    }
  }

  // Check which dawgs from the dawgs_ vector contain the word
  // up to and including the current unichar.
  if (checked_unigrams || (this->*letter_is_okay_)(
      more_args, word->unichar_id(word_index), word_ending)) {
    // Add a new word choice
    if (word_ending) {
      if (dawg_debug_level) {
        tprintf("found word = %s\n", word->debug_string().string());
      }
      if (strcmp(output_ambig_words_file.string(), "") != 0) {
        if (output_ambig_words_file_ == nullptr) {
          output_ambig_words_file_ =
              fopen(output_ambig_words_file.string(), "wb+");
          if (output_ambig_words_file_ == nullptr) {
            tprintf("Failed to open output_ambig_words_file %s\n",
                    output_ambig_words_file.string());
            exit(1);
          }
          STRING word_str;
          word->string_and_lengths(&word_str, nullptr);
          word_str += " ";
          fprintf(output_ambig_words_file_, "%s", word_str.string());
        }
        STRING word_str;
        word->string_and_lengths(&word_str, nullptr);
        word_str += " ";
        fprintf(output_ambig_words_file_, "%s", word_str.string());
      }
      WERD_CHOICE *adjusted_word = word;
      adjusted_word->set_permuter(more_args->permuter);
      update_best_choice(*adjusted_word, best_choice);
    } else {  // search the next letter
      // Make updated_* point to the next entries in the DawgPositionVector
      // arrays (that were originally created in dawg_permute_and_select)
      ++(more_args->updated_dawgs);
      // Make active_dawgs and constraints point to the updated ones.
      ++(more_args->active_dawgs);
      permute_choices(debug, char_choices, char_choice_index + 1,
                      prev_char_frag_info, word, certainties, limit,
                      best_choice, attempts_left, more_args);
      // Restore previous state to explore another letter in this position.
      --(more_args->updated_dawgs);
      --(more_args->active_dawgs);
    }
  } else {
      if (dawg_debug_level) {
        tprintf("last unichar not OK at index %d in %s\n",
                word_index, word->debug_string().string());
    }
  }
}
/**
 * @name go_deeper_dawg_fxn
 *
 * If the choice being composed so far could be a dictionary word
 * keep exploring choices.
 *
 * There are two modes for deciding whether to go deeper: regular dawg
 * permuter mode and the special ambigs mode. If *limit is <= 0.0 the
 * function switches to the ambigs mode (this is the case when
 * dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
 * only searches for the first choice that has a rating better than *limit
 * (in this case ratings are fake, since the real ratings can not be < 0).
 * Modification of the hyphen state is turned off in the ambigs mode.
 * When in the regular dawg permuter mode, the function explores all the
 * possible words and chooses the one with the best rating. The letters with
 * ratings that are far worse than the ones seen so far are pruned out.
 */
void Dict::go_deeper_dawg_fxn(
    const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
    int char_choice_index,
    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    bool word_ending, WERD_CHOICE *word, float certainties[],
    float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
  int word_index = word->length() - 1;

  bool ambigs_mode = (*limit <= 0.0);
  if (ambigs_mode) {
    if (best_choice->rating() < *limit) return;
  } else {
    // Prune bad subwords
    if (more_args->rating_array[word_index] == NO_RATING) {
      more_args->rating_array[word_index] = word->rating();
    } else {
      float permdawg_limit = more_args->rating_array[word_index] *
        more_args->rating_margin + kPermDawgRatingPad;
      if (permdawg_limit < word->rating()) {
        if (segment_dawg_debug) {
          tprintf("early pruned word rating=%4.2f,"
                  " permdawg_limit=%4.2f, word=%s\n", word->rating(),
                  permdawg_limit, word->debug_string(getUnicharset()).string());
        }
        return;
      }
    }
  }
  // Deal with hyphens
  if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
    if (segment_dawg_debug)
      tprintf("new hyphen choice = %s\n",
              word->debug_string(getUnicharset()).string());
    word->set_permuter(more_args->permuter);
    adjust_word(word, certainties);
    set_hyphen_word(*word, *(more_args->active_dawgs),
                    *(more_args->constraints));
    update_best_choice(*word, best_choice);
  } else {  // Look up char in DAWG
    // TODO(daria): update the rest of the code that specifies alternative
    // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
    // multi-byte unichars and/or unichar ids.

    // If the current unichar is an ngram first try calling
    // letter_is_okay() for each unigram it contains separately.
    UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
    bool checked_unigrams = false;
    if (getUnicharset().get_isngram(orig_uch_id)) {
      if (segment_dawg_debug) {
        tprintf("checking unigrams in an ngram %s\n",
                getUnicharset().debug_str(orig_uch_id).string());
      }
      int orig_num_fragments = word->fragment_length(word_index);
      int num_unigrams = 0;
      word->remove_last_unichar_id();
      const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
      const char *ngram_str_end = ngram_str + strlen(ngram_str);
      const char *ngram_ptr = ngram_str;
      bool unigrams_ok = true;
      // Construct DawgArgs that reflect the current state.
      DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
      DawgInfoVector unigram_constraints = *(more_args->constraints);
      DawgInfoVector unigram_updated_active_dawgs;
      DawgInfoVector unigram_updated_constraints;
      DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
                                 &unigram_updated_active_dawgs,
                                 &unigram_updated_constraints, 0.0);
      unigram_dawg_args.permuter = more_args->permuter;
      // Check unigrams in the ngram with letter_is_okay().
      while (unigrams_ok && ngram_ptr < ngram_str_end) {
        int step = getUnicharset().step(ngram_ptr);
        UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
            getUnicharset().unichar_to_id(ngram_ptr, step);
        ngram_ptr += step;
        ++num_unigrams;
        word->append_unichar_id(uch_id, 1, 0.0, 0.0);
        unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
            &unigram_dawg_args, word_index+num_unigrams-1, word,
            word_ending && (ngram_ptr == ngram_str_end));
        (*unigram_dawg_args.active_dawgs) =
          *(unigram_dawg_args.updated_active_dawgs);
        (*unigram_dawg_args.constraints) =
          *(unigram_dawg_args.updated_constraints);
        if (segment_dawg_debug) {
          tprintf("unigram %s is %s\n",
                  getUnicharset().debug_str(uch_id).string(),
                  unigrams_ok ? "OK" : "not OK");
        }
      }
      // Restore the word and copy the updated dawg state if needed.
      while (num_unigrams-- > 0) word->remove_last_unichar_id();
      word->append_unichar_id_space_allocated(
          orig_uch_id, orig_num_fragments, 0.0, 0.0);
      if (unigrams_ok) {
        checked_unigrams = true;
        more_args->permuter = unigram_dawg_args.permuter;
        *(more_args->updated_active_dawgs) =
          *(unigram_dawg_args.updated_active_dawgs);
        *(more_args->updated_constraints) =
          *(unigram_dawg_args.updated_constraints);
      }
    }

    // Check which dawgs from dawgs_ vector contain the word
    // up to and including the current unichar.
    if (checked_unigrams ||
        (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
      // Add a new word choice
      if (word_ending) {
        if (segment_dawg_debug) {
          tprintf("found word = %s\n",
                  word->debug_string(getUnicharset()).string());
        }
        WERD_CHOICE *adjusted_word = word;
        WERD_CHOICE hyphen_tail_word;
        if (!ambigs_mode && hyphen_base_size() > 0) {
          hyphen_tail_word = *word;
          remove_hyphen_head(&hyphen_tail_word);
          adjusted_word = &hyphen_tail_word;
        }
        adjusted_word->set_permuter(more_args->permuter);
        if (!ambigs_mode) {
          adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
        }
        update_best_choice(*adjusted_word, best_choice);
      } else {  // search the next letter
        // Make updated_* point to the next entries in the DawgInfoVector
        // arrays (that were originally created in dawg_permute_and_select)
        ++(more_args->updated_active_dawgs);
        ++(more_args->updated_constraints);
        // Make active_dawgs and constraints point to the updated ones.
        ++(more_args->active_dawgs);
        ++(more_args->constraints);
        permute_choices(debug, char_choices, char_choice_index + 1,
                        prev_char_frag_info, word, certainties, limit,
                        best_choice, more_args);
        // Restore previous state to explore another letter in this position.
        --(more_args->updated_active_dawgs);
        --(more_args->updated_constraints);
        --(more_args->active_dawgs);
        --(more_args->constraints);
      }
    } else {
      if (segment_dawg_debug) {
        tprintf("last unichar not OK at index %d in %s\n",
                word_index, word->debug_string(getUnicharset()).string());
      }
    }
  }
}