Beispiel #1
0
bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
                            XHeightConsistencyEnum xheight_consistency) {
  float CertaintyThreshold = stopper_nondict_certainty_base;
  int WordSize;

  if (stopper_no_acceptable_choices) return false;

  if (best_choice.length() == 0) return false;

  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
  bool is_case_ok = case_ok(best_choice, getUnicharset());

  if (stopper_debug_level >= 1) {
    const char *xht = "UNKNOWN";
    switch (xheight_consistency) {
      case XH_GOOD:  xht = "NORMAL"; break;
      case XH_SUBNORMAL:  xht = "SUBNORMAL"; break;
      case XH_INCONSISTENT:  xht = "INCONSISTENT"; break;
      default: xht = "UNKNOWN";
    }
    tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
            best_choice.unichar_string().string(),
            (is_valid_word ? 'y' : 'n'),
            (is_case_ok ? 'y' : 'n'),
            xht,
            best_choice.min_x_height(),
            best_choice.max_x_height());
  }
  // Do not accept invalid words in PASS1.
  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
  if (is_valid_word && is_case_ok) {
    WordSize = LengthOfShortestAlphaRun(best_choice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
            best_choice.rating(), best_choice.certainty(), CertaintyThreshold);

  if (no_dang_ambigs &&
      best_choice.certainty() > CertaintyThreshold &&
      xheight_consistency < XH_INCONSISTENT &&
      UniformCertainties(best_choice)) {
    return true;
  } else {
    if (stopper_debug_level >= 1) {
      tprintf("AcceptableChoice() returned false"
              " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
              no_dang_ambigs, best_choice.certainty(),
              CertaintyThreshold,
              UniformCertainties(best_choice));
    }
    return false;
  }
}
Beispiel #2
0
bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
  int WordSize;

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
            BestChoice.debug_string(getUnicharset()).string(),
            (valid_word(BestChoice) ? 'y' : 'n'),
            (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
            ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
  }

  if (BestChoice.length() == 0 || CurrentWordAmbig())
    return false;
  if (BestChoice.fragment_mark()) {
    if (stopper_debug_level >= 1) {
      cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
    }
    return false;
  }
  if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) {
    WordSize = LengthOfShortestAlphaRun(BestChoice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
      BestChoice.certainty(), CertaintyThreshold);

  if (BestChoice.certainty() > CertaintyThreshold &&
      !stopper_no_acceptable_choices) {
    if (stopper_debug_level >= 1)
      cprintf("ACCEPTED\n");
    return true;
  }
  else {
    if (stopper_debug_level >= 1)
      cprintf("REJECTED\n");
    return false;
  }
}
Beispiel #3
0
bool Dict::AcceptableResult(WERD_RES *word) const {
  if (word->best_choice == nullptr) return false;
  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
  int WordSize;

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            word->best_choice->debug_string().string(),
            (valid_word(*word->best_choice) ? 'y' : 'n'),
            (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
            word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
    return false;
  if (valid_word(*word->best_choice) &&
      case_ok(*word->best_choice, getUnicharset())) {
    WordSize = LengthOfShortestAlphaRun(*word->best_choice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
            word->best_choice->certainty(), CertaintyThreshold);

  if (word->best_choice->certainty() > CertaintyThreshold &&
      !stopper_no_acceptable_choices) {
    if (stopper_debug_level >= 1)
      tprintf("ACCEPTED\n");
    return true;
  } else {
    if (stopper_debug_level >= 1)
      tprintf("REJECTED\n");
    return false;
  }
}
Beispiel #4
0
/**
 * dawg_permute_and_select
 *
 * Recursively explore all the possible character combinations in
 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
 * dawgs in the dawgs_ vector in parallel and discard invalid words.
 *
 * Allocate and return a WERD_CHOICE with the best valid word found.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
    return best_choice;
  DawgPositionVector *active_dawgs =
      new DawgPositionVector[char_choices.length() + 1];
  init_active_dawgs(&(active_dawgs[0]), true);
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);

  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
      char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
      &attempts_left, &dawg_args);
  delete[] active_dawgs;
  return best_choice;
}
/**
 * @name adjust_word
 *
 * Assign an adjusted value to a string that is a word. The value
 * that this word choice has is based on case and punctuation rules.
 */
void Dict::adjust_word(WERD_CHOICE *word,
                       float *certainty_array) {
  float adjust_factor;
  float new_rating = word->rating();

  if (segment_dawg_debug) {
    tprintf("Word: %s %4.2f ",
            word->debug_string(getUnicharset()).string(), word->rating());
  }

  new_rating += RATING_PAD;
  if (Context::case_ok(*word, getUnicharset())) {
    if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
      word->set_permuter(FREQ_DAWG_PERM);
      new_rating *= segment_penalty_dict_frequent_word;
      adjust_factor = segment_penalty_dict_frequent_word;
      if (segment_dawg_debug)
        tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
    } else {
      new_rating *= segment_penalty_dict_case_ok;
      adjust_factor = segment_penalty_dict_case_ok;
      if (segment_dawg_debug)
        tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
    }
  } else {
    new_rating *= segment_penalty_dict_case_bad;
    adjust_factor = segment_penalty_dict_case_bad;
    if (segment_dawg_debug) {
      tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
    }
  }
  new_rating -= RATING_PAD;
  word->set_rating(new_rating);

  LogNewChoice(*word, adjust_factor, certainty_array, false);

  if (segment_dawg_debug)
    tprintf(" --> %4.2f\n", new_rating);
}
Beispiel #6
0
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
  int shortest = INT32_MAX;
  int curr_len = 0;
  for (int w = 0; w < WordChoice.length(); ++w) {
    if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
      curr_len++;
    } else if (curr_len > 0) {
      if (curr_len < shortest) shortest = curr_len;
      curr_len = 0;
    }
  }
  if (curr_len > 0 && curr_len < shortest) {
    shortest = curr_len;
  } else if (shortest == INT32_MAX) {
    shortest = 0;
  }
  return shortest;
}
Beispiel #7
0
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = nullptr;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it (new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
      werd_choice->print("ReplaceAmbig() ");
      tprintf("Modified blob_choices: ");
      print_ratings_list("\n", new_choices, getUnicharset());
  }
}
Beispiel #8
0
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                            DANGERR *fixpt,
                            bool fix_replaceable,
                            MATRIX *ratings) {
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
            best_choice->debug_string().string());
  }

  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
  // for each unichar id in BestChoice.
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  int i;
  bool ambigs_found = false;
  // For each position in best_choice:
  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
  // -- look for ambiguities corresponding to wrong_ngram in the list while
  //    adding the following unichar_ids from best_choice to wrong_ngram
  //
  // Repeat the above procedure twice: first time look through
  // ambigs to be replaced and replace all the ambiguities found;
  // second time look through dangerous ambiguities and construct
  // ambig_blob_choices with fake a blob choice for each ambiguity
  // and pass them to dawg_permute_and_select() to search for
  // ambiguous words in the dictionaries.
  //
  // Note that during the execution of the for loop (on the first pass)
  // if replacements are made the length of best_choice might change.
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
    bool replace = (fix_replaceable && pass == 0);
    const UnicharAmbigsVector &table = replace ?
      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
    if (!replace) {
      // Initialize ambig_blob_choices with lists containing a single
      // unichar id for the correspoding position in best_choice.
      // best_choice consisting from only the original letters will
      // have a rating of 0.0.
      for (i = 0; i < best_choice->length(); ++i) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        // TODO(rays/antonova) Put real xheights and y shifts here.
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
                                          0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
        ambig_blob_choices.push_back(lst);
      }
    }
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    int wrong_ngram_index;
    int next_index;
    int blob_index = 0;
    for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
         ++i) {
      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
      if (stopper_debug_level > 2) {
        tprintf("Looking for %s ngrams starting with %s:\n",
                replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).string());
      }
      int num_wrong_blobs = best_choice->state(i);
      wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
      if (curr_unichar_id == INVALID_UNICHAR_ID ||
          curr_unichar_id >= table.size() ||
          table[curr_unichar_id] == nullptr) {
        continue;  // there is no ambig spec for this unichar id
      }
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
                                                   ambig_spec->wrong_ngram);
        if (stopper_debug_level > 2) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }
        if (compare == 0) {
          // Record the place where we found an ambiguity.
          if (fixpt != nullptr) {
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
            fixpt->push_back(DANGERR_INFO(
                blob_index, blob_index + num_wrong_blobs, replace,
                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
                      blob_index + num_wrong_blobs, false,
                      getUnicharset().get_isngram(
                          ambig_spec->correct_ngram_id),
                      getUnicharset().id_to_unichar(leftmost_id));
            }
          }

          if (replace) {
            if (stopper_debug_level > 2) {
              tprintf("replace ambiguity with %s : ",
                      getUnicharset().id_to_unichar(
                          ambig_spec->correct_ngram_id));
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
                         ambig_spec->correct_ngram_id,
                         best_choice, ratings);
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // We found dang ambig - update ambig_blob_choices.
            if (stopper_debug_level > 2) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
                 ++tmp_index) {
              // Add a blob choice for the corresponding fragment of the
              // ambiguity. These fake blob choices are initialized with
              // negative ratings (which are not possible for real blob
              // choices), so that dawg_permute_and_select() considers any
              // word not consisting of only the original letters a better
              // choice and stops searching for alternatives once such a
              // choice is found.
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
                  -1, 0, 1, 0, BCC_AMBIG));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
              ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
            // Add the next unichar id to wrong_ngram and keep looking for
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
            wrong_ngram[++wrong_ngram_index] =
              best_choice->unichar_id(next_index);
            num_wrong_blobs += best_choice->state(next_index);
          } else {
            break;  // no more matching ambigs in this AMBIG_SPEC_LIST
          }
        } else {
          spec_it.forward();
        }
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs

  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
    if (stopper_debug_level > 2) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (i = 0; i < ambig_blob_choices.length(); ++i) {
        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf ("Stopper: Possible ambiguous word = %s\n",
                 alt_word->debug_string().string());
      }
      if (fixpt != nullptr) {
        // Note: Currently character choices combined from fragments can only
        // be generated by NoDangrousAmbigs(). This code should be updated if
        // the capability to produce classifications combined from character
        // fragments is added to other functions.
        int orig_i = 0;
        for (i = 0; i < alt_word->length(); ++i) {
          const UNICHARSET &uchset = getUnicharset();
          bool replacement_is_ngram =
              uchset.get_isngram(alt_word->unichar_id(i));
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
          if (replacement_is_ngram) {
            // we have to extract the leftmost unichar from the ngram.
            const char *str = uchset.id_to_unichar(leftmost_id);
            int step = uchset.step(str);
            if (step) leftmost_id = uchset.unichar_to_id(str, step);
          }
          int end_i = orig_i + alt_word->state(i);
          if (alt_word->state(i) > 1 ||
              (orig_i + 1 == end_i && replacement_is_ngram)) {
            // Compute proper blob indices.
            int blob_start = 0;
            for (int j = 0; j < orig_i; ++j)
              blob_start += best_choice->state(j);
            int blob_end = blob_start;
            for (int j = orig_i; j < end_i; ++j)
              blob_end += best_choice->state(j);
            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
                                          replacement_is_ngram, leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
                      true, replacement_is_ngram,
                      uchset.id_to_unichar(leftmost_id));
            }
          }
          orig_i += alt_word->state(i);
        }
      }
    }
    delete alt_word;
  }
  if (output_ambig_words_file_ != nullptr) {
    fprintf(output_ambig_words_file_, "\n");
  }

  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
}
Beispiel #9
0
// Returns true if in light of the current state the letter at word_index
// in the given word is allowed according to at least one of the dawgs in
// dawgs_.
//
// See more extensive comments in dict.h where this function is declared.
//
int Dict::def_letter_is_okay(void* void_dawg_args, int word_index,
                             const void *void_word, bool word_end) {
  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
  const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word);

  if (dawg_debug_level >= 3) {
    tprintf("def_letter_is_okay: word_index=%d word_end=%d"
            " word=%s num active dawgs=%d num constraints=%d\n",
            word_index, word_end,
            word->debug_string(getUnicharset()).string(),
            dawg_args->active_dawgs->length(),
            dawg_args->constraints->length());
  }

  // Do not accept words that contain kPatternUnicharID.
  // (otherwise pattern dawgs would not function correctly).
  // Do not accept words containing INVALID_UNICHAR_IDs.
  UNICHAR_ID unichar_id = word->unichar_id(word_index);
  if (unichar_id == Dawg::kPatternUnicharID ||
      unichar_id == INVALID_UNICHAR_ID) {
    dawg_args->permuter = NO_PERM;
    return NO_PERM;
  }

  // Initialization.
  PermuterType current_permuter = NO_PERM;
  dawg_args->updated_active_dawgs->clear();
  const DawgInfoVector &constraints = *(dawg_args->constraints);
  *dawg_args->updated_constraints = constraints;

  // Go over the active_dawgs vector and insert DawgInfo records with the
  // updated ref (an edge with the corresponding unichar id) into
  // dawg_args->updated_active_dawgs.
  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
    const DawgInfo &info = (*dawg_args->active_dawgs)[a];
    const Dawg *dawg = dawgs_[info.dawg_index];
    // Obtain unichar_id at this position (could be changed later, so this
    // needs to be inside the loop over all active dawgs).
     unichar_id = word->unichar_id(word_index);
    // The number dawg generalizes all digits to be kPatternUnicharID,
    // so try to match kPatternUnicharID if the current unichar is a digit.
    if (dawg->type() == DAWG_TYPE_NUMBER &&
        getUnicharset().get_isdigit(unichar_id)) {
      unichar_id = Dawg::kPatternUnicharID;
    }
    // Get the starting node for this letter.
    NODE_REF node;
    if (info.ref == NO_EDGE) {
      node = 0;  // beginning to explore this dawg
    } else {
      node = dawg->next_node(info.ref);
      if (node == 0) node = NO_EDGE;  // end of word
    }
    // Find the edge out of the node for the curent unichar_id.
    EDGE_REF edge = (node != NO_EDGE) ?
      dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE;

    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
              info.dawg_index, node, edge);
    }

    if (edge != NO_EDGE) {  // the unichar was found in the current dawg
      if (ConstraintsOk(*(dawg_args->updated_constraints),
                        word_end, dawg->type())) {
        UpdatePermuter(dawg->permuter(), &current_permuter);
        dawg_args->updated_active_dawgs->add_unique(
            DawgInfo(info.dawg_index, edge),
            "Append current dawg to updated active dawgs: ");
      }
    } else {                // the unichar was not found in the current dawg
      // Handle leading/trailing punctuation dawgs that denote a word pattern
      // as an edge with kPatternUnicharID. If such an edge is found we add a
      // constraint denoting the state of the dawg before the word pattern.
      // This constraint will be applied later when this dawg is found among
      // successor dawgs as well potentially at the end of the word.
      if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
        edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
        if (edge != NO_EDGE) {
          dawg_args->updated_constraints->add_unique(
              DawgInfo(info.dawg_index, edge), "Recording constraint: ");
        } else {
          // Do not explore successors of this dawg, since this
          // must be invalid leading or trailing punctuation.
          if (dawg_debug_level >= 3) {
            tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
          }
          continue;
        }
      }

      if (info.ref == NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("No letters matched in dawg %d\n", info.dawg_index);
        }
        continue;
      }

      // Discard the dawg if the pattern can not end at previous letter.
      if (edge == NO_EDGE &&  // previous part is not leading punctuation
          !dawg->end_of_word(info.ref)) {
        if (dawg_debug_level >= 3) {
          tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
        }
        continue;
      }

      // Look for the unichar in each of this dawg's successors
      // and append those in which it is found to active_dawgs.
      const SuccessorList &slist = *(successors_[info.dawg_index]);
      for (int s = 0; s < slist.length(); ++s) {
        int sdawg_index = slist[s];
        const Dawg *sdawg = dawgs_[sdawg_index];
        NODE_REF snode = 0;
        // Apply constraints to the successor dawg.
        for (int c = 0; c < constraints.length(); ++c) {
          // If the successor dawg is described in the constraints change
          // the start ref from 0 to the one recorded as the constraint.
          const DawgInfo &cinfo = constraints[c];
          if (cinfo.dawg_index == sdawg_index) {
            snode = sdawg->next_node(cinfo.ref);
            // Make sure we do not search the successor dawg if after
            // applying the saved constraint we are at the end of the word.
            if (snode == 0) snode = NO_EDGE;
            if (dawg_debug_level >= 3) {
               tprintf("Applying constraint [%d, " REFFORMAT "]\n",
                       sdawg_index, snode);
            }
          }
        }
        // Look for the letter in this successor dawg.
        EDGE_REF sedge = sdawg->edge_char_of(
            snode, word->unichar_id(word_index), word_end);
        // If we found the letter append sdawg to the active_dawgs list.
        if (sedge != NO_EDGE &&
            ConstraintsOk(*(dawg_args->updated_constraints), word_end,
                          dawgs_[sdawg_index]->type())) {
          UpdatePermuter(sdawg->permuter(), &current_permuter);
          if (sdawg->next_node(sedge) != 0) {  // if not word end
            dawg_args->updated_active_dawgs->add_unique(
              DawgInfo(sdawg_index, sedge),
              "Append successor to updated active dawgs: ");
          }
        }
      }  // end successors loop
    }  // end if/else
  }  // end for
  // Update dawg_args->permuter if it used to be NO_PERM or if we found
  // the current letter in a non-punctuation dawg. This allows preserving
  // information on which dawg the "core" word came from.
  if ((current_permuter == PUNC_PERM &&
       current_permuter > dawg_args->permuter) ||
      current_permuter != PUNC_PERM) {
    dawg_args->permuter = current_permuter;
  }
  return dawg_args->permuter;
}
Beispiel #10
0
/**
 * @name go_deeper_dawg_fxn
 *
 * If the choice being composed so far could be a dictionary word
 * keep exploring choices.
 */
void Dict::go_deeper_dawg_fxn(
    const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
    int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
    WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
  DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args);
  word_ending = (char_choice_index == char_choices.size()-1);
  int word_index = word->length() - 1;
  if (best_choice->rating() < *limit) return;
  // Look up char in DAWG

  // If the current unichar is an ngram first try calling
  // letter_is_okay() for each unigram it contains separately.
  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
  bool checked_unigrams = false;
  if (getUnicharset().get_isngram(orig_uch_id)) {
    if (dawg_debug_level) {
      tprintf("checking unigrams in an ngram %s\n",
              getUnicharset().debug_str(orig_uch_id).string());
    }
    int num_unigrams = 0;
    word->remove_last_unichar_id();
    GenericVector<UNICHAR_ID> encoding;
    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
    // Since the string came out of the unicharset, failure is impossible.
    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
                                              nullptr));
    bool unigrams_ok = true;
    // Construct DawgArgs that reflect the current state.
    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
    DawgPositionVector unigram_updated_dawgs;
    DawgArgs unigram_dawg_args(&unigram_active_dawgs,
                               &unigram_updated_dawgs,
                               more_args->permuter);
    // Check unigrams in the ngram with letter_is_okay().
    for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
      UNICHAR_ID uch_id = encoding[i];
      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
      ++num_unigrams;
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
      unigrams_ok = (this->*letter_is_okay_)(
          &unigram_dawg_args,
          word->unichar_id(word_index+num_unigrams-1),
          word_ending && i == encoding.size() - 1);
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
      if (dawg_debug_level) {
        tprintf("unigram %s is %s\n",
                getUnicharset().debug_str(uch_id).string(),
                unigrams_ok ? "OK" : "not OK");
      }
    }
    // Restore the word and copy the updated dawg state if needed.
    while (num_unigrams-- > 0) word->remove_last_unichar_id();
    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
    if (unigrams_ok) {
      checked_unigrams = true;
      more_args->permuter = unigram_dawg_args.permuter;
      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
    }
  }

  // Check which dawgs from the dawgs_ vector contain the word
  // up to and including the current unichar.
  if (checked_unigrams || (this->*letter_is_okay_)(
      more_args, word->unichar_id(word_index), word_ending)) {
    // Add a new word choice
    if (word_ending) {
      if (dawg_debug_level) {
        tprintf("found word = %s\n", word->debug_string().string());
      }
      if (strcmp(output_ambig_words_file.string(), "") != 0) {
        if (output_ambig_words_file_ == nullptr) {
          output_ambig_words_file_ =
              fopen(output_ambig_words_file.string(), "wb+");
          if (output_ambig_words_file_ == nullptr) {
            tprintf("Failed to open output_ambig_words_file %s\n",
                    output_ambig_words_file.string());
            exit(1);
          }
          STRING word_str;
          word->string_and_lengths(&word_str, nullptr);
          word_str += " ";
          fprintf(output_ambig_words_file_, "%s", word_str.string());
        }
        STRING word_str;
        word->string_and_lengths(&word_str, nullptr);
        word_str += " ";
        fprintf(output_ambig_words_file_, "%s", word_str.string());
      }
      WERD_CHOICE *adjusted_word = word;
      adjusted_word->set_permuter(more_args->permuter);
      update_best_choice(*adjusted_word, best_choice);
    } else {  // search the next letter
      // Make updated_* point to the next entries in the DawgPositionVector
      // arrays (that were originally created in dawg_permute_and_select)
      ++(more_args->updated_dawgs);
      // Make active_dawgs and constraints point to the updated ones.
      ++(more_args->active_dawgs);
      permute_choices(debug, char_choices, char_choice_index + 1,
                      prev_char_frag_info, word, certainties, limit,
                      best_choice, attempts_left, more_args);
      // Restore previous state to explore another letter in this position.
      --(more_args->updated_dawgs);
      --(more_args->active_dawgs);
    }
  } else {
      if (dawg_debug_level) {
        tprintf("last unichar not OK at index %d in %s\n",
                word_index, word->debug_string().string());
    }
  }
}
Beispiel #11
0
/**
 * @name fragment_state
 *
 * Given the current char choice and information about previously seen
 * fragments, determines whether adjacent character fragments are
 * present and whether they can be concatenated.
 *
 * The given prev_char_frag_info contains:
 * - fragment: if not nullptr contains information about immediately
 *   preceding fragmented character choice
 * - num_fragments: number of fragments that have been used so far
 *   to construct a character
 * - certainty: certainty of the current choice or minimum
 *   certainty of all fragments concatenated so far
 * - rating: rating of the current choice or sum of fragment
 *   ratings concatenated so far
 *
 * The output char_frag_info is filled in as follows:
 * - character: is set to be nullptr if the choice is a non-matching
 *   or non-ending fragment piece; is set to unichar of the given choice
 *   if it represents a regular character or a matching ending fragment
 * - fragment,num_fragments,certainty,rating are set as described above
 *
 * @returns false if a non-matching fragment is discovered, true otherwise.
 */
bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
                               float curr_rating, float curr_certainty,
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                               const char *debug, int word_ending,
                               CHAR_FRAGMENT_INFO *char_frag_info) {
  const CHAR_FRAGMENT *this_fragment =
    getUnicharset().get_fragment(curr_unichar_id);
  const CHAR_FRAGMENT *prev_fragment =
    prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;

  // Print debug info for fragments.
  if (debug && (prev_fragment || this_fragment)) {
    tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
            getUnicharset().debug_str(curr_unichar_id).string(),
            word_ending);
    if (prev_fragment) {
      tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
    }
    if (this_fragment) {
      tprintf("this_fragment %s\n", this_fragment->to_string().string());
    }
  }

  char_frag_info->unichar_id = curr_unichar_id;
  char_frag_info->fragment = this_fragment;
  char_frag_info->rating = curr_rating;
  char_frag_info->certainty = curr_certainty;
  char_frag_info->num_fragments = 1;
  if (prev_fragment && !this_fragment) {
    if (debug) tprintf("Skip choice with incomplete fragment\n");
    return false;
  }
  if (this_fragment) {
    // We are dealing with a fragment.
    char_frag_info->unichar_id = INVALID_UNICHAR_ID;
    if (prev_fragment) {
      if (!this_fragment->is_continuation_of(prev_fragment)) {
        if (debug) tprintf("Non-matching fragment piece\n");
        return false;
      }
      if (this_fragment->is_ending()) {
        char_frag_info->unichar_id =
          getUnicharset().unichar_to_id(this_fragment->get_unichar());
        char_frag_info->fragment = nullptr;
        if (debug) {
          tprintf("Built character %s from fragments\n",
                  getUnicharset().debug_str(
                      char_frag_info->unichar_id).string());
        }
      } else {
        if (debug) tprintf("Record fragment continuation\n");
        char_frag_info->fragment = this_fragment;
      }
      // Update certainty and rating.
      char_frag_info->rating =
        prev_char_frag_info->rating + curr_rating;
      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
      char_frag_info->certainty =
        MIN(curr_certainty, prev_char_frag_info->certainty);
    } else {
      if (this_fragment->is_beginning()) {
        if (debug) tprintf("Record fragment beginning\n");
      } else {
        if (debug) {
          tprintf("Non-starting fragment piece with no prev_fragment\n");
        }
        return false;
      }
    }
  }
  if (word_ending && char_frag_info->fragment) {
    if (debug) tprintf("Word can not end with a fragment\n");
    return false;
  }
  return true;
}
/**
 * @name go_deeper_dawg_fxn
 *
 * If the choice being composed so far could be a dictionary word
 * keep exploring choices.
 *
 * There are two modes for deciding whether to go deeper: regular dawg
 * permuter mode and the special ambigs mode. If *limit is <= 0.0 the
 * function switches to the ambigs mode (this is the case when
 * dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
 * only searches for the first choice that has a rating better than *limit
 * (in this case ratings are fake, since the real ratings can not be < 0).
 * Modification of the hyphen state is turned off in the ambigs mode.
 * When in the regular dawg permuter mode, the function explores all the
 * possible words and chooses the one with the best rating. The letters with
 * ratings that are far worse than the ones seen so far are pruned out.
 */
void Dict::go_deeper_dawg_fxn(
    const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
    int char_choice_index,
    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    bool word_ending, WERD_CHOICE *word, float certainties[],
    float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
  int word_index = word->length() - 1;

  bool ambigs_mode = (*limit <= 0.0);
  if (ambigs_mode) {
    if (best_choice->rating() < *limit) return;
  } else {
    // Prune bad subwords
    if (more_args->rating_array[word_index] == NO_RATING) {
      more_args->rating_array[word_index] = word->rating();
    } else {
      float permdawg_limit = more_args->rating_array[word_index] *
        more_args->rating_margin + kPermDawgRatingPad;
      if (permdawg_limit < word->rating()) {
        if (segment_dawg_debug) {
          tprintf("early pruned word rating=%4.2f,"
                  " permdawg_limit=%4.2f, word=%s\n", word->rating(),
                  permdawg_limit, word->debug_string(getUnicharset()).string());
        }
        return;
      }
    }
  }
  // Deal with hyphens
  if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
    if (segment_dawg_debug)
      tprintf("new hyphen choice = %s\n",
              word->debug_string(getUnicharset()).string());
    word->set_permuter(more_args->permuter);
    adjust_word(word, certainties);
    set_hyphen_word(*word, *(more_args->active_dawgs),
                    *(more_args->constraints));
    update_best_choice(*word, best_choice);
  } else {  // Look up char in DAWG
    // TODO(daria): update the rest of the code that specifies alternative
    // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
    // multi-byte unichars and/or unichar ids.

    // If the current unichar is an ngram first try calling
    // letter_is_okay() for each unigram it contains separately.
    UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
    bool checked_unigrams = false;
    if (getUnicharset().get_isngram(orig_uch_id)) {
      if (segment_dawg_debug) {
        tprintf("checking unigrams in an ngram %s\n",
                getUnicharset().debug_str(orig_uch_id).string());
      }
      int orig_num_fragments = word->fragment_length(word_index);
      int num_unigrams = 0;
      word->remove_last_unichar_id();
      const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
      const char *ngram_str_end = ngram_str + strlen(ngram_str);
      const char *ngram_ptr = ngram_str;
      bool unigrams_ok = true;
      // Construct DawgArgs that reflect the current state.
      DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
      DawgInfoVector unigram_constraints = *(more_args->constraints);
      DawgInfoVector unigram_updated_active_dawgs;
      DawgInfoVector unigram_updated_constraints;
      DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
                                 &unigram_updated_active_dawgs,
                                 &unigram_updated_constraints, 0.0);
      unigram_dawg_args.permuter = more_args->permuter;
      // Check unigrams in the ngram with letter_is_okay().
      while (unigrams_ok && ngram_ptr < ngram_str_end) {
        int step = getUnicharset().step(ngram_ptr);
        UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
            getUnicharset().unichar_to_id(ngram_ptr, step);
        ngram_ptr += step;
        ++num_unigrams;
        word->append_unichar_id(uch_id, 1, 0.0, 0.0);
        unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
            &unigram_dawg_args, word_index+num_unigrams-1, word,
            word_ending && (ngram_ptr == ngram_str_end));
        (*unigram_dawg_args.active_dawgs) =
          *(unigram_dawg_args.updated_active_dawgs);
        (*unigram_dawg_args.constraints) =
          *(unigram_dawg_args.updated_constraints);
        if (segment_dawg_debug) {
          tprintf("unigram %s is %s\n",
                  getUnicharset().debug_str(uch_id).string(),
                  unigrams_ok ? "OK" : "not OK");
        }
      }
      // Restore the word and copy the updated dawg state if needed.
      while (num_unigrams-- > 0) word->remove_last_unichar_id();
      word->append_unichar_id_space_allocated(
          orig_uch_id, orig_num_fragments, 0.0, 0.0);
      if (unigrams_ok) {
        checked_unigrams = true;
        more_args->permuter = unigram_dawg_args.permuter;
        *(more_args->updated_active_dawgs) =
          *(unigram_dawg_args.updated_active_dawgs);
        *(more_args->updated_constraints) =
          *(unigram_dawg_args.updated_constraints);
      }
    }

    // Check which dawgs from dawgs_ vector contain the word
    // up to and including the current unichar.
    if (checked_unigrams ||
        (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
      // Add a new word choice
      if (word_ending) {
        if (segment_dawg_debug) {
          tprintf("found word = %s\n",
                  word->debug_string(getUnicharset()).string());
        }
        WERD_CHOICE *adjusted_word = word;
        WERD_CHOICE hyphen_tail_word;
        if (!ambigs_mode && hyphen_base_size() > 0) {
          hyphen_tail_word = *word;
          remove_hyphen_head(&hyphen_tail_word);
          adjusted_word = &hyphen_tail_word;
        }
        adjusted_word->set_permuter(more_args->permuter);
        if (!ambigs_mode) {
          adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
        }
        update_best_choice(*adjusted_word, best_choice);
      } else {  // search the next letter
        // Make updated_* point to the next entries in the DawgInfoVector
        // arrays (that were originally created in dawg_permute_and_select)
        ++(more_args->updated_active_dawgs);
        ++(more_args->updated_constraints);
        // Make active_dawgs and constraints point to the updated ones.
        ++(more_args->active_dawgs);
        ++(more_args->constraints);
        permute_choices(debug, char_choices, char_choice_index + 1,
                        prev_char_frag_info, word, certainties, limit,
                        best_choice, more_args);
        // Restore previous state to explore another letter in this position.
        --(more_args->updated_active_dawgs);
        --(more_args->updated_constraints);
        --(more_args->active_dawgs);
        --(more_args->constraints);
      }
    } else {
      if (segment_dawg_debug) {
        tprintf("last unichar not OK at index %d in %s\n",
                word_index, word->debug_string(getUnicharset()).string());
      }
    }
  }
}