Пример #1
0
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
  if (!ids.contains(unichar_repr)) {
    if (strlen(unichar_repr) > UNICHAR_LEN) {
      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
              int(strlen(unichar_repr)), unichar_repr);
      return;
    }
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }

    strcpy(unichars[size_used].representation, unichar_repr);
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
    this->unichars[size_used].properties.fragment = frag;
    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
    }
    this->unichars[size_used].properties.enabled = true;
    ids.insert(unichar_repr, size_used);
    ++size_used;
  }
}
Пример #2
0
// Creates a fake blob choice from the combination of the given fragments.
// unichar is the class to be made from the combination,
// expanded_fragment_lengths[choice_index] is the number of fragments to use.
// old_choices[choice_index] has the classifier output for each fragment.
// choice index initially indexes the last fragment and should be decremented
// expanded_fragment_lengths[choice_index] times to get the earlier fragments.
// Guarantees to return something non-null, or abort!
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
        choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}
Пример #3
0
void UNICHARSET::unichar_insert(const char* const unichar_repr,
                                OldUncleanUnichars old_style) {
  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
  std::string cleaned =
      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
    const char* str = cleaned.c_str();
    GenericVector<int> encoding;
    if (!old_style_included_ &&
        encode_string(str, true, &encoding, nullptr, nullptr))
      return;
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }
    int index = 0;
    do {
      if (index >= UNICHAR_LEN) {
        fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
                unichar_repr);
        return;
      }
      unichars[size_used].representation[index++] = *str++;
    } while (*str != '\0');
    unichars[size_used].representation[index] = '\0';
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
    CHAR_FRAGMENT* frag =
        CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
    this->unichars[size_used].properties.fragment = frag;
    if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
    }
    this->unichars[size_used].properties.enabled = true;
    ids.insert(unichars[size_used].representation, size_used);
    ++size_used;
  }
}
Пример #4
0
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    step += UNICHAR::utf8_step(ptr + step);
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return NULL;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator
  int pos = 0;
  int total = 0;
  bool natural = false;
  char *end_ptr = NULL;
  for (int i = 0; i < 2; i++) {
    if (ptr > string + len || *ptr != kSeparator) {
      if (i == 1 && *ptr == kNaturalFlag)
        natural = true;
      else
        return NULL;  // Failed to parse fragment representation.
    }
    ptr++;  // move to the next character
    i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
      : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return NULL;  // malformed fragment representation
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total, natural);
  return fragment;
}
Пример #5
0
bool UNICHARSET::load_via_fgets(
    TessResultCallback2<char *, char *, int> *fgets_cb,
    bool skip_fragments) {
  int unicharset_size;
  char buffer[256];

  this->clear();
  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
      sscanf(buffer, "%d", &unicharset_size) != 1) {
    return false;
  }
  this->reserve(unicharset_size);
  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
    char unichar[256];
    unsigned int properties;
    char script[64];

    strcpy(script, null_script);
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    int min_width = 0;
    int max_width = MAX_INT16;
    int min_bearing = 0;
    int max_bearing = MAX_INT16;
    int min_advance = 0;
    int max_advance = MAX_INT16;
    // TODO(eger): check that this default it ok
    // after enabling BiDi iterator for Arabic+Cube.
    int direction = UNICHARSET::U_LEFT_TO_RIGHT;
    UNICHAR_ID other_case = id;
    UNICHAR_ID mirror = id;
    char normed[64];
    int v = -1;
    if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
        ((v = sscanf(buffer,
                     "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &min_width, &max_width, &min_bearing, &max_bearing,
                     &min_advance, &max_advance, script, &other_case,
                     &direction, &mirror, normed)) != 17 &&
         (v = sscanf(buffer,
                     "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &min_width, &max_width, &min_bearing, &max_bearing,
                     &min_advance, &max_advance,
                     script, &other_case, &direction, &mirror)) != 16 &&
          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
                      unichar, &properties,
                      &min_bottom, &max_bottom, &min_top, &max_top,
                      script, &other_case, &direction, &mirror)) != 10 &&
          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
                      &min_bottom, &max_bottom, &min_top, &max_top,
                      script, &other_case)) != 8 &&
          (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
                      script, &other_case)) != 4 &&
          (v = sscanf(buffer, "%s %x %63s",
                      unichar, &properties, script)) != 3 &&
          (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
      return false;
    }

    // Skip fragments if needed.
    CHAR_FRAGMENT *frag = NULL;
    if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
      int num_pieces = frag->get_total();
      delete frag;
      // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
      if (num_pieces > 1)
        continue;
    }
    // Insert unichar into unicharset and set its properties.
    if (strcmp(unichar, "NULL") == 0)
      this->unichar_insert(" ");
    else
      this->unichar_insert(unichar);

    this->set_isalpha(id, properties & ISALPHA_MASK);
    this->set_islower(id, properties & ISLOWER_MASK);
    this->set_isupper(id, properties & ISUPPER_MASK);
    this->set_isdigit(id, properties & ISDIGIT_MASK);
    this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
    this->set_isngram(id, false);
    this->set_script(id, script);
    this->unichars[id].properties.enabled = true;
    this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
    this->set_width_range(id, min_width, max_width);
    this->set_bearing_range(id, min_bearing, max_bearing);
    this->set_advance_range(id, min_advance, max_advance);
    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
    ASSERT_HOST(other_case < unicharset_size);
    this->set_other_case(id, (v>3) ? other_case : id);
    ASSERT_HOST(mirror < unicharset_size);
    this->set_mirror(id, (v>8) ? mirror : id);
    this->set_normed(id, (v>16) ? normed : unichar);
  }
  post_load_setup();
  return true;
}
/**
 * rebuild_current_state
 *
 * Evaluate the segmentation that is represented by this state in the
 * best first search.  Add this state to the "states_seen" list.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
    if (wordrec_display_segmentations) {
      print_state("Rebuiling state", state, num_joints);
    }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  int x = 0;
  int y;
  int i;
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs (blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < best_choice.length(); ++i) {
      *char_choices += NULL;
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuld is false and there are no fragments to merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = best_choice.unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed form original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list of for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
          new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                          rating, certainty, 0, NO_PERM);

        // Insert merged_blob into current_choices, such that current_choices
        // are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward());
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Update word_ptr and word_lengths_ptr.
      if (word_lengths_ptr != NULL && word_ptr != NULL) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    y = x - 1;
    x = y - search_state[i];
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return (char_choices);
}