Example #1
// Sets flags necessary for recognition in the training mode.
// Opens and returns the pointer to the output file.
FILE *Tesseract::init_recog_training(const STRING &fname) {
    if (tessedit_ambigs_training) {
        tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
        tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
        // Explore all segmentations.
        getDict().stopper_no_acceptable_choices.set_value(1);
    }

    STRING output_fname = fname;
    const char *lastdot = strrchr(output_fname.string(), '.');
    if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
    output_fname += ".txt";
    FILE *output_file = open_file(output_fname.string(), "a+");
    return output_file;
}
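A hypothetical call site (sketch; the filename is illustrative, not from the source): the function truncates fname at its last '.' and appends ".txt", so results accumulate in a sibling .txt file.

// Sketch only: the input name is an assumption for illustration.
FILE *out = init_recog_training(STRING("eng.unambig.tif"));
// -> appends recognition results to "eng.unambig.txt" (opened with "a+")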
Example #2
File: Main.c Project: kdmarrett/trie
int main(int argc, char** argv)
{
	FILE* fp; // declare dict file
	char* programName = argv[0];
	char* dictName = argv[1];
	char* input = (char*)malloc(sizeof(char) * MAXLINE);
	char* oldInput = (char*)malloc(sizeof(char) * MAXLINE);

	if ((input == NULL) || (oldInput == NULL))  {
		fprintf(stderr, MLCFAIL);
		// exit if allocation failed
		return 1;
	}
	if ((fp = fopen(dictName, "r")) == NULL)  {
		fprintf(stderr, "%s: %s: No such file or directory\n",
			programName, dictName);
		// exit if dict is not loaded properly
		return 1;
	}

	//build tree from dict
	struct node* root = newNode();
	root = getDict(fp, dictName, root);
	fclose(fp);

	// T9 interaction:
	printf("Enter \"exit\" to quit.\n");
	while (1)  {
		printf("Enter Key Sequence (or \"#\" for next word) :\n");
		scanf("%s", input);
		int len = strlen(input);
		if (!strncmp(input, "exit", len))  
			break;
		if (!strncmp(input, "#", 1))  
			strcat(oldInput, input);
		else
			strcpy(oldInput, input);
		char* word = searchTrie(root, oldInput, 
			strlen(oldInput), 0);
		printf("\t%s\n", word);
		free(word); //free alloc from appendquotes
	}  

	// clean up
	free(input);
	free(oldInput);
	deleteTrie(root);
	return 0; 
} 
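For context, a minimal sketch of the trie interface this main() relies on; newNode, getDict, searchTrie, and deleteTrie live elsewhere in the project, so the signatures and constants below are assumptions inferred from the call sites.

// Assumed interface (sketch, not the project's actual header).
#define MAXLINE 256                  /* line buffer size; value assumed */
#define MLCFAIL "malloc failure\n"   /* error message; text assumed */

struct node;                                      /* trie node */
struct node* newNode(void);                       /* allocate an empty node */
struct node* getDict(FILE* fp, char* dictName,
                     struct node* root);          /* insert every dict word */
char* searchTrie(struct node* root, char* keySeq,
                 int len, int skip);              /* heap-allocated match */
void deleteTrie(struct node* root);               /* free the whole trie */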
Example #3
/**
 * @name program_editup
 *
 * Initialize all the things in the program that need to be initialized.
 * init_permute determines whether to initialize the permute functions
 * and Dawg models.
 */
void Wordrec::program_editup(const char *textbase, bool init_permute) {
  if (textbase != NULL) {
    imagefile = textbase;
    /* Read in data files */
    edit_with_ocr(textbase);
  }

  /* Initialize subsystems */
  program_init();
  mfeature_init();  // assumes that imagefile is initialized
  if (init_permute)
    getDict().init_permute();
  setup_cp_maps();

  init_metrics();
  pass2_ok_split = chop_ok_split;
  pass2_seg_states = wordrec_num_seg_states;
}
Example #4
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar, also outputs all alternatives from blob_choices of the
// best choice.
void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
                                           ROW_RES *row_res,
                                           BLOCK_RES *block_res,
                                           const char *label,
                                           FILE *output_file) {
  int offset;
  // Classify word.
  fflush(stdout);
  classify_word_pass1(block_res->block, row_res->row, werd_res);
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != NULL);
  ASSERT_HOST(best_choice->blob_choices() != NULL);

  // Compute the number of unichars in the label.
  int label_num_unichars = 0;
  int step = 1;  // should be non-zero on the first iteration
  for (offset = 0; label[offset] != '\0' && step > 0;
       step = werd_res->uch_set->step(label + offset),
       offset += step, ++label_num_unichars);
  if (step == 0) {
    tprintf("Not outputting illegal unichar %s\n", label);
    return;
  }

  // Output all classifier choices for the unigrams (1->1 classifications).
  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
    BLOB_CHOICE_IT blob_choice_it;
    blob_choice_it.set_to_list(outer_blob_choice_it.data());
    for (blob_choice_it.mark_cycle_pt();
         !blob_choice_it.cycled_list();
         blob_choice_it.forward()) {
      BLOB_CHOICE *blob_choice = blob_choice_it.data();
      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
               unicharset.id_to_unichar(blob_choice->unichar_id()),
               label, blob_choice->rating(), blob_choice->certainty());
      }
    }
  }
  // Output raw choices for many->many and 1->many classifications.
  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
}
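The unichar-counting loop above is compact; here is a standalone sketch of the same pattern, with a plain UTF-8 step function standing in for UNICHARSET::step() (an assumption: the real step() also validates the sequence against the unicharset).

// Sketch: count complete characters, returning -1 on an illegal byte,
// mirroring the "illegal unichar" early return above.
static int utf8_step(const char *s) {
  const unsigned char c = static_cast<unsigned char>(*s);
  if (c < 0x80) return 1;            // single-byte (ASCII)
  if ((c & 0xE0) == 0xC0) return 2;  // 2-byte sequence
  if ((c & 0xF0) == 0xE0) return 3;  // 3-byte sequence
  if ((c & 0xF8) == 0xF0) return 4;  // 4-byte sequence
  return 0;                          // stray continuation byte: illegal
}

static int count_unichars(const char *label) {
  int offset = 0, count = 0, step = 1;
  for (; label[offset] != '\0' && step > 0;
       step = utf8_step(label + offset), offset += step, ++count) {}
  return step == 0 ? -1 : count;
}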
Example #5
File: skill.cpp Project: dhl/frePPLe
DECLARE_EXPORT void Skill::writeElement(XMLOutput *o, const Keyword& tag, mode m) const
{
  // Write a reference
  if (m == REFERENCE)
  {
    o->writeElement(tag, Tags::tag_name, getName());
    return;
  }

  // Write the head
  if (m != NOHEAD && m != NOHEADTAIL)
    o->BeginObject(tag, Tags::tag_name, XMLEscape(getName()));

  // Write source field
  o->writeElement(Tags::tag_source, getSource());

  // Write the custom fields
  PythonDictionary::write(o, getDict());

  // Write the tail
  if (m != NOHEADTAIL && m != NOTAIL) o->EndObject(tag);
}
Example #6
/**********************************************************************
 * get_piece_rating
 *
 * Check to see if this piece has already been classified.  If it has
 * return that rating.  Otherwise build the piece from the smaller
 * pieces, classify it, store the rating for later, and take the piece
 * apart again.
 **********************************************************************/
BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
                                            TBLOB *blobs,
                                            const DENORM& denorm,
                                            SEAMS seams,
                                            inT16 start,
                                            inT16 end,
                                            BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs,
                             denorm,
                             seams,
                             start,
                             end,
                             blamer_bundle);
    ratings->put(start, end, choices);
    if (wordrec_debug_level > 1) {
      tprintf("get_piece_rating(): updated ratings matrix\n");
      ratings->print(getDict().getUnicharset());
    }
  }
  return (choices);
}
Example #7
// Clear the document dictionary for this and all subclassifiers.
void Tesseract::ResetDocumentDictionary() {
  getDict().ResetDocumentDictionary();
  for (int i = 0; i < sub_langs_.size(); ++i) {
    sub_langs_[i]->getDict().ResetDocumentDictionary();
  }
}
Example #8
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(int32_t elasped_time) {
#ifndef DISABLED_LEGACY_ENGINE
  EndAdaptiveClassifier();
#endif  // ndef DISABLED_LEGACY_ENGINE
  getDict().End();
}
Example #9
/**
 * @name dict_word()
 *
 * Test the dictionaries, returning NO_PERM (0) if not found, or one
 * of the PermuterType values if found, according to the dictionary.
 */
int Wordrec::dict_word(const WERD_CHOICE &word) {
  return getDict().valid_word(word);
}
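A hypothetical caller (sketch): NO_PERM is 0, so any non-zero return means some dictionary matched, and the value itself says which permuter.

// Sketch only: the surrounding word setup is assumed.
int permuter = dict_word(*word->best_choice);
if (permuter == NO_PERM) {
  // not in any dictionary; treat the result as suspect
}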
Example #10
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs fixing up all
 * the data, and incrementally runs the segmentation search until a good word
 * is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
                         word, pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been yet set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
Example #11
File: tessbox.cpp Project: CDOcr/tesseract
/**
 * @name tess_acceptable_word
 *
 * @return true if the word is regarded as "good enough".
 * @param word the word to check; carries both the best choice (after
 *             context) and the raw choice (before context)
 */
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
  return getDict().AcceptableResult(word);
}
Example #12
/**
 * rebuild_current_state
 *
 * Transfers the given state to the word's output fields: rebuild_word,
 * best_state, box_word, and returns the corresponding blob choices.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    WERD_RES *word,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from the original fragment choices will always
  //    be added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward())
        choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
        choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return char_choices;
}
Example #13
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(inT32 elasped_time) {
  EndAdaptiveClassifier();
  getDict().End();
}
Example #14
BOOL8 Tesseract::word_adaptable(  //should we adapt?
                                WERD_RES *word,
                                uinT16 mode) {
  if (tessedit_adaption_debug) {
    if (word->best_choice == NULL) {
      tprintf("Running word_adaptable() for word with no best choice\n");
    } else {
      tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
              word->best_choice->unichar_string().string(),
              word->best_choice->rating(), word->best_choice->certainty());
    }
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit (ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit (ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {                  // If not set then
    return FALSE;                // ignore other checks
  }

  if (flags.bit (CHECK_DAWGS) &&
    (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
    (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
    (word->best_choice->permuter () != USER_DAWG_PERM) &&
    (word->best_choice->permuter () != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit (CHECK_SPACES) &&
    (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

//  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
  if (flags.bit (CHECK_AMBIG_WERD) &&
      !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  // Do not adapt to words that are composed from fragments if
  // tessedit_adapt_to_char_fragments is false.
  if (!tessedit_adapt_to_char_fragments) {
    const char *fragment_lengths = word->best_choice->fragment_lengths();
    if (fragment_lengths != NULL && *fragment_lengths != '\0') {
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (fragment_lengths[i] > 1) {
          if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
          return false;  // found a character composed from fragments
        }
      }
    }
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
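Because BITS16 is constructed directly from mode, each MODES enumerator acts as a bit index; a caller would compose the mask roughly as below (a sketch; the enum is local to word_adaptable here, so a real caller mirrors those values).

// Sketch: request the adaptability, dawg, and space checks.
uinT16 mode = (1 << ADAPTABLE_WERD) |  // bit 0
              (1 << CHECK_DAWGS)    |  // bit 2
              (1 << CHECK_SPACES);     // bit 3
if (word_adaptable(word, mode)) {
  // safe to adapt the classifier to this word
}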
Example #15
void
mkraddobjects()
{
    AcGePoint3d pt;
    if (RTNORM != acedGetPoint( NULL, "\nEnter position:", asDblArray (pt) ))
        return;

    AsdkMkrEntity* pEnt = new AsdkMkrEntity;
    if (NULL == pEnt)
        return;

    pEnt->setPos( pt );
    if (!append( pEnt )) {
        delete pEnt;
        return;
    }

    AcDbObjectId objId;
    AsdkMkrObject *pObj = new AsdkMkrObject;
    if (NULL == pObj) {
        pEnt->erase();
        pEnt->close();
        return;
    }

#ifdef DIRECT
    acdbHostApplicationServices()->workingDatabase()
        ->addAcDbObject( objId, pObj );
    pObj->close();
#else
#ifdef NOD
    AcDbDictionary* pMyDict = getDict( /*NOXLATE*/"ASDK_MYDICT", AcDb::kForWrite );
    if (NULL != pMyDict)
        pMyDict->setMergeStyle(AcDb::kDrcMangleName);
#else
    AcDbDictionary* pMyDict = getExtDict( pEnt, /*NOXLATE*/"ASDK_MYDICT", AcDb::kForWrite );
#endif  // NOD
    if (NULL == pMyDict) {
        delete pObj;
        pEnt->erase();
        pEnt->close();
        return;
    }

    Acad::ErrorStatus es;
    if (pMyDict->has( /*NOXLATE*/"MYENTRY" ))
        es = pMyDict->setAt( "*", pObj, objId );
    else
        es = pMyDict->setAt( /*NOXLATE*/"MYENTRY", pObj, objId );
    pMyDict->close();
    if (Acad::eOk == es)
        pObj->close();
    else {
        delete pObj;
        pEnt->erase();
        pEnt->close();
        return;
    }

#endif  // DIRECT
    pEnt->setId( objId );
    pEnt->close();

    acutPrintf( "\nEv'rything OK\n" );
}
Example #16
// Apply segmentation search to the given set of words, within the constraints
// of the existing ratings matrix. If there is already a best_choice on a word
// leaves it untouched and just sets the done/accepted etc flags.
void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
  // Run the segmentation search on the network outputs and make a BoxWord
  // for each of the output words.
  // If we drop a word as junk, then there is always a space in front of the
  // next.
  const Dict* stopper_dict = lstm_recognizer_->GetDict();
  if (stopper_dict == nullptr) stopper_dict = &getDict();
  bool any_nonspace_delimited = false;
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word = (*words)[w];
    if (word->best_choice != nullptr &&
        word->best_choice->ContainsAnyNonSpaceDelimited()) {
      any_nonspace_delimited = true;
      break;
    }
  }
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word = (*words)[w];
    if (word->best_choice == nullptr) {
      // If we are using the beam search, the unicharset had better match!
      word->SetupWordScript(unicharset);
      WordSearch(word);
    } else if (word->best_choice->unicharset() == &unicharset &&
               !lstm_recognizer_->IsRecoding()) {
      // We set up the word without using the dictionary, so set the permuter
      // now, but we can only do it because the unicharsets match.
      word->best_choice->set_permuter(
          getDict().valid_word(*word->best_choice, true));
    }
    if (word->best_choice == nullptr) {
      // It is a dud.
      word->SetupFake(lstm_recognizer_->GetUnicharset());
    } else {
      // Set the best state.
      for (int i = 0; i < word->best_choice->length(); ++i) {
        int length = word->best_choice->state(i);
        word->best_state.push_back(length);
      }
      word->reject_map.initialise(word->best_choice->length());
      word->tess_failed = false;
      word->tess_accepted = true;
      word->tess_would_adapt = false;
      word->done = true;
      word->tesseract = this;
      float word_certainty = MIN(word->space_certainty,
                                 word->best_choice->certainty());
      word_certainty *= kCertaintyScale;
      // Arbitrary ding factor for non-dictionary words.
      if (!lstm_recognizer_->IsRecoding() &&
          !Dict::valid_word_permuter(word->best_choice->permuter(), true))
        word_certainty -= kNonDictionaryPenalty;
      if (getDict().stopper_debug_level >= 1) {
        tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
                word->best_choice->certainty(), word->space_certainty,
                MIN(word->space_certainty, word->best_choice->certainty()) *
                    kCertaintyScale,
                word_certainty);
        word->best_choice->print();
      }
      // Discard words that are impossibly bad, but allow a bit more for
      // dictionary words, and keep bad words in non-space-delimited langs.
      if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
          any_nonspace_delimited ||
          (word_certainty >= kWorstDictCertainty &&
           Dict::valid_word_permuter(word->best_choice->permuter(), true))) {
        word->tess_accepted = stopper_dict->AcceptableResult(word);
      } else {
        if (getDict().stopper_debug_level >= 1) {
          tprintf("Deleting word with certainty %g\n", word_certainty);
          word->best_choice->print();
        }
        // It is a dud.
        word->SetupFake(lstm_recognizer_->GetUnicharset());
      }
      word->best_choice->set_certainty(word_certainty);
    }
  }
}
Example #17
/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results.  Find the
 * worst blob in the word and chop it up.  Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough.  Return the word level ratings.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
  TBLOB *blob;
  int index;
  int did_chopping;
  STATE state;
  BLOB_CHOICE_LIST *match_result;
  MATRIX *ratings = NULL;
  DANGERR fixpt;                 /*dangerous ambig */
  inT32 bit_count;               //no of bits

  set_denorm(&word->denorm);

  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();

  did_chopping = 0;
  for (blob = word->chopped_word->blobs, index = 0;
       blob != NULL; blob = blob->next, index++) {
    match_result = classify_blob(blob, "chop_word:", Green);
    if (match_result == NULL)
      cprintf("Null classifier output!\n");
    *char_choices += match_result;
  }
  bit_count = index - 1;
  set_n_ones(&state, char_choices->length() - 1);
  bool acceptable = false;
  bool replaced = false;
  bool best_choice_updated =
    getDict().permute_characters(*char_choices, word->best_choice,
                                 word->raw_choice);
  if (best_choice_updated &&
      getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
                                 CHOPPER_CALLER, &replaced)) {
    acceptable = true;
  }
  if (replaced)
    update_blob_classifications(word->chopped_word, *char_choices);
  CopyCharChoices(*char_choices, best_char_choices);
  if (!acceptable) {  // do more work to find a better choice
    did_chopping = 1;

    bool best_choice_acceptable = false;
    if (chop_enable)
      improve_by_chopping(word,
                          char_choices,
                          &state,
                          best_char_choices,
                          &fixpt,
                          &best_choice_acceptable);
    if (chop_debug)
      print_seams ("Final seam list:", word->seam_array);

    // The force_word_assoc is almost redundant to enable_assoc.  However,
    // it is not conditioned on the dict behavior.  For CJK, we need to force
    // the associator to be invoked.  When we figure out the exact behavior
    // of dict on CJK, we can remove the flag if it turns out to be redundant.
    if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
      ratings = word_associator(word, &state, best_char_choices,
                                &fixpt, &state);
    }
  }
  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
                                            ratings);
  if (ratings != NULL) {
    if (wordrec_debug_level > 0) {
      tprintf("Final Ratings Matrix:\n");
      ratings->print(getDict().getUnicharset());
    }
    ratings->delete_matrix_pointers();
    delete ratings;
  }
  getDict().FilterWordChoices();
  char_choices->delete_data_pointers();
  delete char_choices;

  return best_char_choices;
}
Example #18
/**********************************************************************
 * tess_acceptable_word
 *
 * Return true if the word is regarded as "good enough".
 **********************************************************************/
BOOL8 Tesseract::tess_acceptable_word(
    WERD_CHOICE *word_choice,  // after context
    WERD_CHOICE *raw_choice) {  // before context
  return getDict().AcceptableResult(*word_choice, *raw_choice);
}
Example #19
File: tface.cpp Project: 0xkasun/Dummy_Tes
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(inT32 elasped_time) {
  EndAdaptiveClassifier();
  blob_match_table.end_match_table();
  getDict().InitChoiceAccum();
  getDict().End();
}
Example #20
Wordrec::Wordrec() :
  // control parameters
  BOOL_MEMBER(merge_fragments_in_matrix, TRUE,
              "Merge the fragments in the ratings matrix and delete them"
              " after merging", params()),
  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
              params()),
  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
              params()),
  BOOL_MEMBER(force_word_assoc, FALSE,
              "force associator to run regardless of what enable_assoc is."
              "This is used for CJK where component grouping is necessary.",
              CCUtil::params()),
  INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
             CCUtil::params()),
  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
                params()),
  BOOL_MEMBER(fragments_guide_chopper, FALSE,
              "Use information from fragments to guide chopping process",
              params()),
  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
             params()),
  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
                params()),
  INT_MEMBER(chop_debug, 0, "Chop debug",
             params()),
  BOOL_MEMBER(chop_enable, 1, "Chop enable",
              params()),
  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
            params()),
  INT_MEMBER(chop_split_length, 10000, "Split Length",
             params()),
  INT_MEMBER(chop_same_distance, 2, "Same distance",
             params()),
  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
             params()),
  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
             params()),
  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
             params()),
  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
                params()),
  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
                params()),
  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
                params()),
  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
                params()),
  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
                params()),
  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
                params()),
  double_MEMBER(chop_good_split, 50.0, "Good split limit",
                params()),
  INT_MEMBER(chop_x_y_weight, 3, "X / Y  length weight",
             params()),
  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
             params()),
  BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE,
              "include fixed-pitch heuristics in char segmentation",
              params()),
  BOOL_MEMBER(use_new_state_cost, FALSE,
              "use new state cost heuristics for segmentation state evaluation",
              params()),
  double_MEMBER(heuristic_segcost_rating_base, 1.25,
                "base factor for adding segmentation cost into word rating."
                "It's a multiplying factor, the larger the value above 1, "
                "the bigger the effect of segmentation cost.",
                params()),
  double_MEMBER(heuristic_weight_rating, 1.0,
                "weight associated with char rating in combined cost of state",
                params()),
  double_MEMBER(heuristic_weight_width, 1000.0,
                "weight associated with width evidence in combined cost of"
                " state", params()),
  double_MEMBER(heuristic_weight_seamcut, 0.0,
                "weight associated with seam cut in combined cost of state",
                params()),
  double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
                "max char width-to-height ratio allowed in segmentation",
                params()),
  INT_MEMBER(wordrec_debug_level, 0,
             "Debug level for wordrec", params()),
  BOOL_MEMBER(wordrec_debug_blamer, false,
              "Print blamer debug messages", params()),
  BOOL_MEMBER(wordrec_run_blamer, false,
              "Try to set the blame for errors", params()),
  BOOL_MEMBER(enable_new_segsearch, true,
                   "Enable new segmentation search path.", params()),
  INT_MEMBER(segsearch_debug_level, 0,
             "SegSearch debug level", params()),
  INT_MEMBER(segsearch_max_pain_points, 2000,
             "Maximum number of pain points stored in the queue",
             params()),
  INT_MEMBER(segsearch_max_futile_classifications, 10,
             "Maximum number of pain point classifications per word that"
             "did not result in finding a better word choice.",
             params()),
  double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
                "Maximum character width-to-height ratio", params()),
  double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
                "Maximum character width-to-height ratio for"
                " fixed-pitch fonts",
                params()),
  BOOL_MEMBER(save_alt_choices, false,
              "Save alternative paths found during chopping"
              " and segmentation search",
              params()) {
  prev_word_best_choice_ = NULL;
  language_model_ = new LanguageModel(&get_fontinfo_table(),
                                      &(getDict()));
  pass2_seg_states = 0;
  num_joints = 0;
  num_pushed = 0;
  num_popped = 0;
  fill_lattice_ = NULL;
}
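For orientation, a sketch of the shape of these parameter macros (an assumption about their expansion, not a copy of the real header): each *_MEMBER entry is a member initializer that also registers the parameter, with its description string, in the given ParamsVectors.

// Sketch of the macro shape (see ccutil/params.h for the real definitions).
#define BOOL_MEMBER(name, val, comment, vec) name(val, comment, vec)
// e.g. BOOL_MEMBER(chop_enable, 1, "Chop enable", params())
//   expands to the initializer: chop_enable(1, "Chop enable", params())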
Example #21
/**
 * @name evaluate_state
 *
 * Evaluate the segmentation that is represented by this state in the
 * best-first search.  Add this state to the "states_seen" list.
 */
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt,
                              BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices;
  SEARCH_STATE chunk_groups;
  float rating_limit = the_search->best_choice->rating();
  bool keep_going = true;
  PIECES_STATE widths;

  the_search->num_states++;
  chunk_groups = bin_to_chunks(the_search->this_state,
                               the_search->num_joints);
  bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
  if (wordrec_debug_level > 1) {
    log_state("Evaluating state", the_search->num_joints,
              the_search->this_state);
  }
  getDict().LogNewSegmentation(widths);

  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
  getDict().SetWordsegRatingAdjustFactor(-1.0f);
  bool updated_best_choice = false;
  if (char_choices != NULL && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias);
    updated_best_choice =
      getDict().permute_characters(*char_choices,
                                   the_search->best_choice,
                                   the_search->raw_choice);
    bool replaced = false;
    if (updated_best_choice) {
      if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                     NULL, ASSOCIATOR_CALLER, &replaced)) {
        keep_going = false;
      }
      CopyCharChoices(*char_choices, the_search->best_char_choices);
    }
  }
  getDict().SetWordsegRatingAdjustFactor(-1.0f);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation (chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  if (rating_limit != the_search->best_choice->rating()) {
    ASSERT_HOST(updated_best_choice);
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else {
    ASSERT_HOST(!updated_best_choice);
    if (char_choices != NULL) fixpt->clear();
  }

  if (char_choices != NULL) delete char_choices;
  memfree(chunk_groups);

  return (keep_going);
}
Example #22
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantees that all the parents are up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
        ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
Example #23
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for the
 *                   epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  char unrecognised = STRING (unrecognised_char)[0];
  char ep_chars[32];             //Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];              //Only for unlv_tilde_crunch
  char map_chs[32];              //Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = unicharset.unichar_to_id(" ");
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      }
      else {
        map_chs[txt_index++] = '0';
        /*
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //type
        ep_chars[ep_chars_index++] = 2;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
                                 //end line
      ep_chars[ep_chars_index++] = newline_type;

                                 //Cos of the real newline
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->best_choice->populate_unichars(getDict().getUnicharset());
    word->reject_map.remove_pos (0);
    delete word->box_word;
    word->box_word = new BoxWord;
  }
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string(unicharset).string(),
             dict_word(*(word->best_choice)));
  }
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += unicharset.id_to_unichar(get_rep_char (word));
    wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
Example #24
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos starting position in choices
/// @param choices_length length of the choices array
/// @param target_text unichar ids of the text to find
/// @param text_index position in target_text to match from
/// @param rating accumulated rating of the partial match so far
/// @param segmentation blob counts of the partial match built so far
/// @param best_rating rating of the best complete match found so far
/// @param best_segmentation blob counts of the best complete match
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                              int choices_pos, int choices_length,
                              const GenericVector<UNICHAR_ID>& target_text,
                              int text_index,
                              float rating, GenericVector<int>* segmentation,
                              float* best_rating,
                              GenericVector<int>* best_segmentation) {
  const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      choice_rating = choice->rating();
      UNICHAR_ID class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table.
      if (class_id < table.size() && table[class_id] != NULL) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
             spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index])
            break;
        }
        if (!spec_it.cycled_list())
          break;  // Found an ambig.
      }
    }
    if (choice_it.cycled_list())
      continue;  // No match.
    segmentation->push_back(length);
    if (choices_pos + length == choices_length &&
        text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                rating + choice_rating, *best_rating, segmentation->size(),
                best_segmentation->size());
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_text.size()) {
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index]
                     ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text,
                    text_index + 1, rating + choice_rating, segmentation,
                    best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    segmentation->truncate(segmentation->size() - 1);
  }
}
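The recursion above enumerates every way of carving the blob sequence into one
piece per target character, keeping the lowest-rated complete match. A
condensed, self-contained sketch of that same pattern, using std::vector in
place of GenericVector and a stand-in Rating() function instead of real
BLOB_CHOICE lists (all names here are illustrative, not Tesseract API):

#include <cstdio>
#include <vector>

// Stand-in for the classifier lookup: scores matching target character
// text_index against the blob span [pos, pos + len). Lower is better;
// a negative value means "no match". Here any span of 1 or 2 blobs matches.
static float Rating(int pos, int len, int text_index) {
  (void)pos; (void)text_index;  // a real classifier would use these
  return (len <= 2) ? 1.0f / len : -1.0f;
}

// Mirrors SearchForText: try every span length at choices_pos, recurse on
// a match, and record the cheapest segmentation that consumes all blobs
// and all target characters at the same time.
static void Search(int choices_pos, int choices_length, int target_len,
                   int text_index, float rating, std::vector<int>* seg,
                   float* best_rating, std::vector<int>* best_seg) {
  for (int length = 1; choices_pos + length <= choices_length; ++length) {
    float r = Rating(choices_pos, length, text_index);
    if (r < 0.0f) continue;  // No match for this span.
    seg->push_back(length);
    if (choices_pos + length == choices_length &&
        text_index + 1 == target_len) {
      // Complete match: record it if it beats the best rating so far.
      if (best_seg->empty() || rating + r < *best_rating) {
        *best_seg = *seg;
        *best_rating = rating + r;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_len) {
      Search(choices_pos + length, choices_length, target_len,
             text_index + 1, rating + r, seg, best_rating, best_seg);
    }
    seg->pop_back();  // Equivalent of segmentation->truncate(size - 1).
  }
}

int main() {
  std::vector<int> seg, best_seg;
  float best_rating = 0.0f;
  Search(0, 6, 3, 0, 0.0f, &seg, &best_rating, &best_seg);
  for (int len : best_seg) printf("%d ", len);   // prints: 2 2 2
  printf("rating=%g\n", best_rating);            // rating=1.5
  return 0;
}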
Example #25
File: tessbox.cpp Project: CDOcr/tesseract
/**
 * @name tess_add_doc_word
 *
 * Add the given word to the document dictionary
 */
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
  getDict().add_document_word(*word_choice);
}
Example #26
/**
 * rebuild_current_state
 *
 * Evaluate the segmentation that is represented by this state in the
 * best first search.  Add this state to the "states_seen" list.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  int x = 0;
  int y;
  int i;
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs (blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < best_choice.length(); ++i) {
      *char_choices += NULL;
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments to merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = best_choice.unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from the original fragment choices will
  //    always be added to the new choices list for each character composed
  //    from fragments (even if the choice for the corresponding character
  //    already appears in the re-classified choices list for the newly
  //    merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
          new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                          rating, certainty, 0, NO_PERM);

        // Insert merged_blob into current_choices, such that current_choices
        // are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward());
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Update word_ptr and word_lengths_ptr.
      if (word_lengths_ptr != NULL && word_ptr != NULL) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    y = x - 1;
    x = y - search_state[i];
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return (char_choices);
}
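The expansion step in the middle of this function is easy to miss: each
per-character fragment count n is repeated n times, yielding one entry per
blob instead of one per character. A minimal standalone sketch of just that
expansion (hypothetical helper, not Tesseract API), reproducing the example
from the comment above:

#include <cstdio>
#include <string>
#include <vector>

// Expands per-character fragment counts to per-blob counts, e.g.
// {1 1 2 3 1} -> {1 1 2 2 3 3 3 1}.
static std::string ExpandFragmentLengths(const std::vector<char>& lengths) {
  std::string expanded;
  for (char n : lengths) {
    expanded.append(n, n);  // append n copies of the value n itself
  }
  return expanded;
}

int main() {
  std::string e = ExpandFragmentLengths({1, 1, 2, 3, 1});
  for (char c : e) printf("%d ", c);  // prints: 1 1 2 2 3 3 3 1
  printf("\n");
  return 0;
}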
Example #27
/**********************************************************************
 * select_blob_to_split
 *
 * These are the results of the last classification.  Find a likely
 * place to apply splits.  If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split(
    const GenericVector<BLOB_CHOICE*>& blob_choices,
    float rating_ceiling, bool split_next_to_fragment) {
  BLOB_CHOICE *blob_choice;
  int x;
  float worst = -MAX_FLOAT32;
  int worst_index = -1;
  float worst_near_fragment = -MAX_FLOAT32;
  int worst_index_near_fragment = -1;
  const CHAR_FRAGMENT **fragments = NULL;

  if (chop_debug) {
    if (rating_ceiling < MAX_FLOAT32)
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    else
      tprintf("rating_ceiling = No Limit\n");
  }

  if (split_next_to_fragment && blob_choices.size() > 0) {
    fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
    if (blob_choices[0] != NULL) {
      fragments[0] = getDict().getUnicharset().get_fragment(
          blob_choices[0]->unichar_id());
    } else {
      fragments[0] = NULL;
    }
  }

  for (x = 0; x < blob_choices.size(); ++x) {
    if (blob_choices[x] == NULL) {
      if (fragments != NULL) {
        delete[] fragments;
      }
      return x;
    } else {
      blob_choice = blob_choices[x];
      // Populate fragments for the following position.
      if (split_next_to_fragment && x+1 < blob_choices.size()) {
        if (blob_choices[x + 1] != NULL) {
          fragments[x + 1] = getDict().getUnicharset().get_fragment(
              blob_choices[x + 1]->unichar_id());
        } else {
          fragments[x + 1] = NULL;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          bool expand_following_fragment =
            (x + 1 < blob_choices.size() &&
             fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
          bool expand_preceding_fragment =
            (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              tprintf("worst_index_near_fragment=%d"
                      " expand_following_fragment=%d"
                      " expand_preceding_fragment=%d\n",
                      worst_index_near_fragment,
                      expand_following_fragment,
                      expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  if (fragments != NULL) {
    delete[] fragments;
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ?
    worst_index_near_fragment : worst_index;
}
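Note that "worst" here means the highest rating, since in Tesseract a lower
rating is a better classification. Stripped of the fragment bookkeeping, the
selection rule reduces to an argmax over ratings gated by two thresholds; a
hedged sketch of that core rule (illustrative names, not Tesseract API):

#include <vector>

// Returns the index of the worst-rated (highest-rating) choice that is
// still below the rating ceiling and the certainty threshold, or -1 if
// none qualifies -- the worst/worst_index bookkeeping from above.
static int WorstQualifyingIndex(const std::vector<float>& ratings,
                                const std::vector<float>& certainties,
                                float rating_ceiling,
                                float certainty_threshold) {
  int worst_index = -1;
  float worst = -1e30f;
  for (int x = 0; x < static_cast<int>(ratings.size()); ++x) {
    if (ratings[x] < rating_ceiling &&
        certainties[x] < certainty_threshold &&
        ratings[x] > worst) {
      worst = ratings[x];
      worst_index = x;
    }
  }
  return worst_index;
}

int main() {
  std::vector<float> r = {1.2f, 3.4f, 2.0f}, c = {-5.f, -9.f, -2.f};
  return WorstQualifyingIndex(r, c, 10.0f, -1.0f);  // picks index 1
}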
Example #28
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  GenericVector<SegSearchPending> pending;
  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
                   blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pain_points, &pending);
    }
    if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);

    if (blamer_bundle != NULL &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }
  // Keep trying to find a better path by fixing the "pain points".

  MATRIX_COORD pain_point;
  float pain_point_priority;
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
      (!SegSearchDone(num_futile_classifications) ||
          (blamer_bundle != NULL &&
              blamer_bundle->GuidedSegsearchStillGoing()))) {
    // Get the next valid "pain point".
    bool found_nothing = true;
    LMPainPointsType pp_type;
    while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
        LM_PPTYPE_NUM) {
      if (!pain_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            pain_point.row - pain_point.col + 1);
      }
      if (pain_point.Valid(*word_res->ratings) &&
          !word_res->ratings->Classified(pain_point.col, pain_point.row,
                                         getDict().WildcardID())) {
        found_nothing = false;
        break;
      }
    }
    if (found_nothing) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, pain_point,
                              LMPainPoints::PainPointDescription(pp_type),
                              &pending, word_res, &pain_points, blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
                         word_res, &pain_points, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle->updated = false;  // reset updated

    // See if it is time to terminate SegSearch, or to start a guided search
    // for the true path so that blame for an incorrect best_choice can be
    // assigned.
    if (SegSearchDone(num_futile_classifications) &&
        blamer_bundle != NULL &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
                             &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  if (blamer_bundle != NULL) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_debug);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}
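The loop above is a best-first refinement: pop the highest-priority pain
point, classify it, propagate the update through the pending states, and stop
once too many classifications fail to improve the best choice. A skeletal
sketch of just that control flow (types and limits invented for illustration,
not Tesseract API):

#include <queue>

// A pain point reduced to its priority and ratings-matrix column.
struct PainPoint {
  float priority;
  int col;
  bool operator<(const PainPoint& other) const {
    return priority < other.priority;  // priority_queue pops the max first
  }
};

// Skeleton of the SegSearch while loop: classify and update callbacks stand
// in for ProcessSegSearchPainPoint and UpdateSegSearchNodes; num_futile
// plays the role of num_futile_classifications.
template <typename Classify, typename Update>
void RefineLoop(std::priority_queue<PainPoint>* queue, int max_futile,
                Classify classify, Update update) {
  int num_futile = 0;
  while (num_futile < max_futile && !queue->empty()) {
    PainPoint p = queue->top();
    queue->pop();
    classify(p);
    if (!update(p)) ++num_futile;  // update() returns "best choice improved"
  }
}

int main() {
  std::priority_queue<PainPoint> q;
  q.push({1.0f, 3});
  q.push({0.5f, 1});
  RefineLoop(&q, 2, [](const PainPoint&) {},
             [](const PainPoint&) { return false; });
  return 0;
}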