// Formats one line of the per-flag listing: replaces %1 in |format| with
// "TRUE" or "FALSE " (the trailing space is part of the established output).
static QString printWORDFlagLine(const char *format, bool is_set) {
  return QString(format).arg(is_set ? "TRUE" : "FALSE ");
}

// Builds a multi-line, human-readable description of a recognised word.
//
// The report contains, in order: the blank count, the bounding box (as
// rendered by printTBOX), the text of the best choice, the word flags as a
// decimal and an octal number, one TRUE/FALSE line per flag, and the number of
// rejected cblobs.
//
// wordres  word result to describe. If it, or its word, is null an empty
//          string is returned instead of dereferencing a null pointer.
// height   forwarded unchanged to printTBOX.
//
// Returns the assembled report.
QString printWORD(WERD_RES *wordres, int height) {
  QString aux;
  if (!wordres || !wordres->word)
    return aux;  // nothing to describe
  WERD *word = wordres->word;

  // Mirror the word's flags into a BITS16 so they can be printed as a number.
  BITS16 flags;
  flags.set_bit(W_SEGMENTED, word->flag(W_SEGMENTED));
  flags.set_bit(W_ITALIC, word->flag(W_ITALIC));
  flags.set_bit(W_BOL, word->flag(W_BOL));
  flags.set_bit(W_EOL, word->flag(W_EOL));
  flags.set_bit(W_NORMALIZED, word->flag(W_NORMALIZED));
  flags.set_bit(W_POLYGON, word->flag(W_POLYGON));
  flags.set_bit(W_LINEARC, word->flag(W_LINEARC));
  flags.set_bit(W_DONT_CHOP, word->flag(W_DONT_CHOP));
  flags.set_bit(W_REP_CHAR, word->flag(W_REP_CHAR));
  flags.set_bit(W_FUZZY_SP, word->flag(W_FUZZY_SP));
  flags.set_bit(W_FUZZY_NON, word->flag(W_FUZZY_NON));

  aux.append(QString("Blanks= %1\n").arg(word->space()));
  aux.append(printTBOX(word->bounding_box(), height, true));

  // A word result may carry no best choice; print an empty text in that case
  // rather than crashing. The text is decoded explicitly as UTF-8 so the
  // result does not depend on the Qt version's implicit const char*
  // conversion (Latin-1 by default on Qt 4).
  // NOTE(review): assumes unichar_string() holds UTF-8 - confirm.
  QString correct;
  if (wordres->best_choice) {
    correct = QString::fromUtf8(
        wordres->best_choice->unichar_string().string());
  }
  aux.append(QString("Correct= %1\n").arg(correct));

  // Same value twice: decimal, then base 8 behind a literal leading '0'.
  aux.append(QString("Flags = %1 = 0%2\n").arg(flags.val)
             .arg(flags.val, 0, 8));

  aux.append(printWORDFlagLine(" W_SEGMENTED = %1\n",
                               word->flag(W_SEGMENTED)));
  aux.append(printWORDFlagLine(" W_ITALIC = %1\n",
                               word->flag(W_ITALIC)));
  aux.append(printWORDFlagLine(" W_BOL = %1\n",
                               word->flag(W_BOL)));
  aux.append(printWORDFlagLine(" W_EOL = %1\n",
                               word->flag(W_EOL)));
  aux.append(printWORDFlagLine(" W_NORMALIZED = %1\n",
                               word->flag(W_NORMALIZED)));
  aux.append(printWORDFlagLine(" W_POLYGON = %1\n",
                               word->flag(W_POLYGON)));
  aux.append(printWORDFlagLine(" W_LINEARC = %1\n",
                               word->flag(W_LINEARC)));
  aux.append(printWORDFlagLine(" W_DONT_CHOP = %1\n",
                               word->flag(W_DONT_CHOP)));
  aux.append(printWORDFlagLine(" W_REP_CHAR = %1\n",
                               word->flag(W_REP_CHAR)));
  aux.append(printWORDFlagLine(" W_FUZZY_SP = %1\n",
                               word->flag(W_FUZZY_SP)));
  aux.append(printWORDFlagLine(" W_FUZZY_NON = %1\n",
                               word->flag(W_FUZZY_NON)));

  aux.append(QString("Rejected cblob count = %1")
             .arg(word->rej_cblob_list()->length()));
  return aux;
}
// Gathers the blobs of every word in a segmentation into a single list.
//
// Walks blocks -> rows -> words and, for each word, appends its cblob list
// and then its rejected-cblob list to the tail of |output_blob_list|.
//
// blocks            segmentation to traverse.
// output_blob_list  list that receives the blobs; new entries go after
//                   whatever it already holds.
//
// NOTE(review): add_list_after presumably transfers the elements rather than
// copying them, which would leave each word's blob lists empty afterwards -
// confirm against the list iterator implementation.
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks,
                                  C_BLOB_LIST* output_blob_list) {
  C_BLOB_IT sink(output_blob_list);

  BLOCK_IT blocks_cursor(blocks);
  blocks_cursor.mark_cycle_pt();
  while (!blocks_cursor.cycled_list()) {
    ROW_IT rows_cursor(blocks_cursor.data()->row_list());
    rows_cursor.mark_cycle_pt();
    while (!rows_cursor.cycled_list()) {
      // Visit every word of the current row.
      WERD_IT words_cursor(rows_cursor.data()->word_list());
      words_cursor.mark_cycle_pt();
      while (!words_cursor.cycled_list()) {
        WERD* current_word = words_cursor.data();

        // Re-seek the tail before each append so insertion order is kept.
        sink.move_to_last();
        sink.add_list_after(current_word->cblob_list());
        sink.move_to_last();
        sink.add_list_after(current_word->rej_cblob_list());

        words_cursor.forward();
      }
      rows_cursor.forward();
    }
    blocks_cursor.forward();
  }
}
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box inT8 *word_dud; //was it chucked C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word inT32 blob_size; //biggest size inT32 trans_count; //no of transitions inT32 trans_threshold; //noise tolerance inT32 dot_count; //small objects inT32 norm_count; //normal objects inT32 dud_words; //number discarded inT32 ok_words; //number remaining inT32 word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. 
height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else word_dud[word_index] = 0; if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data (); //current word //rejected blobs blob_it.set_to_list (word->rej_cblob_list ()); //move from blobs blob_it.add_list_after (word->cblob_list ()); } word_index++; } free_mem(word_dud); }