// Remove outlines that are a tiny fraction in either width or height // of the word height. void Textord::clean_small_noise_from_words(ROW *row) { WERD_IT word_it(row->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { WERD* word = word_it.data(); int min_size = static_cast<int>( textord_noise_hfract * word->bounding_box().height() + 0.5); C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT out_it(blob->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { C_OUTLINE* outline = out_it.data(); outline->RemoveSmallRecursive(min_size, &out_it); } if (blob->out_list()->empty()) { delete blob_it.extract(); } } if (word->cblob_list()->empty()) { if (!word_it.at_last()) { // The next word is no longer a fuzzy non space if it was before, // since the word before is about to be deleted. WERD* next_word = word_it.data_relative(1); if (next_word->flag(W_FUZZY_NON)) { next_word->set_flag(W_FUZZY_NON, false); } } delete word_it.extract(); } } }
// Hashes the BLOB in parameter 1 with CC_SHA1 and returns the digest as text.
// Parameter 2 selects the encoding: 1 uses toB64Text, any other value uses
// toHexText.
void SHA1(sLONG_PTR *pResult, PackagePtr pParams)
{
    C_BLOB Param1;
    C_LONGINT Param2;
    C_TEXT returnValue;

    Param1.fromParamAtIndex(pParams, 1);
    Param2.fromParamAtIndex(pParams, 2);

    // The digest size is fixed (20 bytes), so it lives on the stack. The
    // previous heap buffer came from an unchecked calloc(); on allocation
    // failure CC_SHA1 would have written through a NULL pointer.
    const unsigned int digestLength = 20;
    uint8_t digest[20] = {0};

    CC_SHA1((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);

    C_BLOB temp;
    temp.setBytes((const uint8_t *)digest, digestLength);

    switch (Param2.getIntValue())
    {
        case 1:
            temp.toB64Text(&returnValue);
            break;
        default:
            temp.toHexText(&returnValue);
            break;
    }

    returnValue.setReturn(pResult);
}
// Builds a pseudo word from deep copies of the blobs that overlap
// selection_box. Words are scanned in page order; the first word whose
// bounding box overlaps the selection and that contributes at least one
// overlapping blob ends the search. The copies are wrapped in a new WERD,
// inserted via InsertSimpleCloneWord, and a heap-allocated PAGE_RES_IT
// positioned on the inserted word is returned (the caller receives the raw
// pointer). Returns NULL when no blob overlaps the selection.
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT page_it(page_res);
  C_BLOB_LIST gathered;              // Copies of the selected blobs.
  C_BLOB_IT gathered_it(&gathered);  // Appends to the list above.

  WERD_RES* word_res = page_it.word();
  while (word_res != NULL) {
    WERD* word = word_res->word;
    if (word->bounding_box().overlap(selection_box)) {
      C_BLOB_IT src_it(word->cblob_list());
      for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
        C_BLOB* candidate = src_it.data();
        if (candidate->bounding_box().overlap(selection_box)) {
          gathered_it.add_after_then_move(C_BLOB::deep_copy(candidate));
        }
      }
      if (!gathered.empty()) {
        WERD* pseudo_word = new WERD(&gathered, 1, NULL);
        word_res = page_it.InsertSimpleCloneWord(*word_res, pseudo_word);
        // Walk a fresh iterator forward until it sits on the inserted word.
        PAGE_RES_IT* result = new PAGE_RES_IT(page_res);
        while (result->word() != word_res && result->word() != NULL) {
          result->forward();
        }
        ASSERT_HOST(result->word() == word_res);
        return result;
      }
    }
    word_res = page_it.forward();
  }
  return NULL;
}
// Computes HMAC-MD5 and returns the digest as text.
//   parameter 1: BLOB passed to HMAC() as the key
//   parameter 2: BLOB passed to HMAC() as the message
//   parameter 3: encoding selector - 1 uses toB64Text, anything else toHexText
// If HMAC() reports failure the returned text is left empty instead of
// encoding a buffer of zeros as though it were a valid digest.
void HMACMD5(sLONG_PTR *pResult, PackagePtr pParams)
{
    C_BLOB Param1;
    C_BLOB Param2;
    C_LONGINT Param3;
    C_TEXT returnValue;

    Param1.fromParamAtIndex(pParams, 1);
    Param2.fromParamAtIndex(pParams, 2);
    Param3.fromParamAtIndex(pParams, 3);

    // An MD5 digest is 16 bytes; a stack buffer avoids the unchecked calloc()
    // that HMAC() would otherwise have written through on allocation failure.
    uint8_t digest[16] = {0};
    uint32_t mdlen = 16;

    if (HMAC(EVP_md5(),
             (const void *)Param1.getBytesPtr(), (int)Param1.getBytesLength(),
             (const unsigned char *)Param2.getBytesPtr(), (int)Param2.getBytesLength(),
             digest, &mdlen))
    {
        C_BLOB temp;
        // mdlen now holds the number of bytes HMAC() actually produced.
        temp.setBytes((const uint8_t *)digest, mdlen);

        switch (Param3.getIntValue())
        {
            case 1:
                temp.toB64Text(&returnValue);
                break;
            default:
                temp.toHexText(&returnValue);
                break;
        }
    }

    returnValue.setReturn(pResult);
}
// Extracts all the noise outlines and stuffs the pointers into the given // vector of outlines. Afterwards, the outlines vector owns the pointers. void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) { C_BLOB_IT rej_it(&rej_cblobs); for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { C_BLOB* blob = rej_it.extract(); C_OUTLINE_IT ol_it(blob->out_list()); outlines->push_back(ol_it.extract()); delete blob; } }
// TODO(mezhirov) delete this function and replace with word->bounding_box() static TBOX c_blob_list_get_bbox(C_BLOB_LIST *cblobs) { TBOX result; C_BLOB_IT c_it(cblobs); for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { C_BLOB *blob = c_it.data(); //bboxes.push(tessy_rectangle(blob->bounding_box())); result.bounding_union(blob->bounding_box()); } return result; }
// Signs the SHA-256 digest of a BLOB with an RSA private key and returns the
// signature as text.
//   parameter 1: BLOB whose content is hashed with CC_SHA256
//   parameter 2: BLOB holding the private key, read with
//                PEM_read_bio_RSAPrivateKey (no passphrase callback)
//   parameter 3: encoding selector - 1 uses toB64Text, anything else toHexText
// The returned text stays empty when the key cannot be read, memory cannot be
// allocated, or RSA_sign fails.
void RSASHA256(sLONG_PTR *pResult, PackagePtr pParams)
{
    C_BLOB Param1;
    C_BLOB Param2;
    C_LONGINT Param3;
    C_TEXT returnValue;

    Param1.fromParamAtIndex(pParams, 1);
    Param2.fromParamAtIndex(pParams, 2);
    Param3.fromParamAtIndex(pParams, 3);

    // SHA-256 digests are 32 bytes; a stack buffer replaces an unchecked calloc().
    const unsigned int digestLength = 32;
    uint8_t digest[32] = {0};

    CC_SHA256((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);

    unsigned int signatureLength = 0;

    BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength());

    if (bio)
    {
        RSA *key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL);

        if (key)
        {
            uint8_t *sgn = (uint8_t *)calloc(RSA_size(key), sizeof(uint8_t));

            if (sgn)
            {
                if (RSA_sign(NID_sha256, digest, digestLength, sgn, &signatureLength, key))
                {
                    C_BLOB temp;
                    temp.setBytes((const uint8_t *)sgn, signatureLength);

                    switch (Param3.getIntValue())
                    {
                        case 1:
                            temp.toB64Text(&returnValue);
                            break;
                        default:
                            temp.toHexText(&returnValue);
                            break;
                    }
                }

                free(sgn);
            }

            // The key object returned by the PEM reader belongs to us; it was
            // previously leaked on every call.
            RSA_free(key);
        }

        BIO_free(bio);
    }

    returnValue.setReturn(pResult);
}
// Adds the selected outlines to the indcated real blobs, and puts the rest // back in rej_cblobs where they came from. Where the target_blobs entry is // nullptr, a run of wanted outlines is put into a single new blob. // Ownership of the outlines is transferred back to the word. (Hence // GenericVector and not PointerVector.) // Returns true if any new blob was added to the start of the word, which // suggests that it might need joining to the word before it, and likewise // sets make_next_word_fuzzy true if any new blob was added to the end. bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted, const GenericVector<C_BLOB*>& target_blobs, const GenericVector<C_OUTLINE*>& outlines, bool* make_next_word_fuzzy) { bool outline_added_to_start = false; if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = false; C_BLOB_IT rej_it(&rej_cblobs); for (int i = 0; i < outlines.size(); ++i) { C_OUTLINE* outline = outlines[i]; if (outline == nullptr) continue; // Already used it. if (wanted[i]) { C_BLOB* target_blob = target_blobs[i]; TBOX noise_box = outline->bounding_box(); if (target_blob == nullptr) { target_blob = new C_BLOB(outline); // Need to find the insertion point. C_BLOB_IT blob_it(&cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (blob_box.left() > noise_box.left()) { if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { // We might want to join this word to its predecessor. outline_added_to_start = true; } blob_it.add_before_stay_put(target_blob); break; } } if (blob_it.cycled_list()) { blob_it.add_to_end(target_blob); if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = true; } // Add all consecutive wanted, but null-blob outlines to same blob. 
C_OUTLINE_IT ol_it(target_blob->out_list()); while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { ++i; ol_it.add_to_end(outlines[i]); } } else { // Insert outline into this blob. C_OUTLINE_IT ol_it(target_blob->out_list()); ol_it.add_to_end(outline); } } else { // Put back on noise list. rej_it.add_to_end(new C_BLOB(outline)); } } return outline_added_to_start; }
void CC_HMACHASH(uint32_t hashlen, const EVP_MD * (*EVP)(void), C_BLOB &Param1, C_BLOB &Param2, C_LONGINT &Param3, C_TEXT &returnValue) { uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t)); HMAC(EVP(), (const void *)Param1.getBytesPtr(), (int)Param1.getBytesLength(), (const unsigned char *)Param2.getBytesPtr(), (int)Param2.getBytesLength(), buf, &hashlen); C_BLOB temp; temp.setBytes((const uint8_t *)buf, hashlen); switch (Param3.getIntValue()) { case 1: temp.toB64Text(&returnValue); break; case 2: temp.toB64Text(&returnValue, true); break; default: temp.toHexText(&returnValue); break; } free(buf); }
// Runs the digest function CC over the bytes of Param1 and stores the encoded
// digest in returnValue.
//   hashlen : number of bytes CC writes; also the size of the buffer allocated
//   CC      : digest function taking (data, length, output buffer)
//   Param2  : encoding selector - 1 -> toB64Text(&text),
//             2 -> toB64Text(&text, true), otherwise toHexText
// returnValue is left untouched if the digest buffer cannot be allocated.
void CC_HASH(unsigned int hashlen,
             void (*CC)(const void *data, uint32_t len, unsigned char *md),
             C_BLOB &Param1,
             C_LONGINT &Param2,
             C_TEXT &returnValue)
{
    uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t));

    // Without this guard CC would write the digest through a NULL pointer
    // whenever the allocation fails.
    if (!buf)
    {
        return;
    }

    CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf);

    C_BLOB temp;
    temp.setBytes((const uint8_t *)buf, hashlen);

    switch (Param2.getIntValue())
    {
        case 1:
            temp.toB64Text(&returnValue);
            break;
        case 2:
            temp.toB64Text(&returnValue, true);
            break;
        default:
            temp.toHexText(&returnValue);
            break;
    }

    free(buf);
}
// Removes noise from the word by moving small outlines to the rej_cblobs // list, based on the size_threshold. void WERD::CleanNoise(float size_threshold) { C_BLOB_IT blob_it(&cblobs); C_BLOB_IT rej_it(&rej_cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT ol_it(blob->out_list()); for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { C_OUTLINE* outline = ol_it.data(); TBOX ol_box = outline->bounding_box(); int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); if (ol_size < size_threshold) { // This outline is too small. Move it to a separate blob in the // reject blobs list. C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); rej_it.add_after_then_move(rej_blob); } } if (blob->out_list()->empty()) delete blob_it.extract(); } }
// Moves the write cursor of a message buffer past the text that a bounded
// snprintf() call just produced, clamping when the output was truncated.
static void advance_point_message(int written, char** cursor, int* remaining) {
  if (written < 0) return;  // Formatting error: nothing usable was appended.
  if (written >= *remaining) written = *remaining - 1;  // Text was cut short.
  *cursor += written;
  *remaining -= written;
}

// Posts a message to image_win describing what lies under the point (x, y):
// the point itself, the baseline value at x for each newly entered row whose
// bounding box contains the point, and the bounding boxes of every word and
// blob that contain the point.
// The message is assembled in a fixed 512-byte buffer. Every append is
// bounded, so a page with many overlapping words or blobs truncates the
// message instead of writing past the end of the buffer as the former
// unbounded sprintf() chain could.
void show_point(PAGE_RES* page_res, float x, float y) {
  FCOORD pt(x, y);
  PAGE_RES_IT pr_it(page_res);

  const int kBufsize = 512;
  char msg[kBufsize];
  msg[0] = '\0';
  char* msg_ptr = msg;
  int remaining = kBufsize;  // Bytes left in msg, terminator included.

  advance_point_message(
      snprintf(msg_ptr, remaining, "Pt:(%0.3f, %0.3f) ", x, y),
      &msg_ptr, &remaining);

  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
    // Report the baseline once per row, when the iterator first enters it.
    if (pr_it.row() != pr_it.prev_row() &&
        pr_it.row()->row->bounding_box().contains(pt)) {
      advance_point_message(
          snprintf(msg_ptr, remaining, "BL(x)=%0.3f ",
                   pr_it.row()->row->base_line(x)),
          &msg_ptr, &remaining);
    }
    if (word->word->bounding_box().contains(pt)) {
      TBOX box = word->word->bounding_box();
      advance_point_message(
          snprintf(msg_ptr, remaining, "Wd(%d, %d)/(%d, %d) ",
                   box.left(), box.bottom(), box.right(), box.top()),
          &msg_ptr, &remaining);
      C_BLOB_IT cblob_it(word->word->cblob_list());
      for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list();
           cblob_it.forward()) {
        C_BLOB* cblob = cblob_it.data();
        box = cblob->bounding_box();
        if (box.contains(pt)) {
          advance_point_message(
              snprintf(msg_ptr, remaining, "CBlb(%d, %d)/(%d, %d) ",
                       box.left(), box.bottom(), box.right(), box.top()),
              &msg_ptr, &remaining);
        }
      }
    }
  }
  image_win->AddMessage(msg);
}
void CC_RSASHA(unsigned int hashlen, int nid, void (*CC)(const void *data, uint32_t len, unsigned char *md), C_BLOB &Param1, C_BLOB &Param2, C_LONGINT &Param3, C_TEXT &returnValue) { uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t)); unsigned int signatureLength = 0; CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf); BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength()); if(bio) { RSA *key = NULL; key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL); if(key) { uint8_t *sgn = (uint8_t *)calloc(RSA_size(key), sizeof(uint8_t)); if(RSA_sign(nid, buf, hashlen, sgn, &signatureLength, key)) { C_BLOB temp; temp.setBytes((const uint8_t *)sgn, signatureLength); switch (Param3.getIntValue()) { case 1: temp.toB64Text(&returnValue); break; case 2: temp.toB64Text(&returnValue, true); break; default: temp.toHexText(&returnValue); break; } } free(sgn); } BIO_free(bio); } free(buf); }
// Hashes the BLOB in parameter 1 with CC_RIPEMD160 and returns the digest as
// text. Parameter 2 selects the encoding: 1 -> toB64Text(&text),
// 2 -> toB64Text(&text, true), otherwise toHexText.
void RIPEMD160(PA_PluginParameters params)
{
    sLONG_PTR *pResult = (sLONG_PTR *)params->fResult;
    PackagePtr pParams = (PackagePtr)params->fParameters;

    C_BLOB Param1;
    C_LONGINT Param2;
    C_TEXT returnValue;

    Param1.fromParamAtIndex(pParams, 1);
    Param2.fromParamAtIndex(pParams, 2);

    // The digest buffer is a fixed 20 bytes, so it is kept on the stack. The
    // earlier heap buffer came from an unchecked calloc() and would have been
    // dereferenced as NULL on allocation failure.
    const unsigned int digestLength = 20;
    uint8_t digest[20] = {0};

    CC_RIPEMD160((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), digest);

    C_BLOB temp;
    temp.setBytes((const uint8_t *)digest, digestLength);

    switch (Param2.getIntValue())
    {
        case 1:
            temp.toB64Text(&returnValue);
            break;
        case 2:
            temp.toB64Text(&returnValue, true);
            break;
        default:
            temp.toHexText(&returnValue);
            break;
    }

    returnValue.setReturn(pResult);
}
void CC_RSASHAVERIFY(unsigned int hashlen, int nid, void (*CC)(const void *data, uint32_t len, unsigned char *md), C_BLOB &Param1, C_BLOB &Param2, C_TEXT &Param3, C_LONGINT &Param4, C_LONGINT &returnValue) { uint8_t *buf = (uint8_t *)calloc(hashlen, sizeof(uint8_t)); CC((unsigned char *)Param1.getBytesPtr(), Param1.getBytesLength(), buf); BIO *bio = BIO_new_mem_buf((void *)Param2.getBytesPtr(), Param2.getBytesLength()); if(bio) { RSA *key = NULL; key = PEM_read_bio_RSA_PUBKEY(bio, NULL, NULL, NULL); if(key) { C_BLOB temp; switch (Param4.getIntValue()) { case 1: temp.fromB64Text(&Param3); break; default: temp.fromHexText(&Param3); break; } if(RSA_verify(nid, buf, hashlen, (unsigned char *)temp.getBytesPtr(), temp.getBytesLength(), key)) { returnValue.setIntValue(1); } } BIO_free(bio); } free(buf); }
bool Textord::clean_noise_from_row( //remove empties ROW* row //row to clean ) { bool testing_on; TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count = 0; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t super_norm_count; //real char-like //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator testing_on = textord_test_y > row->base_line (textord_test_x) && textord_show_blobs && textord_test_y < row->base_line (textord_test_x) + row->x_height (); dot_count = 0; norm_count = 0; super_norm_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) super_norm_count++; //count smal outlines } } else super_norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? 
blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; if (testing_on) { tprintf ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left (), blob_box.bottom (), blob_box.right (), blob_box.top (), blob->out_list ()->length (), trans_count, blob_box.bottom () - row->base_line (blob_box.left ())); } } } if (textord_noise_debug) { tprintf ("Row ending at (%d,%g):", blob_box.right (), row->base_line (blob_box.right ())); tprintf (" R=%g, dc=%d, nc=%d, %s\n", norm_count > 0 ? (float) dot_count / norm_count : 9999, dot_count, norm_count, dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED" : "ACCEPTED"); } return super_norm_count < textord_noise_sncount && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; }
void HTML_Convert_to_pdf(sLONG_PTR *pResult, PackagePtr pParams) { C_TEXT ParamHtml; ARRAY_LONGINT ParamKeys; ARRAY_TEXT ParamValues; C_BLOB returnValue; ParamHtml.fromParamAtIndex(pParams, 1); ParamKeys.fromParamAtIndex(pParams, 2); ParamValues.fromParamAtIndex(pParams, 3); CUTF8String paramValue, path; wkhtmltopdf_global_settings *gs; wkhtmltopdf_object_settings *os; wkhtmltopdf_converter *c; gs = _wkhtmltopdf_create_global_settings(); os = _wkhtmltopdf_create_object_settings(); /*hard code these options http://www.cs.au.dk/~jakobt/libwkhtmltox_0.10.0_doc/pagesettings.html#pagePdfGlobal */ _wkhtmltopdf_set_object_setting(os, "load.blockLocalFileAccess", "false"); _wkhtmltopdf_set_object_setting(os, "load.stopSlowScript", "true"); _wkhtmltopdf_set_object_setting(os, "load.debugJavascript", "false"); _wkhtmltopdf_set_object_setting(os, "load.loadErrorHandling", "ignore"); _wkhtmltopdf_set_object_setting(os, "includeInOutline", "true"); _wkhtmltopdf_set_global_setting(gs, "outputFormat", "pdf"); for(unsigned int i = 0; i < ParamKeys.getSize(); ++i){ ParamValues.copyUTF8StringAtIndex(¶mValue, i); switch (ParamKeys.getIntValueAtIndex(i)){ case HTML_USE_BACKGROUND: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.background", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "web.background", "false"); } break; case HTML_USE_IMAGES: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.loadImages", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "web.loadImages", "false"); } break; case HTML_USE_JAVASCRIPT: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.enableJavascript", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "web.enableJavascript", "false"); } break; case HTML_USE_SMART_RESIZE: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.enableIntelligentShrinking", "true"); }else{ 
_wkhtmltopdf_set_object_setting(os, "web.enableIntelligentShrinking", "false"); } break; case HTML_USE_PRINT_MEDIA: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.printMediaType", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "web.printMediaType", "false"); } break; case HTML_USE_PLUGINS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "web.enablePlugins", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "web.enablePlugins", "false"); } break; case HTML_PDF_USE_COMPRESSION: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_global_setting(gs, "useCompression", "true"); }else{ _wkhtmltopdf_set_global_setting(gs, "useCompression", "false"); } break; case HTML_PDF_USE_OUTLINE: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_global_setting(gs, "outline", "true"); }else{ _wkhtmltopdf_set_global_setting(gs, "outline", "false"); } break; case HTML_PDF_HEADER_USE_LINE: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "header.line", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "header.line", "false"); } break; case HTML_PDF_FOOTER_USE_LINE: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "footer.line", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "footer.line", "false"); } break; case HTML_PDF_TOC_USE_DOTTED_LINES: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "toc.useDottedLines", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "toc.useDottedLines", "false"); } break; case HTML_PDF_TOC_USE_FORWARD_LINKS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "toc.forwardLinks", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "toc.forwardLinks", "false"); } break; case HTML_PDF_TOC_USE_BACK_LINKS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, 
"toc.backLinks", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "toc.backLinks", "false"); } break; case HTML_PDF_USE_EXTERNAL_LINKS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "useExternalLinks", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "useExternalLinks", "false"); } break; case HTML_PDF_USE_LOCAL_LINKS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "useLocalLinks", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "useLocalLinks", "false"); } break; case HTML_PDF_USE_FORMS: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "produceForms", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "produceForms", "false"); } break; case HTML_PDF_USE_PAGES_COUNT: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltopdf_set_object_setting(os, "pagesCount", "true"); }else{ _wkhtmltopdf_set_object_setting(os, "pagesCount", "false"); } break; case HTML_MINIMUM_FONT_SIZE: _wkhtmltopdf_set_object_setting(os, "web.minimumFontSize", (const char *)paramValue.c_str()); break; case HTML_DEFAULT_ENCODING: _wkhtmltopdf_set_object_setting(os, "web.defaultEncoding", (const char *)paramValue.c_str()); break; case HTML_ZOOM_FACTOR: _wkhtmltopdf_set_object_setting(os, "load.zoomFactor",(const char *)paramValue.c_str()); break; case HTML_PDF_PAPER_SIZE: _wkhtmltopdf_set_global_setting(gs, "size.paperSize",(const char *)paramValue.c_str()); break; case HTML_PDF_DOCUMENT_WIDTH: _wkhtmltopdf_set_global_setting(gs, "size.width",(const char *)paramValue.c_str()); break; case HTML_PDF_DOCUMENT_HEIGHT: _wkhtmltopdf_set_global_setting(gs, "size.height",(const char *)paramValue.c_str()); break; case HTML_PDF_DPI: _wkhtmltopdf_set_global_setting(gs, "dpi",(const char *)paramValue.c_str()); break; case HTML_PDF_JPEG_QUALITY: _wkhtmltopdf_set_global_setting(gs, "imageQuality",(const char *)paramValue.c_str()); break; case HTML_PDF_IMAGE_DPI: 
_wkhtmltopdf_set_global_setting(gs, "imageDPI",(const char *)paramValue.c_str()); break; case HTML_PDF_ORIENTATION: if( (!paramValue.compare((const uint8_t *)"Landscape")) || (!paramValue.compare((const uint8_t *)"Portrait"))){ _wkhtmltopdf_set_global_setting(gs, "orientation", (const char *)paramValue.c_str()); } break; case HTML_PDF_COLOR_MODE: if( (!paramValue.compare((const uint8_t *)"Color")) || (!paramValue.compare((const uint8_t *)"Grayscale"))){ _wkhtmltopdf_set_global_setting(gs, "colorMode", (const char *)paramValue.c_str()); } break; case HTML_PDF_OUTPUT_FORMAT: if( (!paramValue.compare((const uint8_t *)"pdf")) || (!paramValue.compare((const uint8_t *)"ps"))){ _wkhtmltopdf_set_global_setting(gs, "outputFormat", (const char *)paramValue.c_str()); } break; case HTML_PDF_MARGIN_RIGHT: _wkhtmltopdf_set_global_setting(gs, "margin.right",(const char *)paramValue.c_str()); break; case HTML_PDF_MARGIN_LEFT: _wkhtmltopdf_set_global_setting(gs, "margin.left",(const char *)paramValue.c_str()); break; case HTML_PDF_MARGIN_BOTTOM: _wkhtmltopdf_set_global_setting(gs, "margin.bottom",(const char *)paramValue.c_str()); break; case HTML_PDF_MARGIN_TOP: _wkhtmltopdf_set_global_setting(gs, "margin.top",(const char *)paramValue.c_str()); break; case HTML_PDF_DOCUMENT_TITLE: _wkhtmltopdf_set_global_setting(gs, "documentTitle",(const char *)paramValue.c_str()); break; case HTML_PDF_PAGE_NUMBER_OFFSET: _wkhtmltopdf_set_global_setting(gs, "pageOffset",(const char *)paramValue.c_str()); break; case HTML_PDF_NUMBER_OF_COPIES: _wkhtmltopdf_set_global_setting(gs, "copies",(const char *)paramValue.c_str()); break; case HTML_PDF_OUTLINE_DEPTH: _wkhtmltopdf_set_global_setting(gs, "outlineDepth",(const char *)paramValue.c_str()); break; case HTML_PDF_HEADER_FONT_SIZE: _wkhtmltopdf_set_object_setting(os, "header.fontSize",(const char *)paramValue.c_str()); break; case HTML_PDF_HEADER_FONT_NAME: _wkhtmltopdf_set_object_setting(os, "header.fontName",(const char *)paramValue.c_str()); 
break; case HTML_PDF_HEADER_LEFT: _wkhtmltopdf_set_object_setting(os, "header.left",(const char *)paramValue.c_str()); break; case HTML_PDF_HEADER_CENTER: _wkhtmltopdf_set_object_setting(os, "header.center",(const char *)paramValue.c_str()); break; case HTML_PDF_HEADER_RIGHT: _wkhtmltopdf_set_object_setting(os, "header.right",(const char *)paramValue.c_str()); break; case HTML_PDF_HEADER_SPACE: _wkhtmltopdf_set_object_setting(os, "header.space",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_FONT_SIZE: _wkhtmltopdf_set_object_setting(os, "footer.fontSize",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_FONT_NAME: _wkhtmltopdf_set_object_setting(os, "footer.fontName",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_LEFT: _wkhtmltopdf_set_object_setting(os, "footer.left",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_CENTER: _wkhtmltopdf_set_object_setting(os, "footer.center",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_RIGHT: _wkhtmltopdf_set_object_setting(os, "footer.right",(const char *)paramValue.c_str()); break; case HTML_PDF_FOOTER_SPACE: _wkhtmltopdf_set_object_setting(os, "footer.space",(const char *)paramValue.c_str()); break; case HTML_PDF_TOC_CAPTION_TEXT: _wkhtmltopdf_set_object_setting(os, "toc.captionText",(const char *)paramValue.c_str()); break; case HTML_PDF_TOC_INDENTATION: _wkhtmltopdf_set_object_setting(os, "toc.indentation",(const char *)paramValue.c_str()); break; case HTML_PDF_TOC_FONT_SCALE: _wkhtmltopdf_set_object_setting(os, "toc.fontScale",(const char *)paramValue.c_str()); break; case HTML_WEB_USERNAME: _wkhtmltopdf_set_object_setting(os, "load.username",(const char *)paramValue.c_str()); break; case HTML_WEB_PASSWORD: _wkhtmltopdf_set_object_setting(os, "load.password",(const char *)paramValue.c_str()); break; case HTML_WEB_PROXY: _wkhtmltopdf_set_object_setting(os, "load.proxy",(const char *)paramValue.c_str()); break; case HTML_PDF_OUTLINE_PATH: 
ParamValues.copyPathAtIndex(&path, i); _wkhtmltopdf_set_object_setting(os, "dumpOutline",(const char *)path.c_str()); break; case HTML_PDF_OUTLINE_XSL_PATH: // ParamValues.copyUTF16StringAtIndex(&path, i); // _wkhtmltopdf_set_object_setting(os, "tocXsl",(const char *)path.c_str()); break; case HTML_CUSTOM_CSS_PATH: ParamValues.copyPathAtIndex(&path, i); _wkhtmltopdf_set_object_setting(os, "web.userStyleSheet",(const char *)path.c_str()); break; case HTML_COOKIE_JAR_PATH: ParamValues.copyPathAtIndex(&path, i); _wkhtmltopdf_set_object_setting(os, "load.cookieJar",(const char *)path.c_str()); break; } } c = _wkhtmltopdf_create_converter(gs); if(ParamHtml.getUTF16Length()){ CUTF8String htmlPath, html, htmlUrl; ParamHtml.copyPath(&htmlPath); ParamHtml.copyUTF8String(&htmlUrl); if( (htmlUrl.find((const uint8_t *)"http://") == 0) || (htmlUrl.find((const uint8_t *)"https://") == 0) || (htmlUrl.find((const uint8_t *)"ftp://") == 0) || (htmlUrl.find((const uint8_t *)"ftps://") == 0)){ _wkhtmltopdf_set_object_setting(os, "page",(const char *)htmlUrl.c_str()); _wkhtmltopdf_add_object(c, os, NULL); }else{ if(checkPath(&htmlPath)){ _wkhtmltopdf_set_object_setting(os, "page",(const char *)htmlPath.c_str()); _wkhtmltopdf_add_object(c, os, NULL); }else{ ParamHtml.copyUTF8String(&html); _wkhtmltopdf_add_object(c, os, (const char *)html.c_str()); } } if(_wkhtmltopdf_convert(c)){ const unsigned char *bytes; long len = _wkhtmltopdf_get_output(c, &bytes); returnValue.setBytes((const uint8_t *)bytes, len); } } _wkhtmltopdf_destroy_converter(c); returnValue.setReturn(pResult); }
// Rebuilds the baseline spline of a row so that it passes through the bottoms
// of suitable blobs. A blob qualifies when its bottom lies within
// blshift_maxshift x-heights of the current baseline and its height exceeds
// blshift_xfraction x-heights; such a blob gets a constant segment
// (a = b = 0, c = blob bottom) spanning its width. Everywhere else the
// coefficients of the existing spline segments are copied across. The row's
// baseline is then replaced by a QSPLINE built from the new segment list.
// Rows without blobs are left unchanged.
void tweak_row_baseline(ROW *row,
                        double blshift_maxshift,
                        double blshift_xfraction) {
  WERD_IT word_it = row->word_list();
  C_BLOB_IT blob_it;

  // Count the blobs: each one can add at most one segment to the spline.
  inT32 blob_count = 0;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    blob_count += word_it.data()->cblob_list()->length();
  }
  if (blob_count == 0) {
    return;
  }

  // Room for one new segment per blob on top of the existing segments.
  inT32 *xstarts = (inT32 *) alloc_mem(
      (blob_count + row->baseline.segments + 1) * sizeof(inT32));
  double *coeffs = (double *) alloc_mem(
      (blob_count + row->baseline.segments) * 3 * sizeof(double));

  inT32 src_index = 0;   // Segment of the old spline being read.
  inT32 dest_index = 0;  // Segment of the new spline being written.
  xstarts[0] = row->baseline.xcoords[0];

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD *word = word_it.data();
    blob_it.set_to_list(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      TBOX blob_box = blob_it.data()->bounding_box();
      const float x_centre = (blob_box.left() + blob_box.right()) / 2.0;

      // Distance of the blob bottom from the baseline, in x-heights.
      float ydiff = blob_box.bottom() - row->base_line(x_centre);
      ydiff = (ydiff < 0 ? -ydiff : ydiff) / row->x_height();

      if (ydiff < blshift_maxshift &&
          blob_box.height() / row->x_height() > blshift_xfraction) {
        // Shift the baseline onto the bottom of this blob.
        if (xstarts[dest_index] >= x_centre) {
          xstarts[dest_index] = blob_box.left();
        }
        coeffs[dest_index * 3] = 0;
        coeffs[dest_index * 3 + 1] = 0;
        coeffs[dest_index * 3 + 2] = blob_box.bottom();
        dest_index++;
        xstarts[dest_index] = blob_box.right() + 1;
      } else if (xstarts[dest_index] <= x_centre) {
        // Keep the old baseline here: copy every old segment that ends at or
        // before the blob centre and still starts a new stretch...
        while (row->baseline.xcoords[src_index + 1] <= x_centre &&
               src_index < row->baseline.segments - 1) {
          if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) {
            coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
            coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
            coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
            dest_index++;
            xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
          }
          src_index++;
        }
        // ...followed by the old segment the loop stopped on.
        coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
        coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
        coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
        dest_index++;
        xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
      }
    }
  }

  // Skip old segments that end at or before the current start position...
  while (src_index < row->baseline.segments &&
         row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) {
    src_index++;
  }
  // ...and append whatever remains of the old spline.
  while (src_index < row->baseline.segments) {
    coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
    coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
    coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
    dest_index++;
    src_index++;
    xstarts[dest_index] = row->baseline.xcoords[src_index];
  }

  // Turn the collected segments into the row's new baseline.
  row->baseline = QSPLINE(dest_index, xstarts, coeffs);
  free_mem(xstarts);
  free_mem(coeffs);
}
void HTML_Convert_to_image(sLONG_PTR *pResult, PackagePtr pParams) { C_TEXT ParamHtml; ARRAY_LONGINT ParamKeys; ARRAY_TEXT ParamValues; C_BLOB returnValue; ParamHtml.fromParamAtIndex(pParams, 1); ParamKeys.fromParamAtIndex(pParams, 2); ParamValues.fromParamAtIndex(pParams, 3); CUTF8String paramValue, path; wkhtmltoimage_global_settings *gs; wkhtmltoimage_converter *c; gs = _wkhtmltoimage_create_global_settings(); for(unsigned int i = 0; i < ParamKeys.getSize(); ++i){ ParamValues.copyUTF8StringAtIndex(¶mValue, i); switch (ParamKeys.getIntValueAtIndex(i)){ case HTML_IMAGE_USE_SMART_WIDTH: if(!paramValue.compare((const uint8_t *)"true")){ _wkhtmltoimage_set_global_setting(gs, "smartWidth", "true"); }else{ _wkhtmltoimage_set_global_setting(gs, "smartWidth", "false"); } break; case HTML_IMAGE_USE_BACKGROUND: if(!paramValue.compare((const uint8_t *)"false")){ _wkhtmltoimage_set_global_setting(gs, "transparent", "true"); }else{ _wkhtmltoimage_set_global_setting(gs, "transparent", "false"); } break; case HTML_IMAGE_CROP_LEFT: _wkhtmltoimage_set_global_setting(gs, "crop.left", (const char *)paramValue.c_str()); break; case HTML_IMAGE_CROP_TOP: _wkhtmltoimage_set_global_setting(gs, "crop.top", (const char *)paramValue.c_str()); break; case HTML_IMAGE_CROP_WIDTH: _wkhtmltoimage_set_global_setting(gs, "crop.width", (const char *)paramValue.c_str()); break; case HTML_IMAGE_CROP_HEIGHT: _wkhtmltoimage_set_global_setting(gs, "crop.height", (const char *)paramValue.c_str()); break; case HTML_IMAGE_JPEG_QUALITY: _wkhtmltoimage_set_global_setting(gs, "quality", (const char *)paramValue.c_str()); break; case HTML_IMAGE_SCREEN_WIDTH: _wkhtmltoimage_set_global_setting(gs, "screenWidth", (const char *)paramValue.c_str()); break; case HTML_IMAGE_FORMAT: if( (!paramValue.compare((const uint8_t *)"jpg")) || (!paramValue.compare((const uint8_t *)"png")) || (!paramValue.compare((const uint8_t *)"bmp")) || (!paramValue.compare((const uint8_t *)"svg"))){ _wkhtmltoimage_set_global_setting(gs, 
"fmt", (const char *)paramValue.c_str()); } break; } } if(ParamHtml.getUTF16Length()){ CUTF8String htmlPath, html, htmlUrl; ParamHtml.copyPath(&htmlPath); ParamHtml.copyUTF8String(&htmlUrl); if( (htmlUrl.find((const uint8_t *)"http://") == 0) || (htmlUrl.find((const uint8_t *)"https://") == 0) || (htmlUrl.find((const uint8_t *)"ftp://") == 0) || (htmlUrl.find((const uint8_t *)"ftps://") == 0)){ _wkhtmltoimage_set_global_setting(gs, "in",(const char *)htmlUrl.c_str()); c = _wkhtmltoimage_create_converter(gs, NULL); }else{ if(checkPath(&htmlPath)){ _wkhtmltoimage_set_global_setting(gs, "in",(const char *)htmlPath.c_str()); c = _wkhtmltoimage_create_converter(gs, NULL); }else{ ParamHtml.copyUTF8String(&html); c = _wkhtmltoimage_create_converter(gs, (const char *)html.c_str()); } } if(_wkhtmltoimage_convert(c)){ const unsigned char *bytes; long len = _wkhtmltoimage_get_output(c, &bytes); returnValue.setBytes((const uint8_t *)bytes, len); } } _wkhtmltoimage_destroy_converter(c); returnValue.setReturn(pResult); }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. 
if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }
void CC_AES(const EVP_CIPHER *cipher, C_BLOB &Param1, C_BLOB &Param2, C_LONGINT &Param3, C_LONGINT &Param5, C_LONGINT &Param6, C_BLOB &Param7, C_BLOB &Param8, C_TEXT &returnValue) { EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new(); unsigned char key[EVP_MAX_KEY_LENGTH], iv[EVP_MAX_IV_LENGTH]; const unsigned char *source = (const unsigned char *)Param1.getBytesPtr(); int source_len = Param1.getBytesLength(); int crypted_len, tail_len; bool key_and_iv_is_valid = false; if( !Param2.getBytesLength() && Param7.getBytesLength() && Param8.getBytesLength() && Param7.getBytesLength() <= EVP_MAX_KEY_LENGTH && Param8.getBytesLength() <= EVP_MAX_IV_LENGTH) { memset(key, 0, EVP_MAX_KEY_LENGTH); memset( iv, 0, EVP_MAX_IV_LENGTH ); memcpy(key, Param7.getBytesPtr(), Param7.getBytesLength()); memcpy( iv, Param8.getBytesPtr(), Param8.getBytesLength()); key_and_iv_is_valid = true; }else { // passphrase -> key, iv key_and_iv_is_valid = (EVP_BytesToKey(cipher, EVP_md5(), NULL, Param2.getBytesPtr(), Param2.getBytesLength(), 2048, key, iv) > 0); } if (key_and_iv_is_valid) { if(EVP_CipherInit(ctx, cipher, key, iv, 0 == Param3.getIntValue())) { if(Param6.getIntValue()) { EVP_CIPHER_CTX_set_padding(ctx, 0); } size_t buf_size = source_len + EVP_MAX_BLOCK_LENGTH; unsigned char *buf = (unsigned char *)calloc(buf_size, sizeof(unsigned char)); if(EVP_CipherUpdate(ctx, buf, &crypted_len, source, source_len)) { if(EVP_CipherFinal(ctx, (buf + crypted_len), &tail_len)) { crypted_len += tail_len; C_BLOB temp; temp.setBytes((const uint8_t *)buf, crypted_len); switch (Param5.getIntValue()) { case 1: temp.toB64Text(&returnValue); break; case 2: temp.toB64Text(&returnValue, true); break; default: temp.toHexText(&returnValue); break; } } } free(buf); } EVP_CIPHER_CTX_free(ctx); } }
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box inT8 *word_dud; //was it chucked C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word inT32 blob_size; //biggest size inT32 trans_count; //no of transitions inT32 trans_threshold; //noise tolerance inT32 dot_count; //small objects inT32 norm_count; //normal objects inT32 dud_words; //number discarded inT32 ok_words; //number remaining inT32 word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. 
height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else word_dud[word_index] = 0; if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data (); //current word //rejected blobs blob_it.set_to_list (word->rej_cblob_list ()); //move from blobs blob_it.add_list_after (word->cblob_list ()); } word_index++; } free_mem(word_dud); }
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t dud_words; //number discarded int32_t ok_words; //number remaining int32_t word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; // was it chucked std::vector<int8_t> word_dud(ok_words); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? 
blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else { word_dud[word_index] = 0; } if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data(); // Current word. // Previously we threw away the entire word. // Now just aggressively throw all small blobs into the reject list, where // the classifier can decide whether they are actually needed. word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } }