void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str) { utf8_str->ensure(str32.length()); utf8_str->assign("", 0); for (int i = 0; i < str32.length(); ++i) { UNICHAR uni_ch(str32[i]); char *utf8 = uni_ch.utf8_str(); if (utf8 != nullptr) { (*utf8_str) += utf8; delete[] utf8; } } }
bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector<STRING> &truth_text) { if (choice == NULL) return false; int i; STRING truth_str; for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; STRING normed_choice_str; for (i = 0; i < choice->length(); ++i) { normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); } return (truth_str == normed_choice_str); }
// Splits txt into its lines and returns them as separately allocated,
// NUL-terminated C strings.
//
// A '\n' that is not the last character of txt terminates a line, and that
// line keeps its newline. Everything after the last such newline (including
// a trailing '\n' when txt ends with one) becomes the final entry, so the
// returned strings concatenated in order reproduce txt exactly.
//
// When txt contains no such newline the single entry is whatever
// strDeepCpy(txt) returns; every other entry is allocated with new[].
// A NULL txt yields an empty vector.
GenericVector<char*> M_Utils::lineSplit(const char* txt) {
  GenericVector<char*> res;
  if (txt == NULL) {
    return res;
  }

  const int txtlen = static_cast<int>(strlen(txt));
  int linestart = 0;  // index of the first character of the current line

  // A newline in the very last position does not start a new line, hence
  // the loop stops one character short of the end.
  for (int pos = 0; pos + 1 < txtlen; ++pos) {
    if (txt[pos] != '\n') {
      continue;
    }
    const int linelen = pos - linestart + 1;  // includes the newline itself
    char* ln = new char[linelen + 1];         // +1 for the null terminator
    memcpy(ln, txt + linestart, linelen);
    ln[linelen] = '\0';
    res.push_back(ln);
    // Resume right after the newline so it is not emitted a second time.
    linestart = pos + 1;
  }

  if (res.empty()) {
    // No split point at all: hand back a deep copy of the whole text.
    res.push_back(strDeepCpy(txt));
    return res;
  }

  // Remainder after the last split point (never empty, see loop bound).
  const int taillen = txtlen - linestart;
  char* tail = new char[taillen + 1];
  memcpy(tail, txt + linestart, taillen);
  tail[taillen] = '\0';
  res.push_back(tail);
  return res;
}
/********************************************************************** * select_blob_to_split * * These are the results of the last classification. Find a likely * place to apply splits. If none, return -1. **********************************************************************/ int Wordrec::select_blob_to_split( const GenericVector<BLOB_CHOICE*>& blob_choices, float rating_ceiling, bool split_next_to_fragment) { BLOB_CHOICE *blob_choice; int x; float worst = -MAX_FLOAT32; int worst_index = -1; float worst_near_fragment = -MAX_FLOAT32; int worst_index_near_fragment = -1; const CHAR_FRAGMENT **fragments = NULL; if (chop_debug) { if (rating_ceiling < MAX_FLOAT32) tprintf("rating_ceiling = %8.4f\n", rating_ceiling); else tprintf("rating_ceiling = No Limit\n"); } if (split_next_to_fragment && blob_choices.size() > 0) { fragments = new const CHAR_FRAGMENT *[blob_choices.length()]; if (blob_choices[0] != NULL) { fragments[0] = getDict().getUnicharset().get_fragment( blob_choices[0]->unichar_id()); } else { fragments[0] = NULL; } } for (x = 0; x < blob_choices.size(); ++x) { if (blob_choices[x] == NULL) { if (fragments != NULL) { delete[] fragments; } return x; } else { blob_choice = blob_choices[x]; // Populate fragments for the following position. if (split_next_to_fragment && x+1 < blob_choices.size()) { if (blob_choices[x + 1] != NULL) { fragments[x + 1] = getDict().getUnicharset().get_fragment( blob_choices[x + 1]->unichar_id()); } else { fragments[x + 1] = NULL; } } if (blob_choice->rating() < rating_ceiling && blob_choice->certainty() < tessedit_certainty_threshold) { // Update worst and worst_index. if (blob_choice->rating() > worst) { worst_index = x; worst = blob_choice->rating(); } if (split_next_to_fragment) { // Update worst_near_fragment and worst_index_near_fragment. 
bool expand_following_fragment = (x + 1 < blob_choices.size() && fragments[x+1] != NULL && !fragments[x+1]->is_beginning()); bool expand_preceding_fragment = (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending()); if ((expand_following_fragment || expand_preceding_fragment) && blob_choice->rating() > worst_near_fragment) { worst_index_near_fragment = x; worst_near_fragment = blob_choice->rating(); if (chop_debug) { tprintf("worst_index_near_fragment=%d" " expand_following_fragment=%d" " expand_preceding_fragment=%d\n", worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment); } } } } } } if (fragments != NULL) { delete[] fragments; } // TODO(daria): maybe a threshold of badness for // worst_near_fragment would be useful. return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index; }
bool TessPDFRenderer::EndDocumentHandler() { size_t n; char buf[kBasicBufSize]; // We reserved the /Pages object number early, so that the /Page // objects could refer to their parent. We finally have enough // information to go fill it in. Using lower level calls to manipulate // the offset record in two spots, because we are placing objects // out of order in the file. // PAGES const long int kPagesObjectNumber = 2; offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1 n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Type /Pages\n" " /Kids [ ", kPagesObjectNumber); if (n >= sizeof(buf)) return false; AppendString(buf); size_t pages_objsize = strlen(buf); for (size_t i = 0; i < pages_.size(); i++) { n = snprintf(buf, sizeof(buf), "%ld 0 R ", pages_[i]); if (n >= sizeof(buf)) return false; AppendString(buf); pages_objsize += strlen(buf); } n = snprintf(buf, sizeof(buf), "]\n" " /Count %d\n" ">>\n" "endobj\n", pages_.size()); if (n >= sizeof(buf)) return false; AppendString(buf); pages_objsize += strlen(buf); offsets_.back() += pages_objsize; // manipulation #2 // INFO STRING utf16_title = "FEFF"; // byte_order_marker GenericVector<int> unicodes; UNICHAR::UTF8ToUnicode(title(), &unicodes); char utf16[kMaxBytesPerCodepoint]; for (int i = 0; i < unicodes.length(); i++) { int code = unicodes[i]; if (CodepointToUtf16be(code, utf16)) { utf16_title += utf16; } } char* datestr = l_getFormattedDate(); n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Producer (Tesseract %s)\n" " /CreationDate (D:%s)\n" " /Title <%s>\n" ">>\n" "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str()); lept_free(datestr); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); n = snprintf(buf, sizeof(buf), "xref\n" "0 %ld\n" "0000000000 65535 f \n", obj_); if (n >= sizeof(buf)) return false; AppendString(buf); for (int i = 1; i < obj_; i++) { n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]); if (n >= sizeof(buf)) return false; 
AppendString(buf); } n = snprintf(buf, sizeof(buf), "trailer\n" "<<\n" " /Size %ld\n" " /Root %ld 0 R\n" " /Info %ld 0 R\n" ">>\n" "startxref\n" "%ld\n" "%%%%EOF\n", obj_, 1L, // catalog obj_ - 1, // info offsets_.back()); if (n >= sizeof(buf)) return false; AppendString(buf); return true; }
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, double width, double height) { STRING pdf_str(""); double ppi = api->GetSourceYResolution(); // These initial conditions are all arbitrary and will be overwritten double old_x = 0.0, old_y = 0.0; int old_fontsize = 0; tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; bool new_block = true; int fontsize = 0; double a = 1; double b = 0; double c = 0; double d = 1; // TODO(jbreiden) This marries the text and image together. // Slightly cleaner from an abstraction standpoint if this were to // live inside a separate text object. pdf_str += "q "; pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; pdf_str.add_str_double("", prec(height)); pdf_str += " 0 0 cm"; if (!textonly_) { pdf_str += " /Im1 Do"; } pdf_str += " Q\n"; int line_x1 = 0; int line_y1 = 0; int line_x2 = 0; int line_y2 = 0; ResultIterator *res_it = api->GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink old_fontsize = 0; // Every block will declare its fontsize new_block = true; // Every block will declare its affine matrix } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { int x1, y1, x2, y2; res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2); } if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } // Writing direction changes at a per-word granularity tesseract::WritingDirection writing_direction; { tesseract::Orientation orientation; tesseract::TextlineOrder textline_order; float deskew_angle; res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) { switch (res_it->WordDirection()) { case DIR_LEFT_TO_RIGHT: writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; break; case DIR_RIGHT_TO_LEFT: writing_direction = 
WRITING_DIRECTION_RIGHT_TO_LEFT; break; default: writing_direction = old_writing_direction; } } } // Where is word origin and how long is it? double x, y, word_length; { int word_x1, word_y1, word_x2, word_y2; res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1, line_y1, line_x2, line_y2, &x, &y, &word_length); } if (writing_direction != old_writing_direction || new_block) { AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d); pdf_str.add_str_double(" ", prec(a)); // . This affine matrix pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate pdf_str.add_str_double(" ", prec(c)); // . system for all pdf_str.add_str_double(" ", prec(d)); // . text that follows. pdf_str.add_str_double(" ", prec(x)); // . pdf_str.add_str_double(" ", prec(y)); // . pdf_str += (" Tm "); // Place cursor absolutely new_block = false; } else { double dx = x - old_x; double dy = y - old_y; pdf_str.add_str_double(" ", prec(dx * a + dy * b)); pdf_str.add_str_double(" ", prec(dx * c + dy * d)); pdf_str += (" Td "); // Relative moveto } old_x = x; old_y = y; old_writing_direction = writing_direction; // Adjust font size on a per word granularity. Pay attention to // fontsize, old_fontsize, and pdf_str. We've found that for // in Arabic, Tesseract will happily return a fontsize of zero, // so we make up a default number to protect ourselves. 
{ bool bold, italic, underlined, monospace, serif, smallcaps; int font_id; res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &fontsize, &font_id); const int kDefaultFontsize = 8; if (fontsize <= 0) fontsize = kDefaultFontsize; if (fontsize != old_fontsize) { char textfont[20]; snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize); pdf_str += textfont; old_fontsize = fontsize; } } bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); STRING pdf_word(""); int pdf_word_len = 0; do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != '\0') { GenericVector<int> unicodes; UNICHAR::UTF8ToUnicode(grapheme, &unicodes); char utf16[kMaxBytesPerCodepoint]; for (int i = 0; i < unicodes.length(); i++) { int code = unicodes[i]; if (CodepointToUtf16be(code, utf16)) { pdf_word += utf16; pdf_word_len++; } } } delete []grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) { double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch pdf_str += " [ <"; pdf_str += pdf_word; // UTF-16BE representation pdf_str += "> ] TJ"; // show the text } if (last_word_in_line) { pdf_str += " \n"; } if (last_word_in_block) { pdf_str += "ET\n"; // end the text object } } char *ret = new char[pdf_str.length() + 1]; strcpy(ret, pdf_str.string()); delete res_it; return ret; }
// Builds a Configuration from a parsed server.xml and post-processes it:
// hashes variables, instantiates listen socket configs and virtual servers,
// builds the name-to-pointer hash tables, wires listen sockets to their
// default virtual servers, attaches MIME files and ACL lists, and creates
// the ACL cache when enabled. If an EreportableException escapes any of
// these steps, cleanup() is called and the exception is rethrown.
Configuration::Configuration(ServerXML *serverXML, PRBool deleteServerXML)
: ServerXMLSchema::ServerWrapper(serverXML->server),
  ConfigurationObject(this),
  serverXML(serverXML),
  deleteServerXML(deleteServerXML),
  refcount(1),
  id(PR_AtomicIncrement(&ids)),
  pool(pool_create()),
  lscHash(0),
  vsHash(0),
  listener(0),
  aclcache(0)
{
    // Post process the server.xml configuration
    try {
        int total;

        // Hash variable names
        total = getVariableCount();
        for (int v = 0; v < total; v++)
            serverVars.addPair(getVariable(v)->name, getVariable(v)->value);

        // One ListenSocketConfig per ServerXMLSchema::HttpListener
        ServerXMLSchema::Pkcs11& pkcs11 = getPKCS11();
        total = getHttpListenerCount();
        for (int l = 0; l < total; l++) {
            lscVector.append(new ListenSocketConfig(*getHttpListener(l),
                                                    pkcs11, this));
        }

        // One VirtualServer per ServerXMLSchema::VirtualServer
        total = getVirtualServerCount();
        for (int s = 0; s < total; s++) {
            vsVector.append(new VirtualServer(*getVirtualServer(s), this,
                                              this));
        }

        // Warn about enabled ListenSocketConfigs that share an IP:port
        total = getLscCount();
        for (int first = 0; first < total; first++) {
            ListenSocketConfig *lsc1 = getLsc(first);
            if (!lsc1->enabled)
                continue;
            for (int second = first + 1; second < total; second++) {
                ListenSocketConfig *lsc2 = getLsc(second);
                if (lsc2->enabled && *lsc2 == *lsc1) {
                    ereport(LOG_WARN,
                            XP_GetAdminStr(DBT_Configuration_SameAddress),
                            lsc1->name.getStringValue(),
                            lsc2->name.getStringValue());
                }
            }
        }

        // Table that hashes ListenSocketConfig IDs to pointers
        total = getLscCount();
        lscHash = new SimplePtrStringHash(findPrime(total));
        for (int l = 0; l < total; l++) {
            ListenSocketConfig *lsc = getLsc(l);
            if (!lsc->enabled) {
                ereport(LOG_INFORM,
                        XP_GetAdminStr(DBT_Configuration_DisabledLs),
                        lsc->name.getStringValue());
            }
            // A name that is already in the table is a configuration error
            if (lscHash->lookup((void*)lsc->name.getStringValue())) {
                throw ConfigurationServerXMLException(
                    lsc->name,
                    XP_GetAdminStr(DBT_Configuration_MultiplyDefined));
            }
            lscHash->insert((void*)lsc->name.getStringValue(), (void*)lsc);
        }

        // If we can't bind to INADDR_ANY and a specific IP simultaneously,
        // give INADDR_ANY ListenSocketConfigs a pointer to
        // non-INADDR_ANY ListenSocketConfigs on the same port
        if (!ListenSockets::canBindAnyAndSpecific()) {
            total = getLscCount();
            for (int any = 0; any < total; any++) {
                ListenSocketConfig *lsc = getLsc(any);
                if (!(lsc->enabled && !lsc->hasExplicitIP()))
                    continue;
                for (int other = 0; other < total; other++) {
                    if (other != any) {
                        lsc->addIPSpecificConfig(getLsc(other));
                    }
                }
            }
        }

        // Table that hashes VirtualServer IDs to pointers
        vsHash = new SimplePtrStringHash(findPrime(getVSCount()));
        total = getVSCount();
        for (int s = 0; s < total; s++) {
            VirtualServer *vs = getVS(s);
            if (!vs->enabled) {
                ereport(LOG_INFORM,
                        XP_GetAdminStr(DBT_Configuration_DisabledVs),
                        vs->name.getStringValue());
            }
            if (vsHash->lookup((void*)vs->name.getStringValue())) {
                throw ConfigurationServerXMLException(
                    vs->name,
                    XP_GetAdminStr(DBT_Configuration_MultiplyDefined));
            }
            vsHash->insert((void*)vs->name.getStringValue(), (void*)vs);
        }

        // Give ListenSocketConfigs a pointer to their default VirtualServers
        total = getLscCount();
        for (int l = 0; l < total; l++) {
            ListenSocketConfig *lsc = getLsc(l);
            if (!lsc->enabled)
                continue;

            // The default VS must exist and be enabled
            VirtualServer *vs = getVS(lsc->defaultVirtualServerName);
            if (!vs) {
                throw ConfigurationServerXMLException(
                    lsc->defaultVirtualServerName,
                    XP_GetAdminStr(DBT_Configuration_UndefinedVs));
            }
            if (!vs->enabled) {
                throw ConfigurationServerXMLException(
                    lsc->defaultVirtualServerName,
                    XP_GetAdminStr(DBT_Configuration_DisabledDefaultVs));
            }

            vs->bind(); // mark the VS as bound to a ListenSocketConfig

            // Give ListenSocketConfig a pointer to its default VS
            lsc->setDefaultVS(vs);

            // Check SSL properties
            SSLSocketConfiguration *sslc = lsc->getSSLParams();
            if (sslc) {
                sslc->CheckCertHosts(NULL, lsc);
            }
        }

        // Add each VirtualServer to the listen sockets it's attached to
        total = getVSCount();
        for (int s = 0; s < total; s++) {
            VirtualServer *vs = getVS(s);
            if (vs->enabled && vs->isUnbound()) {
                ereport(LOG_WARN, XP_GetAdminStr(DBT_UnboundVS),
                        vs->name.getStringValue());
            } else {
                addVs(vs);
            }
        }

        // Give VirtualServers a pointer to their MIME files
        SimplePtrStringHash mimeFileHash(getMimeFileCount() + 1);
        total = getVSCount();
        for (int s = 0; s < total; s++) {
            VirtualServer *vs = getVS(s);
            if (!vs->enabled)
                continue;

            // VS-specific MIME files first ...
            for (int m = 0; m < vs->getMimeFileCount(); m++) {
                vs->getMime().addMimeFile(
                    parseMIMEFile(*vs->getMimeFile(m), mimeFileHash));
            }

            // ... then the server-wide ones
            for (int m = 0; m < getMimeFileCount(); m++) {
                vs->getMime().addMimeFile(
                    parseMIMEFile(*getMimeFile(m), mimeFileHash));
            }
        }

        // Set the name of the default ACL database. This must be done after
        // we construct the configuration's AuthDbs (the AuthDb constructor
        // calls ACL_VirtualDbRegister) and before we parse its ACL files.
        ACL_DatabaseSetDefault(NULL, defaultAuthDbName);

        // Construct ACLLists for the virtual servers
        SimplePtrStringHash globalAclFileHash(251);
        total = getVSCount();
        for (int s = 0; s < total; s++) {
            VirtualServer *vs = getVS(s);
            if (!vs->enabled)
                continue;

            // Collect every ACLList associated with this virtual server,
            // parsing each distinct file name only once per VS
            GenericVector vsAclListVector;
            SimplePtrStringHash vsAclFileHash(3);

            // VS-specific ACL files
            for (int f = 0; f < vs->getAclFileCount(); f++) {
                const char *filename = *vs->getAclFile(f);
                if (vsAclFileHash.lookup((void*)filename))
                    continue;
                ACLListHandle_t *acllist =
                    parseACLFile(*vs->getAclFile(f), globalAclFileHash);
                vsAclListVector.append(acllist);
                vsAclFileHash.insert((void*)filename, (void*)acllist);
            }

            // Server-wide ACL files
            for (int f = 0; f < getAclFileCount(); f++) {
                const char *filename = *getAclFile(f);
                if (vsAclFileHash.lookup((void*)filename))
                    continue;
                ACLListHandle_t *acllist =
                    parseACLFile(*getAclFile(f), globalAclFileHash);
                vsAclListVector.append(acllist);
                vsAclFileHash.insert((void*)filename, (void*)acllist);
            }

            if (vsAclListVector.length() == 0) // no ACLList for this VS
                continue;

            ACLListHandle_t *aclroot = NULL;
            if (vsAclListVector.length() == 1) {
                // Exactly one ACLList: the VS can use the ACL file's
                // ACLList as is
                aclroot = (ACLListHandle_t *)vsAclListVector[0];
                ACL_ListIncrement(0, aclroot);
            } else {
                // Several ACLLists: concatenate them into a new list
                aclroot = ACL_ListNew(0);
                for (int a = 0; a < vsAclListVector.length(); a++) {
                    ACLListHandle_t *acllist =
                        (ACLListHandle_t *)vsAclListVector[a];
                    if (ACL_ListConcat(0, aclroot, acllist, 0) < 0) {
                        // XXX this error message could be more intelligible...
                        ACL_ListDecrement(0, aclroot);
                        throw EreportableException(
                            LOG_FAILURE,
                            XP_GetAdminStr(
                                DBT_Configuration_CannotConstructAclLists));
                    }
                }
            }

            // Copy the pointers over to the vs
            vs->setACLList(aclroot);
        }

        // Create an ACL cache for this configuration, but only when
        // "acl-cache" is enabled
        if (aclCache.enabled) {
            aclcache = new ACLCache();
        } else {
            aclcache = NULL;
        }
    }
    catch (const EreportableException&) {
        cleanup();
        throw;
    }
}