char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) { STRING pdf_str(""); double ppi = api->GetSourceYResolution(); // These initial conditions are all arbitrary and will be overwritten double old_x = 0.0, old_y = 0.0; int old_fontsize = 0; tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; bool new_block = true; int fontsize = 0; double a = 1; double b = 0; double c = 0; double d = 1; // TODO(jbreiden) This marries the text and image together. // Slightly cleaner from an abstraction standpoint if this were to // live inside a separate text object. pdf_str += "q "; pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; pdf_str.add_str_double("", prec(height)); pdf_str += " 0 0 cm /Im1 Do Q\n"; ResultIterator *res_it = api->GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink old_fontsize = 0; // Every block will declare its fontsize new_block = true; // Every block will declare its affine matrix } int line_x1, line_y1, line_x2, line_y2; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { int x1, y1, x2, y2; res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2); } if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } // Writing direction changes at a per-word granularity tesseract::WritingDirection writing_direction; { tesseract::Orientation orientation; tesseract::TextlineOrder textline_order; float deskew_angle; res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) { switch (res_it->WordDirection()) { case DIR_LEFT_TO_RIGHT: writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; break; case DIR_RIGHT_TO_LEFT: writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT; break; default: writing_direction = old_writing_direction; } } } // Where is word origin and how long is it? double x, y, word_length; { int word_x1, word_y1, word_x2, word_y2; res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1, line_y1, line_x2, line_y2, &x, &y, &word_length); } if (writing_direction != old_writing_direction || new_block) { AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d); pdf_str.add_str_double(" ", prec(a)); // . This affine matrix pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate pdf_str.add_str_double(" ", prec(c)); // . system for all pdf_str.add_str_double(" ", prec(d)); // . text that follows. pdf_str.add_str_double(" ", prec(x)); // . pdf_str.add_str_double(" ", prec(y)); // . pdf_str += (" Tm "); // Place cursor absolutely new_block = false; } else { double dx = x - old_x; double dy = y - old_y; pdf_str.add_str_double(" ", prec(dx * a + dy * b)); pdf_str.add_str_double(" ", prec(dx * c + dy * d)); pdf_str += (" Td "); // Relative moveto } old_x = x; old_y = y; old_writing_direction = writing_direction; // Adjust font size on a per word granularity. Pay attention to // fontsize, old_fontsize, and pdf_str. We've found that for // in Arabic, Tesseract will happily return a fontsize of zero, // so we make up a default number to protect ourselves. { bool bold, italic, underlined, monospace, serif, smallcaps; int font_id; res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &fontsize, &font_id); const int kDefaultFontsize = 8; if (fontsize <= 0) fontsize = kDefaultFontsize; if (fontsize != old_fontsize) { char textfont[20]; snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize); pdf_str += textfont; old_fontsize = fontsize; } } bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); STRING pdf_word(""); int pdf_word_len = 0; do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != '\0') { // TODO(jbreiden) Do a real UTF-16BE conversion // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure string_32 utf32; CubeUtils::UTF8ToUTF32(grapheme, &utf32); char utf16[20]; for (int i = 0; i < static_cast<int>(utf32.length()); i++) { snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]); pdf_word += utf16; pdf_word_len++; } } delete[]grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) { double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch pdf_str += " [ "; pdf_str += pdf_word; // UTF-16BE representation pdf_str += " ] TJ"; // show the text } if (last_word_in_line) { pdf_str += " \n"; } if (last_word_in_block) { pdf_str += "ET\n"; // end the text object } } char *ret = new char[pdf_str.length() + 1]; strcpy(ret, pdf_str.string()); delete res_it; return ret; }
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, double width, double height, int page_number) { double ppi = api->GetSourceYResolution(); STRING pdf_str(""); double old_x = 0.0, old_y = 0.0; int old_pointsize = 0; // TODO(jbreiden) Slightly cleaner from an abstraction standpoint // if this were to live inside a separate text object. pdf_str += "q "; pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; pdf_str.add_str_double("", prec(height)); pdf_str += " 0 0 cm /Im1 Do Q\n"; ResultIterator *res_it = api->GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink old_pointsize = 0.0; // Every block will declare its font } int line_x1, line_y1, line_x2, line_y2; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { res_it->Baseline(RIL_TEXTLINE, &line_x1, &line_y1, &line_x2, &line_y2); double rise = abs(line_y2 - line_y1) * 72 / ppi; double run = abs(line_x2 - line_x1) * 72 / ppi; // There are some really stupid PDF viewers in the wild, such as // 'Preview' which ships with the Mac. They might do a better // job with text selection and highlighting when given perfectly // straight text instead of very slightly tilted text. I chose // this threshold large enough to absorb noise, but small enough // that lines probably won't cross each other if the whole page // is tilted at almost exactly the clipping threshold. if (rise < 2.0 && 2.0 < run) line_y1 = line_y2 = (line_y1 + line_y2) / 2; } if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } int word_x1, word_y1, word_x2, word_y2; res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); // The critical one is writing_direction tesseract::Orientation orientation; tesseract::WritingDirection writing_direction; tesseract::TextlineOrder textline_order; float deskew_angle; res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); // Unlike Tesseract, we always want the word baseline in reading order. if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { Swap(&word_x1, &word_x2); Swap(&word_y1, &word_y2); } // Viewers like evince can get really confused during copy-paste // when the baseline wanders around. I've decided to force every // word to match the (straight) baseline. The math below is just // projecting the word origin onto the baseline. All numbers are // in the native PDF coordinate system, which has the origin in // the bottom left and the unit is points, which is 1/72 inch. double word_length; double x, y; { int px = word_x1; int py = word_y1; double l2 = dist2(line_x1, line_y1, line_x2, line_y2); if (l2 == 0) { x = line_x1; y = line_y1; } else { double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2; x = line_x2 + t * (line_x2 - line_x1); y = line_y2 + t * (line_y2 - line_y1); } word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2))); word_length = word_length * 72.0 / ppi; x = x * 72 / ppi; y = height - (y * 72.0 / ppi); } int pointsize = 0; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { // Calculate the rotation angle in the PDF cooordinate system, // which has the origin in the bottom left. The Tesseract // coordinate system has the origin in the upper left. // // PDF is kind of a like turtle graphics, and we orient the // turtle (errr... initial cursor position) with an affine // transformation. // // Rotate RTL Translate // // [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ] // [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ] // [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ] // double theta = atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1)); double a, b, c, d; a = cos(theta); b = sin(theta); c = -sin(theta); d = cos(theta); switch(writing_direction) { case WRITING_DIRECTION_RIGHT_TO_LEFT: a = -a; b = -b; c = -c; break; case WRITING_DIRECTION_TOP_TO_BOTTOM: // TODO(jbreiden) Consider switching PDF writing mode to vertical. break; default: break; } pdf_str.add_str_double("", prec(a)); // . This affine matrix pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate pdf_str.add_str_double(" ", prec(c)); // . system for all pdf_str.add_str_double(" ", prec(d)); // . text in the entire pdf_str.add_str_double(" ", prec(x)); // . line. pdf_str.add_str_double(" ", prec(y)); // . pdf_str += (" Tm "); // Place cursor absolutely } else { double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y))); pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts pdf_str.add_str_double(" ", 0); // Delta y in pts pdf_str += (" Td "); // Relative moveto } old_x = x; old_y = y; // Adjust font size on a per word granularity. Pay attention to // pointsize, old_pointsize, and pdf_str. We've found that for // in Arabic, Tesseract will happily return a pointsize of zero, // so we make up a default number to protect ourselves. { bool bold, italic, underlined, monospace, serif, smallcaps; int font_id; res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); const int kDefaultPointSize = 8; if (pointsize <= 0) pointsize = kDefaultPointSize; if (pointsize != old_pointsize) { char textfont[20]; snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize); pdf_str += textfont; old_pointsize = pointsize; } } bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); STRING pdf_word(""); int pdf_word_len = 0; do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != '\0') { // TODO(jbreiden) Do a real UTF-16BE conversion // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure string_32 utf32; CubeUtils::UTF8ToUTF32(grapheme, &utf32); char utf16[20]; for (int i = 0; i < static_cast<int>(utf32.length()); i++) { snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]); pdf_word += utf16; pdf_word_len++; } } delete []grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) { double h_stretch = kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch pdf_str += " [ "; pdf_str += pdf_word; // UTF-16BE representation pdf_str += " ] TJ"; // show the text } if (last_word_in_line) { pdf_str += " \n"; } if (last_word_in_block) { pdf_str += "ET\n"; // end the text object } } char *ret = new char[pdf_str.length() + 1]; strcpy(ret, pdf_str.string()); delete res_it; return ret; }