Example #1
0
// Builds the PDF content stream for one page: a command that paints the
// page image (/Im1) followed by one invisible-ink text object per
// recognized block, so the OCR text sits underneath the image.
//
//   api    - recognition engine; supplies the source resolution and the
//            result iterator that is walked word by word.
//   width  - horizontal scale written into the image placement matrix.
//   height - vertical scale written into the image placement matrix; it
//            is also forwarded to GetWordBaseline().
//            (presumably both are the page size in PDF points - confirm
//            against the caller)
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns
// it and must release it with delete[].
char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api,
        double width, double height) {
    STRING pdf_str("");
    double ppi = api->GetSourceYResolution();

    // These initial conditions are all arbitrary and will be overwritten
    double old_x = 0.0, old_y = 0.0;
    int old_fontsize = 0;
    tesseract::WritingDirection old_writing_direction =
        WRITING_DIRECTION_LEFT_TO_RIGHT;
    bool new_block = true;
    int fontsize = 0;
    double a = 1;
    double b = 0;
    double c = 0;
    double d = 1;

    // Baseline of the text line currently being emitted. It is refreshed
    // only when the iterator sits on the first word of a line, and every
    // following word of that line reuses it. The values therefore have to
    // outlive a single loop iteration, so they are declared (and
    // zero-initialized) out here rather than inside the loop.
    int line_x1 = 0;
    int line_y1 = 0;
    int line_x2 = 0;
    int line_y2 = 0;

    // TODO(jbreiden) This marries the text and image together.
    // Slightly cleaner from an abstraction standpoint if this were to
    // live inside a separate text object.
    pdf_str += "q ";
    pdf_str.add_str_double("", prec(width));
    pdf_str += " 0 0 ";
    pdf_str.add_str_double("", prec(height));
    pdf_str += " 0 0 cm /Im1 Do Q\n";

    // GetIterator() may hand back NULL when there are no recognition
    // results; in that case only the image command is emitted.
    ResultIterator *res_it = api->GetIterator();
    while (res_it != NULL && !res_it->Empty(RIL_BLOCK)) {
        if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
            pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
            old_fontsize = 0;          // Every block will declare its fontsize
            new_block = true;          // Every block will declare its affine matrix
        }

        if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
            int x1, y1, x2, y2;
            res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
            ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
        }

        // Nothing to emit at this position; step to the next word.
        if (res_it->Empty(RIL_WORD)) {
            res_it->Next(RIL_WORD);
            continue;
        }

        // Writing direction changes at a per-word granularity
        tesseract::WritingDirection writing_direction;
        {
            tesseract::Orientation orientation;
            tesseract::TextlineOrder textline_order;
            float deskew_angle;
            res_it->Orientation(&orientation, &writing_direction,
                                &textline_order, &deskew_angle);
            if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
                switch (res_it->WordDirection()) {
                case DIR_LEFT_TO_RIGHT:
                    writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
                    break;
                case DIR_RIGHT_TO_LEFT:
                    writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
                    break;
                default:
                    // Any other word direction: keep going the way the
                    // previous word went.
                    writing_direction = old_writing_direction;
                }
            }
        }

        // Where is word origin and how long is it?
        double x, y, word_length;
        {
            int word_x1, word_y1, word_x2, word_y2;
            res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
            GetWordBaseline(writing_direction, ppi, height,
                            word_x1, word_y1, word_x2, word_y2,
                            line_x1, line_y1, line_x2, line_y2,
                            &x, &y, &word_length);
        }

        if (writing_direction != old_writing_direction || new_block) {
            AffineMatrix(writing_direction,
                         line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
            pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
            pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
            pdf_str.add_str_double(" ", prec(c));  // . system for all
            pdf_str.add_str_double(" ", prec(d));  // . text that follows.
            pdf_str.add_str_double(" ", prec(x));  // .
            pdf_str.add_str_double(" ", prec(y));  // .
            pdf_str += (" Tm ");                   // Place cursor absolutely
            new_block = false;
        } else {
            // Express the page-space displacement from the previous word
            // using the matrix that is currently in effect.
            double dx = x - old_x;
            double dy = y - old_y;
            pdf_str.add_str_double(" ", prec(dx * a + dy * b));
            pdf_str.add_str_double(" ", prec(dx * c + dy * d));
            pdf_str += (" Td ");                   // Relative moveto
        }
        old_x = x;
        old_y = y;
        old_writing_direction = writing_direction;

        // Adjust font size on a per word granularity. Pay attention to
        // fontsize, old_fontsize, and pdf_str. We've found that for
        // Arabic, Tesseract will happily return a fontsize of zero,
        // so we make up a default number to protect ourselves.
        {
            bool bold, italic, underlined, monospace, serif, smallcaps;
            int font_id;
            res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                       &serif, &smallcaps, &fontsize, &font_id);
            const int kDefaultFontsize = 8;
            if (fontsize <= 0)
                fontsize = kDefaultFontsize;
            if (fontsize != old_fontsize) {
                char textfont[20];
                snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
                pdf_str += textfont;
                old_fontsize = fontsize;
            }
        }

        // These must be sampled before the symbol loop below advances the
        // iterator past the current word.
        bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
        bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
        STRING pdf_word("");
        int pdf_word_len = 0;
        do {
            const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
            if (grapheme && grapheme[0] != '\0') {
                // TODO(jbreiden) Do a real UTF-16BE conversion
                // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
                string_32 utf32;
                CubeUtils::UTF8ToUTF32(grapheme, &utf32);
                char utf16[20];
                for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
                    snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
                    pdf_word += utf16;
                    pdf_word_len++;
                }
            }
            delete[]grapheme;
            res_it->Next(RIL_SYMBOL);
        } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
        if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
            // Stretch the glyphs so the word spans word_length.
            double h_stretch =
                kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
            pdf_str.add_str_double("", h_stretch);
            pdf_str += " Tz";          // horizontal stretch
            pdf_str += " [ ";
            pdf_str += pdf_word;       // UTF-16BE representation
            pdf_str += " ] TJ";        // show the text
        }
        if (last_word_in_line) {
            pdf_str += " \n";
        }
        if (last_word_in_block) {
            pdf_str += "ET\n";         // end the text object
        }
    }

    // Hand the result back as a plain heap buffer owned by the caller.
    char *ret = new char[pdf_str.length() + 1];
    strcpy(ret, pdf_str.string());
    delete res_it;
    return ret;
}
Example #2
0
// Builds the PDF content stream for one page: a command that paints the
// page image (/Im1) followed by one invisible-ink text object per
// recognized block, so the OCR text sits underneath the image.
//
//   api         - recognition engine; supplies the source resolution and
//                 the result iterator that is walked word by word.
//   width       - horizontal scale written into the image placement matrix.
//   height      - vertical scale written into the image placement matrix;
//                 also the page height, in points, used to flip y from
//                 Tesseract's top-left origin to PDF's bottom-left origin.
//   page_number - not used by this function.
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns
// it and must release it with delete[].
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                         double width, double height,
                                         int page_number) {
  double ppi = api->GetSourceYResolution();
  STRING pdf_str("");
  double old_x = 0.0, old_y = 0.0;
  int old_pointsize = 0;

  // Baseline of the text line currently being emitted. It is refreshed
  // only when the iterator sits on the first word of a line, and every
  // following word of that line is projected onto it. The values
  // therefore have to outlive a single loop iteration, so they are
  // declared (and zero-initialized) out here rather than inside the loop.
  int line_x1 = 0;
  int line_y1 = 0;
  int line_x2 = 0;
  int line_y2 = 0;

  // TODO(jbreiden) Slightly cleaner from an abstraction standpoint
  // if this were to live inside a separate text object.
  pdf_str += "q ";
  pdf_str.add_str_double("", prec(width));
  pdf_str += " 0 0 ";
  pdf_str.add_str_double("", prec(height));
  pdf_str += " 0 0 cm /Im1 Do Q\n";

  // GetIterator() may hand back NULL when there are no recognition
  // results; in that case only the image command is emitted.
  ResultIterator *res_it = api->GetIterator();

  while (res_it != NULL && !res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      pdf_str += "BT\n3 Tr\n";  // Begin text object, use invisible ink
      old_pointsize = 0;        // Every block will declare its font
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      res_it->Baseline(RIL_TEXTLINE,
                       &line_x1, &line_y1, &line_x2, &line_y2);
      double rise = abs(line_y2 - line_y1) * 72 / ppi;
      double run = abs(line_x2 - line_x1) * 72 / ppi;
      // There are some really stupid PDF viewers in the wild, such as
      // 'Preview' which ships with the Mac. They might do a better
      // job with text selection and highlighting when given perfectly
      // straight text instead of very slightly tilted text. I chose
      // this threshold large enough to absorb noise, but small enough
      // that lines probably won't cross each other if the whole page
      // is tilted at almost exactly the clipping threshold.
      if (rise < 2.0 && 2.0 < run)
        line_y1 = line_y2 = (line_y1 + line_y2) / 2;
    }

    // Nothing to emit at this position; step to the next word.
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    int word_x1, word_y1, word_x2, word_y2;
    res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);

    // The critical one is writing_direction
    tesseract::Orientation orientation;
    tesseract::WritingDirection writing_direction;
    tesseract::TextlineOrder textline_order;
    float deskew_angle;
    res_it->Orientation(&orientation, &writing_direction,
                        &textline_order, &deskew_angle);

    // Unlike Tesseract, we always want the word baseline in reading order.
    if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
      Swap(&word_x1, &word_x2);
      Swap(&word_y1, &word_y2);
    }

    // Viewers like evince can get really confused during copy-paste
    // when the baseline wanders around. I've decided to force every
    // word to match the (straight) baseline.  The math below is just
    // projecting the word origin onto the baseline.  All numbers are
    // in the native PDF coordinate system, which has the origin in
    // the bottom left and the unit is points, which is 1/72 inch.
    double word_length;
    double x, y;
    {
      int px = word_x1;
      int py = word_y1;
      double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
      if (l2 == 0) {
        // Degenerate (zero-length) baseline: nothing to project onto.
        x = line_x1;
        y = line_y1;
      } else {
        double t = ((px - line_x2) * (line_x2 - line_x1) +
                    (py - line_y2) * (line_y2 - line_y1)) / l2;
        x = line_x2 + t * (line_x2 - line_x1);
        y = line_y2 + t * (line_y2 - line_y1);
      }
      word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
                                                   word_x2, word_y2)));
      // Convert from image pixels to points and flip the y axis.
      word_length = word_length * 72.0 / ppi;
      x = x * 72 / ppi;
      y = height - (y * 72.0 / ppi);
    }

    int pointsize = 0;
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      // Calculate the rotation angle in the PDF coordinate system,
      // which has the origin in the bottom left. The Tesseract
      // coordinate system has the origin in the upper left.
      //
      // PDF is kind of like turtle graphics, and we orient the
      // turtle (errr... initial cursor position) with an affine
      // transformation.
      //
      //                                Rotate              RTL    Translate
      //
      // [ x' y' 1 ]  = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ]  [ -1 0 0 ] [ 1 0 0 ]
      //                          [ sin𝜃  cos𝜃 0 ]  [  0 1 0 ] [ 0 1 0 ]
      //                          [   0    0   1 ]  [  0 0 1 ] [ x y 1 ]
      //
      double theta = atan2(static_cast<double>(line_y1 - line_y2),
                           static_cast<double>(line_x2 - line_x1));
      double a, b, c, d;
      a = cos(theta);
      b = sin(theta);
      c = -sin(theta);
      d = cos(theta);
      switch(writing_direction) {
        case WRITING_DIRECTION_RIGHT_TO_LEFT:
          a = -a;
          b = -b;
          c = -c;
          break;
        case WRITING_DIRECTION_TOP_TO_BOTTOM:
          // TODO(jbreiden) Consider switching PDF writing mode to vertical.
          break;
        default:
          break;
      }

      pdf_str.add_str_double("",  prec(a));  // . This affine matrix
      pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
      pdf_str.add_str_double(" ", prec(c));  // . system for all
      pdf_str.add_str_double(" ", prec(d));  // . text in the entire
      pdf_str.add_str_double(" ", prec(x));  // . line.
      pdf_str.add_str_double(" ", prec(y));  // .
      pdf_str += (" Tm ");                   // Place cursor absolutely
    } else {
      // Distance along the baseline from the previous word's origin.
      // x and y are fractional point values, so the distance is computed
      // directly in floating point; dist2() is only used with integer
      // pixel coordinates elsewhere in this function and is avoided here
      // so that the fractional part is not lost.
      double dx = x - old_x;
      double dy = y - old_y;
      double offset = sqrt(dx * dx + dy * dy);
      pdf_str.add_str_double(" ", prec(offset));  // Delta x in pts
      pdf_str.add_str_double(" ", 0);             // Delta y in pts
      pdf_str += (" Td ");                        // Relative moveto
    }
    old_x = x;
    old_y = y;

    // Adjust font size on a per word granularity. Pay attention to
    // pointsize, old_pointsize, and pdf_str. We've found that for
    // Arabic, Tesseract will happily return a pointsize of zero,
    // so we make up a default number to protect ourselves.
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                 &serif, &smallcaps, &pointsize, &font_id);
      const int kDefaultPointSize = 8;
      if (pointsize <= 0)
        pointsize = kDefaultPointSize;
      if (pointsize != old_pointsize) {
        char textfont[20];
        snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
        pdf_str += textfont;
        old_pointsize = pointsize;
      }
    }

    // These must be sampled before the symbol loop below advances the
    // iterator past the current word.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    STRING pdf_word("");
    int pdf_word_len = 0;
    do {
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != '\0') {
        // TODO(jbreiden) Do a real UTF-16BE conversion
        // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
        string_32 utf32;
        CubeUtils::UTF8ToUTF32(grapheme, &utf32);
        char utf16[20];
        for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
          snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
          pdf_word += utf16;
          pdf_word_len++;
        }
      }
      delete []grapheme;
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
      // Stretch the glyphs so the word spans word_length.
      double h_stretch =
          kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
      pdf_str.add_str_double("", h_stretch);
      pdf_str += " Tz";          // horizontal stretch
      pdf_str += " [ ";
      pdf_str += pdf_word;       // UTF-16BE representation
      pdf_str += " ] TJ";        // show the text
    }
    if (last_word_in_line) {
      pdf_str += " \n";
    }
    if (last_word_in_block) {
      pdf_str += "ET\n";         // end the text object
    }
  }

  // Hand the result back as a plain heap buffer owned by the caller.
  char *ret = new char[pdf_str.length() + 1];
  strcpy(ret, pdf_str.string());
  delete res_it;
  return ret;
}