示例#1
0
// Formats one Unicode code point as a PDF hex string holding its UTF-16BE
// code units: "<XXXX>" for the Basic Multilingual Plane, "<HHHHLLLL>"
// (high surrogate followed by low surrogate) for code points above U+FFFF.
// Returns false, leaving `out` untouched, for values that have no UTF-16
// encoding (the surrogate range U+D800..U+DFFF and anything above U+10FFFF).
static bool PdfCodepointToUtf16beHex(unsigned int code, char *out,
                                     size_t out_size) {
    if ((code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF)
        return false;
    if (code < 0x10000) {
        snprintf(out, out_size, "<%04X>", code);
    } else {
        const unsigned int offset = code - 0x10000;
        const unsigned int high_surrogate = 0xD800 + (offset >> 10);
        const unsigned int low_surrogate = 0xDC00 + (offset & 0x3FF);
        snprintf(out, out_size, "<%04X%04X>", high_surrogate, low_surrogate);
    }
    return true;
}

// Builds the PDF content-stream text for one page: an image placement
// command scaled to (width, height) followed by one BT ... ET text object
// per recognized block, with every word drawn in invisible ink ("3 Tr").
//
// api    - source of the recognition results and of the Y resolution.
// width  - value written as the horizontal scale of the image matrix.
// height - value written as the vertical scale of the image matrix; also
//          forwarded to GetWordBaseline().
//
// Returns a NUL-terminated buffer allocated with new[]; presumably the
// caller is expected to release it with delete[].
char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api,
        double width, double height) {
    STRING pdf_str("");
    double ppi = api->GetSourceYResolution();

    // These initial conditions are all arbitrary and will be overwritten
    double old_x = 0.0, old_y = 0.0;
    int old_fontsize = 0;
    tesseract::WritingDirection old_writing_direction =
        WRITING_DIRECTION_LEFT_TO_RIGHT;
    bool new_block = true;
    int fontsize = 0;
    double a = 1;
    double b = 0;
    double c = 0;
    double d = 1;

    // Baseline of the current text line. It is refreshed only when the
    // iterator sits at the start of a line, but it is consumed for every
    // word, so it must live outside the loop (and be initialized) to carry
    // over from the first word of a line to the remaining ones.
    int line_x1 = 0;
    int line_y1 = 0;
    int line_x2 = 0;
    int line_y2 = 0;

    // TODO(jbreiden) This marries the text and image together.
    // Slightly cleaner from an abstraction standpoint if this were to
    // live inside a separate text object.
    pdf_str += "q ";
    pdf_str.add_str_double("", prec(width));
    pdf_str += " 0 0 ";
    pdf_str.add_str_double("", prec(height));
    pdf_str += " 0 0 cm /Im1 Do Q\n";

    ResultIterator *res_it = api->GetIterator();
    // NOTE(review): GetIterator() is assumed to be able to return a null
    // pointer when no results are available - confirm. In that case only
    // the image placement command is emitted.
    while (res_it && !res_it->Empty(RIL_BLOCK)) {
        if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
            pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
            old_fontsize = 0;          // Every block will declare its fontsize
            new_block = true;          // Every block will declare its affine matrix
        }

        if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
            int x1, y1, x2, y2;
            res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
            ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
        }

        if (res_it->Empty(RIL_WORD)) {
            res_it->Next(RIL_WORD);
            continue;
        }

        // Writing direction changes at a per-word granularity
        tesseract::WritingDirection writing_direction;
        {
            tesseract::Orientation orientation;
            tesseract::TextlineOrder textline_order;
            float deskew_angle;
            res_it->Orientation(&orientation, &writing_direction,
                                &textline_order, &deskew_angle);
            if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
                switch (res_it->WordDirection()) {
                case DIR_LEFT_TO_RIGHT:
                    writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
                    break;
                case DIR_RIGHT_TO_LEFT:
                    writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
                    break;
                default:
                    // Any other word direction: keep the previous word's.
                    writing_direction = old_writing_direction;
                }
            }
        }

        // Where is word origin and how long is it?
        double x, y, word_length;
        {
            int word_x1, word_y1, word_x2, word_y2;
            res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
            GetWordBaseline(writing_direction, ppi, height,
                            word_x1, word_y1, word_x2, word_y2,
                            line_x1, line_y1, line_x2, line_y2,
                            &x, &y, &word_length);
        }

        if (writing_direction != old_writing_direction || new_block) {
            AffineMatrix(writing_direction,
                         line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
            pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
            pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
            pdf_str.add_str_double(" ", prec(c));  // . system for all
            pdf_str.add_str_double(" ", prec(d));  // . text that follows.
            pdf_str.add_str_double(" ", prec(x));  // .
            pdf_str.add_str_double(" ", prec(y));  // .
            pdf_str += (" Tm ");                   // Place cursor absolutely
            new_block = false;
        } else {
            // Same matrix as the previous word: express the move relative
            // to the previous word origin, in the matrix's coordinates.
            double dx = x - old_x;
            double dy = y - old_y;
            pdf_str.add_str_double(" ", prec(dx * a + dy * b));
            pdf_str.add_str_double(" ", prec(dx * c + dy * d));
            pdf_str += (" Td ");                   // Relative moveto
        }
        old_x = x;
        old_y = y;
        old_writing_direction = writing_direction;

        // Adjust font size on a per word granularity. Pay attention to
        // fontsize, old_fontsize, and pdf_str. We've found that for
        // Arabic, Tesseract will happily return a fontsize of zero,
        // so we make up a default number to protect ourselves.
        {
            bool bold, italic, underlined, monospace, serif, smallcaps;
            int font_id;
            res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                       &serif, &smallcaps, &fontsize, &font_id);
            const int kDefaultFontsize = 8;
            if (fontsize <= 0)
                fontsize = kDefaultFontsize;
            if (fontsize != old_fontsize) {
                char textfont[20];
                snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
                pdf_str += textfont;
                old_fontsize = fontsize;
            }
        }

        // Capture these before the symbol loop below advances the iterator
        // past the current word.
        bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
        bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
        STRING pdf_word("");
        int pdf_word_len = 0;   // number of code points written to pdf_word
        do {
            const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
            if (grapheme && grapheme[0] != '\0') {
                string_32 utf32;
                CubeUtils::UTF8ToUTF32(grapheme, &utf32);
                char utf16[20];
                for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
                    const unsigned int code = static_cast<unsigned int>(utf32[i]);
                    // Values with no UTF-16 encoding are dropped rather
                    // than written as malformed hex strings.
                    if (PdfCodepointToUtf16beHex(code, utf16, sizeof(utf16))) {
                        pdf_word += utf16;
                        pdf_word_len++;
                    }
                }
            }
            delete[]grapheme;
            res_it->Next(RIL_SYMBOL);
        } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
        if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
            double h_stretch =
                kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
            pdf_str.add_str_double("", h_stretch);
            pdf_str += " Tz";          // horizontal stretch
            pdf_str += " [ ";
            pdf_str += pdf_word;       // UTF-16BE representation
            pdf_str += " ] TJ";        // show the text
        }
        if (last_word_in_line) {
            pdf_str += " \n";
        }
        if (last_word_in_block) {
            pdf_str += "ET\n";         // end the text object
        }
    }
    char *ret = new char[pdf_str.length() + 1];
    strcpy(ret, pdf_str.string());
    delete res_it;
    return ret;
}
示例#2
0
文件: geom.cpp 项目: rotace/sample
 // Returns a new AffineMatrix constructed from this matrix's six
 // components (a, b, c, d, tx, ty); the receiver is left unmodified.
 AffineMatrix AffineMatrix::clone() const {
   AffineMatrix duplicate(a, b, c, d, tx, ty);
   return duplicate;
 }