static void fz_text_begin_page(fz_context *ctx, fz_device *dev, const fz_rect *mediabox, const fz_matrix *ctm) { fz_text_device *tdev = (fz_text_device*)dev; if (tdev->page->len) { tdev->page->next = fz_new_text_page(ctx); tdev->page = tdev->page->next; } tdev->page->mediabox = *mediabox; fz_transform_rect(&tdev->page->mediabox, ctm); }
std::shared_ptr<std::vector<std::shared_ptr<RectFloat>>> MuPDFDoc::SearchText(const char* searchText) { fz_text_sheet *sheet = nullptr; fz_text_page *text = nullptr; fz_device *dev = nullptr; PageCache *pageCache = &m_pages[m_currentPage]; fz_var(sheet); fz_var(text); fz_var(dev); std::shared_ptr<std::vector<std::shared_ptr<RectFloat>>> hints(new std::vector<std::shared_ptr<RectFloat>>()); fz_try(m_context) { int hitCount = 0; fz_matrix ctm = CalcConvertMatrix(); fz_rect mbrect = fz_transform_rect(ctm, pageCache->mediaBox); sheet = fz_new_text_sheet(m_context); text = fz_new_text_page(m_context, mbrect); dev = fz_new_text_device(m_context, sheet, text); fz_run_page(m_document, pageCache->page, dev, ctm, nullptr); fz_free_device(dev); dev = nullptr; int len = TextLen(text); for (int pos = 0; pos < len; pos++) { fz_bbox rr = fz_empty_bbox; int n = Match(text, searchText, pos); for (int i = 0; i < n; i++) rr = fz_union_bbox(rr, BBoxCharAt(text, pos + i)); if (!fz_is_empty_bbox(rr) && hitCount < MAX_SEARCH_HITS) { hints->push_back(std::shared_ptr<RectFloat>(new RectFloat(rr.x0, rr.y0, rr.x1, rr.y1))); if (++hitCount >= MAX_SEARCH_HITS) break; } } } fz_always(m_context) { fz_free_text_page(m_context, text); fz_free_text_sheet(m_context, sheet); fz_free_device(dev); } fz_catch(m_context) { return std::shared_ptr<std::vector<std::shared_ptr<RectFloat>>>(nullptr); } return hints; }
fz_text_page * fz_new_text_page_from_page(fz_context *ctx, fz_page *page, fz_text_sheet *sheet) { fz_text_page *text; fz_device *dev; text = fz_new_text_page(ctx); fz_try(ctx) { dev = fz_new_text_device(ctx, sheet, text); fz_run_page(ctx, page, dev, &fz_identity, NULL); } fz_always(ctx) { fz_drop_device(ctx, dev); } fz_catch(ctx) { fz_drop_text_page(ctx, text); fz_rethrow(ctx); } return text; }
static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) { fz_page *page; fz_display_list *list = NULL; fz_device *dev = NULL; int start; fz_cookie cookie = { 0 }; int needshot = 0; fz_var(list); fz_var(dev); if (showtime) { start = gettime(); } fz_try(ctx) { page = fz_load_page(doc, pagenum - 1); } fz_catch(ctx) { fz_rethrow_message(ctx, "cannot load page %d in file '%s'", pagenum, filename); } if (mujstest_file) { pdf_document *inter = pdf_specifics(doc); pdf_widget *widget = NULL; if (inter) widget = pdf_first_widget(inter, (pdf_page *)page); if (widget) { fprintf(mujstest_file, "GOTO %d\n", pagenum); needshot = 1; } for (;widget; widget = pdf_next_widget(widget)) { fz_rect rect; int w, h, len; int type = pdf_widget_get_type(widget); pdf_bound_widget(widget, &rect); w = (rect.x1 - rect.x0); h = (rect.y1 - rect.y0); ++mujstest_count; switch (type) { default: fprintf(mujstest_file, "%% UNKNOWN %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case PDF_WIDGET_TYPE_PUSHBUTTON: fprintf(mujstest_file, "%% PUSHBUTTON %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case PDF_WIDGET_TYPE_CHECKBOX: fprintf(mujstest_file, "%% CHECKBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case PDF_WIDGET_TYPE_RADIOBUTTON: fprintf(mujstest_file, "%% RADIOBUTTON %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case PDF_WIDGET_TYPE_TEXT: { int maxlen = pdf_text_widget_max_len(inter, widget); int texttype = pdf_text_widget_content_type(inter, widget); /* If height is low, assume a single row, and base * the width off that. */ if (h < 10) { w = (w+h-1) / (h ? h : 1); h = 1; } /* Otherwise, if width is low, work off height */ else if (w < 10) { h = (w+h-1) / (w ? w : 1); w = 1; } else { w = (w+9)/10; h = (h+9)/10; } len = w*h; if (len < 2) len = 2; if (len > maxlen) len = maxlen; fprintf(mujstest_file, "%% TEXT %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); switch (texttype) { default: case PDF_WIDGET_CONTENT_UNRESTRAINED: fprintf(mujstest_file, "TEXT %d ", mujstest_count); escape_string(mujstest_file, len-3, lorem); fprintf(mujstest_file, "\n"); break; case PDF_WIDGET_CONTENT_NUMBER: fprintf(mujstest_file, "TEXT %d\n", mujstest_count); break; case PDF_WIDGET_CONTENT_SPECIAL: #ifdef __MINGW32__ fprintf(mujstest_file, "TEXT %I64d\n", 46702919800LL + mujstest_count); #else fprintf(mujstest_file, "TEXT %lld\n", 46702919800LL + mujstest_count); #endif break; case PDF_WIDGET_CONTENT_DATE: fprintf(mujstest_file, "TEXT Jun %d 1979\n", 1 + ((13 + mujstest_count) % 30)); break; case PDF_WIDGET_CONTENT_TIME: ++mujstest_count; fprintf(mujstest_file, "TEXT %02d:%02d\n", ((mujstest_count/60) % 24), mujstest_count % 60); break; } break; } case PDF_WIDGET_TYPE_LISTBOX: fprintf(mujstest_file, "%% LISTBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case PDF_WIDGET_TYPE_COMBOBOX: fprintf(mujstest_file, "%% COMBOBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; } fprintf(mujstest_file, "CLICK %0.2f %0.2f\n", (rect.x0+rect.x1)/2, (rect.y0+rect.y1)/2); } } if (uselist) { fz_try(ctx) { list = fz_new_display_list(ctx); dev = fz_new_list_device(ctx, list); fz_run_page(doc, page, dev, &fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow_message(ctx, "cannot draw page %d in file '%s'", pagenum, filename); } } if (showxml) { fz_try(ctx) { dev = fz_new_trace_device(ctx); if (list) fz_run_display_list(list, dev, &fz_identity, &fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, &fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showtext) { fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); if (showtext == TEXT_HTML) fz_disable_device_hints(dev, FZ_IGNORE_IMAGE); if (list) fz_run_display_list(list, dev, &fz_identity, &fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, &fz_identity, &cookie); fz_free_device(dev); dev = NULL; if (showtext == TEXT_XML) { fz_print_text_page_xml(ctx, out, text); } else if (showtext == TEXT_HTML) { fz_analyze_text(ctx, sheet, text); fz_print_text_page_html(ctx, out, text); } else if (showtext == TEXT_PLAIN) { fz_print_text_page(ctx, out, text); fz_printf(out, "\f\n"); } } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_free_text_page(ctx, text); } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showmd5 || showtime) printf("page %s %d", filename, pagenum); if (pdfout) { fz_matrix ctm; fz_rect bounds, tbounds; pdf_page *newpage; fz_bound_page(doc, page, &bounds); fz_rotate(&ctm, rotation); tbounds = bounds; fz_transform_rect(&tbounds, &ctm); newpage = pdf_create_page(pdfout, bounds, 72, 0); fz_try(ctx) { dev = pdf_page_write(pdfout, newpage); if (list) fz_run_display_list(list, dev, &ctm, &tbounds, &cookie); else fz_run_page(doc, page, dev, &ctm, &cookie); fz_free_device(dev); dev = NULL; } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } pdf_insert_page(pdfout, newpage, INT_MAX); pdf_free_page(pdfout, newpage); } if (output && output_format == OUT_SVG) { float zoom; fz_matrix ctm; fz_rect bounds, tbounds; char buf[512]; FILE *file; fz_output *out; if (!strcmp(output, "-")) file = stdout; else { sprintf(buf, output, pagenum); file = fopen(buf, "wb"); if (file == NULL) fz_throw(ctx, FZ_ERROR_GENERIC, "cannot open file '%s': %s", buf, strerror(errno)); } out = fz_new_output_with_file(ctx, file); fz_bound_page(doc, page, &bounds); zoom = resolution / 72; fz_pre_rotate(fz_scale(&ctm, zoom, zoom), rotation); tbounds = bounds; fz_transform_rect(&tbounds, &ctm); fz_try(ctx) { dev = fz_new_svg_device(ctx, out, tbounds.x1-tbounds.x0, tbounds.y1-tbounds.y0); if (list) fz_run_display_list(list, dev, &ctm, &tbounds, &cookie); else fz_run_page(doc, page, dev, &ctm, &cookie); fz_free_device(dev); dev = NULL; } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_close_output(out); if (file != stdout) fclose(file); } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } }
static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) { fz_page *page; fz_display_list *list = NULL; fz_device *dev = NULL; int start; fz_cookie cookie = { 0 }; fz_var(list); fz_var(dev); if (showtime) { start = gettime(); } fz_try(ctx) { page = fz_load_page(doc, pagenum - 1); } fz_catch(ctx) { fz_rethrow_message(ctx, "cannot load page %d in file '%s'", pagenum, filename); } if (uselist) { fz_try(ctx) { list = fz_new_display_list(ctx); dev = fz_new_list_device(ctx, list); fz_run_page(doc, page, dev, &fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow_message(ctx, "cannot draw page %d in file '%s'", pagenum, filename); } } if (showxml) { fz_try(ctx) { dev = fz_new_trace_device(ctx); if (list) fz_run_display_list(list, dev, &fz_identity, &fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, &fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showtext) { fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); if (showtext == TEXT_HTML) fz_disable_device_hints(dev, FZ_IGNORE_IMAGE); if (list) fz_run_display_list(list, dev, &fz_identity, &fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, &fz_identity, &cookie); fz_free_device(dev); dev = NULL; if (showtext == TEXT_XML) { fz_print_text_page_xml(ctx, out, text); } else if (showtext == TEXT_HTML) { fz_analyze_text(ctx, sheet, text); fz_print_text_page_html(ctx, out, text); } else if (showtext == TEXT_PLAIN) { fz_print_text_page(ctx, out, text); fz_printf(out, "\f\n"); } } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_free_text_page(ctx, text); } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showmd5 || showtime || showfeatures) printf("page %s %d", filename, pagenum); if (showfeatures) { int iscolor; dev = fz_new_test_device(ctx, &iscolor, 0.02f); fz_try(ctx) { if (list) fz_run_display_list(list, dev, &fz_identity, &fz_infinite_rect, NULL); else fz_run_page(doc, page, dev, &fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_rethrow(ctx); } printf(" %s", iscolor ? "color" : "grayscale"); } if (pdfout) { fz_matrix ctm; fz_rect bounds, tbounds; pdf_page *newpage; fz_bound_page(doc, page, &bounds); fz_rotate(&ctm, rotation); tbounds = bounds; fz_transform_rect(&tbounds, &ctm); newpage = pdf_create_page(pdfout, bounds, 72, 0); fz_try(ctx) { dev = pdf_page_write(pdfout, newpage); if (list) fz_run_display_list(list, dev, &ctm, &tbounds, &cookie); else fz_run_page(doc, page, dev, &ctm, &cookie); fz_free_device(dev); dev = NULL; } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } pdf_insert_page(pdfout, newpage, INT_MAX); pdf_free_page(pdfout, newpage); } if (output && output_format == OUT_SVG) { float zoom; fz_matrix ctm; fz_rect bounds, tbounds; char buf[512]; FILE *file; fz_output *out; if (!strcmp(output, "-")) file = stdout; else { sprintf(buf, output, pagenum); file = fopen(buf, "wb"); if (file == NULL) fz_throw(ctx, FZ_ERROR_GENERIC, "cannot open file '%s': %s", buf, strerror(errno)); } out = fz_new_output_with_file(ctx, file); fz_bound_page(doc, page, &bounds); zoom = resolution / 72; fz_pre_rotate(fz_scale(&ctm, zoom, zoom), rotation); tbounds = bounds; fz_transform_rect(&tbounds, &ctm); fz_try(ctx) { dev = fz_new_svg_device(ctx, out, tbounds.x1-tbounds.x0, tbounds.y1-tbounds.y0); if (list) fz_run_display_list(list, dev, &ctm, &tbounds, &cookie); else fz_run_page(doc, page, dev, &ctm, &cookie); fz_free_device(dev); dev = NULL; } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_close_output(out); if (file != stdout) fclose(file); } fz_catch(ctx) { fz_drop_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } }
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); // 2. Render page. #if MUPDF_VERSION >= 10010 fz_stext_options stext_options = { 0 }; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. std::string r; for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. fz_drop_stext_page(_fz_context, text_page); fz_drop_stext_sheet(_fz_context, text_sheet); return r; }
static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) { fz_page *page; fz_display_list *list = NULL; fz_device *dev = NULL; int start; fz_cookie cookie = { 0 }; int needshot = 0; fz_var(list); fz_var(dev); if (showtime) { start = gettime(); } fz_try(ctx) { page = fz_load_page(doc, pagenum - 1); } fz_catch(ctx) { fz_throw(ctx, "cannot load page %d in file '%s'", pagenum, filename); } if (mujstest_file) { fz_interactive *inter = fz_interact(doc); fz_widget *widget = NULL; if (inter) widget = fz_first_widget(inter, page); if (widget) { fprintf(mujstest_file, "GOTO %d\n", pagenum); needshot = 1; } for (;widget; widget = fz_next_widget(inter, widget)) { fz_rect rect = fz_widget_bbox(widget); int w = (rect.x1-rect.x0); int h = (rect.y1-rect.y0); int len; int type = fz_widget_get_type(widget); ++mujstest_count; switch (type) { default: fprintf(mujstest_file, "%% UNKNOWN %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case FZ_WIDGET_TYPE_PUSHBUTTON: fprintf(mujstest_file, "%% PUSHBUTTON %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case FZ_WIDGET_TYPE_CHECKBOX: fprintf(mujstest_file, "%% CHECKBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case FZ_WIDGET_TYPE_RADIOBUTTON: fprintf(mujstest_file, "%% RADIOBUTTON %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case FZ_WIDGET_TYPE_TEXT: { int maxlen = fz_text_widget_max_len(inter, widget); int texttype = fz_text_widget_content_type(inter, widget); /* If height is low, assume a single row, and base * the width off that. */ if (h < 10) { w = (w+h-1) / (h ? h : 1); h = 1; } /* Otherwise, if width is low, work off height */ else if (w < 10) { h = (w+h-1) / (w ? w : 1); w = 1; } else { w = (w+9)/10; h = (h+9)/10; } len = w*h; if (len < 2) len = 2; if (len > maxlen) len = maxlen; fprintf(mujstest_file, "%% TEXT %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); switch (texttype) { default: case FZ_WIDGET_CONTENT_UNRESTRAINED: fprintf(mujstest_file, "TEXT %d ", mujstest_count); escape_string(mujstest_file, len-3, lorem); fprintf(mujstest_file, "\n"); break; case FZ_WIDGET_CONTENT_NUMBER: fprintf(mujstest_file, "TEXT %d\n", mujstest_count); break; case FZ_WIDGET_CONTENT_SPECIAL: fprintf(mujstest_file, "TEXT %lld\n", 46702919800LL + mujstest_count); break; case FZ_WIDGET_CONTENT_DATE: fprintf(mujstest_file, "TEXT Jun %d 1979\n", 1 + ((13 + mujstest_count) % 30)); break; case FZ_WIDGET_CONTENT_TIME: ++mujstest_count; fprintf(mujstest_file, "TEXT %02d:%02d\n", ((mujstest_count/60) % 24), mujstest_count % 60); break; } break; } case FZ_WIDGET_TYPE_LISTBOX: fprintf(mujstest_file, "%% LISTBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; case FZ_WIDGET_TYPE_COMBOBOX: fprintf(mujstest_file, "%% COMBOBOX %0.2f %0.2f %0.2f %0.2f\n", rect.x0, rect.y0, rect.x1, rect.y1); break; } fprintf(mujstest_file, "CLICK %0.2f %0.2f\n", (rect.x0+rect.x1)/2, (rect.y0+rect.y1)/2); } } if (uselist) { fz_try(ctx) { list = fz_new_display_list(ctx); dev = fz_new_list_device(ctx, list); fz_run_page(doc, page, dev, fz_identity, &cookie); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_throw(ctx, "cannot draw page %d in file '%s'", pagenum, filename); } } if (showxml) { fz_try(ctx) { dev = fz_new_trace_device(ctx); fz_printf(out, "<page number=\"%d\">\n", pagenum); if (list) fz_run_display_list(list, dev, fz_identity, fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, fz_identity, &cookie); fz_printf(out, "</page>\n"); } fz_always(ctx) { fz_free_device(dev); dev = NULL; } fz_catch(ctx) { fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showtext) { fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { text = fz_new_text_page(ctx, fz_bound_page(doc, page)); dev = fz_new_text_device(ctx, sheet, text); if (list) fz_run_display_list(list, dev, fz_identity, fz_infinite_rect, &cookie); else fz_run_page(doc, page, dev, fz_identity, &cookie); fz_free_device(dev); dev = NULL; if (showtext == TEXT_XML) { fz_print_text_page_xml(ctx, out, text); } else if (showtext == TEXT_HTML) { fz_print_text_page_html(ctx, out, text); } else if (showtext == TEXT_PLAIN) { fz_print_text_page(ctx, out, text); fz_printf(out, "\f\n"); } } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_free_text_page(ctx, text); } fz_catch(ctx) { fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } } if (showmd5 || showtime) printf("page %s %d", filename, pagenum); #ifdef GDI_PLUS_BMP_RENDERER // hack: use -G0 to "enable GDI+" when saving as TGA if (output && (strstr(output, ".bmp") || strstr(output, ".tga") && !gamma_value)) drawbmp(ctx, doc, page, list, pagenum); else #endif if (output || showmd5 || showtime) { float zoom; fz_matrix ctm; fz_rect bounds, tbounds; fz_bbox ibounds; fz_pixmap *pix = NULL; int w, h; fz_var(pix); bounds = fz_bound_page(doc, page); zoom = resolution / 72; ctm = fz_scale(zoom, zoom); ctm = fz_concat(ctm, fz_rotate(rotation)); tbounds = fz_transform_rect(ctm, bounds); ibounds = fz_round_rect(tbounds); /* convert to integers */ /* Make local copies of our width/height */ w = width; h = height; /* If a resolution is specified, check to see whether w/h are * exceeded; if not, unset them. */ if (res_specified) { int t; t = ibounds.x1 - ibounds.x0; if (w && t <= w) w = 0; t = ibounds.y1 - ibounds.y0; if (h && t <= h) h = 0; } /* Now w or h will be 0 unless they need to be enforced. */ if (w || h) { float scalex = w / (tbounds.x1 - tbounds.x0); float scaley = h / (tbounds.y1 - tbounds.y0); if (fit) { if (w == 0) scalex = 1.0f; if (h == 0) scaley = 1.0f; } else { if (w == 0) scalex = scaley; if (h == 0) scaley = scalex; } if (!fit) { if (scalex > scaley) scalex = scaley; else scaley = scalex; } ctm = fz_concat(ctm, fz_scale(scalex, scaley)); tbounds = fz_transform_rect(ctm, bounds); } ibounds = fz_round_rect(tbounds); /* TODO: banded rendering and multi-page ppm */ fz_try(ctx) { pix = fz_new_pixmap_with_bbox(ctx, colorspace, ibounds); if (savealpha) fz_clear_pixmap(ctx, pix); else fz_clear_pixmap_with_value(ctx, pix, 255); dev = fz_new_draw_device(ctx, pix); if (list) fz_run_display_list(list, dev, ctm, tbounds, &cookie); else fz_run_page(doc, page, dev, ctm, &cookie); fz_free_device(dev); dev = NULL; if (invert) fz_invert_pixmap(ctx, pix); if (gamma_value != 1) fz_gamma_pixmap(ctx, pix, gamma_value); if (savealpha) fz_unmultiply_pixmap(ctx, pix); if (output) { char buf[512]; sprintf(buf, output, pagenum); if (strstr(output, ".pgm") || strstr(output, ".ppm") || strstr(output, ".pnm")) fz_write_pnm(ctx, pix, buf); else if (strstr(output, ".pam")) fz_write_pam(ctx, pix, buf, savealpha); else if (strstr(output, ".png")) fz_write_png(ctx, pix, buf, savealpha); else if (strstr(output, ".pbm")) { fz_bitmap *bit = fz_halftone_pixmap(ctx, pix, NULL); fz_write_pbm(ctx, bit, buf); fz_drop_bitmap(ctx, bit); } /* SumatraPDF: support TGA as output format */ else if (strstr(output, ".tga")) fz_write_tga(ctx, pix, buf, savealpha); } if (showmd5) { unsigned char digest[16]; int i; fz_md5_pixmap(pix, digest); printf(" "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); } } fz_always(ctx) { fz_free_device(dev); dev = NULL; fz_drop_pixmap(ctx, pix); } fz_catch(ctx) { fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } }
DrPage * DrPDFExtractor::ExtractPage(unsigned int pageno) { fz_page * page = fz_load_page(m_doc, pageno); if (page == NULL) { return NULL; } DrPage * dpage = new DrPage; dpage->SetPageNo(pageno); std::list<DrChar *> charlist; std::list<DrPhrase *> phraselist; std::list<DrLine *> linelist; std::list<DrZone *> &zonelist = dpage->m_zonelist; fz_matrix transform; fz_rotate(&transform,0); fz_pre_scale(&transform, 1.0f, 1.0f); fz_rect bounds; fz_bound_page(m_doc, page, &bounds); fz_transform_rect(&bounds, &transform); fz_irect bbox; fz_round_rect(&bbox, &bounds); fz_matrix ttransform = transform; fz_pixmap *pix = fz_new_pixmap_with_bbox(m_ctx, fz_device_rgb(m_ctx), &bbox); fz_clear_pixmap_with_value(m_ctx, pix, 0xff); fz_device * dev = fz_new_draw_device(m_ctx,pix); fz_run_page(m_doc, page, dev, &transform, NULL); fz_free_device(dev); fz_text_sheet * sheet = fz_new_text_sheet(m_ctx); fz_text_page * tpage = fz_new_text_page(m_ctx); fz_device * cdev = fz_new_text_device(m_ctx, sheet, tpage); fz_run_page(m_doc, page, cdev, &ttransform, NULL); ExtractChars(charlist,tpage); fz_free_device(cdev); // DrThumbnail * thumb = new DrThumbnail(m_ctx,pix,pageno); // dpage->m_thumbnail = thumb; DrTextGrouper::TextGroup(phraselist, charlist); std::list<DrPhrase *>::iterator itphrase = phraselist.begin(); while (itphrase != phraselist.end()) { if ((*itphrase)->IsSpacePhrase()) { delete *itphrase; itphrase = phraselist.erase(itphrase); } else itphrase++; } DrTextGrouper::TextGroup(linelist, phraselist); DrTextGrouper::TextGroup(zonelist, linelist); dpage->CalculatePageBox(); // fz_free_text_sheet(m_ctx, tsheet); // fz_free_text_page(m_ctx, tpage); fz_free_page(m_doc, page); return dpage; }
static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) { fz_page *page; fz_link *links; fz_display_list *list = NULL; fz_device *dev = NULL; int start; fz_var(list); fz_var(dev); if (showtime) { start = gettime(); } fz_try(ctx) { page = fz_load_page(doc, pagenum - 1); } fz_catch(ctx) { fz_throw(ctx, "cannot load page %d in file '%s'", pagenum, filename); } fz_try(ctx) { links = fz_load_links(doc, page); } fz_catch(ctx) { fz_throw(ctx, "cannot load links for page %d in file '%s'", pagenum, filename); } if (uselist) { fz_try(ctx) { list = fz_new_display_list(ctx); dev = fz_new_list_device(ctx, list); fz_run_page(doc, page, dev, fz_identity, NULL); } fz_catch(ctx) { fz_free_device(dev); fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_throw(ctx, "cannot draw page %d in file '%s'", pagenum, filename); } fz_free_device(dev); dev = NULL; } if (showjson) { fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { text = fz_new_text_page(ctx, fz_bound_page(doc, page)); dev = fz_new_text_device(ctx, sheet, text); if (list) fz_run_display_list(list, dev, fz_identity, fz_infinite_bbox, NULL); else fz_run_page(doc, page, dev, fz_identity, NULL); fz_free_device(dev); dev = NULL; fz_print_text_page_json(ctx, stdout, text, links); printf("\f\n"); } fz_catch(ctx) { fz_free_device(dev); fz_free_text_page(ctx, text); fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } fz_free_text_page(ctx, text); } if (showmd5 || showtime) printf("page %s %d", filename, pagenum); if (output || showmd5 || showtime) { float zoom; fz_matrix ctm; fz_rect bounds, bounds2; fz_bbox bbox; fz_pixmap *pix = NULL; int w, h; fz_var(pix); bounds = fz_bound_page(doc, page); zoom = resolution / 72; ctm = fz_scale(zoom, zoom); ctm = fz_concat(ctm, fz_rotate(rotation)); bounds2 = fz_transform_rect(ctm, bounds); bbox = fz_round_rect(bounds2); /* Make local copies of our width/height */ w = width; h = height; /* If a resolution is specified, check to see whether w/h are * exceeded; if not, unset them. */ if (res_specified) { int t; t = bbox.x1 - bbox.x0; if (w && t <= w) w = 0; t = bbox.y1 - bbox.y0; if (h && t <= h) h = 0; } /* Now w or h will be 0 unless then need to be enforced. */ if (w || h) { float scalex = w/(bounds2.x1-bounds2.x0); float scaley = h/(bounds2.y1-bounds2.y0); if (fit) { if (w == 0) scalex = 1.0f; if (h == 0) scaley = 1.0f; } else { if (w == 0) scalex = scaley; if (h == 0) scaley = scalex; } if (!fit) { if (scalex > scaley) scalex = scaley; else scaley = scalex; } ctm = fz_concat(ctm, fz_scale(scalex, scaley)); bounds2 = fz_transform_rect(ctm, bounds); } bbox = fz_round_rect(bounds2); /* TODO: banded rendering and multi-page ppm */ fz_try(ctx) { pix = fz_new_pixmap_with_bbox(ctx, colorspace, bbox); if (savealpha) fz_clear_pixmap(ctx, pix); else fz_clear_pixmap_with_value(ctx, pix, 255); dev = fz_new_draw_device(ctx, pix); if (list) fz_run_display_list(list, dev, ctm, bbox, NULL); else fz_run_page(doc, page, dev, ctm, NULL); fz_free_device(dev); dev = NULL; if (invert) fz_invert_pixmap(ctx, pix); if (gamma_value != 1) fz_gamma_pixmap(ctx, pix, gamma_value); if (savealpha) fz_unmultiply_pixmap(ctx, pix); if (output) { char buf[512]; sprintf(buf, output, pagenum); if (strstr(output, ".pgm") || strstr(output, ".ppm") || strstr(output, ".pnm")) fz_write_pnm(ctx, pix, buf); else if (strstr(output, ".pam")) fz_write_pam(ctx, pix, buf, savealpha); else if (strstr(output, ".png")) fz_write_png(ctx, pix, buf, savealpha); else if (strstr(output, ".pbm")) { fz_bitmap *bit = fz_halftone_pixmap(ctx, pix, NULL); fz_write_pbm(ctx, bit, buf); fz_drop_bitmap(ctx, bit); } } if (showmd5) { unsigned char digest[16]; int i; fz_md5_pixmap(pix, digest); printf(" "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); } fz_drop_pixmap(ctx, pix); } fz_catch(ctx) { fz_free_device(dev); fz_drop_pixmap(ctx, pix); fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } }
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); #if MUPDF_VERSION < 10012 fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); #endif // 2. Render page. #if MUPDF_VERSION >= 10012 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), &stext_options); #elif MUPDF_VERSION >= 10010 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. std::string r; #if MUPDF_VERSION >= 10012 for (fz_stext_block* text_block = text_page->first_block; text_block != nullptr; text_block = text_block->next) { if (text_block->type != FZ_STEXT_BLOCK_TEXT) { continue; } for (fz_stext_line* text_line = text_block->u.t.first_line; text_line != nullptr; text_line = text_line->next) { for (fz_stext_char* text_char = text_line->first_char; text_char != nullptr; text_char = text_char->next) { { const int c = text_char->c; #else for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; #endif // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. fz_drop_stext_page(_fz_context, text_page); #if MUPDF_VERSION < 10012 fz_drop_stext_sheet(_fz_context, text_sheet); #endif return r; } PDFDocument::PDFOutlineItem::~PDFOutlineItem() {} PDFDocument::PDFOutlineItem::PDFOutlineItem(fz_outline* src) { if (src == nullptr) { _dest_page = -1; } else { _title = src->title; #if MUPDF_VERSION >= 10010 _dest_page = src->page; #else _dest_page = src->dest.ld.gotor.page; #endif } } int PDFDocument::PDFOutlineItem::GetDestPage() const { return _dest_page; } PDFDocument::PDFOutlineItem* PDFDocument::PDFOutlineItem::Build( fz_context* ctx, fz_outline* src) { PDFOutlineItem* root = nullptr; std::vector<std::unique_ptr<OutlineItem>> items; BuildRecursive(src, &items); fz_drop_outline(ctx, src); if (items.empty()) { return nullptr; } else if (items.size() == 1) { root = dynamic_cast<PDFOutlineItem*>(items[0].release()); } else { root = new PDFOutlineItem(nullptr); root->_title = DEFAULT_ROOT_OUTLINE_ITEM_TITLE; root->_children.swap(items); } return root; } void PDFDocument::PDFOutlineItem::BuildRecursive( fz_outline* src, std::vector<std::unique_ptr<Document::OutlineItem>>* output) { assert(output != nullptr); for (fz_outline* i = src; i != nullptr; i = i->next) { PDFOutlineItem* item = new PDFOutlineItem(i); if (i->down != nullptr) { BuildRecursive(i->down, &(item->_children)); } output->push_back(std::unique_ptr<Document::OutlineItem>(item)); } } PDFDocument::PDFPageCache::PDFPageCache(int cache_size, PDFDocument* parent) : Cache<int, pdf_page*>(cache_size), _parent(parent) {} PDFDocument::PDFPageCache::~PDFPageCache() { Clear(); } pdf_page* PDFDocument::PDFPageCache::Load(const int& page) { std::unique_lock<std::mutex> lock(_mutex); return pdf_load_page(_parent->_fz_context, _parent->_pdf_document, page); } void PDFDocument::PDFPageCache::Discard( const int& page, pdf_page* const& page_struct) { std::unique_lock<std::mutex> lock(_mutex); pdf_drop_page(_parent->_fz_context, _parent->_pdf_document, page_struct); }
JNIEXPORT jobjectArray JNICALL Java_com_artifex_mupdf_MuPDFCore_searchPage(JNIEnv * env, jobject thiz, jstring jtext) { jclass rectClass; jmethodID ctor; jobjectArray arr; jobject rect; fz_text_sheet *sheet = NULL; fz_text_page *text = NULL; fz_device *dev = NULL; float zoom; fz_matrix ctm; int pos; int len; int i, n; int hit_count = 0; const char *str; page_cache *pc = &pages[current]; rectClass = (*env)->FindClass(env, "android/graphics/RectF"); if (rectClass == NULL) return NULL; ctor = (*env)->GetMethodID(env, rectClass, "<init>", "(FFFF)V"); if (ctor == NULL) return NULL; str = (*env)->GetStringUTFChars(env, jtext, NULL); if (str == NULL) return NULL; fz_var(sheet); fz_var(text); fz_var(dev); fz_try(ctx) { fz_rect rect; if (hit_bbox == NULL) hit_bbox = fz_malloc_array(ctx, MAX_SEARCH_HITS, sizeof(*hit_bbox)); zoom = resolution / 72; ctm = fz_scale(zoom, zoom); rect = fz_transform_rect(ctm, pc->media_box); sheet = fz_new_text_sheet(ctx); text = fz_new_text_page(ctx, rect); dev = fz_new_text_device(ctx, sheet, text); fz_run_page(doc, pc->page, dev, ctm, NULL); fz_free_device(dev); dev = NULL; len = textlen(text); for (pos = 0; pos < len; pos++) { fz_bbox rr = fz_empty_bbox; n = match(text, str, pos); for (i = 0; i < n; i++) rr = fz_union_bbox(rr, bboxcharat(text, pos + i)); if (!fz_is_empty_bbox(rr) && hit_count < MAX_SEARCH_HITS) hit_bbox[hit_count++] = rr; } } fz_always(ctx) { fz_free_text_page(ctx, text); fz_free_text_sheet(ctx, sheet); fz_free_device(dev); } fz_catch(ctx) { jclass cls; (*env)->ReleaseStringUTFChars(env, jtext, str); cls = (*env)->FindClass(env, "java/lang/OutOfMemoryError"); if (cls != NULL) (*env)->ThrowNew(env, cls, "Out of memory in MuPDFCore_searchPage"); (*env)->DeleteLocalRef(env, cls); return NULL; } (*env)->ReleaseStringUTFChars(env, jtext, str); arr = (*env)->NewObjectArray(env, hit_count, rectClass, NULL); if (arr == NULL) return NULL; for (i = 0; i < hit_count; i++) { rect = (*env)->NewObject(env, rectClass, ctor, (float) (hit_bbox[i].x0), (float) (hit_bbox[i].y0), (float) (hit_bbox[i].x1), (float) (hit_bbox[i].y1)); if (rect == NULL) return NULL; (*env)->SetObjectArrayElement(env, arr, i, rect); (*env)->DeleteLocalRef(env, rect); } return arr; }