/* Write the text of page PAGE_NUMBER of DOC to STREAM.

   Every text block of the page's structured text is handed to
   process_block; a form feed followed by a newline is written after the
   last block.

   The structured-text sheet and page are released in fz_always, so they
   are not leaked when page loading or block processing throws.  Any
   MuPDF exception is rethrown to the caller.  */
void
pdfout_text_get_page (FILE *stream, fz_context *ctx, pdf_document *doc,
		      int page_number)
{
  fz_stext_sheet *sheet = NULL;
  fz_stext_page *text = NULL;

  /* Both are assigned inside fz_try and read in fz_always.  */
  fz_var (sheet);
  fz_var (text);

  fz_try (ctx)
    {
      sheet = fz_new_stext_sheet (ctx);
      text = fz_new_stext_page_from_page_number (ctx, &doc->super,
						 page_number, sheet, 0);

      for (int i = 0; i < text->len; ++i)
	{
	  fz_page_block *block = &text->blocks[i];
	  if (block->type == FZ_PAGE_BLOCK_TEXT)
	    process_block (ctx, stream, block->u.text);
	}

      /* Page separator.  */
      fprintf (stream, "\f\n");
    }
  fz_always (ctx)
    {
      /* cleanup */
      fz_drop_stext_page (ctx, text);
      fz_drop_stext_sheet (ctx, sheet);
    }
  fz_catch (ctx)
    {
      fz_rethrow (ctx);
    }
}
/*
	Build a structured-text page for 'page' by running the page through a
	newly created structured-text device bound to 'sheet'.

	Returns NULL when 'page' is NULL. Otherwise returns a new
	fz_stext_page sized from the page bounds; the caller drops it.

	If creating, running or closing the device throws, the partially
	built text page is dropped and the exception is rethrown.
*/
fz_stext_page *
fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet)
{
	fz_stext_page *text;
	/* Must start as NULL: fz_always drops it even when
	 * fz_new_stext_device itself was the call that threw. */
	fz_device *dev = NULL;
	fz_rect mediabox;

	if (page == NULL)
		return NULL;

	/* dev is assigned inside fz_try and read in fz_always. */
	fz_var(dev);

	text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox));
	fz_try(ctx)
	{
		dev = fz_new_stext_device(ctx, sheet, text);
		fz_run_page(ctx, page, dev, &fz_identity, NULL);
		fz_close_device(ctx, dev);
	}
	fz_always(ctx)
	{
		fz_drop_device(ctx, dev);
	}
	fz_catch(ctx)
	{
		fz_drop_stext_page(ctx, text);
		fz_rethrow(ctx);
	}

	return text;
}
/*
	Drop callback of the text document writer: releases the structured
	text page currently held by the writer and then its output.
*/
static void
text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
{
	fz_text_writer *writer = (fz_text_writer *)wri_;
	fz_stext_page *held_page = writer->page;
	fz_output *sink = writer->out;

	fz_drop_stext_page(ctx, held_page);
	fz_drop_output(ctx, sink);
}
/*
	End-of-page callback of the text document writer.

	Closes 'dev', prints the writer's current structured text page to the
	writer's output in the configured format (plain text is also used for
	any unrecognised format value), then releases the device and the
	page.

	The release happens in fz_always, so the device is not leaked and the
	writer is not left holding a stale page when closing the device or
	printing throws. Exceptions are rethrown to the caller.
*/
static void
text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
{
	fz_text_writer *wri = (fz_text_writer*)wri_;

	fz_try(ctx)
	{
		fz_close_device(ctx, dev);

		switch (wri->format)
		{
		default:
		case FZ_FORMAT_TEXT:
			fz_print_stext_page_as_text(ctx, wri->out, wri->page);
			break;
		case FZ_FORMAT_HTML:
			fz_print_stext_page_as_html(ctx, wri->out, wri->page);
			break;
		case FZ_FORMAT_XHTML:
			fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page);
			break;
		case FZ_FORMAT_STEXT:
			fz_print_stext_page_as_xml(ctx, wri->out, wri->page);
			break;
		}
	}
	fz_always(ctx)
	{
		fz_drop_device(ctx, dev);
		fz_drop_stext_page(ctx, wri->page);
		wri->page = NULL;
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}
/*
	Begin-of-page callback of the text document writer.

	Discards any structured text page still held by the writer, creates a
	new one for 'mediabox', stores it on the writer and returns a new
	structured-text device that targets it, configured with the writer's
	options.
*/
static fz_device *
text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediabox)
{
	fz_text_writer *writer = (fz_text_writer *)wri_;
	fz_stext_page *fresh;

	/* A page left over from an earlier begin_page is released first. */
	if (writer->page != NULL)
	{
		fz_drop_stext_page(ctx, writer->page);
		writer->page = NULL;
	}

	fresh = fz_new_stext_page(ctx, mediabox);
	writer->page = fresh;

	return fz_new_stext_device(ctx, fresh, &writer->opts);
}
/*
	Extract the text of 'page' into a new buffer.

	A temporary structured-text sheet and page are built for 'page';
	'sel' and 'crlf' are passed through unchanged to
	fz_new_buffer_from_stext_page. Returns the buffer produced by that
	call.

	Both temporaries are released in fz_always, so the text page is not
	leaked when building the buffer throws. Exceptions are rethrown.
*/
fz_buffer *
fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf)
{
	fz_stext_sheet *sheet;
	/* Starts as NULL so fz_always never drops an indeterminate pointer
	 * when building the text page fails. */
	fz_stext_page *text = NULL;
	fz_buffer *buf = NULL;

	/* text is assigned inside fz_try and read in fz_always. */
	fz_var(text);

	sheet = fz_new_stext_sheet(ctx);
	fz_try(ctx)
	{
		text = fz_new_stext_page_from_page(ctx, page, sheet);
		buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
	}
	fz_always(ctx)
	{
		fz_drop_stext_page(ctx, text);
		fz_drop_stext_sheet(ctx, sheet);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	return buf;
}
/*
	Search the text of 'page' for 'needle'.

	A temporary structured-text sheet and page are built for 'page';
	'needle', 'hit_bbox' and 'hit_max' are passed through unchanged to
	fz_search_stext_page. Returns the count reported by that call.

	Both temporaries are released in fz_always, so the text page is not
	leaked when the search throws. Exceptions are rethrown.
*/
int
fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max)
{
	fz_stext_sheet *sheet;
	/* Starts as NULL so fz_always never drops an indeterminate pointer
	 * when building the text page fails. */
	fz_stext_page *text = NULL;
	int count = 0;

	/* text is assigned inside fz_try and read in fz_always. */
	fz_var(text);

	sheet = fz_new_stext_sheet(ctx);
	fz_try(ctx)
	{
		text = fz_new_stext_page_from_page(ctx, page, sheet);
		count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
	}
	fz_always(ctx)
	{
		fz_drop_stext_page(ctx, text);
		fz_drop_stext_sheet(ctx, sheet);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	return count;
}
/*
	Build a structured-text page for 'page' by running the page through a
	newly created structured-text device bound to 'sheet'.

	Returns the new fz_stext_page; the caller drops it.

	If creating or running the device throws, the partially built text
	page is dropped and the exception is rethrown.
*/
fz_stext_page *
fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet)
{
	fz_stext_page *text;
	/* Must start as NULL: fz_always drops it even when
	 * fz_new_stext_device itself was the call that threw. */
	fz_device *dev = NULL;

	/* dev is assigned inside fz_try and read in fz_always. */
	fz_var(dev);

	text = fz_new_stext_page(ctx);
	fz_try(ctx)
	{
		dev = fz_new_stext_device(ctx, sheet, text);
		fz_run_page(ctx, page, dev, &fz_identity, NULL);
	}
	fz_always(ctx)
	{
		fz_drop_device(ctx, dev);
	}
	fz_catch(ctx)
	{
		fz_drop_stext_page(ctx, text);
		fz_rethrow(ctx);
	}

	return text;
}
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); // 2. Render page. #if MUPDF_VERSION >= 10010 fz_stext_options stext_options = { 0 }; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. std::string r; for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. 
char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. fz_drop_stext_page(_fz_context, text_page); fz_drop_stext_sheet(_fz_context, text_sheet); return r; }
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); #if MUPDF_VERSION < 10012 fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); #endif // 2. Render page. #if MUPDF_VERSION >= 10012 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), &stext_options); #elif MUPDF_VERSION >= 10010 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. 
std::string r; #if MUPDF_VERSION >= 10012 for (fz_stext_block* text_block = text_page->first_block; text_block != nullptr; text_block = text_block->next) { if (text_block->type != FZ_STEXT_BLOCK_TEXT) { continue; } for (fz_stext_line* text_line = text_block->u.t.first_line; text_line != nullptr; text_line = text_line->next) { for (fz_stext_char* text_char = text_line->first_char; text_char != nullptr; text_char = text_char->next) { { const int c = text_char->c; #else for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; #endif // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. 
fz_drop_stext_page(_fz_context, text_page); #if MUPDF_VERSION < 10012 fz_drop_stext_sheet(_fz_context, text_sheet); #endif return r; } PDFDocument::PDFOutlineItem::~PDFOutlineItem() {} PDFDocument::PDFOutlineItem::PDFOutlineItem(fz_outline* src) { if (src == nullptr) { _dest_page = -1; } else { _title = src->title; #if MUPDF_VERSION >= 10010 _dest_page = src->page; #else _dest_page = src->dest.ld.gotor.page; #endif } } int PDFDocument::PDFOutlineItem::GetDestPage() const { return _dest_page; } PDFDocument::PDFOutlineItem* PDFDocument::PDFOutlineItem::Build( fz_context* ctx, fz_outline* src) { PDFOutlineItem* root = nullptr; std::vector<std::unique_ptr<OutlineItem>> items; BuildRecursive(src, &items); fz_drop_outline(ctx, src); if (items.empty()) { return nullptr; } else if (items.size() == 1) { root = dynamic_cast<PDFOutlineItem*>(items[0].release()); } else { root = new PDFOutlineItem(nullptr); root->_title = DEFAULT_ROOT_OUTLINE_ITEM_TITLE; root->_children.swap(items); } return root; } void PDFDocument::PDFOutlineItem::BuildRecursive( fz_outline* src, std::vector<std::unique_ptr<Document::OutlineItem>>* output) { assert(output != nullptr); for (fz_outline* i = src; i != nullptr; i = i->next) { PDFOutlineItem* item = new PDFOutlineItem(i); if (i->down != nullptr) { BuildRecursive(i->down, &(item->_children)); } output->push_back(std::unique_ptr<Document::OutlineItem>(item)); } } PDFDocument::PDFPageCache::PDFPageCache(int cache_size, PDFDocument* parent) : Cache<int, pdf_page*>(cache_size), _parent(parent) {} PDFDocument::PDFPageCache::~PDFPageCache() { Clear(); } pdf_page* PDFDocument::PDFPageCache::Load(const int& page) { std::unique_lock<std::mutex> lock(_mutex); return pdf_load_page(_parent->_fz_context, _parent->_pdf_document, page); } void PDFDocument::PDFPageCache::Discard( const int& page, pdf_page* const& page_struct) { std::unique_lock<std::mutex> lock(_mutex); pdf_drop_page(_parent->_fz_context, _parent->_pdf_document, page_struct); }