/*
 * Decode one code point from a UTF-16 byte sequence.
 *
 * s points at the next code unit, avail is the number of bytes remaining
 * (the caller guarantees at least 2), and big_endian selects the byte order.
 * A high surrogate immediately followed by a low surrogate is combined into
 * a single code point; any other unit (including an unpaired surrogate) is
 * returned unchanged. Returns the number of bytes consumed (2 or 4).
 */
static int
pdf_utf16_decode(const unsigned char *s, int avail, int big_endian, int *ucs)
{
	int hi = big_endian ? (s[0] << 8 | s[1]) : (s[0] | s[1] << 8);

	if (hi >= 0xD800 && hi <= 0xDBFF && avail >= 4)
	{
		int lo = big_endian ? (s[2] << 8 | s[3]) : (s[2] | s[3] << 8);
		if (lo >= 0xDC00 && lo <= 0xDFFF)
		{
			*ucs = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
			return 4;
		}
	}

	*ucs = hi;
	return 2;
}

/*
 * Convert Unicode/PdfDocEncoding string into utf-8.
 *
 * A string starting with the bytes FE FF is read as big-endian UTF-16, one
 * starting with FF FE as little-endian UTF-16 (the byte order mark itself is
 * not copied); anything else is mapped byte by byte through pdf_doc_encoding.
 * Surrogate pairs are combined so that characters outside the BMP become one
 * UTF-8 sequence instead of two separately encoded halves.
 *
 * Returns a zero-terminated buffer obtained from fz_malloc.
 */
char *
pdf_to_utf8(fz_context *ctx, pdf_obj *src)
{
	unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src);
	char *dstptr, *dst;
	int srclen = pdf_to_str_len(src);
	int dstlen = 0;
	int is_be, is_le;
	int ucs;
	int i;

	is_be = (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255);
	is_le = (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254);

	if (is_be || is_le)
	{
		/* First pass: measure. A trailing odd byte is ignored. */
		for (i = 2; i + 1 < srclen; )
		{
			i += pdf_utf16_decode(srcptr + i, srclen - i, is_be, &ucs);
			dstlen += fz_runelen(ucs);
		}

		dstptr = dst = fz_malloc(ctx, dstlen + 1);

		/* Second pass: encode exactly the runes that were measured. */
		for (i = 2; i + 1 < srclen; )
		{
			i += pdf_utf16_decode(srcptr + i, srclen - i, is_be, &ucs);
			dstptr += fz_runetochar(dstptr, ucs);
		}
	}
	else
	{
		for (i = 0; i < srclen; i++)
			dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);

		dstptr = dst = fz_malloc(ctx, dstlen + 1);

		for (i = 0; i < srclen; i++)
		{
			ucs = pdf_doc_encoding[srcptr[i]];
			dstptr += fz_runetochar(dstptr, ucs);
		}
	}

	*dstptr = '\0';
	return dst;
}
/*
 * Encode a zero-terminated wide string as UTF-8.
 *
 * The result is allocated with malloc and is zero-terminated.
 * Returns NULL when the allocation fails.
 */
char *
fz_utf8_from_wchar(const wchar_t *s)
{
	const wchar_t *in;
	char *result;
	char *out;
	int size = 1; /* the terminator */

	/* Measure the encoded length. */
	for (in = s; *in; in++)
		size += fz_runelen(*in);

	result = malloc(size);
	if (result == NULL)
		return result;

	/* Encode each wide character in turn. */
	out = result;
	for (in = s; *in; in++)
		out += fz_runetochar(out, *in);
	*out = 0;

	return result;
}
/*
 * Format n as an alphabetic list label followed by ". ".
 *
 * The digits are the code points alpha..omega, used as a bijective numeral
 * system (so with 26 letters: a, b, ..., z, aa, ab, ...). When alpha is above
 * 256 the range is treated as greek and the code point at alpha + 17 (the
 * final sigma) is skipped, which also reduces the base by one.
 *
 * buf receives the UTF-8 text and size is its capacity in bytes. The output
 * is always zero-terminated when size > 0 and never exceeds size bytes; if
 * the label does not fit, trailing digits are dropped but the ". " suffix is
 * kept. With size < 3 only an empty string is produced.
 */
static void
format_alpha_number(fz_context *ctx, char *buf, int size, int n, int alpha, int omega)
{
	int base = omega - alpha + 1;
	int tmp[40];
	int max_digits = (int)(sizeof tmp / sizeof tmp[0]);
	char utf[8];
	char *limit;
	int i, k, len, c;

	/* The suffix and terminator alone need three bytes. */
	if (size < 3)
	{
		if (size > 0)
			*buf = 0;
		return;
	}
	limit = buf + size - 3;

	if (alpha > 256) /* to skip final-s for greek */
		--base;

	/* Bijective base-26 (base-24 for greek) numeration */
	i = 0;
	if (base > 0) /* an empty digit range would divide by zero */
	{
		while (n > 0 && i < max_digits)
		{
			--n;
			c = n % base + alpha;
			if (alpha > 256 && c > alpha + 16) /* skip final-s for greek */
				++c;
			tmp[i++] = c;
			n /= base;
		}
	}

	/* Digits were produced least significant first; emit them reversed. */
	while (i > 0)
	{
		len = fz_runetochar(utf, tmp[--i]);
		if (buf + len > limit)
			break;
		for (k = 0; k < len; k++)
			*buf++ = utf[k];
	}

	*buf++ = '.';
	*buf++ = ' ';
	*buf = 0;
}
/* Write the characters of every line in BLOCK to MEMSTREAM as UTF-8.
   Spans within a line are separated by a single space and every line is
   followed by a newline.  Characters whose encoding starts with a zero
   byte are skipped with a warning.  */
static void
process_block (fz_context *ctx, FILE *memstream, fz_stext_block *block)
{
  int line_no;

  for (line_no = 0; line_no < block->len; line_no++)
    {
      fz_stext_line *cur_line = block->lines + line_no;
      fz_stext_span *cur_span = cur_line->first_span;

      while (cur_span)
	{
	  int pos;

	  /* Separator before every span except the first of the line.  */
	  if (cur_span != cur_line->first_span)
	    putc (' ', memstream);

	  for (pos = 0; pos < cur_span->len; pos++)
	    {
	      char utf[4];
	      int len = fz_runetochar (utf, cur_span->text[pos].c);

	      if (utf[0] != '\0')
		fwrite (utf, len, 1, memstream);
	      else
		pdfout_warn (ctx, "process_block: skipping null character");
	    }

	  cur_span = cur_span->next;
	}

      putc ('\n', memstream);
    }
}
/*
 * Fetch the current selection as UCS-2 and store it, converted, in the
 * copyutf8 and copylatin1 buffers, then claim ownership of the X selection
 * named by copy_target.
 *
 * Code points that do not fit in one byte are stored as '?' in the latin1
 * copy. Note that the selection is read from the global gapp; the app
 * parameter is not used here.
 */
void docopy(pdfapp_t *app, Atom copy_target)
{
	unsigned short copyucs2[16 * 1024];
	unsigned short *in;
	char *utf8_out = copyutf8;
	char *latin1_out = copylatin1;

	pdfapp_oncopy(&gapp, copyucs2, 16 * 1024);

	/* The UCS-2 text is zero-terminated. */
	in = copyucs2;
	while (*in != 0)
	{
		int rune = *in++;

		utf8_out += fz_runetochar(utf8_out, rune);
		*latin1_out++ = (rune < 256) ? rune : '?';
	}

	*utf8_out = 0;
	*latin1_out = 0;

	XSetSelectionOwner(xdpy, copy_target, xwin, copytime);

	justcopied = 1;
}
/*
 * Print the characters of a text page as UTF-8.
 *
 * All spans of a line are written back to back; each line is followed by a
 * newline and each block by one further newline.
 */
void
fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page)
{
	char bytes[10];
	int bi, li, si, ci;

	for (bi = 0; bi < page->len; bi++)
	{
		fz_text_block *blk = page->blocks + bi;

		for (li = 0; li < blk->len; li++)
		{
			fz_text_line *ln = blk->lines + li;

			for (si = 0; si < ln->len; si++)
			{
				fz_text_span *sp = ln->spans + si;

				for (ci = 0; ci < sp->len; ci++)
				{
					/* Emit the encoded rune one byte at a time. */
					int count = fz_runetochar(bytes, sp->text[ci].c);
					int k = 0;
					while (k < count)
					{
						fz_printf(out, "%c", bytes[k]);
						k++;
					}
				}
			}
			fz_printf(out, "\n");
		}
		fz_printf(out, "\n");
	}
}
/*
 * Write the text blocks of a structured text page as plain UTF-8.
 *
 * Blocks whose type is not FZ_STEXT_BLOCK_TEXT are skipped. Each line is
 * followed by a newline and each text block by one further newline.
 */
void
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
	fz_stext_block *blk = page->first_block;
	char bytes[10];

	while (blk)
	{
		if (blk->type == FZ_STEXT_BLOCK_TEXT)
		{
			fz_stext_line *ln;

			for (ln = blk->u.t.first_line; ln; ln = ln->next)
			{
				fz_stext_char *glyph;

				for (glyph = ln->first_char; glyph; glyph = glyph->next)
				{
					/* Emit the encoded rune one byte at a time. */
					int count = fz_runetochar(bytes, glyph->c);
					int k = 0;
					while (k < count)
					{
						fz_write_byte(ctx, out, bytes[k]);
						k++;
					}
				}
				fz_write_string(ctx, out, "\n");
			}
			fz_write_string(ctx, out, "\n");
		}
		blk = blk->next;
	}
}
/*
 * Append the UTF-8 encoding of code point c to buf.
 *
 * The buffer is grown through fz_ensure_buffer when the encoded bytes do
 * not fit in its current capacity, and its unused_bits count is reset.
 */
void
fz_write_buffer_rune(fz_context *ctx, fz_buffer *buf, int c)
{
	char utf[10];
	int n;

	n = fz_runetochar(utf, c);

	/* Make room first so the copy below stays inside the allocation. */
	if (buf->cap < buf->len + n)
		fz_ensure_buffer(ctx, buf, buf->len + n);

	memcpy(buf->data + buf->len, utf, n);
	buf->len += n;
	buf->unused_bits = 0;
}
/*
 * Print the text blocks of a page as UTF-8.
 *
 * Only blocks of type FZ_PAGE_BLOCK_TEXT produce output; other blocks are
 * passed over. Each line is followed by a newline and each text block by
 * one further newline.
 */
void
fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page)
{
	char bytes[10];
	int bi;

	for (bi = 0; bi < page->len; bi++)
	{
		fz_text_block *blk;
		int li;

		if (page->blocks[bi].type != FZ_PAGE_BLOCK_TEXT)
			continue;

		blk = page->blocks[bi].u.text;
		for (li = 0; li < blk->len; li++)
		{
			fz_text_line *ln = blk->lines + li;
			fz_text_span *sp = ln->first_span;

			while (sp)
			{
				int ci;
				for (ci = 0; ci < sp->len; ci++)
				{
					/* Emit the encoded rune one byte at a time. */
					int count = fz_runetochar(bytes, sp->text[ci].c);
					int k = 0;
					while (k < count)
					{
						fz_printf(out, "%c", bytes[k]);
						k++;
					}
				}
				sp = sp->next;
			}
			fz_printf(out, "\n");
		}
		fz_printf(out, "\n");
	}
}
/*
 * Apply the key currently held in ui.key (with modifiers ui.mod) to a text
 * input field.
 *
 * The field is described by input->text (start of the buffer), input->end
 * (end of the entered text) and the two positions input->p and input->q.
 * A selection exists whenever p != q. Shift-modified movement changes only
 * q, so q appears to be the moving end of the selection and p its anchor --
 * NOTE(review): confirm against the drawing code.
 *
 * Returns -1 for KEY_ESCAPE, 1 for KEY_ENTER and 0 for every other key.
 */
static int ui_input_key(struct input *input)
{
	switch (ui.key)
	{
	case KEY_LEFT:
		if (ui.mod == GLFW_MOD_CONTROL + GLFW_MOD_SHIFT)
		{
			/* Extend the selection one word to the left. */
			input->q = prev_word(input->q, input->text);
		}
		else if (ui.mod == GLFW_MOD_CONTROL)
		{
			/* Collapse a selection to its left edge, otherwise jump back a word. */
			if (input->p != input->q)
				input->p = input->q = input->p < input->q ? input->p : input->q;
			else
				input->p = input->q = prev_word(input->q, input->text);
		}
		else if (ui.mod == GLFW_MOD_SHIFT)
		{
			/* Extend the selection one character to the left. */
			if (input->q > input->text)
				input->q = prev_char(input->q, input->text);
		}
		else if (ui.mod == 0)
		{
			/* Collapse a selection to its left edge, otherwise step back a character. */
			if (input->p != input->q)
				input->p = input->q = input->p < input->q ? input->p : input->q;
			else if (input->q > input->text)
				input->p = input->q = prev_char(input->q, input->text);
		}
		break;
	case KEY_RIGHT:
		/* Mirror image of KEY_LEFT, moving towards input->end. */
		if (ui.mod == GLFW_MOD_CONTROL + GLFW_MOD_SHIFT)
		{
			input->q = next_word(input->q, input->end);
		}
		else if (ui.mod == GLFW_MOD_CONTROL)
		{
			if (input->p != input->q)
				input->p = input->q = input->p > input->q ? input->p : input->q;
			else
				input->p = input->q = next_word(input->q, input->end);
		}
		else if (ui.mod == GLFW_MOD_SHIFT)
		{
			if (input->q < input->end)
				input->q = next_char(input->q);
		}
		else if (ui.mod == 0)
		{
			if (input->p != input->q)
				input->p = input->q = input->p > input->q ? input->p : input->q;
			else if (input->q < input->end)
				input->p = input->q = next_char(input->q);
		}
		break;
	case KEY_UP:
	case KEY_HOME:
		/* Go to the start of the text; with shift only q moves. */
		if (ui.mod == GLFW_MOD_CONTROL + GLFW_MOD_SHIFT)
		{
			input->q = input->text;
		}
		else if (ui.mod == GLFW_MOD_CONTROL)
		{
			input->p = input->q = input->text;
		}
		else if (ui.mod == GLFW_MOD_SHIFT)
		{
			input->q = input->text;
		}
		else if (ui.mod == 0)
		{
			input->p = input->q = input->text;
		}
		break;
	case KEY_DOWN:
	case KEY_END:
		/* Go to the end of the text; with shift only q moves. */
		if (ui.mod == GLFW_MOD_CONTROL + GLFW_MOD_SHIFT)
		{
			input->q = input->end;
		}
		else if (ui.mod == GLFW_MOD_CONTROL)
		{
			input->p = input->q = input->end;
		}
		else if (ui.mod == GLFW_MOD_SHIFT)
		{
			input->q = input->end;
		}
		else if (ui.mod == 0)
		{
			input->p = input->q = input->end;
		}
		break;
	case KEY_DELETE:
		if (input->p != input->q)
			ui_input_delete_selection(input);
		else if (input->p < input->end)
		{
			/* Remove the character after p by shifting the tail down over it. */
			char *np = next_char(input->p);
			memmove(input->p, np, input->end - np);
			input->end -= np - input->p;
			*input->end = 0;
			input->q = input->p;
		}
		break;
	case KEY_ESCAPE:
		return -1;
	case KEY_ENTER:
		return 1;
	case KEY_BACKSPACE:
		if (input->p != input->q)
			ui_input_delete_selection(input);
		else if (input->p > input->text)
		{
			/* Remove the character before p by shifting the tail down over it. */
			char *pp = prev_char(input->p, input->text);
			memmove(pp, input->p, input->end - input->p);
			input->end -= input->p - pp;
			*input->end = 0;
			input->q = input->p = pp;
		}
		break;
	case KEY_CTL_A:
		input->p = input->q = input->text;
		break;
	case KEY_CTL_E:
		input->p = input->q = input->end;
		break;
	case KEY_CTL_W:
		/* Delete the selection, or else the span from the previous word start up to q. */
		if (input->p != input->q)
			ui_input_delete_selection(input);
		else
		{
			input->p = prev_word(input->p, input->text);
			ui_input_delete_selection(input);
		}
		break;
	case KEY_CTL_U:
		/*
		 * Discard all text by resetting every pointer to the start.
		 * NOTE(review): unlike the delete/backspace cases this does not
		 * store a terminating zero at input->end -- confirm no reader
		 * treats input->text as a C string afterwards.
		 */
		input->p = input->q = input->end = input->text;
		break;
	case KEY_CTL_C:
	case KEY_CTL_X:
		if (input->p != input->q)
		{
			/* Copy the selected bytes, zero-terminated, to the clipboard. */
			char buf[sizeof input->text];
			char *p = input->p < input->q ? input->p : input->q;
			char *q = input->p > input->q ? input->p : input->q;
			memmove(buf, p, q - p);
			buf[q-p] = 0;
			glfwSetClipboardString(window, buf);
			/* Cut additionally removes the selection. */
			if (ui.key == KEY_CTL_X)
				ui_input_delete_selection(input);
		}
		break;
	case KEY_CTL_V:
		{
			const char *buf = glfwGetClipboardString(window);
			if (buf)
				ui_input_paste(input, buf, (int)strlen(buf));
		}
		break;
	default:
		if (ui.key >= 32)
		{
			/*
			 * Insert the key as UTF-8 text if it is a space or its general
			 * category lies in the range [LL, ZL).
			 */
			int cat = ucdn_get_general_category(ui.key);
			if (ui.key == ' ' || (cat >= UCDN_GENERAL_CATEGORY_LL && cat < UCDN_GENERAL_CATEGORY_ZL))
			{
				char buf[8];
				int n = fz_runetochar(buf, ui.key);
				ui_input_paste(input, buf, n);
			}
		}
		break;
	}
	return 0;
}
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); // 2. Render page. #if MUPDF_VERSION >= 10010 fz_stext_options stext_options = { 0 }; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. std::string r; for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. 
char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. fz_drop_stext_page(_fz_context, text_page); fz_drop_stext_sheet(_fz_context, text_sheet); return r; }
/*
 * Convert Unicode/PdfDocEncoding string into utf-8.
 *
 * src may be a string object or a stream; any other object converts to an
 * empty string. Data starting with the bytes FE FF is read as big-endian
 * UTF-16 and data starting with FF FE as little-endian UTF-16 (the byte
 * order mark is not copied). Everything else is mapped byte by byte through
 * pdf_doc_encoding, with pdf_replace_undefined applied to each entry.
 *
 * Returns a zero-terminated buffer obtained from fz_malloc. Errors raised
 * while loading or allocating are rethrown after the stream buffer has been
 * dropped.
 */
char *
pdf_to_utf8(pdf_document *doc, pdf_obj *src)
{
	fz_context *ctx = doc->ctx;
	fz_buffer *strmbuf = NULL;
	unsigned char *srcptr;
	char *dstptr, *dst;
	int srclen;
	int dstlen = 0;
	int ucs;
	int i;

	fz_var(strmbuf);
	fz_try(ctx)
	{
		if (pdf_is_string(src))
		{
			srcptr = (unsigned char *) pdf_to_str_buf(src);
			srclen = pdf_to_str_len(src);
		}
		else if (pdf_is_stream(doc, pdf_to_num(src), pdf_to_gen(src)))
		{
			strmbuf = pdf_load_stream(doc, pdf_to_num(src), pdf_to_gen(src));
			srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr);
		}
		else
		{
			srclen = 0;
		}

		if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
		{
			for (i = 2; i + 1 < srclen; i += 2)
			{
				ucs = srcptr[i] << 8 | srcptr[i+1];
				dstlen += fz_runelen(ucs);
			}

			dstptr = dst = fz_malloc(ctx, dstlen + 1);

			for (i = 2; i + 1 < srclen; i += 2)
			{
				ucs = srcptr[i] << 8 | srcptr[i+1];
				dstptr += fz_runetochar(dstptr, ucs);
			}
		}
		else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
		{
			for (i = 2; i + 1 < srclen; i += 2)
			{
				ucs = srcptr[i] | srcptr[i+1] << 8;
				dstlen += fz_runelen(ucs);
			}

			dstptr = dst = fz_malloc(ctx, dstlen + 1);

			for (i = 2; i + 1 < srclen; i += 2)
			{
				ucs = srcptr[i] | srcptr[i+1] << 8;
				dstptr += fz_runetochar(dstptr, ucs);
			}
		}
		else
		{
			/*
			 * Measure the rune that will actually be written: the
			 * replacement for an undefined entry may encode to a
			 * different number of bytes than the raw table value.
			 */
			for (i = 0; i < srclen; i++)
			{
				ucs = pdf_replace_undefined(pdf_doc_encoding[srcptr[i]]);
				dstlen += fz_runelen(ucs);
			}

			dstptr = dst = fz_malloc(ctx, dstlen + 1);

			for (i = 0; i < srclen; i++)
			{
				ucs = pdf_replace_undefined(pdf_doc_encoding[srcptr[i]]);
				dstptr += fz_runetochar(dstptr, ucs);
			}
		}
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, strmbuf);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	*dstptr = '\0';
	return dst;
}
/*
 * Convert Unicode/PdfDocEncoding string into utf-8.
 *
 * src may be a string object or a stream; any other object converts to an
 * empty string. Data starting with the bytes FE FF is decoded as UTF-16BE
 * through rune_from_utf16be, skipping the 6- and 8-byte language escape
 * sequences delimited by 00 1B. All other data is mapped byte by byte
 * through pdf_doc_encoding.
 *
 * Returns a zero-terminated buffer obtained from fz_malloc. Errors are
 * rethrown after the stream buffer has been dropped.
 */
char *
pdf_to_utf8(fz_context *ctx, pdf_document *doc, pdf_obj *src)
{
	fz_buffer *stmbuf = NULL;
	unsigned char *srcptr;
	char *dstptr, *dst;
	size_t srclen;
	size_t dstlen = 0;
	size_t pos;
	int rune;

	fz_var(stmbuf);

	fz_try(ctx)
	{
		/* Locate the raw bytes. */
		if (pdf_is_string(ctx, src))
		{
			srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
			srclen = pdf_to_str_len(ctx, src);
		}
		else if (pdf_is_stream(ctx, src))
		{
			stmbuf = pdf_load_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src));
			srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
		}
		else
		{
			srclen = 0;
		}

		if (srclen < 2 || srcptr[0] != 254 || srcptr[1] != 255)
		{
			/* PDFDocEncoding: measure, allocate, then encode. */
			for (pos = 0; pos < srclen; pos++)
				dstlen += fz_runelen(pdf_doc_encoding[srcptr[pos]]);

			dstptr = dst = fz_malloc(ctx, dstlen + 1);

			for (pos = 0; pos < srclen; pos++)
			{
				rune = pdf_doc_encoding[srcptr[pos]];
				dstptr += fz_runetochar(dstptr, rune);
			}
		}
		else
		{
			/* UTF-16BE, first pass: measure. */
			for (pos = 2; pos + 2 <= srclen; )
			{
				unsigned char *p = srcptr + pos;
				size_t left = srclen - pos;

				/* skip language escape codes */
				if (left >= 6 && p[0] == 0 && p[1] == 27 && p[4] == 0 && p[5] == 27)
				{
					pos += 6;
				}
				else if (left >= 8 && p[0] == 0 && p[1] == 27 && p[6] == 0 && p[7] == 27)
				{
					pos += 8;
				}
				else
				{
					pos += rune_from_utf16be(&rune, p, srcptr + srclen);
					dstlen += fz_runelen(rune);
				}
			}

			dstptr = dst = fz_malloc(ctx, dstlen + 1);

			/* UTF-16BE, second pass: encode the same runes. */
			for (pos = 2; pos + 2 <= srclen; )
			{
				unsigned char *p = srcptr + pos;
				size_t left = srclen - pos;

				/* skip language escape codes */
				if (left >= 6 && p[0] == 0 && p[1] == 27 && p[4] == 0 && p[5] == 27)
				{
					pos += 6;
				}
				else if (left >= 8 && p[0] == 0 && p[1] == 27 && p[6] == 0 && p[7] == 27)
				{
					pos += 8;
				}
				else
				{
					pos += rune_from_utf16be(&rune, p, srcptr + srclen);
					dstptr += fz_runetochar(dstptr, rune);
				}
			}
		}
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, stmbuf);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	*dstptr = '\0';
	return dst;
}
/*
 * Encode a unicode character as UTF-8 and write the resulting bytes to
 * the output.
 */
void
fz_write_rune(fz_context *ctx, fz_output *out, int rune)
{
	char utf[10];
	int n;

	n = fz_runetochar(utf, rune);
	fz_write_data(ctx, out, utf, n);
}
std::string PDFDocument::GetPageText(int page, int line_sep) { // 1. Init MuPDF structures. pdf_page* page_struct = GetPage(page); #if MUPDF_VERSION < 10012 fz_stext_sheet* text_sheet = fz_new_stext_sheet(_fz_context); #endif // 2. Render page. #if MUPDF_VERSION >= 10012 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), &stext_options); #elif MUPDF_VERSION >= 10010 fz_stext_options stext_options = {0}; // See #elif MUPDF_VERSION >= 10009 block below. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet, &stext_options); #elif MUPDF_VERSION >= 10009 // The function below is a wrapper around fz_run_page that uses a fresh // device. We can't use pdf_run_page to gather the text for us. // These notes are also left in here in case MuPDF's API changes again. fz_stext_page* text_page = fz_new_stext_page_from_page( _fz_context, &(page_struct->super), text_sheet); #else fz_stext_page* text_page = fz_new_text_page(_fz_context); fz_device* dev = fz_new_stext_device(_fz_context, text_sheet, text_page); // I've no idea what fz_{begin,end}_page do, but without them pdf_run_page // segfaults :-/ fz_begin_page(_fz_context, dev, &fz_infinite_rect, &fz_identity); pdf_run_page( _fz_context, _pdf_document, page_struct, dev, &fz_identity, nullptr); fz_end_page(_fz_context, dev); #endif // 3. Build text. 
std::string r; #if MUPDF_VERSION >= 10012 for (fz_stext_block* text_block = text_page->first_block; text_block != nullptr; text_block = text_block->next) { if (text_block->type != FZ_STEXT_BLOCK_TEXT) { continue; } for (fz_stext_line* text_line = text_block->u.t.first_line; text_line != nullptr; text_line = text_line->next) { for (fz_stext_char* text_char = text_line->first_char; text_char != nullptr; text_char = text_char->next) { { const int c = text_char->c; #else for (fz_page_block* page_block = text_page->blocks; page_block < text_page->blocks + text_page->len; ++page_block) { assert(page_block != nullptr); if (page_block->type != FZ_PAGE_BLOCK_TEXT) { continue; } fz_stext_block* const text_block = page_block->u.text; assert(text_block != nullptr); for (fz_stext_line* text_line = text_block->lines; text_line < text_block->lines + text_block->len; ++text_line) { assert(text_line != nullptr); for (fz_stext_span* text_span = text_line->first_span; text_span != nullptr; text_span = text_span->next) { for (int i = 0; i < text_span->len; ++i) { const int c = text_span->text[i].c; #endif // A single UTF-8 character cannot take more than 4 bytes, but let's // go for 8. char buffer[8]; const int num_bytes = fz_runetochar(buffer, c); assert(num_bytes <= static_cast<int>(sizeof(buffer))); buffer[num_bytes] = '\0'; r += buffer; } } if (!isspace(r.back())) { r += line_sep; } } } // 4. Clean up. 
fz_drop_stext_page(_fz_context, text_page); #if MUPDF_VERSION < 10012 fz_drop_stext_sheet(_fz_context, text_sheet); #endif return r; } PDFDocument::PDFOutlineItem::~PDFOutlineItem() {} PDFDocument::PDFOutlineItem::PDFOutlineItem(fz_outline* src) { if (src == nullptr) { _dest_page = -1; } else { _title = src->title; #if MUPDF_VERSION >= 10010 _dest_page = src->page; #else _dest_page = src->dest.ld.gotor.page; #endif } } int PDFDocument::PDFOutlineItem::GetDestPage() const { return _dest_page; } PDFDocument::PDFOutlineItem* PDFDocument::PDFOutlineItem::Build( fz_context* ctx, fz_outline* src) { PDFOutlineItem* root = nullptr; std::vector<std::unique_ptr<OutlineItem>> items; BuildRecursive(src, &items); fz_drop_outline(ctx, src); if (items.empty()) { return nullptr; } else if (items.size() == 1) { root = dynamic_cast<PDFOutlineItem*>(items[0].release()); } else { root = new PDFOutlineItem(nullptr); root->_title = DEFAULT_ROOT_OUTLINE_ITEM_TITLE; root->_children.swap(items); } return root; } void PDFDocument::PDFOutlineItem::BuildRecursive( fz_outline* src, std::vector<std::unique_ptr<Document::OutlineItem>>* output) { assert(output != nullptr); for (fz_outline* i = src; i != nullptr; i = i->next) { PDFOutlineItem* item = new PDFOutlineItem(i); if (i->down != nullptr) { BuildRecursive(i->down, &(item->_children)); } output->push_back(std::unique_ptr<Document::OutlineItem>(item)); } } PDFDocument::PDFPageCache::PDFPageCache(int cache_size, PDFDocument* parent) : Cache<int, pdf_page*>(cache_size), _parent(parent) {} PDFDocument::PDFPageCache::~PDFPageCache() { Clear(); } pdf_page* PDFDocument::PDFPageCache::Load(const int& page) { std::unique_lock<std::mutex> lock(_mutex); return pdf_load_page(_parent->_fz_context, _parent->_pdf_document, page); } void PDFDocument::PDFPageCache::Discard( const int& page, pdf_page* const& page_struct) { std::unique_lock<std::mutex> lock(_mutex); pdf_drop_page(_parent->_fz_context, _parent->_pdf_document, page_struct); }