/*
 * Fetch the character (and its bounding box) at flat index idx of a text page.
 *
 * Indices count characters span by span, line by line, block by block; one
 * extra index after the last span of every line stands for a pseudo-newline.
 *
 * cab:  caller-supplied result record; it is filled in and returned.
 * page: the text page to search.
 * idx:  zero-based flat character index.
 *
 * Returns cab. For a real character, cab->c is its code and cab->bbox is
 * filled in by fz_text_char_bbox. For a pseudo-newline, a negative index, or
 * an index past the end of the page, cab->c is 0 and cab->bbox is
 * fz_empty_rect.
 */
fz_char_and_box *fz_text_char_at(fz_char_and_box *cab, fz_text_page *page, int idx)
{
	fz_text_block *block;
	fz_text_line *line;
	int ofs = 0;

	/* A negative index can never match; without this guard it would be
	 * used to index before the start of the first span's text array. */
	if (idx < 0)
	{
		cab->bbox = fz_empty_rect;
		cab->c = 0;
		return cab;
	}

	for (block = page->blocks; block < page->blocks + page->len; block++)
	{
		for (line = block->lines; line < block->lines + block->len; line++)
		{
			int span_num;

			for (span_num = 0; span_num < line->len; span_num++)
			{
				fz_text_span *span = line->spans[span_num];

				if (idx < ofs + span->len)
				{
					cab->c = span->text[idx - ofs].c;
					fz_text_char_bbox(&cab->bbox, span, idx - ofs);
					return cab;
				}
				ofs += span->len;
			}

			/* pseudo-newline: it occupies index ofs, not index 0. Testing
			 * against 0 let idx == ofs slip through, after which the next
			 * span was indexed with idx - ofs == -1. */
			if (idx == ofs)
			{
				cab->bbox = fz_empty_rect;
				cab->c = 0;
				return cab;
			}
			ofs++;
		}
	}

	/* idx lies beyond the last character of the page. */
	cab->bbox = fz_empty_rect;
	cab->c = 0;
	return cab;
}
/* Information down to the character level */
/*
 * Report the code and bounding box of one character of a text line.
 *
 * page:      a fz_text_page, passed as an opaque pointer.
 * block_num: index into the page's block array.
 * line_num:  index into the block's line array.
 * item_num:  character index within the line, counted across all of the
 *            line's spans in list order.
 * top_x, top_y, height, width: receive the character's bounding box
 *            (x0, y0, y1 - y0, x1 - x0). All four are set to 0 on failure.
 *
 * Returns the character code, or -1 when page is NULL or any of the three
 * indices is out of range.
 *
 * NOTE(review): the block is read through its u.text member without looking
 * at the block type - confirm callers only pass numbers of text blocks.
 */
SYMBOL_DECLSPEC int __stdcall mGetTextCharacter(void *page, int block_num,
	int line_num, int item_num, double *top_x, double *top_y,
	double *height, double *width)
{
	fz_text_page *text = (fz_text_page*)page;
	fz_text_block *block;
	fz_text_line line;
	fz_text_span *span;
	fz_char_and_box cab;
	int index = item_num;

	/* Give the outputs defined values for every failure path below. */
	*top_x = 0;
	*top_y = 0;
	*height = 0;
	*width = 0;

	if (text == NULL || block_num < 0 || block_num >= text->len)
		return -1;
	block = text->blocks[block_num].u.text;

	if (block == NULL || line_num < 0 || line_num >= block->len)
		return -1;
	line = block->lines[line_num];

	/* A negative item would index before the first span's text array. */
	if (index < 0)
		return -1;

	/* Skip whole spans until index falls inside one; stop at the end of the
	 * list instead of dereferencing a NULL next pointer. */
	span = line.first_span;
	while (span != NULL && index >= span->len)
	{
		index = index - span->len;  /* Reset to start of next span */
		span = span->next;          /* Get next span */
	}
	if (span == NULL)
		return -1;

	cab.c = span->text[index].c;
	fz_text_char_bbox(&(cab.bbox), span, index);
	*top_x = cab.bbox.x0;
	*top_y = cab.bbox.y0;
	*height = cab.bbox.y1 - *top_y;
	*width = cab.bbox.x1 - *top_x;
	return cab.c;
}
/*
 * Copy the text whose character boxes touch rect into a new string.
 *
 * Only text blocks are examined. A character is copied when its bounding box
 * overlaps (or touches) rect; codes below 32 are copied as '?'. A '\n' is
 * written before the next span visited whenever the last span of a line
 * contributed at least one character.
 *
 * Returns the zero-terminated data of the internal buffer. The buffer
 * structure itself is released with fz_free; the returned data is not.
 */
char *
fz_copy_selection(fz_context *ctx, fz_text_page *page, fz_rect rect)
{
	fz_buffer *out = fz_new_buffer(ctx, 1024);
	int pending_newline = 0;
	char *result;
	int b;

	for (b = 0; b < page->len; b++)
	{
		fz_text_block *blk;
		int l;

		if (page->blocks[b].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		blk = page->blocks[b].u.text;

		for (l = 0; l < blk->len; l++)
		{
			fz_text_line *ln = &blk->lines[l];
			fz_text_span *sp = ln->first_span;

			while (sp)
			{
				int emitted = 0;
				int k;

				/* Break owed by the previous line's final span. */
				if (pending_newline)
					fz_write_buffer_byte(ctx, out, '\n');

				for (k = 0; k < sp->len; k++)
				{
					fz_rect box;
					int code;

					fz_text_char_bbox(&box, sp, k);
					if (!(box.x1 >= rect.x0 && box.x0 <= rect.x1 &&
						box.y1 >= rect.y0 && box.y0 <= rect.y1))
						continue;

					code = sp->text[k].c;
					if (code < 32)
						code = '?';
					fz_write_buffer_rune(ctx, out, code);
					emitted = 1;
				}

				/* Only the last span of a line can request a break. */
				pending_newline = (emitted && sp == ln->last_span);
				sp = sp->next;
			}
		}
	}

	fz_write_buffer_byte(ctx, out, 0);

	/* Keep the data, discard only the buffer header. */
	result = (char*)out->data;
	fz_free(ctx, out);
	return result;
}
/*
 * Copy the text whose character boxes touch rect into a new string.
 *
 * Every block of the page is walked line by line and span by span. A
 * character is copied when its bounding box overlaps (or touches) rect;
 * codes below 32 are copied as '?'. A '\n' is written before the next span
 * visited whenever the last span of a line contributed at least one
 * character.
 *
 * Returns the zero-terminated data of the internal buffer. The buffer
 * structure itself is released with fz_free; the returned data is not.
 */
char *
fz_copy_selection(fz_context *ctx, fz_text_page *page, fz_rect rect)
{
	fz_buffer *out = fz_new_buffer(ctx, 1024);
	int pending_newline = 0;
	char *result;
	int b;

	for (b = 0; b < page->len; b++)
	{
		fz_text_block *blk = &page->blocks[b];
		int l;

		for (l = 0; l < blk->len; l++)
		{
			fz_text_line *ln = &blk->lines[l];
			int s;

			for (s = 0; s < ln->len; s++)
			{
				fz_text_span *sp = ln->spans[s];
				int emitted = 0;
				int k;

				/* Break owed by the previous line's final span. */
				if (pending_newline)
					fz_write_buffer_byte(ctx, out, '\n');

				for (k = 0; k < sp->len; k++)
				{
					fz_rect box;
					int code;

					fz_text_char_bbox(&box, sp, k);
					if (!(box.x1 >= rect.x0 && box.x0 <= rect.x1 &&
						box.y1 >= rect.y0 && box.y0 <= rect.y1))
						continue;

					code = sp->text[k].c;
					if (code < 32)
						code = '?';
					fz_write_buffer_rune(ctx, out, code);
					emitted = 1;
				}

				/* Only the last span of a line can request a break. */
				pending_newline = (emitted && s + 1 == ln->len);
			}
		}
	}

	fz_write_buffer_byte(ctx, out, 0);

	/* Keep the data, discard only the buffer header. */
	result = (char*)out->data;
	fz_free(ctx, out);
	return result;
}
/*
 * Collect highlight rectangles for the characters that touch rect.
 *
 * Only text blocks are examined. Within a line, selected characters are
 * merged into one running rectangle while each new character has the same
 * y0 as the run and its x0 lies within 5 units of the run's x1; otherwise
 * the run is stored and a new one starts. The run is also stored at the end
 * of every line.
 *
 * hit_bbox: array receiving the rectangles.
 * hit_max:  capacity of hit_bbox; further rectangles are dropped.
 *
 * Returns the number of rectangles written to hit_bbox.
 */
int
fz_highlight_selection(fz_context *ctx, fz_text_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max)
{
	int found = 0;
	int b;

	for (b = 0; b < page->len; b++)
	{
		fz_text_block *blk;
		int l;

		if (page->blocks[b].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		blk = page->blocks[b].u.text;

		for (l = 0; l < blk->len; l++)
		{
			fz_rect run = fz_empty_rect;
			fz_text_span *sp;

			for (sp = blk->lines[l].first_span; sp; sp = sp->next)
			{
				int k;

				for (k = 0; k < sp->len; k++)
				{
					fz_rect glyph;

					fz_text_char_bbox(&glyph, sp, k);
					if (!(glyph.x1 >= rect.x0 && glyph.x0 <= rect.x1 &&
						glyph.y1 >= rect.y0 && glyph.y0 <= rect.y1))
						continue;

					/* Same baseline box top and close enough: extend the run. */
					if (glyph.y0 == run.y0 && !(fz_abs(glyph.x0 - run.x1) > 5))
					{
						fz_union_rect(&run, &glyph);
						continue;
					}

					/* Otherwise store the finished run and start a new one. */
					if (!fz_is_empty_rect(&run) && found < hit_max)
						hit_bbox[found++] = run;
					run = glyph;
				}
			}

			/* Store whatever run is still open at the end of the line. */
			if (!fz_is_empty_rect(&run) && found < hit_max)
				hit_bbox[found++] = run;
		}
	}

	return found;
}
/*
 * Collect highlight rectangles for the characters that touch rect.
 *
 * Every block of the page is walked line by line and span by span. Within a
 * line, selected characters are merged into one running rectangle while each
 * new character has the same y0 as the run and its x0 lies within 5 units of
 * the run's x1; otherwise the run is stored and a new one starts. The run is
 * also stored at the end of every line.
 *
 * hit_bbox: array receiving the rectangles.
 * hit_max:  capacity of hit_bbox; further rectangles are dropped.
 *
 * Returns the number of rectangles written to hit_bbox.
 */
int
fz_highlight_selection(fz_context *ctx, fz_text_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max)
{
	int found = 0;
	int b;

	for (b = 0; b < page->len; b++)
	{
		fz_text_block *blk = &page->blocks[b];
		int l;

		for (l = 0; l < blk->len; l++)
		{
			fz_text_line *ln = &blk->lines[l];
			fz_rect run = fz_empty_rect;
			int s;

			for (s = 0; s < ln->len; s++)
			{
				fz_text_span *sp = ln->spans[s];
				int k;

				for (k = 0; k < sp->len; k++)
				{
					fz_rect glyph;

					fz_text_char_bbox(&glyph, sp, k);
					if (!(glyph.x1 >= rect.x0 && glyph.x0 <= rect.x1 &&
						glyph.y1 >= rect.y0 && glyph.y0 <= rect.y1))
						continue;

					/* Same box top and close enough: extend the run. */
					if (glyph.y0 == run.y0 && !(fz_abs(glyph.x0 - run.x1) > 5))
					{
						fz_union_rect(&run, &glyph);
						continue;
					}

					/* Otherwise store the finished run and start a new one. */
					if (!fz_is_empty_rect(&run) && found < hit_max)
						hit_bbox[found++] = run;
					run = glyph;
				}
			}

			/* Store whatever run is still open at the end of the line. */
			if (!fz_is_empty_rect(&run) && found < hit_max)
				hit_bbox[found++] = run;
		}
	}

	return found;
}
void DrPDFExtractor::ExtractChars(std::list<DrChar *> &charlist, fz_text_page * tpage) { for (int i = 0; i < tpage->len; i++) { fz_page_block * pb = tpage->blocks+i; if (pb->type == FZ_PAGE_BLOCK_TEXT) { fz_text_block * tb = pb->u.text; for (int j = 0; j < tb->len; j++) { fz_text_line * line = tb->lines+j; for (fz_text_span * span = line->first_span; span->next != NULL; span = span->next) { for (int k = 0; k < span->len; k++) { fz_rect bbox; fz_text_char_bbox(&bbox, span, k); fz_text_char * t_char = span->text+k; fz_text_style * style = t_char->style; DrChar * achar = new DrChar(); DrBox charbox(bbox.x0, bbox.y0, bbox.x1, bbox.y1); int fontstyle; if (style->font->ft_bold == 1 && style->font->ft_italic == 0) { fontstyle = DrFontDescriptor::FS_BOLD; } else if (style->font->ft_bold == 0 && style->font->ft_italic == 1) { fontstyle = DrFontDescriptor::FS_ITALIC; } else if (style->font->ft_bold == 1 && style->font->ft_italic == 1) { fontstyle = DrFontDescriptor::FS_BOLD_ITALIC; } else fontstyle = DrFontDescriptor::FS_NONE; DrFontDescriptor * fd = m_fontcache->FindDescriptor(style->font->name, style->size, fontstyle); if (fd == NULL) { m_fontcache->AddDescriptor(style->font->name, style->size, fontstyle); fd = m_fontcache->FindDescriptor(style->font->name, style->size, fontstyle); } achar->Initialize(charbox, t_char->c,fd); charlist.push_back(achar); } } } } } }
/*
 * Fetch the character (and its bounding box) at flat index idx of a text page.
 *
 * Indices count the characters of all text blocks, span by span and line by
 * line; one extra index after the last span of every line stands for a
 * pseudo-newline. Blocks that are not text blocks are skipped and consume no
 * indices.
 *
 * cab:  caller-supplied result record; it is filled in and returned.
 * page: the text page to search.
 * idx:  zero-based flat character index.
 *
 * Returns cab. For a real character, cab->c is its code and cab->bbox is
 * filled in by fz_text_char_bbox. For a pseudo-newline, a negative index, or
 * an index past the end of the page, cab->c is 0 and cab->bbox is
 * fz_empty_rect.
 */
fz_char_and_box *fz_text_char_at(fz_char_and_box *cab, fz_text_page *page, int idx)
{
	int block_num;
	/* Running index of the next character. It must persist across blocks:
	 * resetting it per block made every block start again at index 0, so
	 * indices beyond the first block resolved to the wrong character. */
	int ofs = 0;

	/* A negative index can never match; without this guard it would be
	 * used to index before the start of the first span's text array. */
	if (idx < 0)
	{
		cab->bbox = fz_empty_rect;
		cab->c = 0;
		return cab;
	}

	for (block_num = 0; block_num < page->len; block_num++)
	{
		fz_text_block *block;
		fz_text_line *line;
		fz_text_span *span;

		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
			continue;
		block = page->blocks[block_num].u.text;

		for (line = block->lines; line < block->lines + block->len; line++)
		{
			for (span = line->first_span; span; span = span->next)
			{
				if (idx < ofs + span->len)
				{
					cab->c = span->text[idx - ofs].c;
					fz_text_char_bbox(&cab->bbox, span, idx - ofs);
					return cab;
				}
				ofs += span->len;
			}

			/* pseudo-newline: it occupies index ofs, not index 0. Testing
			 * against 0 let idx == ofs slip through, after which the next
			 * span was indexed with idx - ofs == -1. */
			if (idx == ofs)
			{
				cab->bbox = fz_empty_rect;
				cab->c = 0;
				return cab;
			}
			ofs++;
		}
	}

	/* idx lies beyond the last character of the page. */
	cab->bbox = fz_empty_rect;
	cab->c = 0;
	return cab;
}
/*
 * Write a text page to out as XML.
 *
 * Output shape: a <page> element carrying the media box width and height,
 * containing one <block> per text block, one <line> per line, <span>
 * elements and one <char> per character. Within each span of the page a new
 * <span> element is opened whenever the character style changes; it carries
 * the span's bbox, the font name (with any prefix up to and including '+'
 * removed) and the style size. Each <char> carries its bounding box, its
 * position and its code in the c attribute.
 *
 * Image blocks produce no output.
 */
void
fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page)
{
	int block_n;

	fz_printf(out, "<page width=\"%g\" height=\"%g\">\n",
		page->mediabox.x1 - page->mediabox.x0,
		page->mediabox.y1 - page->mediabox.y0);

	for (block_n = 0; block_n < page->len; block_n++)
	{
		switch (page->blocks[block_n].type)
		{
		case FZ_PAGE_BLOCK_TEXT:
		{
			fz_text_block *block = page->blocks[block_n].u.text;
			fz_text_line *line;
			char *s;

			fz_printf(out, "<block bbox=\"%g %g %g %g\">\n",
				block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
			for (line = block->lines; line < block->lines + block->len; line++)
			{
				fz_text_span *span;

				fz_printf(out, "<line bbox=\"%g %g %g %g\">\n",
					line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
				for (span = line->first_span; span; span = span->next)
				{
					fz_text_style *style = NULL;
					int char_num;

					for (char_num = 0; char_num < span->len; char_num++)
					{
						fz_text_char *ch = &span->text[char_num];

						/* A style change closes the open <span> element
						 * (if any) and opens a new one. */
						if (ch->style != style)
						{
							if (style)
							{
								fz_printf(out, "</span>\n");
							}
							style = ch->style;
							/* Drop everything up to and including '+' in the font name. */
							s = strchr(style->font->name, '+');
							s = s ? s + 1 : style->font->name;
							fz_printf(out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
								span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
								s, style->size);
						}
						{
							fz_rect rect;
							fz_text_char_bbox(&rect, span, char_num);
							fz_printf(out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
								rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y);
						}
						/* The code is written inside a double-quoted attribute,
						 * so XML markup characters must go out as entity
						 * references, never as the raw characters. */
						switch (ch->c)
						{
						case '<': fz_printf(out, "&lt;"); break;
						case '>': fz_printf(out, "&gt;"); break;
						case '&': fz_printf(out, "&amp;"); break;
						case '"': fz_printf(out, "&quot;"); break;
						case '\'': fz_printf(out, "&apos;"); break;
						default:
							if (ch->c >= 32 && ch->c <= 127)
								fz_printf(out, "%c", ch->c);
							else
								fz_printf(out, "&#x%x;", ch->c);
							break;
						}
						fz_printf(out, "\"/>\n");
					}
					if (style)
						fz_printf(out, "</span>\n");
				}
				fz_printf(out, "</line>\n");
			}
			fz_printf(out, "</block>\n");
			break;
		}
		case FZ_PAGE_BLOCK_IMAGE:
		{
			break;
		}
		}
	}
	fz_printf(out, "</page>\n");
}
/*
 * Build a new buffer holding the text of a page that touches a selection.
 *
 * text: page to read; only its text blocks are examined.
 * sel:  selection rectangle. When fz_is_infinite_rect reports it as
 *       infinite, the bounds are widened to -FLT_MAX..FLT_MAX.
 * crlf: when non-zero, line breaks are written as "\r\n" instead of "\n".
 *
 * A character is copied when its bounding box overlaps (or touches) the
 * selection; codes below 32 are copied as '?'. A line break is written
 * before the first copied character that follows a line which contributed
 * text, so the result never ends with a line break.
 *
 * Returns the new buffer. If anything inside the fz_try section throws, the
 * buffer is dropped and the exception is rethrown.
 */
fz_buffer *
fz_new_buffer_from_text_page(fz_context *ctx, fz_text_page *text, const fz_rect *sel, int crlf)
{
	fz_buffer *buf;
	float left, top, right, bottom;
	int pending_break = 0;
	int b;

	if (fz_is_infinite_rect(sel))
	{
		left = top = -FLT_MAX;
		right = bottom = FLT_MAX;
	}
	else
	{
		left = sel->x0;
		top = sel->y0;
		right = sel->x1;
		bottom = sel->y1;
	}

	buf = fz_new_buffer(ctx, 256);
	fz_try(ctx)
	{
		for (b = 0; b < text->len; b++)
		{
			fz_text_block *blk;
			int l;

			if (text->blocks[b].type != FZ_PAGE_BLOCK_TEXT)
				continue;
			blk = text->blocks[b].u.text;

			for (l = 0; l < blk->len; l++)
			{
				fz_text_span *sp;
				int line_had_text = 0;

				for (sp = blk->lines[l].first_span; sp; sp = sp->next)
				{
					int k;

					for (k = 0; k < sp->len; k++)
					{
						fz_rect box;
						int code;

						fz_text_char_bbox(ctx, &box, sp, k);
						if (!(box.x1 >= left && box.x0 <= right &&
							box.y1 >= top && box.y0 <= bottom))
							continue;

						line_had_text = 1;

						/* Emit the break owed by an earlier line first. */
						if (pending_break)
						{
							if (crlf)
								fz_write_buffer_rune(ctx, buf, '\r');
							fz_write_buffer_rune(ctx, buf, '\n');
							pending_break = 0;
						}

						code = sp->text[k].c;
						if (code < 32)
							code = '?';
						fz_write_buffer_rune(ctx, buf, code);
					}
				}

				if (line_had_text)
					pending_break = 1;
			}
		}
	}
	fz_catch(ctx)
	{
		fz_drop_buffer(ctx, buf);
		fz_rethrow(ctx);
	}

	return buf;
}