static VALUE rg_get_text(int argc, VALUE *argv, VALUE self) { gchar *text; PopplerSelectionStyle style = POPPLER_SELECTION_GLYPH; VALUE rb_text, arg1, arg2, rb_rect; PopplerPage *page; rb_scan_args(argc, argv, "02", &arg1, &arg2); page = SELF(self); rb_rect = Qnil; if (!NIL_P(arg1)) { if (RTEST(rb_obj_is_kind_of(arg1, cRectangle))) { rb_rect = arg1; } else { rb_raise(rb_eArgError, "wrong first arrument. selection rectangle is expected."); } if (!NIL_P(arg2)) { style = RVAL2POPPLERSELECTIONSTYLE(arg2); } } if (NIL_P(rb_rect)) { #if POPPLER_CHECK_VERSION(0, 15, 0) text = poppler_page_get_text(page); #else PopplerRectangle rect; double width, height; rect.x1 = 0; rect.y1 = 0; poppler_page_get_size(page, &width, &height); rect.x2 = width; rect.y2 = height; text = poppler_page_get_text(page, style, &rect); #endif } else { PopplerRectangle *rect; rect = RVAL2POPPLERRECTANGLE(rb_rect); #if POPPLER_CHECK_VERSION(0, 15, 0) text = poppler_page_get_selected_text(page, style, rect); #else text = poppler_page_get_text(page, style, rect); #endif } rb_text = CSTR2RVAL(text); g_free(text); return rb_text; }
/*
 * Collect the text of a PDF document, page by page, up to a byte budget.
 *
 * document: the Poppler document to read.
 * n_bytes:  maximum number of bytes of page text to hand to
 *           tracker_text_validate_utf8() across all pages.
 *
 * Returns a newly allocated GString owned by the caller. A single space is
 * appended after every page for which tracker_text_validate_utf8() returned
 * TRUE; that separator is not charged against the byte budget.
 *
 * Pages that cannot be loaded, or that yield no text, are skipped.
 */
static GString *
extract_content_text (PopplerDocument *document,
                      gsize            n_bytes)
{
	gint n_pages, i = 0;
	GString *string;
	GTimer *timer;
	gsize remaining_bytes = n_bytes;

	n_pages = poppler_document_get_n_pages (document);
	string = g_string_new ("");
	timer = g_timer_new ();

	while (i < n_pages && remaining_bytes > 0) {
		PopplerPage *page;
		gsize written_bytes = 0;
		gchar *text;

		page = poppler_document_get_page (document, i);
		/* Incremented up front so every 'continue' below still advances and
		 * so the debug output reports 1-based page numbers. */
		i++;

		if (!page) {
			/* Nothing to read and nothing to unref for an unloadable page. */
			continue;
		}

		text = poppler_page_get_text (page);
		if (!text) {
			g_object_unref (page);
			continue;
		}

		/* Never pass more than the remaining budget to the validator. */
		if (tracker_text_validate_utf8 (text,
		                                MIN (strlen (text), remaining_bytes),
		                                &string,
		                                &written_bytes)) {
			g_string_append_c (string, ' ');
		}

		remaining_bytes -= written_bytes;

		g_debug ("Child: Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
		         "%" G_GSIZE_FORMAT " bytes remaining",
		         written_bytes, i, remaining_bytes);

		g_free (text);
		g_object_unref (page);
	}

	g_debug ("Child: Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
	         "%" G_GSIZE_FORMAT " bytes extracted",
	         i,
	         n_pages,
	         g_timer_elapsed (timer, NULL),
	         (n_bytes - remaining_bytes));

	g_timer_destroy (timer);

	return string;
}
int main(int argc, char const *argv[]) { char *path; PopplerDocument *doc; GError *err; gchar *gbuf; char *buf; page_t page_meta; int file_length, n; g_type_init(); if (argc != 2) { return 1; } err = NULL; buf = open_pdf_file(argv[1], &file_length); sandboxify(); doc = poppler_document_new_from_data(buf, file_length, NULL, &err); if (err != NULL) { fprintf(stderr, "Unable to open file: %s\n", err->message); return 2; } n = poppler_document_get_n_pages(doc); for (int i = 0; i < n; i++) { PopplerPage *page = poppler_document_get_page(doc, i); page_meta.pagenum = i; page_meta.text = poppler_page_get_text(page); page_meta.svg_len = 0; page_meta.svg = malloc(SVG_BUFLEN); if (!page_meta.svg) ERROR("Cannot allocate svg buffer, not enought memory?"); page_meta.free_space = SVG_BUFLEN; render_page(&page_meta, page); if (page_meta.text) free(page_meta.text); g_object_unref(page); } if (munmap(buf, file_length) == -1) PERROR("munmap()"); return 0; }
/*
 * Button callback: extract the text of the currently selected demo page,
 * time the extraction, and show both the timing and the text in the UI.
 *
 * The whole page area is used as the selection rectangle. When no text comes
 * back, the timer label says so and the text buffer is left untouched.
 */
static void
pgd_text_get_text (GtkWidget   *button,
                   PgdTextDemo *demo)
{
	PopplerPage      *current;
	PopplerRectangle  area;
	gdouble           page_w, page_h;
	gdouble           seconds;
	gchar            *contents;
	gchar            *markup;
	GTimer           *stopwatch;

	current = poppler_document_get_page (demo->doc, demo->page);
	if (current == NULL)
		return;

	/* Selection rectangle covering the full page. */
	poppler_page_get_size (current, &page_w, &page_h);
	area.x1 = 0;
	area.y1 = 0;
	area.x2 = page_w;
	area.y2 = page_h;

	stopwatch = g_timer_new ();
	contents = poppler_page_get_text (current, POPPLER_SELECTION_GLYPH, &area);
	g_timer_stop (stopwatch);

	/* The timer is stopped, so the reading is fixed from here on. */
	seconds = g_timer_elapsed (stopwatch, NULL);
	g_timer_destroy (stopwatch);
	g_object_unref (current);

	if (contents == NULL) {
		gtk_label_set_markup (GTK_LABEL (demo->timer_label),
		                      "<i>No text found</i>");
		return;
	}

	markup = g_strdup_printf ("<i>got text in %.4f seconds</i>", seconds);
	gtk_label_set_markup (GTK_LABEL (demo->timer_label), markup);
	g_free (markup);

	gtk_text_buffer_set_text (demo->buffer, contents, strlen (contents));
	g_free (contents);
}
/*
 * Collect the text of a PDF document, page by page, bounded both by a byte
 * budget and by EXTRACTION_PROCESS_TIMEOUT.
 *
 * document: the Poppler document to read.
 * n_bytes:  maximum number of bytes of page text to hand to
 *           tracker_text_validate_utf8() across all pages.
 *
 * Returns a newly allocated string that the caller must release with
 * g_free(). A single space is appended after every page for which
 * tracker_text_validate_utf8() returned TRUE.
 *
 * The elapsed time is sampled once per iteration, so a single slow page can
 * overshoot the timeout. Pages that cannot be loaded, or that yield no text,
 * are skipped.
 */
static gchar *
extract_content_text (PopplerDocument *document,
                      gsize            n_bytes)
{
	GString *string;
	GTimer *timer;
	gsize remaining_bytes;
	gint n_pages, i;
	gdouble elapsed;

	n_pages = poppler_document_get_n_pages (document);
	string = g_string_new ("");
	timer = g_timer_new ();

	for (i = 0, remaining_bytes = n_bytes, elapsed = g_timer_elapsed (timer, NULL);
	     i < n_pages && remaining_bytes > 0 && elapsed < EXTRACTION_PROCESS_TIMEOUT;
	     i++, elapsed = g_timer_elapsed (timer, NULL)) {
		PopplerPage *page;
		gsize written_bytes = 0;
		gchar *text;

		page = poppler_document_get_page (document, i);
		if (!page) {
			/* Nothing to read and nothing to unref for an unloadable page. */
			continue;
		}

		text = poppler_page_get_text (page);
		if (!text) {
			g_object_unref (page);
			continue;
		}

		/* Never pass more than the remaining budget to the validator. */
		if (tracker_text_validate_utf8 (text,
		                                MIN (strlen (text), remaining_bytes),
		                                &string,
		                                &written_bytes)) {
			g_string_append_c (string, ' ');
		}

		remaining_bytes -= written_bytes;

		g_debug ("Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
		         "%" G_GSIZE_FORMAT " bytes remaining",
		         written_bytes, i, remaining_bytes);

		g_free (text);
		g_object_unref (page);
	}

	if (elapsed >= EXTRACTION_PROCESS_TIMEOUT) {
		g_debug ("Extraction timed out, %d seconds reached", EXTRACTION_PROCESS_TIMEOUT);
	}

	g_debug ("Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
	         "%" G_GSIZE_FORMAT " bytes extracted",
	         i,
	         n_pages,
	         g_timer_elapsed (timer, NULL),
	         (n_bytes - remaining_bytes));

	g_timer_destroy (timer);

	/* FALSE: keep the character data and hand its ownership to the caller. */
	return g_string_free (string, FALSE);
}
/*
 * Search every page of a PDF for a regular expression and print each hit.
 *
 * filename: path or URI as given on the command line.
 * regex:    compiled pattern to look for in the text of each page.
 *
 * For every match, prints "<filename>:<1-based page> " between bold_on() and
 * bold_off(), followed by the whole text line containing the match. A line
 * holding several matches is printed once per match.
 *
 * If the document cannot be opened, a message goes to stderr and the function
 * returns. Pages that cannot be loaded or that yield no text are skipped.
 */
void
find(const char *filename, GRegex *regex)
{
    GFile *file;
    gchar *uri;
    PopplerDocument *doc;
    GError *error = NULL;
    gint i, n;

    file = g_file_new_for_commandline_arg(filename);
    uri = g_file_get_uri(file);
    g_object_unref(file);

    if (!(doc = poppler_document_new_from_file(uri, NULL, &error))) {
        fprintf(stderr, "Could not open file %s: %s\n", filename, error->message);
        g_error_free(error);
        goto cleanup;
    }

    n = poppler_document_get_n_pages(doc);

    for (i = 0; i < n; ++i) {
        PopplerPage *page;
        GMatchInfo *match_info = NULL;
        gchar *text;

        page = poppler_document_get_page(doc, i);
        if (!page) {
            continue;
        }

        text = poppler_page_get_text(page);
        if (!text) {
            /* Matching against NULL would leave match_info unset. */
            g_object_unref(page);
            continue;
        }

        g_regex_match(regex, text, (GRegexMatchFlags) 0, &match_info);

        while (g_match_info_matches(match_info)) {
            gint a = 0, b = 0;

            bold_on();
            printf("%s:%i ", filename, i + 1);
            bold_off();

            if (g_match_info_fetch_pos(match_info, 0, &a, &b)) {
                /* Widen [a, b) to the enclosing line: back to just after the
                 * previous newline, forward to the next newline or the end. */
                for (; a >= 1 && *(text + a - 1) != '\n'; --a);
                for (; *(text + b) && *(text + b) != '\n'; ++b);

                /* Print out the entire line. */
                printf("%.*s", (int) (b - a), text + a);
            }
            printf("\n");

            g_match_info_next(match_info, NULL);
        }

        g_match_info_free(match_info);
        g_object_unref(page);
        g_free(text);
    }

    g_object_unref(doc);

cleanup:
    g_free(uri);
}
/** called repeatedly to add more text to the editor as each page is read from the PDF file, via g_timeout_add. Spelling needs to be turned off during loading, re-attached it at the end if GConf says spelling should be enabled. This function is asynchronous - a background task. Don't do things with the progress bar or status bar whilst a load task could be running. */ static gboolean load_pdf (gpointer data) { GtkProgressBar * progressbar; GtkStatusbar * statusbar; GtkTextView * text_view; GtkTextBuffer * buffer; gchar *page, * msg, * G_GNUC_UNUSED lang; gdouble fraction, step, width, height; PopplerPage * PDFPage; const gchar * str; gint pages; guint id; Equeue * queue; queue= (Equeue *)data; text_view = GTK_TEXT_VIEW(gtk_builder_get_object (queue->ebook->builder, "textview")); buffer = gtk_text_view_get_buffer (text_view); if (!queue) { gtk_text_buffer_end_user_action (buffer); return FALSE; } if (!queue->ebook) { gtk_text_buffer_end_user_action (buffer); return FALSE; } progressbar = GTK_PROGRESS_BAR(gtk_builder_get_object (queue->ebook->builder, "progressbar")); statusbar = GTK_STATUSBAR(gtk_builder_get_object (queue->ebook->builder, "statusbar")); id = gtk_statusbar_get_context_id (statusbar, PACKAGE); pages = poppler_document_get_n_pages (queue->ebook->PDFDoc); if (queue->c >= pages) { lang = gconf_client_get_string (queue->ebook->client, queue->ebook->language.key, NULL); #ifdef HAVE_GTKSPELL /* spell_attach is already a background task. */ if (queue->spell_state) gtkspell_new_attach (text_view, (lang == NULL || *lang == '\0') ? NULL : lang, NULL); #endif gtk_progress_bar_set_text (progressbar, ""); gtk_progress_bar_set_fraction (progressbar, 0.0); if (queue->ebook->utf8_count > 0) { /* Translators: Please try to keep this string brief, there often isn't a lot of room in the statusbar. 
*/ str = ngettext ("%ld non-UTF8 character was removed", "%ld non-UTF-8 characters were removed", queue->ebook->utf8_count); msg = g_strdup_printf (str, queue->ebook->utf8_count); id = gtk_statusbar_get_context_id (statusbar, PACKAGE); gtk_statusbar_push (statusbar, id, msg); g_free (msg); } else { gtk_statusbar_push (statusbar, id, _("Done")); } return FALSE; } PDFPage = poppler_document_get_page (queue->ebook->PDFDoc, queue->c); fraction = 0.0; /* fraction never reaches 1.0 - allow room for spelling attachment. */ if (queue->spell_state) step = 0.90/(gdouble)pages; else step = 0.99/(gdouble)pages; fraction += step * queue->c; /* update progress bar as we go */ gtk_progress_bar_set_fraction (progressbar, fraction); poppler_page_get_size (PDFPage, &width, &height); queue->rect->x2 = width; queue->rect->y2 = height; #if POPPLER_CHECK_VERSION(0, 14, 1) page = poppler_page_get_selected_text (PDFPage, POPPLER_SELECTION_LINE, queue->rect); #else page = poppler_page_get_text (PDFPage, POPPLER_SELECTION_LINE, queue->rect); #endif set_text (queue->ebook, page, queue->lines, queue->pagenums, queue->hyphens); g_free (page); queue->c++; /* add the page to the progressbar count. */ msg = g_strdup_printf ("%d/%d", queue->c, pages); gtk_progress_bar_set_text (progressbar, msg); g_free (msg); /* more to do yet, so return TRUE to call me again. */ return TRUE; }