예제 #1
0
/*
 * Ruby method implementation: extract text from the wrapped PopplerPage.
 *
 * Accepts up to two optional arguments:
 *   arg1 - a selection rectangle (must be a kind of cRectangle), or nil
 *   arg2 - a selection style; only looked at when arg1 is given
 *
 * Without a rectangle the text of the whole page is fetched.  With a
 * rectangle only the text selected by it (using `style`) is fetched.
 * Raises ArgumentError when arg1 is neither nil nor a rectangle.
 *
 * The C string obtained from poppler is converted with CSTR2RVAL and then
 * released with g_free().
 */
static VALUE
rg_get_text(int argc, VALUE *argv, VALUE self)
{
    gchar *text;
    PopplerSelectionStyle style = POPPLER_SELECTION_GLYPH;
    VALUE rb_text, arg1, arg2, rb_rect;
    PopplerPage *page;

    rb_scan_args(argc, argv, "02", &arg1, &arg2);

    page = SELF(self);
    rb_rect = Qnil;
    if (!NIL_P(arg1)) {
        if (!RTEST(rb_obj_is_kind_of(arg1, cRectangle))) {
            rb_raise(rb_eArgError, "wrong first argument. selection rectangle is expected.");
        }
        rb_rect = arg1;

        if (!NIL_P(arg2)) {
            style = RVAL2POPPLERSELECTIONSTYLE(arg2);
        }
    }

    if (NIL_P(rb_rect)) {
#if POPPLER_CHECK_VERSION(0, 15, 0)
        /* Newer poppler: the one-argument form returns the whole page. */
        text = poppler_page_get_text(page);
#else
        /* Older poppler: emulate "whole page" with a page-sized rectangle. */
        PopplerRectangle rect;
        double width, height;

        rect.x1 = 0;
        rect.y1 = 0;
        poppler_page_get_size(page, &width, &height);
        rect.x2 = width;
        rect.y2 = height;
        text = poppler_page_get_text(page,
                                     style,
                                     &rect);
#endif
    } else {
        PopplerRectangle *rect;

        rect = RVAL2POPPLERRECTANGLE(rb_rect);
#if POPPLER_CHECK_VERSION(0, 15, 0)
        text = poppler_page_get_selected_text(page, style, rect);
#else
        text = poppler_page_get_text(page, style, rect);
#endif
    }

    rb_text = CSTR2RVAL(text);
    g_free(text);
    return rb_text;
}
예제 #2
0
/*
 * Collect the text of the document's pages into a newly created GString.
 *
 * Pages are visited in order until either all pages were processed or
 * n_bytes bytes have been written.  Each page's text is passed through
 * tracker_text_validate_utf8(), limited to the bytes still remaining, and
 * a single space is appended after every page for which that call
 * succeeds.  Pages that cannot be loaded or yield no text are skipped.
 *
 * Returns the GString holding the accumulated text; the caller owns it.
 */
static GString *
extract_content_text (PopplerDocument *document,
                      gsize            n_bytes)
{
	gint n_pages, i = 0;
	GString *string;
	GTimer *timer;
	gsize remaining_bytes = n_bytes;

	n_pages = poppler_document_get_n_pages (document);
	string = g_string_new ("");
	timer = g_timer_new ();

	while (i < n_pages &&
	       remaining_bytes > 0) {
		PopplerPage *page;
		gsize written_bytes = 0;
		gchar *text;

		page = poppler_document_get_page (document, i);
		i++;

		/* A page that failed to load must not be passed on to
		 * poppler_page_get_text() or g_object_unref(). */
		if (!page) {
			continue;
		}

		text = poppler_page_get_text (page);

		if (!text) {
			g_object_unref (page);
			continue;
		}

		if (tracker_text_validate_utf8 (text,
		                                MIN (strlen (text), remaining_bytes),
		                                &string,
		                                &written_bytes)) {
			g_string_append_c (string, ' ');
		}

		remaining_bytes -= written_bytes;

		g_debug ("Child: Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
		         "%" G_GSIZE_FORMAT " bytes remaining",
		         written_bytes, i, remaining_bytes);

		g_free (text);
		g_object_unref (page);
	}

	g_debug ("Child: Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
	         "%" G_GSIZE_FORMAT " bytes extracted",
	         i,
	         n_pages,
	         g_timer_elapsed (timer, NULL),
	         (n_bytes - remaining_bytes));

	g_timer_destroy (timer);

	return string;
}
예제 #3
0
int main(int argc, char const *argv[])
{
	char *path;
	PopplerDocument *doc;
	GError *err;
	gchar *gbuf;
	char *buf;
	page_t page_meta;
	int file_length, n;

	g_type_init();

	if (argc != 2) {
		return 1;
	}

	err = NULL;
	buf = open_pdf_file(argv[1], &file_length);

	sandboxify();

	doc = poppler_document_new_from_data(buf, file_length, NULL, &err);
	if (err != NULL) {
		fprintf(stderr, "Unable to open file: %s\n", err->message);
		return 2;
	}

	n = poppler_document_get_n_pages(doc);

	for (int i = 0; i < n; i++) {
		PopplerPage *page = poppler_document_get_page(doc, i);

		page_meta.pagenum = i;
		page_meta.text = poppler_page_get_text(page);
		page_meta.svg_len = 0;
		page_meta.svg = malloc(SVG_BUFLEN);
		if (!page_meta.svg)
			ERROR("Cannot allocate svg buffer, not enought memory?");
		page_meta.free_space = SVG_BUFLEN;

		render_page(&page_meta, page);

		if (page_meta.text)
			free(page_meta.text);
		g_object_unref(page);
	}

	if (munmap(buf, file_length) == -1)
		PERROR("munmap()");

	return 0;
}
예제 #4
0
파일: text.c 프로젝트: Godmen/poppler
/*
 * Button callback: fetch the text of the demo's current page, show how long
 * the extraction took in the timer label, and put the text into the demo's
 * text buffer.  Does nothing when the page cannot be obtained.
 */
static void
pgd_text_get_text (GtkWidget   *button,
		   PgdTextDemo *demo)
{
	PopplerPage     *pdf_page;
	PopplerRectangle whole_page;
	gdouble          page_w, page_h;
	gchar           *contents;
	GTimer          *stopwatch;

	pdf_page = poppler_document_get_page (demo->doc, demo->page);
	if (pdf_page == NULL)
		return;

	/* Select the full page area. */
	poppler_page_get_size (pdf_page, &page_w, &page_h);
	whole_page.x1 = 0;
	whole_page.y1 = 0;
	whole_page.x2 = page_w;
	whole_page.y2 = page_h;

	/* Time only the extraction call itself. */
	stopwatch = g_timer_new ();
	contents = poppler_page_get_text (pdf_page, POPPLER_SELECTION_GLYPH, &whole_page);
	g_timer_stop (stopwatch);

	if (contents == NULL) {
		gtk_label_set_markup (GTK_LABEL (demo->timer_label), "<i>No text found</i>");
	} else {
		gchar *markup;

		markup = g_strdup_printf ("<i>got text in %.4f seconds</i>",
					  g_timer_elapsed (stopwatch, NULL));
		gtk_label_set_markup (GTK_LABEL (demo->timer_label), markup);
		g_free (markup);
	}

	g_timer_destroy (stopwatch);
	g_object_unref (pdf_page);

	if (contents == NULL)
		return;

	gtk_text_buffer_set_text (demo->buffer, contents, strlen (contents));
	g_free (contents);
}
예제 #5
0
/*
 * Collect the text of the document's pages into a newly allocated string.
 *
 * Pages are visited in order until all pages were processed, n_bytes bytes
 * have been written, or the elapsed time reaches
 * EXTRACTION_PROCESS_TIMEOUT.  Each page's text is passed through
 * tracker_text_validate_utf8(), limited to the bytes still remaining, and
 * a single space is appended after every page for which that call
 * succeeds.  Pages that cannot be loaded or yield no text are skipped.
 *
 * Returns the accumulated character data; the caller frees it with
 * g_free().
 */
static gchar *
extract_content_text (PopplerDocument *document,
                      gsize            n_bytes)
{
	GString *string;
	GTimer *timer;
	gsize remaining_bytes;
	gint n_pages, i;
	gdouble elapsed;

	n_pages = poppler_document_get_n_pages (document);
	string = g_string_new ("");
	timer = g_timer_new ();

	for (i = 0, remaining_bytes = n_bytes, elapsed = g_timer_elapsed (timer, NULL);
	     i < n_pages && remaining_bytes > 0 && elapsed < EXTRACTION_PROCESS_TIMEOUT;
	     i++, elapsed = g_timer_elapsed (timer, NULL)) {
		PopplerPage *page;
		gsize written_bytes = 0;
		gchar *text;

		page = poppler_document_get_page (document, i);

		/* A page that failed to load must not be passed on to
		 * poppler_page_get_text() or g_object_unref(). */
		if (!page) {
			continue;
		}

		text = poppler_page_get_text (page);

		if (!text) {
			g_object_unref (page);
			continue;
		}

		if (tracker_text_validate_utf8 (text,
		                                MIN (strlen (text), remaining_bytes),
		                                &string,
		                                &written_bytes)) {
			g_string_append_c (string, ' ');
		}

		remaining_bytes -= written_bytes;

		g_debug ("Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
		         "%" G_GSIZE_FORMAT " bytes remaining",
		         written_bytes, i, remaining_bytes);

		g_free (text);
		g_object_unref (page);
	}

	if (elapsed >= EXTRACTION_PROCESS_TIMEOUT) {
		g_debug ("Extraction timed out, %d seconds reached", EXTRACTION_PROCESS_TIMEOUT);
	}

	g_debug ("Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
	         "%" G_GSIZE_FORMAT " bytes extracted",
	         i,
	         n_pages,
	         g_timer_elapsed (timer, NULL),
	         (n_bytes - remaining_bytes));

	g_timer_destroy (timer);

	/* FALSE: keep the character data and hand it to the caller. */
	return g_string_free (string, FALSE);
}
예제 #6
0
/*
 * Search every page of the PDF at `filename` for `regex` and print, for
 * each match, "<filename>:<page number> " (wrapped in bold_on()/bold_off())
 * followed by the complete text line containing the match.
 *
 * filename - path or URI as given on the command line
 * regex    - compiled pattern to look for
 *
 * Prints a message to stderr and returns when the document cannot be
 * opened.  Pages that cannot be loaded or have no text are skipped.
 */
void find(const char *filename, GRegex *regex)
{
  GFile           *file;
  gchar           *uri;
  PopplerDocument *doc;
  GError          *error = NULL;
  gint             i, n;

  file = g_file_new_for_commandline_arg (filename);
  uri = g_file_get_uri (file);
  g_object_unref (file);

  if(!(doc = poppler_document_new_from_file (uri, NULL, &error)))
  {
    fprintf(stderr, "Could not open file %s: %s\n",
            filename, error ? error->message : "unknown error");
    g_clear_error(&error);
    goto cleanup;
  }

  n = poppler_document_get_n_pages(doc);

  for(i = 0; i < n; ++i)
  {
    PopplerPage *page;
    gchar       *text;
    /* Start from NULL so the match-info calls below stay well defined even
       if g_regex_match() bails out before storing a result. */
    GMatchInfo  *match_info = NULL;

    page = poppler_document_get_page(doc, i);
    if(!page)
      continue;

    text = poppler_page_get_text(page);
    g_object_unref(page);
    if(!text)
      continue;

    g_regex_match(regex, text, (GRegexMatchFlags) 0, &match_info);

    while(g_match_info_matches(match_info))
    {
      gint a = 0, b = 0;

      bold_on();
      printf("%s:%i ", filename, i + 1);
      bold_off();

      g_match_info_fetch_pos(match_info, 0, &a, &b);

      /* Widen [a, b) to the boundaries of the enclosing line. */
      for(; a >= 1 && *(text + a - 1) != '\n'; --a);
      for(; *(text + b) && *(text + b) != '\n'; ++b);

      /* Print out the entire line. */
      if(b > a)
        fwrite(text + a, 1, (size_t) (b - a), stdout);
      printf("\n");

      g_match_info_next (match_info, NULL);
    }
    g_match_info_free(match_info);

    g_free(text);
  }

  g_object_unref(doc);

cleanup:
  g_free(uri);
}
예제 #7
0
파일: pdf.c 프로젝트: Debian/gpdftext
/**
 Called repeatedly to add more text to the editor as each
 page is read from the PDF file, via g_timeout_add.

 Spelling needs to be turned off during loading; it is re-attached at the
 end if the queue's spell_state is set (language taken from GConf).

 This function is asynchronous - a background task.
 Don't do things with the progress bar or status bar whilst a load
 task could be running.

 data is the Equeue describing the load in progress.  Each call handles
 the page at index queue->c and then increments queue->c.

 Returns TRUE while there are more pages to load, FALSE once all pages
 have been handled or when the queue (or its ebook) is missing.
 */
static gboolean
load_pdf (gpointer data)
{
	GtkProgressBar * progressbar;
	GtkStatusbar * statusbar;
	gchar *page, * msg;
	gdouble fraction, step, width, height;
	PopplerPage * PDFPage;
	const gchar * str;
	gint pages;
	guint id;
	Equeue * queue;

	queue = (Equeue *)data;
	/* Validate before touching queue->ebook->builder; without an ebook
	 there is no builder to look any widget up from. */
	if (!queue || !queue->ebook)
		return FALSE;

	progressbar = GTK_PROGRESS_BAR(gtk_builder_get_object (queue->ebook->builder, "progressbar"));
	statusbar = GTK_STATUSBAR(gtk_builder_get_object (queue->ebook->builder, "statusbar"));
	id = gtk_statusbar_get_context_id (statusbar, PACKAGE);
	pages = poppler_document_get_n_pages (queue->ebook->PDFDoc);
	if (queue->c >= pages)
	{
#ifdef HAVE_GTKSPELL
		/* spell_attach is already a background task. */
		if (queue->spell_state)
		{
			GtkTextView * text_view;
			gchar * lang;

			text_view = GTK_TEXT_VIEW(gtk_builder_get_object (queue->ebook->builder, "textview"));
			lang = gconf_client_get_string (queue->ebook->client,
				queue->ebook->language.key, NULL);
			gtkspell_new_attach (text_view,
				(lang == NULL || *lang == '\0') ? NULL : lang, NULL);
			/* gconf_client_get_string returns an allocated string. */
			g_free (lang);
		}
#endif
		gtk_progress_bar_set_text (progressbar, "");
		gtk_progress_bar_set_fraction (progressbar, 0.0);
		if (queue->ebook->utf8_count > 0)
		{
			/* Translators: Please try to keep this string brief,
			 there often isn't a lot of room in the statusbar. */
			str = ngettext ("%ld non-UTF8 character was removed",
				"%ld non-UTF-8 characters were removed", queue->ebook->utf8_count);
			msg = g_strdup_printf (str, queue->ebook->utf8_count);
			id = gtk_statusbar_get_context_id (statusbar, PACKAGE);
			gtk_statusbar_push (statusbar, id, msg);
			g_free (msg);
		}
		else
		{
			gtk_statusbar_push (statusbar, id, _("Done"));
		}
		return FALSE;
	}
	PDFPage = poppler_document_get_page (queue->ebook->PDFDoc, queue->c);
	/* fraction never reaches 1.0 - allow room for spelling attachment. */
	if (queue->spell_state)
		step = 0.90/(gdouble)pages;
	else
		step = 0.99/(gdouble)pages;
	fraction = step * queue->c;
	/* update progress bar as we go */
	gtk_progress_bar_set_fraction (progressbar, fraction);
	/* A page poppler could not load is skipped but still counted. */
	if (PDFPage)
	{
		poppler_page_get_size (PDFPage, &width, &height);
		queue->rect->x2 = width;
		queue->rect->y2 = height;
#if POPPLER_CHECK_VERSION(0, 14, 1)
		page = poppler_page_get_selected_text (PDFPage, POPPLER_SELECTION_LINE, queue->rect);
#else
		page = poppler_page_get_text (PDFPage, POPPLER_SELECTION_LINE, queue->rect);
#endif
		set_text (queue->ebook, page, queue->lines, queue->pagenums, queue->hyphens);
		g_free (page);
		/* poppler_document_get_page hands out a reference we must drop. */
		g_object_unref (PDFPage);
	}
	queue->c++;
	/* add the page to the progressbar count. */
	msg = g_strdup_printf ("%d/%d", queue->c, pages);
	gtk_progress_bar_set_text (progressbar, msg);
	g_free (msg);
	/* more to do yet, so return TRUE to call me again. */
	return TRUE;
}