示例#1
0
static gchar *
extract_opf_path (const gchar *uri)
{
	GMarkupParseContext *context;
	gchar *path = NULL;
	GError *error = NULL;
	GMarkupParser parser = {
		container_xml_start_element_handler,
		NULL, NULL, NULL, NULL
	};

	/* Create parsing context */
	context = g_markup_parse_context_new (&parser, 0, &path, NULL);

	/* Load the internal container file from the Zip archive,
	 * and parse it to extract the .opf file to get metadata from
	 */
	tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error);
	g_markup_parse_context_free (context);

	if (error || !path) {
		g_warning ("Could not get EPUB container.xml file: %s\n",
		           (error) ? error->message : "No error provided");
		g_error_free (error);
		return NULL;
	}

	return path;
}
示例#2
0
static gboolean
extract_opf (const gchar          *uri,
             const gchar          *opf_path,
             TrackerExtractInfo   *info)
{
	GMarkupParseContext *context;
	OPFData *data = NULL;
	GError *error = NULL;
	gchar *dirname, *contents;
	GMarkupParser opf_parser = {
		opf_xml_start_element_handler,
		opf_xml_end_element_handler,
		opf_xml_text_handler,
		NULL, NULL
	};

	g_debug ("Extracting OPF file contents from EPUB '%s'", uri);

	data = opf_data_new (info);

	tracker_sparql_builder_predicate (data->metadata, "a");
	tracker_sparql_builder_object (data->metadata, "nfo:EBook");

	/* Create parsing context */
	context = g_markup_parse_context_new (&opf_parser, 0, data, NULL);

	/* Load the internal container file from the Zip archive,
	 * and parse it to extract the .opf file to get metadata from
	 */
	tracker_gsf_parse_xml_in_zip (uri, opf_path, context, &error);
	g_markup_parse_context_free (context);

	if (error) {
		g_warning ("Could not get EPUB '%s' file: %s\n", opf_path,
		           (error) ? error->message : "No error provided");
		g_error_free (error);
		opf_data_free (data);
		return FALSE;
	}

	dirname = g_path_get_dirname (opf_path);
	contents = extract_opf_contents (uri, dirname, data->pages);
	g_free (dirname);

	if (contents && *contents) {
		tracker_sparql_builder_predicate (data->metadata, "nie:plainTextContent");
		tracker_sparql_builder_object_unvalidated (data->metadata, contents);
	}

	opf_data_free (data);
	g_free (contents);

	return TRUE;
}
static void
extract_oasis_content (const gchar     *uri,
                       gulong           total_bytes,
                       ODTFileType      file_type,
                       TrackerResource *metadata)
{
	gchar *content = NULL;
	ODTContentParseInfo info;
	GMarkupParseContext *context;
	GError *error = NULL;
	GMarkupParser parser = {
		xml_start_element_handler_content,
		xml_end_element_handler_content,
		xml_text_handler_content,
		NULL,
		NULL
	};

	/* If no content requested, return */
	if (total_bytes == 0) {
		return;
	}

	/* Create parse info */
	info.current = ODT_TAG_TYPE_UNKNOWN;
	info.file_type = file_type;
	info.content = g_string_new ("");
	info.bytes_pending = total_bytes;

	/* Create parsing context */
	context = g_markup_parse_context_new (&parser, 0, &info, NULL);

	/* Load the internal XML file from the Zip archive, and parse it
	 * using the given context */
	tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);

	if (!error || g_error_matches (error, maximum_size_error_quark, 0)) {
		content = g_string_free (info.content, FALSE);
		tracker_resource_set_string (metadata, "nie:plainTextContent", content);
	} else {
		g_warning ("Got error parsing XML file: %s\n", error->message);
		g_string_free (info.content, TRUE);
	}

	if (error) {
		g_error_free (error);
	}

	g_free (content);
	g_markup_parse_context_free (context);
}
示例#4
0
static gchar *
extract_opf_contents (const gchar *uri,
                      const gchar *content_prefix,
                      GList       *content_files)
{
	OPFContentData content_data = { 0 };
	TrackerConfig *config;
	GError *error = NULL;
	GList *l;
	GMarkupParser xml_parser = {
		NULL, NULL,
		content_xml_text_handler,
		NULL, NULL
	};

	config = tracker_main_get_config ();

	content_data.contents = g_string_new ("");
	content_data.limit = (gsize) tracker_config_get_max_bytes (config);

	g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", content_data.limit);

	for (l = content_files; l; l = l->next) {
		GMarkupParseContext *context;
		gchar *path;

		context = g_markup_parse_context_new (&xml_parser, 0, &content_data, NULL);

		/* Page file is relative to OPF file location */
		path = g_build_filename (content_prefix, l->data, NULL);
		tracker_gsf_parse_xml_in_zip (uri, path, context, &error);

		if (error) {
			g_warning ("Error extracting EPUB contents (%s): %s",
				   path, error->message);
			g_clear_error (&error);
		}
		g_free (path);

		g_markup_parse_context_free (context);

		if (content_data.limit <= 0) {
			/* Reached plain text extraction limit */
			break;
		}
	}

	return g_string_free (content_data.contents, FALSE);
}
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
{
	TrackerResource *metadata;
	TrackerConfig *config;
	ODTMetadataParseInfo info = { 0 };
	ODTFileType file_type;
	GFile *file;
	gchar *uri;
	const gchar *mime_used;
	GMarkupParseContext *context;
	GMarkupParser parser = {
		xml_start_element_handler_metadata,
		xml_end_element_handler_metadata,
		xml_text_handler_metadata,
		NULL,
		NULL
	};

	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
	}

	metadata = tracker_resource_new (NULL);
	mime_used = tracker_extract_info_get_mimetype (extract_info);

	file = tracker_extract_info_get_file (extract_info);
	uri = g_file_get_uri (file);

	/* Setup conf */
	config = tracker_main_get_config ();

	g_debug ("Extracting OASIS metadata and contents from '%s'", uri);

	/* First, parse metadata */

	tracker_resource_add_uri (metadata, "rdf:type", "nfo:PaginatedTextDocument");

	/* Create parse info */
	info.metadata = metadata;
	info.current = ODT_TAG_TYPE_UNKNOWN;
	info.uri = uri;

	/* Create parsing context */
	context = g_markup_parse_context_new (&parser, 0, &info, NULL);

	/* Load the internal XML file from the Zip archive, and parse it
	 * using the given context */
	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
	g_markup_parse_context_free (context);

	if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
		file_type = FILE_TYPE_ODT;
	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
		file_type = FILE_TYPE_ODP;
	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
		file_type = FILE_TYPE_ODS;
	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.graphics") == 0) {
		file_type = FILE_TYPE_ODG;
	} else {
		g_message ("Mime type was not recognised:'%s'", mime_used);
		file_type = FILE_TYPE_INVALID;
	}

	/* Extract content with the given limitations */
	extract_oasis_content (uri,
	                       tracker_config_get_max_bytes (config),
	                       file_type,
	                       metadata);

	g_free (uri);

	tracker_extract_info_set_resource (extract_info, metadata);
	g_object_unref (metadata);

	return TRUE;
}