static gchar * extract_opf_path (const gchar *uri) { GMarkupParseContext *context; gchar *path = NULL; GError *error = NULL; GMarkupParser parser = { container_xml_start_element_handler, NULL, NULL, NULL, NULL }; /* Create parsing context */ context = g_markup_parse_context_new (&parser, 0, &path, NULL); /* Load the internal container file from the Zip archive, * and parse it to extract the .opf file to get metadata from */ tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error); g_markup_parse_context_free (context); if (error || !path) { g_warning ("Could not get EPUB container.xml file: %s\n", (error) ? error->message : "No error provided"); g_error_free (error); return NULL; } return path; }
/*
 * extract_opf:
 * @uri: URI of the EPUB (Zip) archive.
 * @opf_path: path of the .opf file inside the archive.
 * @info: extraction info used to create the OPF parse data.
 *
 * Tags the metadata as an nfo:EBook, parses the .opf file with the OPF
 * handlers, then gathers the text of the pages collected in the parse
 * data and, when that text is non-empty, adds it as
 * nie:plainTextContent.
 *
 * Returns: TRUE on success, FALSE when parsing the .opf file reported an
 * error (a warning is logged in that case).
 */
static gboolean
extract_opf (const gchar        *uri,
             const gchar        *opf_path,
             TrackerExtractInfo *info)
{
	GMarkupParser handlers = {
		opf_xml_start_element_handler,
		opf_xml_end_element_handler,
		opf_xml_text_handler,
		NULL,
		NULL
	};
	GMarkupParseContext *parse_ctx;
	GError *parse_error = NULL;
	OPFData *opf;
	gchar *opf_dir;
	gchar *text;

	g_debug ("Extracting OPF file contents from EPUB '%s'", uri);

	opf = opf_data_new (info);

	tracker_sparql_builder_predicate (opf->metadata, "a");
	tracker_sparql_builder_object (opf->metadata, "nfo:EBook");

	/* Run the OPF handlers over the .opf file stored in the archive */
	parse_ctx = g_markup_parse_context_new (&handlers, 0, opf, NULL);
	tracker_gsf_parse_xml_in_zip (uri, opf_path, parse_ctx, &parse_error);
	g_markup_parse_context_free (parse_ctx);

	if (parse_error != NULL) {
		g_warning ("Could not get EPUB '%s' file: %s\n",
		           opf_path,
		           (parse_error) ? parse_error->message : "No error provided");
		g_error_free (parse_error);
		opf_data_free (opf);

		return FALSE;
	}

	/* Page files are looked up relative to the directory of the .opf file */
	opf_dir = g_path_get_dirname (opf_path);
	text = extract_opf_contents (uri, opf_dir, opf->pages);
	g_free (opf_dir);

	/* Only store the content when some text was actually extracted */
	if (text != NULL && text[0] != '\0') {
		tracker_sparql_builder_predicate (opf->metadata, "nie:plainTextContent");
		tracker_sparql_builder_object_unvalidated (opf->metadata, text);
	}

	opf_data_free (opf);
	g_free (text);

	return TRUE;
}
/*
 * extract_oasis_content:
 * @uri: URI of the OASIS (Zip) document.
 * @total_bytes: byte budget for the extracted text; 0 means no content
 *               is requested.
 * @file_type: document flavour, passed on to the content handlers.
 * @metadata: resource that receives the nie:plainTextContent property.
 *
 * Parses "content.xml" from the archive with the content handlers and
 * stores the accumulated text in @metadata. The text is stored both when
 * parsing finished cleanly and when it stopped with the maximum-size
 * error; any other error only produces a warning.
 */
static void
extract_oasis_content (const gchar     *uri,
                       gulong           total_bytes,
                       ODTFileType      file_type,
                       TrackerResource *metadata)
{
	GMarkupParser handlers = {
		xml_start_element_handler_content,
		xml_end_element_handler_content,
		xml_text_handler_content,
		NULL,
		NULL
	};
	ODTContentParseInfo parse_info;
	GMarkupParseContext *parse_ctx;
	GError *parse_error = NULL;
	gchar *text = NULL;

	/* Nothing to do when no content was requested */
	if (total_bytes == 0) {
		return;
	}

	/* State shared with the content handlers */
	parse_info.current = ODT_TAG_TYPE_UNKNOWN;
	parse_info.file_type = file_type;
	parse_info.content = g_string_new ("");
	parse_info.bytes_pending = total_bytes;

	parse_ctx = g_markup_parse_context_new (&handlers, 0, &parse_info, NULL);

	/* Load content.xml from the Zip archive and feed it to the parser */
	tracker_gsf_parse_xml_in_zip (uri, "content.xml", parse_ctx, &parse_error);

	if (parse_error != NULL &&
	    !g_error_matches (parse_error, maximum_size_error_quark, 0)) {
		/* Genuine parse failure: discard whatever was collected */
		g_warning ("Got error parsing XML file: %s\n", parse_error->message);
		g_string_free (parse_info.content, TRUE);
	} else {
		/* Clean parse, or parsing stopped with the maximum-size error:
		 * keep the text gathered so far */
		text = g_string_free (parse_info.content, FALSE);
		tracker_resource_set_string (metadata, "nie:plainTextContent", text);
	}

	if (parse_error != NULL) {
		g_error_free (parse_error);
	}

	g_free (text);
	g_markup_parse_context_free (parse_ctx);
}
/*
 * extract_opf_contents:
 * @uri: URI of the EPUB (Zip) archive.
 * @content_prefix: directory, inside the archive, that page file names
 *                  are relative to.
 * @content_files: list of page file names.
 *
 * Parses each page file in turn with a text-only markup parser that
 * accumulates into a single string. The byte limit is read from the
 * Tracker configuration; iteration stops once the remaining limit in the
 * shared parse state reaches zero. A page that fails to parse is logged
 * and skipped.
 *
 * Returns: the accumulated text (possibly empty); the caller owns it and
 * releases it with g_free().
 */
static gchar *
extract_opf_contents (const gchar *uri,
                      const gchar *content_prefix,
                      GList       *content_files)
{
	GMarkupParser text_only_parser = {
		NULL,
		NULL,
		content_xml_text_handler,
		NULL,
		NULL
	};
	OPFContentData state = { 0 };
	TrackerConfig *config;
	GError *page_error = NULL;
	GList *node;

	config = tracker_main_get_config ();

	state.contents = g_string_new ("");
	state.limit = (gsize) tracker_config_get_max_bytes (config);

	g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", state.limit);

	node = content_files;

	while (node != NULL) {
		GMarkupParseContext *page_ctx;
		gchar *page_path;

		page_ctx = g_markup_parse_context_new (&text_only_parser, 0, &state, NULL);

		/* Page file is relative to OPF file location */
		page_path = g_build_filename (content_prefix, node->data, NULL);
		tracker_gsf_parse_xml_in_zip (uri, page_path, page_ctx, &page_error);

		if (page_error != NULL) {
			g_warning ("Error extracting EPUB contents (%s): %s",
			           page_path, page_error->message);
			g_clear_error (&page_error);
		}

		g_free (page_path);
		g_markup_parse_context_free (page_ctx);

		if (state.limit <= 0) {
			/* Reached plain text extraction limit */
			break;
		}

		node = node->next;
	}

	return g_string_free (state.contents, FALSE);
}
/*
 * tracker_extract_get_metadata:
 * @extract_info: extraction request; supplies the file and its mimetype
 *                and receives the resulting resource.
 *
 * Module entry point for OASIS documents. Builds an
 * nfo:PaginatedTextDocument resource, fills it by parsing "meta.xml"
 * from the archive, maps the mimetype to a document flavour, extracts
 * the text content up to the configured byte limit and hands the
 * resource over to @extract_info.
 *
 * Returns: always TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
{
	/* Known mimetypes, checked in this order (case-insensitively) */
	static const struct {
		const gchar *mime;
		ODTFileType  type;
	} known_types[] = {
		{ "application/vnd.oasis.opendocument.text",         FILE_TYPE_ODT },
		{ "application/vnd.oasis.opendocument.presentation", FILE_TYPE_ODP },
		{ "application/vnd.oasis.opendocument.spreadsheet",  FILE_TYPE_ODS },
		{ "application/vnd.oasis.opendocument.graphics",     FILE_TYPE_ODG }
	};
	GMarkupParser meta_handlers = {
		xml_start_element_handler_metadata,
		xml_end_element_handler_metadata,
		xml_text_handler_metadata,
		NULL,
		NULL
	};
	ODTMetadataParseInfo meta_info = { 0 };
	GMarkupParseContext *meta_ctx;
	TrackerResource *resource;
	TrackerConfig *config;
	ODTFileType file_type;
	const gchar *mime_used;
	gboolean recognised = FALSE;
	GFile *file;
	gchar *uri;
	gsize i;

	/* Lazily register the quark used to identify the maximum-size error */
	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
	}

	resource = tracker_resource_new (NULL);
	mime_used = tracker_extract_info_get_mimetype (extract_info);

	file = tracker_extract_info_get_file (extract_info);
	uri = g_file_get_uri (file);

	/* Setup conf */
	config = tracker_main_get_config ();

	g_debug ("Extracting OASIS metadata and contents from '%s'", uri);

	/* First, parse metadata */
	tracker_resource_add_uri (resource, "rdf:type", "nfo:PaginatedTextDocument");

	/* State shared with the metadata handlers */
	meta_info.metadata = resource;
	meta_info.current = ODT_TAG_TYPE_UNKNOWN;
	meta_info.uri = uri;

	/* Load meta.xml from the Zip archive and parse it; no error is
	 * requested back from this call */
	meta_ctx = g_markup_parse_context_new (&meta_handlers, 0, &meta_info, NULL);
	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", meta_ctx, NULL);
	g_markup_parse_context_free (meta_ctx);

	/* Map the mimetype onto a document flavour; first match wins */
	file_type = FILE_TYPE_INVALID;

	for (i = 0; i < sizeof (known_types) / sizeof (known_types[0]); i++) {
		if (g_ascii_strcasecmp (mime_used, known_types[i].mime) == 0) {
			file_type = known_types[i].type;
			recognised = TRUE;
			break;
		}
	}

	if (!recognised) {
		g_message ("Mime type was not recognised:'%s'", mime_used);
	}

	/* Extract content with the given limitations */
	extract_oasis_content (uri,
	                       tracker_config_get_max_bytes (config),
	                       file_type,
	                       resource);

	g_free (uri);

	tracker_extract_info_set_resource (extract_info, resource);
	g_object_unref (resource);

	return TRUE;
}