/**
 * tracker_extract_get_metadata:
 * @info: extraction request; supplies the file and receives the resource.
 *
 * Extractor entry point for icon files. Builds a resource typed as both
 * nfo:Image and nfo:Icon and, when find_max_width_and_height() succeeds,
 * records the non-zero dimensions it reported.
 *
 * Returns: always %TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	TrackerResource *icon;
	gchar *icon_uri;
	guint widest;
	guint tallest;
	gboolean have_sizes;

	icon_uri = g_file_get_uri (tracker_extract_info_get_file (info));

	icon = tracker_resource_new (NULL);

	/* A Windows Icon file can bundle the same icon at several sizes,
	 * so no single width/height pair describes it. We publish the
	 * largest values found instead.
	 */
	tracker_resource_add_uri (icon, "rdf:type", "nfo:Image");
	tracker_resource_add_uri (icon, "rdf:type", "nfo:Icon");

	have_sizes = find_max_width_and_height (icon_uri, &widest, &tallest);

	/* The size variables are only read when the lookup succeeded. */
	if (have_sizes && widest > 0) {
		tracker_resource_set_int64 (icon, "nfo:width", (gint64) widest);
	}

	if (have_sizes && tallest > 0) {
		tracker_resource_set_int64 (icon, "nfo:height", (gint64) tallest);
	}

	g_free (icon_uri);

	tracker_extract_info_set_resource (info, icon);
	g_object_unref (icon);

	return TRUE;
}
/**
 * tracker_extract_get_metadata:
 * @info: extraction request; supplies the file and receives the resource.
 *
 * Extractor entry point for XPS files. Opens the file with libgxps,
 * loads document index 0 and publishes its page count on a resource
 * typed nfo:PaginatedTextDocument.
 *
 * Returns: %TRUE when a resource was set on @info, %FALSE when the file
 * could not be opened or its first document could not be read (a
 * warning is logged in both cases).
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	GError *err = NULL;
	GFile *source;
	GXPSFile *package;
	GXPSDocument *first_doc;
	TrackerResource *doc_resource;
	gchar *path;

	source = tracker_extract_info_get_file (info);
	package = gxps_file_new (source, &err);

	/* The path is only needed for the warning messages below. */
	path = g_file_get_path (source);

	if (err != NULL) {
		g_warning ("Unable to open '%s': %s", path, err->message);
		goto fail;
	}

	first_doc = gxps_file_get_document (package, 0, &err);
	g_object_unref (package);

	if (err != NULL) {
		g_warning ("Unable to read '%s': %s", path, err->message);
		goto fail;
	}

	doc_resource = tracker_resource_new (NULL);
	tracker_resource_add_uri (doc_resource, "rdf:type", "nfo:PaginatedTextDocument");
	tracker_resource_set_int64 (doc_resource, "nfo:pageCount",
	                            gxps_document_get_n_pages (first_doc));

	g_object_unref (first_doc);
	g_free (path);

	tracker_extract_info_set_resource (info, doc_resource);
	g_object_unref (doc_resource);

	return TRUE;

fail:
	/* Shared failure exit: the warning has already been emitted. */
	g_error_free (err);
	g_free (path);

	return FALSE;
}
/**
 * tracker_extract_get_metadata:
 * @info: extraction request; supplies the file and receives the resource.
 *
 * Extractor entry point for DVI files. Opens the file through
 * mdvi_init_context() and publishes the page count and, when present,
 * the file id (as nie:comment) on a resource typed
 * nfo:PaginatedTextDocument.
 *
 * Returns: %TRUE when a resource was set on @info, %FALSE when the DVI
 * context could not be created (a warning is logged).
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	TrackerResource *resource;
	GFile *file;
	gchar *filename;
	DviContext *context;

	file = tracker_extract_info_get_file (info);
	filename = g_file_get_path (file);

	context = mdvi_init_context (filename);

	if (context == NULL) {
		g_warning ("Could not open dvi file '%s'\n", filename);
		g_free (filename);
		return FALSE;
	}

	resource = tracker_resource_new (NULL);
	tracker_resource_add_uri (resource, "rdf:type", "nfo:PaginatedTextDocument");

	tracker_resource_set_int64 (resource, "nfo:pageCount", context->npages);

	if (context->fileid) {
		tracker_resource_set_string (resource, "nie:comment", context->fileid);
	}

	mdvi_destroy_context (context);

	/* The path string belongs to this function (the failure path above
	 * frees it too), so it must be released on success as well. It is
	 * freed only after the context is gone, so the context can never
	 * observe a dangling path.
	 */
	g_free (filename);

	tracker_extract_info_set_resource (info, resource);
	g_object_unref (resource);

	return TRUE;
}
/**
 * tracker_extract_get_metadata:
 * @info: extraction request; supplies the file and receives the resource.
 *
 * Extractor entry point for HTML files. Runs libxml2's HTML SAX parser
 * over the file with the parser_start_element / parser_end_element /
 * parser_characters callbacks, which accumulate a title and plain text
 * into the parser state. Non-empty (after stripping) results are
 * published as nie:title and nie:plainTextContent on a resource typed
 * nfo:HtmlDocument.
 *
 * Returns: always %TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	TrackerResource *html_resource;
	TrackerConfig *settings;
	GFile *source;
	htmlDocPtr parsed;
	parser_data state;
	gchar *path;
	const gchar *title_text;
	const gchar *body_text;
	/* Positional initializer: only the element and character callbacks
	 * are installed, every other slot stays unset.
	 */
	xmlSAXHandler sax_callbacks = {
		NULL,                 /* internalSubset */
		NULL,                 /* isStandalone */
		NULL,                 /* hasInternalSubset */
		NULL,                 /* hasExternalSubset */
		NULL,                 /* resolveEntity */
		NULL,                 /* getEntity */
		NULL,                 /* entityDecl */
		NULL,                 /* notationDecl */
		NULL,                 /* attributeDecl */
		NULL,                 /* elementDecl */
		NULL,                 /* unparsedEntityDecl */
		NULL,                 /* setDocumentLocator */
		NULL,                 /* startDocument */
		NULL,                 /* endDocument */
		parser_start_element, /* startElement */
		parser_end_element,   /* endElement */
		NULL,                 /* reference */
		parser_characters,    /* characters */
		NULL,                 /* ignorableWhitespace */
		NULL,                 /* processingInstruction */
		NULL,                 /* comment */
		NULL,                 /* warning */
		NULL,                 /* error */
		NULL,                 /* fatalError */
		NULL,                 /* getParameterEntity */
		NULL,                 /* cdataBlock */
		NULL,                 /* externalSubset */
		1,                    /* initialized */
		NULL,                 /* private */
		NULL,                 /* startElementNsSAX2Func */
		NULL,                 /* endElementNsSAX2Func */
		NULL                  /* xmlStructuredErrorFunc */
	};

	source = tracker_extract_info_get_file (info);

	html_resource = tracker_resource_new (NULL);
	tracker_resource_add_uri (html_resource, "rdf:type", "nfo:HtmlDocument");

	/* Parser state handed to the SAX callbacks as user data. */
	state.metadata = html_resource;
	state.current = -1;
	state.in_body = FALSE;
	state.plain_text = g_string_new (NULL);
	state.title = g_string_new (NULL);

	settings = tracker_main_get_config ();
	state.n_bytes_remaining = tracker_config_get_max_bytes (settings);

	path = g_file_get_path (source);
	parsed = htmlSAXParseFile (path, NULL, &sax_callbacks, &state);
	g_free (path);

	if (parsed != NULL) {
		xmlFreeDoc (parsed);
	}

	/* Strip in place; the buffers are released right after use. */
	g_strstrip (state.plain_text->str);
	g_strstrip (state.title->str);

	title_text = state.title->str;
	body_text = state.plain_text->str;

	if (title_text != NULL && title_text[0] != '\0') {
		tracker_resource_set_string (html_resource, "nie:title", title_text);
	}

	if (body_text != NULL && body_text[0] != '\0') {
		tracker_resource_set_string (html_resource, "nie:plainTextContent", body_text);
	}

	g_string_free (state.plain_text, TRUE);
	g_string_free (state.title, TRUE);

	tracker_extract_info_set_resource (info, html_resource);
	g_object_unref (html_resource);

	return TRUE;
}
/**
 * tracker_extract_get_metadata:
 * @extract_info: extraction request; supplies the file and mime type and
 *                receives the resource.
 *
 * Extractor entry point for OASIS OpenDocument files. First parses
 * "meta.xml" from the zip container with the metadata markup handlers,
 * then maps the mime type to a file type and hands content extraction
 * to extract_oasis_content(), limited to the configured maximum number
 * of bytes.
 *
 * Returns: always %TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
{
	/* Mime types checked in this order; comparison ignores ASCII case. */
	static const struct {
		const gchar *mime;
		ODTFileType type;
	} known_types[] = {
		{ "application/vnd.oasis.opendocument.text", FILE_TYPE_ODT },
		{ "application/vnd.oasis.opendocument.presentation", FILE_TYPE_ODP },
		{ "application/vnd.oasis.opendocument.spreadsheet", FILE_TYPE_ODS },
		{ "application/vnd.oasis.opendocument.graphics", FILE_TYPE_ODG }
	};
	TrackerResource *doc_resource;
	TrackerConfig *settings;
	ODTMetadataParseInfo parse_state = { 0 };
	ODTFileType file_type;
	GFile *source;
	gchar *doc_uri;
	const gchar *mime_used;
	GMarkupParseContext *markup;
	GMarkupParser meta_parser = {
		xml_start_element_handler_metadata,
		xml_end_element_handler_metadata,
		xml_text_handler_metadata,
		NULL,
		NULL
	};
	gboolean recognised;
	gsize idx;

	/* Register the error quark on first use. */
	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
	}

	doc_resource = tracker_resource_new (NULL);
	mime_used = tracker_extract_info_get_mimetype (extract_info);

	source = tracker_extract_info_get_file (extract_info);
	doc_uri = g_file_get_uri (source);

	settings = tracker_main_get_config ();

	g_debug ("Extracting OASIS metadata and contents from '%s'", doc_uri);

	/* Step 1: metadata from meta.xml */
	tracker_resource_add_uri (doc_resource, "rdf:type", "nfo:PaginatedTextDocument");

	parse_state.metadata = doc_resource;
	parse_state.current = ODT_TAG_TYPE_UNKNOWN;
	parse_state.uri = doc_uri;

	markup = g_markup_parse_context_new (&meta_parser, 0, &parse_state, NULL);

	/* Feed the XML member stored inside the zip archive to the
	 * markup context created above.
	 */
	tracker_gsf_parse_xml_in_zip (doc_uri, "meta.xml", markup, NULL);
	g_markup_parse_context_free (markup);

	/* Step 2: pick the document flavour from the mime type */
	file_type = FILE_TYPE_INVALID;
	recognised = FALSE;

	for (idx = 0; idx < G_N_ELEMENTS (known_types); idx++) {
		if (g_ascii_strcasecmp (mime_used, known_types[idx].mime) == 0) {
			file_type = known_types[idx].type;
			recognised = TRUE;
			break;
		}
	}

	if (!recognised) {
		g_message ("Mime type was not recognised:'%s'", mime_used);
	}

	/* Step 3: content, bounded by the configured byte limit */
	extract_oasis_content (doc_uri,
	                       tracker_config_get_max_bytes (settings),
	                       file_type,
	                       doc_resource);

	g_free (doc_uri);

	tracker_extract_info_set_resource (extract_info, doc_resource);
	g_object_unref (doc_resource);

	return TRUE;
}
/**
 * tracker_extract_get_metadata:
 * @info: extraction request; supplies the file and receives the resource.
 *
 * Extractor entry point for PDF files. The file is opened and mapped
 * read-only into memory (empty files are passed to Poppler as a
 * NULL/0 buffer), then loaded with poppler_document_new_from_data().
 *
 * On success the resource is typed nfo:PaginatedTextDocument and filled
 * with the Poppler document properties, merged with embedded XMP
 * metadata when the document carries any, plus keyword tags, the page
 * count, plain text content (bounded by the configured maximum number
 * of bytes) and whatever read_outline() contributes.
 *
 * A document Poppler reports as encrypted still yields a resource,
 * carrying only the type and nfo:isContentEncrypted.
 *
 * The file mapping is released on every exit taken after it was
 * created, including the Poppler failure exits.
 *
 * Returns: %TRUE when a resource was set on @info (including the
 * encrypted case); %FALSE when the file could not be opened, stat'ed,
 * mapped or parsed (a warning is logged).
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	TrackerConfig *config;
	GTime creation_date;
	GError *error = NULL;
	TrackerResource *metadata;
	TrackerXmpData *xd = NULL;
	PDFData pd = { 0 }; /* actual data */
	PDFData md = { 0 }; /* for merging */
	PopplerDocument *document;
	gchar *xml = NULL;
	gchar *content, *uri;
	guint n_bytes;
	GPtrArray *keywords;
	guint i;
	GFile *file;
	gchar *filename;
	int fd;
	gchar *contents = NULL;
	gsize len;
	struct stat st;

	file = tracker_extract_info_get_file (info);
	filename = g_file_get_path (file);

	fd = tracker_file_open_fd (filename);

	if (fd == -1) {
		g_warning ("Could not open pdf file '%s': %s\n",
		           filename,
		           g_strerror (errno));
		g_free (filename);
		return FALSE;
	}

	if (fstat (fd, &st) == -1) {
		g_warning ("Could not fstat pdf file '%s': %s\n",
		           filename,
		           g_strerror (errno));
		close (fd);
		g_free (filename);
		return FALSE;
	}

	if (st.st_size == 0) {
		/* Nothing to map for an empty file. */
		contents = NULL;
		len = 0;
	} else {
		contents = (gchar *) mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
		if (contents == NULL || contents == MAP_FAILED) {
			g_warning ("Could not mmap pdf file '%s': %s\n",
			           filename,
			           g_strerror (errno));
			close (fd);
			g_free (filename);
			return FALSE;
		}
		len = st.st_size;
	}

	g_free (filename);
	uri = g_file_get_uri (file);

	document = poppler_document_new_from_data (contents, len, NULL, &error);

	if (error) {
		if (error->code == POPPLER_ERROR_ENCRYPTED) {
			/* Encrypted documents are still reported, flagged as
			 * such, without any further metadata.
			 */
			metadata = tracker_resource_new (NULL);

			tracker_resource_add_uri (metadata, "rdf:type", "nfo:PaginatedTextDocument");
			tracker_resource_set_boolean (metadata, "nfo:isContentEncrypted", TRUE);

			tracker_extract_info_set_resource (info, metadata);
			g_object_unref (metadata);

			g_error_free (error);
			g_free (uri);
			/* Release the mapping as well as the descriptor. */
			if (contents) {
				munmap (contents, len);
			}
			close (fd);

			return TRUE;
		} else {
			g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
			           uri,
			           error->message ? error->message : "no error given");

			g_error_free (error);
			g_free (uri);
			/* Release the mapping as well as the descriptor. */
			if (contents) {
				munmap (contents, len);
			}
			close (fd);

			return FALSE;
		}
	}

	if (!document) {
		g_warning ("Could not create PopplerDocument from uri:'%s', "
		           "NULL returned without an error",
		           uri);
		g_free (uri);
		/* Release the mapping as well as the descriptor. */
		if (contents) {
			munmap (contents, len);
		}
		close (fd);
		return FALSE;
	}

	metadata = tracker_resource_new (NULL);
	tracker_resource_add_uri (metadata, "rdf:type", "nfo:PaginatedTextDocument");

	g_object_get (document,
	              "title", &pd.title,
	              "author", &pd.author,
	              "subject", &pd.subject,
	              "keywords", &pd.keywords,
	              "creation-date", &creation_date,
	              "metadata", &xml,
	              NULL);

	if (creation_date > 0) {
		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
	}

	keywords = g_ptr_array_new_with_free_func ((GDestroyNotify) g_free);

	if (xml && *xml &&
	    (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
		/* The casts here are well understood and known */
		md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title);
		md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject);
		md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
		md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);

		write_pdf_data (md, metadata, keywords);

		if (xd->keywords) {
			tracker_keywords_parse (keywords, xd->keywords);
		}

		if (xd->pdf_keywords) {
			tracker_keywords_parse (keywords, xd->pdf_keywords);
		}

		if (xd->publisher) {
			TrackerResource *publisher = tracker_extract_new_contact (xd->publisher);
			tracker_resource_set_relation (metadata, "nco:publisher", publisher);
			g_object_unref (publisher);
		}

		if (xd->type) {
			tracker_resource_set_string (metadata, "dc:type", xd->type);
		}

		if (xd->format) {
			tracker_resource_set_string (metadata, "dc:format", xd->format);
		}

		if (xd->identifier) {
			tracker_resource_set_string (metadata, "dc:identifier", xd->identifier);
		}

		if (xd->source) {
			tracker_resource_set_string (metadata, "dc:source", xd->source);
		}

		if (xd->language) {
			tracker_resource_set_string (metadata, "dc:language", xd->language);
		}

		if (xd->relation) {
			tracker_resource_set_string (metadata, "dc:relation", xd->relation);
		}

		if (xd->coverage) {
			tracker_resource_set_string (metadata, "dc:coverage", xd->coverage);
		}

		if (xd->license) {
			tracker_resource_set_string (metadata, "nie:license", xd->license);
		}

		if (xd->make || xd->model) {
			TrackerResource *equipment = tracker_extract_new_equipment (xd->make, xd->model);
			tracker_resource_set_relation (metadata, "nfo:equipment", equipment);
			g_object_unref (equipment);
		}

		if (xd->orientation) {
			tracker_resource_set_string (metadata, "nfo:orientation", xd->orientation);
		}

		if (xd->rights) {
			tracker_resource_set_string (metadata, "nie:copyright", xd->rights);
		}

		if (xd->white_balance) {
			tracker_resource_set_string (metadata, "nmm:whiteBalance", xd->white_balance);
		}

		if (xd->fnumber) {
			gdouble value;

			value = g_strtod (xd->fnumber, NULL);
			tracker_resource_set_double (metadata, "nmm:fnumber", value);
		}

		if (xd->flash) {
			tracker_resource_set_string (metadata, "nmm:flash", xd->flash);
		}

		if (xd->focal_length) {
			gdouble value;

			value = g_strtod (xd->focal_length, NULL);
			tracker_resource_set_double (metadata, "nmm:focalLength", value);
		}

		/* Question: Shouldn't xd->Artist be merged with md.author instead? */

		if (xd->artist || xd->contributor) {
			TrackerResource *artist;
			const gchar *artist_name;

			artist_name = tracker_coalesce_strip (2, xd->artist, xd->contributor);
			artist = tracker_extract_new_contact (artist_name);
			tracker_resource_set_relation (metadata, "nco:contributor", artist);
			g_object_unref (artist);
		}

		if (xd->exposure_time) {
			gdouble value;

			value = g_strtod (xd->exposure_time, NULL);
			tracker_resource_set_double (metadata, "nmm:exposureTime", value);
		}

		if (xd->iso_speed_ratings) {
			gdouble value;

			value = g_strtod (xd->iso_speed_ratings, NULL);
			tracker_resource_set_double (metadata, "nmm:isoSpeed", value);
		}

		if (xd->description) {
			tracker_resource_set_string (metadata, "nie:description", xd->description);
		}

		if (xd->metering_mode) {
			tracker_resource_set_string (metadata, "nmm:meteringMode", xd->metering_mode);
		}

		if (xd->address || xd->state || xd->country || xd->city ||
		    xd->gps_altitude || xd->gps_latitude || xd->gps_longitude) {
			TrackerResource *location = tracker_extract_new_location (xd->address,
			                                                          xd->state,
			                                                          xd->city,
			                                                          xd->country,
			                                                          xd->gps_altitude,
			                                                          xd->gps_latitude,
			                                                          xd->gps_longitude);

			tracker_resource_set_relation (metadata, "slo:location", location);
			g_object_unref (location);
		}

		if (xd->regions) {
			tracker_xmp_apply_regions_to_resource (metadata, xd);
		}

		tracker_xmp_free (xd);
	} else {
		/* So if we are here we have NO XMP data and we just
		 * write what we know from Poppler.
		 */
		write_pdf_data (pd, metadata, keywords);
	}

	/* Every collected keyword becomes a tag on the resource. */
	for (i = 0; i < keywords->len; i++) {
		TrackerResource *tag;
		const gchar *p;

		p = g_ptr_array_index (keywords, i);
		tag = tracker_extract_new_tag (p);

		tracker_resource_add_relation (metadata, "nao:hasTag", tag);
		g_object_unref (tag);
	}
	g_ptr_array_free (keywords, TRUE);

	tracker_resource_set_int64 (metadata, "nfo:pageCount", poppler_document_get_n_pages(document));

	config = tracker_main_get_config ();
	n_bytes = tracker_config_get_max_bytes (config);
	content = extract_content_text (document, n_bytes);

	if (content) {
		tracker_resource_set_string (metadata, "nie:plainTextContent", content);
		g_free (content);
	}

	read_outline (document, metadata);

	g_free (xml);
	g_free (pd.keywords);
	g_free (pd.title);
	g_free (pd.subject);
	g_free (pd.creation_date);
	g_free (pd.author);
	g_free (pd.date);
	g_free (uri);

	/* Drop the document before unmapping the buffer it was created from. */
	g_object_unref (document);

	if (contents) {
		munmap (contents, len);
	}

	close (fd);

	tracker_extract_info_set_resource (info, metadata);
	g_object_unref (metadata);

	return TRUE;
}