/**
 * Parse an Atom 1.0 content construct.
 *
 * Content elements carrying a "src" attribute are skipped. For inline
 * content the "type" attribute selects how the node is converted:
 * "html" / "text/html", no type / "text" / "text/...", or
 * "xhtml" / "application/xhtml+xml". Any other type yields NULL.
 *
 * @param cur	the XML node to be parsed
 * @param ctxt	a valid feed parser context (not used by this function)
 *
 * @returns string which must be freed by the caller, or NULL when there
 *          is no usable content
 */
static gchar *
atom10_parse_content_construct (xmlNodePtr cur, feedParserCtxtPtr ctxt)
{
	gchar *ret = NULL;

	if (xmlHasNsProp (cur, BAD_CAST"src", NULL )) {
		/* RFC 4287 says a feed must have a summary when there's a src
		   attribute in the content (and the content therefore empty).
		   We are already parsing the summary separately. RFC 4287 also
		   says an entry must contain one link element with
		   rel="alternate", so there's no point in parsing src and
		   setting it as link. */
		ret = NULL;
	} else {
		gchar *type;

		/* determine encoding mode */
		type = xml_get_ns_attribute (cur, "type", NULL);

		/* Contents need to be de-encoded and should not contain sub-tags. */
		if (type && (g_str_equal (type,"html") || !g_ascii_strcasecmp (type, "text/html"))) {
			ret = xhtml_extract (cur, 0, NULL);
		} else if (!type || !strcmp (type, "text") || !strncasecmp (type, "text/",5)) {
			/* Assume that "text/ *" files can be directly displayed..
			   kinda stated in the RFC */
			ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);

			/* An empty element yields NULL; only post-process real text
			   so that NULL never reaches the string helpers or "%s". */
			if (ret) {
				gchar *tmp;

				g_strchug (g_strchomp (ret));

				if (!type || !strcasecmp (type, "text"))
					tmp = atom10_mark_up_text_content (ret);
				else
					tmp = g_markup_printf_escaped ("<pre>%s</pre>", ret);
				g_free (ret);
				ret = tmp;
			}
		} else if (!strcmp(type,"xhtml") || !g_ascii_strcasecmp (type, "application/xhtml+xml")) {
			/* The spec says to only show the contents of the div tag
			   that MUST be present */
			ret = xhtml_extract (cur, 2, NULL);
		} else {
			/* Do nothing on unsupported content types. This allows
			   summaries to be used. */
			ret = NULL;
		}

		g_free (type);
	}

	return ret;
}
/**
 * Parse an Atom 1.0 text construct (any element whose "type" attribute
 * is absent, "text", "html" or "xhtml").
 *
 * @param cur		the XML node holding the text construct
 * @param htmlified	when TRUE the result is HTML; when FALSE all
 *			HTML tags are removed
 *
 * @returns the converted text, a fixed notice string when the type
 *          attribute has an unknown value, or NULL when a plain text
 *          node has no content
 */
static gchar *
atom10_parse_text_construct (xmlNodePtr cur, gboolean htmlified)
{
	gchar *result = NULL;
	gchar *type = xml_get_ns_attribute (cur, "type", NULL);

	if (type && !strcmp (type, "html")) {
		/* escaped HTML: decode it, optionally strip the markup again */
		result = xhtml_extract (cur, 0, NULL);
		if (!htmlified)
			result = unhtmlize (unxmlize (result));
	} else if (type && !strcmp (type, "xhtml")) {
		/* The spec says to show the contents of the div tag that
		   MUST be present */
		result = xhtml_extract (cur, 2, NULL);
		if (!htmlified)
			result = unhtmlize (result);
	} else if (type && strcmp (type, "text")) {
		/* Invalid Atom feed */
		result = g_strdup ("This attribute was invalidly specified in this Atom feed.");
	} else {
		/* no type or "text": plain character data without sub-tags */
		result = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		if (result) {
			g_strchug (g_strchomp (result));

			if (htmlified) {
				gchar *marked_up = atom10_mark_up_text_content (result);

				g_free (result);
				result = marked_up;
			}
		}
	}

	g_free (type);

	return result;
}
/**
 * Handle an iTunes namespace tag found at channel level.
 *
 * For "summary" and "subtitle" tags the extracted text is stored as the
 * subscription's "description" metadata, but only if no description is
 * set yet or the new text is longer than the existing one.
 *
 * @param ctxt	feed parser context providing the subscription
 * @param cur	the XML node of the tag
 */
static void
ns_itunes_parse_channel_tag (feedParserCtxtPtr ctxt, xmlNodePtr cur)
{
	gchar *tmp;
	const gchar *old;

	if (xmlStrcmp (cur->name, BAD_CAST"summary") &&
	    xmlStrcmp (cur->name, BAD_CAST"subtitle"))
		return;

	tmp = xhtml_extract (cur, 0, NULL);
	if (!tmp)
		return;	/* nothing extracted; strlen (NULL) below would be undefined */

	old = metadata_list_get (ctxt->subscription->metadata, "description");
	if (!old || strlen (old) < strlen (tmp))
		metadata_list_set (&ctxt->subscription->metadata, "description", tmp);

	g_free (tmp);
}
/**
 * Handle an iTunes namespace tag found inside a feed item.
 *
 * Recognised tags:
 *  - "author":   text content is appended as "author" metadata
 *  - "summary":  extracted text becomes the item description
 *  - "keywords": comma separated list; each entry is appended as
 *                "category" metadata with leading whitespace removed
 *
 * @param ctxt	feed parser context providing the item being filled
 * @param cur	the XML node of the tag
 */
static void
ns_itunes_parse_item_tag (feedParserCtxtPtr ctxt, xmlNodePtr cur)
{
	gchar *tmp;

	if (!xmlStrcmp (cur->name, BAD_CAST"author")) {
		tmp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		if (tmp) {
			ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "author", tmp);
			g_free (tmp);
		}
	} else if (!xmlStrcmp (cur->name, BAD_CAST"summary")) {
		tmp = xhtml_extract (cur, 0, NULL);
		item_set_description (ctxt->item, tmp);
		g_free (tmp);
	} else if (!xmlStrcmp (cur->name, BAD_CAST"keywords")) {
		gchar *keyword = tmp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		gchar *allocated = tmp;	/* start of the buffer, needed for freeing */

		/* parse comma separated list and strip leading spaces... */
		while (tmp) {
			/* terminate the current keyword in place and remember
			   where the next one starts (NULL after the last) */
			tmp = strchr (tmp, ',');
			if (tmp) {
				*tmp = 0;
				tmp++;
			}

			/* Decode the full code point: g_unichar_isspace() expects
			   a gunichar, not the first byte of a UTF-8 sequence. */
			while (g_unichar_isspace (g_utf8_get_char (keyword)))
				keyword = g_utf8_next_char (keyword);

			ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "category", keyword);
			keyword = tmp;
		}
		g_free (allocated);
	}
}
/* The following tags are not used, but had been recognized:
   "language",      <---- Not in atom 0.2 or 0.3. We should use xml:lang
   "lastBuildDate", <--- Where is this from?
   "issued",        <-- Not in the specs for feeds
   "created",       <---- Not in the specs for feeds
*/

/**
 * Parse a content construct of an Atom 0.2/0.3 ("PIE") feed.
 *
 * If a "mode" attribute is present it decides how the node is handled
 * ("escaped", "xml", "base64" - unsupported, "multipart/alternative").
 * Otherwise the "type" attribute is evaluated as a MIME type.
 *
 * @param cur	the XML node to be parsed (must not be NULL)
 *
 * @returns newly allocated string to be freed by the caller, or NULL
 *          when the content is empty or of an unsupported mode/type
 */
gchar*
pie_parse_content_construct(xmlNodePtr cur)
{
	gchar *mode, *type, *ret;

	g_assert(NULL != cur);
	ret = NULL;

	/* determine encoding mode */
	mode = xml_get_attribute (cur, "mode");
	type = xml_get_attribute (cur, "type");

	/* Modes are used in older versions of ATOM, including 0.3. It
	   does not exist in the newer IETF drafts. */
	if(NULL != mode) {
		if(!strcmp(mode, "escaped")) {
			ret = xhtml_extract (cur, 0, NULL);
		} else if(!strcmp(mode, "xml")) {
			ret = xhtml_extract (cur, 1,NULL);
		} else if(!strcmp(mode, "base64")) {
			g_warning("Base64 encoded <content> in Atom feeds not supported!\n");
		} else if(!strcmp(mode, "multipart/alternative")) {
			/* descend into the first child and parse that instead */
			if(NULL != cur->xmlChildrenNode)
				ret = pie_parse_content_construct(cur->xmlChildrenNode);
		}
		g_free(mode);
	} else {
		/* some feeds don't specify a mode but a MIME type in the
		   type attribute... */
		/* not sure what MIME types are necessary... */

		if(NULL == type || !g_ascii_strcasecmp(type, "TEXT") || !strcmp(type, "text/plain")) {
			/* Things that need to be de-encoded and should not contain
			   sub-tags. The atom 0.3 spec says that the default type
			   MUST be considered to be text/plain (the type tag is
			   required in 0.2), so plain text is escaped and wrapped
			   in pre tags. */
			gchar *tmp;

			tmp = (gchar *)xmlNodeListGetString(cur->doc, cur->xmlChildrenNode, 1);
			/* empty element: keep ret NULL instead of formatting NULL with "%s" */
			if(NULL != tmp) {
				ret = g_markup_printf_escaped("<div xmlns=\"http://www.w3.org/1999/xhtml\"><pre>%s</pre></div>", tmp);
				g_free(tmp);
			}
		/* Next are things that contain subtags */
		} else if(!g_ascii_strcasecmp(type, "HTML") || !strcmp(type, "text/html")) {
			ret = xhtml_extract (cur, 0,"http://default.base.com/");
		} else if(/* HTML types */
		          !g_ascii_strcasecmp(type, "xhtml") || !strcmp(type, "application/xhtml+xml")) {
			ret = xhtml_extract (cur, 1,"http://default.base.com/");
		}
	}

	g_free(type);

	return ret;
}
static GrssFeedItem* parse_rss_item (FeedRssHandler *parser, GrssFeedChannel *feed, xmlDocPtr doc, xmlNodePtr cur) { gchar *tmp; gchar *tmp2; gchar *tmp3; time_t t; GrssFeedItem *item; g_assert (cur != NULL); item = grss_feed_item_new (feed); /* try to get an item about id */ tmp = (gchar*) xmlGetProp (cur, BAD_CAST"about"); if (tmp) { grss_feed_item_set_id (item, tmp); grss_feed_item_set_source (item, tmp); g_free (tmp); } cur = cur->xmlChildrenNode; while (cur) { if (cur->type != XML_ELEMENT_NODE || !cur->name) { cur = cur->next; continue; } /* check namespace of this tag */ if (cur->ns) { if (ns_handler_item (parser->priv->handler, item, cur)) { cur = cur->next; continue; } } if (!xmlStrcmp (cur->name, BAD_CAST"category")) { tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1); if (tmp) { grss_feed_item_add_category (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"author")) { tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1); if (tmp) { grss_feed_item_set_author (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"comments")) { tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1); if (tmp) { grss_feed_item_set_comments_url (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"pubDate")) { tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1); if (tmp) { t = date_parse_RFC822 (tmp); grss_feed_item_set_publish_time (item, t); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"enclosure")) { /* RSS 0.93 allows multiple enclosures */ tmp = (gchar*) xmlGetProp (cur, BAD_CAST"url"); if (tmp) { gchar *type = (gchar*) xmlGetProp (cur, BAD_CAST"type"); gssize length = 0; GrssFeedEnclosure *enclosure; tmp2 = (gchar*) xmlGetProp (cur, BAD_CAST"length"); if (tmp2) { length = atol (tmp2); xmlFree (tmp2); } tmp3 = (gchar*) grss_feed_channel_get_homepage (feed); if ((strstr (tmp, "://") == NULL) && (tmp3 != NULL) && (strstr (tmp3, "://") != NULL)) { /* 
add base URL if necessary and possible */ tmp2 = g_strdup_printf ("%s/%s", tmp3, tmp); xmlFree (tmp); tmp = tmp2; } enclosure = grss_feed_enclosure_new (tmp); grss_feed_enclosure_set_format (enclosure, type); grss_feed_enclosure_set_length (enclosure, length); grss_feed_item_add_enclosure (item, enclosure); xmlFree (tmp); xmlFree (type); } } else if (!xmlStrcmp (cur->name, BAD_CAST"guid")) { if (!grss_feed_item_get_id (item)) { tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1); if (tmp) { if (strlen (tmp) > 0) { grss_feed_item_set_id (item, tmp); tmp2 = (gchar*) xmlGetProp (cur, BAD_CAST"isPermaLink"); if (!grss_feed_item_get_source (item) && (tmp2 == NULL || g_str_equal (tmp2, "true"))) grss_feed_item_set_source (item, tmp); /* Per the RSS 2.0 spec. */ if (tmp2) xmlFree (tmp2); } xmlFree (tmp); } } } else if (!xmlStrcmp (cur->name, BAD_CAST"title")) { tmp = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, TRUE)); if (tmp) { grss_feed_item_set_title (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"link")) { tmp = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, TRUE)); if (tmp) { grss_feed_item_set_source (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"description")) { tmp = xhtml_extract (cur, 0, NULL); if (tmp) { /* don't overwrite content:encoded descriptions... */ if (!grss_feed_item_get_description (item)) grss_feed_item_set_description (item, tmp); g_free (tmp); } } else if (!xmlStrcmp (cur->name, BAD_CAST"source")) { tmp = (gchar*) xmlGetProp (cur, BAD_CAST"url"); tmp2 = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1)); if (tmp) { grss_feed_item_set_real_source (item, g_strchomp (tmp), tmp2 ? g_strchomp (tmp2) : NULL); g_free (tmp); } if (tmp2) g_free (tmp2); } cur = cur->next; } return item; }