Exemplo n.º 1
0
/**
 * This parses an Atom content construct.
 *
 * The "type" attribute selects how the element body is interpreted:
 * "html"/"text/html" and "xhtml"/"application/xhtml+xml" are handed to
 * xhtml_extract(), a missing type, "text" or any "text/..." MIME type is
 * read as plain text, and every other type is ignored.
 *
 * @param cur	the XML node to be parsed
 * @param ctxt 	a valid feed parser context (not used by this function)
 * @returns newly allocated string which must be freed by the caller,
 *          or NULL when there is no usable content.
 */
static gchar *
atom10_parse_content_construct (xmlNodePtr cur, feedParserCtxtPtr ctxt)
{
	gchar *ret = NULL;
	
	if (xmlHasNsProp (cur, BAD_CAST"src", NULL )) {
		/*
		   RFC 4287 says a feed must have a summary when there's
		   a src attribute in the content (and the content therefore
		   empty). We are already parsing the summary separately.

		   RFC 4287 also says an entry must contain one link element
		   with rel="alternate", so there's no point in parsing
		   src and setting it as link.
		*/
		ret = NULL;
	} else {
		gchar *type;

		/* determine encoding mode */
		type = xml_get_ns_attribute (cur, "type", NULL);
		
		/* Contents need to be de-encoded and should not contain sub-tags.*/
		if (type && (g_str_equal (type,"html") || !g_ascii_strcasecmp (type, "text/html"))) {
			ret = xhtml_extract (cur, 0, NULL);
		} else if (!type || !strcmp (type, "text") || !strncasecmp (type, "text/",5)) {
			/* Assume that "text/ *" files can be directly displayed.. kinda stated in the RFC */
			ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
			
			/* An empty element yields NULL; do not trim or format it,
			   just report "no content" to the caller. */
			if (ret) {
				gchar *tmp;

				g_strchug (g_strchomp (ret));
				
				/* Plain "text" gets marked up, other text MIME types
				   are shown escaped inside a <pre> block. */
				if (!type || !strcasecmp (type, "text"))
					tmp = atom10_mark_up_text_content (ret);
				else
					tmp = g_markup_printf_escaped ("<pre>%s</pre>", ret);
				g_free (ret);
				ret = tmp;
			}
		} else if (!strcmp(type,"xhtml") || !g_ascii_strcasecmp (type, "application/xhtml+xml")) {
			/* The spec says to only show the contents of the div tag that MUST be present */
			ret = xhtml_extract (cur, 2, NULL);
		} else {
			/* Do nothing on unsupported content types. This allows summaries to be used. */
			ret = NULL;
		}
		
		g_free (type);
	}
	
	return ret;
}
Exemplo n.º 2
0
/**
 * Parse an Atom 1.0 text construct (any element whose body is governed
 * by a "type" attribute of "text", "html" or "xhtml").
 *
 * @param cur		the XML node holding the text construct
 * @param htmlified	When TRUE the result is HTML; when FALSE
 * 			all HTML tags are stripped from the result
 *
 * @returns an escaped version of the text construct, to be freed by
 *          the caller; may be NULL if nothing could be extracted.
 */
static gchar *
atom10_parse_text_construct (xmlNodePtr cur, gboolean htmlified)
{
	gchar *result = NULL;
	/* the "type" attribute decides how the body is encoded */
	gchar *type = xml_get_ns_attribute (cur, "type", NULL);

	if (type == NULL || strcmp (type, "text") == 0) {
		/* Plain text: must be de-encoded and should not contain sub-tags. */
		result = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		if (result != NULL) {
			g_strchug (g_strchomp (result));

			if (htmlified) {
				gchar *marked_up = atom10_mark_up_text_content (result);

				g_free (result);
				result = marked_up;
			}
		}
	} else if (strcmp (type, "html") == 0) {
		result = xhtml_extract (cur, 0, NULL);
		if (!htmlified) {
			result = unxmlize (result);
			result = unhtmlize (result);
		}
	} else if (strcmp (type, "xhtml") == 0) {
		/* The spec says to show the contents of the div tag that MUST be present */
		result = xhtml_extract (cur, 2, NULL);
		if (!htmlified) {
			result = unhtmlize (result);
		}
	} else {
		/* Any other type value makes this an invalid Atom feed */
		result = g_strdup ("This attribute was invalidly specified in this Atom feed.");
	}

	g_free (type);

	return result;
}
Exemplo n.º 3
0
/**
 * Handle an iTunes namespace tag found at channel level.
 *
 * Only <summary> and <subtitle> are processed: their extracted content
 * replaces the subscription's "description" metadata when no description
 * is set yet or when the new text is longer than the existing one.
 *
 * @param ctxt	feed parser context whose subscription metadata is updated
 * @param cur	the XML node of the tag
 */
static void
ns_itunes_parse_channel_tag (feedParserCtxtPtr ctxt, xmlNodePtr cur)
{
	gchar *tmp;
	const gchar *old;

	if (xmlStrcmp (cur->name, BAD_CAST"summary") && xmlStrcmp (cur->name, BAD_CAST"subtitle"))
		return;

	tmp = xhtml_extract (cur, 0, NULL);
	/* Nothing extracted: keep whatever description exists. Without this
	   guard strlen (NULL) below would be undefined behaviour. */
	if (!tmp)
		return;

	old = metadata_list_get (ctxt->subscription->metadata, "description");
	/* prefer the longer of the two descriptions */
	if (!old || strlen (old) < strlen (tmp))
		metadata_list_set (&ctxt->subscription->metadata, "description", tmp);
	g_free (tmp);
}
Exemplo n.º 4
0
/**
 * Handle an iTunes namespace tag found at item level.
 *
 * - <author>   is appended to the item metadata as "author"
 * - <summary>  becomes the item description
 * - <keywords> is split at commas; each non-empty keyword, with leading
 *              whitespace removed, is appended as a "category"
 *
 * @param ctxt	feed parser context whose current item is updated
 * @param cur	the XML node of the tag
 */
static void
ns_itunes_parse_item_tag (feedParserCtxtPtr ctxt, xmlNodePtr cur)
{
	gchar *tmp;
	
	if (!xmlStrcmp(cur->name, BAD_CAST"author")) {
		tmp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		if (tmp) {
			ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "author", tmp);
			g_free (tmp);
		}
	}
	
	if (!xmlStrcmp (cur->name, BAD_CAST"summary")) {
		tmp = xhtml_extract (cur, 0, NULL);
		item_set_description (ctxt->item, tmp);
		g_free (tmp);
	}
	
	if (!xmlStrcmp(cur->name, BAD_CAST"keywords")) {
		gchar *allocated = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
		gchar *keyword = allocated;

		/* parse comma separated list and strip leading spaces... */
		while (keyword) {
			/* terminate the current keyword in place and remember
			   where the next one starts (NULL after the last one) */
			gchar *next = strchr (keyword, ',');
			if (next) {
				*next = 0;
				next++;
			}

			/* Decode a full UTF-8 character before classifying it;
			   passing a single byte would misjudge multi-byte
			   sequences such as a no-break space. */
			while (g_unichar_isspace (g_utf8_get_char (keyword))) {
				keyword = g_utf8_next_char (keyword);
			}

			/* skip empty entries produced by ",," or a trailing comma */
			if (*keyword)
				ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "category", keyword);

			keyword = next;
		}
		g_free (allocated);
	}
}
Exemplo n.º 5
0
/*
  The following are not used, but had been recognized:
  
	"language", <---- Not in atom 0.2 or 0.3. We should use xml:lang
	"lastBuildDate", <--- Where is this from?
	"issued", <-- Not in the specs for feeds
	"created",  <---- Not in the specs for feeds
*/
/**
 * Parse a content construct of an old-style (pre-1.0) Atom feed.
 *
 * If a "mode" attribute is present it decides the decoding ("escaped",
 * "xml", "base64" - unsupported, "multipart/alternative" - recurses into
 * the first child node). Otherwise the "type" attribute is consulted:
 * a missing type, "text" or "text/plain" is escaped and wrapped in
 * <pre> tags, HTML and XHTML types are passed to xhtml_extract().
 *
 * @param cur	the XML node to be parsed, must not be NULL
 * @returns newly allocated string to be freed by the caller, or NULL
 *          when no content could be extracted.
 */
gchar* pie_parse_content_construct(xmlNodePtr cur) {
	gchar	*mode, *type, *ret;

	g_assert(NULL != cur);
	ret = NULL;
	
	/* determine encoding mode */
	mode = xml_get_attribute (cur, "mode");
	type = xml_get_attribute (cur, "type");

	/* Modes are used in older versions of ATOM, including 0.3. It
	   does not exist in the newer IETF drafts.*/
	if(NULL != mode) {
		if(!strcmp(mode, "escaped")) {
			ret = xhtml_extract (cur, 0, NULL);
			
		} else if(!strcmp(mode, "xml")) {
			ret = xhtml_extract (cur, 1,NULL);
			
		} else if(!strcmp(mode, "base64")) {
			g_warning("Base64 encoded <content> in Atom feeds not supported!\n");
			
		} else if(!strcmp(mode, "multipart/alternative")) {
			if(NULL != cur->xmlChildrenNode)
				ret = pie_parse_content_construct(cur->xmlChildrenNode);
		}
		g_free(mode);
	} else {
		/* some feeds don't specify a mode but a MIME type in the
		   type attribute... */
		/* not sure what MIME types are necessary... */

		/* Things that need to be de-encoded and should not contain
		   sub-tags. The atom 0.3 spec says that the default type MUST
		   be considered to be text/plain, so a missing type lands
		   here too. Text is escaped and wrapped in pre tags. */
		if(NULL == type ||
			!g_ascii_strcasecmp(type, "TEXT") ||
			!strcmp(type, "text/plain")) {
			gchar *tmp;
			tmp = (gchar *)xmlNodeListGetString(cur->doc, cur->xmlChildrenNode, 1);
			/* An element without text yields NULL; formatting that
			   with %s would produce garbage, so return no content. */
			if(NULL != tmp) {
				ret = g_markup_printf_escaped("<div xmlns=\"http://www.w3.org/1999/xhtml\"><pre>%s</pre></div>", tmp);
				g_free(tmp);
			}
			/* Next are things that contain sub-tags */
		} else if(!g_ascii_strcasecmp(type, "HTML") ||
		          !strcmp(type, "text/html")) {
			ret = xhtml_extract (cur, 0,"http://default.base.com/");
		} else if(/* XHTML types */
		          !g_ascii_strcasecmp(type, "xhtml") ||
		          !strcmp(type, "application/xhtml+xml")) {
			ret = xhtml_extract (cur, 1,"http://default.base.com/");
		}
	}
	g_free(type);
	
	return ret;
}
Exemplo n.º 6
0
/**
 * Build a feed item from one RSS item element.
 *
 * A new GrssFeedItem is created for the given channel, an "about"
 * attribute (if any) is used as id and source, and then every child
 * element is examined. Namespaced children are first offered to the
 * namespace handler; the remaining ones are matched by tag name
 * (category, author, comments, pubDate, enclosure, guid, title, link,
 * description, source).
 *
 * Memory obtained from libxml2 (xmlGetProp, xmlNodeListGetString) is
 * released with xmlFree(), memory obtained from GLib with g_free().
 *
 * @param parser	handler providing the namespace handler
 * @param feed		channel the new item belongs to
 * @param doc		document the item node is part of
 * @param cur		the item node, must not be NULL
 * @returns the newly created item
 */
static GrssFeedItem*
parse_rss_item (FeedRssHandler *parser, GrssFeedChannel *feed, xmlDocPtr doc, xmlNodePtr cur)
{
	gchar *tmp;
	gchar *tmp2;
	time_t t;
	GrssFeedItem *item;

	g_assert (cur != NULL);

	item = grss_feed_item_new (feed);

	/* try to get an item about id */
	tmp = (gchar*) xmlGetProp (cur, BAD_CAST"about");
	if (tmp) {
		grss_feed_item_set_id (item, tmp);
		grss_feed_item_set_source (item, tmp);
		xmlFree (tmp);
	}

	cur = cur->xmlChildrenNode;

	while (cur) {
		if (cur->type != XML_ELEMENT_NODE || !cur->name) {
			cur = cur->next;
			continue;
		}

		/* check namespace of this tag */
		if (cur->ns) {
			if (ns_handler_item (parser->priv->handler, item, cur)) {
				cur = cur->next;
				continue;
			}
		}

		if (!xmlStrcmp (cur->name, BAD_CAST"category")) {
			tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1);
			if (tmp) {
				grss_feed_item_add_category (item, tmp);
				xmlFree (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"author")) {
			tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1);
			if (tmp) {
				grss_feed_item_set_author (item, tmp);
				xmlFree (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"comments")) {
			tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1);
			if (tmp) {
				grss_feed_item_set_comments_url (item, tmp);
				xmlFree (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"pubDate")) {
			tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1);
			if (tmp) {
				t = date_parse_RFC822 (tmp);
				grss_feed_item_set_publish_time (item, t);
				xmlFree (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"enclosure")) {
			/* RSS 0.93 allows multiple enclosures */
			tmp = (gchar*) xmlGetProp (cur, BAD_CAST"url");

			if (tmp) {
				gchar *type = (gchar*) xmlGetProp (cur, BAD_CAST"type");
				gchar *absolute = NULL;
				const gchar *homepage;
				gssize length = 0;
				GrssFeedEnclosure *enclosure;

				tmp2 = (gchar*) xmlGetProp (cur, BAD_CAST"length");
				if (tmp2) {
					length = atol (tmp2);
					xmlFree (tmp2);
				}

				homepage = (const gchar*) grss_feed_channel_get_homepage (feed);

				if ((strstr (tmp, "://") == NULL) &&
				    (homepage != NULL) &&
				    (strstr (homepage, "://") != NULL)) {
					/* add base URL if necessary and possible; kept in
					   its own variable so that the GLib allocated
					   string is never handed to xmlFree() */
					absolute = g_strdup_printf ("%s/%s", homepage, tmp);
				}

				enclosure = grss_feed_enclosure_new (absolute ? absolute : tmp);
				grss_feed_enclosure_set_format (enclosure, type);
				grss_feed_enclosure_set_length (enclosure, length);
				grss_feed_item_add_enclosure (item, enclosure);

				g_free (absolute);
				xmlFree (tmp);
				xmlFree (type);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"guid")) {
			/* an id taken from the "about" attribute wins over guid */
			if (!grss_feed_item_get_id (item)) {
				tmp = (gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1);
				if (tmp) {
					if (strlen (tmp) > 0) {
						grss_feed_item_set_id (item, tmp);
						tmp2 = (gchar*) xmlGetProp (cur, BAD_CAST"isPermaLink");

						if (!grss_feed_item_get_source (item) && (tmp2 == NULL || g_str_equal (tmp2, "true")))
							grss_feed_item_set_source (item, tmp); /* Per the RSS 2.0 spec. */
						if (tmp2)
							xmlFree (tmp2);
					}

					xmlFree (tmp);
				}
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"title")) {
			/* NOTE(review): unhtmlize() is assumed to take ownership of
			   its argument and return a g_free()-able string - confirm */
			tmp = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, TRUE));
			if (tmp) {
				grss_feed_item_set_title (item, tmp);
				g_free (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"link")) {
			tmp = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, TRUE));
			if (tmp) {
				grss_feed_item_set_source (item, tmp);
				g_free (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"description")) {
			tmp = xhtml_extract (cur, 0, NULL);
			if (tmp) {
				/* don't overwrite content:encoded descriptions... */
				if (!grss_feed_item_get_description (item))
					grss_feed_item_set_description (item, tmp);
				g_free (tmp);
			}
		}
		else if (!xmlStrcmp (cur->name, BAD_CAST"source")) {
			tmp = (gchar*) xmlGetProp (cur, BAD_CAST"url");
			tmp2 = unhtmlize ((gchar*) xmlNodeListGetString (doc, cur->xmlChildrenNode, 1));

			/* the real source is only recorded when a URL is given;
			   the element text serves as its optional title */
			if (tmp) {
				grss_feed_item_set_real_source (item, g_strchomp (tmp), tmp2 ? g_strchomp (tmp2) : NULL);
				xmlFree (tmp);
			}

			g_free (tmp2);
		}

		cur = cur->next;
	}

	return item;
}