예제 #1
0
/// Depth-first search of the tidy tree rooted at @a tnod for a
/// `<meta http-equiv="content-type" content="...charset=XXX">` element.
///
/// @param tnod  Node to inspect; its children are searched recursively.
/// @return The text that follows "charset=" in the matching element's
///         `content` attribute, or nglString::Null when no such element
///         exists below @a tnod.
///
/// A META element is only accepted when its `http-equiv` attribute compares
/// equal to "content-type" (the comparison is made with the second Compare
/// argument set to false, as the original code did) and its `content`
/// attribute actually contains "charset=". Other META elements are ignored
/// so that the search can continue with the remaining nodes.
static nglString GetEncodingString(TidyNode tnod)
{
  if (tidyNodeGetId(tnod) == TidyTag_META)
  {
    // Both attributes are required to recognise a content-type declaration.
    TidyAttr attr_content = tidyAttrGetById(tnod, TidyAttr_CONTENT);
    TidyAttr attr_httpequiv = tidyAttrGetById(tnod, TidyAttr_HTTP_EQUIV);
    if (attr_content && attr_httpequiv)
    {
      // The kind of declaration lives in http-equiv, not in content.
      nglString httpequiv(tidyAttrValue(attr_httpequiv));
      if (httpequiv.Compare(_T("content-type"), false) == 0)
      {
        nglString encoding(tidyAttrValue(attr_content));
        const int32 col = encoding.Find(_T("charset="));
        // Only extract when the marker was found; a negative index would
        // otherwise yield an arbitrary slice of the content value.
        if (col >= 0)
          return encoding.Extract(col + 8); // 8 == length of "charset="
      }
    }
  }

  for (TidyNode child = tidyGetChild(tnod); child; child = tidyGetNext(child))
  {
    nglString str(GetEncodingString(child));
    if (!str.IsNull())
      return str;
  }

  return nglString::Null;
}
예제 #2
0
/**
 * Extracts the user's photo URL from a parsed page.
 *
 * Looks up the DIV found by searchNode() for the class string
 * "quick_update s_clear", takes its first child (expected to be an IMG)
 * and reads that element's `src` attribute. Every occurrence of "small"
 * in the value is replaced by "big" before it is returned.
 *
 * @param doc  Parsed tidy document to search.
 * @return The rewritten URL, or an empty QString when the DIV, the IMG
 *         child or the `src` attribute is missing.
 *
 * The IMG check is done at run time (not only through Q_ASSERT) so that
 * release builds do not pass a null or unexpected node to the tidy API.
 */
QString UCHome_Main_SiteConst::find_photo_url(TidyDoc doc)
{
    QString photo_url;

    //quick_update s_clear
    TidyNode node = this->searchNode(doc, NULL, "quick_update s_clear", TidyTag_DIV);
    if(node == NULL) {
        q_debug()<<"Warning: no photo url found";
        return photo_url;
    }

    TidyNode img = tidyGetChild(node);
    if(img == NULL || tidyNodeGetId(img) != TidyTag_IMG) {
        // Unexpected markup: the container exists but holds no leading IMG.
        q_debug()<<"Warning: no photo url found";
        return photo_url;
    }

    TidyAttr attr = tidyAttrGetById(img, TidyAttr_SRC);
    if(attr != NULL) {
        photo_url = QString(tidyAttrValue(attr));
        photo_url.replace("small", "big"); // replace() modifies in place
        q_debug()<<"Photo url: "<<photo_url;
    }

    return photo_url;
}
예제 #3
0
/**
 * Builds a FeedRecord describing one friend entry.
 *
 * The serialized text of @a node (as produced by tidyNodeGetText) is
 * decoded with u8codec, passed through rewrite_relative_link() and wrapped
 * in "<table>...</table>" to form the record content.
 *
 * The friend's uid is read from the `value` attribute of the first
 * grandchild of @a node, and the user name from the `alt` attribute of the
 * IMG nested inside the A element of the second child.
 *
 * @param state_data  Unused by this function.
 * @param doc         Tidy document that owns @a node.
 * @param node        Node holding one friend entry.
 * @return A newly allocated FeedRecord; ownership passes to the caller.
 *
 * md5sum is first computed from the serialized node text and then, when a
 * uid is available, overwritten with the md5 of the uid (a uid-based hash
 * cannot collide between different users). If the expected structure is
 * missing, the record is still returned with the content-based hash and
 * the uid / user name left untouched, instead of dereferencing a null node.
 */
FeedRecord* UCHome_Main_SiteConst::parse_friend(void *state_data, TidyDoc doc, TidyNode node) 
{
    TidyBuffer tbuf;
    tidyBufInit(&tbuf);
    tidyNodeGetText(doc, node, &tbuf);

    QString note = this->u8codec->toUnicode(QByteArray((char*)tbuf.bp));
    FeedRecord *rec = new FeedRecord();
    rec->content = this->rewrite_relative_link(note);
    rec->content = "<table>" + rec->content + "</table>";
    // Fallback hash; replaced below when the uid can be extracted.
    md5CheckSum((char*)tbuf.bp, tbuf.size, rec->md5sum);
    tidyBufFree(&tbuf);
    //q_debug()<<"Orig:"<<note;

    ctmbstr fuid = NULL;
    ctmbstr fusername = NULL;

    TidyNode first = tidyGetChild(node);
    if(first != NULL) {
        // uid: `value` attribute of the first grandchild.
        TidyNode uid_node = tidyGetChild(first);
        if(uid_node != NULL) {
            TidyAttr uid_attr = tidyAttrGetById(uid_node, TidyAttr_VALUE);
            if(uid_attr != NULL) {
                fuid = tidyAttrValue(uid_attr);
            }
        }

        // user name: second child -> A -> IMG, `alt` attribute.
        TidyNode second = tidyGetNext(first);
        if(second != NULL) {
            TidyNode link = tidyGetChild(second); // A
            if(link != NULL) {
                TidyNode image = tidyGetChild(link); //IMG
                if(image != NULL) {
                    TidyAttr alt_attr = tidyAttrGetById(image, TidyAttr_ALT);
                    if(alt_attr != NULL) {
                        fusername = tidyAttrValue(alt_attr);
                    }
                }
            }
        }
    }

    if(fusername != NULL) {
        rec->fusername = this->u8codec->toUnicode(QByteArray(fusername));
    }

    if(fuid != NULL) {
        rec->fuid = fuid;
        q_debug()<<"USER:"<<fuid<<rec->fusername;
        // Using the md5 of the uid guarantees there is no collision.
        md5CheckSum(fuid, strlen(fuid), rec->md5sum);
    }else{
        q_debug()<<"Warning: friend uid not found";
    }

    return rec;
}
예제 #4
0
/**
 * Builds a FeedRecord from one feed list item.
 *
 * The record content is the text obtained through get_node_text(), decoded
 * with u8codec. The record hash is the md5 of the node's `id` attribute,
 * e.g. <li class="type_1006780" id="feed_685697_li">; that id is unique and
 * is therefore used as the identifier.
 *
 * @param state_data  Unused by this function.
 * @param doc         Tidy document that owns @a node.
 * @param node        Node holding one feed entry.
 * @return A newly allocated FeedRecord owned by the caller, or NULL when
 *         the node carries no `id` value.
 *
 * The record is only allocated after the id has been validated, so the
 * NULL return path does not leak a FeedRecord.
 */
FeedRecord* UCHome_Main_SiteConst::parse_feed(void *state_data, TidyDoc doc, TidyNode node)
{
    TidyBuffer tbuf = {0};
    tidyBufInit(&tbuf);
    this->get_node_text(doc, node, &tbuf);
    QByteArray feed_bytes = QByteArray((char*)tbuf.bp);
    QString feed_text = this->u8codec->toUnicode(feed_bytes);
    tidyBufFree(&tbuf);

    TidyAttr attr = tidyAttrGetById(node, TidyAttr_ID);
    ctmbstr fid = (attr != NULL) ? tidyAttrValue(attr) : NULL;
    //q_debug()<<attr<<fid;
    if(fid == NULL) {
        q_debug()<<"Warning: invalid fid"<<fid<<feed_text;
        return NULL;
    }

    FeedRecord *rec = new FeedRecord();
    rec->content = feed_text;

    // Some ids contain a stray " 1000 " fragment; strip it before hashing.
    // The cleaned copy must outlive the md5 call because fid points into it.
    QByteArray cleaned_id;
    if(strstr(fid, " 1000 ") != NULL) {
        cleaned_id = fid;
        cleaned_id.replace(" 1000 ", "");
        q_debug()<<"Warning: invalid fid"<<tidyAttrValue(attr);
        fid = cleaned_id.data();
    }
    md5CheckSum(fid, strlen(fid), rec->md5sum);

    return rec;
}
예제 #5
0
파일: parse.c 프로젝트: ASpade/mulk
/*
 * Recursively walks the children of tnod.
 *
 * For element nodes, a URL-carrying attribute is selected from the tag id
 * and the depth-related options, and a non-empty value is handed to
 * add_new_url_and_check(). When outfile is non-NULL, the markup is also
 * written back to outfile, with the selected attribute replaced by
 * relative_url whenever add_new_url_and_check() filled it in.
 *
 * tdoc    - tidy document owning the nodes (needed to read node text/values)
 * tnod    - parent node whose children are processed
 * elem    - URL list entry for the document being parsed; its level is
 *           compared against option_values.depth
 * indent  - used as width and precision of the tag-opening strings printed
 *           to outfile; incremented by one for each nesting level
 * outfile - output stream, or NULL to only collect URLs
 */
static void parse_html(TidyDoc tdoc, TidyNode tnod, const url_list_t *elem, int indent, FILE *outfile)
{
	TidyNode child;
	TidyAttr attr;
	TidyAttrId attr_id = TidyAttr_UNKNOWN;
	TidyNodeType node_type;
	TidyTagId node_id;
	ctmbstr name;
	char *url, *relative_url = NULL;
	int found = 0;
	/* A depth of 0 disables the level limit in all three flags below. */
	int get_html_link = (!option_values.depth || elem->level < option_values.depth);
	/* Frames and dependencies are allowed one level deeper than plain links. */
	int get_int_html_link = (!option_values.depth || elem->level < option_values.depth+1);
	int get_ext_depends = ((!option_values.depth || elem->level < option_values.depth+1)
		&& !option_values.no_html_dependencies);

	for (child = tidyGetChild(tnod); child; child = tidyGetNext(child)) {
		node_type = tidyNodeGetType(child);

		switch (node_type) {
			case TidyNode_Start:
			case TidyNode_StartEnd:
				/* Choose which attribute (if any) holds a URL for this tag. */
				node_id = tidyNodeGetId(child);
				if (get_html_link && (node_id == TidyTag_A || node_id == TidyTag_AREA || node_id == TidyTag_MAP)) {
					found = 1;
					attr_id = TidyAttr_HREF;
				}
				else if (get_int_html_link && (node_id == TidyTag_FRAME || node_id == TidyTag_IFRAME)) {
					found = 1;
					attr_id = TidyAttr_SRC; 
				}
				else if (get_ext_depends) {
					if (node_id == TidyTag_LINK) {
						found = 1;
						attr_id = TidyAttr_HREF;
					}
					else if (node_id == TidyTag_IMG || node_id == TidyTag_SCRIPT) {
						found = 1;
						attr_id = TidyAttr_SRC; 
					}
					else {
						found = 0;
						attr_id = TidyAttr_UNKNOWN;
					}
				}
				else {
					found = 0;
					attr_id = TidyAttr_UNKNOWN;
				}

				if (found && (attr = tidyAttrGetById(child, attr_id)) != NULL) {
					url = (char *) tidyAttrValue(attr);

					/* NOTE(review): string_free is called several times on the same
					 * variable; presumably it also resets the pointer to NULL,
					 * otherwise these calls would double free - confirm. */
					string_free(relative_url);
					/* relative_url is only requested when output is being written. */
					if (url && *url)
						add_new_url_and_check(elem, url, outfile ? &relative_url : NULL);
				}

				if (outfile && (name = tidyNodeGetName(child)) != NULL) {
					/* NOTE(review): indent is used as both width and precision, so
					 * with indent == 0 the "<" is not printed at all - confirm the
					 * indent value passed by the top-level caller. */
					fprintf(outfile, "%*.*s%s", indent, indent, "<", name);
					for (attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr)) {
						fprintf(outfile, " %s", tidyAttrName(attr));
						/* Substitute the rewritten URL for the selected attribute. */
						if (relative_url && (tidyAttrGetId(attr) == attr_id))
							fprintf(outfile, "=\"%s\"", relative_url);
						else if (tidyAttrValue(attr))
							fprintf(outfile, "=\"%s\"", tidyAttrValue(attr) ? tidyAttrValue(attr) : "");
						else
							fprintf(outfile, "=\"\"");
					}
					string_free(relative_url);

					if (node_type == TidyNode_StartEnd)
						fprintf(outfile, "/>\n");
					else {
						fprintf(outfile, ">\n");
						parse_html(tdoc, child, elem, indent + 1, outfile);
						/* NOTE(review): precision indent + 1 truncates "</" to "<"
						 * when indent == 0 - confirm. */
						fprintf(outfile, "%*.*s%s>\n", indent + 1, indent + 1, "</", name);
					}
				}
				else {
					/* No output (or unnamed node): still recurse to collect URLs. */
					string_free(relative_url);
					parse_html(tdoc, child, elem, indent + 1, outfile);
				}
				break;
			case TidyNode_End:
				if (outfile) {
					if ((name = tidyNodeGetName(child)) != NULL)
						fprintf(outfile, "%*.*s/%s>\n", indent, indent, "<", name);
				}
				break;
			case TidyNode_Text:
				if (outfile) {
					TidyBuffer buf;
					TidyTagId parent_node_id = tidyNodeGetId(tnod);

					tidyBufInit(&buf);
					/* Text inside SCRIPT/STYLE is read with tidyNodeGetValue,
					 * all other text with tidyNodeGetText. */
					if (parent_node_id == TidyTag_SCRIPT || parent_node_id == TidyTag_STYLE)
						tidyNodeGetValue(tdoc, child, &buf);
					else
						tidyNodeGetText(tdoc, child, &buf);
					if (buf.bp)
						fprintf(outfile, "%s", (char *)buf.bp);
					tidyBufFree(&buf);
				}
				break;
			case TidyNode_Comment:
				if (outfile) {
					TidyBuffer buf;

					tidyBufInit(&buf);
					tidyNodeGetValue(tdoc, child, &buf);
					if (buf.bp)
						fprintf(outfile, "<!--%s-->\n", (char *)buf.bp);
					tidyBufFree(&buf);
				}
				break;
			case TidyNode_CDATA:
				if (outfile) {
					TidyBuffer buf;

					tidyBufInit(&buf);
					tidyNodeGetValue(tdoc, child, &buf);
					if (buf.bp)
						fprintf(outfile, "<![CDATA[%s]]>\n", (char *)buf.bp);
					tidyBufFree(&buf);
				}
				break;
			case TidyNode_DocType:
				if (outfile) {
					/* Set once an attribute named "PUBLIC" has been printed; after
					 * that only attribute values are written, not their names. */
					int pub = 0;

					/* NOTE(review): the node name is passed to %s without a NULL
					 * check, unlike the element branch above - confirm it cannot
					 * be NULL here. */
					fprintf(outfile, "<!DOCTYPE %s", tidyNodeGetName(child));
					for (attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr)) {
						if (!pub) {
							fprintf(outfile, " %s", tidyAttrName(attr));
							if (!string_casecmp(tidyAttrName(attr), "PUBLIC"))
								pub = 1;
						}
						if (tidyAttrValue(attr))
							fprintf(outfile, " \"%s\"", tidyAttrValue(attr));
					}
					fprintf(outfile, ">\n");
				}
				break;
			default:
				/* Any other node type: copy its value through unchanged. */
				if (outfile) {
					TidyBuffer buf;

					tidyBufInit(&buf);
					tidyNodeGetValue(tdoc, child, &buf);
					if (buf.bp)
						fprintf(outfile, "%s", (char *)buf.bp);
					tidyBufFree(&buf);
				}
				break;
		}
	}
}