void DataFeed::parse ( char *dataFeedPage, long dataFeedPageLen ) { // use Xml Class to parse up the page Xml xml; xml.set ( csUTF8, dataFeedPage, dataFeedPageLen, false, 0, false, TITLEREC_CURRENT_VERSION ); // get the nodes long numNodes = xml.getNumNodes(); XmlNode *nodes = xml.getNodes(); // to count the tiers, result levels, and level costs long currTier = 0; long currResultLevel = 0; long currLevelCost = 0; // pull out the keywords for the data feed for (long i = 0; i < numNodes; i++) { // skip if this isn't a meta tag, shouldn't happen if (nodes[i].m_nodeId != 68) continue; // get the meta tag name //long tagLen; //char *tag = xml.getString(i, "name", &tagLen); long ucTagLen; char *ucTag = xml.getString(i, "name", &ucTagLen); char tag[256]; long tagLen = utf16ToLatin1 ( tag, 256, (UChar*)ucTag, ucTagLen>>1 ); // skip if empty if (!tag || tagLen <= 0) continue; // get the content long ucConLen; char *ucCon = xml.getString(i, "content", &ucConLen); char con[1024]; long conLen = utf16ToLatin1 ( con, 1024, (UChar*)ucCon, ucConLen>>1 ); if (!con || conLen <= 0) continue; // match the meta tag to its local var and copy content if (tagLen == 10 && strncasecmp(tag, "customerid", 10) == 0) m_customerId = atoll(con); else if (tagLen == 11 && strncasecmp(tag, "datafeedurl", 11) == 0) setUrl(con, conLen); else if (tagLen == 8 && strncasecmp(tag, "passcode", 8) == 0) m_passcodeLen = setstr(m_passcode, MAX_PASSCODELEN, con, conLen); else if (tagLen == 6 && strncasecmp(tag, "status", 6) == 0) m_isActive = (bool)atoi(con); else if (tagLen == 6 && strncasecmp(tag, "locked", 6) == 0) m_isLocked = (bool)atoi(con); else if (tagLen == 14 && strncasecmp(tag, "dfcreationtime", 14) == 0) m_creationTime = atol(con); else if (tagLen == 8 && strncasecmp(tag, "numtiers", 8) == 0) m_priceTable.m_numTiers = atol(con); else if (tagLen == 15 && strncasecmp(tag, "numresultlevels", 15) == 0) m_priceTable.m_numResultLevels = atol(con); else if (tagLen == 10 && strncasecmp(tag, "monthlyfee", 10) == 0) m_priceTable.m_monthlyFee = atol(con); else if (tagLen == 7 && strncasecmp(tag, "tiermax", 7) == 0) { m_priceTable.m_tierMax[currTier] = (unsigned long)atol(con); currTier++; } else if (tagLen == 11 && strncasecmp(tag, "resultlevel", 11) == 0) { m_priceTable.m_resultLevels[currResultLevel] = (unsigned long)atol(con); currResultLevel++; } else if (tagLen == 9 && strncasecmp(tag, "levelcost", 9) == 0) { m_priceTable.m_levelCosts[currLevelCost] = (unsigned long)atol(con); currLevelCost++; } else log(LOG_INFO, "datafeed: Invalid Meta Tag Parsed [%li]:" " %s", tagLen, tag); } }
// returns length of stripped content, but will set g_errno and return -1 // on error int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip ) { if ( !strip ) { log( LOG_WARN, "query: html stripping not required!" ); return contentLen; } if ( ! content ) return 0; if ( contentLen == 0 ) return 0; // filter content if we should // keep this on the big stack so "content" still references something Xml tmpXml; // . get the content as xhtml (should be NULL terminated) // . parse as utf8 since all we are doing is messing with // the tags...content manipulation comes later if ( !tmpXml.set( content, contentLen, version, CT_HTML ) ) { return -1; } //if( strip == 4 ) // return tmpXml.getText( content, contentLen ); // go tag by tag int32_t n = tmpXml.getNumNodes(); XmlNode *nodes = tmpXml.getNodes(); // Xml class may have converted to utf16 content = tmpXml.getContent(); contentLen = tmpXml.getContentLen(); char *x = content; char *xend = content + contentLen; int32_t stackid = -1; int32_t stackc = 0; char skipIt = 0; // . hack COL tag to NOT require a back tag // . do not leave it that way as it could mess up our parsing //g_nodes[25].m_hasBackTag = 0; for ( int32_t i = 0 ; i < n ; i++ ) { // get id of this node int32_t id = nodes[i].m_nodeId; // if strip is 4, just remove the script tag if( strip == 4 ){ if ( id ){ if ( id == TAG_SCRIPT ){ skipIt ^= 1; continue; } } else if ( skipIt ) continue; goto keepit; } // if strip is 3, ALL tags will be removed! if( strip == 3 ) { if( id ) { // . we dont want anything in between: // - script tags (83) // - style tags (111) if ((id == TAG_SCRIPT) || (id == TAG_STYLE)) skipIt ^= 1; // save img to have alt text kept. if ( id == TAG_IMG ) goto keepit; continue; } else { if( skipIt ) continue; goto keepit; } } // get it int32_t fk; if ( strip == 1 ) fk = g_nodes[id].m_filterKeep1; else fk = g_nodes[id].m_filterKeep2; // if tag is <link ...> only keep it if it has // rel="stylesheet" or rel=stylesheet if ( strip == 2 && id == TAG_LINK ) { // <link> tag id int32_t fflen; char *ff = nodes[i].getFieldValue ( "rel" , &fflen ); if ( ff && fflen == 10 && strncmp(ff,"stylesheet",10) == 0 ) goto keepit; } // just remove just the tag if this is 2 if ( fk == 2 ) continue; // keep it if not in a stack if ( ! stackc && fk ) goto keepit; // if no front/back for tag, just skip it if ( ! nodes[i].m_hasBackTag ) continue; // start stack if none if ( stackc == 0 ) { // but not if this is a back tag if ( nodes[i].m_node[1] == '/' ) continue; // now start the stack stackid = id; stackc = 1; continue; } // skip if this tag does not match what is on stack if ( id != stackid ) continue; // if ANOTHER front tag, inc stack if ( nodes[i].m_node[1] != '/' ) stackc++; // otherwise, dec the stack count else stackc--; // . ensure not negative from excess back tags // . reset stackid to -1 to indicate no stack if ( stackc <= 0 ) { stackid= -1; stackc = 0; } // skip it continue; keepit: // replace images with their alt text int32_t vlen; char *v; if ( id == TAG_IMG ) { v = nodes[i].getFieldValue("alt", &vlen ); // try title if no alt text if ( ! v ) v = nodes[i].getFieldValue("title", &vlen ); if ( v ) { gbmemcpy ( x, v, vlen ); x += vlen; } continue; } // remove background image from body,table,td tags if ( id == TAG_BODY || id == TAG_TABLE || id == TAG_TD ) { v = nodes[i].getFieldValue("background", &vlen); // remove background, just sabotage it if ( v ) v[-4] = 'x'; } // store it gbmemcpy ( x , nodes[i].m_node , nodes[i].m_nodeLen ); x += nodes[i].m_nodeLen; // sanity check if ( x > xend ) { g_process.shutdownAbort(true);} } contentLen = x - content; content [ contentLen ] = '\0'; // unhack COL tag //g_nodes[25].m_hasBackTag = 1; return contentLen; }