// Verifies Xml::getTagContent() extraction of the content attribute of a
// <meta name="description"> tag across many attribute orderings and quoting
// styles, including description values that themselves contain quotes.
TEST( XmlTest, MetaDescription) {
	const char* input_strs[] = {
		// valid
		"totally valid description",
		"“inside special quotes” and outside",

		// invalid
		"my \"invalid\" double quote description",
		// BUGFIX: a comma was missing after the next literal, which made
		// the compiler concatenate it with the following one into a single
		// test case — the single-quote case below was never run on its own.
		"\"someone has quotes\", and nobody else has it",
		"'my 'invalid' single quote description'",
		"it's a description",
		"what is this quote \" doing here?"
	};

	const char* format_strs[] = {
		"<meta name=\"description\" content=\"%s\">",
		"<meta name=\"description\" content='%s'>",
		"<meta name=\"description\" content=\"%s\" ng-attr-content=\"{{meta.description}}\">",
		"<meta name=\"description\" content='%s' ng-attr-content=\"{{meta.description}}\" >",
		"<meta name=\"description\" ng-attr-content=\"{{meta.description}}\" content=\"%s\">",
		"<meta name=\"description\" ng-attr-content=\"{{meta.description}}\" content='%s'>",
		"<meta name=\"description\" content=\"%s\" other-content=\"%s\">",
		"<meta name=\"description\" content='%s' other-content='%s'>",
		"<meta content=\"%s\" name=\"description\">",
		"<meta content='%s' name=\"description\">",
		"<meta name=\"description\" other-content=\"%s\" content=\"%s\">",
		"<meta name=\"description\" other-content='%s' content='%s'>"
	};

	size_t len = sizeof( input_strs ) / sizeof( input_strs[0] );
	size_t format_len = sizeof( format_strs ) / sizeof( format_strs[0] );

	for ( size_t i = 0; i < len; i++ ) {
		for (size_t j = 0; j < format_len; j++) {
			const char *input_str = input_strs[i];

			// some formats carry two %s, so the input is passed twice;
			// extra arguments are ignored for single-%s formats
			char desc[MAX_BUF_SIZE];
			std::sprintf(desc, format_strs[j], input_str, input_str);

			char input[MAX_BUF_SIZE];
			std::sprintf(input, HTML_HEAD_FORMAT, desc);

			Xml xml;
			ASSERT_TRUE(xml.set(input, strlen(input), 0, CT_HTML));

			char buf[MAX_BUF_SIZE];
			int32_t bufLen = MAX_BUF_SIZE;
			int32_t contentLen = 0;
			ASSERT_TRUE(xml.getTagContent("name", "description", buf, bufLen, 0, bufLen, &contentLen, false, TAG_META));

			// extracted content must round-trip the description unchanged
			EXPECT_EQ(strlen(input_str), contentLen);
			EXPECT_STREQ(input_str, buf);
		}
	}
}
void parse_doc_icu(char *s, int len, bool doHash, char *charset){ Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(64*1024); int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces ); Words w; w.set(text_buf, textLen, doHash); free(text_buf); }
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML ); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText( text_buf, len, 0, 99999999, doFilterSpaces ); Words words; // just tokenize words words.set(text_buf, len, doHash); free(text_buf); }
// Verifies that Xml::getTagContent() strips embedded markup from a meta
// description while leaving bare '<' / '>' characters that are not tags.
TEST( XmlTest, MetaDescriptionStripTags) {
	const char* input_strs[] = {
		"my title<br> my <b>very important</b> text",
		"Lesser than (<) and greater than (>).",
		"We shouldn't strip <3 out",
		"123 < 1234; 1234 > 123",
		"<p style='text-align: center;'>A color cartoon drawing of a clapping cod fish ( rebus in the danish language for klaptorsk )</p>"
	};

	const char* expected_outputs[] = {
		"my title. my very important text",
		"Lesser than (<) and greater than (>).",
		"We shouldn't strip <3 out",
		"123 < 1234; 1234 > 123",
		"A color cartoon drawing of a clapping cod fish ( rebus in the danish language for klaptorsk ). "
	};

	const char* format_str = "<meta name=\"description\" content=\"%s\">";

	size_t len = sizeof( input_strs ) / sizeof( input_strs[0] );
	// inputs and expected outputs must stay in lockstep
	ASSERT_EQ(sizeof(input_strs)/sizeof(input_strs[0]),
	          sizeof(expected_outputs)/sizeof(expected_outputs[0]));

	for ( size_t i = 0; i < len; i++ ) {
		const char *input_str = input_strs[i];
		const char *output_str = expected_outputs[i];

		// FIX: format_str has exactly one %s, so pass input_str once.
		// The second argument was a copy/paste leftover from the
		// MetaDescription test whose formats have two %s.
		char desc[MAX_BUF_SIZE];
		std::sprintf(desc, format_str, input_str);

		char input[MAX_BUF_SIZE];
		std::sprintf(input, HTML_HEAD_FORMAT, desc);

		Xml xml;
		ASSERT_TRUE(xml.set(input, strlen(input), 0, CT_HTML));

		char buf[MAX_BUF_SIZE];
		int32_t bufLen = MAX_BUF_SIZE;
		int32_t contentLen = 0;
		ASSERT_TRUE(xml.getTagContent("name", "description", buf, bufLen, 0, bufLen, &contentLen, false, TAG_META));

		EXPECT_EQ(strlen(output_str), contentLen);
		EXPECT_STREQ(output_str, buf);
	}
}
void parse_doc_icu(char *s, int len, bool doHash, char *charset){ Xml xml; xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_icu\n"); // Extract text from (x)html char *text_buf = (char*)malloc(64*1024); long textLen = xml.getText(text_buf, 64*1024, 0, 99999999, false, true, false, doFilterSpaces, false); Words w; w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash); free(text_buf); }
static void generateSummary( Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr ) { Xml xml; ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML)); Words words; ASSERT_TRUE(words.set(&xml, true)); Bits bits; ASSERT_TRUE(bits.set(&words)); Url url; url.set(urlStr); Sections sections; ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML)); Query query; ASSERT_TRUE(query.set2(queryStr, langEnglish, true)); LinkInfo linkInfo; memset ( &linkInfo , 0 , sizeof(LinkInfo) ); linkInfo.m_lisize = sizeof(LinkInfo); Title title; ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish)); Pos pos; ASSERT_TRUE(pos.set(&words)); Bits bitsForSummary; ASSERT_TRUE(bitsForSummary.setForSummary(&words)); Phrases phrases; ASSERT_TRUE(phrases.set(&words, &bits)); Matches matches; matches.setQuery(&query); ASSERT_TRUE(matches.set(&words, &phrases, §ions, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo)); summary.setSummary(&xml, &words, §ions, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen()); }
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset) { Xml xml; xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION); //fprintf(stderr,"\nparse_doc_8859_1\n"); // Extract text from (x)html char *text_buf = (char*)malloc(len+1); xml.getText(text_buf, len, 0, 99999999, false, true, false, doFilterSpaces, false); Words words; // just tokenize words words.set(false, text_buf, TITEREC_CURRENT_VERSION, doHash); free(text_buf); }
// . serves a cached copy of a document ("/get" page) over the given socket
// . re-entrant callback: every getter that may block returns (void *)-1 and
//   this function returns false; XmlDoc re-invokes processLoop() (registered
//   via setCallback below) when the data is ready
// . on any error path sendErrorReply() frees "st" and replies to the client
// . returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;
	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);
	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE );
	SafeBuf *sb = &st->m_sb;
	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}
	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}
	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/
	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long len = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}
	char *content = xd->ptr_utf8Content;
	long contentLen = xd->size_utf8Content - 1;
	// shortcut: 0 = none, 1 = light strip, 2 = css-only page
	char strip = st->m_strip;
	// alloc buffer now
	//char *buf = NULL;
	//long bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p = buf;
	//char *bufEnd = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}
	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();
	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}
	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
		//p += gbstrlen ( p );
	}
	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	// for undoing the stuff below
	long startLen2 = sb->length();//p;
	// query should be NULL terminated
	char *q = st->m_q;
	long qlen = st->m_qlen;
	// inline css used by the disclaimer header below
	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";
	// get the url of the title rec
	Url *f = xd->getFirstUrl();
	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;
	// spider timestamp, formatted once and reused for all output formats
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}
	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {
		sb->safePrintf(//sprintf ( p ,
			       //"<BASE HREF=\"%s\">"
			       //"<table border=1 width=100%%>"
			       //"<tr><td>"
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" "
			       //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			       "><td>"
			       //"<font face=times,sans-serif color=black size=-1>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p ,
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>", styleTitle );
		//p += gbstrlen ( p );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		//p += gbstrlen ( p );
		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
			//p += gbstrlen ( p );
		}
	}
	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long ip = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );
	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q , // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;
	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p ,
				 //avail ,
				 sb ,
				 &qw , // words to highlight
				 &m , // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}
	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;
	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should: truncating m_length back to a
	// saved offset discards everything printed since that point
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else sb->m_length=startLen1;//p=start1;
	}
	//sb->safeStrcpy(tbuf);
	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd = NULL;
	// scan the reply buffer for the first <title>...</title> pair,
	// capping the scan at 500 bytes past the open tag for speed
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd = e;
		}
		break;
	}
	// . print title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		//p += sprintf ( p ,
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p ,
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );
		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}
	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC ) pre = true ; // filtered msword
	if ( ctype == CT_PS ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;
	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}
	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		return sendErrorReply ( st , g_errno );
	}
	Xml xml;
	Words ww;
	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON ) queryHighlighting = false;
	// for xml/json replies the (possibly highlighted) content goes into
	// "tmp" first so it can be entity/json encoded before appending to sb
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}
	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}
	// calculate bufLen
	//long bufLen = p - buf;
	long ct = xd->m_contentType;
	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into <tagname> sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		// reassign
		//buf = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON ) contentType = "application/json";
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer
	//if ( ct == CT_XML ) newbuf.purge();
	//else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
	// and convey the status
	return status;
}
// Parse a fetched datafeed page: walk its meta tags and copy each recognized
// name/content pair into this DataFeed's members and its price table. The
// repeatable tags (tiermax/resultlevel/levelcost) fill successive slots of
// their arrays in document order.
// NOTE(review): currTier/currResultLevel/currLevelCost are not bounds-checked
// against the m_priceTable array capacities — confirm the page source caps
// the number of these tags.
void DataFeed::parse ( char *dataFeedPage, long dataFeedPageLen ) {
	// use Xml Class to parse up the page
	Xml xml;
	xml.set ( csUTF8, dataFeedPage, dataFeedPageLen,
		  false, 0, false, TITLEREC_CURRENT_VERSION );
	// get the nodes
	long numNodes = xml.getNumNodes();
	XmlNode *nodes = xml.getNodes();
	// to count the tiers, result levels, and level costs
	long currTier = 0;
	long currResultLevel = 0;
	long currLevelCost = 0;
	// pull out the keywords for the data feed
	for (long i = 0; i < numNodes; i++) {
		// skip if this isn't a meta tag, shouldn't happen
		// NOTE(review): 68 is presumably the meta-tag node id; prefer
		// the symbolic constant (TAG_META) used elsewhere — confirm.
		if (nodes[i].m_nodeId != 68)
			continue;
		// get the meta tag name
		//long tagLen;
		//char *tag = xml.getString(i, "name", &tagLen);
		long ucTagLen;
		char *ucTag = xml.getString(i, "name", &ucTagLen);
		char tag[256];
		// attribute comes back as UTF-16; convert to latin1
		// (ucTagLen>>1 converts a byte count to a UChar count)
		long tagLen = utf16ToLatin1 ( tag, 256, (UChar*)ucTag,
					      ucTagLen>>1 );
		// skip if empty
		if (!tag || tagLen <= 0)
			continue;
		// get the content
		long ucConLen;
		char *ucCon = xml.getString(i, "content", &ucConLen);
		char con[1024];
		long conLen = utf16ToLatin1 ( con, 1024, (UChar*)ucCon,
					      ucConLen>>1 );
		if (!con || conLen <= 0)
			continue;
		// match the meta tag to its local var and copy content
		if (tagLen == 10 && strncasecmp(tag, "customerid", 10) == 0)
			m_customerId = atoll(con);
		else if (tagLen == 11 &&
			 strncasecmp(tag, "datafeedurl", 11) == 0)
			setUrl(con, conLen);
		else if (tagLen == 8 &&
			 strncasecmp(tag, "passcode", 8) == 0)
			m_passcodeLen = setstr(m_passcode, MAX_PASSCODELEN,
					       con, conLen);
		else if (tagLen == 6 &&
			 strncasecmp(tag, "status", 6) == 0)
			m_isActive = (bool)atoi(con);
		else if (tagLen == 6 &&
			 strncasecmp(tag, "locked", 6) == 0)
			m_isLocked = (bool)atoi(con);
		else if (tagLen == 14 &&
			 strncasecmp(tag, "dfcreationtime", 14) == 0)
			m_creationTime = atol(con);
		else if (tagLen == 8 &&
			 strncasecmp(tag, "numtiers", 8) == 0)
			m_priceTable.m_numTiers = atol(con);
		else if (tagLen == 15 &&
			 strncasecmp(tag, "numresultlevels", 15) == 0)
			m_priceTable.m_numResultLevels = atol(con);
		else if (tagLen == 10 &&
			 strncasecmp(tag, "monthlyfee", 10) == 0)
			m_priceTable.m_monthlyFee = atol(con);
		else if (tagLen == 7 &&
			 strncasecmp(tag, "tiermax", 7) == 0) {
			// next tier max in document order
			m_priceTable.m_tierMax[currTier] =
				(unsigned long)atol(con);
			currTier++;
		}
		else if (tagLen == 11 &&
			 strncasecmp(tag, "resultlevel", 11) == 0) {
			m_priceTable.m_resultLevels[currResultLevel] =
				(unsigned long)atol(con);
			currResultLevel++;
		}
		else if (tagLen == 9 &&
			 strncasecmp(tag, "levelcost", 9) == 0) {
			m_priceTable.m_levelCosts[currLevelCost] =
				(unsigned long)atol(con);
			currLevelCost++;
		}
		else
			log(LOG_INFO, "datafeed: Invalid Meta Tag Parsed [%li]:"
			    " %s", tagLen, tag);
	}
}
// Callback invoked with server2's http reply while diffing two gigablast
// instances. Parses the reply's links, then checks whether the url that was
// embedded in our own request buffer (between "%3A" and " HTTP") shows up in
// those links, logging whether it exists on server2. Frees the shared state
// once every outstanding reply for it has come back.
void Blaster::gotDoc4 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	st->m_numUrlDocsReceived++;
	if (!s) {
		//Shouldn't happen, but still putting a checkpoint
		log (LOG_WARN,"blaster: Got a null s in gotDoc4."
		     "Happened because ip could not be found for gigablast"
		     "server");
		// last outstanding reply? then release the state
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			// Free stateBD
			freeStateBD(st);
		}
		return;
	}
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blasterDiff : lost the Request in gotDoc4");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			freeStateBD(st);
		}
		return;
	}
	// split the raw reply into mime header and body
	char *reply = s->m_readBuf ;
	long size = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content = reply + mime.getMimeLen();
	long contentLen = size - mime.getMimeLen();
	//short csEnum = get_iana_charset(mime.getCharset(),
	//				  mime.getCharsetLen());
	/* if (csEnum == csUnknown)
	   log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/
	Xml xml;
	if (!xml.set( content, contentLen,
		      false, 0, false, TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
	}
	Links links;
	Url *url=mime.getLocationUrl();
	if (!links.set(0,//siterec xml
		       &xml,
		       url,
		       false,
		       NULL,
		       TITLEREC_CURRENT_VERSION,
		       0,
		       false,
		       NULL)){
		log(LOG_WARN, "blaster: Coudn't set Links class in gotDoc4");
	}
	// log each link found, skipping navigation/search-engine urls
	for (long i=0;i<links.getNumLinks();i++){
		char *ss=links.getLink(i);
		char *p;
		// This page *should* always be a gigablast page. So not adding
		// checks for msn or yahoo or google page.
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:"); //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: Link Present on server2=%s",ss);
	}
	// So if one of the links that is returned is the exact url,
	// then we know that the url is present.So get the url from the
	// mime, search for it in the links that are returned.
	char tmp[1024];
	char *sendBuf=s->m_sendBuf;
	char *p1,*p2;
	// First get the Host, which is the domain. Since socket s is going to
	// be useless after this function, changing m_sendBuf instead of using
	// more space
	p1=strstr(sendBuf,"%3A");
	if(p1){
		p1+=3;
		p2=strstr(p1," HTTP");
		if (p2){
			//Since I do not care about the sendbuf anymore
			*p2='\0';
		}
	}
	// note: relies on || short-circuit — p2 is only read when p1 was found
	if (!p1 || !p2){
		log(LOG_WARN,"blasterdiff: Could not find search link"
		    "from m_sendBuf in gotdoc4");
	}
	else{
		sprintf(tmp,"%s",p1);
		//log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp);
		bool isFound=false;
		// So now we search for tmp in the links
		for (long i=0;i<links.getNumLinks();i++){
			if(strstr(links.getLink(i),tmp) &&
			   links.getLinkLen(i)==(int)gbstrlen(tmp)){
				isFound=true;
				log(LOG_WARN,"blaster: %s in results1 but not"
				    " in results2 for query %s but does exist"
				    " in server2",tmp,st->m_u1);//->getQuery()
			}
		}
		if (!isFound)
			log(LOG_WARN,"blaster: %s in results1 but not"
			    " in results2 for query %s and does NOT exist"
			    " in server2",tmp,st->m_u1); // ->getQuery()
	}
	// last outstanding reply? then release the state
	if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
// . strips tags from "content" IN PLACE according to the "strip" mode:
//   1/2 = keep tags flagged by g_nodes filterKeep1/filterKeep2 (2 also keeps
//         rel=stylesheet <link> tags), 3 = drop ALL tags (keeping img alt
//         text and skipping script/style bodies), 4 = drop only <script>
// . kept tags whose filterKeep value is set are still dropped while inside
//   an unclosed tag "stack" of the same id (see stackid/stackc below)
// returns length of stripped content, but will set g_errno and return -1
// on error
int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip ) {
	if ( !strip ) {
		log( LOG_WARN, "query: html stripping not required!" );
		return contentLen;
	}
	if ( ! content ) return 0;
	if ( contentLen == 0 ) return 0;
	// filter content if we should
	// keep this on the big stack so "content" still references something
	Xml tmpXml;
	// . get the content as xhtml (should be NULL terminated)
	// . parse as utf8 since all we are doing is messing with
	//   the tags...content manipulation comes later
	if ( !tmpXml.set( content, contentLen, version, CT_HTML ) ) {
		return -1;
	}
	//if( strip == 4 )
	//	return tmpXml.getText( content, contentLen );
	// go tag by tag
	int32_t n = tmpXml.getNumNodes();
	XmlNode *nodes = tmpXml.getNodes();
	// Xml class may have converted to utf16
	content = tmpXml.getContent();
	contentLen = tmpXml.getContentLen();
	// "x" is the write cursor; kept nodes are compacted to the front of
	// the same buffer (output is never longer than the input)
	char *x = content;
	char *xend = content + contentLen;
	// stackid/stackc track one currently-open dropped tag id and its
	// nesting depth so matching inner/back tags are dropped too
	int32_t stackid = -1;
	int32_t stackc = 0;
	// toggled on entering/leaving a script/style body (modes 3 and 4)
	char skipIt = 0;
	// . hack COL tag to NOT require a back tag
	// . do not leave it that way as it could mess up our parsing
	//g_nodes[25].m_hasBackTag = 0;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// get id of this node (0 means a text node)
		int32_t id = nodes[i].m_nodeId;
		// if strip is 4, just remove the script tag
		if( strip == 4 ){
			if ( id ){
				if ( id == TAG_SCRIPT ){
					skipIt ^= 1;
					continue;
				}
			}
			else if ( skipIt )
				continue;
			goto keepit;
		}
		// if strip is 3, ALL tags will be removed!
		if( strip == 3 ) {
			if( id ) {
				// . we dont want anything in between:
				//   - script tags (83)
				//   - style tags (111)
				if ((id == TAG_SCRIPT) || (id == TAG_STYLE))
					skipIt ^= 1;
				// save img to have alt text kept.
				if ( id == TAG_IMG )
					goto keepit;
				continue;
			}
			else {
				if( skipIt )
					continue;
				goto keepit;
			}
		}
		// get it
		int32_t fk;
		if ( strip == 1 ) fk = g_nodes[id].m_filterKeep1;
		else fk = g_nodes[id].m_filterKeep2;
		// if tag is <link ...> only keep it if it has
		// rel="stylesheet" or rel=stylesheet
		if ( strip == 2 && id == TAG_LINK ) { // <link> tag id
			int32_t fflen;
			char *ff = nodes[i].getFieldValue ( "rel" , &fflen );
			if ( ff && fflen == 10 &&
			     strncmp(ff,"stylesheet",10) == 0 )
				goto keepit;
		}
		// just remove just the tag if this is 2
		if ( fk == 2 ) continue;
		// keep it if not in a stack
		if ( ! stackc && fk ) goto keepit;
		// if no front/back for tag, just skip it
		if ( ! nodes[i].m_hasBackTag ) continue;
		// start stack if none
		if ( stackc == 0 ) {
			// but not if this is a back tag
			if ( nodes[i].m_node[1] == '/' ) continue;
			// now start the stack
			stackid = id;
			stackc = 1;
			continue;
		}
		// skip if this tag does not match what is on stack
		if ( id != stackid ) continue;
		// if ANOTHER front tag, inc stack
		if ( nodes[i].m_node[1] != '/' ) stackc++;
		// otherwise, dec the stack count
		else stackc--;
		// . ensure not negative from excess back tags
		// . reset stackid to -1 to indicate no stack
		if ( stackc <= 0 ) {
			stackid= -1;
			stackc = 0;
		}
		// skip it
		continue;
	keepit:
		// replace images with their alt text
		int32_t vlen;
		char *v;
		if ( id == TAG_IMG ) {
			v = nodes[i].getFieldValue("alt", &vlen );
			// try title if no alt text
			if ( ! v ) v = nodes[i].getFieldValue("title", &vlen );
			if ( v ) {
				gbmemcpy ( x, v, vlen );
				x += vlen;
			}
			continue;
		}
		// remove background image from body,table,td tags
		if ( id == TAG_BODY || id == TAG_TABLE || id == TAG_TD ) {
			v = nodes[i].getFieldValue("background", &vlen);
			// remove background, just sabotage it
			// (overwrites a byte of the attribute name in place so
			// browsers no longer recognize it)
			if ( v ) v[-4] = 'x';
		}
		// store it
		gbmemcpy ( x , nodes[i].m_node , nodes[i].m_nodeLen );
		x += nodes[i].m_nodeLen;
		// sanity check
		if ( x > xend ) { g_process.shutdownAbort(true);}
	}
	contentLen = x - content;
	content [ contentLen ] = '\0';
	// unhack COL tag
	//g_nodes[25].m_hasBackTag = 1;
	return contentLen;
}