bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
	// reset, but don't free mem etc. just set m_length to 0
	nameBuf.reset();
	// get its full compound name like "meta.twitter.title"
	JsonItem *p = this;
	char *lastName = NULL;
	char *nameArray[20];
	int32_t numNames = 0;
	for ( ; p ; p = p->m_parent ) {
		// empty name?
		if ( ! p->m_name    ) continue;
		if ( ! p->m_name[0] ) continue;
		// dup? can happen with arrays. the parent of a string
		// in an object has the same name as its own parent, the
		// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
		if ( p->m_name == lastName ) continue;
		// update
		lastName = p->m_name;
		// add it up
		nameArray[numNames++] = p->m_name;
		// breach? leave some headroom in nameArray
		if ( numNames < 15 ) continue;
		log("build: too many names in json tag");
		break;
	}
	// assemble the names in reverse order, which is the correct order
	for ( int32_t i = 1 ; i <= numNames ; i++ ) {
		// copy into our safebuf
		if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i] ) )
			return false;
		// separate names with periods
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// remove last period
	nameBuf.removeLastChar('.');
	// and null terminate
	if ( ! nameBuf.nullTerm() ) return false;
	// change all :'s in names to .'s since : is reserved!
	char *px = nameBuf.getBufStart();
	for ( ; *px ; px++ )
		if ( *px == ':' ) *px = '.';
	return true;
}
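// A minimal, self-contained sketch of the name flattening that
// getCompoundName() performs: walk the parent chain collecting names,
// then emit them in reverse, period-separated. Node and compoundName()
// are illustrative stand-ins, not the real JsonItem API.
#include <cstdio>
#include <string>

struct Node { const char *name; Node *parent; };

static std::string compoundName ( Node *p ) {
	const char *names[20];
	int n = 0;
	const char *last = NULL;
	for ( ; p ; p = p->parent ) {
		if ( ! p->name || ! p->name[0] ) continue;
		if ( p->name == last ) continue; // array dup, same pointer
		last = p->name;
		names[n++] = p->name;
		if ( n >= 15 ) break; // same safety cap as above
	}
	std::string out;
	for ( int i = n - 1 ; i >= 0 ; i-- ) {
		out += names[i];
		if ( i > 0 ) out += '.';
	}
	return out;
}

int main() {
	// {"meta":{"twitter":{"title":"..."}}}  ->  "meta.twitter.title"
	Node meta    = { "meta"    , NULL     };
	Node twitter = { "twitter" , &meta    };
	Node title   = { "title"   , &twitter };
	printf("%s\n", compoundName(&title).c_str());
	return 0;
}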
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get the state
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get the doc
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load the old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it
	// completes.
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if the title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND );

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage ( s,
							     sb->getBufStart(),
							     sb->getLength(),
							     -1, false,
							     contentType,
							     -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		return status;
	}

	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isMasterAdmin && ! *vi )
		return sendErrorReply ( st , EDOCBANNED );
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content <= 0 ) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply ( st , EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply ( st , EBADENGINEER );
	}

	char   *content    = xd->ptr_utf8Content;
	int32_t contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	int32_t startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");

	// base href
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 )
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );

	// default colors in case css files are missing
	if ( strip != 2 )
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");

	if ( format == FORMAT_XML           ) sb->reset();
	if ( format == FORMAT_JSON          ) sb->reset();
	if ( xd->m_contentType == CT_JSON   ) sb->reset();
	if ( xd->m_contentType == CT_XML    ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();

	// query should be NULL terminated
	char   *q    = st->m_qsb.getBufStart();
	int32_t qlen = st->m_qsb.getLength();

	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON   ) printDisclaimer = false;
	if ( xd->m_contentType == CT_STATUS ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100, "%b %d, %Y UTC", timeStruct );
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" "
			       "cellpadding=\"10\" "
			       "cellspacing=\"0\" width=\"100%%\" "
			       "color=\"#ffffff\">"
			       "<tr><td>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		// then the rest
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content "
			       "of this page.</span>",
			       styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%"INT32"&"
				"d=%"INT64"&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (int32_t)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if ( st->m_noArchive ) {
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if ( st->m_isBanned ) {
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[BANNED]</b></span>",
					styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}

	// . make the url that we're outputting (like in PageResults.cpp)
	// . "thisUrl" is the base url for click & scroll
	char  thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map the port to the correct
	//   internal ip:port
	//uint32_t ip   = g_conf.m_mainExternalIp;   // h->m_externalIp;
	//uint16_t port = g_conf.m_mainExternalPort; // h->m_externalHttpPort
	uint32_t ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	// . we no longer put the port in here if it is 80
	// . but still need http:// since we use <base href=>
	if ( port == 80 ) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else              sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query, url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%"INT64"", st->m_docId );
	x += gbstrlen(x);

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;

	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q    ,  // content being highlighted, utf8
		 qlen ,  // length of content being highlighted
		 TITLEREC_CURRENT_VERSION,
		 true ,  // computeIds
		 false );// hasHtmlEntities?

	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}

	// declare up here
	Matches m;

	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );

	int32_t hilen = 0;

	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m  , // matches relative to qw
				 false , // doStemming
				 false , // st->m_clickAndScroll
				 (char *)thisUrl ); // base url for ClcknScrll
		// now close up the disclaimer
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show the header for json object display
	if ( xd->m_contentType == CT_JSON   ) includeHeader = false;
	if ( xd->m_contentType == CT_XML    ) includeHeader = false;
	if ( xd->m_contentType == CT_STATUS ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length = startLen2;
		else                         sb->m_length = startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	char  ctype = (char)xd->m_contentType;
	// do not calc the title or print it if the doc is xml or json
	if ( ctype == CT_XML    ) sbend = sbstart;
	if ( ctype == CT_JSON   ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0] != '<' ) continue;
		if ( to_lower_a(t[1]) != 't' ) continue;
		if ( to_lower_a(t[2]) != 'i' ) continue;
		if ( to_lower_a(t[3]) != 't' ) continue;
		if ( to_lower_a(t[4]) != 'l' ) continue;
		if ( to_lower_a(t[5]) != 'e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find the end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0] == '<' &&
			     to_lower_a(e[1]) == '/' &&
			     to_lower_a(e[2]) == 't' &&
			     to_lower_a(e[3]) == 'i' &&
			     to_lower_a(e[4]) == 't' &&
			     to_lower_a(e[5]) == 'l' &&
			     to_lower_a(e[6]) == 'e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print the title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		sb->safePrintf( "<table border=1 "
				"cellpadding=10 "
				"cellspacing=0 "
				"width=100%% "
				"color=#ffffff>" );
		int32_t printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl );
		if ( printLinks ) {
			sb->safePrintf( "<tr><td bgcolor=pink>"
					"<span style=\"font-size:18px;"
					"font-weight:600;"
					"color:#000000;\">"
					" "
					"<b>PAGE TITLE:</b> " );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true; // text/plain
	if ( ctype == CT_DOC  ) pre = true; // filtered msword
	if ( ctype == CT_PS   ) pre = true; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) sb->safePrintf("<pre>");

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 )
		return sendErrorReply ( st , g_errno );

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON   ) queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS ) queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false , 0 , false ,
				 TITLEREC_CURRENT_VERSION , false , 0 ,
				 CT_HTML ) ) // niceness is 0
			return sendErrorReply ( st , g_errno );
		if ( ! ww.set ( &xml , true , 0 ) ) // niceness is 0
			return sendErrorReply ( st , g_errno );
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww ,
				 &m ,
				 false , // doStemming?
				 st->m_clickAndScroll ,
				 thisUrl ); // base url for click & scroll
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) sb->safeMemcpy ( "</pre>" , 6 );

	/* MDW: return the xml page as is now. 9/28/2014

	int32_t ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nicely
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( ! newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						  sb->getLength(),
						  0 ) ) // niceness=0
			return sendErrorReply ( st , g_errno );
		sb->stealBuf ( &newbuf );
	}
	*/

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws the browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON   ) contentType = "application/json";
	if ( xd->m_contentType == CT_STATUS ) contentType = "application/json";
	if ( xd->m_contentType == CT_XML    ) contentType = "text/xml";

	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage ( s,
						     sb->getBufStart(),
						     sb->getLength(),
						     -1, false,
						     contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete ( st );

	// and convey the status
	return status;
}
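// The (void *)-1 checks above are Gigablast's non-blocking convention: an
// accessor returns a real pointer when ready, NULL with g_errno set on
// error, or -1 when the operation blocked and the registered callback
// (here processLoop itself) will be invoked again later. Below is a
// minimal, self-contained sketch of that tri-state idiom; FakeDoc and
// processExample are illustrative stand-ins, not the real XmlDoc API.
#include <cstdio>

struct FakeDoc {
	int m_calls = 0;
	// pretend to be async: "block" on the first call, succeed after
	const char **getContent() {
		static const char *s_content = "<html>hi</html>";
		if ( ++m_calls < 2 ) return (const char **)-1;
		return &s_content;
	}
};

static bool processExample ( FakeDoc *doc ) {
	const char **c = doc->getContent();
	if ( c == (const char **)-1 ) return false; // blocked: wait for callback
	if ( ! c ) return true;                     // error path: reply + free state
	printf("got content: %s\n", *c);
	return true;                                // completed without blocking
}

int main() {
	FakeDoc doc;
	// the event loop would normally re-invoke us via the callback
	while ( ! processExample(&doc) ) { }
	return 0;
}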
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // take out hopcount for now, just test quotas
			      //"fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
			      // just one spider out allowed for consistency
			      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// use the add url interface now.
	// walmart.com above was not seeded because of the site: directive,
	// so this will seed it.
	//
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com" );
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until the spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[5] ) {
		// wait 3 seconds, call the sleep timer... then call qatest()
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[6] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		if ( s_content && !strstr(s_content,"Nothing currently avail")){
			s_flags[5]  = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}

	// wait for the index msg4 cache to flush to ensure all results are
	// indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
	}

	// verify no results for gbhopcount:2 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2", -1672870556 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0", 908338607 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}

	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	//if ( ! s_flags[13] ) {
	//	s_flags[13] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	//}

	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate", 999 ) )
			return false;
	}

	// the xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799" ,
				999 ) )
			return false;
	}

	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount" , 999 ) )
			return false;
	}

	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score" , 999 ) )
			return false;
	}

	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating" , 999 ) )
			return false;
	}

	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in the left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart" , 999 ) )
			return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER1 TEST");
		return true;
	}

	return true;
}
//
// the injection qa test suite
//
bool qainject1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);

	//
	// inject urls, return false if not done yet
	//
	if ( ! s_flags[2] ) {
		// TODO: try delimiter based injection too
		for ( ; s_flags[20] < max ; ) {
			// inject using the html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}

	// +the
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}

	// sports news
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
			return false;
	}

	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
			return false;
	}

	// mdw: query reindex test
	//if ( ! s_flags[30] ) {
	//	s_flags[30] = true;
	//	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	//			"debug=1&q=sports",9999 ) )
	//		return false;
	//}

	//
	// eject/delete the urls
	//
	for ( ; s_flags[5] < max ; ) {
		// eject using the html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}

	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 1");
		return true;
	}

	return true;
}
bool qaspider2 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites.
	// these should auto seed so no need to use addurl
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		// walmart has too many pages at depth 1, so remove it
		//sb.urlEncode("tag:shallow www.walmart.com\r\n");
		sb.urlEncode("tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until the spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[5] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		if ( s_content && !strstr(s_content,"Nothing currently avail")){
			s_flags[4]  = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}

	// verify no results for gbhopcount:2 query
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2", -1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0", 999 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for some reason
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}

	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	//if ( ! s_flags[12] ) {
	//	s_flags[12] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	//}

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}
// . return the score of the highest-scoring window containing match #m
// . the window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				 int32_t *besta, int32_t *bestb, char *gotIt,
				 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits"; that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t   nw   = words->getNumWords();
	int64_t  *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in
	//   another summary.
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) ||
	     ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone"
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now.
		// also stop if it's the start of a sentence,
		// and stop before a title word.
		if ( (bb[a-1] & D_USED) ||
		     (bb[a] & D_STARTS_SENTENCE) ||
		     (bb[a-1] & D_IN_TITLE) ) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI ||
			       tids[a-1] == TAG_TR ||
			       tids[a-1] == TAG_P  ||
			       tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if it's the start of a quoted sentence
		if ( a+1 < nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ) {
			startOnQuote = true;
			goodStart = true;
			break;
		}

		// find the first instance of a fragment start (comma, etc.)
		// watch out! "frag" also means the 's' in "there's"
		if ( ( bb[a] & D_STARTS_FRAG ) &&
		     !(bb[a-1] & D_IS_STRONG_CONNECTOR) &&
		     firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if we didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let a punct or tag word start a line, unless it is a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ) {
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like an
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let a punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1].
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
			       tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages,
		// e.g. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
			       tids[b] == (BACKBIT|TAG_DIV) ) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1] ) {
		// remove more than one punct word. if we're ending on a quote,
		// keep it
		while ( b > matchWordNum && !wids[b-2] &&
			endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like an
		// apostrophe
		while ( b > matchWordNum &&
			(bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >= a ; mi-- )
		;

	// now get the score of this excerpt. also mark all the represented
	// query words in the array that comes to us, and mark how many times
	// the same word is repeated in this summary.
	int64_t score = 0LL;

	// is a url contained in the summary? that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just approximate. count it right
	wordCount = 0;

	// for debug
	SafeBuf xp;

	// clamp b; shouldn't happen, but be safe
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including the match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug printout
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for ( int32_t k = 0 ; k < len ; k += cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in a bad section: marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count purely numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way is to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&
			     wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not a word id
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match?
		int32_t qwn = next->m_qwordNum;
		if ( qwn < 0 || qwn >= m_q->m_numWords ) {
			g_process.shutdownAbort(true);
		}

		// undo the old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, it contributes no points
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same
				// window, it may not give a good summary.
				// give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",
				       t, qwn, m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	int32_t oldScore = score;

	// apply a bonus if the window starts a sentence or fragment.
	// only apply it if the score is positive and the word count is decent
	if ( score > 0 && wordCount > 7 ) {
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much; make this an
		// 8k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the start of a
		// sentence, let's say within 7 alpha words
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if it's less than 7 words.
	// reduce the score, but still give it a decent score.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad; penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG,
		    "sum: score=%08" PRId32" prescore=%08" PRId32
		    " a=%05" PRId32" b=%05" PRId32" %s",
		    (int32_t)score, oldScore, (int32_t)a, (int32_t)b,
		    xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}
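// Back-of-the-envelope sketch of the scoring rules in getBestWindow()
// above, using constants copied from the code; the window itself is
// invented for illustration. Plain alphabetic words are worth 100 each,
// a query-term match is worth 100000 scaled by its tf weight, windows of
// more than 7 words that start a sentence get +8000, a match within 7
// words of the window start gets +1000, and crossing more than one
// LI/TR/P/DIV boundary costs 20000 per tag crossed.
#include <cstdio>

int main() {
	long score = 0;
	score += 7 * 100;              // seven ordinary alpha words
	score += (long)(100000 * 0.8); // one query match, tf weight 0.8
	score += 8000;                 // window starts a sentence (8 words > 7)
	score += 1000;                 // match sits near the window start
	score -= 2 * 20000;            // crossed two back-tags (numTagsCrossed > 1)
	printf("window score = %ld\n", score); // -> 49700
	return 0;
}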
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; GigablastRequest *gr = &m_gr; // error? char *qts = gr->m_queryToScrape; if ( ! qts ) { char *xx=NULL;*xx=0; } if ( gbstrlen(qts) > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( qts ); // queryUNEncoded ); ebuf.nullTerm(); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; long firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); char *coll2 = gr->m_coll; CollectionRec *cr = g_collectiondb.getRec ( coll2 ); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd.m_useAhrefs = true; m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd.setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd.injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
bool HttpMime::addToCookieJar(Url *currentUrl, SafeBuf *sb) {
	/// @note Slightly modified from the Netscape HTTP Cookie File format.
	/// The difference is we only have one column for name/value.

	// http://www.cookiecentral.com/faq/#3.5
	// The layout of Netscape's cookies.txt file is such that each line
	// contains one name-value pair. An example cookies.txt file may have
	// an entry that looks like this:
	//
	//   .netscape.com  TRUE  /  FALSE  946684799  NETSCAPE_ID  100103
	//
	// Each line represents a single piece of stored information. A tab
	// is inserted between each of the fields. From left-to-right, here
	// is what each field represents:
	//
	// domain     - The domain that created AND that can read the variable.
	// flag       - A TRUE/FALSE value indicating if all machines within
	//              a given domain can access the variable. This value is
	//              set automatically by the browser, depending on the
	//              value you set for domain.
	// path       - The path within the domain that the variable is
	//              valid for.
	// secure     - A TRUE/FALSE value indicating if a secure connection
	//              with the domain is needed to access the variable.
	// expiration - The UNIX time that the variable will expire on. UNIX
	//              time is defined as the number of seconds since
	//              Jan 1, 1970 00:00:00 GMT.
	// name/value - The name/value of the variable.

	/// @todo ALC we should sort the cookie-list
	// The user agent SHOULD sort the cookie-list in the following order:
	// * Cookies with longer paths are listed before cookies with
	//   shorter paths.
	// * Among cookies that have equal-length path fields, cookies with
	//   earlier creation-times are listed before cookies with later
	//   creation-times.

	// fill in cookies from the cookie jar
	std::map<std::string, httpcookie_t> oldCookies;

	const char *cookieJar = sb->getBufStart();
	int32_t cookieJarLen = sb->length();
	const char *lineStartPos = cookieJar;
	const char *lineEndPos = NULL;

	while ((lineEndPos = (const char*)memchr(lineStartPos, '\n', cookieJarLen - (lineStartPos - cookieJar))) != NULL) {
		const char *currentPos = lineStartPos;
		const char *tabPos = NULL;
		unsigned fieldCount = 0;

		httpcookie_t cookie = {};

		while (fieldCount < 5 && (tabPos = (const char*)memchr(currentPos, '\t', lineEndPos - currentPos)) != NULL) {
			switch (fieldCount) {
				case 0:
					// domain
					cookie.m_domain = currentPos;
					cookie.m_domainLen = tabPos - currentPos;
					break;
				case 1:
					// flag
					if (memcmp(currentPos, "TRUE", 4) != 0) {
						cookie.m_defaultDomain = true;
					}
					break;
				case 2:
					// path
					cookie.m_path = currentPos;
					cookie.m_pathLen = tabPos - currentPos;
					break;
				case 3:
					// secure
					cookie.m_secure = (memcmp(currentPos, "TRUE", 4) == 0);
					break;
				case 4:
					// expiration
					break;
			}

			currentPos = tabPos + 1;
			++fieldCount;
		}

		cookie.m_cookie = currentPos;
		cookie.m_cookieLen = lineEndPos - currentPos;

		const char *equalPos = (const char *)memchr(cookie.m_cookie, '=', cookie.m_cookieLen);
		if (equalPos) {
			cookie.m_nameLen = equalPos - cookie.m_cookie;
			oldCookies[std::string(cookie.m_cookie, cookie.m_nameLen)] = cookie;
		}

		lineStartPos = lineEndPos + 1;
	}
	// we don't need to care about the last line (we always end on \n)

	SafeBuf newCookieJar;

	// add old cookies that were not superseded by a new cookie
	for (auto &pair : oldCookies) {
		if (m_cookies.find(pair.first) == m_cookies.end()) {
			addCookie(pair.second, *currentUrl, &newCookieJar);
		}
	}

	// add new cookies
	for (auto &pair : m_cookies) {
		addCookie(pair.second, *currentUrl, &newCookieJar);
	}

	newCookieJar.nullTerm();

	// replace the old jar with the new one
	sb->reset();
	sb->safeMemcpy(&newCookieJar);
	sb->nullTerm();

	return true;
}
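// Self-contained sketch of one line of the (slightly modified) Netscape
// cookie-jar format parsed above: five tab-separated metadata fields,
// then a single joined name=value column. The example line is invented;
// the real jar lives in the SafeBuf passed to addToCookieJar().
#include <cstdio>
#include <cstring>

int main() {
	const char *line =
		".netscape.com\tTRUE\t/\tFALSE\t946684799\tNETSCAPE_ID=100103\n";
	const char *p = line;
	const char *tab;
	int field = 0;
	// domain, flag, path, secure, expiration
	while ( field < 5 && (tab = strchr(p, '\t')) != NULL ) {
		printf("field %d: %.*s\n", field, (int)(tab - p), p);
		p = tab + 1;
		field++;
	}
	// whatever remains before the newline is the name=value column
	const char *nl = strchr(p, '\n');
	printf("name=value: %.*s\n", (int)(nl - p), p);
	return 0;
}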