// Append a flat image of this hash table to "sb": the slot count, the
// used-slot count, then the raw key array and raw value array. An empty
// table writes only the two counts. Returns false if any append failed
// (e.g. the SafeBuf could not grow).
bool HashTableT<Key_t, Val_t>::serialize(SafeBuf& sb) {
	// header: table geometry
	sb += m_numSlots;
	sb += m_numSlotsUsed;
	// nothing else to write for an empty table
	if ( m_numSlots == 0 ) return true;
	// body: dump the slot arrays verbatim, keys first, then values.
	// note: both copies are attempted even if the first fails, matching
	// the original non-short-circuiting &= behavior.
	bool ok = sb.safeMemcpy ( (char*)m_keys , sizeof(Key_t) * m_numSlots );
	ok &= sb.safeMemcpy ( (char*)m_vals , sizeof(Val_t) * m_numSlots );
	return ok;
}
// . callback for Msg7 when one scrape/injection round completes
// . round 1 = google results; after appending them it kicks off the bing
//   round via scrapeQuery() and returns early if that call blocks (this
//   function will be invoked again when it completes)
// . on the final round it closes the XML response, sends it to the client
//   and destroys the Msg7 state — "state" must not be touched afterwards
void doneInjectingLinksWrapper ( void *state ) {
	Msg7 *msg7 = (Msg7 *)state;
	SafeBuf *sb = &msg7->m_sb;
	// copy the serps into our buf
	if ( ! g_errno ) {
		// print header
		if ( sb->length() == 0 ) {
			// print header of page
			sb->safePrintf("<?xml version=\"1.0\" "
				       "encoding=\"UTF-8\" ?>\n"
				       "<response>\n" );
		}
		// serp header
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t<googleResults>\n");
		else
			sb->safePrintf("\t<bingResults>\n");
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t</googleResults>\n");
		else
			sb->safePrintf("\t</bingResults>\n");
	}
	// do bing now
	if ( msg7->m_round == 1 ) {
		// return if it blocks; we get called back into here later
		if ( ! msg7->scrapeQuery() ) return;
	}
	// otherwise, parse out the search results so steve can display them
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// close the response document
	sb->safePrintf("</response>\n");
	// page is not more than 32k
	//char buf[1024*32];
	//char *p = buf;
	// return docid and hostid
	//p += sprintf ( p , "scraping status ");
	// print error msg out, too or "Success"
	//p += sprintf ( p , "%s", mstrerror(g_errno));
	TcpSocket *sock = msg7->m_socket;
	g_httpServer.sendDynamicPage ( sock,
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// hopefully sb buffer is copied because this will free it:
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}
// Load /usr/share/dict/words into the global word buffers: each line is
// appended to s_words NUL-terminated with its trailing newline stripped,
// and its start offset within s_words is recorded in s_windices.
// Possessive entries (lines ending in 's) are skipped.
// Returns true on success, false if the dictionary file cannot be opened.
bool getWords() {
	FILE *fd = fopen ( "/usr/share/dict/words" , "r" );
	if ( ! fd ) {
		log("blaster:: failed to open /usr/share/dict/words %s",
		    mstrerror(errno));
		// BUGFIX: previously "return 1" (i.e. true), which told the
		// caller we succeeded even though nothing was loaded
		return false;
	}
	char tmp[1024];
	while ( fgets ( tmp , 1024 , fd ) ) {
		long len = gbstrlen(tmp);
		// skip possessives, e.g. "cat's" ('s followed by newline)
		if ( len > 2 && tmp[len-2] == 's' && tmp[len-3] == '\'' )
			continue;
		// remember where this word begins in the flat buffer
		s_windices += s_words.length();
		// copy in data minus the newline
		s_words.safeMemcpy(tmp, len-1);
		s_words += '\0';
	}
	fclose ( fd );
	log("blaster: read %li words, %li bytes in from dictionary.",
	    s_windices.length() / sizeof(long), s_words.length());
	return true;
}
// . renders the cached-page ("/get") reply for the doc in st->m_xd
// . re-entrant state machine: every potentially-blocking XmlDoc accessor
//   returns (void*)-1 when it blocks, in which case we return false and
//   are called again (xd->setCallback points back at processLoop)
// . returns false if blocked, true otherwise
// . on any terminal path (error reply or page sent) "st" is destroyed
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;
	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback: re-enter this function when async work finishes
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);
	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE );
	SafeBuf *sb = &st->m_sb;
	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}
	// pick the reply content type from the requested output format
	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}
	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/
	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long len = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}
	// size_utf8Content includes the terminating NUL; exclude it here
	char *content = xd->ptr_utf8Content;
	long contentLen = xd->size_utf8Content - 1;
	// shortcut
	char strip = st->m_strip;
	// alloc buffer now
	//char *buf = NULL;
	//long bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p = buf;
	//char *bufEnd = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}
	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();
	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}
	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
		//p += gbstrlen ( p );
	}
	//char format = st->m_format;
	// xml/json replies get a fresh buffer; the html header above is tossed
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	// for undoing the stuff below
	long startLen2 = sb->length();//p;
	// query should be NULL terminated
	char *q = st->m_q;
	long qlen = st->m_qlen;
	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128] = "font-size:14px;font-weight:400;"
			      "color:#000000;";
	char styleLink[128] = "font-size:14px;font-weight:400;"
			      "color:#0000ff;";
	char styleTell[128] = "font-size:14px;font-weight:600;"
			      "color:#cc0000;";
	// get the url of the title rec
	Url *f = xd->getFirstUrl();
	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;
	// format the spider timestamp once; used by disclaimer and xml/json
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}
	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {
		sb->safePrintf(//sprintf ( p ,
			       //"<BASE HREF=\"%s\">"
			       //"<table border=1 width=100%%>"
			       //"<tr><td>"
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR
			       "\" cellpadding=\"10\" "
			       //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			       "><td>"
			       //"<font face=times,sans-serif color=black size=-1>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p ,
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>", styleTitle );
		//p += gbstrlen ( p );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		//p += gbstrlen ( p );
		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
			//p += gbstrlen ( p );
		}
	}
	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long ip = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );
	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            , // content being highlighted, utf8
		 qlen         , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         , // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;
	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				 sb ,
				 &qw , // words to highlight
				 &m , // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}
	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;
	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;
	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should (roll the buffer length back
	// to a checkpoint taken before the header was printed)
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}
	//sb->safeStrcpy(tbuf);
	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	// scan the buffer for the first <title>...</title> span (ascii,
	// case-insensitive; only looks 500 bytes past the open tag)
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}
	// . print title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		//p += sprintf ( p ,
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p ,
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );
		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}
	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;
	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}
	// strip mode 1: remove html in place, shrinking contentLen
	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		return sendErrorReply ( st , g_errno );
	}
	Xml xml;
	Words ww;
	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;
	// xml/json output stages the content in "tmp" so it can be
	// cdata/json-encoded below; html writes straight into sb
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}
	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}
	// calculate bufLen
	//long bufLen = p - buf;
	long ct = xd->m_contentType;
	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into <tagname> entity sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer
	//if ( ct == CT_XML ) newbuf.purge();
	//else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
	// and convey the status
	return status;
}
// . builds the "turk" markup view of a cached page: each controllable
//   section is wrapped in a clickable <div> keyed by its tag hash so a
//   human editor can toggle sections on/off
// . NOTE(review): the SafeBuf "sb" built here is never sent before the
//   final "return false" — looks like an unfinished feature; confirm
//   against the caller before relying on this path
// . BUGFIX: "endCounts" (mcalloc'd) was leaked on the normal exit path;
//   it is now mfree'd before returning
bool sendTurkPageReply ( State60 *st ) {
	XmlDoc *xd = &st->m_xd;
	//char *content      = xd->ptr_utf8Content;
	//int32_t  contentLen   = xd->size_utf8Content - 1;
	// count the total number of EventDesc classes for all evids
	//char *evd = xd->ptr_eventData;
	//EventDisplay *ed = (EventDisplay *)evd;
	//char *addr = evd + (int32_t)ed->m_addr;
	//char timeZoneOffset = getTimeZoneFromAddr ( addr );
	// in case getSections() blocks come right back in
	xd->setCallback ( st , xdcallback );
	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;
	// default to 1 niceness
	st->m_niceness = 1;
	// now set the sections class
	Sections *ss = xd->getSections();
	// now for each section with alnum text, telescope up as far as
	// possible without containing anymore alnum text than what it
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}
	// * now each SEC_CONTROL section has a fence activated by a turker
	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".
	// * hold shift down to designate as title section when clicking it
	// * show the raw text of each event changing as you fence
	//   sections in or out. show in a right frame.
	// * show list of events on page in the top frame. can toggle them
	//   all individually.
	// * each section hash has its own unique bg color when activated
	// * with a single click, completely reject an event because:
	//   contains bad time, address, title or desc. specify which so
	//   we can improve our algo.
	// * when an event id is selected, activate its bgcolor for all
	//   sentences currently in the event that are not in activated
	//   sections; each event sentence div carries one attribute per
	//   event id it belongs to, e.g. <div ev1=1 ev2=1 ev10=1>...</div>
	SafeBuf sb;
	// shortcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words    *words = &xd->m_words;
	int32_t   nw    = words->getNumWords();
	char    **wptrs = words->getWords();
	int32_t  *wlens = words->getWordLens();
	nodeid_t *tids  = words->getTagIds();
	// a special array for printing </div> tags: endCounts[i] is how many
	// controlled sections end right after word #i
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );
	//
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, print out the section hash and a color
	// tag to be activated if the turker activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ i
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkers have made all the edits
				// they need to submit the changes they made;
				// the activated section colors are the data
				// sent back to the back end
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b < 0  ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}
	// BUGFIX: release the scratch array (was leaked before)
	mfree ( endCounts , nw , "endcounts" );
	return false;
}
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." 
"</td></tr>", endTime - st->m_startTime ); // print Url Catgory Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4098, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , pend - p ); // 
clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }
bool Msg3a::gotAllSplitReplies ( ) { // if any of the split requests had an error, give up and set m_errno // but don't set if for non critical errors like query truncation if ( m_errno ) { g_errno = m_errno; return true; } // also reset the finalbuf and the oldNumTopDocIds if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // update our estimated total hits m_numTotalEstimatedHits = 0; for ( long i = 0; i < m_numHosts ; i++ ) { // get that host that gave us the reply //Host *h = g_hostdb.getHost(i); // . get the reply from multicast // . multicast should have destroyed all slots, but saved reply // . we are responsible for freeing the reply // . we need to call this even if g_errno or m_errno is // set so we can free the replies in Msg3a::reset() // . if we don't call getBestReply() on it multicast should // free it, because Multicast::m_ownReadBuf is still true Multicast *m = &m_mcast[i]; bool freeit = false; long replySize = 0; long replyMaxSize; char *rbuf; Msg39Reply *mr; // . only get it if the reply not already full // . if reply already processed, skip // . perhaps it had no more docids to give us or all termlists // were exhausted on its disk and this is a re-call // . we have to re-process it for count m_numTotalEstHits, etc. rbuf = m->getBestReply ( &replySize , &replyMaxSize , &freeit , true ); //stealIt? // cast it mr = (Msg39Reply *)rbuf; // in case of mem leak, re-label from "mcast" to this so we // can determine where it came from, "Msg3a-GBR" relabel( rbuf, replyMaxSize , "Msg3a-GBR" ); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeit ) { log(LOG_LOGIC,"query: msg3a: Steal failed."); char *xx = NULL; *xx=0; } // bad reply? if ( ! 
mr ) { log(LOG_LOGIC,"query: msg3a: Bad NULL reply."); m_reply [i] = NULL; m_replyMaxSize[i] = 0; // it might have been timd out, just ignore it!! continue; // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; // all reply buffers should be freed on reset() return true; } // how did this happen? if ( replySize < 29 && ! mr->m_errno ) { // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.", replySize); // all reply buffers should be freed on reset() return true; } // can this be non-null? we shouldn't be overwriting one // without freeing it... if ( m_reply[i] ) // note the mem leak now log("query: mem leaking a 0x39 reply"); // cast it and set it m_reply [i] = mr; m_replyMaxSize[i] = replyMaxSize; // deserialize it (just sets the ptr_ and size_ member vars) //mr->deserialize ( ); deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); // sanity check if ( mr->m_nqt != m_q->getNumTerms() ) { g_errno = EBADREPLY; m_errno = EBADREPLY; log("query: msg3a: Split reply qterms=%li != %li.", (long)mr->m_nqt,(long)m_q->getNumTerms() ); return true; } // return if split had an error, but not for a non-critical // error like query truncation if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) { g_errno = mr->m_errno; m_errno = mr->m_errno; log("query: msg3a: Split had error: %s", mstrerror(g_errno)); return true; } // skip down here if reply was already set //skip: // add of the total hits from each split, this is how many // total results the lastest split is estimated to be able to // return // . THIS should now be exact since we read all termlists // of posdb... m_numTotalEstimatedHits += mr->m_estimatedHits; // debug log stuff if ( ! 
m_debug ) continue; // cast these for printing out long long *docIds = (long long *)mr->ptr_docIds; score_t *scores = (score_t *)mr->ptr_scores; // print out every docid in this split reply for ( long j = 0; j < mr->m_numDocIds ; j++ ) { // print out score_t logf( LOG_DEBUG, "query: msg3a: [%lu] %03li) " "split=%li docId=%012llu domHash=0x%02lx " "score=%lu" , (unsigned long)this , j , i , docIds [j] , (long)g_titledb.getDomHash8FromDocId(docIds[j]), (long)scores[j] ); } } // this seems to always return true! mergeLists ( ); if ( ! m_r->m_useSeoResultsCache ) return true; // now cache the reply SafeBuf cr; long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4); long need = sizeof(key_t) + 4 + dataSize; bool status = cr.reserve ( need ); // sanity if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; } // ignore errors g_errno = 0; // return on error with g_errno cleared if cache add failed if ( ! status ) return true; // add to buf otherwise cr.safeMemcpy ( &m_ckey , sizeof(key_t) ); cr.safeMemcpy ( &dataSize , 4 ); long now = getTimeGlobal(); cr.pushLong ( now ); cr.pushLong ( m_numDocIds ); cr.pushLong ( m_numTotalEstimatedHits );//Results ); long max = m_numDocIds; // then the docids for ( long i = 0 ; i < max ; i++ ) cr.pushLongLong(m_docIds[i] ); for ( long i = 0 ; i < max ; i++ ) cr.pushFloat(m_scores[i]); for ( long i = 0 ; i < max ; i++ ) cr.pushLong(getSiteHash26(i)); // sanity if ( cr.length() != need ) { char *xx=NULL; *xx=0; } // make these key_t startKey; key_t endKey; startKey = m_ckey; // clear delbit startKey.n0 &= 0xfffffffffffffffeLL; // end key is us endKey = m_ckey; // that is the single record m_seoCacheList.set ( cr.getBufStart() , cr.length(), cr.getBufStart(), // alloc cr.getCapacity(), // alloc size (char *)&startKey, (char *)&endKey, -1, // fixeddatasize true, // owndata? false,// use half keys? 
sizeof(key_t) ); // do not allow cr to free it, msg1 will cr.detachBuf(); // note it //log("seopipe: storing ckey=%s q=%s" // ,KEYSTR(&m_ckey,12) // ,m_r->ptr_query // ); //log("msg1: sending niceness=%li",(long)m_r->m_niceness); // this will often block, but who cares!? it just sends a request off if ( ! m_msg1.addList ( &m_seoCacheList , RDB_SERPDB,//RDB_CACHEDB, m_r->ptr_coll, this, // state gotSerpdbReplyWrapper, // callback false, // forcelocal? m_r->m_niceness ) ) { //log("blocked"); return false; } // we can safely delete m_msg17... just return true return true; }
// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
// . "lasta" is in/out: the left fence of the previous window, used to avoid
//   producing overlapping excerpts; updated to this window's "a" on return
// . "gotIt"/"retired" are per-query-word counters used to penalize query
//   terms already represented in this window / in previously chosen windows
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				 int32_t *besta, int32_t *bestb,
				 char *gotIt, char *retired,
				 int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut: section ptrs are optional (NULL if doc had no sections)
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in
	//   another summary.
	// . also refuse to window a match sitting in script/style/select
	//   sections or in the title
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) ||
	     ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone",
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now
		// then stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) ||
		     (bb[a] & D_STARTS_SENTENCE) ||
		     ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}
		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
			       tids[a-1] == TAG_TR ||
			       tids[a-1] == TAG_P ||
			       tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}
		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart = true;
			break;
		}
		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) &&
		     !(bb[a-1] & D_IS_STRONG_CONNECTOR) &&
		     firstFrag == -1 ) {
			firstFrag = a;
		}
		// count alphanumeric words (wids[a]==0 means punct/tag)
		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;
		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) ) a++;
		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	// now extend the right fence post "b" as far as we can
	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}
		// stay within the excerpt character budget
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		// if we started on an open quote, remember a closing quote
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}
		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}
		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}
		if ( wids[b] ) {
			wordCount++;
		}
		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
			       tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;
			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}
		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
			       tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;
			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		while ( b > matchWordNum && !wids[b-2] &&
			endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}
		// do not break right after a "strong connector", like
		// apostrophe
		while ( b > matchWordNum &&
			(bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as int32_t as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug: human-readable dump of the candidate excerpt
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// wtf? clamp b in case the phrase extension pushed it past the end
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out (skip binary utf8 chars so the log is clean)
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}
		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}
		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}
		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' &&
			     wrd[1] == '/' && wrd[2] == '/' ) {
				hasUrl = true;
			}
		}
		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}
		// just make every word 100 pts
		int32_t t = 100;
		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}
		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}
		// add the score for this word
		score += t;
		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}
		// count the alpha words we got
		wordCount++;
		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}
		// get the match
		Match *next = &ms[mi];
		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}
		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}
		// advance it
		mi++;
		// which query word # does it match
		int32_t qwn = next->m_qwordNum;
		if ( qwn < 0 || qwn >= m_q->m_numWords ){
			g_process.shutdownAbort(true);}
		// undo old score
		score -= t;
		// add 100000 per match
		t = 100000;
		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);
		// if it is a query stop word, make it 10000 pts
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}
		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}
		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same
				// window, it may not give a good summary.
				// give a heavy penalty
				t -= 200000;
			}
		}
		else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}
		// add it back
		score += t;
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",
				       t,qwn,
				       m_wordWeights[qwn]);
		}
		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// NOTE(review): narrows the 64-bit score to 32 bits; only used in the
	// debug log below so overflow is presumably harmless -- confirm
	int32_t oldScore = score;

	// apply the bonus if it starts or a sentence
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this a 20k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}
		// 1k if the match word is very close to the
		// start of a sentence, lets say 3 alphawords
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// . a summary isn't really a summary if its less than 7 words.
	//   reduce the score, but still give it a decent score.
	// . (stale comment said "minus 5M"; the code subtracts 20000)
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32
		    " a=%05" PRId32" b=%05" PRId32" %s",
		    (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		    xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}
//
// . ENTRY POINT FOR IMPORTING TITLEDB RECS FROM ANOTHER CLUSTER
// . when user clicks 'begin' in import page we come here..
// . so when that parm changes in Parms.cpp we sense that and call
//   beginImport(CollectionRec *cr)
// . or on startup we call resumeImports to check each coll for
//   an import in progress.
// . search for files named titledb*.dat
// . if none found just return
// . when msg7 inject competes it calls this
// . call this from sleep wrapper in Process.cpp
// . returns false if would block (outstanding injects), true otherwise
// . sets g_errno on error
// . NOTE: this object deletes itself ("delete (this)") when the collection
//   is gone or the import is disabled -- callers must not touch it after a
//   true return in those paths
bool ImportState::importLoop ( ) {

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// only host #0 runs the import; also bail if coll was deleted
	if ( ! cr || g_hostdb.m_hostId != 0 ) {
		// if coll was deleted!
		log("import: collnum %li deleted while importing into",
		    (long)m_collnum);
		//if ( m_numOut > m_numIn ) return true;
		// delete the entire import state i guess
		// what happens if we have a msg7 reply come back in?
		// it should see the collrec is NULL and just fail.
		mdelete ( this, sizeof(ImportState) , "impstate");
		delete (this);
		return true;
	}

 INJECTLOOP:

	// stop if waiting on outstanding injects
	long long out = m_numOut - m_numIn;
	if ( out >= cr->m_numImportInjects ) {
		g_errno = 0;
		return false;
	}

	if ( ! cr->m_importEnabled ) {
		// wait for all to return
		if ( out > 0 ) return false;
		// then delete it
		log("import: collnum %li import loop disabled",
		    (long)m_collnum);
		mdelete ( this, sizeof(ImportState) , "impstate");
		delete (this);
		return true;
	}

	// scan each titledb file scanning titledb0001.dat first,
	// titledb0003.dat second etc.
	//long long offset = -1;
	// . when offset is too big for current m_bigFile file then
	//   we go to the next and set offset to 0.
	// . sets m_bf and m_fileOffset
	if ( ! setCurrentTitleFileAndOffset ( ) ) {//cr , -1 );
		log("import: import: no files to read");
		//goto INJECTLOOP;
		return true;
	}

	// this is -1 if none remain!
	if ( m_fileOffset == -1 ) {
		log("import: import fileoffset is -1. done.");
		return true;
	}

	// remember where this rec started in the foreign file so the
	// multicast can carry it as a resume bookmark (see m_hackFileOff)
	long long saved = m_fileOffset;

	//Msg7 *msg7;
	//GigablastRequest *gr;
	//SafeBuf *sbuf = NULL;

	// request layout: collnum(4) + key(12) + dataSize(4) + data
	long need = 12;
	long dataSize = -1;
	//XmlDoc xd;
	key_t tkey;
	bool status;
	SafeBuf tmp;
	SafeBuf *sbuf = &tmp;
	long long docId;
	long shardNum;
	long key;
	Multicast *mcast;
	char *req;
	long reqSize;

	if ( m_fileOffset >= m_bfFileSize ) {
		log("inject: import: done processing file %li %s",
		    m_bfFileId,m_bf.getFilename());
		goto nextFile;
	}

	// read in title rec key and data size
	status = m_bf.read ( &tkey, sizeof(key_t) , m_fileOffset );

	//if ( n != 12 ) goto nextFile;
	if ( g_errno ) {
		log("inject: import: reading file error: %s. advancing "
		    "to next file",mstrerror(g_errno));
		goto nextFile;
	}

	m_fileOffset += 12;

	// if negative key (delete rec), skip -- it carries no data
	if ( (tkey.n0 & 0x01) == 0 ) {
		goto INJECTLOOP;
	}

	// if non-negative then read in size
	status = m_bf.read ( &dataSize , 4 , m_fileOffset );
	if ( g_errno ) {
		log("main: failed to read in title rec "
		    "file. %s. Skipping file %s",
		    mstrerror(g_errno),m_bf.getFilename());
		goto nextFile;
	}

	m_fileOffset += 4;

	need += 4;
	need += dataSize;
	need += 4; // collnum, first 4 bytes

	// sanity-bound the size so a corrupt rec can't make us allocate wild
	if ( dataSize < 0 || dataSize > 500000000 ) {
		log("main: could not scan in titledb rec of "
		    "corrupt dataSize of %li. BAILING ENTIRE "
		    "SCAN of file %s",dataSize,m_bf.getFilename());
		goto nextFile;
	}

	//gr = &msg7->m_gr;

	//XmlDoc *xd = getAvailXmlDoc();
	//msg7 = getAvailMsg7();
	mcast = getAvailMulticast();

	// if none, must have to wait for some to come back to us
	if ( ! mcast ) {
		// restore file offset
		//m_fileOffset = saved;
		// no, must have been a oom or something
		log("import: import no mcast available");
		return true;//false;
	}

	// this is for holding a compressed titlerec
	//sbuf = &mcast->m_sbuf;//&gr->m_sbuf;

	// point to start of buf
	sbuf->reset();
	// ensure we have enough room
	sbuf->reserve ( need );
	// collnum first 4 bytes
	sbuf->pushLong( (long)m_collnum );
	// store title key
	sbuf->safeMemcpy ( &tkey , sizeof(key_t) );
	// then datasize if any. neg rec will have -1 datasize
	if ( dataSize >= 0 )
		sbuf->pushLong ( dataSize );
	// then read data rec itself into it, compressed titlerec part
	if ( dataSize > 0 ) {
		// read in the titlerec after the key/datasize
		status = m_bf.read ( sbuf->getBuf() ,
				     dataSize ,
				     m_fileOffset );
		if ( g_errno ) { // n != dataSize ) {
			log("main: failed to read in title rec "
			    "file. %s. Skipping file %s",
			    mstrerror(g_errno),m_bf.getFilename());
			// essentially free up this msg7 now
			//msg7->m_inUse = false;
			//msg7->reset();
			goto nextFile;
		}
		// advance
		m_fileOffset += dataSize;
		// it's good, count it (read() filled the buf directly, so
		// bump the length by hand)
		sbuf->m_length += dataSize;
	}

	// set xmldoc from the title rec
	//xd->set ( sbuf.getBufStart() );
	//xd->m_masterState = NULL;
	//xd->m_masterCallback ( titledbInjectLoop );

	// we use this so we know where the doc we are injecting
	// was in the foregien titledb file. so we can update our bookmark
	// code.
	mcast->m_hackFileOff = saved;//m_fileOffset;
	mcast->m_hackFileId  = m_bfFileId;

	//
	// inject a title rec buf this time, we are doing an import
	// FROM A TITLEDB FILE!!!
	//
	// (an earlier Msg7/GigablastRequest-based injection path was
	// removed in favor of forwarding the raw titlerec via multicast;
	// see version history for the old gr->m_* setup)

	// get docid from key
	docId = g_titledb.getDocIdFromKey ( &tkey );

	// get shard that holds the titlerec for it
	shardNum = g_hostdb.getShardNumFromDocId ( docId );

	// for selecting which host in the shard receives it
	key = (long)docId;

	m_numOut++;

	req = sbuf->getBufStart();
	reqSize = sbuf->length();

	// sanity: we must have filled exactly what we reserved
	if ( reqSize != need ) { char *xx=NULL;*xx=0 ; }

	// do not free it, let multicast free it after sending it
	sbuf->detachBuf();

	// 0x07 is the inject request msg type
	if ( ! mcast->send ( req ,
			     reqSize ,
			     0x07 ,
			     true , // ownmsg?
			     shardNum,
			     false, // send to whole shard?
			     key , // for selecting host in shard
			     mcast , // state
			     NULL , // state2
			     gotMulticastReplyWrapper ,
			     999999 ) ) { // total timeout in seconds
		log("import: import mcast had error: %s",mstrerror(g_errno));
		// count it as returned so out/in bookkeeping stays balanced
		m_numIn++;
	}

	goto INJECTLOOP;

 nextFile:
	// invalidate this flag
	//m_offIsValid = false;
	// . and call this function. we add one to m_bfFileId so we
	//   do not re-get the file we just injected.
	// . sets m_bf and m_fileOffset
	// . returns false if nothing to read
	if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , m_bfFileId+1 );
		log("import: import: no files left to read");
		//goto INJECTLOOP;
		return true;
	}

	// if it returns NULL we are done!
	log("main: titledb injection loop completed. waiting for "
	    "outstanding injects to return.");

	if ( m_numOut > m_numIn ) return false;

	log("main: all injects have returned. DONE.");

	// dummy return
	return true;
}
bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) { SafeBuf p; char getBuf[64]; // holds extra values for GET method char formBuf[256]; // holds extra values for forms snprintf(getBuf, 64, "c=%s", r->getString("c", 0, "")); snprintf(formBuf, 256, "<input type=hidden name=\"c\" value=\"%s\">", //"<input type=hidden name=\"pwd\" value=\"%s\">", r->getString("c", 0, "")); g_pages.printAdminTop( &p, s, r); if (r->getLong("cancel", 0) != 0) { g_thesaurus.cancelRebuild(); p.safePrintf("<br><br>\n"); p.safePrintf( "<center><b><font color=#ff0000>" "rebuild canceled" "</font></b></center>"); } if (r->getLong("rebuild", 0) != 0) { bool full = r->getLong("full", 0); p.safePrintf("<br><br>\n"); if (g_thesaurus.rebuild(0, full)) { p.safePrintf( "<center><b><font color=#ff0000>" "error starting rebuild, check log for details" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "rebuild started" "</font></b></center>"); } } if (r->getLong("rebuildaff", 0) != 0) { bool full = r->getLong("full", 0); p.safePrintf("<br><br>\n"); if (g_thesaurus.rebuildAffinity(0, full)) { p.safePrintf( "<center><b><font color=#ff0000>" "error starting rebuild, check log for details" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "rebuild started" "</font></b></center>"); } } if (r->getLong("distribute", 0) != 0) { char cmd[1024]; p.safePrintf("<br><br>\n"); if (g_thesaurus.m_affinityState) { p.safePrintf( "<center><b><font color=#ff0000>" "cannot distribute during rebuild" "</font></b></center>"); } else { for ( long i = 0; i < g_hostdb.getNumHosts() ; i++ ) { Host *h = g_hostdb.getHost(i); snprintf(cmd, 512, "rcp -r " "./dict/thesaurus.* " "%s:%s/dict/ &", iptoa(h->m_ip), h->m_dir); log(LOG_INFO, "admin: %s", cmd); system( cmd ); } p.safePrintf( "<center><b><font color=#ff0000>" "data distributed" "</font></b></center>"); } } if (r->getLong("reload", 0) != 0) { p.safePrintf("<br><br>\n"); if (r->getLong("cast", 0) != 0) { 
p.safePrintf( "<center><b><font color=#ff0000>" "reload command broadcast" "</font></b></center>"); } else if (g_thesaurus.init()) { p.safePrintf( "<center><b><font color=#ff0000>" "thesaurus data reloaded" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error reloading thesaurus data" "</font></b></center>"); } } long manualAddLen = 0; char *manualAdd = NULL; SafeBuf manualAddBuf; if ((manualAdd = r->getString("manualadd", &manualAddLen))) { trimWhite(manualAdd); manualAddLen = gbstrlen(manualAdd); File manualFile; manualFile.set(g_hostdb.m_dir, "dict/thesaurus-manual.txt"); if (manualFile.open(O_WRONLY | O_CREAT | O_TRUNC) && (manualFile.write(manualAdd, manualAddLen, 0) == manualAddLen)) { char newl = '\n'; // for write() if (manualAdd[manualAddLen-1] != '\n') manualFile.write(&newl, 1, manualAddLen); p.safePrintf( "<center><b><font color=#ff0000>" "updated manual add file sucessfully" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error writing manual add file" "</font></b></center>"); } } else { char ff[PATH_MAX]; snprintf(ff, PATH_MAX, "%sdict/thesaurus-manual.txt", g_hostdb.m_dir); if (manualAddBuf.fillFromFile(ff)) { if (*(manualAddBuf.getBuf()-1) != '\n') manualAddBuf.pushChar('\n'); manualAdd = manualAddBuf.getBufStart(); manualAddLen = manualAddBuf.length(); } } long affinityAddLen = 0; char *affinityAdd = NULL; SafeBuf affinityAddBuf; if ((affinityAdd = r->getString("affinityadd", &affinityAddLen))) { trimWhite(affinityAdd); affinityAddLen = gbstrlen(affinityAdd); File affinityFile; affinityFile.set(g_hostdb.m_dir, "dict/thesaurus-affinity.txt"); if (affinityFile.open(O_WRONLY | O_CREAT | O_TRUNC) && (affinityFile.write(affinityAdd, affinityAddLen, 0) == affinityAddLen)) { char newl = '\n'; // for write() if (affinityAdd[affinityAddLen-1] != '\n') affinityFile.write(&newl, 1, affinityAddLen); p.safePrintf( "<center><b><font color=#ff0000>" "updated affinity add file sucessfully" 
"</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error writing affinity add file" "</font></b></center>"); } } else { char ff[PATH_MAX]; snprintf(ff, PATH_MAX, "%sdict/thesaurus-affinity.txt", g_hostdb.m_dir); if (affinityAddBuf.fillFromFile(ff)) { if (*(affinityAddBuf.getBuf()-1) != '\n') affinityAddBuf.pushChar('\n'); affinityAdd = affinityAddBuf.getBufStart(); affinityAddLen = affinityAddBuf.length(); } } char *syn = r->getString("synonym"); long len = 0; if (syn) len = gbstrlen(syn); if (len) { SynonymInfo info; bool r = g_thesaurus.getAllInfo(syn, &info, len, SYNBIT_ALL); p.safePrintf("<br><br>\n"); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Synonym List (%ld)</b></center>" "</td>" "</tr>\n", LIGHT_BLUE, DARK_BLUE, info.m_numSyns); if (r) { p.safePrintf("<tr>" "<td align=right><tt>%s</tt></td>" "<td align=left>" "<tt>1.000/%08lX (1.000/%08lX)</tt>" "</td>" "</tr>\n", syn, MAX_AFFINITY, MAX_AFFINITY); for (long i = 0; i < info.m_numSyns; i++) { // get the reverse affinity as well long aff = g_thesaurus.getAffinity( info.m_syn[i], syn, info.m_len[i], len); p.safePrintf( "<tr>" "<td width=40%% align=right>" "<tt>"); p.safeMemcpy(info.m_syn[i], info.m_len[i]); p.safePrintf("</tt>" "</td>" "<td width=60%% align=left>" "<tt>"); if (info.m_affinity[i] >= 0) { p.safePrintf("%0.3f/%08lX ", (float)info.m_affinity[i] / MAX_AFFINITY, info.m_affinity[i]); } else { p.safePrintf("u "); } if (aff >= 0) { p.safePrintf("(%0.3f/%08lX) ", (float)aff / MAX_AFFINITY, aff); } else { p.safePrintf("(u) "); } p.safePrintf("(%ld) (%ld) (%ld) (%ld) " "(%lld) (%lld)", (long)info.m_type[i], (long)info.m_sort[i], info.m_firstId[i], info.m_lastId[i], info.m_leftSynHash[i], info.m_rightSynHash[i]); for (int j = info.m_firstId[i]; j <= info.m_lastId[i]; j++) { p.safePrintf(" (%lld)", info.m_termId[j]); } p.safePrintf( "</tt>" "</td>" "</tr>\n"); } p.safePrintf("</table>"); } else { 
p.safePrintf("<tr>" "<td align=center><font color=#FF0000>" "synonym not found: %s" "</font></td>" "</tr>\n", syn); } } p.safePrintf ( "<br><br>\n" ); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Thesaurus Controls" "</b></center></td>" "</tr>\n", LIGHT_BLUE, DARK_BLUE); p.safePrintf ( "<tr>" "<td width=37%%><b>rebuild all data</b><br>" "<font size=1>" "rebuilds synonyms and then begins the rebuild process for " "affinity data; this should only be run on one host, as the " "data is copied when the process is finished; full rebuild " "does not use existing affinity data" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">" "rebuild all data</a> <a href=\"/master/thesaurus?" "rebuild=1&full=1&%s\">(full)</a></b></center>" "</td>" "</tr>\n", getBuf, getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>distribute data</b><br>" "<font size=1>" "distributes all thesaurus data to all hosts, this is " "normally done automatically but if there was a problem " "with the copy, this lets you do it manually" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?distribute=1&%s\">" "distribute data</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>reload data</b><br>" "<font size=1>" "reloads the synonyms and affinity table on this host only" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b>" "<a href=\"/master/thesaurus?reload=1&cast=0&%s\">" "reload data</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>reload data (all hosts)</b><br>" "<font size=1>" "reloads the synonyms and affinity table on all hosts" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b>" "<a href=\"/master/thesaurus?reload=1&cast=1&%s\">" "reload data (all hosts)</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td 
width=37%%><b>list synonyms</b><br>" "<font size=1>" "enter a word here to list all synonym entries and their " "affinities" "</font>" "</td>" "<td width=12%%>" "<form action=\"/master/thesaurus>\">" "<input type=text name=synonym size=20>" "<input type=submit value=Submit>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Affinity Controls" "</b></center></td>" "</tr>\n", DARK_BLUE); p.safePrintf ( "<tr>" "<td width=37%%><b>cancel running rebuild</b><br>" "<font size=1>" "cancels the rebuild and throws all intermediate data away" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?cancel=1&%s\">" "cancel running rebuild</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>rebuild affinity only</b><br>" "<font size=1>" "begins the rebuild process for affinity data, has no " "effect if a rebuild is already in progress; full rebuild " "does not reuse existing affinity data" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">" "rebuild affinity</a> <a href=\"/master/thesaurus?" 
"rebuildaff=1&full=1&%s\">(full)</a></b></center>" "</td>" "</tr>\n", getBuf, getBuf); p.safePrintf ( "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Manual File Controls" "</b></td>" "</tr>\n", DARK_BLUE); p.safePrintf ( "<tr>" "<td align=center colspan=2>"); p.safePrintf( "<b>manually added pairs</b><br>\n" "<font size=1>place word pairs here that should be linked " "as synonyms, one pair per line, seperated by a pipe '|' " "character, optionally followed by another pipe and a type " "designation; any badly formatted lines will be silently " "ignored</font><br>\n" "<form action=\"/master/thesaurus\" method=post>" "<textarea name=\"manualadd\" rows=20 cols=80>"); if (manualAdd && manualAddLen) { p.htmlEncode(manualAdd, manualAddLen, true); } p.safePrintf ( "</textarea><br>" "<input type=submit value=Submit>" "<input type=reset value=Reset>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "<tr>" "<td align=center colspan=2>" "<b>affinity value overrides</b><br>\n" "<font size=1>place word/phrase pairs here that should have " "there affinity values overridden, format is " "\"word1|word2|value\", where value is a floating point, " "integer (either decimal or hex), or the word \"max\"; " "any badly formatted lines will be silently ignored; note " "that these pairs will only work if the thesaurus otherwise " "has an entry for them, so add them to the manual add file " "above if need be</font><br>\n" "<form action=\"/master/thesaurus\" method=post>" "<textarea name=\"affinityadd\" rows=20 cols=80>"); if (affinityAdd && affinityAddLen) { p.htmlEncode(affinityAdd, affinityAddLen, true); } p.safePrintf ( "</textarea><br>" "<input type=submit value=Submit>" "<input type=reset value=Reset>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "</table>\n" ); p.safePrintf ( "<br><br>\n" ); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Affinity Builder Status" "</b></td>" "</tr>\n", 
LIGHT_BLUE, DARK_BLUE); long long a, b, c, d, e, f, g, h, i, j, k; StateAffinity *aff = g_thesaurus.m_affinityState; if (!aff) { p.safePrintf ( "<tr><td colspan=2>" "<center><b>Not running</b></center>" "</td></tr>\n"); a = b = c = d = e = f = g = h = i = j = k = 0; } else { a = aff->m_oldTable->getNumSlotsUsed(); b = aff->m_oldTable->getNumSlotsUsed() - aff->m_n; c = aff->m_n; d = (gettimeofdayInMilliseconds() - aff->m_time) / 1000; if (!d || !(c / d)) { e = 0; } else { e = b / (c / d); } f = aff->m_sent; g = aff->m_recv; h = aff->m_errors; i = aff->m_old; j = aff->m_cache; k = aff->m_hitsTable.getNumSlotsUsed(); } p.safePrintf ( "<tr><td><b># of total pairs</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of pairs remaining</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of pairs processed</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>elapsed time in seconds</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>estimated remaining time in seconds</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of requests sent</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of requests received</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of request errors</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of old values reused</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of cache hits</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>cache size</b></td>" "<td>%lli</td></tr>\n", a, b, c, d, e, f, g, h, i, j, k); p.safePrintf ( "</table>\n" ); return g_httpServer.sendDynamicPage ( s, p.getBufStart(), p.length() ); }
// this should be called when all docs have finished spidering void Test::stopIt ( ) { // sanity if ( m_isAdding ) { char *xx=NULL;*xx=0; } // flag that we are done m_isRunning = false; // print time log("test: took %lli ms to complete injections.", gettimeofdayInMilliseconds() - m_testStartTime ); // get this before setting testParserEnabled to false char *testDir = g_test.getTestDir(); // turn this off now too g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // save all! bool disabled = g_threads.m_disabled; g_threads.disableThreads(); // save it blocking style g_process.save(); if ( ! disabled ) g_threads.enableThreads(); // save ips.txt saveTestBuf ( testDir ); log("test: test completed. making qa.html"); // // // NOW MAKE THE qa.html FILE // // // only analyze up to last 7 runs long start = m_runId - 7; if ( start < 0 ) start = 0; SafeBuf sb; sb.safePrintf("<table border=1>\n"); sb.safePrintf("<tr>" "<td><b><nobr>run id</nobr></b></td>" "<td><b><nobr>conf diff</nobr></b></td>" "<td><b><nobr>coll diff</nobr></b></td>" "<td><b><nobr>run info</nobr></b></td>" "</tr>\n"); // take diffs between this run and the last run for confparms for ( long i = m_runId ; i > start ; i-- ) { // shortcut char *dir = g_hostdb.m_dir; // make diff filename char diff1[200]; sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir, testDir,i); File f1; f1.set(diff1); if ( ! f1.doesExist() ) { char df1[200]; char df2[200]; sprintf(df1,"%s/%s/run.%li.confparms.txt",dir, testDir,i); sprintf(df2,"%s/%s/run.%li.confparms.txt",dir, testDir,i-1); // do the diff char cmd[600]; sprintf(cmd,"diff %s %s > %s",df1,df2,diff1); log("test: system(\"%s\")",cmd); system (cmd); } long fs1 = f1.getFileSize(); sb.safePrintf("<tr><td>%li</td><td>%li</td>", i,fs1); // make diff filename char diff2[200]; sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir, testDir,i); File f2; f2.set(diff2); if ( ! 
f2.doesExist() ) { char df1[200]; char df2[200]; sprintf(df1,"%s/%s/run.%li.collparms.txt",dir, testDir,i); sprintf(df2,"%s/%s/run.%li.collparms.txt",dir, testDir,i-1); // do the diff char cmd[600]; sprintf(cmd,"diff %s %s > %s",df1,df2,diff2); log("test: system(\"%s\")",cmd); system (cmd); } long fs2 = f2.getFileSize(); sb.safePrintf("<td>%li</td>", fs2); // the version char vf[200]; sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i); File f3; f3.set ( vf ); long fs3 = f3.getFileSize(); char vbuf[1000]; vbuf[0] = 0; if ( fs3 > 0 ) { f3.open(O_RDONLY); long rs = f3.read(vbuf,fs3,0); vbuf[fs3] = '\0'; if ( rs <= 0 ) continue; f3.close(); } // show it sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf); } sb.safePrintf("</table>\n"); sb.safePrintf("<br>\n"); // // now diff each parser output file for each url in urls.txt // // // loop over url buf first so we can print one table per url // char *next = NULL; // reset the url buf ptr m_urlPtr = m_urlBuf; // count em long count = 0; // ptrs to each url table long un = 0; long uptr [5000]; // offsets now, not char ptr since buf gets reallocd char udiff[5000]; long ulen [5000]; long uhits[5000]; // critical errors! validateOutput() choked! long uunchecked[5000]; // events/addresses found but were not validatd long umiss[5000]; long usort[5000]; long uevents[5000]; SafeBuf tmp; long niceness = MAX_NICENESS; // advance to next url for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) { // breathe QUICKPOLL(niceness); // we converted all non-url chars into \0's so skip those! 
for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ ); // breach check if ( m_urlPtr >= m_urlEnd ) break; // set this up next = m_urlPtr; // compute next url ptr for ( ; next < m_urlEnd && *next ; next++ ); // point to this url char *u = m_urlPtr; // get hash long long h = hash64 ( u , gbstrlen(u) ); // shortcut char *dir = g_hostdb.m_dir; // print into a secondary safe buf with a ptr to // it so we can sort that and transfer into the // primary safebuf later uptr[un] = tmp.length(); // assume no diff udiff[un] = 0; // print number tmp.safePrintf("%li) ",count++); // . link to our stored http server reply // . TODO: link it to our [cached] copy in the test coll!!! char local[1200]; sprintf(local,"/%s/doc.%llu.html",testDir,h); tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u); // link to live page tmp.safePrintf(" <a href=\"%s\">live</a> ",u); // link to page parser char ubuf[2000]; urlEncode(ubuf,2000,u,gbstrlen(u),true); tmp.safePrintf(" <a href=\"/master/parser?c=test&" "u=%s\">parser</a> ",ubuf); //tmp.safePrintf(" (%llu)",h); tmp.safePrintf("<br>\n"); //tmp.safePrintf("<br>\n"); tmp.safePrintf("<table border=1>\n"); tmp.safePrintf("<tr>" "<td><b><nobr>run id</nobr></b></td>" "<td><b><nobr>crit hits</nobr></b></td>" "<td><b><nobr>crit errors</nobr></b></td>" "<td><b><nobr># e</nobr></b></td>" "<td><b><nobr>unchecked</nobr></b></td>" "<td><b><nobr>diff chars</nobr></b></td>" "<td><b><nobr>diff file</nobr></b></td>" "<td><b><nobr>full output</nobr></b></td>" "</tr>\n"); //SafeBuf sd; // loop over all the runs now, starting with latest run first for ( long ri = m_runId ; ri >= start ; ri-- ) { QUICKPOLL(niceness); // the diff filename char pdiff[200]; sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir, testDir,h,ri); File f; f.set(pdiff); long fs = f.getFileSize(); if ( ! 
f.doesExist() && ri > 0 ) { // make the parse filename char pbuf1[200]; char pbuf2[200]; sprintf(pbuf1,"%s/%s/parse.%llu.%li.html", dir,testDir,h,ri); sprintf(pbuf2,"%s/%s/parse.%llu.%li.html", dir,testDir,h,ri-1); // sanity check //File tf; tf.set(pbuf1); //if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;} // tmp file name char tmp1[200]; char tmp2[200]; sprintf(tmp1,"%s/%s/t1.html",dir,testDir); sprintf(tmp2,"%s/%s/t2.html",dir,testDir); // filter first char cmd[600]; sprintf(cmd, "cat %s | " "grep -v \"<!--ignore-->\" " " > %s", pbuf1,tmp1); system(cmd); sprintf(cmd, "cat %s | " "grep -v \"<!--ignore-->\" " " > %s", pbuf2,tmp2); system(cmd); // make the system cmd to do the diff sprintf(cmd, "echo \"<pre>\" > %s ; " "diff -w --text %s %s " // ignore this table header row //" | grep -v \"R#4\"" " >> %s", pdiff, tmp1,tmp2,pdiff); log("test: system(\"%s\")",cmd); system(cmd); // try again f.set(pdiff); fs = f.getFileSize(); } QUICKPOLL(niceness); // this means 0 . it just has the <pre> tag in it! if ( fs < 0 || fs == 6 ) fs = 0; // . if no diff and NOT current run, do not print it // . print it if the run right before the current // now always too if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) continue; // relative filename char rel[200]; sprintf(rel,"/%s/parse.%llu.%li.html.diff", testDir,h,ri); char full[200]; sprintf(full,"/%s/parse.%llu.%li.html", testDir,h,ri); char validate[200]; sprintf(validate, "/%s/parse-shortdisplay.%llu.%li.html", testDir,h,ri); // use red font for current run that has a diff! char *t1 = ""; char *t2 = ""; if ( ri == m_runId && fs != 0 ) { t1 = "<font color=pink><b>"; t2 = "</b></font>"; // a diff udiff[un] = 1; } // . get critical errors // . i.e. XmlDoc::validateOutput() could not validate // a particular event or address that was in the // url's "validated.uh64.txt" file since the admin // clicked on the checkbox in the page parser output // . 
if we do not find such a tag in the parser output // any more then Spider.cpp creates this file! if ( ri == m_runId ) { char cfile[256]; sprintf(cfile,"%s/%s/critical.%llu.%li.txt", g_hostdb.m_dir,testDir,h,ri); SafeBuf ttt; ttt.fillFromFile(cfile); // first long is misses, then hits then events umiss[un] = 0; uhits[un] = 0; uevents[un] = 0; uunchecked[un] = 0; if ( ttt.length() >= 3 ) sscanf(ttt.getBufStart(), "%li %li %li %li", &umiss[un], &uhits[un], &uevents[un], &uunchecked[un]); usort[un] = umiss[un] + uunchecked[un]; //File cf; //cf.set(cfile); //if ( cf.doesExist()) ucrit[un] = 1; //else ucrit[un] = 0; } // more critical? if ( ri == m_runId && umiss[un] != 0 ) { t1 = "<font color=red><b>"; t2 = "</b></font>"; } // . these are good to have // . if you don't have 1+ critical hits then you // probably need to be validate by the qa guy char *uhb1 = ""; char *uhb2 = ""; if ( ri == m_runId && uhits[un] != 0 ) { uhb1 = "<font color=green><b>**"; uhb2 = "**</b></font>"; } QUICKPOLL(niceness); char *e1 = "<td>"; char *e2 = "</td>"; long ne = uevents[un]; if ( ne ) { e1="<td bgcolor=orange><b><font color=brown>"; e2="</font></b></td>"; } char *u1 = "<td>"; char *u2 = "</td>"; if ( uunchecked[un] ) { u1="<td bgcolor=purple><b><font color=white>"; u2="</font></b></td>"; } // print the row! 
tmp.safePrintf("<tr>" "<td>%s%li%s</td>" "<td>%s%li%s</td>" // critical hits "<td>%s%li%s</td>" // critical misses "%s%li%s" // # events "%s%li%s" // unchecked "<td>%s%li%s</td>" // filesize of diff // diff filename "<td><a href=\"%s\">%s%s%s</a></td>" // full parser output "<td>" "<a href=\"%s\">full</a> | " "<a href=\"%s\">validate</a> " "</td>" "</tr>\n", t1,ri,t2, uhb1,uhits[un],uhb2, t1,umiss[un],t2, e1,ne,e2, u1,uunchecked[un],u2, t1,fs,t2, rel,t1,rel,t2, full, validate); // only fill "sd" for the most recent guy if ( ri != m_runId ) continue; // now concatenate the parse-shortdisplay file // to this little table so qa admin can check/uncheck // validation checkboxes for addresses and events //sprintf(cfile, // "%s/test/parse-shortdisplay.%llu.%li.html", // g_hostdb.m_dir,h,ri); //sd.fillFromFile ( cfile ); } // end table tmp.safePrintf("</table>\n"); // . and a separate little section for the checkboxes // . should already be in tables, etc. // . each checkbox should provide its own uh64 when it // calls senddiv() when clicked now //tmp.cat ( sd ); tmp.safePrintf("<br>\n"); tmp.safePrintf("<br>\n"); // set this ulen[un] = tmp.length() - uptr[un] ; // sanity check if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; } // inc it un++; // increase the 5000!! 
if ( un >= 5000 ) { char *xx=NULL; *xx=0; } } char flag ; bubble: flag = 0; // sort the url tables for ( long i = 0 ; i < un - 1 ; i++ ) { QUICKPOLL(niceness); if ( usort[i] > usort[i+1] ) continue; if ( usort[i] == usort[i+1] ) if ( udiff[i] >= udiff[i+1] ) continue; // swap em long tp = uptr[i]; long td = udiff[i]; long um = umiss[i]; long us = usort[i]; long uh = uhits[i]; long tl = ulen [i]; uptr[i] = uptr[i+1]; umiss[i] = umiss[i+1]; usort[i] = usort[i+1]; uhits[i] = uhits[i+1]; udiff[i] = udiff[i+1]; ulen[i] = ulen[i+1]; uptr[i+1] = tp; umiss[i+1] = um; usort[i+1] = us; uhits[i+1] = uh; udiff[i+1] = td; ulen [i+1] = tl; flag = 1; } if ( flag ) goto bubble; // transfer into primary safe buf now for ( long i = 0 ; i < un ; i++ ) sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]); sb.safePrintf("</html>\n"); char dfile[200]; sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir); sb.dumpToFile ( dfile ); // free the buffer of urls reset(); // turn off spiders g_conf.m_spideringEnabled = 0; // all done return; }
// Builds a "Cookie: ..." request header from a Netscape-format cookie
// jar (one cookie per newline-terminated line, fields tab-separated:
// domain, flag, path, secure, expiration, then the cookie itself).
// Only cookies whose domain and path match "url" are included; matching
// cookies are joined with ';' and appended to "sb" as a full header
// line.  Always returns true (matching nothing just appends nothing).
//
// NOTE(review): the text copied per cookie is everything after the 5th
// tab up to end-of-line, verbatim — presumably the jar stores it as
// "name=value"; if name and value are tab-separated there, the tab
// would be copied into the header.  Confirm against the jar writer.
bool HttpMime::addCookieHeader(const char *cookieJar, const char *url, SafeBuf *sb) {
	Url tmpUrl;
	tmpUrl.set(url);

	SafeBuf tmpSb;

	size_t cookieJarLen = strlen(cookieJar);
	const char *lineStartPos = cookieJar;
	const char *lineEndPos = NULL;
	// walk the jar one '\n'-terminated line at a time
	while ((lineEndPos = (const char*)memchr(lineStartPos, '\n', cookieJarLen - (lineStartPos - cookieJar))) != NULL) {
		const char *currentPos = lineStartPos;
		const char *tabPos = NULL;
		unsigned fieldCount = 0;
		bool skipCookie = false;
		const char *domain = NULL;
		int32_t domainLen = 0;
		// parse the 5 leading metadata fields; a "break" inside the
		// switch only exits the switch, so even after skipCookie is
		// set the remaining fields are still scanned (harmless)
		while (fieldCount < 5 && (tabPos = (const char*)memchr(currentPos, '\t', lineEndPos - currentPos)) != NULL) {
			switch (fieldCount) {
				case 0:
					// domain field: remember for the flag check below
					domain = currentPos;
					domainLen = tabPos - currentPos;
					break;
				case 1:
					// flag field: "TRUE" means the cookie applies to
					// subdomains of "domain" as well
					// NOTE(review): memcmp reads 4 bytes even if this
					// field is shorter; it stays inside the jar buffer
					// (the '\n' was found past here) but can compare
					// into the next field — confirm fields are always
					// "TRUE"/"FALSE"
					if (memcmp(currentPos, "TRUE", 4) == 0) {
						// allow subdomain
						if (tmpUrl.getHostLen() >= domainLen) {
							if (!endsWith(tmpUrl.getHost(), tmpUrl.getHostLen(), domain, domainLen)) {
								// doesn't end with domain - ignore cookie
								skipCookie = true;
								break;
							}
						} else {
							// host shorter than domain - ignore cookie
							skipCookie = true;
							break;
						}
					} else {
						// only specific domain: exact, case-insensitive match
						if (tmpUrl.getHostLen() != domainLen || strncasecmp(domain, tmpUrl.getHost(), domainLen) != 0) {
							// non-matching domain - ignore cookie
							skipCookie = true;
							break;
						}
					}
					break;
				case 2: {
					// path field: cookie path must be a prefix of the
					// url path, and the boundary must fall on a '/'
					// (either side) unless the paths match exactly
					const char *path = currentPos;
					int32_t pathLen = tabPos - currentPos;
					if (strncasecmp(path, tmpUrl.getPath(), pathLen) == 0) {
						if (tmpUrl.getPathLen() != pathLen) {
							if (path[pathLen - 1] != '/' && tmpUrl.getPath()[tmpUrl.getPathLen() - 1] != '/') {
								// non-matching path - ignore cookie
								skipCookie = true;
								break;
							}
						}
					} else {
						// non-matching path - ignore cookie
						skipCookie = true;
						break;
					}
				} break;
				case 3:
					// secure flag - not checked here
					break;
				case 4:
					// expiration - not checked here
					break;
			}
			currentPos = tabPos + 1;
			++fieldCount;
		}
		if (!skipCookie) {
			// currentPos now points past the 5th tab: the cookie
			// payload itself, copied up to end-of-line
			tmpSb.safeMemcpy(currentPos, lineEndPos - currentPos);
			tmpSb.pushChar(';');
		}
		lineStartPos = lineEndPos + 1;
	}
	// we don't need to care about the last line (we always end on \n)

	// emit the header only if at least one cookie matched
	if (tmpSb.length() > 0) {
		sb->safeStrcpy("Cookie: ");
		sb->safeMemcpy(&tmpSb);
		sb->safeStrcpy("\r\n");
	}

	return true;
}