bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) { SafeBuf sb; sb.safePrintf ( "http://%s:%li%s" , iptoa(g_hostdb.m_myHost->m_ip) , (long)g_hostdb.m_myHost->m_port , path ); Url u; u.set ( sb.getBufStart() ); if ( ! g_httpServer.getDoc ( u.getUrl() , 0 , // ip 0 , // offset -1 , // size 0 , // ifmodsince NULL , callback , 60*1000, // timeout 0, // proxyip 0, // proxyport -1, // maxtextdoclen -1, // maxotherdoclen NULL ) ) // useragent return false; // error? log("qa: getUrl error: %s",mstrerror(g_errno)); return true; }
// for example, RENAME log000 to log000-bak20131104-181932 static bool renameCurrentLogFile ( ) { File f; char tmp[16]; sprintf(tmp,"log%03" PRId32,g_hostdb.m_hostId); f.set ( g_hostdb.m_dir , tmp ); // make new filename like log000-bak20131104-181932 time_t now = time(NULL); struct tm tm_buf; tm *tm1 = gmtime_r(&now,&tm_buf); char tmp2[64]; strftime(tmp2,64,"%Y%m%d-%H%M%S",tm1); SafeBuf newName; if ( ! newName.safePrintf ( "%slog%03" PRId32"-bak%s", g_hostdb.m_dir, g_hostdb.m_hostId, tmp2 ) ) { fprintf(stderr,"log rename failed\n"); return false; } // rename log000 to log000-2013_11_04-18:19:32 if ( f.doesExist() ) { //fprintf(stdout,"renaming file\n"); f.rename ( newName.getBufStart() ); } return true; }
// "xd" is the XmlDoc that just completed injecting void ImportState::saveFileBookMark ( ) { //Msg7 *msg7 ) { long long minOff = -1LL; long minFileId = -1; //long fileId = msg7->m_hackFileId; //long long fileOff = msg7->m_hackFileOff; // if there is one outstanding the preceeded us, we can't update // the bookmark just yet. for ( long i = 0 ; i < m_numPtrs ; i++ ) { Multicast *mcast = &m_ptrs[i]; if ( ! mcast->m_inUse ) continue; if ( minOff == -1 ) { minOff = mcast->m_hackFileOff; minFileId = mcast->m_hackFileId; continue; } if ( mcast->m_hackFileId > minFileId ) continue; if ( mcast->m_hackFileId == minFileId && mcast->m_hackFileOff > minOff ) continue; minOff = mcast->m_hackFileOff; minFileId = mcast->m_hackFileId; } char fname[256]; sprintf(fname,"%slasttitledbinjectinfo.dat",g_hostdb.m_dir); SafeBuf ff; ff.safePrintf("%llu,%lu",minOff,minFileId);//_fileOffset,m_bfFileId); ff.save ( fname ); }
// draw a HORIZONTAL line in html void Statsdb::drawLine3 ( SafeBuf &sb , long x1 , long x2 , long fy1 , long color , long width ) { // do not draw repeats in the case we have a ton of points to plot long key32 ; key32 = hash32h ( x1 , 0 ); key32 = hash32h ( x2 , key32); key32 = hash32h ( fy1 , key32); key32 = hash32h ( color , key32); key32 = hash32h ( width , key32); if ( m_dupTable.isInTable(&key32) ) return; m_dupTable.addKey(&key32); sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:%li;" "background-color:#%lx;" "z-index:-5;" "min-height:%lipx;" "min-width:%lipx;\"></div>\n" , x1 + m_bx , (fy1 - width/2) + m_by , color , width , x2 - x1 ); }
// draw a HORIZONTAL line in html void Statsdb::drawLine3 ( SafeBuf &sb , int32_t x1 , int32_t x2 , int32_t fy1 , int32_t color , int32_t width ) { // do not draw repeats in the case we have a ton of points to plot int32_t key32 ; key32 = hash32h ( x1 , 0 ); key32 = hash32h ( x2 , key32); key32 = hash32h ( fy1 , key32); key32 = hash32h ( color , key32); key32 = hash32h ( width , key32); if ( m_dupTable.isInTable(&key32) ) return; m_dupTable.addKey(&key32); sb.safePrintf("<div style=\"position:absolute;" "left:%" PRId32";" "bottom:%" PRId32";" "background-color:#%" PRIx32";" "z-index:-5;" "min-height:%" PRId32"px;" "min-width:%" PRId32"px;\"" " class=\"color-%" PRIx32"\"></div>\n" , x1 + m_bx , (fy1 - width/2) + m_by , color , width , x2 - x1 , color ); }
bool saveHashTable ( ) { if ( s_ht.m_numSlotsUsed <= 0 ) return true; SafeBuf fn; fn.safePrintf("%s/qa/",g_hostdb.m_dir); log("qa: saving crctable.dat"); s_ht.save ( fn.getBufStart() , "crctable.dat" ); return true; }
bool qascrape ( ) { // // delete the 'qatest123' collection // //static bool s_x1 = false; if ( ! s_flags[0] ) { s_flags[0] = true; if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) return false; } // // add the 'qatest123' collection // //static bool s_x2 = false; if ( ! s_flags[1] ) { s_flags[1] = true; if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , // checksum of reply expected 238170006 ) ) return false; } // scrape it if ( ! s_flags[3] ) { s_flags[3] = true; SafeBuf sb; sb.safePrintf( "/admin/inject?c=qatest123&" "format=xml&qts=test"); if ( ! getUrl ( sb.getBufStart() , 999 ) ) return false; } // verify no results for gbhopcount:2 query //static bool s_y4 = false; if ( ! s_flags[6] ) { s_flags[6] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&" "q=test", -1310551262 ) ) return false; } //static bool s_fee2 = false; if ( ! s_flags[13] ) { s_flags[13] = true; log("qa: SUCCESSFULLY COMPLETED " "QA SCRAPE TEST"); return true; } return true; }
// . the url being reuqested // . removes &code= facebook cruft bool HttpRequest::getCurrentUrl ( SafeBuf &cu ) { // makre sure we got enough room if ( ! cu.reserve ( m_hostLen + 64 + m_plen + 1 + 1 ) ) return false; // need a "Host: " char *host = m_host; if ( ! host ) host = APPSUBDOMAIN; cu.safePrintf("http"); if ( m_isSSL ) cu.pushChar('s'); cu.safePrintf("://%s",host); char *path = m_path; long plen = m_plen; if ( ! path ) { path = "/"; plen = 1; } // . scan path and change \0 back to = or & // . similar logic in HttpServer.cpp for logging! char *dst = cu.getBuf(); char *src = path; char *srcEnd = path + plen; char dd = '='; for ( ; src < srcEnd ; src++ , dst++ ) { *dst = *src; if ( *src ) continue; *dst = dd; if ( dd == '=' ) dd = '&'; else dd = '='; } *dst = '\0'; // cut it off at facebook's &code= char *buf = cu.getBufStart(); char *code = strstr( buf,"&code="); // fix for eventguru.com/blog.html?code= if ( ! code ) code = strstr(buf,"?code="); // hack that off if there if ( code ) { *code = '\0'; dst = code; } // update length cu.setLength( dst - cu.getBufStart() ); return true; }
// . run a series of tests to ensure that gb is functioning properly // . uses the ./qa subdirectory to hold archive pages, ips, spider dates to // ensure consistency between tests for exact replays bool qatest ( ) { if ( s_registered ) { g_loop.unregisterSleepCallback(NULL,qatestWrapper); s_registered = false; } if ( ! s_callback ) s_callback = qatest; if ( ! g_qaSock ) return true; // returns true when done, false when blocked //if ( ! qainject ( ) ) return false; // returns true when done, false when blocked //if ( ! qaspider ( ) ) return false; long n = sizeof(s_qatests)/sizeof(QATest); for ( long i = 0 ; i < n ; i++ ) { QATest *qt = &s_qatests[i]; if ( ! qt->m_doTest ) continue; // store that s_qt = qt; // point to flags s_flags = qt->m_flags; // call the qatest if ( ! qt->m_func() ) return false; } // save this saveHashTable(); // do not reset since we don't reload it above! //s_ht.reset(); //if ( g_numErrors ) // g_qaOutput.safePrintf("<input type=submit value=submit><br>"); g_qaOutput.safePrintf("<br>DONE RUNNING QA TESTS<br>"); // . print the output // . the result of each test is stored in the g_qaOutput safebuf g_httpServer.sendDynamicPage(g_qaSock, g_qaOutput.getBufStart(), g_qaOutput.length(), -1/*cachetime*/); g_qaOutput.purge(); g_qaSock = NULL; return true; }
bool deleteUrls ( ) { static long s_ii2 = 0; for ( ; s_ii2 < s_numUrls ; ) { // pre-inc it s_ii2++; // reject using html api SafeBuf sb; sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u="); sb.urlEncode ( s_urlPtrs[s_ii2] ); return getUrl ( sb.getBufStart() , qatestWrapper ); } return true; }
void doneInjectingLinksWrapper ( void *state ) { Msg7 *msg7 = (Msg7 *)state; SafeBuf *sb = &msg7->m_sb; // copy the serps into ou rbuf if ( ! g_errno ) { // print header if ( sb->length() == 0 ) { // print header of page sb->safePrintf("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); } // serp header if ( msg7->m_round == 1 ) sb->safePrintf("\t<googleResults>\n"); else sb->safePrintf("\t<bingResults>\n"); // print results sb->safeMemcpy(&msg7->m_xd.m_serpBuf); // end that if ( msg7->m_round == 1 ) sb->safePrintf("\t</googleResults>\n"); else sb->safePrintf("\t</bingResults>\n"); } // do bing now if ( msg7->m_round == 1 ) { // return if it blocks if ( ! msg7->scrapeQuery() ) return; } // otherwise, parse out the search results so steve can display them if ( g_errno ) sb->safePrintf("<error><![CDATA[%s]]></error>\n", mstrerror(g_errno)); // print header of page sb->safePrintf("</response>\n"); // page is not more than 32k //char buf[1024*32]; //char *p = buf; // return docid and hostid //p += sprintf ( p , "scraping status "); // print error msg out, too or "Success" //p += sprintf ( p , "%s", mstrerror(g_errno)); TcpSocket *sock = msg7->m_socket; g_httpServer.sendDynamicPage ( sock, sb->getBufStart(), sb->length(), -1/*cachetime*/); // hopefully sb buffer is copied becaues this will free it: mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); }
// ensure search results are consistent bool searchTest2 () { long nq = sizeof(s_queries)/sizeof(char *); for ( ; s_qi2 < nq ; ) { // pre-inc it s_qi2++; // inject using html api SafeBuf sb; // qa=1 tell gb to exclude "variable" or "random" things // from the serps so we can checksum it consistently sb.safePrintf ( "/search?c=qatest123&qa=1&q=" ); sb.urlEncode ( s_queries[s_qi2] ); return getUrl ( sb.getBufStart() , doneSearching2 ); } return true; }
// returns false if blocked, true otherwise, like on quick connect error bool getUrl( char *path , long checkCRC = 0 , char *post = NULL ) { SafeBuf sb; sb.safePrintf ( "http://%s:%li%s" , iptoa(g_hostdb.m_myHost->m_ip) , (long)g_hostdb.m_myHost->m_httpPort , path ); s_checkCRC = checkCRC; bool doPost = true; if ( strncmp ( path , "/search" , 7 ) == 0 ) doPost = false; //Url u; s_url.set ( sb.getBufStart() ); log("qa: getting %s",sb.getBufStart()); if ( ! g_httpServer.getDoc ( s_url.getUrl() , 0 , // ip 0 , // offset -1 , // size 0 , // ifmodsince NULL , gotReplyWrapper, 999999*1000, // timeout ms 0, // proxyip 0, // proxyport -1, // maxtextdoclen -1, // maxotherdoclen NULL , // useragent "HTTP/1.0" , // protocol doPost , // doPost NULL , // cookie NULL , // additionalHeader NULL , // fullRequest post ) ) return false; // error? processReply ( NULL , 0 ); //log("qa: getUrl error: %s",mstrerror(g_errno)); return true; }
// draw a HORIZONTAL line in html void drawLine2 ( SafeBuf &sb , long x1 , long x2 , long fy1 , long color , long width ) { sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "top:%li;" "background-color:#%06lx;" "z-index:-5;" "min-height:%lipx;" "min-width:%lipx;\"></div>\n" , x1 , (fy1 - width/2) - 20 //- 300 , color , width , x2 - x1 ); }
// . compute the docid range this host is responsible for, set the posdb
//   start/end key of every query term to that range and then ask Msg2 for
//   the termlists
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . when Msg2 blocks, m_blocked is set and controlLoopWrapper is the
//   callback that was handed to it
// . called either from 
//   1) doDocIdSplitLoop
//   2) or getDocIds2() if only 1 docidsplit
bool Msg39::getLists () {

	// only time this when debugging
	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid partitioning so we can send to all hosts
	//   in the network, not just one stripe
	int64_t docIdStart = 0;
	int64_t docIdEnd = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	int64_t dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute 
	//   search results in stages, so that we do not load massive
	//   termlists into memory and go OOM (out of memory)
	// . -1 means "not set" for both of these
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd = m_r->m_maxDocId+1;

	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	int64_t delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else                      docIdStart = docIdStart + delta2;
	//}

	// new striping logic: cut [docIdStart,docIdEnd) into numStripes
	// equal pieces and keep only the piece belonging to our stripe
	int32_t numStripes = g_hostdb.getNumStripes();
	int64_t delta2 = ( docIdEnd - docIdStart ) / numStripes;
	int32_t stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe; // is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	// NOTE(review): delta2 comes from an integer division, so the
	// remainder can exceed 1 when numStripes > 2 -- confirm the last
	// stripe really covers the tail of the range
	docIdEnd++;
	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
	// do not go over MAX_DOCID  because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
	// remember so Msg2.cpp can use them to restrict the termlists 
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd = docIdEnd;

	//
	// set startkey/endkey for each term/termlist
	//
	for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// shortcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		int64_t tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// debug
		if ( m_debug )
			log("query: setting sk/ek for docids %"INT64""
			    " to %"INT64" for termid=%"INT64""
			    , docIdStart
			    , docIdEnd
			    , tid
			    );
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey ( ek , tid , docIdEnd );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg: log one line describing each query term
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			// . temporarily NUL-terminate the term in place so
			//   it can be printed with %s below
			// . the saved byte is put back right after the
			//   safePrintf()
			char *tpc = qt->m_term + qt->m_termLen;
			char tmp = *tpc;
			*tpc = '\0';
			// a sign of 0 is shown as the character '0'
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			// phrases are logged with a wiki phrase id of 0
			int32_t wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			int32_t ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			gbmemcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			int32_t isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			// the arguments below are positional: keep them in
			// the same order as the conversions in the format
			sb.safePrintf(
			     "query: msg39: [%"PTRFMT"] "
			     "query term #%"INT32" \"%s\" "
			     "phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64" "
			     //"estimatedTermFreq=%"INT64" (+/- ~16000) "
			     "tfweight=%.02f "
			     "sign=%c "
			     "numPlusses=%hhu "
			     "required=%"INT32" "
			     "fielcode=%"INT32" "

			     "ebit=0x%0"XINT64" "
			     "impBits=0x%0"XINT64" "

			     "wikiphrid=%"INT32" "
			     "leftwikibigram=%"INT32" "
			     "rightwikibigram=%"INT32" "
			     //"range.startTermNum=%hhi range.endTermNum=%hhi "
			     //"minRecSizes=%"INT32" "
			     "readSizeInBytes=%"INT32" "
			     //"ebit=0x%"XINT64" "
			     //"impBits=0x%"XINT64" "
			     "hc=%"INT32" "
			     "component=%"INT32" "
			     "otermLen=%"INT32" "
			     "isSynonym=%"INT32" "
			     "querylangid=%"INT32" " ,
			     (PTRTYPE)this ,
			     i          ,
			     qt->m_term,//bb ,
			     (int32_t)m_tmpq.isPhrase (i) ,
			     m_tmpq.getTermId      (i) ,
			     m_tmpq.getRawTermId   (i) ,
			     ((float *)m_r->ptr_termFreqWeights)[i] ,
			     sign , //c ,
			     // numPlusses is always logged as 0
			     0 ,
			     (int32_t)qt->m_isRequired,
			     (int32_t)qt->m_fieldCode,

			     (int64_t)qt->m_explicitBit  ,
			     (int64_t)qt->m_implicitBits ,

			     wikiPhrId,
			     (int32_t)leftwikibigram,
			     (int32_t)rightwikibigram,
			     ((int32_t *)m_r->ptr_readSizes)[i]         ,
			     //(int64_t)m_tmpq.m_qterms[i].m_explicitBit  ,
			     //(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
			     (int32_t)m_tmpq.m_qterms[i].m_hardCount ,
			     (int32_t)m_tmpq.m_componentCodes[i],
			     (int32_t)m_tmpq.getTermLen(i) ,
			     isSynonym,
			     (int32_t)m_tmpq.m_langId ); // ,tt
			// put it back
			*tpc = tmp;
			// extra synonym details: the index of the term this
			// one is a synonym of, plus the synonym word ids and
			// alnum word counts
			if ( st ) {
				int32_t stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%"INT32"",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%"INT64" ",qt->m_synWids0);
				sb.safePrintf("synwid1=%"INT64" ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());
		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug )
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Getting %"INT32" index lists ",
		     (PTRTYPE)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is unnecessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com 
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;
	// split is us????
	//int32_t split = g_hostdb.m_myHost->m_group;
	int32_t split = g_hostdb.m_myHost->m_shardNum;
	// call msg2
	if ( ! m_msg2.getLists ( rdbId                      ,
				 m_r->m_collnum,//m_r->ptr_coll              ,
				 m_r->m_maxAge              ,
				 m_r->m_addToCache          ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (int32_t *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms()       , // numLists
				 // 1-1 with query terms
				 m_lists                    ,
				 this                       ,
				 controlLoopWrapper,//gotListsWrapper      ,
				 m_r                        ,
				 m_r->m_niceness            ,
				 true                       , // do merge?
				 m_debug                  ,
				 NULL                       ,  // best hostids
				 m_r->m_restrictPosdbForQuery  ,
				 split                      ,
				 checkCache                 )) {
		// Msg2 will come back through controlLoopWrapper
		m_blocked = true;
		return false;
	}

	// error?
	//if ( g_errno ) { 
	//	log("msg39: Had error getting termlists2: %s.",
	//	    mstrerror(g_errno));
	//	// don't bail out here because we are in docIdSplitLoop()
	//	//sendReply (m_slot,this,NULL,0,0,true);
	//	return true; 
	//}
	
	//return gotLists ( true );
	// did not block. the caller is left to look at g_errno.
	return true;
}
void Statsdb::drawHR ( float z , float ymin , float ymax , //GIFPlotter *plotter , SafeBuf &gw, Label *label , float zoff , long color ) { // convert into yspace float z2 = ((float)DY * (float)(z - ymin)) /(float)(ymax-ymin); // avoid collisions with other graphs z2 += zoff; // border //z2 += m_by; // round off error z2 += 0.5; // for adjusatmnet float ptsPerPixel = (ymax-ymin)/ (float)DY; // make an adjustment to the label then! -- Commented out because it's currently not used. float zadj = zoff * ptsPerPixel; //#ifdef _USEPLOTTER_ // use the color specified from addStat_r() for this line/pt //plotter->pencolor ( ((color >> 16) & 0xff) << 8 , // ((color >> 8) & 0xff) << 8 , // ((color >> 0) & 0xff) << 8 ); // horizontal line //plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 ); long width = 1; drawLine3 ( m_gw, 0, DX , (long)z2,color, width); // make label char tmp[128]; // . use "graphHash" to map to unit display // . this is a disk read volume sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar); /* // a white shadow plotter->pencolor ( 0xffff,0xffff,0xffff ); plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 ); plotter->alabel ( 'c' , 'c' , tmp ); // a black shadow plotter->pencolor ( 0 , 0 , 0 ); plotter->move ( m_bx + 80 + 1 , z2 + 10 - 1 ); plotter->alabel ( 'c' , 'c' , tmp ); //long color = label->m_color; // use the color specified from addStat_r() for this line/pt plotter->pencolor ( ((color >> 16) & 0xff) << 8 , ((color >> 8) & 0xff) << 8 , ((color >> 0) & 0xff) << 8 ); // move cursor plotter->move ( m_bx + 80 , z2 + 10 ); // plot label plotter->alabel ( 'c' , 'c' , tmp ); */ // LABEL gw.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:%li;" "color:#%lx;" "z-index:110;" "font-size:14px;" "min-height:20px;" "min-width:3px;\">%s</div>\n" , (long)(m_bx) , (long)z2 +m_by , color // the label: , tmp ); }
// . build the turk (event editor) view of the document in st->m_xd
// . marks "control" sections with SEC_CONTROL and wraps each of them in a
//   clickable <div> while copying the page's words into a SafeBuf
// . returns false if getSections() blocked; xdcallback is registered so
//   we get called back in that case
// . returns the result of sendErrorReply() on error
// . NOTE(review): this routine looks unfinished. the generated SafeBuf is
//   never sent, "endCounts" is never freed and the final return is always
//   false. confirm before relying on it.
bool sendTurkPageReply ( State60 *st ) {

	XmlDoc *xd = &st->m_xd;

	// in case getSections() block come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	Sections *ss = xd->getSections();
	// . the pointer used to be dereferenced unconditionally
	// . handle "error" and "blocked" the way the other XmlDoc getters
	//   in this code base are handled: NULL is an error with g_errno
	//   set and -1 means it blocked and our callback will be called
	if ( ! ss ) return sendErrorReply ( st , g_errno );
	if ( ss == (void *)-1 ) return false;

	// now for each section with alnum text, telescope up as far as 
	// possible without containing anymore alnum text than what it 
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the turkers will only turn off sections that should
	// not have a title/desc.
	// NOTE(review): "si = last" moves the cursor to an ancestor; whether
	// last->m_next can lead back to the section just handled depends on
	// how m_next is ordered -- confirm this loop always terminates
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker
	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".
	// * hold shift down to designate as title section when clicking it
	// * show the raw text of each event changing as you fence
	//   sections in or out. show in a right frame.
	// * show list of events on page in the top frame. can toggle them
	//   all individually.
	// * and remove no-display from all tags so we can see everything.
	// * highlight addresses, not just dates.
	// * each section hash has its own unique bg color when activated
	// * with a single click, completely reject an event because:
	//   contains bad time, address, title or desc. specify which so
	//   we can improve our algo.
	// * when selecting an individual event, scroll to its tod...
	// * remove all color from webpage that we can so our colors show up
	// * remove all imgs. just src them to dev null.
	// * allow for entering a custom title for an event or all events
	//   that are or will ever appear on the page. 
	// * when displaying the text of the events, use hyphens to
	//   delineate the section topology. strike out text as a section
	//   fence is activated.
	// * when a section is activated is it easier to just redownload
	//   the whole text of the page? maybe just the text frame?
	// * clicking on an individual sentence section should just remove
	//   that sentence. that is kinda a special content hash removal
	//   tag. like "Click here for video."
	// * when an event id is selected i guess activate its bgcolor to
	//   be light blue for all sentences currently in the event that
	//   are not in activated sections. (make exception for designated 
	//   title sections). so we need multiple tags for each events
	//   sentence div section. if sentence is split use multiple div tags
	//   then to keep the order. so each event sentence would have 
	//   <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
	//   10. that way we can activate it when one of those event ids is
	//   activated.

	SafeBuf sb;

	// shortcuts. the words must already have been set.
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t    nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t   *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags: endCounts[i] is how
	// many control sections end right after word #i
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );

	// 
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turker activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkers have made all the edits
				// they need to submit the changes they made.
				// how can we get that data sent back to the
				// back end? we need to send back the colors
				// of the sections that have been activated
				// i guess. just do a loop over them.
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b < 0 ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		if ( tids[i] ) {
			// TODO: not implemented yet
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}

	return false;
}
// . called with the datedb list in st->m_list
// . picks the first positive record whose docid is not currently locked
//   by a turker (locks older than an hour are dropped), locks it for
//   st->m_user and then starts loading that document through processLoop()
// . if no docid is available an "nothing to edit" page is sent instead
// . must only be run on host #0 since we need just one lock table
// . fixes over the previous revision, which could not have compiled:
//   - "docId" was used where the local variable is "docid"
//   - "s_init" was not static so the lock table was re-set and re-loaded
//     on every call
//   - the lock table was referred to as both g_turkLock and g_turkLocks
//   - a void function tried to "return processLoop(st)"
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to, but only the first time
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// shortcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative (low bit of the first key byte clear)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// if there check time. locks expire after an hour.
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}

	SafeBuf sb;

	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	// NOTE(review): the copy of the user name is unbounded, and the
	// lock is added through g_lockTable without the docid while the
	// lookups above go through g_turkLocks keyed by docid -- confirm
	// which table and key are intended
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	processLoop ( st );
}
void doneReindexing ( void *state ) { // cast it State13 *st = (State13 *)state; GigablastRequest *gr = &st->m_gr; // note it if ( gr->m_query && gr->m_query[0] ) log(LOG_INFO,"admin: Done with query reindex. %s", mstrerror(g_errno)); //// // // print the html page // ///// HttpRequest *hr = &gr->m_hr; char format = hr->getReplyFormat(); SafeBuf sb; const char *ct = "text/html"; if ( format == FORMAT_JSON ) ct = "application/json"; if ( format == FORMAT_XML ) ct = "text/xml"; if ( format == FORMAT_XML ) { sb.safePrintf("<response>\n" "\t<statusCode>0</statusCode>\n" "\t<statusMsg>Success</statusMsg>\n" "\t<matchingResults>%" PRId32"</matchingResults>\n" "</response>" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } if ( format == FORMAT_JSON ) { sb.safePrintf("{\"response\":{\n" "\t\"statusCode\":0,\n" "\t\"statusMsg\":\"Success\",\n" "\t\"matchingResults\":%" PRId32"\n" "}\n" "}\n" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr ); sb.safePrintf("<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); // // print error msg if any // if ( gr->m_query && gr->m_query[0] && ! g_errno ) sb.safePrintf ( "<center><font color=red><b>Success. " "Added %" PRId32" docid(s) to " "spider queue.</b></font></center><br>" , st->m_msg1c.m_numDocIdsAdded ); if ( gr->m_query && gr->m_query[0] && g_errno ) sb.safePrintf ( "<center><font color=red><b>Error. 
" "%s</b></font></center><br>" , mstrerror(g_errno)); // print the reindex interface g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); }
bool sendReply ( void *state ) { GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) ); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } int32_t ulen = 0; const char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // if there was an error let them know SafeBuf mbuf; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>"); mbuf.safePrintf("</font></center>"); } if ( mbuf.length() ) { sb.safeStrcpy( mbuf.getBufStart() ); } g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? 
// clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
// . builds and sends the "cached page" reply for a /get request
// . "state" is the State2 of the request. it is deleted in here once the
//   reply (or an error reply) has been handed to the http server
// . re-entrant: it registers itself as the XmlDoc callback, so whenever
//   an XmlDoc call blocks (signalled by a (void *)-1 return) we return
//   false and get called again when that call completes
// . returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);

	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// error? check this before looking at the size so the real
	// g_errno is reported instead of EBADENGINEER
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}

	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	long startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");

	// base href
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 )
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );

	// default colors in case css files missing
	if ( strip != 2 )
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");

	// xml and json replies get none of the html header above
	if ( format == FORMAT_XML  ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// the spider date in GMT as text. stays empty if not needed or
	// if the time could not be converted
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		// reentrant version, and do not hand NULL to strftime
		struct tm tmBuf;
		struct tm *timeStruct = gmtime_r ( &lastSpiderDate, &tmBuf );
		if ( timeStruct )
			strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// NOTE(review): "q" and the url are printed into html attributes
	// without encoding -- looks like they should be escaped, confirm
	if ( printDisclaimer ) {
		sb->safePrintf(
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  "><td>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		// then the rest
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll ,
				(long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to the web archive
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted:  ",
				       styleText );
		}
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	// . we no longer put the port in here if it is the default
	// . but still need http:// since we use <base href=>
	if (port == 80)
		snprintf(x,thisUrlEnd - x,"http://%s/get?q=",iptoa(ip));
	else
		snprintf(x,thisUrlEnd - x,"http://%s:%hu/get?q=",
			 iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long room = thisUrlEnd - x;
	long elen = urlEncode ( x , room , q , qlen );
	// stay inside the buffer no matter what came back
	if ( elen < 0    ) elen = 0;
	if ( elen > room ) elen = room;
	x += elen;
	// separate cgi vars with a &. bounded, the encoded query may have
	// used up all the space
	snprintf ( x , thisUrlEnd - x , "&d=%lli" , st->m_docId );
	thisUrl[MAX_URL_LEN-1] = '\0';

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q                        ,  // content being highlighted, utf8
		 qlen                     ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION ,
		 true                     ,  // computeIds
		 false                    ); // hasHtmlEntities?

	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	Matches m;
	m.setQuery ( &qq );
	m.addMatches ( &qw );

	// and highlight the matches
	if ( printDisclaimer ) {
		hi.set ( sb ,
			 &qw   , // words to highlight
			 &m    , // matches relative to qw
			 false , // doStemming
			 false , // click and scroll
			 (char *)thisUrl ); // base url for ClcknScrll
		// now close the disclaimer table
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// . identify the first <title> tag in what we wrote out so far
	// . only look at bytes we actually wrote, and copy the title text
	//   out right away: the prints below may reallocate sb's buffer
	//   and would leave pointers into it dangling
	SafeBuf titleBuf;
	bool haveTitle = false;
	char *sbstart = sb->getBufStart();
	long  sblen   = sb->length();
	for ( long i = 0 ; sbstart && i + 5 < sblen ; i++ ) {
		char *t = sbstart + i;
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// max - to keep things fast
		long cap = i + 5 + 500;
		// never read beyond the written data
		long lim = cap;
		if ( lim > sblen ) lim = sblen;
		// skip to just past the end of the opening tag
		long xi = i + 5;
		while ( xi < lim && sbstart[xi] && sbstart[xi] != '>' ) xi++;
		xi++;
		if ( xi > lim ) xi = lim;
		// find end
		long ei = xi;
		for ( ; ei < lim && sbstart[ei] ; ei++ ) {
			if ( ei + 6 >= sblen ) continue;
			char *e = sbstart + ei;
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		// only accept it if we stopped before the scan limit
		if ( ei < cap ) {
			haveTitle = true;
			if ( ei > xi )
				titleBuf.safeMemcpy ( sbstart + xi , ei - xi );
		}
		// only the first title tag counts
		break;
	}

	// . print title at top!
	// . consider moving
	if ( haveTitle ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl
				       );
		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			if ( titleBuf.length() > 0 )
				sb->safeMemcpy ( titleBuf.getBufStart() ,
						 titleBuf.length() );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) sb->safePrintf("<pre>");

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content,
					contentLen,
					(long)xd->m_version,
					st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 )
		return sendErrorReply ( st , g_errno );

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON ) queryHighlighting = false;

	// xml and json replies collect the page content in "tmp" first so
	// it can be encoded into the reply below
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		// matches of the query against the page content
		Matches cm;
		cm.setQuery ( &qq );
		cm.addMatches ( &ww );
		hi.set ( xb ,
			 &ww ,
			 &cm ,
			 false /*doStemming?*/ ,
			 st->m_clickAndScroll ,
			 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		// nothing to encode if the content came out empty
		if ( xb->length() > 0 )
			sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// no raw newline after the opening quote: control
		// characters may not appear unescaped in a json string
		sb->safePrintf("\t\"content\":\"");
		if ( xb->length() > 0 )
			sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) sb->safeMemcpy ( "</pre>" , 6 );

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			return sendErrorReply ( st , g_errno );
		}
		// reassign
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);

	// and convey the status
	return status;
}
// // new code for drawing graph in html with absolute divs instead // of using GIF plotter library which had issues // void Stats::printGraphInHtml ( SafeBuf &sb ) { // gif size char tmp[64]; sprintf ( tmp , "%lix%li", (long)DX+40 , (long)DY+40 ); // "1040x440" // 20 pixel borders //int bx = 10; //int by = 30; // define the space with boundaries 100 unit wide boundaries //plotter.space ( -bx , -by , DX + bx , DY + by ); // draw the x-axis //plotter.line ( 0 , 0 , DX , 0 ); // draw the y-axis //plotter.line ( 0 , 0 , 0 , DY ); // find time ranges long long t2 = 0; for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // set min/max if ( m_pts[i].m_endTime > t2 ) t2 = m_pts[i].m_endTime; } // now compute the start time for the graph long long t1 = 0x7fffffffffffffffLL; // now recompute t1 for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // can't be behind more than 1 second if ( m_pts[i].m_startTime < t2 - DT ) continue; // otherwise, it's a candidate for the first time if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime; } // // main graphing window // sb.safePrintf("<div style=\"position:relative;" "background-color:#c0c0c0;" // match style of tables "border-radius:10px;" "border:#6060f0 2px solid;" //"overflow-y:hidden;" "overflow-x:hidden;" "z-index:-10;" // the tick marks we print below are based on it // being a window of the last 20 seconds... and using // DX pixels "min-width:%lipx;" "min-height:%lipx;" //"width:100%%;" //"min-height:600px;" //"margin-top:10px;" "margin-bottom:10px;" //"margin-right:10px;" //"margin-left:10px;" "\">" ,(long)DX ,(long)DY +20); // add 10 more for "2s" labels etc. 
// 10 x-axis tick marks for ( int x = DX/20 ; x <= DX ; x += DX/20 ) { // tick mark //plotter.line ( x , -20 , x , 20 ); sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:0;" "background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\"></div>\n" , (long)x-1 ); // generate label //char buf [ 32 ]; //sprintf ( buf , "%li" , // (long)(DT * (long long)x / (long long)DX) ); // LABEL sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:20;" //"background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\">%lis</div>\n" , (long)x-10 // the label: ,(long)(DT * (long long)x / (long long)DX)/1000 ); // move cursor //plotter.move ( x , -by / 2 - 9 ); // plot label //plotter.alabel ( 'c' , 'c' , buf ); } // . each line consists of several points // . we need to know each point for adding otherlines // . is about [400/6][1024] = 70k // . each line can contain multiple data points // . each data point is expressed as a horizontal line segment void *lrgBuf; long lrgSize = 0; lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); lrgSize += MAX_LINES * sizeof(long); lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp"); if (! lrgBuf) { log("could not allocate memory for local buffer in Stats.cpp" "%li bytes needed", lrgSize); return; } char *lrgPtr = (char *)lrgBuf; StatPoint **points = (StatPoint **)lrgPtr; lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); long *numPoints = (long *)lrgPtr; lrgPtr += MAX_LINES * sizeof(long); memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) ); // store the data points into "lines" long count = MAX_POINTS; for ( long i = m_next ; count >= 0 ; i++ , count-- ) { // wrap around the array if ( i >= MAX_POINTS ) i = 0; // skip point if empty if ( m_pts[i].m_startTime == 0 ) continue; // skip if too early if ( m_pts[i].m_endTime < t1 ) continue; // . find the lowest line the will hold us // . 
this adds point to points[x][n] where x is determined addPoint ( points , numPoints , &m_pts[i] ); } int y1 = 21; // plot the points (lines) in each line for ( long i = 0 ; i < MAX_LINES ; i++ ) { // increase vert y1 += MAX_WIDTH + 1; // wrap back down if necessary if ( y1 >= DY ) y1 = 21; // plt all points in this row for ( long j = 0 ; j < numPoints[i] ; j++ ) { // get the point StatPoint *p = points[MAX_POINTS * i + j]; // transform time to x coordinates int x1 = (p->m_startTime - t1) * (long long)DX / DT; int x2 = (p->m_endTime - t1) * (long long)DX / DT; // if x2 is negative, skip it if ( x2 < 0 ) continue; // if x1 is negative, boost it to -2 if ( x1 < 0 ) x1 = -2; // . line thickness is function of read/write size // . take logs int w = (int)log(((double)p->m_numBytes)/8192.0) + 3; //log("log of %li is %i",m_pts[i].m_numBytes,w); if ( w < 3 ) w = 3; if ( w > MAX_WIDTH ) w = MAX_WIDTH; //plotter.linewidth ( w ); // use the color specified from addStat_r() for this line/pt //plotter.pencolor ( ((p->m_color >> 16) & 0xff) << 8 , // ((p->m_color >> 8) & 0xff) << 8 , // ((p->m_color >> 0) & 0xff) << 8 ); // ensure at least 3 units wide for visibility if ( x2 < x1 + 3 ) x2 = x1 + 3; // . flip the y so we don't have to scroll the browser down // . DY does not include the axis and tick marks long fy1 = DY - y1 + 20 ; // plot it //plotter.line ( x1 , fy1 , x2 , fy1 ); drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w ); // debug msg //log("line (%i,%i, %i,%i) ", x1 , vert , x2 , vert ); //log("bytes = %li width = %li ", m_pts[i].m_numBytes,w); //log("st=%i, end=%i color=%lx " , // (int)m_pts[i].m_startTime , // (int)m_pts[i].m_endTime , // m_pts[i].m_color ); } } sb.safePrintf("</div>\n"); mfree(lrgBuf, lrgSize, "Stats.cpp"); }
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = 
getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
// . now come here when we got the necessary index lists // . returns false if blocked, true otherwise // . sets g_errno on error bool Msg39::intersectLists ( ) { // bool updateReadInfo ) { // bail on error if ( g_errno ) { hadError: log("msg39: Had error getting termlists: %s.", mstrerror(g_errno)); if ( ! g_errno ) { char *xx=NULL;*xx=0; } //sendReply (m_slot,this,NULL,0,0,true); return true; } // timestamp log if ( m_debug ) { log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] " "Got %"INT32" lists in %"INT64" ms" , (PTRTYPE)this,m_tmpq.getNumTerms(), gettimeofdayInMilliseconds() - m_startTime); m_startTime = gettimeofdayInMilliseconds(); } // breathe QUICKPOLL ( m_r->m_niceness ); // ensure collection not deleted from under us CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum ); if ( ! cr ) { g_errno = ENOCOLLREC; goto hadError; } // . set the IndexTable so it can set it's score weights from the // termFreqs of each termId in the query // . this now takes into account the special termIds used for sorting // by date (0xdadadada and 0xdadadad2 & TERMID_MASK) // . it should weight them so much so that the summation of scores // from other query terms cannot make up for a lower date score // . this will actually calculate the top // . this might also change m_tmpq.m_termSigns // . this won't do anything if it was already called m_posdbTable.init ( &m_tmpq , m_debug , this , &m_tt , m_r->m_collnum,//ptr_coll , &m_msg2 , // m_lists , //m_tmpq.m_numTerms , // m_numLists m_r ); // breathe QUICKPOLL ( m_r->m_niceness ); // . we have to do this here now too // . but if we are getting weights, we don't need m_tt! // . actually we were using it before for rat=0/bool queries but // i got rid of NO_RAT_SLOTS if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) { if ( ! 
g_errno ) { char *xx=NULL;*xx=0; } //sendReply ( m_slot , this , NULL , 0 , 0 , true); return true; } // if msg2 had ALL empty lists we can cut it int16_t if ( m_posdbTable.m_topTree->m_numNodes == 0 ) { //estimateHitsAndSendReply ( ); return true; } // we have to allocate this with each call because each call can // be a different docid range from doDocIdSplitLoop. if ( ! m_posdbTable.allocWhiteListTable() ) { log("msg39: Had error allocating white list table: %s.", mstrerror(g_errno)); if ( ! g_errno ) { char *xx=NULL;*xx=0; } //sendReply (m_slot,this,NULL,0,0,true); return true; } // do not re do it if doing docid range splitting m_allocedTree = true; // . now we must call this separately here, not in allocTopTree() // . we have to re-set the QueryTermInfos with each docid range split // since it will set the list ptrs from the msg2 lists if ( ! m_posdbTable.setQueryTermInfo () ) return true; // print query term bit numbers here for ( int32_t i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) { QueryTerm *qt = &m_tmpq.m_qterms[i]; //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen); char *tpc = qt->m_term + qt->m_termLen; char tmp = *tpc; *tpc = '\0'; SafeBuf sb; sb.safePrintf("query: msg39: BITNUM query term #%"INT32" \"%s\" " "bitnum=%"INT32" ", i , qt->m_term, qt->m_bitNum ); // put it back *tpc = tmp; logf(LOG_DEBUG,"%s",sb.getBufStart()); } // timestamp log if ( m_debug ) { log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] " "Preparing to intersect " "took %"INT64" ms", (PTRTYPE)this, gettimeofdayInMilliseconds() - m_startTime ); m_startTime = gettimeofdayInMilliseconds(); } // time it int64_t start = gettimeofdayInMilliseconds(); int64_t diff; // . don't bother making a thread if lists are small // . look at STAGE? in IndexReadInfo.cpp to see how we read in stages // . it's always saying msg39 handler is hogging cpu...could this be it //if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread; // debug //goto skipThread; // . NOW! 
let's do this in a thread so we can continue to service // incoming requests // . don't launch more than 1 thread at a time for this // . set callback when thread done // breathe QUICKPOLL ( m_r->m_niceness ); // . create the thread // . only one of these type of threads should be launched at a time if ( ! m_debug && g_threads.call ( INTERSECT_THREAD , // threadType m_r->m_niceness , this , // top 4 bytes must be cback controlLoopWrapper2,//threadDoneWrapper , addListsWrapper ) ) { m_blocked = true; return false; } // if it failed //log(LOG_INFO,"query: Intersect thread creation failed. Doing " // "blocking. Hurts performance."); // check tree if ( m_tt.m_nodes == NULL ) { log(LOG_LOGIC,"query: msg39: Badness."); char *xx = NULL; *xx = 0; } // sometimes we skip the thread //skipThread: // . addLists() should never have a problem // . g_errno should be set by prepareToAddLists() above if there is // going to be a problem //if ( m_r->m_useNewAlgo ) m_posdbTable.intersectLists10_r ( ); //else // m_posdbTable.intersectLists9_r ( ); // time it diff = gettimeofdayInMilliseconds() - start; if ( diff > 10 ) log("query: Took %"INT64" ms for intersection",diff); // returns false if blocked, true otherwise //return addedLists (); return true; }
bool Log::init ( char *filename ) { // set the main process id //s_pid = getpidtid(); setPid(); // init these m_numErrors = 0; m_bufPtr = 0; m_fd = -1; m_disabled = false; #ifdef DEBUG g_dbufSize = 4096; g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer"); if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer"); #endif // m_hostname = g_conf.m_hostname; // m_port = port; // is there a filename to log our errors to? m_filename = filename; if ( ! m_filename ) return true; // skip this for now //return true; // // RENAME log000 to log000-2013_11_04-18:19:32 // if ( g_conf.m_runAsDaemon ) { File f; char tmp[16]; sprintf(tmp,"log%03li",g_hostdb.m_hostId); f.set ( g_hostdb.m_dir , tmp ); // make new filename like log000-2013_11_04-18:19:32 time_t now = getTimeLocal(); tm *tm1 = gmtime((const time_t *)&now); char tmp2[64]; strftime(tmp2,64,"%Y_%m_%d-%T",tm1); SafeBuf newName; if ( ! newName.safePrintf ( "%slog%03li-%s", g_hostdb.m_dir, g_hostdb.m_hostId, tmp2 ) ) { fprintf(stderr,"log rename failed\n"); return false; } // rename log000 to log000-2013_11_04-18:19:32 if ( f.doesExist() ) { //fprintf(stdout,"renaming file\n"); f.rename ( newName.getBufStart() ); } } // open it for appending. // create with -rw-rw-r-- permissions if it's not there. m_fd = open ( m_filename , O_APPEND | O_CREAT | O_RDWR , S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ); if ( m_fd >= 0 ) return true; // bitch to stderr and return false on error fprintf(stderr,"could not open log file %s for appending\n", m_filename); return false; }
bool sendReply ( void *state , bool addUrlEnabled ) { // allow others to add now //s_inprogress = false; // get the state properly //gr *st1 = (gr *) state; GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log(LOG_INFO,"http: add url %s (%s)", xb.getBufStart(),mstrerror(g_errno)); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } long ulen = 0; char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); //char rawbuf[1024*8]; //SafeBuf rb(rawbuf, 1024*8); //rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"); //rb.safePrintf("<status>\n"); //CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll ); // collection name char tt [ 128 ]; tt[0] = '\0'; g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // display url //char *url = gr->m_urlsBuf; //if ( url && ! url[0] ) url = NULL; // watch out for NULLs if ( ! 
url ) url = "http://"; // if there was an error let them know //char msg[MAX_URL_LEN + 1024]; SafeBuf mbuf; //char *pm = ""; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); //pm = msg; //rb.safePrintf("Error adding url(s): %s[%i]", // mstrerror(g_errno) , g_errno); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b> added to spider " "queue " "successfully<br><br>"); mbuf.safePrintf("</font></center>"); //rb.safePrintf("%s added to spider " // "queue successfully", url ); //pm = msg; //url = "http://"; //else // pm = "Don't forget to <a href=/gigaboost.html>" // "Gigaboost</a> your URL."; } if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() ); g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) { // get collection name //int32_t nclen; //char *nc = r->getString ( "nc" , &nclen ); //int32_t cpclen; //char *cpc = r->getString ( "cpc" , &cpclen ); g_errno = 0; //bool cast = r->getLong("cast",0); const char *msg = NULL; // if any host in network is dead, do not do this //if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead."; char format = r->getReplyFormat(); if ( format == FORMAT_XML || format == FORMAT_JSON ) { // no addcoll given? int32_t page = g_pages.getDynamicPageNumber ( r ); const char *addcoll = r->getString("addcoll",NULL); const char *delcoll = r->getString("delcoll",NULL); if ( ! addcoll ) addcoll = r->getString("addColl",NULL); if ( ! delcoll ) delcoll = r->getString("delColl",NULL); if ( page == PAGE_ADDCOLL && ! addcoll ) { g_errno = EBADENGINEER; const char *msg = "no addcoll parm provided"; return g_httpServer.sendErrorReply(s,g_errno,msg,NULL); } if ( page == PAGE_DELCOLL && ! delcoll ) { g_errno = EBADENGINEER; const char *msg = "no delcoll parm provided"; return g_httpServer.sendErrorReply(s,g_errno,msg,NULL); } return g_httpServer.sendSuccessReply(s,format); } // error? const char *action = r->getString("action",NULL); const char *addColl = r->getString("addcoll",NULL); char buf [ 64*1024 ]; SafeBuf p(buf, 64*1024); // // CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS // SafeBuf gtmp; char *gmsg = NULL; // is it too big? if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) { gtmp.safePrintf("search engine name is too long"); gmsg = gtmp.getBufStart(); } // from Collectiondb.cpp::addNewColl() ensure coll name is legit const char *x = addColl; for ( ; x && *x ; x++ ) { if ( is_alnum_a(*x) ) continue; if ( *x == '-' ) continue; if ( *x == '_' ) continue; // underscore now allowed break; } if ( x && *x ) { g_errno = EBADENGINEER; gtmp.safePrintf("<font color=red>Error. 
\"%s\" is a " "malformed name because it " "contains the '%c' character.</font><br><br>", addColl,*x); gmsg = gtmp.getBufStart(); } // // END GIGABOT ERRORS // // // CLOUD SEARCH ENGINE SUPPORT // // if added the coll successfully, do not print same page, jump to // printing the basic settings page so they can add sites to it. // crap, this GET request, "r", is missing the "c" parm sometimes. // we need to use the "addcoll" parm anyway. maybe print a meta // redirect then? char guide = r->getLong("guide",0); // do not redirect if gmsg is set, there was a problem with the name if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) { //return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS ); // just redirect to it if ( addColl ) p.safePrintf("<meta http-equiv=Refresh " "content=\"0; URL=/admin/settings" "?guide=1&c=%s\">", addColl); return g_httpServer.sendDynamicPage (s, p.getBufStart(), p.length()); } // print standard header g_pages.printAdminTop ( &p , s , r , NULL, "onload=document." "getElementById('acbox').focus();"); if ( g_errno ) { msg = mstrerror( g_errno ); } if ( msg && ! guide ) { const char *cc = "deleting"; if ( add ) cc = "adding"; p.safePrintf ( "<center>\n" "<font color=red>" "<b>Error %s collection: %s. " "See log file for details.</b>" "</font>" "</center><br>\n",cc,msg); } // // CLOUD SEARCH ENGINE SUPPORT // if ( add && guide ) printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg ); // print the add collection box if ( add /*&& (! nc[0] || g_errno ) */ ) { const char *t1 = "Add Collection"; if ( guide ) t1 = "Add Search Engine"; p.safePrintf ( "<center>\n<table %s>\n" "<tr class=hdrow><td colspan=2>" "<center><b>%s</b></center>" "</td></tr>\n" ,TABLE_STYLE ,t1 ); const char *t2 = "collection"; if ( guide ) t2 = "search engine"; const char *str = addColl; if ( ! 
addColl ) str = ""; p.safePrintf ( "<tr bgcolor=#%s>" "<td><b>name of new %s to add</td>\n" "<td><input type=text name=addcoll size=30 " "id=acbox " "value=\"%s\">" "</td></tr>\n" , LIGHT_BLUE , t2 , str ); // don't show the clone box if we are under gigabot the guide if ( ! guide ) p.safePrintf( "<tr bgcolor=#%s>" "<td><b>clone settings from this " "collection</b>" "<br><font size=1>Copy settings from " "this pre-existing collection. Leave " "blank to " "accept default values.</font></td>\n" "<td><input type=text name=clonecoll " "size=30>" "</td>" "</tr>" , LIGHT_BLUE ); // collection pwds p.safePrintf( "<tr bgcolor=#%s>" "<td><b>collection passwords" "</b>" "<br><font size=1>List of white space separated " "passwords allowed to adminster collection." "</font>" "</td>\n" "<td><input type=text name=collpwd " "size=60>" "</td>" "</tr>" , LIGHT_BLUE ); // ips box for security p.safePrintf( "<tr bgcolor=#%s>" "<td><b>collection ips" "</b>" "<br><font size=1>List of white space separated " "IPs allowed to adminster collection." "</font>" "</td>\n" "<td><input type=text name=collips " "size=60>" "</td>" "</tr>" , LIGHT_BLUE ); // now list collections from which to copy the config //p.safePrintf ( // "<tr><td><b>copy configuration from this " // "collection</b><br><font size=1>Leave blank to " // "accept default values.</font></td>\n" // "<td><input type=text name=cpc value=\"%s\" size=30>" // "</td></tr>\n",coll); p.safePrintf ( "</table></center><br>\n"); // wrap up the form started by printAdminTop g_pages.printAdminBottom ( &p ); int32_t bufLen = p.length(); return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen); } // if we added a collection, print its page //if ( add && nc[0] && ! 
g_errno ) // return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH , // nc , pwd ); if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip; // print all collections out in a checklist so you can check the // ones you want to delete, the values will be the id of that collectn p.safePrintf ( "<center>\n<table %s>\n" "<tr class=hdrow><td><center><b>Delete Collections" "</b></center></td></tr>\n" "<tr bgcolor=#%s><td>" "<center><b>Select the collections you wish to delete. " //"<font color=red>This feature is currently under " //"development.</font>" "</b></center></td></tr>\n" "<tr bgcolor=#%s><td>" // table within a table "<center><table width=20%%>\n", TABLE_STYLE, LIGHT_BLUE, DARK_BLUE ); for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; p.safePrintf ( "<tr bgcolor=#%s><td>" "<input type=checkbox name=delcoll value=\"%s\"> " "%s</td></tr>\n", DARK_BLUE, cr->m_coll,cr->m_coll); } p.safePrintf( "</table></center></td></tr></table><br>\n" ); skip: // wrap up the form started by printAdminTop g_pages.printAdminBottom ( &p ); int32_t bufLen = p.length(); return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen); }
// . displays the stats for a username // . show stats for every day we have them for // . in a big list // . if they click the day display all docids evaluated for that day // . show the accuracy for that day too // . how many docs they edited // . how many of those docs were verified by another // . and if there was consensus void gotTransdbList ( State60 *st ) { // get today's time range time_t now = getTimeGlobal(); // get start of today time_t dayStart = now / (24*3600); SafeBuf sb; // int16_tcut TcpSocket *s = st->m_s; // make about 200k of mem to write into if ( ! sb.reserve ( 200000 ) ) return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno)); // print description so they can clikc a button to start the turk sb.safePrintf("<html>\n" "<title>Event Editor</title>\n" "<body>\n" "<table width=\"100%%\" border=\"0\">\n" "<tr><td style=\"background-color:#0079ba;\">\n" "<center><font color=#00000>" "<h2>Event Editor</h2>\n" "</font></center></td>" "</tr></table>"); // print the content sb.safePrintf("<center><font size=4><blink>" "<b><a href=\"/pageturk?c=%s&edit=1\">" "Click here to start editing.</a></b></blink>" "</font><br><i>Please take your " "time to read the information below before you begin" "</i><br><font color=\"red\" size=2> Warning: Adult " "content might be presented to you." " You should be above 18 years of age to continue." "</center></font>",st->m_coll); sb.safePrintf("<font face=arial,sans-serif color=black size=3>" "<p>By clicking <i>Start Voting</i>, you will be " "presented with an interface for editing events. " "The editor will display a modified web page that " "contains one or more events. Each event's description " "will be highlight with a blue background. You can " "toggle whether a particular event is displayed by " "clicking on that event's ID. You can highlight one or " "multiple event descriptions at the same time. 
" "</p><p>" "By clicking on the section icons in the web page you " "can tell the editor that a virtual fence should be " "erected around that section. The fence will make sure " "that event descriptions can not span across it. Each " "event description must be fully contained either " "inside or outside the fence. However, you can also " "declare a section as a title section, which means that " "the text that the title section contains is free to be " "used by any event description." "</p>\n" "<p>When you are done erecting section fences, you " "submit your changes. The more changes you make the " "more points you earn. Other users may evaluate " "your edits for accuracy. You will be paid based on the " "points you earn as well as your accuracy. All " "transactions are listed in the table below.</p>" "<p>You may not change your username or password " "but you can change your email address. Your email " "address will be used to pay you with PayPal every " "Friday. Paypal fees will be deducted on your end. By " "using this service you agree to all stated Terms & " "Conditions.</p>" "</font>\n"); // get the user record User *uu = g_users.getUser ( username ); // print out their info, like paypal email sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Info</center>" "</td></tr>\n" "<tr>" "<td>Email</td>" "<td><input type=text value=%s></td>" "<td>email address used to pay with paypal</td>" "</tr>\n" "<tr><td colspan=10><input type=submit value=update>" "</td></tr>\n" "</table>\n" , uu->m_payPalEmail ); // print your stats here now sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Stats</center>" "</td></tr>\n" "<tr>" "<td>date</td>" "<td>action</td>" "<td>amount</td>" "<td>desc</td>" "</tr>\n"); // int16_tcut RdbList *list = &st->m_list; int32_t lastDay = -1; int32_t totalReceives = 0; int32_t totalSubmits = 0; int32_t totalPasses = 0; int32_t totalFails = 0; // scan the list for ( ; ! 
list->isExhausted() ; ) { // get rec char *rec = list->getCurrentRecord(); char *data = list->getCurrentData(); int32_t dataSize = list->getCurrentDataSize(); // skip that list->skipCurrentRecord(); // skip if negative if ( (rec[0] & 0x01) == 0x00 ) continue; // get the time (global time - sync'd with host #0) time_t tt = g_transdb.getTimeStamp ( rec ); // get day # int32_t daynum = tt / (24*3600); // is it today? bool isToday = ( daynum >= dayStart ); // point to the Transaction Trans *trans = (Trans *)data; // if is today, print it out verbatim if ( isToday ) { // print it in html row format to match table above //printTrans ( &sb , rec ); sb.safePrintf("<tr>"); // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%H:%M:%S",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats if ( trans->m_actionType == AT_RECEIVE_DOC ) sb.safePrintf("<td>receive</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_SUBMIT_DOC ) sb.safePrintf("<td>submit</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_PASS_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was verified " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_FAIL_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was deemed to " "be incorrect " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_ACCURACY_EVAL) sb.safePrintf("<td>accuracy eval</td>" "<td>%.02f</td>" "<td>docid=%"UINT64"</td>", trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_CHARGE) sb.safePrintf("<td>credit</td>" "<td>%.02f</td>" "<td>You made 
money.</td>", trans->m_number); else if ( trans->m_actionType == AT_PAYMENT) sb.safePrintf("<td>payment</td>" "<td>%.02f</td>" "<td>We paid you.</td>", trans->m_number); else if ( trans->m_actionType == AT_LOGIN) sb.safePrintf("<td>login</td>" "<td>-</td>" "<td>You logged in.</td>"); else if ( trans->m_actionType == AT_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You logged out.</td>"); else if ( trans->m_actionType == AT_AUTO_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You were auto " "logged out.</td>"); else { char *xx=NULL;*xx=0; } sb.safePrintf("</tr>\n"); continue; } // if does not match last day, print out that last day's stats // and reset for next guy if ( daynum != lastDay && lastDay != -1 ) { // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats sb.safePrintf("<tr>" "<td>receive</td>" "<td>%"INT32"</td>" "<td>Total received</td>" "</tr>\n", totalReceives); sb.safePrintf("<tr>" "<td>submit</td>" "<td>%"INT32"</td>" "<td>Total submitted</td>" "</tr>\n", totalSubmits); sb.safePrintf("<tr>" "<td>pass</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests passed</td>" "</tr>\n", totalPasses); sb.safePrintf("<tr>" "<td>fail</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests failed</td>" "</tr>\n", totalFails); // reset as well totalReceived = 0; totalSubmits = 0; totalPasses = 0; totalFails = 0; } // remember last day # we processed for accumulating stats lastDay = daynum; // accum stats if ( trans->m_actionType == AT_RECEIVE_DOC ) totalReceives++; if ( trans->m_actionType == AT_SUBMIT_DOC ) totalSubmits++; if ( trans->m_actionType == AT_PASS_DOC ) totalPasses++; if ( trans->m_actionType == AT_FAIL_DOC ) totalFails++; } sb.safePrintf("</body></html>\n"); sendReply ( &sb ); }
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... 
) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); if ( ! coll ) return sendErrorReply ( st , ENOCOLLREC ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? 
st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." 
"</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." 
"</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>", TABLE_STYLE, us , dd, rr, render ); xbuf->safePrintf( "<tr class=poo>" "<td>" "<b>content type below is</b>" "<br><font size=-2>" "Is the content below HTML? XML? JSON?" "</font>" "</td>" "<td>" //"<input type=checkbox name=xml value=1> " "<select name=ctype>\n" "<option value=%li selected>HTML</option>\n" "<option value=%li selected>XML</option>\n" "<option value=%li selected>JSON</option>\n" "</select>\n" "</td>" "</tr>", (long)CT_HTML, (long)CT_XML, (long)CT_JSON ); xbuf->safePrintf( "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. 
bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; contentType = r->getLong("ctype",contentType);//CT_HTML); // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." 
"</td></tr>", endTime - st->m_startTime ); // print Url Catgory Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4098, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , pend - p ); // 
clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }