// . the url being reuqested // . removes &code= facebook cruft bool HttpRequest::getCurrentUrl ( SafeBuf &cu ) { // makre sure we got enough room if ( ! cu.reserve ( m_hostLen + 64 + m_plen + 1 + 1 ) ) return false; // need a "Host: " char *host = m_host; if ( ! host ) host = APPSUBDOMAIN; cu.safePrintf("http"); if ( m_isSSL ) cu.pushChar('s'); cu.safePrintf("://%s",host); char *path = m_path; long plen = m_plen; if ( ! path ) { path = "/"; plen = 1; } // . scan path and change \0 back to = or & // . similar logic in HttpServer.cpp for logging! char *dst = cu.getBuf(); char *src = path; char *srcEnd = path + plen; char dd = '='; for ( ; src < srcEnd ; src++ , dst++ ) { *dst = *src; if ( *src ) continue; *dst = dd; if ( dd == '=' ) dd = '&'; else dd = '='; } *dst = '\0'; // cut it off at facebook's &code= char *buf = cu.getBufStart(); char *code = strstr( buf,"&code="); // fix for eventguru.com/blog.html?code= if ( ! code ) code = strstr(buf,"?code="); // hack that off if there if ( code ) { *code = '\0'; dst = code; } // update length cu.setLength( dst - cu.getBufStart() ); return true; }
// . appends the path of the requested url to "cup", WITHOUT the query
//   string: copying stops at the first '?'
// . unlike getCurrentUrl() this does not rewrite embedded \0s, bytes
//   before the '?' are copied verbatim
// . returns false if "cup" could not be grown, true otherwise
bool HttpRequest::getCurrentUrlPath ( SafeBuf &cup ) {
	// default to the root path. const because it may point at a
	// string literal.
	const char *path = m_path;
	long        plen = m_plen;
	if ( ! path ) {
		path = "/";
		plen = 1;
	}
	// make sure we got enough room for the path we will actually
	// copy, plus the terminating \0
	if ( ! cup.reserve ( plen + 1 + 1 ) ) return false;
	// write directly into the free space at the end of the buffer
	char       *dst    = cup.getBuf();
	char       *start  = dst;
	const char *src    = path;
	const char *srcEnd = path + plen;
	// stop if we hit '?'
	for ( ; src < srcEnd && *src != '?' ; src++ , dst++ )
		*dst = *src;
	// account for the bytes we copied. the \0 is not counted.
	cup.incrementLength ( dst - start );
	*dst = '\0';
	return true;
}
void Msg39::estimateHitsAndSendReply ( ) { // no longer in use m_inUse = false; // now this for the query loop on the QueryLogEntries. m_topDocId50 = 0LL; m_topScore50 = 0.0; // a little hack for the seo pipeline in xmldoc.cpp m_topDocId = 0LL; m_topScore = 0.0; m_topDocId2 = 0LL; m_topScore2 = 0.0; int32_t ti = m_tt.getHighNode(); if ( ti >= 0 ) { TopNode *t = &m_tt.m_nodes[ti]; m_topDocId = t->m_docId; m_topScore = t->m_score; } // try the 2nd one too int32_t ti2 = -1; if ( ti >= 0 ) ti2 = m_tt.getNext ( ti ); if ( ti2 >= 0 ) { TopNode *t2 = &m_tt.m_nodes[ti2]; m_topDocId2 = t2->m_docId; m_topScore2 = t2->m_score; } // convenience ptrs. we will store the docids/scores into these arrays int64_t *topDocIds; double *topScores; key_t *topRecs; // numDocIds counts docs in all tiers when using toptree. int32_t numDocIds = m_tt.m_numUsedNodes; // the msg39 reply we send back int32_t replySize; char *reply; //m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6; // make the reply? Msg39Reply mr; // this is what you want to look at if there is no seo.cpp module... if ( ! m_callback ) { // if we got clusterdb recs in here, use 'em if ( m_gotClusterRecs ) numDocIds = m_numVisible; // don't send more than the docs that are asked for if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet; // # of QueryTerms in query int32_t nqt = m_tmpq.m_numTerms; // start setting the stuff mr.m_numDocIds = numDocIds; // copy # estiamted hits into 8 bytes of reply //int64_t est = m_posdbTable.m_estimatedTotalHits; // ensure it has at least as many results as we got //if ( est < numDocIds ) est = numDocIds; // or if too big... //if ( numDocIds < m_r->m_docsToGet ) est = numDocIds; // . total estimated hits // . this is now an EXACT count! 
mr.m_estimatedHits = m_numTotalHits; // sanity check mr.m_nqt = nqt; // the m_errno if any mr.m_errno = m_errno; // int16_tcut PosdbTable *pt = &m_posdbTable; // the score info, in no particular order right now mr.ptr_scoreInfo = pt->m_scoreInfoBuf.getBufStart(); mr.size_scoreInfo = pt->m_scoreInfoBuf.length(); // that has offset references into posdbtable::m_pairScoreBuf // and m_singleScoreBuf, so we need those too now mr.ptr_pairScoreBuf = pt->m_pairScoreBuf.getBufStart(); mr.size_pairScoreBuf = pt->m_pairScoreBuf.length(); mr.ptr_singleScoreBuf = pt->m_singleScoreBuf.getBufStart(); mr.size_singleScoreBuf = pt->m_singleScoreBuf.length(); // save some time since seo.cpp gets from posdbtable directly, // so we can avoid serializing/copying this stuff at least if ( ! m_r->m_makeReply ) { mr.size_scoreInfo = 0; mr.size_pairScoreBuf = 0; mr.size_singleScoreBuf = 0; } //mr.m_sectionStats = pt->m_sectionStats; // reserve space for these guys, we fill them in below mr.ptr_docIds = NULL; mr.ptr_scores = NULL; mr.ptr_clusterRecs = NULL; // this is how much space to reserve mr.size_docIds = 8 * numDocIds; // int64_t mr.size_scores = sizeof(double) * numDocIds; // float // if not doing site clustering, we won't have these perhaps... if ( m_gotClusterRecs ) mr.size_clusterRecs = sizeof(key_t) *numDocIds; else mr.size_clusterRecs = 0; #define MAX_FACETS 20000 ///////////////// // // FACETS // ///////////////// // We can have multiple gbfacet: terms in a query so // serialize all the QueryTerm::m_facetHashTables into // Msg39Reply::ptr_facetHashList. // // combine the facet hash lists of each query term into // a list of lists. 
each lsit is preceeded by the query term // id of the query term (like gbfacet:xpathsitehash12345) // followed by a 4 byte length of the following 32-bit // facet values int32_t need = 0; for ( int32_t i = 0 ; i < m_tmpq.m_numTerms; i++ ) { QueryTerm *qt = &m_tmpq.m_qterms[i]; // skip if not facet if ( qt->m_fieldCode != FIELD_GBFACETSTR && qt->m_fieldCode != FIELD_GBFACETINT && qt->m_fieldCode != FIELD_GBFACETFLOAT ) continue; HashTableX *ft = &qt->m_facetHashTable; if ( ft->m_numSlotsUsed == 0 ) continue; int32_t used = ft->m_numSlotsUsed; // limit for memory if ( used > (int32_t)MAX_FACETS ) { log("msg39: truncating facet list to 20000 " "from %"INT32" for %s",used,qt->m_term); used = (int32_t)MAX_FACETS; } // store query term id 64 bit need += 8; // then size need += 4; // then buckets. keys and counts need += (4+sizeof(FacetEntry)) * used; } // allocate SafeBuf tmp; if ( ! tmp.reserve ( need ) ) { log("query: Could not allocate memory " "to hold reply facets"); sendReply(m_slot,this,NULL,0,0,true); return; } // point to there char *p = tmp.getBufStart(); for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) { QueryTerm *qt = &m_tmpq.m_qterms[i]; // skip if not facet if ( qt->m_fieldCode != FIELD_GBFACETSTR && qt->m_fieldCode != FIELD_GBFACETINT && qt->m_fieldCode != FIELD_GBFACETFLOAT ) continue; // get all the facet hashes and their counts HashTableX *ft = &qt->m_facetHashTable; // skip if none if ( ft->m_numSlotsUsed == 0 ) continue; // store query term id 64 bit *(int64_t *)p = qt->m_termId; p += 8; int32_t used = ft->getNumSlotsUsed(); if ( used > (int32_t)MAX_FACETS ) used = (int32_t)MAX_FACETS; // store count *(int32_t *)p = used; p += 4; int32_t count = 0; // for sanity check char *pend = p + (used * (4+sizeof(FacetEntry))); // serialize the key/val pairs for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) { // skip empty buckets if ( ! ft->m_flags[k] ) continue; // store key. the hash of the facet value. 
*(int32_t *)p = ft->getKey32FromSlot(k); p += 4; // then store count //*(int32_t *)p = ft->getVal32FromSlot(k); p += 4; // now this has a docid on it so we can // lookup the text of the facet in Msg40.cpp FacetEntry *fe; fe = (FacetEntry *)ft->getValFromSlot(k); // sanity // no, count can be zero if its a range facet // that was never added to. we add those // empty FaceEntries only for range facets // in Posdb.cpp //if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;} gbmemcpy ( p , fe , sizeof(FacetEntry) ); p += sizeof(FacetEntry); // do not breach if ( ++count >= (int32_t)MAX_FACETS ) break; } // sanity check if ( p != pend ) { char *xx=NULL;*xx=0; } // do the next query term } // now point to that so it can be serialized below mr.ptr_facetHashList = tmp.getBufStart(); mr.size_facetHashList = p - tmp.getBufStart();//tmp.length(); ///////////// // // END FACETS // ///////////// // . that is pretty much it,so serialize it into buffer,"reply" // . mr.ptr_docIds, etc., will point into the buffer so we can // re-serialize into it below from the tree // . returns NULL and sets g_errno on error // . "true" means we should make mr.ptr_* reference into the // newly serialized buffer. reply = serializeMsg ( sizeof(Msg39Reply), // baseSize &mr.size_docIds, // firstSizeParm &mr.size_clusterRecs,//lastSizePrm &mr.ptr_docIds , // firstStrPtr &mr , // thisPtr &replySize , NULL , 0 , true ) ; if ( ! reply ) { log("query: Could not allocated memory " "to hold reply of docids to send back."); sendReply(m_slot,this,NULL,0,0,true); return; } topDocIds = (int64_t *) mr.ptr_docIds; topScores = (double *) mr.ptr_scores; topRecs = (key_t *) mr.ptr_clusterRecs; } int32_t docCount = 0; // loop over all results in the TopTree for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ; ti = m_tt.getPrev(ti) ) { // get the guy TopNode *t = &m_tt.m_nodes[ti]; // skip if clusterLevel is bad! if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK ) continue; // if not sending back a reply... 
we were called from seo.cpp // State3f logic to evaluate a QueryLogEntry, etc. if ( m_callback ) { // skip results past #50 if ( docCount > 50 ) continue; // set this m_topScore50 = t->m_score; m_topDocId50 = t->m_docId; // that's it continue; } // get the docid ptr //char *diptr = t->m_docIdPtr; //int64_t docId = getDocIdFromPtr(diptr); // sanity check if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; } //add it to the reply topDocIds [docCount] = t->m_docId; topScores [docCount] = t->m_score; if ( m_tt.m_useIntScores ) topScores[docCount] = (double)t->m_intScore; // supply clusterdb rec? only for full splits if ( m_gotClusterRecs ) topRecs [docCount] = t->m_clusterRec; //topExplicits [docCount] = // getNumBitsOn(t->m_explicits) docCount++; // 50th score? set this for seo.cpp. if less than 50 results // we want the score of the last doc then. if ( docCount <= 50 ) m_topScore50 = t->m_score; if ( m_debug ) { logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] " "%03"INT32") docId=%012"UINT64" sum=%.02f", (PTRTYPE)this, docCount, t->m_docId,t->m_score); } //don't send more than the docs that are wanted if ( docCount >= numDocIds ) break; } if ( docCount > 300 && m_debug ) log("query: Had %"INT32" nodes in top tree",docCount); // this is sensitive info if ( m_debug ) { log(LOG_DEBUG, "query: msg39: [%"PTRFMT"] " "Intersected lists took %"INT64" (%"INT64") " "ms " "docIdsToGet=%"INT32" docIdsGot=%"INT32" " "q=%s", (PTRTYPE)this , m_posdbTable.m_addListsTime , gettimeofdayInMilliseconds() - m_startTime , m_r->m_docsToGet , numDocIds , m_tmpq.getQuery() ); } // if we blocked because we used a thread then call callback if // summoned from a msg3f handler and not a msg39 handler if ( m_callback ) { // if we blocked call user callback if ( m_blocked ) m_callback ( m_state ); // if not sending back a udp reply, return now return; } // now send back the reply sendReply(m_slot,this,reply,replySize,replySize,false); return; }
// . displays the stats for a username // . show stats for every day we have them for // . in a big list // . if they click the day display all docids evaluated for that day // . show the accuracy for that day too // . how many docs they edited // . how many of those docs were verified by another // . and if there was consensus void gotTransdbList ( State60 *st ) { // get today's time range time_t now = getTimeGlobal(); // get start of today time_t dayStart = now / (24*3600); SafeBuf sb; // int16_tcut TcpSocket *s = st->m_s; // make about 200k of mem to write into if ( ! sb.reserve ( 200000 ) ) return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno)); // print description so they can clikc a button to start the turk sb.safePrintf("<html>\n" "<title>Event Editor</title>\n" "<body>\n" "<table width=\"100%%\" border=\"0\">\n" "<tr><td style=\"background-color:#0079ba;\">\n" "<center><font color=#00000>" "<h2>Event Editor</h2>\n" "</font></center></td>" "</tr></table>"); // print the content sb.safePrintf("<center><font size=4><blink>" "<b><a href=\"/pageturk?c=%s&edit=1\">" "Click here to start editing.</a></b></blink>" "</font><br><i>Please take your " "time to read the information below before you begin" "</i><br><font color=\"red\" size=2> Warning: Adult " "content might be presented to you." " You should be above 18 years of age to continue." "</center></font>",st->m_coll); sb.safePrintf("<font face=arial,sans-serif color=black size=3>" "<p>By clicking <i>Start Voting</i>, you will be " "presented with an interface for editing events. " "The editor will display a modified web page that " "contains one or more events. Each event's description " "will be highlight with a blue background. You can " "toggle whether a particular event is displayed by " "clicking on that event's ID. You can highlight one or " "multiple event descriptions at the same time. 
" "</p><p>" "By clicking on the section icons in the web page you " "can tell the editor that a virtual fence should be " "erected around that section. The fence will make sure " "that event descriptions can not span across it. Each " "event description must be fully contained either " "inside or outside the fence. However, you can also " "declare a section as a title section, which means that " "the text that the title section contains is free to be " "used by any event description." "</p>\n" "<p>When you are done erecting section fences, you " "submit your changes. The more changes you make the " "more points you earn. Other users may evaluate " "your edits for accuracy. You will be paid based on the " "points you earn as well as your accuracy. All " "transactions are listed in the table below.</p>" "<p>You may not change your username or password " "but you can change your email address. Your email " "address will be used to pay you with PayPal every " "Friday. Paypal fees will be deducted on your end. By " "using this service you agree to all stated Terms & " "Conditions.</p>" "</font>\n"); // get the user record User *uu = g_users.getUser ( username ); // print out their info, like paypal email sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Info</center>" "</td></tr>\n" "<tr>" "<td>Email</td>" "<td><input type=text value=%s></td>" "<td>email address used to pay with paypal</td>" "</tr>\n" "<tr><td colspan=10><input type=submit value=update>" "</td></tr>\n" "</table>\n" , uu->m_payPalEmail ); // print your stats here now sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Stats</center>" "</td></tr>\n" "<tr>" "<td>date</td>" "<td>action</td>" "<td>amount</td>" "<td>desc</td>" "</tr>\n"); // int16_tcut RdbList *list = &st->m_list; int32_t lastDay = -1; int32_t totalReceives = 0; int32_t totalSubmits = 0; int32_t totalPasses = 0; int32_t totalFails = 0; // scan the list for ( ; ! 
list->isExhausted() ; ) { // get rec char *rec = list->getCurrentRecord(); char *data = list->getCurrentData(); int32_t dataSize = list->getCurrentDataSize(); // skip that list->skipCurrentRecord(); // skip if negative if ( (rec[0] & 0x01) == 0x00 ) continue; // get the time (global time - sync'd with host #0) time_t tt = g_transdb.getTimeStamp ( rec ); // get day # int32_t daynum = tt / (24*3600); // is it today? bool isToday = ( daynum >= dayStart ); // point to the Transaction Trans *trans = (Trans *)data; // if is today, print it out verbatim if ( isToday ) { // print it in html row format to match table above //printTrans ( &sb , rec ); sb.safePrintf("<tr>"); // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%H:%M:%S",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats if ( trans->m_actionType == AT_RECEIVE_DOC ) sb.safePrintf("<td>receive</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_SUBMIT_DOC ) sb.safePrintf("<td>submit</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_PASS_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was verified " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_FAIL_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was deemed to " "be incorrect " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_ACCURACY_EVAL) sb.safePrintf("<td>accuracy eval</td>" "<td>%.02f</td>" "<td>docid=%"UINT64"</td>", trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_CHARGE) sb.safePrintf("<td>credit</td>" "<td>%.02f</td>" "<td>You made 
money.</td>", trans->m_number); else if ( trans->m_actionType == AT_PAYMENT) sb.safePrintf("<td>payment</td>" "<td>%.02f</td>" "<td>We paid you.</td>", trans->m_number); else if ( trans->m_actionType == AT_LOGIN) sb.safePrintf("<td>login</td>" "<td>-</td>" "<td>You logged in.</td>"); else if ( trans->m_actionType == AT_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You logged out.</td>"); else if ( trans->m_actionType == AT_AUTO_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You were auto " "logged out.</td>"); else { char *xx=NULL;*xx=0; } sb.safePrintf("</tr>\n"); continue; } // if does not match last day, print out that last day's stats // and reset for next guy if ( daynum != lastDay && lastDay != -1 ) { // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats sb.safePrintf("<tr>" "<td>receive</td>" "<td>%"INT32"</td>" "<td>Total received</td>" "</tr>\n", totalReceives); sb.safePrintf("<tr>" "<td>submit</td>" "<td>%"INT32"</td>" "<td>Total submitted</td>" "</tr>\n", totalSubmits); sb.safePrintf("<tr>" "<td>pass</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests passed</td>" "</tr>\n", totalPasses); sb.safePrintf("<tr>" "<td>fail</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests failed</td>" "</tr>\n", totalFails); // reset as well totalReceived = 0; totalSubmits = 0; totalPasses = 0; totalFails = 0; } // remember last day # we processed for accumulating stats lastDay = daynum; // accum stats if ( trans->m_actionType == AT_RECEIVE_DOC ) totalReceives++; if ( trans->m_actionType == AT_SUBMIT_DOC ) totalSubmits++; if ( trans->m_actionType == AT_PASS_DOC ) totalPasses++; if ( trans->m_actionType == AT_FAIL_DOC ) totalFails++; } sb.safePrintf("</body></html>\n"); sendReply ( &sb ); }
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." 
"</td></tr>", endTime - st->m_startTime ); // print Url Catgory Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4098, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , pend - p ); // 
clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }
bool Msg3a::gotAllSplitReplies ( ) { // if any of the split requests had an error, give up and set m_errno // but don't set if for non critical errors like query truncation if ( m_errno ) { g_errno = m_errno; return true; } // also reset the finalbuf and the oldNumTopDocIds if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // update our estimated total hits m_numTotalEstimatedHits = 0; for ( long i = 0; i < m_numHosts ; i++ ) { // get that host that gave us the reply //Host *h = g_hostdb.getHost(i); // . get the reply from multicast // . multicast should have destroyed all slots, but saved reply // . we are responsible for freeing the reply // . we need to call this even if g_errno or m_errno is // set so we can free the replies in Msg3a::reset() // . if we don't call getBestReply() on it multicast should // free it, because Multicast::m_ownReadBuf is still true Multicast *m = &m_mcast[i]; bool freeit = false; long replySize = 0; long replyMaxSize; char *rbuf; Msg39Reply *mr; // . only get it if the reply not already full // . if reply already processed, skip // . perhaps it had no more docids to give us or all termlists // were exhausted on its disk and this is a re-call // . we have to re-process it for count m_numTotalEstHits, etc. rbuf = m->getBestReply ( &replySize , &replyMaxSize , &freeit , true ); //stealIt? // cast it mr = (Msg39Reply *)rbuf; // in case of mem leak, re-label from "mcast" to this so we // can determine where it came from, "Msg3a-GBR" relabel( rbuf, replyMaxSize , "Msg3a-GBR" ); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeit ) { log(LOG_LOGIC,"query: msg3a: Steal failed."); char *xx = NULL; *xx=0; } // bad reply? if ( ! 
mr ) { log(LOG_LOGIC,"query: msg3a: Bad NULL reply."); m_reply [i] = NULL; m_replyMaxSize[i] = 0; // it might have been timd out, just ignore it!! continue; // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; // all reply buffers should be freed on reset() return true; } // how did this happen? if ( replySize < 29 && ! mr->m_errno ) { // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.", replySize); // all reply buffers should be freed on reset() return true; } // can this be non-null? we shouldn't be overwriting one // without freeing it... if ( m_reply[i] ) // note the mem leak now log("query: mem leaking a 0x39 reply"); // cast it and set it m_reply [i] = mr; m_replyMaxSize[i] = replyMaxSize; // deserialize it (just sets the ptr_ and size_ member vars) //mr->deserialize ( ); deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); // sanity check if ( mr->m_nqt != m_q->getNumTerms() ) { g_errno = EBADREPLY; m_errno = EBADREPLY; log("query: msg3a: Split reply qterms=%li != %li.", (long)mr->m_nqt,(long)m_q->getNumTerms() ); return true; } // return if split had an error, but not for a non-critical // error like query truncation if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) { g_errno = mr->m_errno; m_errno = mr->m_errno; log("query: msg3a: Split had error: %s", mstrerror(g_errno)); return true; } // skip down here if reply was already set //skip: // add of the total hits from each split, this is how many // total results the lastest split is estimated to be able to // return // . THIS should now be exact since we read all termlists // of posdb... m_numTotalEstimatedHits += mr->m_estimatedHits; // debug log stuff if ( ! 
m_debug ) continue; // cast these for printing out long long *docIds = (long long *)mr->ptr_docIds; score_t *scores = (score_t *)mr->ptr_scores; // print out every docid in this split reply for ( long j = 0; j < mr->m_numDocIds ; j++ ) { // print out score_t logf( LOG_DEBUG, "query: msg3a: [%lu] %03li) " "split=%li docId=%012llu domHash=0x%02lx " "score=%lu" , (unsigned long)this , j , i , docIds [j] , (long)g_titledb.getDomHash8FromDocId(docIds[j]), (long)scores[j] ); } } // this seems to always return true! mergeLists ( ); if ( ! m_r->m_useSeoResultsCache ) return true; // now cache the reply SafeBuf cr; long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4); long need = sizeof(key_t) + 4 + dataSize; bool status = cr.reserve ( need ); // sanity if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; } // ignore errors g_errno = 0; // return on error with g_errno cleared if cache add failed if ( ! status ) return true; // add to buf otherwise cr.safeMemcpy ( &m_ckey , sizeof(key_t) ); cr.safeMemcpy ( &dataSize , 4 ); long now = getTimeGlobal(); cr.pushLong ( now ); cr.pushLong ( m_numDocIds ); cr.pushLong ( m_numTotalEstimatedHits );//Results ); long max = m_numDocIds; // then the docids for ( long i = 0 ; i < max ; i++ ) cr.pushLongLong(m_docIds[i] ); for ( long i = 0 ; i < max ; i++ ) cr.pushFloat(m_scores[i]); for ( long i = 0 ; i < max ; i++ ) cr.pushLong(getSiteHash26(i)); // sanity if ( cr.length() != need ) { char *xx=NULL; *xx=0; } // make these key_t startKey; key_t endKey; startKey = m_ckey; // clear delbit startKey.n0 &= 0xfffffffffffffffeLL; // end key is us endKey = m_ckey; // that is the single record m_seoCacheList.set ( cr.getBufStart() , cr.length(), cr.getBufStart(), // alloc cr.getCapacity(), // alloc size (char *)&startKey, (char *)&endKey, -1, // fixeddatasize true, // owndata? false,// use half keys? 
sizeof(key_t) ); // do not allow cr to free it, msg1 will cr.detachBuf(); // note it //log("seopipe: storing ckey=%s q=%s" // ,KEYSTR(&m_ckey,12) // ,m_r->ptr_query // ); //log("msg1: sending niceness=%li",(long)m_r->m_niceness); // this will often block, but who cares!? it just sends a request off if ( ! m_msg1.addList ( &m_seoCacheList , RDB_SERPDB,//RDB_CACHEDB, m_r->ptr_coll, this, // state gotSerpdbReplyWrapper, // callback false, // forcelocal? m_r->m_niceness ) ) { //log("blocked"); return false; } // we can safely delete m_msg17... just return true return true; }
// // . ENTRY POINT FOR IMPORTING TITLEDB RECS FROM ANOTHER CLUSTER // . when user clicks 'begin' in import page we come here.. // . so when that parm changes in Parms.cpp we sense that and call // beginImport(CollectionRec *cr) // . or on startup we call resumeImports to check each coll for // an import in progress. // . search for files named titledb*.dat // . if none found just return // . when msg7 inject competes it calls this // . call this from sleep wrapper in Process.cpp // . returns false if would block (outstanding injects), true otherwise // . sets g_errno on error bool ImportState::importLoop ( ) { CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! cr || g_hostdb.m_hostId != 0 ) { // if coll was deleted! log("import: collnum %li deleted while importing into", (long)m_collnum); //if ( m_numOut > m_numIn ) return true; // delete the entire import state i guess // what happens if we have a msg7 reply come back in? // it should see the collrec is NULL and just fail. mdelete ( this, sizeof(ImportState) , "impstate"); delete (this); return true; } INJECTLOOP: // stop if waiting on outstanding injects long long out = m_numOut - m_numIn; if ( out >= cr->m_numImportInjects ) { g_errno = 0; return false; } if ( ! cr->m_importEnabled ) { // wait for all to return if ( out > 0 ) return false; // then delete it log("import: collnum %li import loop disabled", (long)m_collnum); mdelete ( this, sizeof(ImportState) , "impstate"); delete (this); return true; } // scan each titledb file scanning titledb0001.dat first, // titledb0003.dat second etc. //long long offset = -1; // . when offset is too big for current m_bigFile file then // we go to the next and set offset to 0. // . sets m_bf and m_fileOffset if ( ! setCurrentTitleFileAndOffset ( ) ) {//cr , -1 ); log("import: import: no files to read"); //goto INJECTLOOP; return true; } // this is -1 if none remain! if ( m_fileOffset == -1 ) { log("import: import fileoffset is -1. 
done."); return true; } long long saved = m_fileOffset; //Msg7 *msg7; //GigablastRequest *gr; //SafeBuf *sbuf = NULL; long need = 12; long dataSize = -1; //XmlDoc xd; key_t tkey; bool status; SafeBuf tmp; SafeBuf *sbuf = &tmp; long long docId; long shardNum; long key; Multicast *mcast; char *req; long reqSize; if ( m_fileOffset >= m_bfFileSize ) { log("inject: import: done processing file %li %s", m_bfFileId,m_bf.getFilename()); goto nextFile; } // read in title rec key and data size status = m_bf.read ( &tkey, sizeof(key_t) , m_fileOffset ); //if ( n != 12 ) goto nextFile; if ( g_errno ) { log("inject: import: reading file error: %s. advancing " "to next file",mstrerror(g_errno)); goto nextFile; } m_fileOffset += 12; // if negative key, skip if ( (tkey.n0 & 0x01) == 0 ) { goto INJECTLOOP; } // if non-negative then read in size status = m_bf.read ( &dataSize , 4 , m_fileOffset ); if ( g_errno ) { log("main: failed to read in title rec " "file. %s. Skipping file %s", mstrerror(g_errno),m_bf.getFilename()); goto nextFile; } m_fileOffset += 4; need += 4; need += dataSize; need += 4; // collnum, first 4 bytes if ( dataSize < 0 || dataSize > 500000000 ) { log("main: could not scan in titledb rec of " "corrupt dataSize of %li. BAILING ENTIRE " "SCAN of file %s",dataSize,m_bf.getFilename()); goto nextFile; } //gr = &msg7->m_gr; //XmlDoc *xd = getAvailXmlDoc(); //msg7 = getAvailMsg7(); mcast = getAvailMulticast(); // if none, must have to wait for some to come back to us if ( ! mcast ) { // restore file offset //m_fileOffset = saved; // no, must have been a oom or something log("import: import no mcast available"); return true;//false; } // this is for holding a compressed titlerec //sbuf = &mcast->m_sbuf;//&gr->m_sbuf; // point to start of buf sbuf->reset(); // ensure we have enough room sbuf->reserve ( need ); // collnum first 4 bytes sbuf->pushLong( (long)m_collnum ); // store title key sbuf->safeMemcpy ( &tkey , sizeof(key_t) ); // then datasize if any. 
neg rec will have -1 datasize if ( dataSize >= 0 ) sbuf->pushLong ( dataSize ); // then read data rec itself into it, compressed titlerec part if ( dataSize > 0 ) { // read in the titlerec after the key/datasize status = m_bf.read ( sbuf->getBuf() , dataSize , m_fileOffset ); if ( g_errno ) { // n != dataSize ) { log("main: failed to read in title rec " "file. %s. Skipping file %s", mstrerror(g_errno),m_bf.getFilename()); // essentially free up this msg7 now //msg7->m_inUse = false; //msg7->reset(); goto nextFile; } // advance m_fileOffset += dataSize; // it's good, count it sbuf->m_length += dataSize; } // set xmldoc from the title rec //xd->set ( sbuf.getBufStart() ); //xd->m_masterState = NULL; //xd->m_masterCallback ( titledbInjectLoop ); // we use this so we know where the doc we are injecting // was in the foregien titledb file. so we can update our bookmark // code. mcast->m_hackFileOff = saved;//m_fileOffset; mcast->m_hackFileId = m_bfFileId; // // inject a title rec buf this time, we are doing an import // FROM A TITLEDB FILE!!! // //gr->m_titleRecBuf = &sbuf; // break it down into gw // xd.set2 ( sbuf.getBufStart() , // sbuf.length() , // max size // cr->m_coll, // use our coll // NULL , // pbuf for page parser // 1 , // niceness // NULL ); //sreq ); // // note it // log("import: importing %s",xd.m_firstUrl.getUrl()); // now we can set gr for the injection // TODO: inject the whole "sbuf" so we get sitenuminlinks etc // all exactly the same... // gr->m_url = xd.getFirstUrl()->getUrl(); // gr->m_queryToScrape = NULL; // gr->m_contentDelim = 0; // gr->m_contentTypeStr = g_contentTypeStrings [xd.m_contentType]; // gr->m_contentFile = NULL; // gr->m_content = xd.ptr_utf8Content; // gr->m_diffbotReply = NULL; // gr->m_injectLinks = false; // gr->m_spiderLinks = true; // gr->m_shortReply = false; // gr->m_newOnly = false; // gr->m_deleteUrl = false; // gr->m_recycle = true; // recycle content? or sitelinks? 
// gr->m_dedup = false; // gr->m_hasMime = false; // gr->m_doConsistencyTesting = false; // gr->m_getSections = false; // gr->m_gotSections = false; // gr->m_charset = xd.m_charset; // gr->m_hopCount = xd.m_hopCount; // // point to next doc in the titledb file // //m_fileOffset += need; // get docid from key docId = g_titledb.getDocIdFromKey ( &tkey ); // get shard that holds the titlerec for it shardNum = g_hostdb.getShardNumFromDocId ( docId ); // for selecting which host in the shard receives it key = (long)docId; m_numOut++; // then index it. master callback will be called //if ( ! xd->index() ) return false; // TODO: make this forward the request to an appropriate host!! // . gr->m_sbuf is set to the titlerec so this should handle that // and use XmlDoc::set4() or whatever // if ( msg7->injectTitleRec ( msg7 , // state // gotMsg7ReplyWrapper , // callback // cr )) { // // it didn't block somehow... // msg7->m_inUse = false; // msg7->gotMsg7Reply(); // } req = sbuf->getBufStart(); reqSize = sbuf->length(); if ( reqSize != need ) { char *xx=NULL;*xx=0 ; } // do not free it, let multicast free it after sending it sbuf->detachBuf(); if ( ! mcast->send ( req , reqSize , 0x07 , true , // ownmsg? shardNum, false, // send to whole shard? key , // for selecting host in shard mcast , // state NULL , // state2 gotMulticastReplyWrapper , 999999 ) ) { // total timeout in seconds log("import: import mcast had error: %s",mstrerror(g_errno)); m_numIn++; } goto INJECTLOOP; nextFile: // invalidate this flag //m_offIsValid = false; // . and call this function. we add one to m_bfFileId so we // do not re-get the file we just injected. // . sets m_bf and m_fileOffset // . returns false if nothing to read if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , m_bfFileId+1 ); log("import: import: no files left to read"); //goto INJECTLOOP; return true; } // if it returns NULL we are done! log("main: titledb injection loop completed. 
waiting for " "outstanding injects to return."); if ( m_numOut > m_numIn ) return false; log("main: all injects have returned. DONE."); // dummy return return true; }