// . serves the /master/autoban admin page
// . flow: print code-usage table, apply any edits from cgi parms
//   (clear/allow/deny single ips, or wholesale banIps/allowIps/validCodes
//   text boxes), then print the watched-ips and queries-today tables
// . "s" is the client socket, "r" the parsed request
// . returns whatever sendDynamicPage/sendErrorReply returns
//   (false if blocked, true otherwise)
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
	// reply page accumulates in here (256k initial buffer)
	SafeBuf sb(512 * 512,"autobbuf");
	//read in all of the possible cgi parms off the bat:
	//long user = g_pages.getUserType( s , r );
	char *username = g_users.getUsername(r);
	//char *pwd = r->getString ("pwd");
	char *coll = r->getString ("c");
	long banIpsLen;
	char *banIps = r->getString ("banIps" , &banIpsLen , NULL);
	long allowIpsLen;
	char *allowIps = r->getString ("allowIps" , &allowIpsLen , NULL);
	long clearLen;
	char *clear = r->getString ("clear" , &clearLen , NULL);
	// set true whenever we mutate g_conf ip lists; triggers setFromConf()
	bool changed = false;
	long validCodesLen;
	char *validCodes = r->getString ("validCodes", &validCodesLen, NULL);
	long showAllIps = r->getLong("showAllIps", 0);
	long showLongView = r->getLong("longview", 0);
	// do it all from parm now
	//long banRegexLen;
	//char *banRegex = r->getString("banRegex", &banRegexLen, NULL);

	// char *ss = sb.getBuf();
	// char *ssend = sb.getBufEnd();
	g_pages.printAdminTop ( &sb, PAGE_AUTOBAN, username, coll , NULL ,
				s->m_ip );
	//sb.incrementLength(sss - ss);

	// MDW: moved to here
	long now = getTime();

	// scratch fields filled by getCalendarFromMs() below
	long days;
	long hours;
	long minutes;
	long secs;
	long msecs;

	// optional parm: wipe the per-code usage counters
	if(r->getLong("resetcodes", 0)) {
		setCodesFromConf();
	}

	// ------- code usage table -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	getCalendarFromMs((now - m_codeResetTime) * 1000,
			  &days,
			  &hours,
			  &minutes,
			  &secs,
			  &msecs);
	sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
		      "<center><b>Code Usage "
		      "(<a href=\"/master/"
		      "autoban?c=%s&resetcodes=1\">reset</a> "
		      "%li days %li hours %li "
		      "minutes %li sec ago)"
		      "</b></center></td></tr>",
		      DARK_BLUE,
		      coll,
		      days, hours, minutes, secs);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>Code</b></center></td>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Query Count</b></center></td>"
		      "<td><center><b>Bytes Read</b></center></td>"
		      "<td><center><b>Bytes Sent</b></center></td>"
		      "<td><center><b>Outstanding Count</b></center></td>"
		      "<td><center><b>Most Ever Outstanding</b></center></td>"
		      "<td><center><b>Max Outstanding</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);
	// one row per registered code in the hash table
	for(long i = 0; i < m_ht.getNumSlots(); i++) {
		// skip empty slots
		if ( m_ht.getKey ( i ) == 0 ) continue;
		CodeVal *cv = m_ht.getValuePointerFromSlot ( i );
		if ( ! cv ) continue;

		sb.safePrintf("<tr>");
		sb.safePrintf("<td>");
		sb.copyToken(cv->m_code);//m_codeVals[i].m_code);
		sb.safePrintf("</td>");
		sb.safePrintf("<td><center>%s</center> </td>",
			      iptoa(cv->m_ip));
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_count);
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesRead);
		sb.safePrintf("<td><center>%lli</center></td>",
			      cv->m_bytesSent);
		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_outstanding);
		sb.safePrintf("<td><center>%li</center></td>",
			      cv->m_maxEver);
		// bold the max-outstanding cell when it differs from the
		// apparent default of 50
		if ( cv->m_maxOutstanding != 50 )
			sb.safePrintf("<td><center><b>%li</b></center></td>",
				      cv->m_maxOutstanding);
		else
			sb.safePrintf("<td><center>%li</center></td>",
				      cv->m_maxOutstanding);
		sb.safePrintf("</tr>");
	}
	sb.safePrintf ("</table><br><br>\n" );

	// ------- "clear" parm: drop an ip from both conf lists -------
	// (64 is the ipbuf size below; longer strings are silently ignored)
	if(clear && clearLen < 64) {
		long ip = atoip(clear, clearLen);
		if(ip) {
			removeIp(ip);
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, clear, clearLen);
			ipbuf[clearLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      clearLen);
			if(beginning) {
				// splice the token out of the conf string
				// by shifting the tail left in place
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      clearLen);
			if(beginning) {
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}

	// ------- "allow" parm: add ip to allow list, drop from ban -------
	long allowLen;
	char *allow = r->getString ( "allow" , &allowLen , NULL );
	if(allow && allowLen < 64) {
		long ip = atoip(allow, allowLen);
		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, allow, allowLen);
			ipbuf[allowLen] = '\0';
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      allowLen);
			if(!beginning) {
				//its not present, so add it.
				// walk to the NUL terminator of the conf
				// string, then append "\n" + token
				char *p = g_conf.m_allowIps;
				while(*p) p++;
				if(p - g_conf.m_allowIps + allowLen + 2 <
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,allowLen);
					*(p + allowLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>"
						      "Not enough stack space "
						      "to fit allowIps. "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_allowIps +
						      allowLen + 2);
					goto dontRemove1;
				}
			}
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      allowLen);
			if(beginning) {
				//remove it from banned if present.
				char *to = beginning;
				char *from = beginning + allowLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}
 dontRemove1:
	// ------- "deny" parm: add ip to ban list, drop from allow -------
	long denyLen;
	char *deny = r->getString ( "deny" , &denyLen , NULL );
	if(deny && denyLen < 64) {
		long ip = atoip(deny, denyLen);
		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, deny, denyLen);
			ipbuf[denyLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf,
					      denyLen);
			if(!beginning) {
				//its not present, so add it.
				char *p =g_conf.m_banIps;
				while(*p) p++;
				if(p - g_conf.m_banIps + denyLen + 2 <
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,denyLen);
					*(p + denyLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>Not "
						      "enough stack space "
						      "to fit bannedIPs. "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>",
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_banIps +
						      denyLen + 2);
					goto dontRemove2;
				}
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      denyLen);
			if(beginning) {
				//remove it from allowed list if present.
				char *to = beginning;
				char *from = beginning + denyLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
		}
	}
 dontRemove2:
	if(!g_conf.m_doAutoBan) {
		sb.safePrintf("<center><font color=red><b>Autoban is "
			      "disabled, "
			      "turn it on in Master Controls."
			      "</b></font></center><br>");
	}

	// ------- "validCodes" parm: replace the whole code list -------
	if(validCodes) {
		if(validCodesLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit codes. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      validCodesLen);
			validCodes = NULL;
			validCodesLen = 0;
		}
		else {
			memcpy(g_conf.m_validCodes, validCodes,
			       validCodesLen);
			g_conf.m_validCodes[validCodesLen] = '\0';
			trimWhite(g_conf.m_validCodes);
			setCodesFromConf();
		}
	}

	//first remove all of the ips in the conf, then add the passed in
	// ones to the conf parm;
	if (banIps) {
		//ack, the browser puts in crlf when this comes back, so
		//we will have a longer string here than the one we sent
		//out. trim back all extrainious whitespace before we do
		//bounds checking.
		trimWhite(banIps);
		banIpsLen = gbstrlen(banIps);
		if(banIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit bannedIps. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      banIpsLen);
			// truncate rather than overflow the conf buffer
			banIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_banIps, banIps, banIpsLen);
		g_conf.m_banIps[banIpsLen] = '\0';
		changed = true;
	}
	if (allowIps) {
		trimWhite(allowIps);
		allowIpsLen = gbstrlen(allowIps);
		if(allowIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit allowIps. "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>",
				      AUTOBAN_TEXT_SIZE,
				      allowIpsLen);
			allowIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_allowIps, allowIps, allowIpsLen);
		g_conf.m_allowIps[allowIpsLen] = '\0';
		changed = true;
	}

	// re-sync the in-memory tables from the (possibly edited) conf text
	if(changed) {
		trimWhite(g_conf.m_allowIps);
		trimWhite(g_conf.m_banIps);
		setFromConf();
	}

	// ------- "Add IPs" form (parm table + submit button) -------
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
		      "<center><b>Add IPs</b></center></td></tr>",
		      DARK_BLUE);

	// ss = sb.getBuf();
	// ssend = sb.getBufEnd();
	g_parms.printParms (&sb, s, r);
	// sb.incrementLength(sss - ss);

	sb.safePrintf ("<tr><td>"
		       "<center>"
		       "<input type=submit value=\"Update\" "
		       "method=\"POST\" border=0>"
		       "</center></td></tr>");
	sb.safePrintf ("</table><br><br>\n" );

	// short view: just link to the long view and send what we have
	if(!showLongView) {
		sb.safePrintf("<b><a href=\"autoban"
			      "?c=%s"
			      "&showAllIps=%li"
			      "&longview=1\">Show watched ips table...</a></b>",
			      coll, showAllIps);
		return g_httpServer.sendDynamicPage ( s ,
						      sb.getBufStart() ,
						      sb.length() ,
						      -1 ,
						      false);
	}

	/////////////////////////////////////////////////////////////////////
	// ------- watched ips table -------
	sb.safePrintf("\n<table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
		      "<center><b>Watched Ips</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Description</b></center></td>"
		      // "<td><center><b>Time Added</b></center></td>"
		      "<td><center><b>Allow/Deny/Clear</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);

	// sort the occupied table slots by ip; freed on every return below
	long *sortedIndices = (long*)mmalloc(m_tableSize * sizeof(long),
					     "AutoBanH");
	if(!sortedIndices) {
		return g_httpServer.sendErrorReply(s,500,mstrerror(ENOMEM));
	}
	long numEntries = 0;
	for(long i = 0; i < m_tableSize; i++) {
		if(m_detectKeys[i] == 0) continue;
		sortedIndices[numEntries++] = i;
	}
	SorterTable = m_detectKeys;
	gbsort(sortedIndices, numEntries, sizeof(long), ip_cmp);

	//lets put each class of watched ip in its own safebuf then cat
	//them together at the end.
	SafeBuf allowed;
	SafeBuf banned;
	SafeBuf feedLeachers;
	SafeBuf cowBots;
	SafeBuf *e;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		if(m_detectKeys[i] == 0) continue;
		//if(!(m_detectVals[i].m_flags & FROMCONF)) continue;
		// classify this ip: explicit allow/ban from conf, or an
		// automatic ban from exceeding a per-minute/per-day quota
		bool allow = m_detectVals[i].m_flags & ALLOW &&
			m_detectVals[i].m_flags & FROMCONF;
		bool deny  = m_detectVals[i].m_flags & DENY &&
			m_detectVals[i].m_flags & FROMCONF;
		bool explicitban = deny &&
			m_detectVals[i].m_flags & FROMCONF;
		unsigned short dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;
		bool day = dayCount >= g_conf.m_numFreeQueriesPerDay;
		bool minute = minuteCount >= g_conf.m_numFreeQueriesPerMinute;
		char *description;
		char *color;
		if(allow) {
			color = GREEN;
			description = "Allowed";
			e = &allowed;
		}
		else if(explicitban) {
			color = RED;
			description = "Banned";
			e = &banned;
		}
		else if(minute) {
			color = RED;
			description = "Cow Bot";
			e = &cowBots;
		}
		else if(day) {
			color = RED;
			description = "Feed Leacher";
		 	e = &feedLeachers;
		}
		else {
			//this can happen when someone was banned due to
			//exceeding the quota, then the quota was lowered.
			m_detectVals[i].m_flags &= ~DENY;
			//log("autoban: ohshit-banning %s",iptoa(s->m_ip));
			continue;
		}
		e->safePrintf("<tr>");
		e->safePrintf("<td bgcolor=#%s><center>%s</center></td><td>"
			      "<center>%s</center></td>"
			      // "<td><center>"
			      // "%li days %li hrs %li min ago"
			      // "</center></td>"
			      "<td><center><a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&clear=%s&showAllIps=%li\">"
			      "clear</a></center>"
			      "</td>",color,
			      iptoa(m_detectKeys[i]),
			      description,
			      // days,hours,minutes,
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps);
		e->safePrintf("</tr>");
	}
	sb.cat(allowed);
	sb.cat(banned);
	sb.cat(feedLeachers);
	sb.cat(cowBots);
	sb.safePrintf ("</table><br><br>\n" );

	// MDW moved from here

	// ------- control panel: filter links for the table below -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
		      "<center><b>Control Panel</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr>"
		      "<td bgcolor=#%s><center><b>Show Ips by Number of "
		      "Queries"
		      "</b></center></td>",
		      LIGHT_BLUE);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=0\">"
		      "0 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=1\">"
		      "1 Query</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=10\">"
		      "10 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/master/"
		      "autoban?c=%s&showAllIps=100\">"
		      "100 Queries</a></b>"
		      "</font></center></td></tr>",
		      coll);
	sb.safePrintf ("</table><br><br>\n");

	// showAllIps==0 means skip the per-ip query table entirely
	if(!showAllIps) {
		char* ss = (char*) sb.getBufStart();
		long sslen = sb.length();
		mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");
		return g_httpServer.sendDynamicPage ( s ,
						      ss ,
						      sslen ,
						      -1 ,
						      false);
	}

	// ------- queries-today table -------
	sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
		      "cellpadding=4 border=1>\n",
		      BABY_BLUE);
	sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
		      "<center><b>Queries Today</b></center></td></tr>",
		      DARK_BLUE);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Minute count</b></center></td>"
		      "<td><center><b>Day count</b></center></td>"
		      "<td><center><b>Time Until Reset</b></center></td>"
		      "<td><center><b>Times Banned</b></center></td>"
		      "<td><center><b>Allow/Deny</b></center></td>"
		      "</tr>",
		      LIGHT_BLUE);
	char minBuf[128];
	char dayBuf[128];
	// ips sharing the same /24 get bolded; track the previous group
	unsigned long lastIpGroup = 0;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		long dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;
		// counts of non-conf ips decay once their windows expire
		if(!(m_detectVals[i].m_flags & FROMCONF)) {
			if(m_detectVals[i].m_minuteExpires < now)
				minuteCount = 0;
			if(!(m_detectVals[i].m_flags & DENY) &&
			   m_detectVals[i].m_dayExpires < now)
				dayCount = 0;
		}
		//a hack: reuse showAllIps as the min-query-count filter
		if( dayCount < showAllIps) continue;

		char *color = YELLOW;
		if(m_detectVals[i].m_flags & ALLOW) {
			color = GREEN;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		}
		else if(m_detectVals[i].m_flags & DENY) {
			color = RED;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		}
		else {
			snprintf(minBuf, 128, "%li", (long)minuteCount);
			snprintf(dayBuf, 128, "%li", (long)dayCount);
		}
		unsigned long thisIpGroup = (unsigned long)m_detectKeys[i] &
			0x00ffffff;
		sb.safePrintf("<tr><center>");
		if(m_detectVals[i].m_flags & FROMCONF) {
			// conf-pinned ips never reset
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s"
				      "</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>NEVER</b>"
				      "</font></center></td>"
				      "<td><center>--</center></td>",
				      color,
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf);
		}
		else {
			//they haven't done a query since being unbanned,
			//unban them now so we don't get negative resets
			//displayed.
			/* no, don't unban the bots!!! MDW yippy project
			if(m_detectVals[i].m_dayExpires < now) {
				m_detectVals[i].m_flags &= ~DENY;
				//log("autoban: dayexpire-unbanning %s",
				//    iptoa(ip));
				m_detectVals[i].m_dayExpires = now + ONE_DAY;
				m_detectVals[i].m_minuteExpires = now + 60;
				m_detectVals[i].m_dayCount = 0;
				m_detectVals[i].m_minuteCount = 0;
				sb.safePrintf("</center></tr>");
				continue;
			}
			*/
			getCalendarFromMs((m_detectVals[i].m_dayExpires - now)*
					  1000,
					  &days,
					  &hours,
					  &minutes,
					  &secs,
					  &msecs);
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s"
				      "</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>"
				      "<td><center><font color=red>"
				      "<b>%li days %li hrs %li min %li sec</b>"
				      "</font></center></td>"
				      "<td><center>%i</center></td>",
				      color,
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf,
				      days, hours, minutes, secs,
				      m_detectVals[i].m_timesBanned);
		}
		sb.safePrintf("<td><center>"
			      "<a href=\"/master/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">"
			      "allow/</a>"
			      "<a href=\"/master/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">"
			      "deny</a></center>"
			      "</td>",
			      coll, iptoa(m_detectKeys[i]), showAllIps,
			      coll, iptoa(m_detectKeys[i]), showAllIps);
		sb.safePrintf("</center></tr>");
		lastIpGroup = thisIpGroup;
	}
	sb.safePrintf ("</table><br><br>\n" );
	char* ss = (char*) sb.getBufStart();
	long sslen = sb.length();
	mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");
	return g_httpServer.sendDynamicPage ( s ,
					      ss ,
					      sslen ,
					      -1 ,
					      false);
}
void doneReindexing ( void *state ) { // cast it State13 *st = (State13 *)state; GigablastRequest *gr = &st->m_gr; // note it if ( gr->m_query && gr->m_query[0] ) log(LOG_INFO,"admin: Done with query reindex. %s", mstrerror(g_errno)); //// // // print the html page // ///// HttpRequest *hr = &gr->m_hr; char format = hr->getReplyFormat(); SafeBuf sb; const char *ct = "text/html"; if ( format == FORMAT_JSON ) ct = "application/json"; if ( format == FORMAT_XML ) { ct = "text/xml"; sb.safePrintf("<response>\n" "\t<statusCode>0</statusCode>\n" "\t<statusMsg>Success</statusMsg>\n" "\t<matchingResults>%" PRId32"</matchingResults>\n" "</response>" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } if ( format == FORMAT_JSON ) { sb.safePrintf("{\"response\":{\n" "\t\"statusCode\":0,\n" "\t\"statusMsg\":\"Success\",\n" "\t\"matchingResults\":%" PRId32"\n" "}\n" "}\n" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr ); sb.safePrintf("<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); // // print error msg if any // if ( gr->m_query && gr->m_query[0] && ! g_errno ) sb.safePrintf ( "<center><font color=red><b>Success. " "Added %" PRId32" docid(s) to " "spider queue.</b></font></center><br>" , st->m_msg1c.m_numDocIdsAdded ); if ( gr->m_query && gr->m_query[0] && g_errno ) sb.safePrintf ( "<center><font color=red><b>Error. " "%s</b></font></center><br>" , mstrerror(g_errno)); // print the reindex interface g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); }
// . scans the datedb list in st->m_list for an unlocked docid to hand to a
//   turk editor, locks it, then kicks off the title-rec load via processLoop
// . must only run on host #0, which owns the single turk lock table
// . sends an "empty" page if no unlocked docid is available
void gotDatedbList ( State60 *st ) {
	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
	// load turk lock table if we need to
	// BUGFIX: was "bool s_init = false;" (automatic storage), so the
	// guard reset on every call and the table was re-set()/re-load()ed
	// each time; static makes this a true run-once init
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();

	// shortcut
	RdbList *list = &st->m_list;

	// the best docid found so far (0 means none)
	int64_t best = 0LL;

	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative (low bit clear = delete key)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		// BUGFIX: was g_turkLock (typo); the table initialized
		// above is g_turkLocks
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// if there, check time: expire locks older than an hour
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			// BUGFIX: was "&docId" (undeclared identifier);
			// the local variable is "docid"
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}

	SafeBuf sb;
	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	// NOTE(review): unbounded copy -- assumes st->m_user always fits
	// in tt.m_user; confirm and switch to a bounded copy if not
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	// NOTE(review): lock is added to g_lockTable but looked up above in
	// g_turkLocks -- confirm these refer to the same table, otherwise
	// the add and the lookup are out of sync
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype = CT_HTML; if ( isXml ) ctype = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? 
&ctype, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
bool sendReply ( void *state ) { GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) ); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } int32_t ulen = 0; const char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // if there was an error let them know SafeBuf mbuf; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>"); mbuf.safePrintf("</font></center>"); } if ( mbuf.length() ) { sb.safeStrcpy( mbuf.getBufStart() ); } g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? 
// clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." 
"</td></tr>", endTime - st->m_startTime ); // print Url Catgory Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4098, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , pend - p ); // 
clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }
// . a new interface so Msg3b can call this with "s" set to NULL // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st , long long docId , Query *q , // in query term space, not imap space long long *termFreqs , // in imap space float *termFreqWeights , // in imap space float *affWeights , void *state , void (* callback)(void *state) ) { //log("parser: read sock=%li",s->m_sd); // might a simple request to addsomething to validated.*.txt file // from XmlDoc::print() or XmlDoc::validateOutput() char *add = r->getString("add",NULL); //long long uh64 = r->getLongLong("uh64",0LL); char *uh64str = r->getString("uh64",NULL); //char *divTag = r->getString("div",NULL); if ( uh64str ) { // convert add to number long addNum = 0; if ( to_lower_a(add[0])=='t' ) // "true" or "false"? addNum = 1; // convert it. skip beginning "str" inserted to prevent // javascript from messing with the long long since it // was rounding it! //long long uh64 = atoll(uh64str);//+3); // urldecode that //long divTagLen = gbstrlen(divTag); //long newLen = urlDecode ( divTag , divTag , divTagLen ); // null term? //divTag[newLen] = '\0'; // do it. this is defined in XmlDoc.cpp //addCheckboxSpan ( uh64 , divTag , addNum ); // make basic reply char *reply; reply = "HTTP/1.0 200 OK\r\n" "Connection: Close\r\n"; // that is it! send a basic reply ok bool status = g_httpServer.sendDynamicPage( s , reply, gbstrlen(reply), -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); return status; } // make a state if ( st ) st->m_freeIt = false; if ( ! st ) { try { st = new (State8); } catch ( ... 
) { g_errno = ENOMEM; log("PageParser: new(%i): %s", sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno)); } mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; } // msg3b uses this to get a score from the query st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_termFreqs = termFreqs; st->m_termFreqWeights = termFreqWeights; st->m_affWeights = affWeights; st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; st->m_recompute = false; //st->m_url.reset(); // do not allow more than one to be launched at a time if in // a quickpoll. will cause quickpoll in quickpoll. g_inPageParser = true; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( st->m_collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); //long ulen = 0; //char *u = r->getString ( "u" , &ulen , NULL /*default*/); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given if ( ! st->m_u || ! 
st->m_u[0] ) st->m_docId = r->getLongLong ("docid",-1); else st->m_docId = -1; // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = st->m_r.getString("u",&st->m_ulen,NULL); // should we recycle link info? st->m_recycle = r->getLong("recycle",0); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? 
st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header SafeBuf *xbuf = &st->m_xbuf; xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">\n"); // print standard header g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r ); // print the standard header for admin pages char *dd = ""; char *rr = ""; char *rr2 = ""; char *render = ""; char *oips = ""; char *us = ""; if ( st->m_u && st->m_u[0] ) us = st->m_u; //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn ); if ( st->m_old ) dd = " checked"; if ( st->m_recycle ) rr = " checked"; if ( st->m_recycle2 ) rr2 = " checked"; if ( st->m_render ) render = " checked"; if ( st->m_oips ) oips = " checked"; xbuf->safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); long clen; char *contentParm = r->getString("content",&clen,""); // print the input form xbuf->safePrintf ( "<style>\n" "h2{font-size: 12px; color: #666666;}\n" ".gbtag { border: 1px solid gray;" "background: #ffffef;display:inline;}\n" ".gbcomment { border: 1px solid gray;" "color: #888888; font-style:italic; " "background: #ffffef;display:inline;}\n" ".token { border: 1px solid gray;" "background: #f0ffff;display:inline;}\n" ".spam { border: 1px solid gray;" "background: #af0000;" "color: #ffffa0;}" ".hs {color: #009900;}" "</style>\n" "<center>" "<table %s>" "<tr><td colspan=5><center><b>" "Parser" "</b></center></td></tr>\n" "<tr class=poo>" "<td>" "<b>url</b>" "<br><font size=-2>" "Type in <b>FULL</b> url to parse." 
"</font>" "</td>" "</td>" "<td>" "<input type=text name=u value=\"%s\" size=\"40\">\n" "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Parser version to use: " "</td>" "<td>" "<input type=text name=\"version\" size=\"4\" value=\"-1\"> " "</td>" "<td>" "(-1 means to use latest title rec version)<br>" "</td>" "</tr>" */ /* "<tr class=poo>" "<td>" "Hop count to use: " "</td>" "<td>" "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> " "</td>" "<td>" "(-1 is unknown. For root urls hopcount is always 0)<br>" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>use cached</b>" "<br><font size=-2>" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=old value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Reparse root:" "</td>" "<td>" "<input type=checkbox name=artr value=1%s> " "</td>" "<td>" "Apply selected ruleset to root to update quality" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>recycle link info</b>" "<br><font size=-2>" "Recycle the link info from the title rec" "Load page from cache (titledb)?" "</font>" "</td>" "<td>" "<input type=checkbox name=recycle value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Recycle Link Info Imported:" "</td>" "<td>" "<input type=checkbox name=recycleimp value=1%s> " "</td>" "<td>" "Recycle the link info imported from other coll" "</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>render html</b>" "<br><font size=-2>" "Render document content as HTML" "</font>" "</td>" "<td>" "<input type=checkbox name=render value=1%s> " "</td>" "</tr>" /* "<tr class=poo>" "<td>" "Lookup outlinks' ruleset, ips, quality:" "</td>" "<td>" "<input type=checkbox name=oips value=1%s> " "</td>" "<td>" "To compute quality lookup IP addresses of roots " "of outlinks." "</td>" "</tr>" "<tr class=poo>" "<td>" "LinkInfo Coll:" "</td>" "<td>" "<input type=text name=\"oli\" size=\"10\" value=\"\"> " "</td>" "<td>" "Leave empty usually. Uses this coll to lookup link info." 
"</td>" "</tr>" */ "<tr class=poo>" "<td>" "<b>optional query</b>" "<br><font size=-2>" "Leave empty usually. For title generation only." "</font>" "</td>" "<td>" "<input type=text name=\"q\" size=\"20\" value=\"\"> " "</td>" "</tr>" "<tr class=poo>" "<td>" "<b>content below is xml</b>" "<br><font size=-2>" "Is the content below XML?" "</font>" "</td>" "<td>" "<input type=checkbox name=xml value=1> " "</td>" "</tr>" "<tr class=poo>" "<td><b>content</b>" "<br><font size=-2>" "Use this content for the provided <i>url</i> " "rather than downloading it from the web." "</td>" "<td>" "<textarea rows=10 cols=80 name=content>" "%s" "</textarea>" "</td>" "</tr>" "</table>" "</center>" "</form>" "<br>", TABLE_STYLE, us , //(long)st->m_hopCount, //rtu, dd, //artr , rr, //rr2, render , //oips , contentParm ); xbuf->safePrintf( "<center>" "<input type=submit value=Submit>" "</center>" ); // just print the page if no url given if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st ); XmlDoc *xd = &st->m_xd; // set this up SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! 
content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; uint8_t contentType = CT_HTML; if ( r->getBool("xml",0) ) contentType = CT_XML; // if facebook, load xml content from title rec... bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/"); if ( isFacebook && ! content ) { long long docId = g_titledb.getProbableDocId(st->m_u); sprintf(sreq.m_url ,"%llu", docId ); sreq.m_isPageReindex = true; } // hack if ( content ) { st->m_dbuf.purge(); st->m_dbuf.safeStrcpy(content); //char *data = strstr(content,"\r\n\r\n"); //long dataPos = 0; //if ( data ) dataPos = (data + 4) - content; //st->m_dbuf.convertJSONtoXML(0,dataPos); //st->m_dbuf.decodeJSON(0); content = st->m_dbuf.getBufStart(); } // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , &st->m_wbuf , 0 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip contentType )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , processLoop ); // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag if ( st->m_recycle ) xd->m_recycleContent = true; return processLoop ( st ); }
bool qajson ( ) { // // delete the 'qatest123' collection // //static bool s_x1 = false; if ( ! s_flags[0] ) { s_flags[0] = true; if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) return false; } // // add the 'qatest123' collection // //static bool s_x2 = false; if ( ! s_flags[1] ) { s_flags[1] = true; if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , // checksum of reply expected 238170006 ) ) return false; } // add the 50 urls if ( ! s_flags[3] ) { s_flags[3] = true; SafeBuf sb; sb.safePrintf("&c=qatest123" "&format=json" "&strip=1" "&spiderlinks=0" "&urls="//www.walmart.com+ibm.com" ); sb.urlEncode ( s_ubuf4 ); // . now a list of websites we want to spider // . the space is already encoded as + if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) ) return false; } // // wait for spidering to stop // checkagain: // wait until spider finishes. check the spider status page // in json to see when completed //static bool s_k1 = false; if ( ! s_flags[5] ) { // wait 5 seconds, call sleep timer... then call qatest() //usleep(5000000); // 5 seconds wait(3.0); s_flags[5] = true; return false; } if ( ! s_flags[15] ) { s_flags[15] = true; if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) ) return false; } //static bool s_k2 = false; if ( ! s_flags[6] ) { // ensure spiders are done. // "Nothing currently available to spider" if ( s_content&&!strstr(s_content,"Nothing currently avail")){ s_flags[5] = false; s_flags[15] = false; goto checkagain; } s_flags[6] = true; } if ( ! s_flags[7] ) { s_flags[7] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&" "q=type%3Ajson+meta.authors%3Appk", -1310551262 ) ) return false; } if ( ! s_flags[8] ) { s_flags[8] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=100&" "q=type%3Ajson", -1310551262 ) ) return false; } if ( ! s_flags[9] ) { s_flags[9] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfacetstr%3Ameta.authors", -1310551262 ) ) return false; } if ( ! 
s_flags[10] ) { s_flags[10] = true; // this has > 50 values for the facet field hash if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfacetstr%3Astrings.key", -1310551262 ) ) return false; } // other query tests... if ( ! s_flags[12] ) { s_flags[12] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=inurl2%3Aquirksmode.org%2Fm%2F", -1310551262 ) ) return false; } if ( ! s_flags[13] ) { s_flags[13] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=site%3Aquirksmode.org", -1310551262 ) ) return false; } // test gbfieldmatch:field:"quoted value" query to ensure it converts // the quoted value into the right int32 if ( ! s_flags[14] ) { s_flags[14] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3Ainvestigate-tweet", -1310551262 ) ) return false; } if ( ! s_flags[15] ) { s_flags[15] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3A\"Maemo+Browser\"", -1310551262 ) ) return false; } if ( ! s_flags[16] ) { s_flags[16] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key" "%3A\"Google+Wireless+Transcoder\"", -1310551262 ) ) return false; } // this should have no results, not capitalized if ( ! s_flags[17] ) { s_flags[17] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3A\"samsung\"", -1310551262 ) ) return false; } if ( ! s_flags[18] ) { s_flags[18] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3ASamsung", -1310551262 ) ) return false; } if ( ! s_flags[18] ) { s_flags[18] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&" "q=gbfieldmatch%3Astrings.key%3A\"Samsung\"", -1310551262 ) ) return false; } //static bool s_fee2 = false; if ( ! s_flags[20] ) { s_flags[20] = true; log("qa: SUCCESSFULLY COMPLETED " "QA JSON TEST"); return true; } return true; }
// . serve the /admin/qa page and drive the qa test suite
// . three modes, decided from the request:
//     1. ajax=1   : store a user-approved crc for a test (checkbox click)
//     2. action=… : the submit button was hit — reset state and run qatest()
//     3. otherwise: just render the list of tests with checkboxes
// . always returns true except when qatest() blocks (returns false)
bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) {
	char pbuf[32768];
	SafeBuf sb(pbuf, 32768);

	//char format = hr->getReplyFormat();
	// set this. also sets gr->m_hr
	GigablastRequest gr;
	// this will fill in GigablastRequest so all the parms we need are set
	g_parms.setGigablastRequest ( sock , hr , &gr );

	//
	// . handle a request to update the crc for this test
	// . test id identified by "ajaxUrlHash" which is the hash of the
	//   test's url and the test name, QATest::m_testName
	// . NOTE(review): narrowing from getLongLong() to unsigned long —
	//   presumably intentional 32-bit hashes; confirm on 64-bit builds
	//
	long ajax = hr->getLong("ajax",0);
	unsigned long ajaxUrlHash ;
	ajaxUrlHash = (unsigned long long)hr->getLongLong("uh",0LL);
	unsigned long ajaxCrc ;
	ajaxCrc = (unsigned long long)hr->getLongLong("crc",0LL);

	if ( ajax ) {
		// make sure it is initialized
		if ( s_ht.m_ks ) {
			// overwrite current value with provided one because
			// the user click on an override checkbox to update
			// the crc
			s_ht.addKey ( &ajaxUrlHash , &ajaxCrc );
			saveHashTable();
		}
		// send back the urlhash so the checkbox can turn the
		// bg color of the "diff" gray
		SafeBuf sb3;
		sb3.safePrintf("%lu",ajaxUrlHash);
		g_httpServer.sendDynamicPage(sock,
					     sb3.getBufStart(),
					     sb3.length(),
					     -1/*cachetime*/);
		return true;
	}

	// if they hit the submit button, begin the tests
	long submit = hr->hasField("action");
	long n = sizeof(s_qatests)/sizeof(QATest);

	// refuse to start a second run while one is already going
	if ( submit && g_qaInProgress ) {
		g_errno = EINPROGRESS;
		g_httpServer.sendErrorReply(sock,g_errno,mstrerror(g_errno));
		return true;
	}

	// set m_doTest from the test%li checkboxes (only when submitting)
	for ( long i = 0 ; submit && i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		char tmp[10];
		sprintf(tmp,"test%li",i);
		qt->m_doTest = hr->getLong(tmp,0);
	}

	if ( submit ) {
		// reset all the static thingies
		resetFlags();
		// save socket so the async pipeline can reply later
		g_qaSock = sock;
		g_numErrors = 0;
		g_qaOutput.reset();
		g_qaOutput.safePrintf("<html><body>"
				      "<title>QA Test Results</title>\n");
		// client-side helpers: submitchanges() posts an approved crc
		// back via the ajax=1 path above; the reply handler grays
		// out the corresponding diff block
		g_qaOutput.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"

				      // update s_ht with the new crc for
				      // this test
				      "function submitchanges(urlhash,crc) "
				      "{\n "
				      "var client=new XMLHttpRequest();\n"
				      "client.onreadystatechange="
				      "gotsubmitreplyhandler;"
				      "var "
				      "u='/admin/qa?ajax=1&uh='+urlhash+"
				      "'&crc='+crc;\n"
				      "client.open('GET',u);\n"
				      "client.send();\n"

				      // use that to fix background to gray
				      "var w=document.getElementById"
				      "(urlhash);\n"
				      // set background color
				      "w.style.backgroundColor = "
				      "'0xe0e0e0';\n"
				      // gear spinning after checkbox
				      "}\n\n "

				      // call this when we got the reply that
				      // the checkbox went through
				      "function gotsubmitreplyhandler() {\n"
				      // return if reply is not fully ready
				      "if(this.readyState != 4 )return;\n"
				      // if error or empty reply then do
				      // nothing
				      "if(!this.responseText)return;\n"
				      // response text is the urlhash32,
				      // unsigned long
				      "var id=this.responseText;\n"
				      // use that to fix background to gray
				      "var w=document.getElementById(id);\n"
				      // set background color
				      "w.style.backgroundColor = "
				      "'0xe0e0e0';\n"
				      "}\n\n"

				      "</SCRIPT> ");

		// and run the qa test loop; false means it blocked and will
		// re-enter asynchronously
		if ( ! qatest( ) ) return false;

		// what happened?
		log("qa: qatest completed without blocking");
	}

	// show tests, all checked by default, to perform
	g_pages.printAdminTop ( &sb , sock , hr );

	// checkbox toggle-all helper
	sb.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n"
		      "function checkAll(name, num)\n "
		      "{ "
		      "    for (var i = 0; i < num; i++) {\n"
		      "      var e = document.getElementById(name + i);\n"
		      //"alert(name+i);"
		      "      e.checked = !e.checked ;\n "
		      "    }\n"
		      "}\n\n "
		      "</SCRIPT> ");

	//sb.safePrintf("<form name=\"fo\">");

	sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
	sb.safePrintf("<tr class=hdrow><td colspan=2>"
		      "<center><b>QA Tests</b></center>"
		      "</td></tr>");
	// header row
	sb.safePrintf("<tr><td><b>Do Test?</b> <a style=cursor:hand;"
		      "cursor:pointer; "
		      "onclick=\"checkAll('test', %li);\">(toggle)</a>",n);
	sb.safePrintf("</td><td><b>Test Name</b></td></tr>\n");

	// . we keep the ptr to each test in an array
	// . print out each qa function, alternating row colors
	for ( long i = 0 ; i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		char *bg;
		if ( i % 2 == 0 ) bg = LIGHT_BLUE;
		else              bg = DARK_BLUE;
		sb.safePrintf("<tr bgcolor=#%s>"
			      "<td><input type=checkbox value=1 name=test%li "
			      "id=test%li></td>"
			      "<td>%s"
			      "<br>"
			      "<font color=gray size=-1>%s</font>"
			      "</td>"
			      "</tr>\n"
			      , bg
			      , i
			      , i
			      , qt->m_testName
			      , qt->m_testDesc
			      );
	}

	sb.safePrintf("</table>\n<br>\n");
	//	      "</form>\n");

	g_pages.printAdminBottom ( &sb , hr );

	g_httpServer.sendDynamicPage(sock,
				     sb.getBufStart(),
				     sb.length(),
				     -1/*cachetime*/);
	return true;
}
// . handle the HTTP reply for the current qa test request
// . masks out volatile fields (timestamps, response times, counters) via
//   markOut(), hashes what remains, and compares the crc against the one
//   recorded in s_ht (persisted as qa/crctable.dat)
// . match      -> log "passed" into g_qaOutput
// . no record  -> record the crc as the new baseline ("first time testing")
// . mismatch   -> save both replies under qa/content.<crc>, shell out to
//   diff, and append the html-encoded diff to g_qaOutput as a failure
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy(reply,replyLen );
	fb2.nullTerm();

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply);

	char *content = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content = mime.getContent();
		contentLen = mime.getContentLen();
		// deliberate crash (poor man's assert) if the content is not
		// NUL-terminated right at contentLen
		if ( content && contentLen>0 && content[contentLen] ) {
			char *xx=NULL;*xx=0; }
	}

	if ( ! content ) {
		content = "";
		contentLen = 0;
	}

	// expose the (masked) content to the qa drivers, e.g. for the
	// "Nothing currently available to spider" checks
	s_content = content;

	// mask out volatile fields so the crc is stable across runs.
	// take out <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content, "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml (repeated; harmless since markOut already ran on these)
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content,"indexed:");
	// modified 1 day ago
	markOut ( content,"modified:");

	// s_gigabitCount... it is perpetually incrementing static counter
	// in PageResults.cpp
	markOut(content,"ccc(");
	markOut(content,"id=fd");
	markOut(content,"id=sd");

	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut(content,"id=tf");

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0;
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	log("qa: got contentCRC of %lu",contentCRC);

	// (old direct-compare-to-expected-crc logic removed; kept here as a
	// reminder that we used to bail out early on a match)
	/*
	if ( contentCRC == s_expectedCRC ) {
		// save content if good
		char fn3[1024];
		sprintf(fn3,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
		File ff;
		ff.set ( fn3 );
		if ( ! ff.doesExist() ) {
			// if not there yet then save it
			fb2.save(fn3);
		}
		// . continue on with the qa process
		// . which qa function that may be
		//s_callback();
		return;
	}
	*/

	//
	// if crc of content does not match what was expected then do a diff
	// so we can see why not
	//

	// this means caller does not care about the response
	if ( ! s_checkCRC ) {
		//s_callback();
		return;
	}

	//const char *emsg = "qa: bad contentCRC of %li should be %li "
	//	   "\n";//"phase=%li\n";
	//fprintf(stderr,emsg,contentCRC,s_expectedCRC);//,s_phase-1);

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	// one-time init of the crc table: create qa/ dir and load the
	// persisted baseline crcs
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		// make symlink
		//char cmd[512];
		//snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa",
		//	 g_hostdb.m_dir);
		//system(cmd);
		char dir[1024];
		snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR |
					S_IRGRP | S_IWGRP | S_IXGRP |
					S_IROTH | S_IXOTH );
		if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// save this reply to disk keyed by its crc
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
	fb2.save ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	if ( ! val ) {
		// add it so we know (learning mode: first run records the
		// baseline crc instead of failing)
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);
	fb1.load(fn1);
	fb1.nullTerm();

	// do the diff between the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;

	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"

			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo your
			      // check then it should put the old val back.
			      // when you first click the checkbox it should
			      // gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"

			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"

			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlHash32,

			      // input checkbox name field
			      urlHash32,

			      // submitchanges() parms
			      urlHash32,
			      contentCRC,

			      // original/old content.%lu
			      *val,
			      *val,

			      // new content.%lu
			      contentCRC,
			      contentCRC,

			      // for the pre tag id:
			      urlHash32);

	// store in output
	SafeBuf sb;
	sb.load("/tmp/diffout");
	g_qaOutput.htmlEncode ( sb.getBufStart() );
	g_qaOutput.safePrintf("</pre><br><hr>");

	// if this is zero allow it to slide by. it is learning mode i guess.
	// so we can learn what crc we need to use.
	// otherwise, stop right there for debugging
	//if ( s_expectedCRC != 0 ) exit(1);

	// keep on going
	//s_callback();
}
// . run the second spider qa test suite (hopcount-restricted crawl of
//   www.ibm.com, then facet/gbhopcount query checks)
// . driven by the s_flags[] state machine: each step sets its flag, fires a
//   request through getUrl() and returns false; the reply handler re-enters
//   this function to run the next step
// . returns true once every step has completed
bool qaspider2 ( ) {

	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //"fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to
	// a few sites
	// these should auto seed so no need to use addurl
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode(//walmart has too many pages at depth 1, so
			     //remove it
			     //"tag:shallow www.walmart.com\r\n"
			     "tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[4] ) {
		//usleep(5000000); // 5 seconds
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[5] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		// if not done yet, clear the wait/status flags and loop
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[4] = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}

	// verify no results for gbhopcount:2 query (crawl stayed shallow)
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0",
				999 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for some reason
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}

	// /get (cached page) with facet query, default format
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	// (disabled so the collection can be inspected after the run)
	// if ( ! s_flags[12] ) {
	//	s_flags[12] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	// }

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}
// . parse an incoming request
// . return false and set g_errno on error
// . CAUTION: we destroy "req" by replacing it's last char with a \0
// . last char must be \n or \r for it to be a proper request anyway
bool HttpRequest::set ( char *origReq , int32_t origReqLen , TcpSocket *sock ) {
	// reset number of cgi field terms
	reset();

	// reserve room for a private, NUL-terminated copy of the request;
	// all destructive parsing below happens on that copy, not on the
	// caller's buffer
	if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) {
		log("http: failed to copy request: %s",mstrerror(g_errno));
		return false;
	}

	// copy it to avoid mangling it
	m_reqBuf.safeMemcpy ( origReq , origReqLen );
	// NULL term
	m_reqBuf.pushChar('\0');
	m_reqBufValid = true;

	// and point to that
	char *req = m_reqBuf.getBufStart();
	if( !req ) {
		log(LOG_ERROR, "http: req is NULL");
		g_errno = EBADREQUEST;
		return false;
	}
	// length of the copy, excluding the NUL we just appended
	int32_t reqLen = m_reqBuf.length() - 1;

	// save requester ip / ssl state (sock may be NULL for internal use)
	m_userIP = sock ? sock->m_ip : 0;
	m_isSSL = sock ? (sock->m_ssl!=NULL) : false;

	// TcpServer should always give us a NULL terminated request
	if ( req[reqLen] != '\0' ) { g_process.shutdownAbort(true); }

	// how long is the first line, the primary request
	// int32_t i;
	// for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN &&
	//       req[i]!='\n' && req[i]!='\r'; i++);

	// (urlNormCode-based request logging is disabled for now)
	// m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i );

	// ensure it's big enough to be a valid request
	if ( reqLen < 5 ) {
		log(LOG_WARN, "http: got reqlen %" PRId32"<5 = %s",reqLen,req);
		g_errno = EBADREQUEST;
		return false;
	}

	// length of the method verb WITHOUT its trailing space
	// ("GET" -> 3, "HEAD"/"POST" -> 4)
	int32_t cmdLen = 0;

	// or if first line too long
	//if ( i >= 1024 ) { g_errno = EBADREQUEST; return false; }

	// get the type, must be GET or HEAD (or POST)
	if ( strncmp ( req , "GET " , 4 ) == 0 ) {
		m_requestType = RT_GET;
		cmdLen = 3;
	}
	// these means a compressed reply was requested. use by query
	// compression proxies.
	else if ( strncmp ( req , "ZET " , 4 ) == 0 ) {
		m_requestType = RT_GET;
		cmdLen = 3;
	}
	else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) {
		m_requestType = RT_HEAD;
		cmdLen = 4;
	}
	else if ( strncmp ( req , "POST " , 5 ) == 0 ) {
		m_requestType = RT_POST;
		cmdLen = 4;
	}
	else if ( strncmp ( req , "CONNECT " , 8 ) == 0 ) {
		// take this out until it stops losing descriptors and works
		//m_requestType = RT_CONNECT;
		//cmdLen = 7;
		// we no longer insert section info. emmanuel gets section
		// info when injecting a doc now i think in PageInject.cpp.
		// we do not proxy https requests because we can't
		// decrypt the page contents to cache them or to insert
		// the sectiondb voting markup, so it's kinda pointless...
		// and i'm not aiming to be a full-fledge squid proxy.
		log("http: CONNECT request not supported because we "
		    "can't insert section markup and we can't cache: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}
	else {
		log("http: got bad request cmd: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}

	// . NULL terminate the request (a destructive operation!)
	// . this removes the last \n in the trailing \r\n
	// . skipped for POST because the body must remain intact
	if ( m_requestType != RT_POST ) {
		req [ reqLen - 1 ] = '\0';
		reqLen--;
	}

	// POST requests can be absolutely huge if you are injecting a 100MB
	// file, so limit our strstrs to the end of the mime
	char *d = NULL;
	char dc;
	// check for body if it was a POST request; temporarily truncate the
	// request at the header/body blank line so header strstrs stay cheap
	// ("d"/"dc" are restored further below before the body is parsed)
	if ( m_requestType == RT_POST ) {
		d = strstr ( req , "\r\n\r\n" );
		if ( d ) { dc = *d; *d = '\0'; }
		else log("http: Got POST request without \\r\\n\\r\\n.");
	}

	// is it a proxy request? (absolute url on the request line)
	m_isSquidProxyRequest = false;
	if ( strncmp ( req + cmdLen + 1, "http://" ,7) == 0 ||
	     strncmp ( req + cmdLen + 1, "https://",8) == 0 ) {
		m_isSquidProxyRequest = true;
		// set url parms for it
		m_squidProxiedUrl = req + cmdLen + 1;
		char *p = m_squidProxiedUrl + 7;
		if ( *p == '/' ) p++; // https:// ?
		// stop at whitespace or \0
		for ( ; *p && ! is_wspace_a(*p) ; p++ );
		// that's the length of it
		m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	}
	// NOTE(review): RT_CONNECT is never assigned above (the CONNECT
	// branch returns an error), so this branch looks unreachable --
	// confirm before relying on it
	else if ( m_requestType == RT_CONNECT ) {
		m_isSquidProxyRequest = true;
		// set url parms for it
		m_squidProxiedUrl = req + cmdLen + 1;
		// usually its like CONNECT diffbot.com:443
		char *p = m_squidProxiedUrl;
		// stop at whitespace or \0
		for ( ; *p && ! is_wspace_a(*p) ; p++ );
		// that's the length of it
		m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	}

	// check authentication
	char *auth = NULL;
	if ( m_isSquidProxyRequest && req )
		auth = strstr(req,"Proxy-authorization: Basic ");

	//if ( m_isSquidProxyRequest && ! auth ) {
	//	log("http: no auth in proxy request %s",req);
	//	g_errno = EBADREQUEST;
	//	return false;
	//}

	SafeBuf tmp;
	if ( auth ) {
		// find end of it
		char *p = auth;
		for ( ; *p && *p != '\r' && *p != '\n' ; p++ );
		// NOTE(review): this decodes starting at the header NAME,
		// not at the base64 payload (auth + 27) -- verify that
		// base64Decode tolerates/skips the non-base64 prefix
		tmp.base64Decode ( auth , p - auth );
	}

	// assume incorrect username/password
	bool matched = false;
	if ( m_isSquidProxyRequest ) {
		// now try to match in g_conf.m_proxyAuth safebuf of
		// username:password space-separated list
		char *p = g_conf.m_proxyAuth.getBufStart();
		// loop over those
		for ( ; p && *p ; ) {
			// skip initial white space
			for ( ; *p && is_wspace_a(*p); p++ );
			// skip to end of username:password thing
			char *end = p;
			for ( ; *end && !is_wspace_a(*end); end++);
			// save
			char *start = p;
			// advance
			p = end;
			// "*:*" is a wildcard entry: always a match
			if ( end-start == 3 &&
			     strncmp(start,"*:*",3) == 0 ) {
				matched = true;
				break;
			}
			// compare now
			if ( tmp.length() != end-start ) continue;
			if ( strncmp(tmp.getBufStart(),start,end-start) != 0 )
				continue;
			// we got a match
			matched = true;
			break;
		}
	}

	// incorrect username:password?
	if ( m_isSquidProxyRequest && ! matched ) {
		log("http: bad username:password in proxy request %s",req);
		g_errno = EPERMDENIED;
		return false;
	}

	// if proxy request to download a url through us, we are done
	if ( m_isSquidProxyRequest ) return true;

	// is the POST body multipart/form-data? (affects decoding below)
	bool multipart = false;
	if ( m_requestType == 2 ) { // is POST?
		char *cd ;
		cd = gb_strcasestr(req,"Content-Type: multipart/form-data");
		if ( cd ) multipart = true;
	}

	// . point to the file path
	// . skip over the "GET "
	int32_t filenameStart = 4 ;
	// skip over extra char if it's a "HEAD " or "POST " request
	if ( m_requestType == RT_HEAD || m_requestType == RT_POST )
		filenameStart++;

	// are we a redirect?
	int32_t i = filenameStart;
	m_redirLen = 0;
	if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) {
		// copy the redirect target, capped at 126 chars
		for ( int32_t k = i+8; k<reqLen && m_redirLen<126 ; k++) {
			if ( req[k] == '\r' ) break;
			if ( req[k] == '\n' ) break;
			if ( req[k] == '\t' ) break;
			if ( req[k] == ' ' ) break;
			m_redir[m_redirLen++] = req[k];
		}
	}
	m_redir[m_redirLen] = '\0';

	// find a \n space \r or ? that delimits the filename
	for ( i = filenameStart ; i < reqLen ; i++ ) {
		if ( is_wspace_a ( req [ i ] ) ) break;
		if ( req [ i ] == '?' ) break;
	}
	// now calc the filename length
	m_filenameLen = i - filenameStart;
	// return false and set g_errno if it's 0
	if ( m_filenameLen <= 0 ) {
		log("http: got filenameLen<=0: %s",req);
		g_errno = EBADREQUEST;
		return false;
	}
	// . bitch if too big
	// . leave room for strcatting "index.html" below
	if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) {
		log("http: got filenameLen>=max");
		g_errno = EBADREQUEST;
		return false;
	}
	// . decode the filename into m_filename and reassign it's length
	// . decode %2F to / , etc...
	m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen);
	// NULL terminate m_filename
	m_filename [ m_filenameLen ] = '\0';

	// does it have a file extension AFTER the last / in the filename?
	bool hasExtension = false;
	for ( int32_t j = m_filenameLen-1 ; j >= 0 ; j-- ) {
		if ( m_filename[j] == '.' ) { hasExtension = true; break; }
		if ( m_filename[j] == '/' ) break;
	}
	// if it has no file extension append a /index.html
	if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) {
		strcat ( m_filename , "index.html" );
		m_filenameLen = strlen ( m_filename );
	}

	// . uses the TcpSocket::m_readBuf
	// . if *p was ? then keep going
	m_origUrlRequest = origReq + filenameStart;
	// NOTE(review): this scan starts at origReq + m_filenameLen, not
	// origReq + filenameStart + m_filenameLen, and m_filenameLen was
	// just reassigned by urlDecode() above -- confirm
	// m_origUrlRequestLen comes out right for HEAD/POST requests
	char *p = origReq + m_filenameLen;
	for ( ; *p && ! is_wspace_a(*p) ; p++ );
	m_origUrlRequestLen = p - m_origUrlRequest;

	// set file offset/size defaults
	m_fileOffset = 0;
	// -1 means ALL the file from m_fileOffset onwards
	m_fileSize = -1;

	// (Range: header parsing is currently disabled)
	// "e" points to where the range actually starts, if any
	//char *e;
	// . TODO: speed up by doing one strstr for Range: and maybe range:
	// . do they have a Range: 0-100\n in the mime denoting a partial get?
	//char *s = strstr ( req ,"Range:bytes=" );
	//e = s + 12;
	// try alternate formats
	//if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; }
	//if ( ! s ) { s = strstr ( req ,"Range: " ); e = s + 7; }
	// parse out the range if we got one
	//if ( s ) {
	//	int32_t x = 0;
	//	sscanf ( e ,"%" PRId32"-%" PRId32 , &m_fileOffset , &x );
	//	// get all file if range's 2nd number is non-existant
	//	if ( x == 0 ) m_fileSize = -1;
	//	else m_fileSize = x - m_fileOffset;
	//	// ensure legitimacy
	//	if ( m_fileOffset < 0 ) m_fileOffset = 0;
	//}

	// reset our hostname
	m_hostLen = 0;
	// assume request is NOT from local network
	//m_isMasterAdmin = false;
	m_isLocal = false;

	// get the virtual hostname they want to use
	char *s = strstr ( req ,"Host:" );
	// try alternate formats
	if ( ! s ) s = strstr ( req , "host:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// parse out the host if we got one
	if ( s ) {
		// skip field name, host:
		s += 5;
		// skip to beginning of the host name after "host:"
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the host name
		char *end = s;
		while ( *end && !is_wspace_a(*end) ) end++;
		// . now *end should be \0, \n, \r, ' ', ...
		// . get host len
		m_hostLen = end - s;
		// truncate if too big
		if ( m_hostLen >= 255 ) m_hostLen = 254;
		// copy into hostname
		gbmemcpy ( m_host , s , m_hostLen );
	}
	// NULL terminate it
	m_host [ m_hostLen ] = '\0';

	// get Referer: field
	s = strstr ( req ,"Referer:" );
	// find another
	if ( ! s ) s = strstr ( req ,"referer:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// assume no referer
	m_refLen = 0;
	// parse out the referer if we got one
	if ( s ) {
		// skip field name, referer:
		s += 8;
		// skip to beginning of the value after ':'
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the value
		char *end = s;
		while ( *end && !is_wspace_a(*end) ) end++;
		// . now *end should be \0, \n, \r, ' ', ...
		// . get len
		m_refLen = end - s;
		// truncate if too big
		if ( m_refLen >= 255 ) m_refLen = 254;
		// copy into m_ref
		gbmemcpy ( m_ref , s , m_refLen );
	}
	// NULL terminate it
	m_ref [ m_refLen ] = '\0';

	// get User-Agent: field
	s = strstr ( req ,"User-Agent:" );
	// find another
	if ( ! s ) s = strstr ( req ,"user-agent:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	// assume empty
	int32_t len = 0;
	// parse out the user agent if we got one
	if ( s ) {
		// skip field name
		s += 11;
		// skip to beginning of the value after ':'
		while ( *s==' ' || *s=='\t' ) s++;
		// find end of the agent name -- agents may contain spaces,
		// so only \r or \n terminates it
		char *end = s;
		while ( *end && *end!='\n' && *end!='\r' ) end++;
		// get agent len
		len = end - s;
		// truncate if too big
		if ( len > 127 ) len = 127;
		// copy into m_userAgent
		gbmemcpy ( m_userAgent , s , len );
	}
	// NULL terminate it
	m_userAgent [ len ] = '\0';

	// get Cookie: field
	s = strstr ( req, "Cookie:" );
	// find another
	if ( !s ) s = strstr ( req, "cookie:" );
	// must be on its own line, otherwise it's not valid
	if ( s && s > req && *(s-1) != '\n' ) s = NULL;
	// m_cookiePtr points INTO req at the "Cookie:" header, or NULL
	// m_cookieBufLen = 0;
	m_cookiePtr = s;
	// parse out the cookie if we got one
	if ( s ) {
		// skip field name, Cookie:
		s += 7;
		// skip s to beginning of cookie after ':'
		while ( *s == ' ' || *s == '\t' ) s++;
		// find end of the cookie
		char *end = s;
		while ( *end && *end != '\n' && *end != '\r' ) end++;
		// save length -- measured from m_cookiePtr, i.e. it
		// INCLUDES the "Cookie:" prefix
		m_cookieLen = end - m_cookiePtr;
		// get cookie len
		//m_cookieBufLen = end - s;
		// trunc if too big
		//if (m_cookieBufLen > 1023) m_cookieBufLen = 1023;
		// copy into m_cookieBuf
		//gbmemcpy(m_cookieBuf, s, m_cookieBufLen);
	}
	// NULL terminate it
	if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0';
	//m_cookieBuf[m_cookieBufLen] = '\0';

	// mark it as cgi if it has a ?
	bool isCgi = ( req [ i ] == '?' ) ;
	// reset m_filename length to exclude the ?* stuff
	if ( isCgi ) {
		// skip over the '?'
		i++;
		// find a space that delimits end of cgi
		int32_t j;
		for ( j = i; j < reqLen; j++)
			if (is_wspace_a(req[j])) break;
		// now add it
		if ( ! addCgi ( &req[i] , j-i ) ) return false;
		// update i
		i = j;
	}

	// . set path ptrs
	// . the whole /cgi/14.cgi?coll=xxx&..... thang
	m_path = req + filenameStart;
	m_plen = i - filenameStart;

	// we're local if the requester ip is in a private range
	if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0)
		m_isLocal = true;
	if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0)
		m_isLocal = true;

	// gotta scan all ips in hosts.conf as well...
	// if we are coming from any of our own hosts.conf c blocks
	// consider ourselves local
	uint32_t last = 0;
	for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		// save time with this check
		if ( h->m_ip == last ) continue;
		// update it
		last = h->m_ip;
		// returns number of top bytes in comon
		int32_t nt = sock ? ipCmp ( sock->m_ip , h->m_ip ) : 0;
		// at least be in the same c-block as a host in hosts.conf
		if ( nt < 3 ) continue;
		m_isLocal = true;
		break;
	}

	// 127.0.0.1 (loopback packed as a little-endian int) is local too
	if ( sock && sock->m_ip == 16777343 ) m_isLocal = true;

	// . now add any cgi data from a POST.....
	// . "d" was set above when we truncated the request at \r\n\r\n
	if ( d ) {
		// now put d's char back, just in case... does it really
		// matter?
		*d = dc;
		char *post = d + 4;
		int32_t postLen = reqLen-(d+4-req) ;
		// post sometimes has a \r or \n after it
		while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--;
		// add it to m_cgiBuf, filter and everything
		if ( ! addCgi ( post , postLen ) ) return false;
	}

	// Put '\0' back into the HttpRequest buffer...
	// crap, not if we are multi-part unencoded stuff...
	if ( m_cgiBuf && ! multipart ) {
		// do not mangle the "ucontent"!
		int32_t cgiBufLen = m_cgiBufLen;
		cgiBufLen -= m_ucontentLen;
		char *buf = m_cgiBuf;
		// split fields on '&' so parseFields() sees NUL-separated
		// name=value terms
		for (int32_t i = 0; i < cgiBufLen ; i++)
			if (buf[i] == '&') buf[i] = '\0';
		// don't decode the ucontent= field!
		int32_t decodeLen = m_cgiBufLen;
		// so subtract that
		if ( m_ucontent ) decodeLen -= m_ucontentLen;
		// decode everything. fixed for %00 in &content= so it
		// doesn't set our parms when injecting.
		int32_t len = urlDecodeNoZeroes(m_cgiBuf,m_cgiBuf,decodeLen);
		// we're parsing crap after the null if the last parm
		// has no value
		//memset(m_cgiBuf+len, '\0', m_cgiBufLen-len);
		m_cgiBufLen = len;
		// ensure that is null i guess
		if ( ! m_ucontent ) m_cgiBuf[len] = '\0';
	}

	if (m_cgiBuf2){
		char *buf = m_cgiBuf2;
		for (int32_t i = 0; i < m_cgiBuf2Size-1 ; i++)
			if (buf[i] == '&') buf[i] = '\0';
		// decode everything. fixed for %00 in &content= so it
		// doesn't set our parms when injecting.
		int32_t len = urlDecodeNoZeroes ( m_cgiBuf2 ,
						  m_cgiBuf2 ,
						  m_cgiBuf2Size);
		memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len);
	}

	// . parse the fields after the ? in a cgi filename
	// . or fields in the content if it's a POST
	// . m_cgiBuf must be and is NULL terminated for this
	parseFields ( m_cgiBuf , m_cgiBufLen );

	// Add extra parms to the request.
	if (m_cgiBuf2Size){
		parseFields(m_cgiBuf2, m_cgiBuf2Size);
	}

	// urldecode the cookie buf too!!
	if ( m_cookiePtr ) {
		char *p = m_cookiePtr;
		for (int32_t i = 0; i < m_cookieLen ; i++) {
			//if (p[i] == '&') p[i] = '\0';
			// cookies are separated with ';' in the request only
			if (p[i] == ';') p[i] = '\0';
			// a hack for the metacookie=....
			// which uses &'s to separate its subcookies
			// this is a hack for msie's limit of 50 cookies
			if ( p[i] == '&' ) p[i] = '\0';
			// set m_metaCookie to start of meta cookie
			// NOTE(review): the strncmp compares "p" (start of
			// cookie), not "&p[i]", so this can only match at
			// the very beginning -- confirm intent
			if ( p[i] == 'm' && p[i+1] == 'e' &&
			     strncmp(p,"metacookie",10) == 0 )
				m_metaCookie = p;
		}
		int32_t len = urlDecode ( m_cookiePtr ,
					  m_cookiePtr,
					  m_cookieLen );
		// we're parsing crap after the null if the last parm
		// has no value
		memset(m_cookiePtr+len, '\0', m_cookieLen-len);
		m_cookieLen = len;
	}
	return true;
}
// . form an HTTP request // . use size 0 for HEAD requests // . use size -1 for GET whole doc requests // . fill in your own offset/size for partial GET requests // . returns false and sets g_errno on error // . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not) bool HttpRequest::set (char *url,int32_t offset,int32_t size,time_t ifModifiedSince, const char *userAgent, const char *proto, bool doPost, const char *cookieJar, const char *additionalHeader, // if posting something, how many bytes is it? int32_t postContentLen , // are we sending the request through an http proxy? // if so this will be non-zero int32_t proxyIp , const char *proxyUsernamePwd ) { m_reqBufValid = false; int32_t hlen ; int32_t port = 80; const char *hptr = getHostFast ( url , &hlen , &port ); char *path = getPathFast ( url ); // . use the full url if sending to an http proxy // . HACK: do NOT do this if it is httpS because we end up // using the http tunnel using the CONNECT cmd and the squid proxy // will just forward/proxy just the entire tcp packets. if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url; char *pathEnd = NULL; const char *postData = NULL; if ( doPost ) { pathEnd = strstr(path,"?"); if ( pathEnd ) { *pathEnd = '\0'; postData = pathEnd + 1; } } // if no legit host if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; } // sanity check. 
port is only 16 bits if ( port > (int32_t)0xffff ) { g_errno = EBADURL; return false; } // return false and set g_errno if url too big //if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { // g_errno = EURLTOOBIG; return false;} // assume request type is a GET m_requestType = RT_GET;//0; // get the host NULL terminated char host[1024+8]; //int32_t hlen = url->getHostLen(); strncpy ( host , hptr , hlen ); host [ hlen ] = '\0'; // then port //uint16_t port = url->getPort(); if ( port != 80 ) { sprintf ( host + hlen , ":%" PRIu32 , (uint32_t)port ); hlen += strlen ( host + hlen ); } // the if-modified-since field const char *ims = ""; #if 0 char ibuf[64]; if ( ifModifiedSince ) { struct tm tm_buf; char buf[64]; // NOTE: ctime appends a \n snprintf(ibuf, sizeof(ibuf), "If-Modified-Since: %s UTC", asctime_r(gmtime_r(&ifModifiedSince,&tm_buf),buf)); // get the length int32_t ilen = strlen(ibuf); if( ilen && ilen < (int32_t)sizeof(ibuf)-1 ) { // hack off \n from ctime - replace with \r\n\0 ibuf [ ilen - 1 ] = '\r'; ibuf [ ilen ] = '\n'; ibuf [ ilen + 1 ] = '\0'; // set ims to this string ims = ibuf; } } // . until we fix if-modified-since, take it out // . seems like we are being called with it as true when should not be ims=""; #endif // . use one in conf file if caller did not provide // . this is usually Gigabot/1.0 if ( ! 
userAgent ) userAgent = g_conf.m_spiderUserAgent; // accept only these const char *accept = "*/*"; /* "text/html, " "text/plain, " "text/xml, " "application/pdf, " "application/msword, " "application/vnd.ms-excel, " "application/mspowerpoint, " "application/postscript"; */ const char *cmd = "GET"; if ( size == 0 ) cmd = "HEAD"; if ( doPost ) cmd = "POST"; // crap, can't spider nyt.com if we are 1.0, so use 1.0 but also // note Connection: Close\r\n when making requests //proto = "HTTP/1.1"; SafeBuf tmp; const char *up = ""; if ( proxyUsernamePwd && proxyUsernamePwd[0] ) { tmp.safePrintf("Proxy-Authorization: Basic "); tmp.base64Encode (proxyUsernamePwd,strlen(proxyUsernamePwd)); tmp.safePrintf("\r\n"); up = tmp.getBufStart(); } // . now use "Accept-Language: en" to tell servers we prefer english // . i removed keep-alive connection since some connections close on // non-200 ok http statuses and we think they're open since close // signal (read 0 bytes) may have been delayed const char* acceptEncoding = ""; // the scraper is getting back gzipped search results from goog, // so disable this for now // i am re-enabling now for testing... if(g_conf.m_gzipDownloads) acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n"; // i thought this might stop wikipedia from forcing gzip on us // but it did not! 
// else // acceptEncoding = "Accept-Encoding:\r\n"; // char *p = m_buf; // init the safebuf to point to this buffer in our class to avoid // a potential alloc // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 ); m_reqBuf.purge(); // indicate this is good m_reqBufValid = true; if ( size == 0 ) { // 1 for HEAD requests m_requestType = RT_HEAD; m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n\r\n" , "Accept: %s\r\n" "%s" , cmd, path , proto, host , ims , userAgent , accept , up ); } else if ( size != -1 ) m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n" "Accept: %s\r\n" "Range: bytes=%" PRId32"-%" PRId32"\r\n" "%s" , cmd, path , proto , host , ims , userAgent , accept , offset , offset + size , up); else if ( offset > 0 ) // size is -1 m_reqBuf.safePrintf ( "%s %s %s\r\n" "Host: %s\r\n" "%s" "User-Agent: %s\r\n" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" "Accept-Language: en\r\n" //"Accept: */*\r\n" "Accept: %s\r\n" "Range: bytes=%" PRId32"-\r\n" "%s" , cmd, path , proto , host , ims , userAgent , accept , offset , up ); // Wget's request: // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n // firefox's request: // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: 
__utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n else { // until we fix if-modified-since, take it out //ims=""; //userAgent = "Wget/1.10.2"; //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7"; //proto = "HTTP/1.0"; m_reqBuf.safePrintf ( "%s %s %s\r\n" "User-Agent: %s\r\n" "Accept: */*\r\n" "Host: %s\r\n" "%s" "Connection: Close\r\n" //"Connection: Keep-Alive\r\n" //"Accept-Language: en\r\n" "%s" "%s" , //"Accept: %s\r\n\r\n" , //"\r\n", cmd, path , proto , userAgent , host , ims , acceptEncoding, up ); //accept ); } if ( additionalHeader ) m_reqBuf.safePrintf("%s\r\n",additionalHeader ); // cookie here if (cookieJar) { HttpMime::addCookieHeader(cookieJar, url, &m_reqBuf); } // print content-length: if post if ( postData ) { // dammit... recaptcha does not work without this!!!! m_reqBuf.safePrintf ( "Content-Type: " "application/x-www-form-urlencoded\r\n"); } // we need this if doing a post even if postData is NULL if ( doPost ) { int32_t contentLen = 0; if ( postData ) contentLen = strlen(postData); // this overrides if provided. -1 is default if ( postContentLen >= 0 ) contentLen = postContentLen; m_reqBuf.safePrintf ("Content-Length: %" PRId32"\r\n", contentLen ); m_reqBuf.safePrintf("\r\n"); if ( postData ) m_reqBuf.safePrintf("%s",postData); // log it for debug //log("captch: %s",m_buf); } if ( ! doPost ) { // ! postData ) { m_reqBuf.safePrintf("\r\n"); } // restore url buffer if ( pathEnd ) *pathEnd = '?'; return true; }
bool Log::init ( char *filename ) { // set the main process id //s_pid = getpidtid(); setPid(); // init these m_numErrors = 0; m_bufPtr = 0; m_fd = -1; m_disabled = false; #ifdef DEBUG g_dbufSize = 4096; g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer"); if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer"); #endif // m_hostname = g_conf.m_hostname; // m_port = port; // is there a filename to log our errors to? m_filename = filename; if ( ! m_filename ) return true; // skip this for now //return true; // // RENAME log000 to log000-2013_11_04-18:19:32 // if ( g_conf.m_runAsDaemon ) { File f; char tmp[16]; sprintf(tmp,"log%03li",g_hostdb.m_hostId); f.set ( g_hostdb.m_dir , tmp ); // make new filename like log000-2013_11_04-18:19:32 time_t now = getTimeLocal(); tm *tm1 = gmtime((const time_t *)&now); char tmp2[64]; strftime(tmp2,64,"%Y_%m_%d-%T",tm1); SafeBuf newName; if ( ! newName.safePrintf ( "%slog%03li-%s", g_hostdb.m_dir, g_hostdb.m_hostId, tmp2 ) ) { fprintf(stderr,"log rename failed\n"); return false; } // rename log000 to log000-2013_11_04-18:19:32 if ( f.doesExist() ) { //fprintf(stdout,"renaming file\n"); f.rename ( newName.getBufStart() ); } } // open it for appending. // create with -rw-rw-r-- permissions if it's not there. m_fd = open ( m_filename , O_APPEND | O_CREAT | O_RDWR , S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ); if ( m_fd >= 0 ) return true; // bitch to stderr and return false on error fprintf(stderr,"could not open log file %s for appending\n", m_filename); return false; }
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = 
getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
//
// the injection qa test suite
//
// . a re-entrant state machine: each step marks its s_flags[] slot done
//   BEFORE issuing its request, and any getUrl()/wait() that blocks makes
//   us return false; the completion callback re-enters this function and
//   already-finished steps are skipped
// . s_flags[20] and s_flags[5] double as loop cursors over the url list
// . returns true only once the whole test has completed
bool qainject1 ( ) {
	//if ( ! s_callback ) s_callback = qainject1;
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}
	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}
	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);
	//max = 1;
	//
	// inject urls, return false if not done yet
	//
	//static bool s_x4 = false;
	if ( ! s_flags[2] ) {
		// TODO: try delimeter based injection too
		//static long s_ii = 0;
		for ( ; s_flags[20] < max ; ) {
			// inject using html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			// sanity
			//if ( strstr(s_urlPtrs[s_flags[20]],"wdc.htm") )
			//	log("hey");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;//ii++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}
	// +the
	//static bool s_x5 = false;
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}
	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
	// sports news
	//static bool s_x7 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
			return false;
	}
	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
			return false;
	}
	//
	// mdw: query reindex test
	//
	// if ( ! s_flags[30] ) {
	//	s_flags[30] = true;
	//	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	//			"debug=1&q=sports",9999 ) )
	//		return false;
	// }
	//
	// temp end it here
	// return true;
	//
	// eject/delete the urls
	//
	//static long s_ii2 = 0;
	for ( ; s_flags[5] < max ; ) {
		// reject using html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		//s_ii2++;
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}
	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}
	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 1");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}
	return true;
}
// // new code for drawing graph in html with absolute divs instead // of using GIF plotter library which had issues // void Stats::printGraphInHtml ( SafeBuf &sb ) { // gif size char tmp[64]; sprintf ( tmp , "%lix%li", (long)DX+40 , (long)DY+40 ); // "1040x440" // 20 pixel borders //int bx = 10; //int by = 30; // define the space with boundaries 100 unit wide boundaries //plotter.space ( -bx , -by , DX + bx , DY + by ); // draw the x-axis //plotter.line ( 0 , 0 , DX , 0 ); // draw the y-axis //plotter.line ( 0 , 0 , 0 , DY ); // find time ranges long long t2 = 0; for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // set min/max if ( m_pts[i].m_endTime > t2 ) t2 = m_pts[i].m_endTime; } // now compute the start time for the graph long long t1 = 0x7fffffffffffffffLL; // now recompute t1 for ( long i = 0 ; i < MAX_POINTS ; i++ ) { // skip empties if ( m_pts[i].m_startTime == 0 ) continue; // can't be behind more than 1 second if ( m_pts[i].m_startTime < t2 - DT ) continue; // otherwise, it's a candidate for the first time if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime; } // // main graphing window // sb.safePrintf("<div style=\"position:relative;" "background-color:#c0c0c0;" // match style of tables "border-radius:10px;" "border:#6060f0 2px solid;" //"overflow-y:hidden;" "overflow-x:hidden;" "z-index:-10;" // the tick marks we print below are based on it // being a window of the last 20 seconds... and using // DX pixels "min-width:%lipx;" "min-height:%lipx;" //"width:100%%;" //"min-height:600px;" //"margin-top:10px;" "margin-bottom:10px;" //"margin-right:10px;" //"margin-left:10px;" "\">" ,(long)DX ,(long)DY +20); // add 10 more for "2s" labels etc. 
// 10 x-axis tick marks for ( int x = DX/20 ; x <= DX ; x += DX/20 ) { // tick mark //plotter.line ( x , -20 , x , 20 ); sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:0;" "background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\"></div>\n" , (long)x-1 ); // generate label //char buf [ 32 ]; //sprintf ( buf , "%li" , // (long)(DT * (long long)x / (long long)DX) ); // LABEL sb.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:20;" //"background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\">%lis</div>\n" , (long)x-10 // the label: ,(long)(DT * (long long)x / (long long)DX)/1000 ); // move cursor //plotter.move ( x , -by / 2 - 9 ); // plot label //plotter.alabel ( 'c' , 'c' , buf ); } // . each line consists of several points // . we need to know each point for adding otherlines // . is about [400/6][1024] = 70k // . each line can contain multiple data points // . each data point is expressed as a horizontal line segment void *lrgBuf; long lrgSize = 0; lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); lrgSize += MAX_LINES * sizeof(long); lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp"); if (! lrgBuf) { log("could not allocate memory for local buffer in Stats.cpp" "%li bytes needed", lrgSize); return; } char *lrgPtr = (char *)lrgBuf; StatPoint **points = (StatPoint **)lrgPtr; lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *); long *numPoints = (long *)lrgPtr; lrgPtr += MAX_LINES * sizeof(long); memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) ); // store the data points into "lines" long count = MAX_POINTS; for ( long i = m_next ; count >= 0 ; i++ , count-- ) { // wrap around the array if ( i >= MAX_POINTS ) i = 0; // skip point if empty if ( m_pts[i].m_startTime == 0 ) continue; // skip if too early if ( m_pts[i].m_endTime < t1 ) continue; // . find the lowest line the will hold us // . 
this adds point to points[x][n] where x is determined addPoint ( points , numPoints , &m_pts[i] ); } int y1 = 21; // plot the points (lines) in each line for ( long i = 0 ; i < MAX_LINES ; i++ ) { // increase vert y1 += MAX_WIDTH + 1; // wrap back down if necessary if ( y1 >= DY ) y1 = 21; // plt all points in this row for ( long j = 0 ; j < numPoints[i] ; j++ ) { // get the point StatPoint *p = points[MAX_POINTS * i + j]; // transform time to x coordinates int x1 = (p->m_startTime - t1) * (long long)DX / DT; int x2 = (p->m_endTime - t1) * (long long)DX / DT; // if x2 is negative, skip it if ( x2 < 0 ) continue; // if x1 is negative, boost it to -2 if ( x1 < 0 ) x1 = -2; // . line thickness is function of read/write size // . take logs int w = (int)log(((double)p->m_numBytes)/8192.0) + 3; //log("log of %li is %i",m_pts[i].m_numBytes,w); if ( w < 3 ) w = 3; if ( w > MAX_WIDTH ) w = MAX_WIDTH; //plotter.linewidth ( w ); // use the color specified from addStat_r() for this line/pt //plotter.pencolor ( ((p->m_color >> 16) & 0xff) << 8 , // ((p->m_color >> 8) & 0xff) << 8 , // ((p->m_color >> 0) & 0xff) << 8 ); // ensure at least 3 units wide for visibility if ( x2 < x1 + 3 ) x2 = x1 + 3; // . flip the y so we don't have to scroll the browser down // . DY does not include the axis and tick marks long fy1 = DY - y1 + 20 ; // plot it //plotter.line ( x1 , fy1 , x2 , fy1 ); drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w ); // debug msg //log("line (%i,%i, %i,%i) ", x1 , vert , x2 , vert ); //log("bytes = %li width = %li ", m_pts[i].m_numBytes,w); //log("st=%i, end=%i color=%lx " , // (int)m_pts[i].m_startTime , // (int)m_pts[i].m_endTime , // m_pts[i].m_color ); } } sb.safePrintf("</div>\n"); mfree(lrgBuf, lrgSize, "Stats.cpp"); }
// . second injection qa test: delimiter-based multi-doc injection from
//   the local ./injectme3 file, followed by search queries with pinned
//   reply checksums, then collection teardown
// . same re-entrant s_flags[] state-machine protocol as qainject1:
//   returns false while a stage is blocked/in progress, true when done
bool qainject2 ( ) {

	//if ( ! s_callback ) s_callback = qainject2;

	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	//
	// try delimeter based injecting
	//
	//static bool s_y2 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123&deleteurl=0&"
			      "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
			      "hasmime=1&content=");
		// use injectme3 file
		SafeBuf ubuf;
		ubuf.load("./injectme3");
		sb.urlEncode(ubuf.getBufStart());
		if ( ! getUrl ( "/admin/inject",
				// check reply, seems to have only a single
				// docid in it
				-1970198487,
				sb.getBufStart()) )
			return false;
	}

	// now query check; give the injection a moment to settle
	//static bool s_y4 = false;
	if ( ! s_flags[8] ) {
		wait(1.5);
		s_flags[8] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				-1804253505 ) )
			return false;
	}

	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=1" ,-1874756636 ) )
			return false;
	}

	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&hacr=1" ,1651330319 ) )
			return false;
	}

	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&sc=1" ,-1405546537 ) )
			return false;
	}

	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 2");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}

	return true;
}
// . serves the cached copy of a document (the /get "cached page" view)
//   back on the TcpSocket in "st", optionally with query-term
//   highlighting, a disclaimer header, and XML/JSON envelopes
// . returns false if blocked, true otherwise
// . re-entered as a callback each time a blocking stage (title-rec load,
//   noarchive check, utf8 content fetch, rainbow-section print)
//   completes; finished stages fall through on re-entry
// . frees "st" (via sendErrorReply or explicitly) before returning true
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback: re-enter this function when the load completes
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host, needed below to build the click-and-scroll base url
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}

	char *content = xd->ptr_utf8Content;
	int32_t contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	int32_t startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");

	// base href: prefer the redirect url if the doc redirected
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
	}

	// the html header above does not apply to structured formats
	if ( format == FORMAT_XML  ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();
	if ( xd->m_contentType == CT_JSON   ) sb->reset();
	if ( xd->m_contentType == CT_XML    ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();

	// query should be NULL terminated
	char *q = st->m_qsb.getBufStart();
	int32_t qlen = st->m_qsb.getLength();

	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON   ) printDisclaimer = false;
	if ( xd->m_contentType == CT_STATUS ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// format the spider date in GMT if anything below needs it
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR
			       "\" cellpadding=\"10\" "
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       "><td>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		// then the rest
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%"INT32"&"
				"d=%"INT64"&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll ,
				(int32_t)st->m_rtq,
				st->m_docId, styleLink );

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	uint32_t ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%"INT64"",st->m_docId );
	x += gbstrlen(x);

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q ,    // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// declare up here
	Matches m;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );
	int32_t hilen = 0;

	// and highlight the matches in the disclaimer header
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m ,  // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll
				 (char *)thisUrl );// base url for ClcknScrll
		// now an hr
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON   ) includeHeader = false;
	if ( xd->m_contentType == CT_XML    ) includeHeader = false;
	if ( xd->m_contentType == CT_STATUS ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	char ctype = (char)xd->m_contentType;
	// do not calc title or print it if doc is xml or json
	if ( ctype == CT_XML    ) sbend = sbstart;
	if ( ctype == CT_JSON   ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		int32_t printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> "
				       );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {
		sb->safePrintf("<pre>");
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON   ) queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS ) queryHighlighting = false;

	// structured formats encode the content through "tmp" so it can be
	// cdata/json escaped into sb below; html writes into sb directly
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// BUGFIX: was "\t\"content\":\"\n" -- the raw newline landed
		// INSIDE the json string value, which RFC 8259 forbids
		// (unescaped control chars), making the reply invalid json
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON   ) contentType = "application/json";
	if ( xd->m_contentType == CT_STATUS ) contentType = "application/json";
	// NOTE(review): "test/xml" looks like a typo for "text/xml", but the
	// disabled line above suggests a bogus type may be deliberate to keep
	// the browser from xml-rendering -- confirm intent before changing
	if ( xd->m_contentType == CT_XML ) contentType = "test/xml";
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// and convey the status
	return status;
}
// . spider qa test: configures url filters and a site list, seeds
//   walmart.com/ibm.com via the addurl api, polls the spider status
//   page until "Nothing currently available to spider", then runs a
//   battery of hopcount/facet queries with pinned reply checksums
// . same re-entrant s_flags[] state-machine protocol as the inject
//   tests: returns false while blocked/in progress, true when complete
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}
	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}
	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // take out hopcount for now, just test quotas
			      // "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
			      // just one spider out allowed for consistency
			      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}
	// set the site list to
	// a few sites
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	//static bool s_y2 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}
	//
	// wait for spidering to stop
	//
 checkagain:
	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[5] ) {
		// wait 5 seconds, call sleep timer... then call qatest()
		//usleep(5000000); // 5 seconds
		wait(3.0);
		s_flags[5] = true;
		return false;
	}
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}
	//static bool s_k2 = false;
	if ( ! s_flags[6] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		// s_content holds the reply of the status fetch above; if
		// spiders are still running, clear both poll flags and loop
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}
	// wait for index msg4 to not be cached to ensure all results indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
	}
	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1672870556 ) )
			return false;
	}
	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0",
				908338607 ) )
			return false;
	}
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}
	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}
	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[13] ) {
	//	s_flags[13] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	// }
	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate",
				999 ) )
			return false;
	}
	// xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating" ,
				999 ) )
			return false;
	}
	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart" ,
				999 ) )
			return false;
	}
	//static bool s_fee2 = false;
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER1 TEST");
		return true;
	}
	return true;
}
bool processLoop ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; if ( st->m_u && st->m_u[0] ) { // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp CollectionRec *cr = xd->getCollRec(); if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") ) // use same dir that XmlDoc::getTestDir() would use saveTestBuf ( "test-page-parser" ); // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf char *metalist = xd->getMetaList ( ); if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked if ( metalist == (void *)-1 ) return false; // for debug... if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // print it out xd->printDoc( xbuf ); } // print reason we can't analyze it (or index it) //if ( st->m_indexCode != 0 ) { // xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", // mstrerror(st->m_indexCode)); //} // we are done g_inPageParser = false; // print the final tail //p += g_httpServer.printTail ( p , pend - p ); //log("parser: send sock=%li",st->m_s->m_sd); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE(review): the old gotIndexList2() body and its callback wrappers
//   were merged into this function by commenting them out below; the live
//   code resumes after the first closing */
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;

	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0 ,
				    0 ,
				    0 , // max cache age
				    false , // add to cache?
				    RDB_DATEDB , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2 ,
				    (char *)&startKey ,
				    (char *)&endKey ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree , // include tree?
				    //st->m_useCache , // include cache?
				    //false , // add to cache?
				    //0 , // startFileNum
				    //numFiles , // numFiles
				    st , // state
				    gotIndexListWrapper2 ,
				    0 ) ) // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/

	// get the socket
	TcpSocket *s = st->m_socket;

	// don't allow pages bigger than 128k in cache
	//char buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;

	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/

	// check box " checked" strings for the admin form below
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb ) ubs = " checked";
	if ( st->m_useTree   ) uts = " checked";
	if ( st->m_useDisk   ) uds = " checked";
	if ( st->m_useCache  ) ucs = " checked";
	if ( st->m_add       ) add = " checked";
	if ( st->m_del       ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf (
		"<center>\n"
		"<table cellpadding=2><tr><td colspan=4>"
		"useDatedb:<input type=checkbox value=1 name=ub%s> "
		"useTree:<input type=checkbox value=1 name=ut%s> "
		"useDisk:<input type=checkbox value=1 name=ud%s> "
		"useCache:<input type=checkbox value=1 name=uc%s> "
		"ADD:<input type=checkbox value=1 name=add%s> "
		"DELETE:<input type=checkbox value=1 name=del%s>"
		"</td></tr><tr><td>"
		"query:"
		"</td><td>"
		"<input type=text name=q value=\"%s\" size=20>"
		"</td><td>"
		"collection:"
		"</td><td>"
		"<input type=text name=c value=\"%s\" size=10>"
		"</td></tr><tr><td>"
		"termId:"
		"</td><td>"
		"<input type=text name=t value=%lli size=20>"
		"</td><td>"
		"numRecs:"
		"</td><td>"
		"<input type=text name=numRecs value=%li size=10> "
		"</td></tr><tr><td>"
		"docId:"
		"</td><td>"
		"<input type=text name=d value=%lli size=20> "
		"</td><td>"
		"score:"
		"</td><td>"
		"<input type=text name=score value=%li size=10> "
		"</td><td>"
		"<input type=submit value=ok border=0>"
		"</td></tr>"
		"<tr><td colspan=2>"
		"term appears in about %lli docs +/- %li"
		"</td></tr>"
		//"<tr><td colspan=2>"
		//"this indexlist held by host #%li and twins"
		//"</td></tr>"
		"</table>"
		"</form><br><br>" ,
		ubs, uts, uds, ucs, add, del,
		st->m_query ,
		st->m_coll ,
		st->m_termId ,
		st->m_numRecs ,
		st->m_docId ,
		(long)st->m_score ,
		st->m_termFreq ,
		// +/- fudge factor derived from page size and file count
		2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * base->getNumFiles() );
		//hostnum );

	// error or nothing to show: send what we have and clean up
	if ( g_errno || (st->m_list.isEmpty() ) ) { //&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s ,
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf (
		"<table cellpadding=1 border=1>"
		"<tr><td>#</td><td>score</td>"
		"<td>docId</td><td>domHash</td></tr>");
	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for ( st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted () ;
	      st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		// NOTE(review): ip/port are computed but only referenced by
		// the commented-out href format below
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 )
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		// "date/" prefix for the score column when datedb is on
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf (
			"<tr><td>%li.</td>"
			"<td>%s%i</td>"
			"<td>"
			//"<a href=http://%s:%hu/master/titledb?d=%llu>"
			"<a href=/master/titledb?c=%s&d=%llu>"
			"%llu"
			//"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			"</td>"
			"<td>"
			"0x%02lx"
			"</td>"
			"</tr>\n" ,
			i++,
			ds,
			(int)st->m_list.getCurrentScore() ,
			//iptoa(ip) , port ,
			st->m_coll,
			docId ,
			docId ,
			(long)dh );
	}
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() )
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>"
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");
	// now print the score/docId of datedb list
	i = 0;
	for ( st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted () ;
	      st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p ,
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId ,
			  docId );
		p += gbstrlen ( p );
	}
	*/

	if ( ! st->m_list.isEmpty() )
		pbuf->safePrintf ( "</table>" );

	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	pbuf->safePrintf ( "</center>\n");

	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s ,
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
// . serves both the "add collection" and "delete collection" admin pages
// . "add" selects which page variant to render
// . XML/JSON requests get a success/error reply only; HTML gets the full form
// . returns false if blocked, true otherwise; sets g_errno on error
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {

#ifdef PRIVACORE_SAFE_VERSION
	// whole feature is compiled out in the safe build
	g_errno = EBADENGINEER;
	char *msg = "Function disabled by PRIVACORE_SAFE_VERSION define";
	return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
#else
	// get collection name
	//int32_t nclen;
	//char *nc = r->getString ( "nc" , &nclen );
	//int32_t cpclen;
	//char *cpc = r->getString ( "cpc" , &cpclen );

	g_errno = 0;

	//bool cast = r->getLong("cast",0);

	// error text shown on the HTML page, if any
	// NOTE(review): only set below via mstrerror(g_errno) since the
	// dead-host check is commented out
	const char *msg = NULL;

	// if any host in network is dead, do not do this
	//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";

	// . are we adding a collection?
	// . return if error adding, might already exist!
	// . g_errno should be set
	// . WE DO NOT NEED THIS ANYMORE. Pages.cpp now broadcasts
	//   addcoll as CommandAddColl() parm.
	/*
	if ( nclen > 0 && add && ! cast ) {
		// do not allow "main" that is used for the "" collection
		// for backwards compatibility
		//if ( strcmp ( nc , "main" ) != 0 )
		g_collectiondb.addRec (nc,cpc,cpclen,true,(collnum_t)-1,
				       false , // isdump?
				       true ) ;// save it?
		//else
		//	log("admin: \"main\" collection is forbidden.");
	}
	if ( ! add && ! cast )
		g_collectiondb.deleteRecs ( r ) ;
	*/

	char format = r->getReplyFormat();

	// API callers (xml/json) just need to know the parm was accepted;
	// the actual add/delete is broadcast elsewhere (see note above)
	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		int32_t page = g_pages.getDynamicPageNumber ( r );
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		// accept camelCase spellings too
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			// shadows the outer "msg" intentionally
			const char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	// error?
	const char *action  = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);

	// add our ip to the list
	//char *ips = r->getString("collips",NULL);
	//char *pwds = r->getString("collpwd",NULL);

	// stack-backed page buffer
	char buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);

	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//
	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && gbstrlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit:
	// only alnum, '-' and '_' are allowed; x stops at first bad char
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
				"malformed name because it "
				"contains the '%c' character.</font><br><br>",
				addColl,*x);
		gmsg = gtmp.getBufStart();
	}
	//
	// END GIGABOT ERRORS
	//

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	// NOTE(review): narrowing long->char; used only as a boolean flag
	char guide = r->getLong("guide",0);

	// do not redirect if gmsg is set, there was a problem with the name
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
		// just redirect to it
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				     "content=\"0; URL=/admin/settings"
				     "?guide=1&c=%s\">",
				     addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}

	// print standard header; focus the add-collection box on load
	g_pages.printAdminTop ( &p , s , r , NULL,
				"onload=document."
				"getElementById('acbox').focus();");

	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			"<center>\n"
			"<font color=red>"
			"<b>Error %s collection: %s. "
			"See log file for details.</b>"
			"</font>"
			"</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );

	// print the add collection box
	if ( add /*&& (! nc[0] || g_errno ) */ ) {

		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";
		p.safePrintf (
			"<center>\n<table %s>\n"
			"<tr class=hdrow><td colspan=2>"
			"<center><b>%s</b></center>"
			"</td></tr>\n"
			,TABLE_STYLE
			,t1 );
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		// pre-fill the box with the name they tried, if any
		const char *str = addColl;
		if ( ! addColl ) str = "";
		p.safePrintf (
			"<tr bgcolor=#%s>"
			"<td><b>name of new %s to add</td>\n"
			"<td><input type=text name=addcoll size=30 "
			"id=acbox "
			"value=\"%s\">"
			"</td></tr>\n"
			, LIGHT_BLUE
			, t2
			, str );
		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				"<tr bgcolor=#%s>"
				"<td><b>clone settings from this "
				"collection</b>"
				"<br><font size=1>Copy settings from "
				"this pre-existing collection. Leave "
				"blank to "
				"accept default values.</font></td>\n"
				"<td><input type=text name=clonecoll "
				"size=30>"
				"</td>"
				"</tr>"
				, LIGHT_BLUE );
		// collection pwds
		p.safePrintf(
			"<tr bgcolor=#%s>"
			"<td><b>collection passwords"
			"</b>"
			"<br><font size=1>List of white space separated "
			"passwords allowed to adminster collection."
			"</font>"
			"</td>\n"
			"<td><input type=text name=collpwd "
			"size=60>"
			"</td>"
			"</tr>"
			, LIGHT_BLUE );
		// ips box for security
		p.safePrintf(
			"<tr bgcolor=#%s>"
			"<td><b>collection ips"
			"</b>"
			"<br><font size=1>List of white space separated "
			"IPs allowed to adminster collection."
			"</font>"
			"</td>\n"
			"<td><input type=text name=collips "
			"size=60>"
			"</td>"
			"</tr>"
			, LIGHT_BLUE );
		// now list collections from which to copy the config
		//p.safePrintf (
		//	"<tr><td><b>copy configuration from this "
		//	"collection</b><br><font size=1>Leave blank to "
		//	"accept default values.</font></td>\n"
		//	"<td><input type=text name=cpc value=\"%s\" size=30>"
		//	"</td></tr>\n",coll);
		p.safePrintf ( "</table></center><br>\n");
		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// if we added a collection, print its page
	//if ( add && nc[0] && ! g_errno )
	//	return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
	//					  nc , pwd );

	// nothing to delete? skip straight to the page footer
	if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;

	// print all collections out in a checklist so you can check the
	// ones you want to delete, the values will be the id of that collectn
	p.safePrintf (
		"<center>\n<table %s>\n"
		"<tr class=hdrow><td><center><b>Delete Collections"
		"</b></center></td></tr>\n"
		"<tr bgcolor=#%s><td>"
		"<center><b>Select the collections you wish to delete. "
		//"<font color=red>This feature is currently under "
		//"development.</font>"
		"</b></center></td></tr>\n"
		"<tr bgcolor=#%s><td>"
		// table within a table
		"<center><table width=20%%>\n",
		TABLE_STYLE,
		LIGHT_BLUE,
		DARK_BLUE );

	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		CollectionRec *cr = g_collectiondb.m_recs[i];
		// slots can be empty (deleted collections)
		if ( ! cr ) continue;
		p.safePrintf (
			"<tr bgcolor=#%s><td>"
			"<input type=checkbox name=delcoll value=\"%s\"> "
			"%s</td></tr>\n",
			DARK_BLUE,
			cr->m_coll,cr->m_coll);
	}

	p.safePrintf( "</table></center></td></tr></table><br>\n" );

 skip:
	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
#endif
}
// . computes the posdb start/end keys for every query term, restricted to
//   this host's docid stripe, then launches Msg2 to fetch the termlists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::getLists () {
	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid paritioning so we can send to all hosts
	//   in the network, not just one stripe
	long long docIdStart = 0;
	long long docIdEnd   = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	long long dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd   = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute
	//   search results in stages, so that we do not load massive
	//   termlists into memory and got OOM (out of memory)
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;

	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	long long delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else docIdStart = docIdStart + delta2;
	//}

	// new striping logic: each stripe takes an equal slice of the
	// [docIdStart,docIdEnd) range
	long numStripes = g_hostdb.getNumStripes();
	long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
	long stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe;
	// is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	docIdEnd++;

	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}

	// do not go over MAX_DOCID because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;

	// remember so Msg2.cpp can use them to restrict the termlists
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd   = docIdEnd;

	//
	// set startkey/endkey for each term/termlist
	//
	for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// shortcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		long long tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey   ( ek , tid , docIdEnd );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg: dump every query term's attributes to the log
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			// temporarily NUL-terminate the term in place so it
			// can be printed with %s; restored below
			char *tpc = qt->m_term + qt->m_termLen;
			char tmp  = *tpc;
			*tpc = '\0';
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			long wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram  = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			long ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			memcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			long isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			sb.safePrintf(
				"query: msg39: [%lu] query term #%li \"%s\" "
				"phr=%li termId=%llu rawTermId=%llu "
				//"estimatedTermFreq=%lli (+/- ~16000) "
				"tfweight=%.02f "
				"sign=%c "
				"numPlusses=%hhu "
				"required=%li "
				"fielcode=%li "
				"ebit=0x%0llx "
				"impBits=0x%0llx "
				"wikiphrid=%li "
				"leftwikibigram=%li "
				"rightwikibigram=%li "
				//"range.startTermNum=%hhi range.endTermNum=%hhi "
				//"minRecSizes=%li "
				"readSizeInBytes=%li "
				//"ebit=0x%llx "
				//"impBits=0x%llx "
				"hc=%li "
				"component=%li "
				"otermLen=%li "
				"isSynonym=%li "
				"querylangid=%li ",
				(long)this ,
				i ,
				qt->m_term,//bb ,
				(long)m_tmpq.isPhrase (i) ,
				m_tmpq.getTermId (i) ,
				m_tmpq.getRawTermId (i) ,
				((float *)m_r->ptr_termFreqWeights)[i] ,
				sign ,
				//c ,
				0 ,
				(long)qt->m_isRequired,
				(long)qt->m_fieldCode,
				(long long)qt->m_explicitBit  ,
				(long long)qt->m_implicitBits ,
				wikiPhrId,
				(long)leftwikibigram,
				(long)rightwikibigram,
				((long *)m_r->ptr_readSizes)[i] ,
				//(long long)m_tmpq.m_qterms[i].m_explicitBit  ,
				//(long long)m_tmpq.m_qterms[i].m_implicitBits ,
				(long)m_tmpq.m_qterms[i].m_hardCount ,
				(long)m_tmpq.m_componentCodes[i],
				(long)m_tmpq.getTermLen(i) ,
				isSynonym,
				(long)m_tmpq.m_langId); // ,tt
			// put it back
			*tpc = tmp;
			if ( st ) {
				long stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%li",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%lli ",qt->m_synWids0);
				sb.safePrintf("synwid1=%lli ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%li ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%li ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());
		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug )
		log(LOG_DEBUG,"query: msg39: [%lu] Getting %li index lists ",
		    (long)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is uneccessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;

	// split is us????
	//long split = g_hostdb.m_myHost->m_group;
	long split = g_hostdb.m_myHost->m_shardNum;

	// call msg2
	if ( ! m_msg2.getLists ( rdbId ,
				 m_r->ptr_coll ,
				 m_r->m_maxAge ,
				 m_r->m_addToCache ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (long *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms() , // numLists
				 m_lists ,
				 this ,
				 gotListsWrapper ,
				 m_r ,
				 m_r->m_niceness ,
				 true , // do merge?
				 m_debug ,
				 NULL , // best hostids
				 m_r->m_restrictPosdbForQuery ,
				 split ,
				 checkCache )) {
		// blocked; gotListsWrapper will be called when done
		m_blocked = true;
		return false;
	}
	// error?
	if ( g_errno ) {
		log("msg39: Had error getting termlists2: %s.",
		    mstrerror(g_errno));
		// don't bail out here because we are in docIdSplitLoop()
		//sendReply (m_slot,this,NULL,0,0,true);
		return true;
	}

	return gotLists ( true );
}
bool sendReply ( void *state , bool addUrlEnabled ) { // allow others to add now //s_inprogress = false; // get the state properly //gr *st1 = (gr *) state; GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log(LOG_INFO,"http: add url %s (%s)", xb.getBufStart(),mstrerror(g_errno)); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } long ulen = 0; char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); //char rawbuf[1024*8]; //SafeBuf rb(rawbuf, 1024*8); //rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"); //rb.safePrintf("<status>\n"); //CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll ); // collection name char tt [ 128 ]; tt[0] = '\0'; g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // display url //char *url = gr->m_urlsBuf; //if ( url && ! url[0] ) url = NULL; // watch out for NULLs if ( ! 
url ) url = "http://"; // if there was an error let them know //char msg[MAX_URL_LEN + 1024]; SafeBuf mbuf; //char *pm = ""; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); //pm = msg; //rb.safePrintf("Error adding url(s): %s[%i]", // mstrerror(g_errno) , g_errno); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b> added to spider " "queue " "successfully<br><br>"); mbuf.safePrintf("</font></center>"); //rb.safePrintf("%s added to spider " // "queue successfully", url ); //pm = msg; //url = "http://"; //else // pm = "Don't forget to <a href=/gigaboost.html>" // "Gigaboost</a> your URL."; } if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() ); g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool Msg3a::gotAllSplitReplies ( ) { // if any of the split requests had an error, give up and set m_errno // but don't set if for non critical errors like query truncation if ( m_errno ) { g_errno = m_errno; return true; } // also reset the finalbuf and the oldNumTopDocIds if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // update our estimated total hits m_numTotalEstimatedHits = 0; for ( long i = 0; i < m_numHosts ; i++ ) { // get that host that gave us the reply //Host *h = g_hostdb.getHost(i); // . get the reply from multicast // . multicast should have destroyed all slots, but saved reply // . we are responsible for freeing the reply // . we need to call this even if g_errno or m_errno is // set so we can free the replies in Msg3a::reset() // . if we don't call getBestReply() on it multicast should // free it, because Multicast::m_ownReadBuf is still true Multicast *m = &m_mcast[i]; bool freeit = false; long replySize = 0; long replyMaxSize; char *rbuf; Msg39Reply *mr; // . only get it if the reply not already full // . if reply already processed, skip // . perhaps it had no more docids to give us or all termlists // were exhausted on its disk and this is a re-call // . we have to re-process it for count m_numTotalEstHits, etc. rbuf = m->getBestReply ( &replySize , &replyMaxSize , &freeit , true ); //stealIt? // cast it mr = (Msg39Reply *)rbuf; // in case of mem leak, re-label from "mcast" to this so we // can determine where it came from, "Msg3a-GBR" relabel( rbuf, replyMaxSize , "Msg3a-GBR" ); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeit ) { log(LOG_LOGIC,"query: msg3a: Steal failed."); char *xx = NULL; *xx=0; } // bad reply? if ( ! 
mr ) { log(LOG_LOGIC,"query: msg3a: Bad NULL reply."); m_reply [i] = NULL; m_replyMaxSize[i] = 0; // it might have been timd out, just ignore it!! continue; // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; // all reply buffers should be freed on reset() return true; } // how did this happen? if ( replySize < 29 && ! mr->m_errno ) { // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.", replySize); // all reply buffers should be freed on reset() return true; } // can this be non-null? we shouldn't be overwriting one // without freeing it... if ( m_reply[i] ) // note the mem leak now log("query: mem leaking a 0x39 reply"); // cast it and set it m_reply [i] = mr; m_replyMaxSize[i] = replyMaxSize; // deserialize it (just sets the ptr_ and size_ member vars) //mr->deserialize ( ); deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); // sanity check if ( mr->m_nqt != m_q->getNumTerms() ) { g_errno = EBADREPLY; m_errno = EBADREPLY; log("query: msg3a: Split reply qterms=%li != %li.", (long)mr->m_nqt,(long)m_q->getNumTerms() ); return true; } // return if split had an error, but not for a non-critical // error like query truncation if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) { g_errno = mr->m_errno; m_errno = mr->m_errno; log("query: msg3a: Split had error: %s", mstrerror(g_errno)); return true; } // skip down here if reply was already set //skip: // add of the total hits from each split, this is how many // total results the lastest split is estimated to be able to // return // . THIS should now be exact since we read all termlists // of posdb... m_numTotalEstimatedHits += mr->m_estimatedHits; // debug log stuff if ( ! 
m_debug ) continue; // cast these for printing out long long *docIds = (long long *)mr->ptr_docIds; score_t *scores = (score_t *)mr->ptr_scores; // print out every docid in this split reply for ( long j = 0; j < mr->m_numDocIds ; j++ ) { // print out score_t logf( LOG_DEBUG, "query: msg3a: [%lu] %03li) " "split=%li docId=%012llu domHash=0x%02lx " "score=%lu" , (unsigned long)this , j , i , docIds [j] , (long)g_titledb.getDomHash8FromDocId(docIds[j]), (long)scores[j] ); } } // this seems to always return true! mergeLists ( ); if ( ! m_r->m_useSeoResultsCache ) return true; // now cache the reply SafeBuf cr; long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4); long need = sizeof(key_t) + 4 + dataSize; bool status = cr.reserve ( need ); // sanity if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; } // ignore errors g_errno = 0; // return on error with g_errno cleared if cache add failed if ( ! status ) return true; // add to buf otherwise cr.safeMemcpy ( &m_ckey , sizeof(key_t) ); cr.safeMemcpy ( &dataSize , 4 ); long now = getTimeGlobal(); cr.pushLong ( now ); cr.pushLong ( m_numDocIds ); cr.pushLong ( m_numTotalEstimatedHits );//Results ); long max = m_numDocIds; // then the docids for ( long i = 0 ; i < max ; i++ ) cr.pushLongLong(m_docIds[i] ); for ( long i = 0 ; i < max ; i++ ) cr.pushFloat(m_scores[i]); for ( long i = 0 ; i < max ; i++ ) cr.pushLong(getSiteHash26(i)); // sanity if ( cr.length() != need ) { char *xx=NULL; *xx=0; } // make these key_t startKey; key_t endKey; startKey = m_ckey; // clear delbit startKey.n0 &= 0xfffffffffffffffeLL; // end key is us endKey = m_ckey; // that is the single record m_seoCacheList.set ( cr.getBufStart() , cr.length(), cr.getBufStart(), // alloc cr.getCapacity(), // alloc size (char *)&startKey, (char *)&endKey, -1, // fixeddatasize true, // owndata? false,// use half keys? 
sizeof(key_t) ); // do not allow cr to free it, msg1 will cr.detachBuf(); // note it //log("seopipe: storing ckey=%s q=%s" // ,KEYSTR(&m_ckey,12) // ,m_r->ptr_query // ); //log("msg1: sending niceness=%li",(long)m_r->m_niceness); // this will often block, but who cares!? it just sends a request off if ( ! m_msg1.addList ( &m_seoCacheList , RDB_SERPDB,//RDB_CACHEDB, m_r->ptr_coll, this, // state gotSerpdbReplyWrapper, // callback false, // forcelocal? m_r->m_niceness ) ) { //log("blocked"); return false; } // we can safely delete m_msg17... just return true return true; }
// . displays the stats for a username // . show stats for every day we have them for // . in a big list // . if they click the day display all docids evaluated for that day // . show the accuracy for that day too // . how many docs they edited // . how many of those docs were verified by another // . and if there was consensus void gotTransdbList ( State60 *st ) { // get today's time range time_t now = getTimeGlobal(); // get start of today time_t dayStart = now / (24*3600); SafeBuf sb; // int16_tcut TcpSocket *s = st->m_s; // make about 200k of mem to write into if ( ! sb.reserve ( 200000 ) ) return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno)); // print description so they can clikc a button to start the turk sb.safePrintf("<html>\n" "<title>Event Editor</title>\n" "<body>\n" "<table width=\"100%%\" border=\"0\">\n" "<tr><td style=\"background-color:#0079ba;\">\n" "<center><font color=#00000>" "<h2>Event Editor</h2>\n" "</font></center></td>" "</tr></table>"); // print the content sb.safePrintf("<center><font size=4><blink>" "<b><a href=\"/pageturk?c=%s&edit=1\">" "Click here to start editing.</a></b></blink>" "</font><br><i>Please take your " "time to read the information below before you begin" "</i><br><font color=\"red\" size=2> Warning: Adult " "content might be presented to you." " You should be above 18 years of age to continue." "</center></font>",st->m_coll); sb.safePrintf("<font face=arial,sans-serif color=black size=3>" "<p>By clicking <i>Start Voting</i>, you will be " "presented with an interface for editing events. " "The editor will display a modified web page that " "contains one or more events. Each event's description " "will be highlight with a blue background. You can " "toggle whether a particular event is displayed by " "clicking on that event's ID. You can highlight one or " "multiple event descriptions at the same time. 
" "</p><p>" "By clicking on the section icons in the web page you " "can tell the editor that a virtual fence should be " "erected around that section. The fence will make sure " "that event descriptions can not span across it. Each " "event description must be fully contained either " "inside or outside the fence. However, you can also " "declare a section as a title section, which means that " "the text that the title section contains is free to be " "used by any event description." "</p>\n" "<p>When you are done erecting section fences, you " "submit your changes. The more changes you make the " "more points you earn. Other users may evaluate " "your edits for accuracy. You will be paid based on the " "points you earn as well as your accuracy. All " "transactions are listed in the table below.</p>" "<p>You may not change your username or password " "but you can change your email address. Your email " "address will be used to pay you with PayPal every " "Friday. Paypal fees will be deducted on your end. By " "using this service you agree to all stated Terms & " "Conditions.</p>" "</font>\n"); // get the user record User *uu = g_users.getUser ( username ); // print out their info, like paypal email sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Info</center>" "</td></tr>\n" "<tr>" "<td>Email</td>" "<td><input type=text value=%s></td>" "<td>email address used to pay with paypal</td>" "</tr>\n" "<tr><td colspan=10><input type=submit value=update>" "</td></tr>\n" "</table>\n" , uu->m_payPalEmail ); // print your stats here now sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Stats</center>" "</td></tr>\n" "<tr>" "<td>date</td>" "<td>action</td>" "<td>amount</td>" "<td>desc</td>" "</tr>\n"); // int16_tcut RdbList *list = &st->m_list; int32_t lastDay = -1; int32_t totalReceives = 0; int32_t totalSubmits = 0; int32_t totalPasses = 0; int32_t totalFails = 0; // scan the list for ( ; ! 
list->isExhausted() ; ) { // get rec char *rec = list->getCurrentRecord(); char *data = list->getCurrentData(); int32_t dataSize = list->getCurrentDataSize(); // skip that list->skipCurrentRecord(); // skip if negative if ( (rec[0] & 0x01) == 0x00 ) continue; // get the time (global time - sync'd with host #0) time_t tt = g_transdb.getTimeStamp ( rec ); // get day # int32_t daynum = tt / (24*3600); // is it today? bool isToday = ( daynum >= dayStart ); // point to the Transaction Trans *trans = (Trans *)data; // if is today, print it out verbatim if ( isToday ) { // print it in html row format to match table above //printTrans ( &sb , rec ); sb.safePrintf("<tr>"); // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%H:%M:%S",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats if ( trans->m_actionType == AT_RECEIVE_DOC ) sb.safePrintf("<td>receive</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_SUBMIT_DOC ) sb.safePrintf("<td>submit</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_PASS_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was verified " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_FAIL_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was deemed to " "be incorrect " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_ACCURACY_EVAL) sb.safePrintf("<td>accuracy eval</td>" "<td>%.02f</td>" "<td>docid=%"UINT64"</td>", trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_CHARGE) sb.safePrintf("<td>credit</td>" "<td>%.02f</td>" "<td>You made 
money.</td>", trans->m_number); else if ( trans->m_actionType == AT_PAYMENT) sb.safePrintf("<td>payment</td>" "<td>%.02f</td>" "<td>We paid you.</td>", trans->m_number); else if ( trans->m_actionType == AT_LOGIN) sb.safePrintf("<td>login</td>" "<td>-</td>" "<td>You logged in.</td>"); else if ( trans->m_actionType == AT_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You logged out.</td>"); else if ( trans->m_actionType == AT_AUTO_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You were auto " "logged out.</td>"); else { char *xx=NULL;*xx=0; } sb.safePrintf("</tr>\n"); continue; } // if does not match last day, print out that last day's stats // and reset for next guy if ( daynum != lastDay && lastDay != -1 ) { // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats sb.safePrintf("<tr>" "<td>receive</td>" "<td>%"INT32"</td>" "<td>Total received</td>" "</tr>\n", totalReceives); sb.safePrintf("<tr>" "<td>submit</td>" "<td>%"INT32"</td>" "<td>Total submitted</td>" "</tr>\n", totalSubmits); sb.safePrintf("<tr>" "<td>pass</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests passed</td>" "</tr>\n", totalPasses); sb.safePrintf("<tr>" "<td>fail</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests failed</td>" "</tr>\n", totalFails); // reset as well totalReceived = 0; totalSubmits = 0; totalPasses = 0; totalFails = 0; } // remember last day # we processed for accumulating stats lastDay = daynum; // accum stats if ( trans->m_actionType == AT_RECEIVE_DOC ) totalReceives++; if ( trans->m_actionType == AT_SUBMIT_DOC ) totalSubmits++; if ( trans->m_actionType == AT_PASS_DOC ) totalPasses++; if ( trans->m_actionType == AT_FAIL_DOC ) totalFails++; } sb.safePrintf("</body></html>\n"); sendReply ( &sb ); }
void Statsdb::drawHR ( float z , float ymin , float ymax , //GIFPlotter *plotter , SafeBuf &gw, Label *label , float zoff , long color ) { // convert into yspace float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin); // avoid collisions with other graphs z2 += zoff; // border //z2 += m_by; // round off error z2 += 0.5; // for adjusatmnet float ptsPerPixel = (ymax-ymin)/ (float)DY2; // make an adjustment to the label then! -- Commented out because it's currently not used. float zadj = zoff * ptsPerPixel; //#ifdef _USEPLOTTER_ // use the color specified from addStat_r() for this line/pt //plotter->pencolor ( ((color >> 16) & 0xff) << 8 , // ((color >> 8) & 0xff) << 8 , // ((color >> 0) & 0xff) << 8 ); // horizontal line //plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 ); long width = 1; drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width); // make label char tmp[128]; // . use "graphHash" to map to unit display // . this is a disk read volume sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar); /* // a white shadow plotter->pencolor ( 0xffff,0xffff,0xffff ); plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 ); plotter->alabel ( 'c' , 'c' , tmp ); // a black shadow plotter->pencolor ( 0 , 0 , 0 ); plotter->move ( m_bx + 80 + 1 , z2 + 10 - 1 ); plotter->alabel ( 'c' , 'c' , tmp ); //long color = label->m_color; // use the color specified from addStat_r() for this line/pt plotter->pencolor ( ((color >> 16) & 0xff) << 8 , ((color >> 8) & 0xff) << 8 , ((color >> 0) & 0xff) << 8 ); // move cursor plotter->move ( m_bx + 80 , z2 + 10 ); // plot label plotter->alabel ( 'c' , 'c' , tmp ); */ // LABEL gw.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:%li;" "color:#%lx;" "z-index:110;" "font-size:14px;" "min-height:20px;" "min-width:3px;\">%s</div>\n" , (long)(m_bx) , (long)z2 +m_by , color // the label: , tmp ); }
// . build the turk event-editor markup for the doc in st->m_xd
// . marks SEC_CONTROL sections, then emits the page words wrapped in
//   clickable <div> fences into "sb"
// . returns false on the normal path (see NOTE at bottom) and the
//   result of sendErrorReply() on allocation failure
bool sendTurkPageReply ( State60 *st ) {
	XmlDoc *xd = &st->m_xd;

	// in case getSections() blocks come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	Sections *ss = xd->getSections();

	// now for each section with alnum text, telescope up as far as
	// possible without containing anymore alnum text than what it
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker
	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".
	// * hold shift down to designate as title section when clicking it
	// * show the raw text of each event changing as you fence
	//   sections in or out. show in a right frame.
	// * show list of events on page in the top frame. can toggle them
	//   all individually.
	// * each section hash has its own unique bg color when activated
	// * when an event id is selected activate its bgcolor for all
	//   sentences currently in the event that are not in activated
	//   sections; each event sentence div carries one attr per event id
	//   (e.g. <div ev1=1 ev2=1 ev10=1>...</div>) so it can be activated
	//   when any of those ids is.
	SafeBuf sb;

	// shortcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t    nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t   *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );

	//
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// we need to send back the colors of the
				// sections that have been activated.
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b <  0 ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		// TODO(review): empty placeholder -- decoloring was never
		// implemented
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}

	// BUGFIX: endCounts was mcalloc'd above and leaked on this path;
	// free it before returning (same mfree(ptr,size,label) convention
	// used elsewhere in this file)
	mfree ( endCounts , nw , "endcounts" );

	// NOTE(review): "sb" is built but never sent from here -- confirm
	// the continuation sends it, otherwise the page is dropped
	return false;
}
// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
// . "lasta" is in/out: the left fence of the previous call's window, used
//   to avoid overlapping windows; on return it holds this window's "a"
// . "gotIt"/"retired" are per-query-word counters used to penalize words
//   already matched in this window / in previously-won windows
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				 int32_t *besta, int32_t *bestb, char *gotIt,
				 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in
	//   another summary.
	// . bail out early if the match word was already used, or sits in a
	//   section we never excerpt from (script/style/select/title)
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) ||
	     ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta;
	      a-- ) {
		// . don't include any "dead zone",
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) ||
		     ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}
		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
			       tids[a-1] == TAG_TR ||
			       tids[a-1] == TAG_P  ||
			       tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}
		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart = true;
			break;
		}
		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) &&
		     !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}
		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	// . extend the right fence "b" forward while we stay within the
	//   excerpt budget and don't hit used/title words
	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}
		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}
		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}
		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}
		if ( wids[b] ) {
			wordCount++;
		}
		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
			       tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}
		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
			       tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		while ( b > matchWordNum && !wids[b-2] &&
			endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector",
		// like apostrophe
		while ( b > matchWordNum &&
			(bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as int32_t as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug: accumulates a printable trace of the window's words and
	// per-word scores, logged at the bottom
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&
			     wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){
			g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, make it 10000 pts
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same
				// window, it may not give a good summary.
				// give a heavy penalty
				t -= 200000;
			}
		}
		else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",
				       t,qwn, m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// remember the pre-bonus score for the debug log below
	// NOTE(review): narrows int64_t to int32_t -- only used for logging
	int32_t oldScore = score;

	// apply the bonus if it starts or a sentence
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this a 20k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		}
		else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the
		// start of a sentence, lets say 3 alphawords
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 5M.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32
		    " a=%05" PRId32" b=%05" PRId32" %s",
		    (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		    xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}