// . return ptr to the buffer we serialize into // . return NULL and set g_errno on error bool Msg20Reply::sendReply ( XmlDoc *xd ) { // get it UdpSlot *slot = (UdpSlot *)xd->m_slot; if ( g_errno ) { // extract titleRec ptr log("query: Had error generating msg20 reply for d=%lli: " "%s",m_docId, mstrerror(g_errno)); // don't forget to delete this list haderror: mdelete ( xd, sizeof(XmlDoc) , "Msg20" ); delete ( xd ); g_udpServer.sendErrorReply ( slot , g_errno ) ; return true; } // now create a buffer to store title/summary/url/docLen and send back long need = getStoredSize(); char *buf = (char *)mmalloc ( need , "Msg20Reply" ); if ( ! buf ) goto haderror; // should never have an error! long used = serialize ( buf , need ); // sanity if ( used != need ) { char *xx=NULL;*xx=0; } // sanity check, no, might have been banned/filtered above around // line 956 and just called sendReply directly //if ( st->m_memUsed == 0 ) { char *xx=NULL;*xx=0; } // use blue for our color long color = 0x0000ff; // but use dark blue for niceness > 0 if ( xd->m_niceness > 0 ) color = 0x0000b0; //Msg20Reply *tt = (Msg20Reply *)buf; // sanity check if ( ! xd->m_utf8ContentValid ) { char *xx=NULL;*xx=0; } // for records long clen = 0; if ( xd->m_utf8ContentValid ) clen = xd->size_utf8Content - 1; // show it in performance graph if ( xd->m_startTimeValid ) g_stats.addStat_r ( clen , xd->m_startTime , gettimeofdayInMilliseconds() , color ); // . del the list at this point, we've copied all the data into reply // . this will free a non-null State20::m_ps (ParseState) for us mdelete ( xd , sizeof(XmlDoc) , "xd20" ); delete ( xd ); g_udpServer.sendReply_ass ( buf , need , buf , need , slot ); return true; }
void Msg0::reset ( ) { if ( m_msg5 && m_deleteMsg5 ) { mdelete ( m_msg5 , sizeof(Msg5) , "Msg0" ); delete ( m_msg5 ); } if ( m_msg5b && m_deleteMsg5b ) { mdelete ( m_msg5b , sizeof(Msg5) , "Msg0" ); delete ( m_msg5b ); } m_msg5 = NULL; m_msg5b = NULL; //#ifdef SPLIT_INDEXDB if ( m_replyBuf ) mfree ( m_replyBuf, m_replyBufSize, "Msg0" ); m_replyBuf = NULL; m_replyBufSize = 0; //#endif if ( m_mcasts ) { mfree(m_mcasts,sizeof(Multicast),"msg0mcast"); m_mcasts = NULL; } // no longer do this because we call reset after the msg5 completes // and it was destroying our handylist... so just call freelist // in the destructor now //m_handyList.freeList(); }
void gotMulticastReplyWrapper ( void *state , void *state2 ) { Multicast *mcast = (Multicast *)state; //msg7->gotMsg7Reply(); ImportState *is = mcast->m_importState; is->m_numIn++; log("import: imported %lli docs (off=%lli)", is->m_numIn,is->m_fileOffset); if ( ! is->importLoop() ) return; // we will be called again when this multicast reply comes in... if ( is->m_numIn < is->m_numOut ) return; log("inject: import is done"); CollectionRec *cr = g_collectiondb.getRec ( is->m_collnum ); // signify to qa.cpp that we are done if ( cr ) cr->m_importState = NULL; mdelete ( is, sizeof(ImportState) , "impstate"); delete (is); }
// . send the page held in "sb" back on the socket saved in the state and
//   return whatever sendDynamicPage() returns
// . NOTE(review): "st" is not declared anywhere in this block; it has to
//   come from an enclosing scope that is not visible here
// . NOTE(review): the state is deleted BEFORE "sb" is read; if "sb" lives
//   inside that state this reads freed memory -- confirm who owns "sb"
bool sendReply ( SafeBuf *sb ) {
	// keep the socket, the state is about to go away
	TcpSocket *sock = st->m_s;

	// nuke state60
	mdelete ( st , sizeof(State60) , "turk1" );
	delete (st);

	// the page to send back
	char    *page    = sb->getBufStart();
	int32_t  pageLen = sb->length();

	// do not send a trailing \0 if the buffer ends in one
	if ( pageLen > 0 && page[pageLen-1] == '\0' ) pageLen--;

	// send it and convey the status
	return g_httpServer.sendDynamicPage ( sock,
					      page,
					      pageLen,
					      -1,          // cachetime
					      false,       // POSTReply?
					      "text/html",
					      -1,          // httpstatus
					      NULL,        // cookie
					      "utf8" );    // charset
}
// . free the m_buf1 buffer owned by "st" (if any) and unregister "st"
//   itself via mdelete()
// . a NULL "st" is a no-op
// . NOTE(review): unlike the other mdelete() call sites in this file there
//   is no matching "delete st" here -- confirm the caller destroys it
void Blaster::freeStateBD(StateBD *st){
	if ( st != NULL ) {
		// free stateBD's buf
		if ( st->m_buf1 != NULL )
			mfree(st->m_buf1,st->m_buf1MaxLen,"Blaster5");
		mdelete(st,sizeof(StateBD),"Blaster3");
	}
}
// when XmlDoc::inject() complets it calls this void doneInjectingWrapper10 ( void *state ) { XmlDoc *xd = (XmlDoc *)state; UdpSlot *slot = (UdpSlot *)xd->m_slot; long err = g_errno; mdelete ( xd, sizeof(XmlDoc) , "PageInject" ); delete (xd); g_errno = err; sendReply ( slot ); }
// when XmlDoc::inject() complets it calls this void doneInjectingWrapper10 ( void *state ) { XmlDoc *xd = (XmlDoc *)state; UdpSlot *slot = (UdpSlot *)xd->m_slot; int32_t err = g_errno; mdelete ( xd, sizeof(XmlDoc) , "PageInject" ); delete (xd); g_errno = err; if ( g_errno ) g_udpServer.sendErrorReply(slot,g_errno); else g_udpServer.sendReply_ass(NULL,0,NULL,0,slot); }
// reset rdb void Collectiondb::reset() { log("db: resetting collectiondb."); for ( long i = 0 ; i < m_numRecs ; i++ ) { if ( ! m_recs[i] ) continue; mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" ); delete ( m_recs[i] ); m_recs[i] = NULL; } m_numRecs = 0; m_numRecsUsed = 0; }
// send back a reply to the originator of the msg7 injection request void sendUdpReply7 ( void *state ) { XmlDoc *xd = (XmlDoc *)state; // remove from linked list if ( xd->m_nextInject ) xd->m_nextInject->m_prevInject = xd->m_prevInject; if ( xd->m_prevInject ) xd->m_prevInject->m_nextInject = xd->m_nextInject; if ( s_injectHead == xd ) s_injectHead = xd->m_nextInject; if ( s_injectTail == xd ) s_injectTail = xd->m_prevInject; xd->m_nextInject = NULL; xd->m_prevInject = NULL; UdpSlot *slot = xd->m_injectionSlot; uint32_t statColor = 0xccffcc; if(xd->m_indexCode) { statColor = 0xaaddaa;//0x4e99e9; } g_stats.addStat_r ( xd->m_rawUtf8ContentSize, xd->m_injectStartTime, gettimeofdayInMilliseconds(), statColor ); // injecting a warc seems to not set m_indexCodeValid to true // for the container doc... hmmm... int32_t indexCode = -1; int64_t docId = 0; if ( xd && xd->m_indexCodeValid ) indexCode = xd->m_indexCode; if ( xd && xd->m_docIdValid ) docId = xd->m_docId; mdelete ( xd, sizeof(XmlDoc) , "PageInject" ); delete (xd); if ( g_errno ) { g_udpServer.sendErrorReply(slot,g_errno); return; } // just send back the 4 byte indexcode, which is 0 on success, // otherwise it is the errno char *tmp = slot->m_tmpBuf; char *p = tmp; memcpy ( p , (char *)&indexCode , 4 ); p += 4; memcpy ( p , (char *)&docId , 8 ); p += 8; g_udpServer.sendReply_ass(tmp,(p-tmp),NULL,0,slot); }
void doneInjectingLinksWrapper ( void *state ) { Msg7 *msg7 = (Msg7 *)state; SafeBuf *sb = &msg7->m_sb; // copy the serps into ou rbuf if ( ! g_errno ) { // print header if ( sb->length() == 0 ) { // print header of page sb->safePrintf("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); } // serp header if ( msg7->m_round == 1 ) sb->safePrintf("\t<googleResults>\n"); else sb->safePrintf("\t<bingResults>\n"); // print results sb->safeMemcpy(&msg7->m_xd.m_serpBuf); // end that if ( msg7->m_round == 1 ) sb->safePrintf("\t</googleResults>\n"); else sb->safePrintf("\t</bingResults>\n"); } // do bing now if ( msg7->m_round == 1 ) { // return if it blocks if ( ! msg7->scrapeQuery() ) return; } // otherwise, parse out the search results so steve can display them if ( g_errno ) sb->safePrintf("<error><![CDATA[%s]]></error>\n", mstrerror(g_errno)); // print header of page sb->safePrintf("</response>\n"); // page is not more than 32k //char buf[1024*32]; //char *p = buf; // return docid and hostid //p += sprintf ( p , "scraping status "); // print error msg out, too or "Success" //p += sprintf ( p , "%s", mstrerror(g_errno)); TcpSocket *sock = msg7->m_socket; g_httpServer.sendDynamicPage ( sock, sb->getBufStart(), sb->length(), -1/*cachetime*/); // hopefully sb buffer is copied becaues this will free it: mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); }
/*******************************************************************
 Part of the sub-level kurtosis computation:
     sum(Zjk*ones(1,p).*(data_proj))./sum(Zjk)

 mZjk    : weights, multiplied element-wise with "data"
 sumZjk  : divisor applied to every entry of the result
 data    : input matrix
 meanZjk : output; receives msum(..., 'c', ...) of the weighted data,
           divided by sumZjk
*******************************************************************/
void kurtmodel(matrix *mZjk, double sumZjk, matrix *data, vector *meanZjk) {
	// scratch matrix shaped like the data
	matrix weighted;
	mnew(&weighted, data->m, data->n);

	// weight the data, then reduce it into meanZjk
	mmDotMul(mZjk, data, &weighted);
	msum(&weighted, 'c', meanZjk);

	// normalise every entry by the sum of the weights
	for (int k = 0; k < (meanZjk->l); k++) {
		meanZjk->pr[k] /= sumZjk;
	}

	mdelete(&weighted);
}
/*
 * Exercise the map: insert INDEX[0..n) via mnew(), reshuffle, then delete
 * a random number of entries (fewer than n/2) via mdelete().
 *
 * m     : map under test
 * n     : number of entries to insert
 * start : forwarded to init()
 *
 * When n/2 is zero nothing is deleted; the original computed
 * rand() % (n/2) unconditionally, which is a modulo by zero in that case.
 */
static void test(struct map *m , int n, int start) {
	init(n,start);
	shuffle(n);
	int i;
	for (i=0;i<n;i++) {
		mnew(m,INDEX[i]);
	}
	shuffle(n);
	/* guard the modulus: n/2 == 0 would be undefined behaviour */
	int half = n/2;
	n = (half != 0) ? rand() % half : 0;
	for (i=0;i<n;i++) {
		mdelete(m,INDEX[i]);
	}
}
// this must always be called sometime AFTER handleRequest() is called void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , int32_t replyLen , int32_t replyMaxSize , bool hadError ) { // debug msg if ( g_conf.m_logDebugQuery || (msg39&&msg39->m_debug) ) logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] " "Sending reply len=%"INT32".", (PTRTYPE)msg39,replyLen); // sanity if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; } // no longer in use. msg39 will be NULL if ENOMEM or something if ( msg39 ) msg39->m_inUse = false; // . if we enter from a local call and not from handling a udp slot // then execute this logic here to return control to caller. // . do not delete ourselves because we will be re-used probably and // caller handles that now. if ( msg39 && msg39->m_callback ) { // if we blocked call user callback if ( msg39->m_blocked ) msg39->m_callback ( msg39->m_state ); // if not sending back a udp reply, return now return; } // . now we can free the lists before sending // . may help a little bit... //if ( msg39 ) { // for ( int32_t j = 0 ; j < msg39->m_msg2.m_numLists ; j++ ) // msg39->m_lists[j].freeList(); //} // get the appropriate UdpServer for this niceness level UdpServer *us = &g_udpServer; // i guess clear this int32_t err = g_errno; g_errno = 0; // send an error reply if g_errno is set if ( err ) us->sendErrorReply ( slot , err ) ; else us->sendReply_ass ( reply , replyLen , reply , replyMaxSize , slot ); // always delete ourselves when done handling the request if ( msg39 ) { mdelete ( msg39 , sizeof(Msg39) , "Msg39" ); delete (msg39); } }
/*******************************************************************
 Complex counterpart of kurtmodel().

 mZjk                   : real weights; a scratch matrix of the same
                          shape is passed to cmmDotMul() as their
                          imaginary part
 sumZjk                 : divisor applied to every output entry
 data_re, data_im       : real and imaginary parts of the input
 meanZjk_re, meanZjk_im : outputs; msum(..., 'c', ...) of the weighted
                          parts, divided by sumZjk
*******************************************************************/
void ckurtmodel(matrix *mZjk, double sumZjk, matrix *data_re, matrix *data_im, vector *meanZjk_re, vector *meanZjk_im) {
	matrix prod_re;
	matrix prod_im;
	matrix weight_im;

	// scratch storage for the product and for the imaginary weights
	mnew(&prod_re, data_re->m, data_re->n);
	mnew(&prod_im, data_im->m, data_im->n);
	mnew(&weight_im, mZjk->m, mZjk->n);

	// element-wise complex product, then reduce each part
	cmmDotMul(mZjk, &weight_im, data_re, data_im, &prod_re, &prod_im);
	msum(&prod_re, 'c', meanZjk_re);
	msum(&prod_im, 'c', meanZjk_im);

	// normalise both parts; the real vector's length drives the loop
	for (int k = 0; k < (meanZjk_re->l); k++) {
		meanZjk_re->pr[k] /= sumZjk;
		meanZjk_im->pr[k] /= sumZjk;
	}

	mdelete(&prod_re);
	mdelete(&prod_im);
	mdelete(&weight_im);
}
// . this may be called from a signal handler // . we call from a signal handler to keep msg21 zippy // . this may be called twice, onece from sig handler and next time not // from the sig handler void doneSending_ass ( void *state , UdpSlot *slot ) { // point to our state State00 *st0 = (State00 *)state; // this is nULL if we hit the cache above if ( ! st0 ) return; // this might be inaccurate cuz sig handler can't call it! int64_t now = gettimeofdayInMilliseconds(); // log the stats if ( g_conf.m_logTimingNet ) { double mbps ; mbps = (((double)slot->m_sendBufSize) * 8.0 / (1024.0*1024.0))/ (((double)slot->m_startTime)/1000.0); log("net: msg0: Sent %"INT32" bytes of data in %"INT64" ms (%3.1fMbps) " "(niceness=%"INT32").", slot->m_sendBufSize , now - slot->m_startTime , mbps , st0->m_niceness ); } // can't go any further if we're in a sig handler //if ( g_inSigHandler ) return; // . mark it in pinkish purple // . BUT, do not add stats here for tagdb, we get WAY too many lookups // and it clutters the performance graph if ( st0->m_rdbId == RDB_TAGDB ) { } else if(slot->m_niceness > 0) { g_stats.addStat_r ( slot->m_sendBufSize , st0->m_startTime , now , //"transmit_data_nice", 0x00aa00aa); } else { g_stats.addStat_r ( slot->m_sendBufSize , st0->m_startTime , now , //"transmit_data", 0x00ff00ff ); } // release st0 now mdelete ( st0 , sizeof(State00) , "Msg0" ); delete ( st0 ); }
void Msg7::reset() { m_round = 0; //if ( m_inUse ) { char *xx=NULL;*xx=0; } //m_firstTime = true; //m_fixMe = false; //m_injectCount = 0; //m_start = NULL; m_sbuf.reset(); //m_isDoneInjecting = false; if ( m_xd ) { mdelete ( m_xd, sizeof(XmlDoc) , "PageInject" ); delete (m_xd); m_xd = NULL; } if ( m_sir ) { mfree ( m_sir , m_sirSize , "m7ir" ); m_sir = NULL; } }
// returns true bool sendErrorReply ( void *state , long err ) { // ensure this is set if ( ! err ) { char *xx=NULL;*xx=0; } // get it State8 *st = (State8 *)state; // get the tcp socket from the state TcpSocket *s = st->m_s; char tmp [ 1024*32 ] ; sprintf ( tmp , "<b>had server-side error: %s</b><br>", mstrerror(g_errno)); // nuke state8 mdelete ( st , sizeof(State8) , "PageGet1" ); delete (st); // erase g_errno for sending //g_errno = 0; // . now encapsulate it in html head/tail and send it off //return g_httpServer.sendDynamicPage ( s , tmp , gbstrlen(tmp) ); return g_httpServer.sendErrorReply ( s, err, mstrerror(err) ); }
void doneReindexing ( void *state ) { // cast it State13 *st = (State13 *)state; GigablastRequest *gr = &st->m_gr; // note it if ( gr->m_query && gr->m_query[0] ) log(LOG_INFO,"admin: Done with query reindex. %s", mstrerror(g_errno)); //// // // print the html page // ///// HttpRequest *hr = &gr->m_hr; char format = hr->getReplyFormat(); SafeBuf sb; const char *ct = "text/html"; if ( format == FORMAT_JSON ) ct = "application/json"; if ( format == FORMAT_XML ) ct = "text/xml"; if ( format == FORMAT_XML ) { sb.safePrintf("<response>\n" "\t<statusCode>0</statusCode>\n" "\t<statusMsg>Success</statusMsg>\n" "\t<matchingResults>%" PRId32"</matchingResults>\n" "</response>" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } if ( format == FORMAT_JSON ) { sb.safePrintf("{\"response\":{\n" "\t\"statusCode\":0,\n" "\t\"statusMsg\":\"Success\",\n" "\t\"matchingResults\":%" PRId32"\n" "}\n" "}\n" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr ); sb.safePrintf("<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); // // print error msg if any // if ( gr->m_query && gr->m_query[0] && ! g_errno ) sb.safePrintf ( "<center><font color=red><b>Success. " "Added %" PRId32" docid(s) to " "spider queue.</b></font></center><br>" , st->m_msg1c.m_numDocIdsAdded ); if ( gr->m_query && gr->m_query[0] && g_errno ) sb.safePrintf ( "<center><font color=red><b>Error. 
" "%s</b></font></center><br>" , mstrerror(g_errno)); // print the reindex interface g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); }
// . send the http reply for an injection request; "state" is the Msg7
// . if msg7->m_quickReply is set the reply is one short line: either
//   "0,docId=...,hostId=...," or "<errno>,0,0,", followed by
//   mstrerror(g_errno)
// . otherwise the full html injection form is printed, preceded by an
//   error banner when g_errno is set
// . the Msg7 is deleted on both paths before the page is sent; the page
//   lives in a 32k stack buffer and the socket is saved up front
// . returns whatever g_httpServer.sendDynamicPage() returns
// . NOTE(review): every sprintf() below writes into "buf" with no bounds
//   check; "pend" is only handed to printAdminTop()
bool sendReply ( void *state ) {
	// get the state properly
	Msg7 *msg7= (Msg7 *) state;
	// extract info from state
	TcpSocket *s = msg7->m_socket;
	XmlDoc *xd = &msg7->m_xd;
	// log it
	//if ( msg7->m_url[0] ) xd->logIt();
	// msg7 has the docid for what we injected, iff g_errno is not set
	//long long docId = msg7->m_msg7.m_docId;
	//long hostId = msg7->m_msg7.m_hostId;
	long long docId = xd->m_docId;
	// hostId is always reported as 0 now
	long hostId = 0;//msg7->m_msg7.m_hostId;
	//
	// debug
	//
	/*
	// now get the meta list, in the process it will print out a
	// bunch of junk into msg7->m_pbuf
	if ( xd->m_docId ) {
		char *metalist = xd->getMetaList ( 1,1,1,1,1,1 );
		if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;}
		// print it out
		SafeBuf *pbuf = &msg7->m_sbuf;
		xd->printDoc( pbuf );
		bool status = g_httpServer.sendDynamicPage( msg7->m_socket ,
							   pbuf->getBufStart(),
							   pbuf->length() ,
							   -1, //cachtime
							   false ,//postreply?
							   NULL, //ctype
							   -1 , //httpstatus
							   NULL,//cookie
							   "utf-8");
		// delete the state now
		mdelete ( st , sizeof(Msg7) , "PageInject" );
		delete (st);
		// return the status
		return status;
	}
	*/
	//
	// end debug
	//
	// page is not more than 32k
	char buf[1024*32];
	// . if we're talking w/ a robot he doesn't care about this crap
	// . send him back the error code (0 means success)
	if ( msg7->m_quickReply ) {
		char *p = buf;
		// set g_errno to index code, but never overwrite an
		// error that is already pending
		if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
			g_errno = xd->m_indexCode;
		// return docid and hostid
		if ( ! g_errno ) p += sprintf ( p , "0,docId=%lli,hostId=%li," , docId , hostId );
		// print error number here
		else p += sprintf ( p , "%li,0,0,", (long)g_errno );
		// print error msg out, too or "Success"
		p += sprintf ( p , "%s", mstrerror(g_errno));
		// the state is no longer needed; "s" and "buf" outlive it
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage ( s, buf , gbstrlen(buf) , -1/*cachetime*/);
	}
	// get an active ptr into buf
	char *p = buf;
	char *pend = buf + 1024*32;
	// print admin bar
	p = g_pages.printAdminTop ( p , pend , PAGE_INJECT,
				    NULL, // msg7->m_username ,
				    msg7->m_coll ,
				    NULL , // pwd
				    s->m_ip );
	// if there was an error let them know
	char msg[1024];
	char *pm = "";
	if ( g_errno ) {
		sprintf ( msg ,"Error injecting url: <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
		pm = msg;
	}
	//else if ( msg7->m_injected )
	//	pm = "url successfully injected";
	// bail if not enabled
	//if ( ! g_conf.m_injectionEnabled ) {
	//	sprintf ( msg ,"<font color=red>URL injection is disabled "
	//		  "in the Master Controls</font>");
	//	pm = msg;
	//}
	//char *c = msg7->m_coll;
	// "bb" stays empty; it fills the %s after "Inject URL" below
	char bb [ MAX_COLL_LEN + 60 ];
	bb[0]='\0';
	//if ( c && c[0] ) sprintf ( bb , " (%s)", c);
	// . make a table, each row will be an injectable parameter
	// . the five %s are, in order: pm, LIGHT_BLUE, DARK_BLUE, bb and
	//   msg7->m_coll
	// . NOTE(review): the <FORM> open tag is commented out here but
	//   "</form>" is still printed further down
	sprintf ( p ,
		  "<center>"
		  "<b>%s</b>\n\n" // the url msg
		  //"<FORM method=POST action=/inject>\n\n"
		  //"<input type=hidden name=pwd value=\"%s\">\n"
		  //"<input type=hidden name=username value=\"%s\">\n"
		  "<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
		  "<tr><td bgcolor=#%s colspan=2>"
		  "<center>"
		  //"<font size=+1>"
		  "<b>"
		  "Inject URL</b>%s"
		  //"</font>"
		  "<br>"
		  //"Enter the information below to inject "
		  //"a URL. This allows you to specify the URL as well as the "
		  //"content for the URL."
		  "</td></tr>\n\n"
		  "<tr><td><b>url</b></td>"
		  "<td>\n"
		  "<input type=text name=u value=\"\" size=50>"
		  "</td></tr>\n\n"
		  "<tr><td><b>query to scrape</b></td>"
		  "<td>\n"
		  "<input type=text name=qts value=\"\" size=50>"
		  "</td></tr>\n\n"
		  //"<tr><td><b>use ahrefs.com</b></td>"
		  //"<td>\n"
		  //"<input type=radio name=useahrefs value=0 checked>no "
		  //"<input type=radio name=useahrefs value=1>yes "
		  //"</td></tr>\n\n"
		  "<tr><td><b>spider links</b></td>"
		  "<td>\n"
		  "<input type=radio name=spiderlinks value=0>no "
		  "<input type=radio name=spiderlinks value=1 checked>yes "
		  "<br>"
		  "<font size=1>Should we add the page's outlinks to "
		  "spiderdb for spidering? "
		  "Default: yes"
		  "</font>"
		  "</td></tr>\n\n"
		  "<tr><td><b>inject scraped links</b></td>"
		  "<td>\n"
		  "<input type=radio name=injectlinks value=0 checked>no "
		  "<input type=radio name=injectlinks value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td><b>collection</b></td>"
		  "<td>\n"
		  "<input type=text name=c value=\"%s\" size=15>"
		  "</td></tr>\n\n"
		  "<tr><td><b>quick reply?</b><br>"
		  "<font size=1>Should reply be short? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=quick value=0 checked>no "
		  "<input type=radio name=quick value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td><b>only inject new docs?</b><br>"
		  "<font size=1>Skips injection if docs already indexed. "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=newonly value=0 checked>no "
		  "<input type=radio name=newonly value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td><b>delete?</b><br>"
		  "<font size=1>Should this url be deleted from the index? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=delete value=0 checked>no "
		  "<input type=radio name=delete value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td><b>recycle content?</b><br>"
		  "<font size=1>Should page content be recycled if "
		  "reindexing? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=recycle value=0 checked>no "
		  "<input type=radio name=recycle value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td><b>ip</b><br>"
		  "<font size=1>IP address of the url. If blank then "
		  "Gigablast will look up. "
		  "Default: blank"
		  "</td>"
		  "<td>\n<input type=text name=ip value=\"\" size=15>"
		  "</td></tr>\n\n"
		  /*
		  "<tr><td><b>do ip lookups?</b><br>"
		  "<font size=1>Should Gigablast look up the IP address "
		  "of the url, if it is not provided. "
		  "Default: yes"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=iplookups value=0>no "
		  "<input type=radio name=iplookups value=1 checked>yes "
		  "</td></tr>\n\n"
		  */
		  //"<tr><td><b>is url new?</b><br>"
		  //"<font size=1>Is this url new to the index? If unsure "
		  //"then you should say no here. "
		  //"Default: yes"
		  //"</td>"
		  //"<td>\n"
		  //"<input type=radio name=isnew value=0>no "
		  //"<input type=radio name=isnew value=1 checked>yes "
		  //"</td></tr>\n\n"
		  "<tr><td><b>dedup?</b><br>"
		  "<font size=1>Should this url be skipped if there is "
		  "already a url in the index from this same domain with "
		  "this same content? "
		  "Default: yes"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=dedup value=0>no "
		  "<input type=radio name=dedup value=1 checked>yes "
		  "</td></tr>\n\n" ,
		  //"<tr><td><b>ruleset</b><br>"
		  //"<font size=1>Use this ruleset to index the URL. "
		  //"Default: auto"
		  //"</td>"
		  //"<td>\n<select name=rs>" ,
		  pm , // msg7->m_pwd ,
		  //msg7->m_username,
		  LIGHT_BLUE , DARK_BLUE , bb , msg7->m_coll );
	// advance past what was just printed
	p += gbstrlen(p);
	// . print pulldown menu of different site filenums
	// . 0 - default site
	// . 1 - banned site
	// . 2 - bad site
	// . 3 - decent site
	// . 4 - good site
	// . 5 - super site
	/*
	for ( long i = 0 ; i < 10000 ; i++ ) {
		Xml *xml = g_tagdb.getSiteXml(i, msg7->m_coll,
					      gbstrlen(msg7->m_coll));
		if ( ! xml ) break;
		long slen;
		char *s = xml->getString ( "name" , &slen );
		if ( s && slen > 0 ) {
			char c = s[slen];
			s[slen] = '\0';
			sprintf ( p , "<option value=%li>%s", i , s );
			s[slen] = c;
		}
		else
			sprintf ( p , "<option value=%li>#%li", i , i );
		p += gbstrlen ( p );
	}
	// end the pull-down menu
	sprintf ( p , "</select></td></tr>\n\n" );
	p += gbstrlen ( p );
	*/
	// . second half of the table: the mime flag and the content box
	// . NOTE(review): the help text printed under "content has mime" is
	//   the same wording as the "ip" row above -- looks like a
	//   copy/paste leftover, confirm the intended description
	sprintf ( p ,
		  "<tr><td><b>content has mime</b><br>"
		  "<font size=1>IP address of the url. If blank then "
		  "Gigablast will look up. "
		  "Default: blank"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=hasmime value=0 checked>no "
		  "<input type=radio name=hasmime value=1>yes "
		  "</td></tr>\n\n"
		  "<tr><td colspan=2>"
		  "<center>"
		  "<b>content</b><br>"
		  "<font size=1>Enter the content here. Enter MIME header "
		  "first if \"content has mime\" is set to true above. "
		  "Separate MIME from actual content with two returns."
		  "<br>"
		  "<input type=submit value=Submit>"
		  "<br>"
		  "\n"
		  "<textarea rows=32 cols=80 name=content>"
		  "</textarea>"
		  "<br>"
		  "<br>\n\n"
		  "<input type=submit value=Submit>"
		  "</center>"
		  "</td></tr></table>\n"
		  "</form>\n"
		  );
	p += gbstrlen ( p );
	p += sprintf(p, "\n</body>\n</html>\n");
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p , true /*adminLink?*/);
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// calculate buffer length
	long bufLen = p - buf;
	// nuke state; "s" and "buf" do not depend on it
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	// . i thought we need -2 for cacheTime, but i guess not
	return g_httpServer.sendDynamicPage (s, buf, bufLen, -1/*cachetime*/);
}
/*
 * Visitor callback: print the object's id followed by a space, then remove
 * that id from the map passed through "ud".
 */
static void check(void *ud, struct object * obj) {
	struct map * target = ud;

	printf("%u ",obj->id);
	mdelete(target, obj->id);
}
// . callback invoked with "state" pointing at the State00 for a request
// . on error (g_errno set) the state is freed and an error reply is sent
// . otherwise the list held in the state is sent back on the udp slot;
//   linkdb lists are first compacted in place (see below)
// . the "listb" and "msg5xx" parameters are not used; the list and msg5
//   embedded in the state are used instead
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN" );
	// get the state
	State00 *st0 = (State00 *)state;
	// extract the udp slot and list and msg5
	UdpSlot *slot = st0->m_slot;
	RdbList *list = &st0->m_list;
	Msg5 *msg5 = &st0->m_msg5;
	UdpServer *us = st0->m_us;
	// timing debug
	if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
		//log("Msg0:hndled request %" PRIu64,gettimeofdayInMilliseconds());
		int32_t size = -1;
		// "list" is the address of a member, so this is always taken
		if ( list ) size = list->getListSize();
		log(LOG_TIMING|LOG_DEBUG,
		    "net: msg0: Handled request for data. "
		    "Now sending data termId=%" PRIu64" size=%" PRId32
		    " transId=%" PRId32" ip=%s port=%i took=%" PRId64" "
		    "(niceness=%" PRId32").",
		    g_posdb.getTermId(msg5->m_startKey),
		    size,slot->m_transId,
		    iptoa(slot->m_ip),slot->m_port,
		    gettimeofdayInMilliseconds() - st0->m_startTime ,
		    st0->m_niceness );
	}
	// on error nuke the list and it's data
	if ( g_errno ) {
		// "us" and "slot" were copied out above, so they remain
		// usable after the state is deleted
		mdelete ( st0 , sizeof(State00) , "Msg0" );
		delete (st0);
		// TODO: free "slot" if this send fails
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	QUICKPOLL(st0->m_niceness);
	// point to the serialized list in "list"
	char *data = list->getList();
	int32_t dataSize = list->getListSize();
	char *alloc = list->getAlloc();
	int32_t allocSize = list->getAllocSize();
	// tell list not to free the data since it is a reply so UdpServer
	// will free it when it destroys the slot
	list->setOwnData ( false );
	// keep track of stats
	Rdb *rdb = getRdbFromId ( st0->m_rdbId );
	if ( rdb ) rdb->sentReplyGet ( dataSize );
	// TODO: can we free any memory here???
	// . keep track of how long it takes to complete the send
	// . m_startTime is overwritten here with the time the send begins
	st0->m_startTime = gettimeofdayInMilliseconds();
	// debug point
	int32_t oldSize = msg5->m_minRecSizes;
	int32_t newSize = msg5->m_minRecSizes + 20;
	// watch for wrap around
	// NOTE(review): this test relies on the int32_t addition above
	// wrapping, which is signed overflow -- confirm it is reliable
	if ( newSize < oldSize ) newSize = 0x7fffffff;
	if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
	     // do not annoy me with these linkdb msgs
	     dataSize > newSize+100 )
		log(LOG_LOGIC,"net: msg0: Sending more data than what was "
		    "requested. Ineffcient. Bad engineer. dataSize=%" PRId32" "
		    "minRecSizes=%" PRId32".",dataSize,oldSize);
	//
	// for linkdb lists, remove all the keys that have the same IP32
	// and store a count of what we removed somewhere
	//
	if ( st0->m_rdbId == RDB_LINKDB ) {
		// . store compressed list on itself
		// . "dst" is the write position inside the list's own buffer
		char *dst = list->m_list;
		// keep stats
		int32_t totalOrigLinks = 0;
		int32_t ipDups = 0;
		int32_t lastIp32 = 0;
		char *listEnd = list->getListEnd();
		// compress the list
		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// count it
			totalOrigLinks++;
			// get rec
			char *rec = list->getCurrentRec();
			int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
			// same as one before?
			if ( ip32 == lastIp32 &&
			     // are we the last rec? include that for
			     // advancing the m_nextKey in Linkdb more
			     // efficiently.
			     rec + LDBKS < listEnd ) {
				ipDups++;
				continue;
			}
			// store it: only LDBKS bytes per kept record
			gbmemcpy (dst , rec , LDBKS );
			dst += LDBKS;
			// update it
			lastIp32 = ip32;
		}
		// . if we removed one key, store the stats
		// . caller should recognize reply is not a multiple of
		//   the linkdb key size LDBKS and no its there!
		// . the stores are commented out, so nothing is appended
		if ( ipDups ) {
			//*(int32_t *)dst = totalOrigLinks;
			//dst += 4;
			//*(int32_t *)dst = ipDups;
			//dst += 4;
		}
		// update list parms to describe the compacted list
		list->m_listSize = dst - list->m_list;
		list->m_listEnd = list->m_list + list->m_listSize;
		data = list->getList();
		dataSize = list->getListSize();
	}
	//log("sending replySize=%" PRId32" min=%" PRId32,dataSize,msg5->m_minRecSizes);
	// . TODO: dataSize may not equal list->getListMaxSize() so
	//   Mem class may show an imblanace
	// . now g_udpServer is responsible for freeing data/dataSize
	// . the "true" means to call doneSending_ass() from the signal handler
	//   if need be
	// . st0 is passed as the state handed to doneSending_ass()
	st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true );
	logTrace( g_conf.m_logTraceMsg0, "END" );
}
/******************************************************************* Subroutine to compute the inverse matrix and determinant matrix *cov: the pointer to the covariance matrix matrix *inv_cov: the pointer to the inverse covariance matrix matrix *cov_mat: the pointer to the approximate covariance matrix when singular. If unsingular, it equals to cov double *det_cov: the pointer to determinant return value: '1' - successfully exit '0' - exit with waring/error *******************************************************************/ int veCov(matrix *cov, matrix *inv_cov, matrix *cov_mat, double *det_cov) { int i, j; matrix eigvec_re; matrix eigvec_im; vector eigval_re; vector eigval_im; int *eig_order; int eig_info; int num_v; // the number of eigenvalue int rank_c; double sum_v; double factor = 0.02; double ass_value; double min_real; mnew(&eigvec_re, cov->m, cov->n); mnew(&eigvec_im, cov->m, cov->n); vnew(&eigval_re, cov->n); vnew(&eigval_im, cov->n); eig_order = new int[cov->n]; // the eigenvector and eigenvalue of covariance matrix eig_info = eig(cov, &eigvec_re, &eigvec_im, &eigval_re, &eigval_im); //vprint(&eigval_re); //vprint(&eigval_im); if (!eig_info) { printf(" The eigenvalue computation failed! \n"); return 0; //.... } // the rank of covariance matrix num_v = cov->n; /*rank_c = num_v; for (i=0; i<num_v; i++) { if ((fabs(*(eigval_re.pr+i)) < ZEROTHRESH) && (fabs(*(eigval_im.pr+i)) < ZEROTHRESH)) { rank_c--; } } printf("rank = %d", rank_c);*/ rank_c = rank(cov, TOLERANCE); // compute the inverse and determinate if (rank_c == num_v) { // nonsingular inv(cov, inv_cov); mcopy(cov, cov_mat); *det_cov = det(cov); } else { // singular min_real = pow(10, (((double)-250) / ((double) cov->m))); /*for (i=0; i<num_v; i++) { if ((*(eigval_re.pr+i) < ZEROTHRESH) || (*(eigval_im.pr+i) != 0)) { *(eigval_re.pr+i) = 0; // ???? 
keep the real part of complex or not *(eigval_im.pr+i) = 0; } } sort(&eigval_re, eig_order, 'd'); */ for (i=0; i<num_v; i++) { // when negtive real eigenvalue, change to absolute value // to ensure all the real eigenvalues are positive if ((eigval_re.pr[i] < 0) && (eigval_im.pr[i] == 0)) { eigval_re.pr[i] *= -1; // the i-th column of eigenvector should also be changed the sign for (j=0; j<(eigvec_re.m); j++) { eigvec_re.pr[j*(eigvec_re.n)+i] *= -1; } } } //vprint(&eigval_re); //vprint(&eigval_im); // sort real eigenvalues descendingly, put complex ones at the end sorteig(&eigval_re, &eigval_im, eig_order); for (i=rank_c; i<num_v; i++) { *(eigval_re.pr+i) = 0; *(eigval_im.pr+i) = 0; } //vprint(&eigval_re); //vprint(&eigval_im); sum_v = vsum(&eigval_re); ass_value = factor * sum_v / (num_v - rank_c); if (ass_value < (0.5 * (*(eigval_re.pr+rank_c)) * (1 - factor))) { if (ass_value > min_real) { for (i=rank_c; i<num_v; i++) { *(eigval_re.pr+i) = ass_value; } for (i=0; i<rank_c; i++) { *(eigval_re.pr+i) *= 1 - factor; } } else { for (i=rank_c; i<num_v; i++) { *(eigval_re.pr+i) = min_real; } } } else { ass_value = 0.5 * (*(eigval_re.pr+rank_c)) * (1 - factor); if (ass_value > min_real) { for (i=rank_c; i<num_v; i++) { *(eigval_re.pr+i) = ass_value; } for (i=0; i<rank_c; i++) { *(eigval_re.pr+i) = *(eigval_re.pr+i) - ass_value * (num_v - rank_c) * (*(eigval_re.pr+i)) / sum_v; } } else { for (i=rank_c; i<num_v; i++) { *(eigval_re.pr+i) = min_real; } } } //vprint(&eigval_re); //vprint(&eigval_im); matrix eigvec_re_sorted; matrix eigvec_re_sorted_t; mnew(&eigvec_re_sorted, num_v, num_v); mnew(&eigvec_re_sorted_t, num_v, num_v); sortcols(eig_order, &eigvec_re, &eigvec_re_sorted); transpose(&eigvec_re_sorted, &eigvec_re_sorted_t); matrix inv_eig_vl_s; mnew(&inv_eig_vl_s, num_v, num_v); for (i=1; i<num_v; i++) { *(inv_eig_vl_s.pr + i*num_v + i) = 1 / (*(eigval_re.pr+i)); } matrix tmp; mnew(&tmp, num_v, num_v); mmMul(&eigvec_re_sorted, &inv_eig_vl_s, &tmp); mmMul(&tmp, 
&eigvec_re_sorted_t, inv_cov); matrix diag_eigval; mnew(&diag_eigval, num_v, num_v); for (i=0; i<num_v; i++) { *(diag_eigval.pr + i*num_v + i) = *(eigval_re.pr+i); } mmMul(&eigvec_re_sorted, &diag_eigval, &tmp); mmMul(&tmp, &eigvec_re_sorted_t, cov_mat); *det_cov = 1; for (i=0; i<num_v; i++) { *det_cov = (*det_cov) * (*(eigval_re.pr+i)); } mdelete(&inv_eig_vl_s); mdelete(&eigvec_re_sorted); mdelete(&eigvec_re_sorted_t); mdelete(&tmp); mdelete(&diag_eigval); } #ifdef _DEBUG printf("rank = %d \n", rank_c); printf("\n det_cov = %e \n", *det_cov); printf("inv_cov = \n"); mprint(inv_cov); printf("cov_mat = \n"); mprint(cov_mat); #endif mdelete(&eigvec_re); mdelete(&eigvec_im); vdelete(&eigval_re); vdelete(&eigval_im); delete []eig_order; return 1; }
// . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageGet ( TcpSocket *s , HttpRequest *r ) { // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // ensure collection not too big if ( collLen >= MAX_COLL_LEN ) { g_errno = ECOLLTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection rec CollectionRec *cr = g_collectiondb.getRec ( coll ); if ( ! cr ) { g_errno = ENOCOLLREC; log("query: Archived copy retrieval failed. " "No collection record found for " "collection \"%s\".",coll); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // does this collection ban this IP? if ( ! cr->hasSearchPermission ( s ) ) { g_errno = ENOPERM; //log("PageGet::sendDynamicReply0: permission denied for %s", // iptoa(s->m_ip) ); g_msg = " (error: permission denied)"; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // . get fields from cgi field of the requested url // . get the search query long qlen = 0; char *q = r->getString ( "q" , &qlen , NULL /*default*/); // ensure query not too big if ( qlen >= MAX_QUERY_LEN-1 ) { g_errno=EQUERYTOOBIG; return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno)); } // the docId long long docId = r->getLongLong ( "d" , 0LL /*default*/ ); // get url char *url = r->getString ( "u",NULL); if ( docId == 0 && ! url ) { g_errno = EMISSINGINPUT; return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno)); } // . should we do a sequential lookup? // . we need to match summary here so we need to know this //bool seq = r->getLong ( "seq" , false ); // restrict to root file? bool rtq = r->getLong ( "rtq" , false ); // . get the titleRec // . TODO: redirect client to a better http server to save bandwidth State2 *st ; try { st = new (State2); } catch (... 
) { g_errno = ENOMEM; log("PageGet: new(%i): %s", (int)sizeof(State2),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State2) , "PageGet1" ); // save the socket and if Host: is local in the Http request Mime st->m_socket = s; st->m_isAdmin = g_conf.isCollAdmin ( s , r ); st->m_isLocal = r->isLocal(); st->m_docId = docId; st->m_printed = false; // include header ... "this page cached by Gigablast on..." st->m_includeHeader = r->getLong ("ih" , true ); st->m_includeBaseHref = r->getLong ("ibh" , false ); st->m_queryHighlighting = r->getLong ("qh" , true ); st->m_strip = r->getLong ("strip" , 0 ); st->m_clickAndScroll = r->getLong ("cas" , true ); st->m_cnsPage = r->getLong ("cnsp" , true ); char *langAbbr = r->getString("qlang",NULL); st->m_langId = langUnknown; if ( langAbbr ) { uint8_t langId = getLangIdFromAbbr ( langAbbr ); st->m_langId = langId; } strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 ); // store query for query highlighting st->m_netTestResults = r->getLong ("rnettest", false ); //if( st->m_netTestResults ) { // mdelete ( st , sizeof(State2) , "PageGet1" ); // delete ( st ); // return sendPageNetResult( s ); //} if ( q && qlen > 0 ) strcpy ( st->m_q , q ); else st->m_q[0] = '\0'; st->m_qlen = qlen; //st->m_seq = seq; st->m_rtq = rtq; st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ ); st->m_isBanned = false; st->m_noArchive = false; st->m_socket = s; st->m_format = r->getReplyFormat(); // default to 0 niceness st->m_niceness = 0; st->m_r.copy ( r ); //st->m_cr = cr; st->m_printDisclaimer = true; if ( st->m_cnsPage ) st->m_printDisclaimer = false; if ( st->m_strip ) // ! st->m_evbits.isEmpty() ) st->m_printDisclaimer = false; // should we cache it? 
char useCache = r->getLong ( "usecache" , 1 ); char rcache = r->getLong ( "rcache" , 1 ); char wcache = r->getLong ( "wcache" , 1 ); long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour if ( useCache == 0 ) { cacheAge = 0; wcache = 0; } if ( rcache == 0 ) cacheAge = 0; // . fetch the TitleRec // . a max cache age of 0 means not to read from the cache XmlDoc *xd = &st->m_xd; // url based? if ( url ) { SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url, url ); sreq.setDataSize(); // this returns false if "coll" is invalid if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) ) goto hadSetError; } // . when getTitleRec() is called it will load the old one // since XmlDoc::m_setFromTitleRec will be true // . niceness is 0 // . use st->m_coll since XmlDoc just points to it! // . this returns false if "coll" is invalid else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) { hadSetError: mdelete ( st , sizeof(State2) , "PageGet1" ); delete ( st ); g_errno = ENOMEM; log("PageGet: set3: %s", mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // if it blocks while it loads title rec, it will re-call this routine xd->setCallback ( st , processLoopWrapper ); // good to go! return processLoop ( st ); }
// . returns false if blocked, true otherwise // . sets g_errno on error // . query re-index interface // . call g_httpServer.sendDynamicPage() to send it bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) { // make a state State13 *st ; try { st = new (State13); } catch ( ... ) { g_errno = ENOMEM; log("PageTagdb: new(%i): %s", (int)sizeof(State13),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State13) , "PageReindex" ); // set this. also sets gr->m_hr GigablastRequest *gr = &st->m_gr; // this will fill in GigablastRequest so all the parms we need are set g_parms.setGigablastRequest ( s , r , gr ); TcpSocket *sock = gr->m_socket; // get collection rec CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll ); // bitch if no collection rec found if ( ! cr ) { g_errno = ENOCOLLREC; // g_errno should be set so it will return an error response g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return true; } collnum_t collnum = cr->m_collnum; // if no query send back the page blanked out i guess if ( ! gr->m_query || ! gr->m_query[0] ) { doneReindexing ( st ); return true; } // no permmission? bool isMasterAdmin = g_conf.isMasterAdmin ( s , r ); bool isCollAdmin = g_conf.isCollAdmin ( s , r ); if ( ! isMasterAdmin && ! isCollAdmin ) { g_errno = ENOPERM; doneReindexing ( st ); return true; } int32_t langId = getLangIdFromAbbr ( gr->m_qlang ); // let msg1d do all the work now if ( ! st->m_msg1c.reindexQuery ( gr->m_query , collnum, gr->m_srn , // startNum , gr->m_ern , // endNum , (bool)gr->m_forceDel, langId, st , doneReindexing ) ) return false; // no waiting doneReindexing ( st ); return true; }
// . builds and sends the cached-page reply for a State2
// . re-entered through the XmlDoc callback whenever a blocking step
//   (title rec load, noarchive lookup, utf8 content, ...) completes
// . returns false if blocked, true otherwise
// . the state is deleted here after the reply is sent; the error paths go
//   through sendErrorReply(), which frees it as well
bool processLoop ( void *state ) {
	// get it
	State2    *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s  = st->m_socket;
	// get it
	XmlDoc    *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);

	// is archiving forbidden for this doc?
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char  format      = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}

	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	long startLen1 = sb->length();

	// base href: prefer the redirect url when there is one
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;

	if ( strip != 2 ) {
		// we are always utf8
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		// default colors in case css files missing
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
	}

	// the xml and json replies carry none of the html preamble
	if ( format == FORMAT_XML  ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	// the spider date in GMT, used by the disclaimer and by xml/json
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// gmtime() can return NULL; leave tbuf empty in that case
		if ( timeStruct )
			strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			       "<table border=\"1\" bgcolor=\"#"
			       BGCOLOR
			       "\" cellpadding=\"10\" "
			       "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			       "<tr"
			       "><td>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       "" , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		// then the rest
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content of "
			       "this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		// NOTE(review): "q" is request input and is printed into the
		// href as-is (neither url- nor html-encoded) -- confirm and
		// encode it.
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%li&"
				"d=%lli&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll ,
				(long)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}

	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char  thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// . the query url encoded
	// . hold back room for the "&d=<docid>" tail so that a long query
	//   cannot push it past the end of thisUrl
	const long tailRoom = 32;
	long elen = urlEncode ( x , thisUrlEnd - x - tailRoom , q , qlen );
	x += elen;
	// separate cgi vars with a &. bounded, unlike the sprintf it replaces.
	snprintf ( x , thisUrlEnd - x , "&d=%lli" , st->m_docId );

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q    ,  // content being highlighted, utf8
		 qlen ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true  , // computeIds
		 false ); // hasHtmlEntities?
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	Matches m;
	m.setQuery ( &qq );
	m.addMatches ( &qw );

	// and highlight the matches
	if ( printDisclaimer ) {
		hi.set ( sb ,
			 &qw , // words to highlight
			 &m  , // matches relative to qw
			 false , // doStemming
			 false , // click and scroll
			 (char *)thisUrl ); // base url for ClcknScrll
		// close the disclaimer table
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length = startLen2;
		else                         sb->m_length = startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		// cast: time_t is not necessarily unsigned long
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",
			       (unsigned long)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// . identify start of <title> tag we wrote out
	// . only look at the bytes actually written, never at the unused
	//   capacity behind them
	// . copy the title text out right away: the safePrintf() calls below
	//   can grow sb, which would leave pointers into its buffer dangling
	SafeBuf titleBuf;
	bool    gotTitle = false;
	char   *sbstart  = sb->getBufStart();
	char   *sbend    = sbstart + sb->length();
	for ( char *t = sbstart ; sbstart && sbend - t >= 6 ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *p = t + 5;
		// max - to keep things fast. also never leave the written data.
		long window = sbend - p;
		bool capped = false;
		if ( window >= 500 ) { window = 500; capped = true; }
		char *stop = p + window;
		// skip to the '>' that closes the opening tag, then past it
		while ( p < stop && *p && *p != '>' ) p++;
		p++;
		if ( p > stop ) break;
		// find end
		char *e = p;
		for ( ; e < stop && *e ; e++ ) {
			if ( sbend - e < 7 ) continue;
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		// . running into the 500 byte cap means no title
		// . running out of data before the cap is accepted, as before
		if ( e < stop || ! capped ) {
			titleBuf.safeMemcpy ( p , e - p );
			gotTitle = true;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( gotTitle ) {
		// NOTE(review): "eb" comes straight from the request and is
		// printed into the href unescaped -- confirm and encode it.
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		long printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl );
		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       " "
				       "<b>PAGE TITLE:</b> " );
			if ( titleBuf.length() > 0 )
				sb->safeMemcpy ( titleBuf.getBufStart() ,
						 titleBuf.length() );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre   = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) sb->safePrintf("<pre>");

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content,
					contentLen,
					(long)xd->m_version,
					st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 )
		return sendErrorReply ( st , g_errno );

	Xml   xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON ) queryHighlighting = false;

	// xml and json replies get the content in a scratch buffer first so
	// that it can be encoded into the reply below
	SafeBuf  tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		// matches relative to the document's own words
		Matches cm;
		cm.setQuery ( &qq );
		cm.addMatches ( &ww );
		hi.set ( xb ,
			 &ww ,
			 &cm ,
			 false /*doStemming?*/ ,
			 st->m_clickAndScroll ,
			 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		// no newline after the opening quote: a raw line break
		// inside a json string is not valid json
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) sb->safeMemcpy ( "</pre>" , 6 );

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into <tagname> sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			return sendErrorReply ( st , g_errno );
		}
		// sb takes over the encoded buffer
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);

	// and convey the status
	return status;
}
// . returns false if blocked, true otherwise // . sets g_errno on error // . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) { // or if in read-only mode if ( g_conf.m_readOnlyMode ) { g_errno = EREADONLYMODE; const char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,500,msg); } // . get fields from cgi field of the requested url // . get the search query int32_t urlLen = 0; const char *urls = hr->getString ( "urls" , &urlLen , NULL /*default*/); char format = hr->getReplyFormat(); const char *c = hr->getString("c"); if ( ! c && (format == FORMAT_XML || format == FORMAT_JSON) ) { g_errno = EMISSINGINPUT; const char *msg = "missing c parm. See /admin/api to see parms."; return g_httpServer.sendErrorReply(sock,500,msg); } if ( ! urls && (format == FORMAT_XML || format == FORMAT_JSON) ) { g_errno = EMISSINGINPUT; const char *msg = "missing urls parm. See /admin/api to see parms."; return g_httpServer.sendErrorReply(sock,500,msg); } // get collection rec CollectionRec *cr = g_collectiondb.getRec ( hr ); // bitch if no collection rec found if ( ! cr ) { g_errno = ENOCOLLREC; const char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,500,msg); } // make a new state GigablastRequest *gr; try { gr = new (GigablastRequest); } catch ( ... ) { g_errno = ENOMEM; log( LOG_WARN, "PageAddUrl: new(%i): %s", (int)sizeof(GigablastRequest),mstrerror(g_errno) ); return g_httpServer.sendErrorReply(sock, 500, mstrerror(g_errno)); } mnew ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); // this will fill in GigablastRequest so all the parms we need are set // set this. also sets gr->m_hr g_parms.setGigablastRequest ( sock , hr , gr ); // if no url given, just print a blank page if ( ! 
urls ) return sendReply ( gr ); // do not spider links for spots bool status = getSpiderRequestMetaList ( (char*)urls, &gr->m_listBuf , gr->m_harvestLinks, NULL ); int32_t size = gr->m_listBuf.length(); // error / not list if ( ! status || !size ) { // nuke it if ( !size ) { g_errno = EMISSINGINPUT; } bool rc = g_httpServer.sendErrorReply(gr); mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete gr; return rc; } // add to spiderdb if ( ! gr->m_msg4.addMetaList( &(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper, 0 ) ) { // blocked! return false; } // did not block, print page! sendReply ( gr ); return true; }
bool processLoop ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; if ( st->m_u && st->m_u[0] ) { // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp CollectionRec *cr = xd->getCollRec(); if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123")) // use same dir that XmlDoc::getTestDir() would use //saveTestBuf ( "test-page-parser" ); saveTestBuf("qa"); // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf char *metalist = xd->getMetaList ( ); if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked if ( metalist == (void *)-1 ) return false; // for debug... if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // print it out xd->printDoc( xbuf ); } // print reason we can't analyze it (or index it) //if ( st->m_indexCode != 0 ) { // xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", // mstrerror(st->m_indexCode)); //} // we are done g_inPageParser = false; // print the final tail //p += g_httpServer.printTail ( p , pend - p ); //log("parser: send sock=%li",st->m_s->m_sd); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
bool sendReply ( void *state , bool addUrlEnabled ) { // allow others to add now //s_inprogress = false; // get the state properly //gr *st1 = (gr *) state; GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log(LOG_INFO,"http: add url %s (%s)", xb.getBufStart(),mstrerror(g_errno)); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return status; } long ulen = 0; char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); //char rawbuf[1024*8]; //SafeBuf rb(rawbuf, 1024*8); //rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"); //rb.safePrintf("<status>\n"); //CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll ); // collection name char tt [ 128 ]; tt[0] = '\0'; g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // display url //char *url = gr->m_urlsBuf; //if ( url && ! url[0] ) url = NULL; // watch out for NULLs if ( ! 
url ) url = "http://"; // if there was an error let them know //char msg[MAX_URL_LEN + 1024]; SafeBuf mbuf; //char *pm = ""; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); //pm = msg; //rb.safePrintf("Error adding url(s): %s[%i]", // mstrerror(g_errno) , g_errno); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b> added to spider " "queue " "successfully<br><br>"); mbuf.safePrintf("</font></center>"); //rb.safePrintf("%s added to spider " // "queue successfully", url ); //pm = msg; //url = "http://"; //else // pm = "Don't forget to <a href=/gigaboost.html>" // "Gigaboost</a> your URL."; } if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() ); g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? 
if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype2 = CT_HTML; if ( isXml ) ctype2 = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? &ctype2, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
// . returns false if blocked, true otherwise // . sets g_errno on error // . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) { // or if in read-only mode if ( g_conf.m_readOnlyMode ) { g_errno = EREADONLYMODE; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,500,msg); } // . get fields from cgi field of the requested url // . get the search query long urlLen = 0; char *urls = hr->getString ( "urls" , &urlLen , NULL /*default*/); // also try "url" and "urls" //if ( ! url ) url = r->getString ( "url" , &urlLen , NULL ); //if ( ! url ) url = r->getString ( "urls" , &urlLen , NULL ); char format = hr->getReplyFormat(); char *c = hr->getString("c"); if ( ! c && (format == FORMAT_XML || format == FORMAT_JSON) ) { g_errno = EMISSINGINPUT; char *msg = "missing c parm. See /admin/api to see parms."; return g_httpServer.sendErrorReply(sock,500,msg); } if ( ! urls && (format == FORMAT_XML || format == FORMAT_JSON) ) { g_errno = EMISSINGINPUT; char *msg = "missing urls parm. See /admin/api to see parms."; return g_httpServer.sendErrorReply(sock,500,msg); } // get collection rec CollectionRec *cr = g_collectiondb.getRec ( hr ); // bitch if no collection rec found if ( ! cr ) { g_errno = ENOCOLLREC; //g_msg = " (error: no collection)"; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,500,msg); } // make a new state GigablastRequest *gr; try { gr = new (GigablastRequest); } catch ( ... ) { g_errno = ENOMEM; log("PageAddUrl: new(%i): %s", sizeof(GigablastRequest),mstrerror(g_errno)); return g_httpServer.sendErrorReply(sock,500, mstrerror(g_errno)); } mnew ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); // this will fill in GigablastRequest so all the parms we need are set // set this. also sets gr->m_hr g_parms.setGigablastRequest ( sock , hr , gr ); // if no url given, just print a blank page if ( ! 
urls ) return sendReply ( gr , true ); bool status = true; // do not spider links for spots if ( ! getSpiderRequestMetaList ( urls, // a safebuf &gr->m_listBuf , gr->m_harvestLinks, // spiderLinks? NULL ) ) status = false; // empty? long size = gr->m_listBuf.length(); // error? if ( ! status ) { // nuke it mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); return g_httpServer.sendErrorReply(gr); } // if not list if ( ! size ) { // nuke it mdelete ( gr , sizeof(gr) , "PageAddUrl" ); delete (gr); g_errno = EMISSINGINPUT; return g_httpServer.sendErrorReply(gr); } // add to spiderdb if ( ! gr->m_msg4.addMetaList( gr->m_listBuf.getBufStart() , gr->m_listBuf.length(), cr->m_coll, gr , addedUrlsToSpiderdbWrapper, 0 // niceness ) ) // blocked! return false; // did not block, print page! //addedUrlsToSpiderdbWrapper(gr); sendReply ( gr , true ); return true; // send back the reply //return sendReply ( gr , true ); }