// . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the titleRec of "docId" given via cgi // . call g_httpServer.sendDynamicPage() to send it bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) { // get the docId from the cgi vars long long docId = r->getLongLong ("d", 0LL ); // set up a msg22 to get the next titleRec State4 *st ; try { st = new (State4); } catch ( ... ) { g_errno = ENOMEM; log("PageTitledb: new(%i): %s", (int)sizeof(State4),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State4) , "PageTitledb"); // save the socket st->m_socket = s; // copy it st->m_r.copy ( r ); // remember if http request is internal/local or not st->m_isRootAdmin = g_conf.isCollAdmin ( s , r ); st->m_isLocal = r->isLocal(); st->m_docId = docId; // password, too st->m_pwd = r->getString ( "pwd" ); // get the collection long collLen = 0; char *coll = st->m_r.getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } st->m_coll = coll; st->m_collLen = collLen; // just print page if no docid provided if ( ! docId ) return gotTitleRec ( st ); // get the handy XmlDoc XmlDoc *xd = &st->m_xd; // use 0 for niceness xd->set3 ( docId , coll , 0 ); // callback xd->setCallback ( st , gotTitleRec ); // . and tell it to load from old title rec // . this sets all the member vars from it and also sets // m_titleRecBuf to contain the actual compressed title rec if ( ! xd->loadFromOldTitleRec ( ) ) return false; // we got it without blocking. cached? return gotTitleRec ( st ); }
// returns false if blocked, true otherwise bool processLoop ( void *state ) { // get it State60 *st = (State60 *)state; // get the tcp socket from the state TcpSocket *s = st->m_socket; // get it XmlDoc *xd = &st->m_xd; if ( ! xd->m_loaded ) { // setting just the docid. niceness is 0. xd->set3 ( st->m_docId , st->m_coll , 0 ); // callback xd->setCallback ( state , processLoop ); // . and tell it to load from the old title rec // . if it returns false it blocked and will call our callback // processLoop() when it completes if ( ! xd->loadFromOldTitleRec ( ) ) return false; } if ( g_errno ) return sendErrorReply ( st , g_errno ); // get the utf8 content char **utf8 = xd->getUtf8Content(); //int32_t len = xd->size_utf8Content - 1; // wait if blocked??? if ( utf8 == (void *)-1 ) return false; // strange if ( xd->size_utf8Content<=0) return sendErrorReply(st,EBADENGINEER ); // alloc error? if ( ! utf8 ) return sendErrorReply ( st , g_errno ); // get this host Host *h = g_hostdb.getHost ( g_hostdb.m_hostId ); if ( ! h ) return sendErrorReply ( st , EBADENGINEER ); // make it into an editable page now for the turk guy sendTurkPageReply ( st ); }
// for procog bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) { // make a state State8 *st; try { st = new (State8); } catch ( ... ) { g_errno = ENOMEM; log("PageParser: new(%i): %s", (int)sizeof(State8),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));} mnew ( st , sizeof(State8) , "PageParser" ); st->m_freeIt = true; st->m_state = NULL; //st->m_callback = callback; //st->m_q = q; //st->m_termFreqs = termFreqs; //st->m_termFreqWeights = termFreqWeights; //st->m_affWeights = affWeights; //st->m_total = (score_t)-1; st->m_indexCode = 0; st->m_blocked = false; st->m_didRootDom = false; st->m_didRootWWW = false; st->m_wasRootDom = false; st->m_u = NULL; // password, too long pwdLen = 0; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // save socket ptr st->m_s = s; st->m_r.copy ( r ); // get the collection char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/); if ( ! coll ) coll = g_conf.m_defaultColl; if ( ! coll ) coll = "main"; long collLen = gbstrlen(coll); if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS ); strcpy ( st->m_coll , coll ); // version to use, if -1 use latest st->m_titleRecVersion = r->getLong("version",-1); if ( st->m_titleRecVersion == -1 ) st->m_titleRecVersion = TITLEREC_CURRENT_VERSION; // default to 0 if not provided st->m_hopCount = r->getLong("hc",0); long old = r->getLong ( "old", 0 ); // set query long qlen; char *qs = r->getString("q",&qlen,NULL); if ( qs ) st->m_tq.set2 ( qs , langUnknown , true ); // url will override docid if given st->m_docId = r->getLongLong ("d",-1); st->m_docId = r->getLongLong ("docid",st->m_docId); long ulen; char *u = st->m_r.getString("u",&ulen,NULL); if ( ! u ) u = st->m_r.getString("url",&ulen,NULL); if ( ! 
u && st->m_docId == -1LL ) return sendErrorReply ( st , EBADREQUEST ); // set url in state class (may have length 0) //if ( u ) st->m_url.set ( u , ulen ); //st->m_urlLen = ulen; st->m_u = u; st->m_ulen = 0; if ( u ) st->m_ulen = gbstrlen(u); // should we recycle link info? st->m_recycle = r->getLong("recycle",1); st->m_recycle2 = r->getLong("recycleimp",0); st->m_render = r->getLong("render" ,0); st->m_recompute = r->getLong("recompute" ,0); // for quality computation... takes way longer cuz we have to // lookup the IP address of every outlink, so we can get its root // quality using Msg25 which needs to filter out voters from that IP // range. st->m_oips = r->getLong("oips" ,0); //st->m_page = r->getLong("page",1); long linkInfoLen = 0; // default is NULL char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL ); if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl ); else st->m_linkInfoColl[0] = '\0'; // set the flag in our SafeBuf class so that Words.cpp knows to show // html or html source depending on this value //st->m_xbuf.m_renderHtml = st->m_render; // should we use the old title rec? st->m_old = old; // are we coming from a local machine? st->m_isLocal = r->isLocal(); //no more setting the default root quality to 30, instead if we do not // know it setting it to -1 st->m_rootQuality=-1; // header //xbuf->safePrintf("<meta http-equiv=\"Content-Type\" " // "content=\"text/html; charset=utf-8\">\n"); XmlDoc *xd = &st->m_xd; long isXml = r->getLong("xml",0); // if got docid, use that if ( st->m_docId != -1 ) { if ( ! xd->set3 ( st->m_docId, st->m_coll, 0 ) ) // niceness // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); xd->m_pbuf = &st->m_wbuf; // reset this flag st->m_donePrinting = false; // . set xd from the old title rec if recycle is true // . 
can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; // now tell it to fetch the old title rec if ( ! xd->loadFromOldTitleRec () ) // return false if this blocks return false; return gotXmlDoc ( st ); } // set this up SpiderRequest sreq; sreq.reset(); if ( st->m_u ) strcpy(sreq.m_url,st->m_u); long firstIp = hash32n(st->m_u); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; // parentdocid of 0 sreq.setKey( firstIp, 0LL, false ); sreq.m_isPageParser = 1; sreq.m_hopCount = st->m_hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; Url nu; nu.set(sreq.m_url); sreq.m_domHash32 = nu.getDomainHash32(); sreq.m_siteHash32 = nu.getHostHash32(); // . get provided content if any // . will be NULL if none provided // . "content" may contain a MIME long contentLen = 0; char *content = r->getString ( "content" , &contentLen , NULL ); // is the "content" url-encoded? default is true. bool contentIsEncoded = true; // mark doesn't like to url-encode his content if ( ! content ) { content = r->getUnencodedContent (); contentLen = r->getUnencodedContentLen (); contentIsEncoded = false; } // ensure null if ( contentLen == 0 ) content = NULL; //uint8_t contentType = CT_HTML; //if ( isXml ) contentType = CT_XML; long ctype = r->getLong("ctype",CT_HTML); // . use the enormous power of our new XmlDoc class // . this returns false if blocked if ( ! xd->set4 ( &sreq , NULL , st->m_coll , // we need this so the term table is set! &st->m_wbuf , // XmlDoc::m_pbuf 0, // try 0 now! 
1 ,//PP_NICENESS )) content , false, // deletefromindex 0, // forced ip ctype )) // return error reply if g_errno is set return sendErrorReply ( st , g_errno ); // make this our callback in case something blocks xd->setCallback ( st , gotXmlDoc ); // reset this flag st->m_donePrinting = false; // prevent a core here in the event we download the page content xd->m_crawlDelayValid = true; xd->m_crawlDelay = 0; // . set xd from the old title rec if recycle is true // . can also use XmlDoc::m_loadFromOldTitleRec flag //if ( st->m_recycle ) xd->m_recycleContent = true; // only recycle if docid is given!! if ( st->m_recycle ) xd->m_recycleContent = true; // force this on //xd->m_useSiteLinkBuf = true; //xd->m_usePageLinkBuf = true; if ( isXml ) xd->m_printInXml = true; return gotXmlDoc ( st ); }
// . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageGet ( TcpSocket *s , HttpRequest *r ) { // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // ensure collection not too big if ( collLen >= MAX_COLL_LEN ) { g_errno = ECOLLTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection rec CollectionRec *cr = g_collectiondb.getRec ( coll ); if ( ! cr ) { g_errno = ENOCOLLREC; log("query: Archived copy retrieval failed. " "No collection record found for " "collection \"%s\".",coll); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // does this collection ban this IP? if ( ! cr->hasSearchPermission ( s ) ) { g_errno = ENOPERM; //log("PageGet::sendDynamicReply0: permission denied for %s", // iptoa(s->m_ip) ); g_msg = " (error: permission denied)"; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // . get fields from cgi field of the requested url // . get the search query long qlen = 0; char *q = r->getString ( "q" , &qlen , NULL /*default*/); // ensure query not too big if ( qlen >= MAX_QUERY_LEN-1 ) { g_errno=EQUERYTOOBIG; return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno)); } // the docId long long docId = r->getLongLong ( "d" , 0LL /*default*/ ); // get url char *url = r->getString ( "u",NULL); if ( docId == 0 && ! url ) { g_errno = EMISSINGINPUT; return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno)); } // . should we do a sequential lookup? // . we need to match summary here so we need to know this //bool seq = r->getLong ( "seq" , false ); // restrict to root file? bool rtq = r->getLong ( "rtq" , false ); // . get the titleRec // . TODO: redirect client to a better http server to save bandwidth State2 *st ; try { st = new (State2); } catch (... 
) { g_errno = ENOMEM; log("PageGet: new(%i): %s", (int)sizeof(State2),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State2) , "PageGet1" ); // save the socket and if Host: is local in the Http request Mime st->m_socket = s; st->m_isAdmin = g_conf.isCollAdmin ( s , r ); st->m_isLocal = r->isLocal(); st->m_docId = docId; st->m_printed = false; // include header ... "this page cached by Gigablast on..." st->m_includeHeader = r->getLong ("ih" , true ); st->m_includeBaseHref = r->getLong ("ibh" , false ); st->m_queryHighlighting = r->getLong ("qh" , true ); st->m_strip = r->getLong ("strip" , 0 ); st->m_clickAndScroll = r->getLong ("cas" , true ); st->m_cnsPage = r->getLong ("cnsp" , true ); char *langAbbr = r->getString("qlang",NULL); st->m_langId = langUnknown; if ( langAbbr ) { uint8_t langId = getLangIdFromAbbr ( langAbbr ); st->m_langId = langId; } strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 ); // store query for query highlighting st->m_netTestResults = r->getLong ("rnettest", false ); //if( st->m_netTestResults ) { // mdelete ( st , sizeof(State2) , "PageGet1" ); // delete ( st ); // return sendPageNetResult( s ); //} if ( q && qlen > 0 ) strcpy ( st->m_q , q ); else st->m_q[0] = '\0'; st->m_qlen = qlen; //st->m_seq = seq; st->m_rtq = rtq; st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ ); st->m_isBanned = false; st->m_noArchive = false; st->m_socket = s; st->m_format = r->getReplyFormat(); // default to 0 niceness st->m_niceness = 0; st->m_r.copy ( r ); //st->m_cr = cr; st->m_printDisclaimer = true; if ( st->m_cnsPage ) st->m_printDisclaimer = false; if ( st->m_strip ) // ! st->m_evbits.isEmpty() ) st->m_printDisclaimer = false; // should we cache it? 
char useCache = r->getLong ( "usecache" , 1 ); char rcache = r->getLong ( "rcache" , 1 ); char wcache = r->getLong ( "wcache" , 1 ); long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour if ( useCache == 0 ) { cacheAge = 0; wcache = 0; } if ( rcache == 0 ) cacheAge = 0; // . fetch the TitleRec // . a max cache age of 0 means not to read from the cache XmlDoc *xd = &st->m_xd; // url based? if ( url ) { SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url, url ); sreq.setDataSize(); // this returns false if "coll" is invalid if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) ) goto hadSetError; } // . when getTitleRec() is called it will load the old one // since XmlDoc::m_setFromTitleRec will be true // . niceness is 0 // . use st->m_coll since XmlDoc just points to it! // . this returns false if "coll" is invalid else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) { hadSetError: mdelete ( st , sizeof(State2) , "PageGet1" ); delete ( st ); g_errno = ENOMEM; log("PageGet: set3: %s", mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // if it blocks while it loads title rec, it will re-call this routine xd->setCallback ( st , processLoopWrapper ); // good to go! return processLoop ( st ); }
// . called with the datedb list in st->m_list
// . picks the first positive record whose docid is not locked (a lock older
//   than 3600 seconds is dropped and treated as unlocked), records a lock
//   for st->m_user and starts loading that doc's title rec via processLoop()
// . when no docid is available an "empty" page is sent instead
void gotDatedbList ( State60 *st ) {
    // must only be run on host #0 since we need just one lock table
    if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

    // . load turk lock table if we need to
    // . must be static: a plain local would re-init and reload the table
    //   on every call
    static bool s_init = false;
    if ( ! s_init ) {
        s_init = true;
        if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
            log("turk: failed to init turk lock table");
        if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
            log("turk: failed to load turk lock table");
    }

    time_t now = getTimeGlobal();
    // shortcut
    RdbList *list = &st->m_list;
    // the best docid
    int64_t best = 0LL;
    // scan the list to get urls/docids to turk out
    for ( ; ! list->isExhausted() ; ) {
        // get rec
        char *k = list->getCurrentKey();
        // skip that
        list->skipCurrentRecord();
        // skip if negative
        if ( (k[0] & 0x01) == 0x00 ) continue;
        // get the docid
        int64_t docid = g_datedb.getDocId ( k );
        // . look up an existing lock for this docid
        // . uses g_turkLocks, the table initialized above
        TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
        // if there, check time and expire stale locks
        if ( tt && now - tt->m_lockTime > 3600 ) {
            // remove it
            g_turkLocks.removeKey(&docid);
            // nuke tt
            tt = NULL;
        }
        // if still there, skip it and try next one
        if ( tt ) continue;
        // ok, we got a good docid to dish out
        best = docid;
        break;
    }

    SafeBuf sb;
    // print description so they can click a button to start the turk
    sb.safePrintf("<html>\n"
                  "<title>Event Editor</title>\n"
                  "<body>\n"
                  "<table width=\"100%%\" border=\"0\">\n"
                  "<tr><td style=\"background-color:#0079ba;\">\n"
                  "<center><font color=#00000>"
                  "<h2>Event Editor</h2>\n"
                  "</font></center></td>"
                  "</tr></table>");
    // if we had no docid, give user an empty msg
    if ( ! best ) {
        sb.safePrintf("<center>Nothing currently available to edit. "
                      "Please try again later.</center>"
                      "</body></html>\n");
        sendReply ( &sb );
        return;
    }

    // lock it!
    TurkLock tt;
    // bounded copy so a long user name cannot overrun tt.m_user
    strncpy ( tt.m_user , st->m_user , sizeof(tt.m_user) - 1 );
    tt.m_user[sizeof(tt.m_user) - 1] = '\0';
    tt.m_lockTime = now;
    // NOTE(review): the lock is added through g_lockTable while the lookups
    // above go through g_turkLocks keyed by docid, and nothing visible here
    // stores "best" in the record -- confirm the lock really ends up
    // associated with this docid.
    if ( ! g_lockTable.addLock ( &tt ) ) {
        sendErrorReply ( st , g_errno );
        return;
    }

    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    xd->set3 ( best , st->m_coll , 0 );
    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );
    // . good to go!
    // . this function returns void, so processLoop()'s "blocked" status
    //   is deliberately not propagated
    processLoop ( st );
}