// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
	// get the collection
	long collLen = 0;
	char *coll = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) {
		g_errno = ECOLLTOOBIG;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// get the collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("query: Archived copy retrieval failed. "
		    "No collection record found for "
		    "collection \"%s\".",coll);
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// does this collection ban this IP?
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		//log("PageGet::sendDynamicReply0: permission denied for %s",
		//    iptoa(s->m_ip) );
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . get fields from cgi field of the requested url
	// . get the search query
	long qlen = 0;
	char *q = r->getString ( "q" , &qlen , NULL /*default*/);
	// ensure query not too big
	if ( qlen >= MAX_QUERY_LEN-1 ) {
		g_errno = EQUERYTOOBIG;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// the docId
	long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
	// get url
	char *url = r->getString ( "u" , NULL );
	if ( docId == 0 && ! url ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . should we do a sequential lookup?
	// . we need to match summary here so we need to know this
	//bool seq = r->getLong ( "seq" , false );
	// restrict to root file?
	bool rtq = r->getLong ( "rtq" , false );
	// . get the titleRec
	// . TODO: redirect client to a better http server to save bandwidth
	State2 *st;
	try { st = new (State2); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageGet: new(%i): %s",
		    (int)sizeof(State2),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( st , sizeof(State2) , "PageGet1" );
	// save the socket and if Host: is local in the Http request Mime
	st->m_socket  = s;
	st->m_isAdmin = g_conf.isCollAdmin ( s , r );
	st->m_isLocal = r->isLocal();
	st->m_docId   = docId;
	st->m_printed = false;
	// include header ... "this page cached by Gigablast on..."
	st->m_includeHeader     = r->getLong ("ih"    , true  );
	st->m_includeBaseHref   = r->getLong ("ibh"   , false );
	st->m_queryHighlighting = r->getLong ("qh"    , true  );
	st->m_strip             = r->getLong ("strip" , 0     );
	st->m_clickAndScroll    = r->getLong ("cas"   , true  );
	st->m_cnsPage           = r->getLong ("cnsp"  , true  );
	char *langAbbr = r->getString("qlang",NULL);
	st->m_langId = langUnknown;
	if ( langAbbr ) {
		uint8_t langId = getLangIdFromAbbr ( langAbbr );
		st->m_langId = langId;
	}
	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
	// store query for query highlighting
	st->m_netTestResults = r->getLong ("rnettest", false );
	//if ( st->m_netTestResults ) {
	//	mdelete ( st , sizeof(State2) , "PageGet1" );
	//	delete ( st );
	//	return sendPageNetResult( s );
	//}
	if ( q && qlen > 0 ) strcpy ( st->m_q , q );
	else                 st->m_q[0] = '\0';
	st->m_qlen = qlen;
	//st->m_seq = seq;
	st->m_rtq = rtq;
	st->m_boolFlag  = r->getLong ("bq", 2 /*default is 2*/ );
	st->m_isBanned  = false;
	st->m_noArchive = false;
	st->m_socket    = s;
	st->m_format    = r->getReplyFormat();
	// default to 0 niceness
	st->m_niceness  = 0;
	st->m_r.copy ( r );
	//st->m_cr = cr;
	st->m_printDisclaimer = true;
	if ( st->m_cnsPage )
		st->m_printDisclaimer = false;
	if ( st->m_strip ) // ! st->m_evbits.isEmpty() )
		st->m_printDisclaimer = false;
	// should we cache it?
	char useCache = r->getLong ( "usecache" , 1 );
	char rcache   = r->getLong ( "rcache"   , 1 );
	char wcache   = r->getLong ( "wcache"   , 1 );
	long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
	if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
	if ( rcache   == 0 )   cacheAge = 0;
	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// url based?
	if ( url ) {
		SpiderRequest sreq;
		sreq.reset();
		strcpy ( sreq.m_url , url );
		sreq.setDataSize();
		// this returns false if "coll" is invalid
		if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
			goto hadSetError;
	}
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	// . use st->m_coll since XmlDoc just points to it!
	// . this returns false if "coll" is invalid
	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
	hadSetError:
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		// set3()/set4() set g_errno on failure; report that error
		// rather than clobbering it with ENOMEM
		if ( ! g_errno ) g_errno = EBADENGINEER;
		log("PageGet: set3: %s", mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
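// Illustrative sketch (not compiled): xd->setCallback() above registers a
// wrapper that re-enters processLoop() with the same State2 once the
// blocked title-rec read completes. The real wrapper lives elsewhere in
// this file; this only shows the assumed shape of that callback contract,
// where processLoop() is assumed to send the reply and free the state.
#if 0
static void processLoopWrapper ( void *state ) {
	// "state" is the State2 allocated in sendPageGet()
	processLoop ( state );
}
#endif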
// . reset a collection
// . returns false if failed
bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
	// ensure it's not NULL
	if ( ! coll ) {
		log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
		return false;
	}
	// now must be "test" only for now
	if ( strcmp(coll,"test") ) { char *xx=NULL;*xx=0; }
	// no spiders can be out. they may be referencing the CollectionRec
	// in XmlDoc.cpp... quite likely.
	if ( g_conf.m_spideringEnabled ||
	     g_spiderLoop.m_numSpidersOut > 0 ) {
		log("admin: Can not delete collection while "
		    "spiders are enabled or active.");
		return false;
	}
	// do not allow this if in repair mode
	if ( g_repairMode > 0 ) {
		log("admin: Can not delete collection while in repair mode.");
		return false;
	}
	// get the CollectionRec for "test"
	CollectionRec *cr = getRec ( "test" );
	// must be there. if not, we create test i guess
	if ( ! cr ) {
		log("db: could not get test coll rec");
		char *xx=NULL;*xx=0;
	}
	// make sure an update not in progress
	if ( cr->m_inProgress ) { char *xx=NULL;*xx=0; }
	CollectionRec tmp;
	// copy it to "tmp"
	long size = (char *)&(cr->m_END_COPY) - (char *)cr;
	// do not copy the hashtable crap since you will have to re-init it!
	memcpy ( &tmp , cr , size ); // sizeof(CollectionRec) );
	// delete the test coll now
	if ( ! deleteRec ( "test" , resetTurkdb ) )
		return log("admin: reset coll failed");
	// re-add a fresh "test" collection so we can copy the saved parms
	// back into it
	bool status = addRec ( "test" ,
			       NULL ,
			       0 ,
			       true ,           // bool isNew
			       (collnum_t) -1 , // not a dump
			       false ,          // do not save it!
			       false );
	// bail on error
	if ( ! status ) return log("admin: failed to add new coll for reset");
	// get its rec
	CollectionRec *nr = getRec ( "test" );
	// must be there
	if ( ! nr ) { char *xx=NULL;*xx=0; }
	// save this though, this might have changed!
	collnum_t cn = nr->m_collnum;
	// overwrite its rec
	memcpy ( nr , &tmp , size ); // sizeof(CollectionRec) );
	// put that collnum back
	nr->m_collnum = cn;
	// set the flag
	m_needsSave = true;
	// save it again after copy
	nr->save();
	// and clear the robots.txt cache in case we recently spidered a
	// robots.txt, we don't want to use it, we want to use the one we
	// have in the test-parser subdir so we are consistent
	RdbCache *robots = Msg13::getHttpCacheRobots();
	RdbCache *others = Msg13::getHttpCacheOthers();
	robots->clear ( cn );
	others->clear ( cn );
	//g_templateTable.reset();
	//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
	// repopulate CollectionRec::m_sortByDateTable. should be empty
	// since we are resetting here.
	//initSortByDateTable ( coll );
	// done
	return true;
}
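// Illustrative usage sketch (not compiled): resetColl() is meant to be
// driven from an admin code path after spidering is turned off. The
// caller name and surrounding checks here are assumptions for
// illustration; only the resetColl() signature comes from the routine
// above.
#if 0
static void handleAdminResetTestColl ( ) {
	// resetColl() refuses to run while spiders are enabled or active
	if ( g_conf.m_spideringEnabled ) return;
	// wipe the "test" collection but keep its parms
	if ( ! g_collectiondb.resetColl ( "test" , true /*resetTurkdb*/ ) )
		log("admin: reset of test collection failed");
}
#endif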
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
	// are we the admin?
	bool isAdmin = g_collectiondb.isAdmin ( r , s );
	// get the collection record
	CollectionRec *cr = g_collectiondb.getRec ( r );
	if ( ! cr ) {
		log("admin: No collection record found "
		    "for specified collection name. Could not add sites to "
		    "tagdb. Returning HTTP status of 500.");
		return g_httpServer.sendErrorReply ( s , 500 ,
						"collection does not exist");
	}
	bool isAssassin = cr->isAssassin ( s->m_ip );
	if ( isAdmin ) isAssassin = true;
	// bail if permission denied
	if ( ! isAssassin && ! cr->hasPermission ( r , s ) ) {
		log("admin: Bad collection name or password. Could not add "
		    "sites to tagdb. Permission denied.");
		return sendPageLogin ( s , r , "Collection name or "
				       "password is incorrect");
	}
	// get the collection
	long collLen = 0;
	char *coll = r->getString("c", &collLen, NULL);
	// check for generate catdb command
	long genCatdb = r->getLong("gencatdb", 0);
	// check for a lookup url
	long urlLen = 0;
	char *url = r->getString("caturl", &urlLen, NULL);
	// create the State
	StateCatdb *st;
	try { st = new (StateCatdb); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("catdb: Unable to allocate %i bytes for StateCatdb",
		    (int)sizeof(StateCatdb) );
		return true;
	}
	mnew ( st, sizeof(StateCatdb), "PageCatdb" );
	// fill the state
	st->m_socket = s;
	st->m_r.copy(r);
	// copy collection
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN - 1;
	memcpy ( st->m_coll, coll, collLen );
	st->m_coll[collLen] = '\0';
	st->m_collLen = collLen;
	// defaults
	st->m_catLookup = false;
	st->m_genCatdb  = false;
	st->m_startTime = gettimeofdayInMilliseconds();
	// generate catdb if requested
	if ( genCatdb == 1 ) {
		st->m_genCatdb = true;
		if ( ! st->m_msg2a.makeCatdb ( st->m_coll,
					       st->m_collLen,
					       false,
					       st,
					       sendReplyWrapper ) )
			return false;
	}
	// update catdb from .new files
	else if ( genCatdb == 2 ) {
		st->m_genCatdb = true;
		if ( ! st->m_msg2a.makeCatdb ( st->m_coll,
					       st->m_collLen,
					       true,
					       st,
					       sendReplyWrapper ) )
			return false;
	}
	// lookup a url if requested
	else if ( url && urlLen > 0 ) {
		st->m_catLookup = true;
		// set the url
		st->m_url.set ( url, urlLen );
		// call msg8b to lookup in catdb
		if ( ! st->m_msg8b.getCatRec ( &st->m_url,
					       NULL, //st->m_coll,
					       0,    //st->m_collLen,
					       true,
					       1,
					       &st->m_catRec,
					       st,
					       gotCatInfoWrapper ) )
					       //RDB_CATDB ) )
					       //RDB_TAGDB ) )
			return false;
	}
	// otherwise return the regular page
	return sendReply ( st );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
	// . get fields from cgi field of the requested url
	// . get the search query
	long  urlLen = 0;
	char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
	// see if they provided a url of a file of urls if they did not
	// provide a url to add directly
	//bool isAdmin = g_collectiondb.isAdmin ( r , s );
	bool isAdmin = r->getIsLocal();
	long  ufuLen = 0;
	char *ufu = NULL;
	if ( isAdmin )
		// get the url of a file of urls (ufu)
		ufu = r->getString ( "ufu" , &ufuLen , NULL );
	// can't be too long, that's obnoxious
	if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
		g_errno = EBUFTOOSMALL;
		g_msg = " (error: url too long)";
		return g_httpServer.sendErrorReply(s,500,"url too long");
	}
	// get the collection
	long  collLen = 0;
	char *coll = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	// bitch if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		g_msg = " (error: no collection)";
		return g_httpServer.sendErrorReply(s,500,"no coll rec");
	}
	// . make sure the ip is not banned
	// . we may also have an exclusive list of IPs for private collections
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// make a new state
	State1 *st1;
	try { st1 = new (State1); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageAddUrl: new(%i): %s",
		    (int)sizeof(State1),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( st1 , sizeof(State1) , "PageAddUrl" );
	// save socket and isAdmin
	st1->m_socket  = s;
	st1->m_isAdmin = isAdmin;
	// assume no url buf yet, set below
	//st1->m_ubuf      = NULL;
	//st1->m_ubufAlloc = NULL;
	//st1->m_metaList  = NULL;
	// save the url
	st1->m_url[0] = '\0';
	if ( url ) {
		// normalize and add www. if it needs it
		Url uu;
		uu.set ( url , gbstrlen(url) , true );
		// remove >'s i guess and store in st1->m_url[] buffer
		st1->m_urlLen = cleanInput ( st1->m_url,
					     MAX_URL_LEN,
					     uu.getUrl(),
					     uu.getUrlLen() );
		// point to that as the url "buf" to add
		//st1->m_ubuf      = st1->m_url;
		//st1->m_ubufSize  = urlLen;
		//st1->m_ubufAlloc = NULL; // do not free it!
	}
	// save the "ufu" (url of file of urls)
	st1->m_ufu[0] = '\0';
	st1->m_ufuLen = ufuLen;
	memcpy ( st1->m_ufu , ufu , ufuLen );
	st1->m_ufu[ufuLen] = '\0';
	st1->m_doTuringTest = cr->m_doTuringTest;
	char *username = g_users.getUsername(r);
	if ( username ) strcpy ( st1->m_username , username );
	//st1->m_user = g_pages.getUserType ( s , r );
	st1->m_spiderLinks = true;
	st1->m_strip       = true;
	//st1->m_raw = r->getLong("raw",0);
	// init state2
	for ( long i = 0; i < 5; i++ ) {
		st1->m_state2[i].m_buf       = NULL;
		st1->m_state2[i].m_bufLen    = 0;
		st1->m_state2[i].m_bufMaxLen = 0;
	}
	// save the collection name in the State1 class
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	strncpy ( st1->m_coll , coll , collLen );
	st1->m_coll [ collLen ] = '\0';
	// assume they answered turing test correctly
	st1->m_goodAnswer = true;
	// if addurl is turned off, just print "disabled" msg
	if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
	// can also be turned off in the collection rec
	if ( ! cr->m_addUrlEnabled    ) return sendReply ( st1 , false );
	// or if in read-only mode
	if (   g_conf.m_readOnlyMode  ) return sendReply ( st1 , false );
	// cannot add if another Msg10 from here is still in progress
	if ( s_inprogress ) return sendReply ( st1 , true );
	// use now as the spiderTime
	// get ip of submitter
	//unsigned long h = ipdom ( s->m_ip );
	// . use top 2 bytes now, some isps have large blocks
	// . if this causes problems, then they can do pay for inclusion
	unsigned long h = iptop ( s->m_ip );
	long  codeLen;
	char *code = r->getString("code", &codeLen);
	if ( g_autoBan.hasCode(code, codeLen, s->m_ip) ) {
		long  uipLen = 0;
		char *uip = r->getString("uip",&uipLen);
		long  hip = 0;
		// use the uip when we have a raw query to test if
		// we can submit
		if ( uip ) {
			hip = atoip(uip, uipLen);
			h = iptop( hip );
		}
	}
	st1->m_strip = r->getLong("strip",0);
	// Remember, for cgi, if the box is not checked, then it is not
	// reported in the request, so set default return value to 0
	long spiderLinks = r->getLong("spiderLinks",-1);
	// also support all lowercase like PageInject.cpp uses
	if ( spiderLinks == -1 )
		spiderLinks = r->getLong("spiderlinks",0);
	// . should we force it into spiderdb even if already in there
	// . use to manually update spider times for a url
	// . however, will not remove old scheduled spider times
	// . mdw: made force on the default
	st1->m_forceRespider = r->getLong("force",1); // 0);
	long now = getTimeGlobal();
	// . allow 1 submit every 1 hour
	// . restrict by submitter domain ip
	if ( ! st1->m_isAdmin &&
	     ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
		// return error page
		g_errno = ETOOEARLY;
		return sendReply ( st1 , true );
	}
	//st1->m_query = r->getString( "qts", &st1->m_queryLen );
	// check it, if turing test is enabled for this collection
	if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
	     ! g_turingTest.isHuman(r) ) {
		// log note so we know it didn't make it
		g_msg = " (error: bad answer)";
		//log("PageAddUrl:: addurl failed for %s : bad answer",
		//    iptoa(s->m_ip));
		st1->m_goodAnswer = false;
		return sendReply ( st1 , true /*addUrl enabled?*/ );
	}
	//if ( st1->m_queryLen > 0 )
	//	return getPages( st1 );
	// if no url given, just print a blank page
	if ( ! url ) return sendReply ( st1 , true );
	//
	// make a SpiderRequest
	//
	SpiderRequest *sreq = &st1->m_sreq;
	// reset it
	sreq->reset();
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	// . now fill it up
	// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
	//   m_siteNumInlinks,...)
	sreq->m_isNewOutlink = 1;
	sreq->m_isAddUrl     = 1;
	sreq->m_addedTime    = now;
	sreq->m_fakeFirstIp  = 1;
	sreq->m_probDocId    = probDocId;
	sreq->m_firstIp      = firstIp;
	sreq->m_hopCount     = 0;
	// its valid if root
	Url uu;
	uu.set ( st1->m_url );
	if ( uu.isRoot() ) sreq->m_hopCountValid = true;
	// too big?
	//long len = st1->m_urlLen;
	// the url! includes \0
	strcpy ( sreq->m_url , st1->m_url );
	// call this to set sreq->m_dataSize now
	sreq->setDataSize();
	// make the key dude -- after setting url
	sreq->setKey ( firstIp , 0LL, false );
	// need a fake first ip lest we core!
	//sreq->m_firstIp = (pdocId & 0xffffffff);
	// how to set m_firstIp? i guess addurl can be throttled independently
	// of the other urls??? use the hash of the domain for it!
	long  dlen;
	char *dom = getDomFast ( st1->m_url , &dlen );
	// fake it for this...
	//sreq->m_firstIp = hash32 ( dom , dlen );
	// sanity
	if ( ! dom ) {
		g_errno = EBADURL;
		return sendReply ( st1 , true );
	}
	// shortcut
	Msg4 *m = &st1->m_msg4;
	// now add that to spiderdb using msg4
	if ( ! m->addMetaList ( (char *)sreq ,
				sreq->getRecSize() ,
				coll ,
				st1 , // state
				addedStuff ,
				MAX_NICENESS ,
				RDB_SPIDERDB ) )
		// we blocked
		return false;
	// send back the reply
	return sendReply ( st1 , true );
}
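// Illustrative sketch (not compiled): addMetaList() above is handed
// "addedStuff" as its completion callback, so when the Msg4 add to
// spiderdb finishes asynchronously the reply is sent from that callback.
// The body below is an assumption that mirrors the non-blocking return
// path above; only the callback name and the sendReply() signature come
// from this file.
#if 0
static void addedStuff ( void *state ) {
	// the State1 we allocated in sendPageAddUrl()
	State1 *st1 = (State1 *)state;
	// send the same confirmation page used on the non-blocking path
	sendReply ( st1 , true );
}
#endif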