// . serve the /pageturk page
// . shows the turk stats page by default, or the event-eval editor when
//   the "edit" cgi parm is set
// . returns false if blocked, true otherwise; sets g_errno on error
bool sendPageTurk ( TcpSocket *s , HttpRequest *r ) {
	char *coll = r->getString("c");
	if ( ! coll )
		return g_httpServer.sendErrorReply( s, 500, "No collection");
	// make a state for callback
	State60 *st ;
	try { st = new ( State60 ); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// BUGFIX: was mstrerrno() (typo) and %i fed a size_t arg
		log( "pgrank: new(%i): %s",
		     (int)sizeof(State60), mstrerror(g_errno) );
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( st , sizeof(State60) , "PageTurk" );
	// get username: cookie first, then various cgi fallbacks
	char *username = r->getStringFromCookie("username", NULL);
	if ( !username ) username = r->getString("username",NULL);
	if ( !username ) username = r->getString("user",NULL);
	if ( !username ) username = r->getString("code",NULL);
	if ( ! username ) {
		// BUGFIX: free the state on early error returns (was leaked)
		mdelete ( st , sizeof(State60) , "PageTurk" );
		delete st;
		return g_httpServer.sendErrorReply(s,500,"No username");
	}
	// BUGFIX: was gbsrlen(), an undeclared typo
	int32_t ulen = strlen(username);
	if ( ulen >= MAX_USER_SIZE ) {
		mdelete ( st , sizeof(State60) , "PageTurk" );
		delete st;
		return g_httpServer.sendErrorReply(s,500,"Bad username");
	}
	// save crap. don't we need to copy "r" into our own? yeah...
	st->m_s = s;
	// save username incl. NUL (bounded: ulen < MAX_USER_SIZE above).
	// BUGFIX: was a 3-argument strcpy(), which does not compile
	memcpy(st->m_username,username,ulen+1);
	// copy coll
	// NOTE(review): unbounded strcpy -- assumes m_coll can hold any
	// collection name; confirm sizeof(st->m_coll) covers the max
	strcpy(st->m_coll,coll);
	// this is 1 to imply to edit a page.
	// BUGFIX: was getString(), which returns a char*; the flag is
	// numeric and is tested as a number below
	st->m_editMode = r->getLong("edit",0);
	st->m_docId    = r->getLongLong("docid",0LL);
	// if no edit mode was requested then present their stats page.
	// BUGFIX: previously tested an undeclared "edit" variable
	if ( ! st->m_editMode ) return sendPageTurkStats (st);
	// otherwise, send them the eval page
	return sendPageTurkEval (st);
}
// . parse a "MM-DD-YYYY HH:MM" date string into a local-time time_t
// . "date" need not be NUL-terminated; exactly dateLen bytes are read
// . returns -1 if date is NULL or dateLen is not exactly 16
// . field layout: [0-1]=month [3-4]=day [6-9]=year [11-12]=hour [14-15]=min
time_t genDate( char *date, long dateLen ) {
	time_t result = -1;
	// the date string should always be the same length
	if ( ! date || dateLen != 16 ) return result;
	// work on a NUL-terminated copy so atoi() sees one field at a time
	char tmp[18];
	char *p = tmp;
	memcpy( p, date, dateLen );
	p[2]  = '\0';   // terminate month
	p[5]  = '\0';   // terminate day
	p[10] = '\0';   // terminate year
	p[13] = '\0';   // terminate hour
	p[16] = '\0';   // terminate minute
	struct tm tmBuild;
	memset( (char *)&tmBuild, 0, sizeof( tmBuild ) );
	tmBuild.tm_mon  = atoi( p ) - 1;    p += 3;
	tmBuild.tm_mday = atoi( p );        p += 3;
	tmBuild.tm_year = atoi( p ) - 1900; p += 5;
	tmBuild.tm_hour = atoi( p );        p += 3;
	tmBuild.tm_min  = atoi( p );
	// BUGFIX: let mktime() determine whether DST applies on the
	// REQUESTED date. the old code copied the CURRENT time's tm_isdst
	// flag, which is off by an hour whenever the requested date sits on
	// the other side of a DST transition (the large commented-out block
	// in the original was an attempt to compensate for exactly that).
	tmBuild.tm_isdst = -1;
	result = mktime( &tmBuild );
	return result;
}
// . this returns false if blocks, true otherwise // . sets g_errno on failure bool Msg1c::gotList ( ) { if ( g_errno ) return true; int64_t *tmpDocIds = m_msg3a.getDocIds(); int32_t numDocIds = m_msg3a.getNumDocIds(); if ( m_startNum > 0) { numDocIds -= m_startNum; tmpDocIds = &tmpDocIds[m_startNum]; } m_numDocIds = numDocIds; // save for reporting // log it log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds); // bail if no need if ( numDocIds <= 0 ) return true; // force spiders on on entire network. they will progagate from // host #0... g_conf.m_spideringEnabled = true; int32_t nowGlobal = getTimeGlobal(); HashTableX dt; char dbuf[1024]; dt.set(8,0,64,dbuf,1024,false,0,"ddocids"); m_sb.setLabel("reiadd"); State13 *st = (State13 *)m_state; GigablastRequest *gr = &st->m_gr; m_numDocIdsAdded = 0; // list consists of docIds, loop through each one for(int32_t i = 0; i < numDocIds; i++) { int64_t docId = tmpDocIds[i]; // when searching events we get multiple docids that are same if ( dt.isInTable ( &docId ) ) continue; // add it if ( ! dt.addKey ( &docId ) ) return true; SpiderRequest sr; sr.reset(); // url is a docid! sprintf ( sr.m_url , "%" PRIu64 , docId ); // make a fake first ip // use only 64k values so we don't stress doledb/waittrees/etc. // for large #'s of docids int32_t firstIp = (docId & 0x0000ffff); // bits 6-13 of the docid are the domain hash so use those // when doing a REINDEX (not delete!) to ensure that requests // on the same domain go to the same shard, at least when // we have up to 256 shards. if we have more than 256 shards // at this point some shards will not participate in the // query reindex/delete process because of this, so // we'll want to allow more bits in in that case perhaps. // check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp // to see what shard is responsible for storing and indexing // this SpiderRequest based on the firstIp. if ( ! 
m_forceDel ) { // if we are a REINDEX not a delete because // deletes don't need to spider/redownload the doc // so the distribution can be more random firstIp >>= 6; firstIp &= 0xff; } // 0 is not a legit val. it'll core below. if ( firstIp == 0 ) { firstIp = 1; } // use a fake ip sr.m_firstIp = firstIp; // we are not really injecting... sr.m_isInjecting = false;//true; sr.m_hopCount = -1; sr.m_isPageReindex = 1; sr.m_urlIsDocId = 1; sr.m_fakeFirstIp = 1; // now you can recycle content instead of re-downloading it // for every docid sr.m_recycleContent = gr->m_recycleContent; // if this is zero we end up getting deduped in // dedupSpiderList() if there was a SpiderReply whose // spider time was > 0 sr.m_addedTime = nowGlobal; sr.m_forceDelete = m_forceDel ? 1 : 0; // . complete its m_key member // . parentDocId is used to make the key, but only allow one // page reindex spider request per url... so use "0" // . this will set "uh48" to hash64b(m_url) which is the docid sr.setKey( firstIp, 0LL , false ); // how big to serialize int32_t recSize = sr.getRecSize(); m_numDocIdsAdded++; // store it if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) { // g_errno must be set if ( ! g_errno ) { g_process.shutdownAbort(true); } log(LOG_LOGIC, "admin: Query reindex size of %" PRId32" " "too big. Aborting. Bad engineer." , (int32_t)0);//m_list.getListSize() ); return true; } }
// . handle an incoming msg12 udp request (spider lock protocol)
// . two request flavors, distinguished by request size:
//     ConfirmRequest -> sender acquired all locks; delete the doledb rec
//                       and re-add the firstip to the waiting tree
//     LockRequest    -> grant / deny / remove a spider url lock in the
//                       local g_spiderLoop.m_lockTable
// . replies with a single byte: 1 = granted/ok, 0 = lock denied
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) {
	// get request
	char *request = udpSlot->m_readBuf;
	int32_t reqSize = udpSlot->m_readBufSize;
	// shortcut
	UdpServer *us = &g_udpServer;
	// breathe
	QUICKPOLL ( niceness );
	// reply buffer lives in the slot itself (single-byte replies)
	char *reply = udpSlot->m_tmpBuf;
	//
	// . is it confirming that he got all the locks?
	// . if so, remove the doledb record and dock the doleiptable count
	//   before adding a waiting tree entry to re-pop the doledb record
	//
	if ( reqSize == sizeof(ConfirmRequest) ) {
		char *msg = NULL;
		ConfirmRequest *cq = (ConfirmRequest *)request;
		// confirm the lock
		HashTableX *ht = &g_spiderLoop.m_lockTable;
		int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 );
		if ( slot < 0 ) {
			log("spider: got a confirm request for a key not "
			    "in the table! coll must have been deleted "
			    " or reset "
			    "while lock request was outstanding.");
			g_errno = EBADENGINEER;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
			// NOTE(review): commented-out debug abort; the
			// trailing brace appears to belong to this comment
			// in the (corrupted) original formatting
			//char *xx=NULL;*xx=0; }
		}
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
		// mark the lock as confirmed by the whole shard
		lock->m_confirmed = true;
		// note that
		if ( g_conf.m_logDebugSpider )
			log("spider: got confirm lock request for ip=%s",
			    iptoa(lock->m_firstIp));
		// get it
		SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
		// make it negative: clearing the low bit turns the doledb
		// key into a delete (annihilation) key
		cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
		// and add the negative rec to doledb (deletion operation)
		Rdb *rdb = &g_doledb.m_rdb;
		if ( ! rdb->addRecord ( cq->m_collnum,
					(char *)&cq->m_doledbKey,
					NULL , // data
					0 , //dataSize
					1 )){ // niceness
			// tree is dumping or something, probably ETRYAGAIN
			if ( g_errno != ETRYAGAIN ) {
				msg = "error adding neg rec to doledb";
				log("spider: %s %s",msg,mstrerror(g_errno));
			}
			//char *xx=NULL;*xx=0;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// now remove from doleiptable since we removed from doledb
		if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );
		// how many spiders outstanding for this coll and IP?
		//int32_t out=g_spiderLoop.getNumSpidersOutPerIp(cq->m_firstIp);
		// DO NOT add back to waiting tree if max spiders
		// out per ip was 1 OR there was a crawldelay. but better
		// yet, take care of that in the winReq code above.
		// . now add to waiting tree so we add another spiderdb
		//   record for this firstip to doledb
		// . true = callForScan
		// . do not add to waiting tree if we have enough outstanding
		//   spiders for this ip. we will add to waiting tree when
		//   we receive a SpiderReply in addSpiderReply()
		if ( sc &&
		     //out < cq->m_maxSpidersOutPerIp &&
		     // this will just return true if we are not the
		     // responsible host for this firstip
		     // DO NOT populate from this!!! say "false" here...
		     ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
		     // must be an error...
		     g_errno ) {
			msg = "FAILED TO ADD TO WAITING TREE";
			log("spider: %s %s",msg,mstrerror(g_errno));
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// success!! single-byte ack
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}
	// sanity check: anything else must be a LockRequest
	if ( reqSize != sizeof(LockRequest) ) {
		log("spider: bad msg12 request size of %" PRId32,reqSize);
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , EBADREQUEST );
		return;
	}
	// deny it if we are not synced yet! otherwise we core in
	// getTimeGlobal() below
	if ( ! isClockInSync() ) {
		// log it so we can debug it
		//log("spider: clock not in sync with host #0. so "
		//    "returning etryagain for lock reply");
		// let admin know why we are not spidering
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , ETRYAGAIN );
		return;
	}
	LockRequest *lr = (LockRequest *)request;
	//uint64_t lockKey = *(int64_t *)request;
	//int32_t lockSequence = *(int32_t *)(request+8);
	// is this a remove operation? assume not
	//bool remove = false;
	// get top bit
	//if ( lockKey & 0x8000000000000000LL ) remove = true;
	// mask it out
	//lockKey &= 0x7fffffffffffffffLL;
	// sanity check, just 6 bytes! (48 bits)
	if ( lr->m_lockKeyUh48 & 0xffff000000000000LL ) {
		char *xx=NULL;*xx=0; }
	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: got msg12 request uh48=%" PRId64
		    " remove=%" PRId32,
		    lr->m_lockKeyUh48, (int32_t)lr->m_removeLock);
	// get time
	int32_t nowGlobal = getTimeGlobal();
	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;
	// identify the requesting host by ip/port
	int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip ,
					      udpSlot->m_port );
	// this must be legit - sanity check
	if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
	// remove expired locks from locktable
	removeExpiredLocks ( hostId );
	int64_t lockKey = lr->m_lockKeyUh48;
	// check tree
	int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
	// put it here
	UrlLock *lock = NULL;
	// if there say no no
	if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );
	// if doing a remove operation and that was our hostid then unlock it
	if ( lr->m_removeLock &&
	     lock &&
	     lock->m_hostId == hostId &&
	     lock->m_lockSequence == lr->m_lockSequence ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock for lockkey=%" PRIu64
			    " hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// ok, at this point all remove ops return
	if ( lr->m_removeLock ) {
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}
	/////////
	//
	// add new lock
	//
	/////////
	// if lock > 1 hour old then remove it automatically!!
	if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
		// note it for now
		log("spider: removing lock after %" PRId32" seconds "
		    "for lockKey=%" PRIu64" hid=%" PRId32,
		    (nowGlobal - lock->m_timestamp),
		    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// if lock still there, do not grant another lock
	if ( lock ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: refusing lock for lockkey=%" PRIu64
			    " hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		reply[0] = 0;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}
	// make the new lock
	UrlLock tmp;
	tmp.m_hostId       = hostId;
	tmp.m_lockSequence = lr->m_lockSequence;
	tmp.m_timestamp    = nowGlobal;
	tmp.m_expires      = 0;
	tmp.m_firstIp      = lr->m_firstIp;
	tmp.m_collnum      = lr->m_collnum;
	// when the spider returns we remove its lock on reception of the
	// spiderReply, however, we actually just set the m_expires time
	// to 5 seconds into the future in case there is a current request
	// to get a lock for that url in progress. but, we do need to
	// indicate that the spider has indeed completed by setting
	// m_spiderOutstanding to true. this way, addToWaitingTree() will
	// not count it towards a "max spiders per IP" quota when deciding
	// on if it should add a new entry for this IP.
	tmp.m_spiderOutstanding = true;
	// this is set when all hosts in the group (shard) have granted the
	// lock and the host sends out a confirmLockAcquisition() request.
	// until then we do not know if the lock will be granted by all hosts
	// in the group (shard)
	tmp.m_confirmed = false;
	// put it into the table
	if ( ! ht->addKey ( &lockKey , &tmp ) ) {
		// return error if that failed!
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , g_errno );
		return;
	}
	// note it for now
	if ( g_conf.m_logDebugSpider )
		log("spider: granting lock for lockKey=%" PRIu64
		    " hid=%" PRId32,
		    lr->m_lockKeyUh48,hostId);
	// grant the lock
	reply[0] = 1;
	us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
	return;
}
// . returns true if all done, false if waiting for more replies
// . called once per shard host as each msg12 lock reply (or error) arrives
// . tallies grants; when every host has answered, either proceeds to
//   confirm the acquisition, retries, or caches the miss
bool Msg12::gotLockReply ( UdpSlot *slot ) {
	// no longer use this -- deliberate abort: this code path is
	// deprecated and must never be reached
	char *xx=NULL;*xx=0;
	// got reply
	m_numReplies++;
	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;
	// check for a hammer reply
	char *reply = slot->m_readBuf;
	int32_t replySize = slot->m_readBufSize;
	// if error, treat as a not grant
	if ( g_errno ) {
		bool logIt = true;
		// note it
		if ( g_conf.m_logDebugSpider )
			log("spider: got msg12 reply error = %s",
			    mstrerror(g_errno));
		// if we got an ETRYAGAIN when trying to confirm our lock
		// that means doledb was saving/dumping to disk and we
		// could not remove the record from doledb and add an
		// entry to the waiting tree, so we need to keep trying
		if ( g_errno == ETRYAGAIN && m_confirming ) {
			// count it again
			m_numRequests++;
			// use what we were using
			char *request = (char *)&m_confirmRequest;
			int32_t requestSize = sizeof(ConfirmRequest);
			Host *h = g_hostdb.getHost(slot->m_hostId);
			// send request to him
			UdpServer *us = &g_udpServer;
			if ( ! us->sendRequest ( request ,
						 requestSize ,
						 0x12 , // msgType
						 h->m_ip ,
						 h->m_port ,
						 h->m_hostId ,
						 NULL , // retSlotPtrPt
						 this , // state data
						 gotLockReplyWrapper ,
				 udpserver_sendrequest_infinite_timeout ) )
				return false;
			// error? don't spam the log!
			static int32_t s_last = 0;
			int32_t now = getTimeLocal();
			if ( now - s_last >= 1 ) {
				s_last = now;
				log("spider: error re-sending confirm "
				    "request: %s", mstrerror(g_errno));
			}
		}
		// only log every few seconds for ETRYAGAIN
		// (comment said 10, code uses 3)
		if ( g_errno == ETRYAGAIN ) {
			static time_t s_lastTime = 0;
			time_t now = getTimeLocal();
			logIt = false;
			if ( now - s_lastTime >= 3 ) {
				logIt = true;
				s_lastTime = now;
			}
		}
		if ( logIt )
			log ( "sploop: host had error getting lock url=%s"
			      ": %s" ,
			      m_url,mstrerror(g_errno) );
	}
	// grant or not: a one-byte reply of 1 means this host granted us
	// the lock
	if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++;
	// wait for all to get back
	if ( m_numReplies < m_numRequests ) return false;
	// all done if we were removing
	if ( m_removing ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: done removing all locks "
			     "(replies=%" PRId32") for %s",
			     m_numReplies,m_url);//m_sreq->m_url);
		// we are done
		m_gettingLocks = false;
		return true;
	}
	// all done if we were confirming
	if ( m_confirming ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: done confirming all locks "
			     "for %s uh48=%" PRId64,
			     m_url,m_origUh48);//m_sreq->m_url);
		// we are done
		m_gettingLocks = false;
		// . keep processing
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) return g_spiderLoop.spiderUrl2();
		// if we had a callback let our parent call it
		return true;
	}
	// if got ALL locks, spider it
	if ( m_grants == m_numReplies ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,
			     "spider: got lock for docid=lockkey=%" PRIu64,
			     m_lockKeyUh48);
		// flag this
		m_hasLock = true;
		// we are done
		//m_gettingLocks = false;
		///////
		//
		// now tell our group (shard) to remove from doledb
		// and re-add to waiting tree. the evalIpLoop() function
		// should skip this probable docid because it is in the
		// LOCK TABLE!
		//
		// This logic should allow us to spider multiple urls
		// from the same IP at the same time.
		//
		///////
		// returns false if would block
		if ( ! confirmLockAcquisition ( ) ) return false;
		// . we did it without blocking, maybe cuz we are a single
		//   node
		// . ok, they are all back, resume loop
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
		// all done
		return true;
	}
	// note it
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%" PRIu64" "
		     "(grants=%" PRId32")",
		     m_url,m_lockKeyUh48,m_grants);
	// . if it was locked by another then add to our lock cache so we do
	//   not try to lock it again
	// . if grants is not 0 then one host granted us the lock, but not
	//   all hosts, so we should probably keep trying on it until it is
	//   locked up by one host
	if ( m_grants == 0 ) {
		int32_t now = getTimeGlobal();
		g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL);
	}
	// reset again
	m_numRequests = 0;
	m_numReplies  = 0;
	// no need to remove them if none were granted because another
	// host in our group might have it 100% locked.
	if ( m_grants == 0 ) {
		// no longer in locks operation mode
		m_gettingLocks = false;
		// ok, they are all back, resume loop
		//if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
		// all done
		return true;
	}
	// note that
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: sending request to all in shard to "
		     "remove lock uh48=%" PRIu64". grants=%" PRId32,
		     m_lockKeyUh48,(int32_t)m_grants);
	// remove all locks we tried to get, BUT only if from our hostid!
	// no no! that doesn't quite work right... we might be the ones
	// locking it! i.e. another one of our spiders has it locked...
	if ( ! removeAllLocks ( ) ) return false; // true;
	// if did not block, how'd that happen?
	log("sploop: did not block in removeAllLocks: %s",
	    mstrerror(g_errno));
	return true;
}
// . returns false if blocked true otherwise
// . sets g_errno on error
// . requests the cluster-wide merge token from the token-manager host so
//   only one host merges at a time; "callback" fires when the token arrives
// . NOTE(review): currently short-circuited -- the unconditional
//   "return true" below disables the token protocol, so everything after
//   it is dead code kept for when it is re-enabled
bool Msg35::getToken ( void *state, void (*callback )(void *state),
		       char priority ){
	// if threads are disabled, we are probably repairing dbs
	// from main.cpp fixTitleRecs() or makeDbs() so no token needed
	if ( g_threads.areThreadsDisabled() ) return true;
	// you can also disable the token so twins can merge as the same time
	if ( ! g_conf.m_useMergeToken ) return true;
	// disable this until it works again
	return true;
	// . if only one host per group, you always have the token
	// . no, they can only have one merge going at a time
	//if ( g_hostdb.getNumHostsPerShard() == 1 ) return true;
	// . ensure not already registered
	// . this can happen if a client's get request arrives before their
	//   release request... so allow for that now
	for ( int32_t i = 0 ; i < 64 ; i++ ) {
		if ( m_clientWaits[i].m_isEmpty ) continue;
		if ( m_clientWaits[i].m_state != state ) continue;
		//g_errno = EBADENGINEER;
		log(LOG_REMIND,"merge: Already queued merge token request.");
		// return false since they'll be called when token comes
		//return false;
		break;
	}
	// get next available slot
	int32_t i;
	for ( i = 0 ; i < 64 ; i++ )
		if ( m_clientWaits[i].m_isEmpty ) break;
	// . if none empty bitch and return
	// . this should never happen, if it does than increase the limit
	//   otherwise, his callback will never be called!!
	if ( i >= 64 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"merge: msg35: Too many waiting for token.");
		return true;
	}
	ClientWait *c = &m_clientWaits [ i ];
	// get current time
	int32_t timestamp = getTimeGlobal();
	// the request is just the priority really.
	// wire format: [1B opcode][4B hostId][4B timestamp][1B priority]
	//              [1B client slot #] = 11 bytes
	char *p = c->m_buf;
	*p            = REQUEST_GETTOKEN ; p += 1;
	*(int32_t *)p = g_hostdb.m_hostId; p += 4;
	*(int32_t *)p = timestamp        ; p += 4;
	*p            = priority         ; p += 1;
	*p            = i;               ; p += 1; // client slot #
	// . send to the governing host, he must be up
	// . this returns NULL and sets g_errno on error
	// . NOTE(review): h is not checked for NULL before being
	//   dereferenced below -- confirm getTokenManager() cannot fail here
	Host *h = getTokenManager ( );
	// . the priority of this msg is low, use g_udpServer
	// . returns false and sets g_errno on error
	// . if there is a sending error, we will try sending token manager
	//   our client queue (queue of requests) during call to sync()
	if ( ! g_udpServer.sendRequest ( c->m_buf ,
					 11 , // requestLen
					 0x35 , // msgType 0x35
					 h->m_ip , // low priority ip
					 h->m_port , // low priority port
					 h->m_hostId,
					 NULL , // slotPtr
					 this , // state data
					 gotReplyWrapper35 ,
					 31536000 ) ) { // 1 yr timeout
		log("merge: Got error sending merge token request: %s.",
		    mstrerror(g_errno));
		g_errno = 0;
	}
	// save callback info even if request not launched successfully since
	// it will be retried during call to sync()
	c->m_state     = state;
	c->m_callback  = callback;
	c->m_priority  = priority;
	c->m_timestamp = timestamp;
	c->m_isEmpty   = false;
	if ( i > m_topUsedClient ) m_topUsedClient = i;
	// we blocked waiting for the reply
	return false;
}
// . callback once the datedb list of turkable docids has been read
// . scans the list for an unlocked docid, locks it for this user, and
//   kicks off the title-rec load so the event-eval page can be rendered
// . must run on host #0 only, since the turk lock table lives there
void gotDatedbList ( State60 *st ) {
	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
	// load turk lock table on first use only.
	// BUGFIX: "static" was missing, so the table was re-initialized and
	// re-loaded from disk on every call, dropping all in-memory locks
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}
	time_t now = getTimeGlobal();
	// shortcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative (low bit clear = delete key)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked.
		// BUGFIX: original mixed g_turkLock/g_turkLocks and
		// docId/docid identifiers; normalized to the table and
		// variable actually declared above
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// if there, expire locks older than one hour
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}
	SafeBuf sb;
	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");
	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}
	// lock it!
	TurkLock tt;
	// NOTE(review): st->m_user vs st->m_username naming is inconsistent
	// across this file -- confirm the State60 member name
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	// store under the docid key.
	// BUGFIX: original called g_lockTable.addLock(&tt), which names a
	// table other than the g_turkLocks hash table initialized above and
	// omits the docid key entirely
	if ( ! g_turkLocks.addKey ( &best , &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}
	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
// . displays the stats for a username // . show stats for every day we have them for // . in a big list // . if they click the day display all docids evaluated for that day // . show the accuracy for that day too // . how many docs they edited // . how many of those docs were verified by another // . and if there was consensus void gotTransdbList ( State60 *st ) { // get today's time range time_t now = getTimeGlobal(); // get start of today time_t dayStart = now / (24*3600); SafeBuf sb; // int16_tcut TcpSocket *s = st->m_s; // make about 200k of mem to write into if ( ! sb.reserve ( 200000 ) ) return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno)); // print description so they can clikc a button to start the turk sb.safePrintf("<html>\n" "<title>Event Editor</title>\n" "<body>\n" "<table width=\"100%%\" border=\"0\">\n" "<tr><td style=\"background-color:#0079ba;\">\n" "<center><font color=#00000>" "<h2>Event Editor</h2>\n" "</font></center></td>" "</tr></table>"); // print the content sb.safePrintf("<center><font size=4><blink>" "<b><a href=\"/pageturk?c=%s&edit=1\">" "Click here to start editing.</a></b></blink>" "</font><br><i>Please take your " "time to read the information below before you begin" "</i><br><font color=\"red\" size=2> Warning: Adult " "content might be presented to you." " You should be above 18 years of age to continue." "</center></font>",st->m_coll); sb.safePrintf("<font face=arial,sans-serif color=black size=3>" "<p>By clicking <i>Start Voting</i>, you will be " "presented with an interface for editing events. " "The editor will display a modified web page that " "contains one or more events. Each event's description " "will be highlight with a blue background. You can " "toggle whether a particular event is displayed by " "clicking on that event's ID. You can highlight one or " "multiple event descriptions at the same time. 
" "</p><p>" "By clicking on the section icons in the web page you " "can tell the editor that a virtual fence should be " "erected around that section. The fence will make sure " "that event descriptions can not span across it. Each " "event description must be fully contained either " "inside or outside the fence. However, you can also " "declare a section as a title section, which means that " "the text that the title section contains is free to be " "used by any event description." "</p>\n" "<p>When you are done erecting section fences, you " "submit your changes. The more changes you make the " "more points you earn. Other users may evaluate " "your edits for accuracy. You will be paid based on the " "points you earn as well as your accuracy. All " "transactions are listed in the table below.</p>" "<p>You may not change your username or password " "but you can change your email address. Your email " "address will be used to pay you with PayPal every " "Friday. Paypal fees will be deducted on your end. By " "using this service you agree to all stated Terms & " "Conditions.</p>" "</font>\n"); // get the user record User *uu = g_users.getUser ( username ); // print out their info, like paypal email sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Info</center>" "</td></tr>\n" "<tr>" "<td>Email</td>" "<td><input type=text value=%s></td>" "<td>email address used to pay with paypal</td>" "</tr>\n" "<tr><td colspan=10><input type=submit value=update>" "</td></tr>\n" "</table>\n" , uu->m_payPalEmail ); // print your stats here now sb.safePrintf("<table>\n" "<tr><td colspan=10><center>Your Stats</center>" "</td></tr>\n" "<tr>" "<td>date</td>" "<td>action</td>" "<td>amount</td>" "<td>desc</td>" "</tr>\n"); // int16_tcut RdbList *list = &st->m_list; int32_t lastDay = -1; int32_t totalReceives = 0; int32_t totalSubmits = 0; int32_t totalPasses = 0; int32_t totalFails = 0; // scan the list for ( ; ! 
list->isExhausted() ; ) { // get rec char *rec = list->getCurrentRecord(); char *data = list->getCurrentData(); int32_t dataSize = list->getCurrentDataSize(); // skip that list->skipCurrentRecord(); // skip if negative if ( (rec[0] & 0x01) == 0x00 ) continue; // get the time (global time - sync'd with host #0) time_t tt = g_transdb.getTimeStamp ( rec ); // get day # int32_t daynum = tt / (24*3600); // is it today? bool isToday = ( daynum >= dayStart ); // point to the Transaction Trans *trans = (Trans *)data; // if is today, print it out verbatim if ( isToday ) { // print it in html row format to match table above //printTrans ( &sb , rec ); sb.safePrintf("<tr>"); // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%H:%M:%S",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats if ( trans->m_actionType == AT_RECEIVE_DOC ) sb.safePrintf("<td>receive</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_SUBMIT_DOC ) sb.safePrintf("<td>submit</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64"</td>", (int32_t)trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_PASS_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was verified " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_FAIL_DOC ) sb.safePrintf("<td>verify</td>" "<td>%"INT32" pts</td>" "<td>docid=%"UINT64" was deemed to " "be incorrect " "by user=\"%s\"</td>", (int32_t)trans->m_number, trans->m_docId, trans->m_desc); else if ( trans->m_actionType == AT_ACCURACY_EVAL) sb.safePrintf("<td>accuracy eval</td>" "<td>%.02f</td>" "<td>docid=%"UINT64"</td>", trans->m_number, trans->m_docId); else if ( trans->m_actionType == AT_CHARGE) sb.safePrintf("<td>credit</td>" "<td>%.02f</td>" "<td>You made 
money.</td>", trans->m_number); else if ( trans->m_actionType == AT_PAYMENT) sb.safePrintf("<td>payment</td>" "<td>%.02f</td>" "<td>We paid you.</td>", trans->m_number); else if ( trans->m_actionType == AT_LOGIN) sb.safePrintf("<td>login</td>" "<td>-</td>" "<td>You logged in.</td>"); else if ( trans->m_actionType == AT_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You logged out.</td>"); else if ( trans->m_actionType == AT_AUTO_LOGOUT) sb.safePrintf("<td>logout</td>" "<td>-</td>" "<td>You were auto " "logged out.</td>"); else { char *xx=NULL;*xx=0; } sb.safePrintf("</tr>\n"); continue; } // if does not match last day, print out that last day's stats // and reset for next guy if ( daynum != lastDay && lastDay != -1 ) { // make it into a nice date time_t dd = lastDay * 86400; struct tm *timeStruct = localtime ( &dd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y",timeStruct); // print last days stats first sb.safePrintf("<td>%s</td>",ppp); // then stats sb.safePrintf("<tr>" "<td>receive</td>" "<td>%"INT32"</td>" "<td>Total received</td>" "</tr>\n", totalReceives); sb.safePrintf("<tr>" "<td>submit</td>" "<td>%"INT32"</td>" "<td>Total submitted</td>" "</tr>\n", totalSubmits); sb.safePrintf("<tr>" "<td>pass</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests passed</td>" "</tr>\n", totalPasses); sb.safePrintf("<tr>" "<td>fail</td>" "<td>%"INT32"</td>" "<td>Total accuracy tests failed</td>" "</tr>\n", totalFails); // reset as well totalReceived = 0; totalSubmits = 0; totalPasses = 0; totalFails = 0; } // remember last day # we processed for accumulating stats lastDay = daynum; // accum stats if ( trans->m_actionType == AT_RECEIVE_DOC ) totalReceives++; if ( trans->m_actionType == AT_SUBMIT_DOC ) totalSubmits++; if ( trans->m_actionType == AT_PASS_DOC ) totalPasses++; if ( trans->m_actionType == AT_FAIL_DOC ) totalFails++; } sb.safePrintf("</body></html>\n"); sendReply ( &sb ); }
// . returns false if blocked, otherwise true // . sets g_errno on error bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) { char *cgi; long cgiLen; StateStatsdb *st; try { st = new StateStatsdb; } catch ( ... ) { g_errno = ENOMEM; log(LOG_INFO, "PageStatsdb: failed to allocate state memory."); return true; } mnew( st, sizeof(StateStatsdb), "PageStatsdb" ); st->m_niceness = MAX_NICENESS; st->m_socket = s; st->m_request = *r; // hostId must be one of the following: // 0-n - a valid hostId // -1 - a sample (subset) of the hosts // -2 - all hosts // -3 - this host st->m_hostId = r->getLong( "host", -3 ); if ( st->m_hostId == -3 ) st->m_hostId = g_hostdb.getMyHostId(); // If we are pulling from multiple hosts, are we merging // the data into a single graph? // TODO: // - Make sure this always happens. Now our only concern // is how many stats we will be drawing. //st->m_mergeResults = (bool )r->getLong( "merge_results" , 1 ); // get session parameters st->m_cacti = (bool )r->getLong( "cacti" , 0 ); // get date parameters cgi = r->getString( "sdate" , &cgiLen , NULL ); st->m_startDate = genDate( cgi, cgiLen ); cgi = r->getString( "edate" , &cgiLen , NULL ); st->m_endDate = genDate( cgi, cgiLen ); st->m_dateCustom = (bool)r->getLong( "custom", 0 ); // default to 10 hours, i would do 1 day except that there are // some bugs that mess up the display a lot when i do that st->m_datePeriod = r->getLong( "date_period" , 36000 ); st->m_dateUnits = r->getLong( "date_units" , 1 );//SECS_PER_MIN st->m_now = (bool)r->getLong( "date_now" , 1 ); st->m_autoUpdate = (bool)r->getLong( "auto_update" , 0 ); // # samples in moving average st->m_samples = r->getLong( "samples" , 300 ); //if ( st->m_columns < MIN_COLUMNS || st->m_columns > MAX_COLUMNS ) // st->m_columns = DEF_COLUMNS; if ( st->m_now ) st->m_startDate = (time_t)getTimeGlobal(); st->m_startDateR = st->m_startDate; st->m_endDateR = st->m_endDate; if ( ! 
st->m_dateCustom ) { st->m_endDateR = st->m_startDateR - ( st->m_datePeriod * st->m_dateUnits ); st->m_endDate = st->m_endDateR; } if ( ! g_statsdb.makeGIF ( st->m_endDateR , st->m_startDateR , st->m_samples , &st->m_sb2 , st , sendReply ) ) return false; // if we didn't block call it ourselves directly sendReply ( st ); return true; }
// . called once every shard ("split") has answered the Msg39 multicast
// . harvests each reply, accumulates the estimated-hit total, merges the
//   docid lists, and (optionally) caches the merged serp in serpdb
// . returns true unless the serpdb cache add blocks (then false and the
//   gotSerpdbReplyWrapper callback fires later)
// . on any reply error sets both m_errno and g_errno and returns true
bool Msg3a::gotAllSplitReplies ( ) {
	// if any of the split requests had an error, give up and set m_errno
	// but don't set if for non critical errors like query truncation
	if ( m_errno ) { g_errno = m_errno; return true; }
	// also reset the finalbuf and the oldNumTopDocIds
	if ( m_finalBuf ) {
		mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
		m_finalBuf     = NULL;
		m_finalBufSize = 0;
	}
	// update our estimated total hits
	m_numTotalEstimatedHits = 0;
	for ( long i = 0; i < m_numHosts ; i++ ) {
		// get that host that gave us the reply
		//Host *h = g_hostdb.getHost(i);
		// . get the reply from multicast
		// . multicast should have destroyed all slots, but saved reply
		// . we are responsible for freeing the reply
		// . we need to call this even if g_errno or m_errno is
		//   set so we can free the replies in Msg3a::reset()
		// . if we don't call getBestReply() on it multicast should
		//   free it, because Multicast::m_ownReadBuf is still true
		Multicast *m = &m_mcast[i];
		bool  freeit       = false;
		long  replySize    = 0;
		long  replyMaxSize;
		char *rbuf;
		Msg39Reply *mr;
		// . only get it if the reply not already full
		// . if reply already processed, skip
		// . perhaps it had no more docids to give us or all termlists
		//   were exhausted on its disk and this is a re-call
		// . we have to re-process it for count m_numTotalEstHits, etc.
		rbuf = m->getBestReply ( &replySize    ,
					 &replyMaxSize ,
					 &freeit       ,
					 true          ); //stealIt?
		// cast it
		mr = (Msg39Reply *)rbuf;
		// in case of mem leak, re-label from "mcast" to this so we
		// can determine where it came from, "Msg3a-GBR"
		relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
		// . we must be able to free it... we must own it
		// . this is true if we should free it, but we should not have
		//   to free it since it is owned by the slot?
		if ( freeit ) {
			log(LOG_LOGIC,"query: msg3a: Steal failed.");
			// intentional crash to dump core on this logic error
			char *xx = NULL; *xx=0;
		}
		// bad reply?
		if ( ! mr ) {
			log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
			m_reply       [i] = NULL;
			m_replyMaxSize[i] = 0;
			// it might have been timed out, just ignore it!!
			continue;
			// NOTE(review): everything below this "continue" is
			// intentionally dead code -- the old "treat a NULL
			// reply as fatal" path, kept for reference
			// if size is 0 it can be Msg39 giving us an error!
			g_errno = EBADREPLYSIZE;
			m_errno = EBADREPLYSIZE;
			// all reply buffers should be freed on reset()
			return true;
		}
		// how did this happen? 29 is the minimum sane reply size --
		// TODO confirm against Msg39Reply's serialized header size
		if ( replySize < 29 && ! mr->m_errno ) {
			// if size is 0 it can be Msg39 giving us an error!
			g_errno = EBADREPLYSIZE;
			m_errno = EBADREPLYSIZE;
			log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
			    replySize);
			// all reply buffers should be freed on reset()
			return true;
		}
		// can this be non-null? we shouldn't be overwriting one
		// without freeing it...
		if ( m_reply[i] )
			// note the mem leak now
			log("query: mem leaking a 0x39 reply");
		// cast it and set it
		m_reply       [i] = mr;
		m_replyMaxSize[i] = replyMaxSize;
		// deserialize it (just sets the ptr_ and size_ member vars)
		//mr->deserialize ( );
		deserializeMsg ( sizeof(Msg39Reply) ,
				 &mr->size_docIds,
				 &mr->size_clusterRecs,
				 &mr->ptr_docIds,
				 mr->m_buf );
		// sanity check: the shard must have evaluated the same
		// number of query terms we asked about
		if ( mr->m_nqt != m_q->getNumTerms() ) {
			g_errno = EBADREPLY;
			m_errno = EBADREPLY;
			log("query: msg3a: Split reply qterms=%li != %li.",
			    (long)mr->m_nqt,(long)m_q->getNumTerms() );
			return true;
		}
		// return if split had an error, but not for a non-critical
		// error like query truncation
		if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
			g_errno = mr->m_errno;
			m_errno = mr->m_errno;
			log("query: msg3a: Split had error: %s",
			    mstrerror(g_errno));
			return true;
		}
		// skip down here if reply was already set
		//skip:
		// add the total hits from each split, this is how many
		// total results the latest split is estimated to be able to
		// return
		// . THIS should now be exact since we read all termlists
		//   of posdb...
		m_numTotalEstimatedHits += mr->m_estimatedHits;
		// debug log stuff
		if ( ! m_debug ) continue;
		// cast these for printing out
		long long *docIds = (long long *)mr->ptr_docIds;
		score_t   *scores = (score_t   *)mr->ptr_scores;
		// print out every docid in this split reply
		for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
			// print out score_t
			logf( LOG_DEBUG,
			      "query: msg3a: [%lu] %03li) "
			      "split=%li docId=%012llu domHash=0x%02lx "
			      "score=%lu" ,
			      (unsigned long)this ,
			      j ,
			      i ,
			      docIds [j] ,
			      (long)g_titledb.getDomHash8FromDocId(docIds[j]),
			      (long)scores[j] );
		}
	}
	// this seems to always return true!
	mergeLists ( );
	if ( ! m_r->m_useSeoResultsCache ) return true;
	// now cache the reply
	SafeBuf cr;
	// per-docid payload: 8-byte docid + 4-byte score + 4-byte sitehash;
	// header: timestamp + numDocIds + estimated-hits longs
	long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
	long need = sizeof(key_t) + 4 + dataSize;
	bool status = cr.reserve ( need );
	// sanity: cache key must have the positive/delete bit set
	if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; }
	// ignore errors
	g_errno = 0;
	// return on error with g_errno cleared if cache add failed
	if ( ! status ) return true;
	// add to buf otherwise
	cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
	cr.safeMemcpy ( &dataSize , 4 );
	long now = getTimeGlobal();
	cr.pushLong ( now );
	cr.pushLong ( m_numDocIds );
	cr.pushLong ( m_numTotalEstimatedHits );//Results );
	long max = m_numDocIds;
	// then the docids
	for ( long i = 0 ; i < max ; i++ )
		cr.pushLongLong(m_docIds[i] );
	for ( long i = 0 ; i < max ; i++ )
		cr.pushFloat(m_scores[i]);
	for ( long i = 0 ; i < max ; i++ )
		cr.pushLong(getSiteHash26(i));
	// sanity: serialized length must match our size arithmetic above
	if ( cr.length() != need ) { char *xx=NULL; *xx=0; }
	// make these
	key_t startKey;
	key_t endKey;
	startKey = m_ckey;
	// clear delbit
	startKey.n0 &= 0xfffffffffffffffeLL;
	// end key is us
	endKey = m_ckey;
	// that is the single record
	m_seoCacheList.set ( cr.getBufStart() ,
			     cr.length(),
			     cr.getBufStart(), // alloc
			     cr.getCapacity(), // alloc size
			     (char *)&startKey,
			     (char *)&endKey,
			     -1,  // fixeddatasize
			     true, // owndata?
			     false,// use half keys?
			     sizeof(key_t) );
	// do not allow cr to free it, msg1 will
	cr.detachBuf();
	// note it
	//log("seopipe: storing ckey=%s q=%s"
	//    ,KEYSTR(&m_ckey,12)
	//    ,m_r->ptr_query
	//    );
	//log("msg1: sending niceness=%li",(long)m_r->m_niceness);
	// this will often block, but who cares!? it just sends a request off
	if ( ! m_msg1.addList ( &m_seoCacheList ,
				RDB_SERPDB,//RDB_CACHEDB,
				m_r->ptr_coll,
				this, // state
				gotSerpdbReplyWrapper, // callback
				false, // forcelocal?
				m_r->m_niceness ) ) {
		//log("blocked");
		return false;
	}
	// we can safely delete m_msg17... just return true
	return true;
}
// a cacheTime of -1 means browser should not cache at all void HttpMime::makeMime ( long totalContentLen , long cacheTime , time_t lastModified , long offset , long bytesToSend , char *ext , bool POSTReply , char *contentType , char *charset , long httpStatus , char *cookie ) { // assume UTF-8 //if ( ! charset ) charset = "utf-8"; // . make the content type line // . uses a static buffer if ( ! contentType ) contentType = (char *)getContentTypeFromExtension ( ext ); // do not cache plug ins if ( contentType && strcmp(contentType,"application/x-xpinstall")==0) cacheTime = -2; // assume UTF-8, but only if content type is text // . No No No!!! // . This prevents charset specification in html files // . -partap //if ( ! charset && contentType && strncmp(contentType,"text",4)==0) // charset = "utf-8"; // this is used for bz2 and gz files (mp3?) const char *contentEncoding = getContentEncodingFromExtension ( ext ); // the string char enc[128]; if ( contentEncoding ) sprintf ( enc , "Content-Encoding: %s\r\n", contentEncoding ); else enc[0] = '\0'; // get the time now //time_t now = getTimeGlobal(); time_t now; if ( isClockInSync() ) now = getTimeGlobal(); else now = getTimeLocal(); // get the greenwhich mean time (GMT) char ns[128]; struct tm *timeStruct = gmtime ( &now ); // Wed, 20 Mar 2002 16:47:30 GMT strftime ( ns , 126 , "%a, %d %b %Y %T GMT" , timeStruct ); // if lastModified is 0 use now if ( lastModified == 0 ) lastModified = now; // convert lastModified greenwhich mean time (GMT) char lms[128]; timeStruct = gmtime ( &lastModified ); // Wed, 20 Mar 2002 16:47:30 GMT strftime ( lms , 126 , "%a, %d %b %Y %T GMT" , timeStruct ); // . the pragma no cache string (used just for proxy servers?) // . also use cache-control: for the browser itself (HTTP1.1, though) // . pns = "Pragma: no-cache\nCache-Control: no-cache\nExpires: -1\n"; char tmp[128]; char *pns ; // with cache-control on, when you hit the back button, it reloads // the page, this is bad for most things... 
so we only avoid the // cache for index.html and PageAddUrl.cpp (the main and addurl page) if ( cacheTime == -2 ) pns = "Cache-Control: no-cache\r\n" "Pragma: no-cache\r\n" "Expires: -1\r\n"; // so when we click on a control link, it responds correctly. // like turning spiders on. else if ( cacheTime == -1 ) pns = "Pragma: no-cache\r\n" "Expires: -1\r\n"; // don't specify cache times if it's 0 (let browser regulate it) else if ( cacheTime == 0 ) pns = ""; // otherwise, expire tag: "Expires: Wed, 23 Dec 2001 10:23:01 GMT" else { time_t expDate = now + cacheTime; timeStruct = gmtime ( &expDate ); strftime ( tmp , 100 , "Expires: %a, %d %b %Y %T GMT\r\n", timeStruct ); pns = tmp; } // . set httpStatus // . a reply to a POST (not a GET or HEAD) should be 201 char *p = m_buf; char *smsg = ""; if ( POSTReply ) { if ( httpStatus == -1 ) httpStatus = 200; if ( httpStatus == 200 ) smsg = " OK"; if ( ! charset ) charset = "utf-8"; //sprintf ( m_buf , p += sprintf ( p, "HTTP/1.0 %li%s\r\n" "Date: %s\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "Content-Length: %li\r\n" //"Expires: Wed, 23 Dec 2003 10:23:01 GMT\r\n" //"Expires: -1\r\n" "Connection: Close\r\n" "%s" "Content-Type: %s\r\n\r\n", //"Connection: Keep-Alive\r\n" //"%s" //"Location: f**k\r\n" //"Location: http://192.168.0.4:8000/cgi/3.cgi\r\n" //"Last-Modified: %s\r\n\r\n" , httpStatus , smsg , ns , totalContentLen , enc , contentType ); //pns , //ns ); //lms ); } // . is it partial content? // . if bytesToSend is < 0 it means "totalContentLen" else if ( offset > 0 || bytesToSend != -1 ) { if ( httpStatus == -1 ) httpStatus = 206; if ( ! 
charset ) charset = "utf-8"; //sprintf ( m_buf , p += sprintf( p, "HTTP/1.0 %li Partial content\r\n" "%s" "Content-Length: %li\r\n" "Content-Range: %li-%li(%li)\r\n"// added "bytes" "Connection: Close\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "%s" "Date: %s\r\n" "Last-Modified: %s\r\n" "Content-Type: %s\r\n", httpStatus , enc ,bytesToSend , offset , offset + bytesToSend , totalContentLen , pns , ns , lms , contentType ); // otherwise, do a normal mime } else { char encoding[256]; if (charset) sprintf(encoding, "; charset=%s", charset); else encoding[0] = '\0'; if ( httpStatus == -1 ) httpStatus = 200; if ( httpStatus == 200 ) smsg = " OK"; //sprintf ( m_buf , p += sprintf( p, "HTTP/1.0 %li%s\r\n" // make it at least 4 spaces so we can change // the length of the content should we insert // a login bar in Proxy::storeLoginBar() "Content-Length: %04li\r\n" "%s" "Content-Type: %s", httpStatus , smsg , totalContentLen , enc , contentType ); if ( charset ) p += sprintf ( p , "; charset=%s", charset ); p += sprintf ( p , "\r\n"); p += sprintf ( p , //"Connection: Keep-Alive\r\n" "Connection: Close\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "%s" "Date: %s\r\n" "Last-Modified: %s\r\n" , pns , ns , lms ); } // write the cookie if we have one if (cookie) { // now it is a list of Set-Cookie: x=y\r\n lines //p += sprintf ( p, "Set-Cookie: %s\r\n", cookie); if ( strncmp(cookie,"Set-Cookie",10 ) ) p += sprintf(p,"Set-Cookie: "); p += sprintf ( p, "%s", cookie); if ( p[-1] != '\n' && p[-2] != '\r' ) { *p++ = '\r'; *p++ = '\n'; } } // write another line to end the mime p += sprintf(p, "\r\n"); // set the mime's length //m_bufLen = gbstrlen ( m_buf ); m_bufLen = p - m_buf; }
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree
// . dumped recs are ordered by key if "orderedDump" was true in call to set()
//   otherwise, lists are ordered by node #
// . we write each list of recs to the file until the whole tree has been done
// . we delete all records in list from the tree after we've written the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
//   (with recall=true so we re-enter at the "skip:" label instead of
//   re-fetching a list)
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	// variable-width key: start the scan ceiling at the max key
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1 = nowSecs - 60;
	}
	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list ,
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{
		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;
		// on a recall (re-entry after a blocked write) the list was
		// already fetched -- jump straight to the post-fetch logic
		if ( recall ) goto skip;
		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		// fetch the next batch of records from either the tree or
		// the buckets, whichever backs this rdb
		if(m_tree)
			status = m_tree->getList ( m_collnum ,
						   m_nextKey ,
						   maxEndKey ,
						   m_maxBufSize , // max recSizes
						   m_list ,
						   &m_numPosRecs ,
						   &m_numNegRecs ,
						   m_useHalfKeys ,
						   niceness );
		else if(m_buckets)
			status = m_buckets->getList ( m_collnum,
						      m_nextKey ,
						      maxEndKey ,
						      m_maxBufSize , // max recSizes
						      m_list ,
						      &m_numPosRecs ,
						      &m_numNegRecs ,
						      m_useHalfKeys );
		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();
		// if(!m_list->checkList_r ( false , // removeNegRecs?
		// 			  false , // sleep on problem?
		// 			  m_rdb->m_rdbId )) {
		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
		// }
	skip:
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey to lastKey+1; if the increment wrapped
		// around past the max key we have scanned everything
		//m_nextKey = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
		// debug msg
		//log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
		//log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.",
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log(
			    "db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}
// . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) { // . get fields from cgi field of the requested url // . get the search query long urlLen = 0; char *url = r->getString ( "u" , &urlLen , NULL /*default*/); // see if they provided a url of a file of urls if they did not // provide a url to add directly //bool isAdmin = g_collectiondb.isAdmin ( r , s ); bool isAdmin = r->getIsLocal(); long ufuLen = 0; char *ufu = NULL; if ( isAdmin ) // get the url of a file of urls (ufu) ufu = r->getString ( "ufu" , &ufuLen , NULL ); // can't be too long, that's obnoxious if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) { g_errno = EBUFTOOSMALL; g_msg = " (error: url too long)"; return g_httpServer.sendErrorReply(s,500,"url too long"); } // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // get collection rec CollectionRec *cr = g_collectiondb.getRec ( coll ); // bitch if no collection rec found if ( ! cr ) { g_errno = ENOCOLLREC; g_msg = " (error: no collection)"; return g_httpServer.sendErrorReply(s,500,"no coll rec"); } // . make sure the ip is not banned // . we may also have an exclusive list of IPs for private collections if ( ! cr->hasSearchPermission ( s ) ) { g_errno = ENOPERM; g_msg = " (error: permission denied)"; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // make a new state State1 *st1 ; try { st1 = new (State1); } catch ( ... 
) { g_errno = ENOMEM; log("PageAddUrl: new(%i): %s", sizeof(State1),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } mnew ( st1 , sizeof(State1) , "PageAddUrl" ); // save socket and isAdmin st1->m_socket = s; st1->m_isAdmin = isAdmin; // assume no url buf yet, set below //st1->m_ubuf = NULL; //st1->m_ubufAlloc = NULL; //st1->m_metaList = NULL; // save the url st1->m_url[0] = '\0'; if ( url ) { // normalize and add www. if it needs it Url uu; uu.set ( url , gbstrlen(url) , true ); // remove >'s i guess and store in st1->m_url[] buffer st1->m_urlLen=cleanInput ( st1->m_url, MAX_URL_LEN, uu.getUrl(), uu.getUrlLen() ); // point to that as the url "buf" to add //st1->m_ubuf = st1->m_url; //st1->m_ubufSize = urlLen; //st1->m_ubufAlloc = NULL; // do not free it! } // save the "ufu" (url of file of urls) st1->m_ufu[0] = '\0'; st1->m_ufuLen = ufuLen; memcpy ( st1->m_ufu , ufu , ufuLen ); st1->m_ufu[ufuLen] = '\0'; st1->m_doTuringTest = cr->m_doTuringTest; char *username = g_users.getUsername(r); if(username) strcpy(st1->m_username,username); //st1->m_user = g_pages.getUserType ( s , r ); st1->m_spiderLinks = true; st1->m_strip = true; //st1->m_raw = r->getLong("raw",0); // init state2 for ( long i = 0; i < 5; i++ ){ st1->m_state2[i].m_buf = NULL; st1->m_state2[i].m_bufLen = 0; st1->m_state2[i].m_bufMaxLen = 0; } // save the collection name in the State1 class if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN; strncpy ( st1->m_coll , coll , collLen ); st1->m_coll [ collLen ] = '\0'; // assume they answered turing test correctly st1->m_goodAnswer = true; // if addurl is turned off, just print "disabled" msg if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false ); // can also be turned off in the collection rec if ( ! 
cr->m_addUrlEnabled ) return sendReply ( st1 , false ); // or if in read-only mode if ( g_conf.m_readOnlyMode ) return sendReply ( st1 , false ); // cannot add if another Msg10 from here is still in progress if ( s_inprogress ) return sendReply ( st1 , true ); // use now as the spiderTime // get ip of submitter //unsigned long h = ipdom ( s->m_ip ); // . use top 2 bytes now, some isps have large blocks // . if this causes problems, then they can do pay for inclusion unsigned long h = iptop ( s->m_ip ); long codeLen; char* code = r->getString("code", &codeLen); if(g_autoBan.hasCode(code, codeLen, s->m_ip)) { long uipLen = 0; char* uip = r->getString("uip",&uipLen); long hip = 0; //use the uip when we have a raw query to test if //we can submit if(uip) { hip = atoip(uip, uipLen); h = iptop( hip ); } } st1->m_strip = r->getLong("strip",0); // Remember, for cgi, if the box is not checked, then it is not // reported in the request, so set default return value to 0 long spiderLinks = r->getLong("spiderLinks",-1); // also support all lowercase like PageInject.cpp uses if ( spiderLinks == -1 ) spiderLinks = r->getLong("spiderlinks",0); // . should we force it into spiderdb even if already in there // . use to manually update spider times for a url // . however, will not remove old scheduled spider times // . mdw: made force on the default st1->m_forceRespider = r->getLong("force",1); // 0); long now = getTimeGlobal(); // . allow 1 submit every 1 hour // . restrict by submitter domain ip if ( ! st1->m_isAdmin && ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) { // return error page g_errno = ETOOEARLY; return sendReply ( st1 , true ); } //st1->m_query = r->getString( "qts", &st1->m_queryLen ); // check it, if turing test is enabled for this collection if ( ! st1->m_isAdmin && cr->m_doTuringTest && ! 
g_turingTest.isHuman(r) ) { // log note so we know it didn't make it g_msg = " (error: bad answer)"; //log("PageAddUrl:: addurl failed for %s : bad answer", // iptoa(s->m_ip)); st1->m_goodAnswer = false; return sendReply ( st1 , true /*addUrl enabled?*/ ); } //if ( st1->m_queryLen > 0 ) // return getPages( st1 ); // if no url given, just print a blank page if ( ! url ) return sendReply ( st1 , true ); // // make a SpiderRequest // SpiderRequest *sreq = &st1->m_sreq; // reset it sreq->reset(); // make the probable docid long long probDocId = g_titledb.getProbableDocId ( st1->m_url ); // make one up, like we do in PageReindex.cpp long firstIp = (probDocId & 0xffffffff); // . now fill it up // . TODO: calculate the other values... lazy!!! (m_isRSSExt, // m_siteNumInlinks,...) sreq->m_isNewOutlink = 1; sreq->m_isAddUrl = 1; sreq->m_addedTime = now; sreq->m_fakeFirstIp = 1; sreq->m_probDocId = probDocId; sreq->m_firstIp = firstIp; sreq->m_hopCount = 0; // its valid if root Url uu; uu.set ( st1->m_url ); if ( uu.isRoot() ) sreq->m_hopCountValid = true; // too big? //long len = st1->m_urlLen; // the url! includes \0 strcpy ( sreq->m_url , st1->m_url ); // call this to set sreq->m_dataSize now sreq->setDataSize(); // make the key dude -- after setting url sreq->setKey ( firstIp , 0LL, false ); // need a fake first ip lest we core! //sreq->m_firstIp = (pdocId & 0xffffffff); // how to set m_firstIp? i guess addurl can be throttled independently // of the other urls??? use the hash of the domain for it! long dlen; char *dom = getDomFast ( st1->m_url , &dlen ); // fake it for this... //sreq->m_firstIp = hash32 ( dom , dlen ); // sanity if ( ! dom ) { g_errno = EBADURL; return sendReply ( st1 , true ); } // shortcut Msg4 *m = &st1->m_msg4; // now add that to spiderdb using msg4 if ( ! 
m->addMetaList ( (char *)sreq , sreq->getRecSize() , coll , st1 , // state addedStuff , MAX_NICENESS , RDB_SPIDERDB ) ) // we blocked return false; // send back the reply return sendReply ( st1 , true ); }