void RdbMerge::doneMerging ( ) {
	// let RdbDump free its m_verifyBuf buffer if it existed
	m_dump.reset();
	// debug msg
	//fprintf(stderr,"exiting, g_errno=%s!\n",mstrerror(g_errno));
	//exit(-1);
	// . free the list's memory, reset() doesn't do it
	// . when merging titledb i'm still seeing 200MB allocs to read from
	//   tfndb.
	m_list.freeList();
	// nuke our msg3
	//delete (m_msg3);
	// log a msg
	log(LOG_INFO,"db: Merge status: %s.",mstrerror(g_errno));
	// . reset our class
	// . this will free its cutoff keys buffer, trash buffer, treelist
	// . TODO: should we not reset to keep the mem handy for next time
	//   to help avoid out of mem errors?
	m_msg5.reset();
	// . do we really need these anymore?
	// . turn these off before calling incorporateMerge() since it
	//   will call attemptMerge() on all the other dbs
	m_isMerging   = false;
	m_isSuspended = false;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if ( ! (base = getRdbBase(m_rdbId,m_coll)) ) return;
	// pass g_errno on to incorporateMerge() so merged file can be unlinked
	base->incorporateMerge ( );
	// nuke the lock so others can merge
	//s_isMergeLocked = false;
}
void doneScanningWrapper ( void *state ) {
	Msg3 *THIS = (Msg3 *) state;
	// inc the scan count
	THIS->m_numScansCompleted++;
	// we decided to try to ignore these errors
	if ( g_errno == EINTR ) {
		log("net: Interrupted system call while reading file. "
		    "Ignoring.");
		g_errno = 0;
	}
	// if we had an error, remember it
	if ( g_errno ) {
		// get base, returns NULL and sets g_errno to ENOCOLLREC on err
		RdbBase *base = getRdbBase( THIS->m_rdbId, THIS->m_collnum );
		char *dbname = "NOT FOUND";
		if ( base ) dbname = base->m_dbname;
		int32_t tt = LOG_WARN;
		if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
		log(tt,"net: Reading %s had error: %s.",
		    dbname,mstrerror(g_errno));
		THIS->m_errno = g_errno;
		g_errno = 0;
	}
	// return now if we're awaiting more scan completions
	if ( THIS->m_numScansCompleted < THIS->m_numScansStarted ) return;
	// . give control to doneScanning
	// . return if it blocks
	if ( ! THIS->doneScanning() ) return;
	// if one of our lists was *huge* and could not alloc mem, it was
	// due to corruption
	if ( THIS->m_hadCorruption ) g_errno = ECORRUPTDATA;
	// if it doesn't block call the callback, g_errno may be set
	THIS->m_callback ( THIS->m_state );
}
void RdbMerge::doneMerging ( ) {
	// save this
	int32_t saved = g_errno;
	// let RdbDump free its m_verifyBuf buffer if it existed
	m_dump.reset();
	// debug msg
	//fprintf(stderr,"exiting, g_errno=%s!\n",mstrerror(g_errno));
	//exit(-1);
	// . free the list's memory, reset() doesn't do it
	// . when merging titledb i'm still seeing 200MB allocs to read from
	//   tfndb.
	m_list.freeList();
	// nuke our msg3
	//delete (m_msg3);
	// log a msg
	log(LOG_INFO,"db: Merge status: %s.",mstrerror(g_errno));
	// . reset our class
	// . this will free its cutoff keys buffer, trash buffer, treelist
	// . TODO: should we not reset to keep the mem handy for next time
	//   to help avoid out of mem errors?
	m_msg5.reset();
	// . do we really need these anymore?
	// . turn these off before calling incorporateMerge() since it
	//   will call attemptMerge() on all the other dbs
	m_isMerging   = false;
	m_isSuspended = false;
	// if the collection rec was deleted while we were merging files for
	// it then the rdbbase will be NULL, so just bail out here
	if ( saved == ENOCOLLREC ) return;
	// if we are exiting then don't bother renaming the files around now.
	// this prevents a core in RdbBase::incorporateMerge()
	if ( g_process.m_mode == EXIT_MODE ) {
		log("merge: exiting. not ending merge.");
		return;
	}
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return;
	}
	// pass g_errno on to incorporateMerge() so merged file can be unlinked
	base->incorporateMerge ( );
	// nuke the lock so others can merge
	//s_isMergeLocked = false;
}
void setTermFreqWeights ( char *coll, Query *q, long long *termFreqs,
			  float *termFreqWeights ) {
	long long numDocsInColl = 0;
	RdbBase *base = getRdbBase ( RDB_CLUSTERDB , coll );
	if ( base ) numDocsInColl = base->getNumGlobalRecs();
	// sanity: if the count came back negative, clamp it to 1 so the
	// weight computation below cannot divide by zero
	if ( numDocsInColl < 0 ) {
		log("query: Got num docs in coll of %lli < 0",numDocsInColl);
		numDocsInColl = 1;
	}
	// now get term freqs again, like the good old days
	long long *termIds = q->getTermIds();
	// just use rdbmap to estimate!
	for ( long i = 0 ; i < q->getNumTerms(); i++ ) {
		long long tf = g_posdb.getTermFreq ( coll , termIds[i] );
		if ( termFreqs ) termFreqs[i] = tf;
		float tfw = getTermFreqWeight ( tf , numDocsInColl );
		termFreqWeights[i] = tfw;
	}
}
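// . getTermFreqWeight() itself is not shown in this listing; the sketch
//   below only illustrates the inverse-document-frequency style of weighting
//   that the call above implies, and the real formula may well differ.
// . "getTermFreqWeightSketch" is a hypothetical name so it does not collide
//   with the real function; it uses log() from <math.h>.
static float getTermFreqWeightSketch ( long long termFreq ,
				       long long numDocsInColl ) {
	// clamp so the log() argument stays positive even for bad inputs
	if ( numDocsInColl < 1 ) numDocsInColl = 1;
	if ( termFreq      < 1 ) termFreq      = 1;
	if ( termFreq > numDocsInColl ) termFreq = numDocsInColl;
	// classic idf: rarer terms get a larger weight
	return (float)log ( (double)numDocsInColl / (double)termFreq ) + 1.0f;
}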
// . but now that we may get a list remotely to fix data corruption, // this may indeed block bool Msg3::doneScanning ( ) { QUICKPOLL(m_niceness); // . did we have any error on any scan? // . if so, repeat ALL of the scans g_errno = m_errno; // 2 retry is the default int32_t max = 2; // see if explicitly provided by the caller if ( m_maxRetries >= 0 ) max = m_maxRetries; // now use -1 (no max) as the default no matter what max = -1; // ENOMEM is particulary contagious, so watch out with it... if ( g_errno == ENOMEM && m_maxRetries == -1 ) max = 0; // msg0 sets maxRetries to 2, don't let max stay set to -1 if ( g_errno == ENOMEM && m_maxRetries != -1 ) max = m_maxRetries; // when thread cannot alloc enough read buf it keeps the read buf // set to NULL and BigFile.cpp sets g_errno to EBUFTOOSMALL if ( g_errno == EBUFTOOSMALL && m_maxRetries == -1 ) max = 0; // msg0 sets maxRetries to 2, don't let max stay set to -1 if ( g_errno == EBUFTOOSMALL && m_maxRetries != -1 ) max = m_maxRetries; // . if no thread slots available, that hogs up serious memory. // the size of Msg3 is 82k, so having just 5000 of them is 430MB. // . i just made Msg3 alloc mem when it needs more than about 2k // so this problem is greatly reduced, therefore let's keep // retrying... forever if no thread slots in thread queue since // we become the thread queue in a way. if ( g_errno == ENOTHREADSLOTS ) max = -1; // this is set above if the map has the same consecutive key repeated // and the read is enormous if ( g_errno == ECORRUPTDATA ) max = 0; // usually bad disk failures, don't retry those forever //if ( g_errno == EIO ) max = 3; // no, now our hitachis return these even when they're good so // we have to keep retrying forever if ( g_errno == EIO ) max = -1; // count these so we do not take drives offline just because // kernel ring buffer complains... if ( g_errno == EIO ) g_numIOErrors++; // bail early on high priority reads for these errors if ( g_errno == EDISKSTUCK && m_niceness == 0 ) max = 0; if ( g_errno == EIO && m_niceness == 0 ) max = 0; // how does this happen? we should never bail out on a low priority // disk read... we just wait for it to complete... if ( g_errno == EDISKSTUCK && m_niceness != 0 ) { char *xx=NULL;*xx=0;} // on I/O, give up at call it corrupt after a while. some hitachis // have I/O errros on little spots, like gk88, maybe we can fix him if ( g_errno == EIO && m_retryNum >= 5 ) { m_errno = ECORRUPTDATA; m_hadCorruption = true; // do not do any retries any more max = 0; } // convert m_errno to ECORRUPTDATA if it is EBUFTOOSMALL and the // max of the bytesToRead are over 500MB. // if bytesToRead was ludicrous, then assume that the data file // was corrupted, the map was regenerated and it patched // over the corrupted bits which were 500MB or more in size. // we cannot practically allocate that much, so let's just // give back an empty buffer. treat it like corruption... // the way it patches is to store the same key over all the corrupted // pages, which can get pretty big. so if you read a range with that // key you will be hurting!! // this may be the same scenario as when the rdbmap has consecutive // same keys. see above where we set m_errno to ECORRUPTDATA... if ( g_errno == EBUFTOOSMALL ) { int32_t biggest = 0; for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { if ( m_scans[i].m_bytesToRead < biggest ) continue; biggest = m_scans[i].m_bytesToRead; } if ( biggest > 500000000 ) { log("db: Max read size was %" PRId32" > 500000000. 
Assuming " "corrupt data in data file.",biggest); m_errno = ECORRUPTDATA; m_hadCorruption = true; // do not do any retries on this, the read was > 500MB max = 0; } } // if shutting down gb then limit to 20 so we can shutdown because // it can't shutdown until all threads are out of the queue i think if ( g_process.m_mode == EXIT_MODE && max < 0 ) { //log("msg3: forcing retries to 0 because shutting down"); max = 0; } // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base = getRdbBase( m_rdbId, m_collnum ); if ( ! base ) { return true; } // this really slows things down because it blocks the cpu so // leave it out for now #ifdef GBSANITYCHECK // check for corruption here, do not do it again in Msg5 if we pass if ( ! g_errno ) { // && g_conf.m_doErrorCorrection ) { int32_t i; for ( i = 0 ; i < m_numFileNums ; i++ ) if ( ! m_lists[i].checkList_r ( false, false ) ) break; if ( i < m_numFileNums ) { g_errno = ECORRUPTDATA; m_errno = ECORRUPTDATA; max = g_conf.m_corruptRetries; // try 100 times log("db: Encountered corrupt list in file %s.", base->getFile(m_fileNums[i])->getFilename()); } else m_listsChecked = true; } #endif // try to fix this error i've seen if ( g_errno == EBADENGINEER && max == -1 ) max = 100; // . if we had a ETRYAGAIN error, then try again now // . it usually means the whole file or a part of it was deleted // before we could finish reading it, so we should re-read all now // . RdbMerge deletes BigFiles after it merges them and also chops // off file heads // . now that we have threads i'd imagine we'd get EBADFD or something // . i've also seen "illegal seek" as well if ( m_errno && (m_retryNum < max || max < 0) && // this will complete in due time, we can't call a sleep wrapper // on it because the read is really still pending... m_errno != EDISKSTUCK ) { // print the error static time_t s_time = 0; time_t now = getTime(); if ( now - s_time > 5 || g_errno != ENOTHREADSLOTS ) { log("net: Had error reading %s: %s. Retrying. " "(retry #%" PRId32")", base->m_dbname,mstrerror(m_errno) , m_retryNum ); s_time = now; } // send email alert if in an infinite loop, but don't send // more than once every 2 hours static int32_t s_lastSendTime = 0; if ( m_retryNum == 100 && getTime() - s_lastSendTime > 3600*2){ // remove this for now it is going off all the time //g_pingServer.sendEmail(NULL,//g_hostdb.getMyHost(), // "100 read retries",true); s_lastSendTime = getTime(); } // clear g_errno cuz we should for call to readList() g_errno = 0; // free the list buffer since if we have 1000 Msg3s retrying // it will totally use all of our memory for ( int32_t i = 0 ; i < m_numChunks ; i++ ) m_lists[i].destructor(); // count retries m_retryNum++; // backoff scheme, wait 100ms more each time int32_t wait ; if ( m_retryNum == 1 ) wait = 10; else wait = 200 * m_retryNum; // . don't wait more than 10 secs between tries // . i've seen gf0 and gf16 get mega saturated if ( wait > 10000 ) wait = 10000; // wait 500 ms if ( g_loop.registerSleepCallback ( wait , // ms this , doneSleepingWrapper3, m_niceness)) return false; // otherwise, registration failed log( "net: Failed to register sleep callback for retry. " "Abandoning read. This is bad."); // return, g_errno should be set g_errno = EBUFTOOSMALL; m_errno = EBUFTOOSMALL; return true; } // if we got an error and should not retry any more then give up if ( g_errno ) { log( "net: Had error reading %s: %s. 
Giving up after %" PRId32" " "retries.", base->m_dbname,mstrerror(g_errno) , m_retryNum ); return true; } // note it if the retry finally worked if ( m_retryNum > 0 ) log(LOG_INFO,"disk: Read succeeded after retrying %" PRId32" times.", (int32_t)m_retryNum); // count total bytes for logging int32_t count = 0; // . constrain all lists to make merging easier // . if we have only one list, then that's nice cuz the constrain // will allow us to send it right away w/ zero copying // . if we have only 1 list, it won't be merged into a final list, // that is, we'll just set m_list = &m_lists[i] for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { QUICKPOLL(m_niceness); // count total bytes for logging count += m_lists[i].getListSize(); // . hint offset is relative to the offset of first key we read // . if that key was only 6 bytes RdbScan shift the list buf // down 6 bytes to make the first key 12 bytes... a // requirement for all RdbLists // . don't inc it, though, if it was 0, pointing to the start // of the list because our shift won't affect that if ( m_scans[i].m_shifted == 6 && m_hintOffsets[i] > 0 ) m_hintOffsets[i] += 6; // posdb double compression if ( m_scans[i].m_shifted == 12 && m_hintOffsets[i] > 0 ) m_hintOffsets[i] += 12; // . don't constrain on minRecSizes here because it may // make our endKey smaller, which will cause problems // when Msg5 merges these lists. // . If all lists have different endKeys RdbList's merge // chooses the min and will merge in recs beyond that // causing a bad list BECAUSE we don't check to make // sure that recs we are adding are below the endKey // . if we only read from one file then constrain based // on minRecSizes so we can send the list back w/o merging // OR if just merging with RdbTree's list int32_t mrs ; // . constrain to m_minRecSizesOrig, not m_minRecSizes cuz // that could be adjusted by compensateForNegativeRecs() // . but, really, they should be the same if we only read from // the root file if ( m_numFileNums == 1 ) mrs = m_minRecSizesOrig; else mrs = -1; // . this returns false and sets g_errno on error // . like if data is corrupt BigFile *ff = base->getFile(m_fileNums[i]); // if we did a merge really quick and delete one of the // files we were reading, i've seen 'ff' be NULL char *filename = "lostfilename"; if ( ff ) filename = ff->getFilename(); // compute cache info RdbCache *rpc = getDiskPageCache ( m_rdbId ); if ( ! m_allowPageCache ) rpc = NULL; int64_t vfd ; if ( ff ) vfd = ff->getVfd(); key192_t ck ; if ( ff ) ck = makeCacheKey ( vfd , m_scans[i].m_offset , m_scans[i].m_bytesToRead ); if ( m_validateCache && ff && rpc && vfd != -1 ) { bool inCache; char *rec; int32_t recSize; inCache = rpc->getRecord ( (collnum_t)0 , // collnum (char *)&ck , &rec , &recSize , true , // copy? -1 , // maxAge, none true ); // inccounts? if ( inCache && // 1st byte is RdbScan::m_shifted ( m_lists[i].m_listSize != recSize-1 || memcmp ( m_lists[i].m_list , rec+1,recSize-1) || *rec != m_scans[i].m_shifted ) ) { log("msg3: cache did not validate"); char *xx=NULL;*xx=0; } mfree ( rec , recSize , "vca" ); } /////// // // STORE IN PAGE CACHE // /////// // store what we read in the cache. don't bother storing // if it was a retry, just in case something strange happened. // store pre-constrain call is more efficient. if ( m_retryNum<=0 && ff && rpc && vfd != -1 && ! 
m_scans[i].m_inPageCache ) rpc->addRecord ( (collnum_t)0 , // collnum (char *)&ck , // rec1 is this little thingy &m_scans[i].m_shifted, 1, // rec2 m_lists[i].getList() , m_lists[i].getListSize() , 0 ); // timestamp. 0 = now QUICKPOLL(m_niceness); // if from our 'page' cache, no need to constrain if ( ! m_lists[i].constrain ( m_startKey , m_constrainKey , // m_endKey mrs , // m_minRecSizes m_hintOffsets[i] , //m_hintKeys [i] , &m_hintKeys [i*m_ks] , filename,//ff->getFilename() , m_niceness ) ) { log("net: Had error while constraining list read from " "%s: %s/%s. vfd=%" PRId32" parts=%" PRId32". " "This is likely caused by corrupted " "data on disk.", mstrerror(g_errno), ff->getDir(), ff->getFilename(), ff->m_vfd , (int32_t)ff->m_numParts ); continue; } } // print the time if ( g_conf.m_logTimingDb ) { int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_startTime; log(LOG_TIMING, "net: Took %" PRId64" ms to read %" PRId32" lists of %" PRId32" bytes total" " from %s (niceness=%" PRId32").", took,m_numFileNums,count,base->m_dbname,m_niceness); } return true; }
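// . the retry path in Msg3::doneScanning() above uses a simple linear
//   backoff before re-issuing the reads: the first retry sleeps 10 ms, each
//   later retry sleeps 200 ms times the retry number, capped at 10 seconds
//   so already-saturated hosts are not hammered.
// . the helper below just restates that schedule in one place; it is a
//   sketch for illustration, not a function in the original code.
static int32_t getRetryBackoffMs ( int32_t retryNum ) {
	int32_t wait;
	if ( retryNum == 1 ) wait = 10;             // first retry is quick
	else                 wait = 200 * retryNum; // then back off linearly
	if ( wait > 10000 )  wait = 10000;          // never wait more than 10s
	return wait;
}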
// . return false if blocked, true otherwise // . set g_errno on error // . read list of keys in [startKey,endKey] range // . read at least "minRecSizes" bytes of keys in that range // . the "m_endKey" of resulting, merged list may have a smaller endKey // than the argument, "endKey" due to limitation by "minRecSizes" // . resulting list will contain ALL keys between ITS [m_startKey,m_endKey] // . final merged list "should" try to have a size of at least "minRecSizes" // but due to negative/postive rec elimination may be less // . the endKey of the lists we read may be <= "endKey" provided // . we try to shrink the endKey if minRecSizes is >= 0 in order to // avoid excessive reading // . by shrinking the endKey we cannot take into account the size of deleted // records, so therefore we may fall short of "minRecSizes" in actuality, // in fact, the returned list may even be empty with a shrunken endKey // . we merge all lists read from disk into the provided "list" // . caller should call Msg3.getList(int32_t i) and Msg3:getNumLists() to retrieve // . this makes the query engine faster since we don't need to merge the docIds // and can just send them across the network separately and they will be // hashed into IndexTable's table w/o having to do time-wasting merging. // . caller can specify array of filenums to read from so incremental syncing // in Sync class can just read from titledb*.dat files that were formed // since the last sync point. bool Msg3::readList ( char rdbId , collnum_t collnum , const char *startKeyArg , const char *endKeyArg , int32_t minRecSizes , // max size of scan int32_t startFileNum , // first file to scan int32_t numFiles , // rel. to startFileNum void *state , // for callback void (* callback ) ( void *state ) , int32_t niceness , int32_t retryNum , int32_t maxRetries , bool compensateForMerge , bool justGetEndKey , bool allowPageCache , bool hitDisk ) { // set this to true to validate m_validateCache = false;//true; // clear, this MUST be done so if we return true g_errno is correct g_errno = 0; // assume lists are not checked for corruption m_listsChecked = false; // warn if ( minRecSizes < -1 ) { log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing " "to -1.",minRecSizes); minRecSizes = -1; } // reset m_alloc and data in all lists in case we are a re-call reset(); // warning if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3."); // remember the callback m_rdbId = rdbId; m_collnum = collnum; m_callback = callback; m_state = state; m_niceness = niceness; m_numScansCompleted = 0; m_retryNum = retryNum; m_maxRetries = maxRetries; m_compensateForMerge = compensateForMerge; m_allowPageCache = allowPageCache; m_hitDisk = hitDisk; m_hadCorruption = false; // get keySize of rdb m_ks = getKeySizeFromRdbId ( m_rdbId ); // reset the group error m_errno = 0; // . reset all our lists // . these are reset in call the RdbScan::setRead() below //for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset(); // . ensure startKey last bit clear, endKey last bit set // . no! this warning is now only in Msg5 // . if RdbMerge is merging some files, not involving the root // file, then we can expect to get a lot of unmatched negative recs. // . as a consequence, our endKeys may often be negative. This means // it may not annihilate with the positive key, but we should only // miss like this at the boundaries of the lists we fetch. // . 
so in that case RdbList::merge will stop merging once the // minRecSizes limit is reached even if it means ending on a negative // rec key //if ( (startKey.n0 & 0x01) == 0x01 ) if ( !KEYNEG(startKeyArg) ) log(LOG_REMIND,"net: msg3: StartKey lastbit set."); if ( KEYNEG(endKeyArg) ) log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); // declare vars here becaues of 'goto skip' below int32_t mergeFileNum = -1 ; int32_t max ; // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base = getRdbBase( m_rdbId, m_collnum ); if ( ! base ) { return true; } // store the file numbers in the array, these are the files we read m_numFileNums = 0; // save startFileNum here, just for recall m_startFileNum = startFileNum; m_numFiles = numFiles; // . if we have a merge going on, we may have to change startFileNum // . if some files get unlinked because merge completes then our // reads will detect the error and loop back here // . we launch are reads right after this without giving up the cpu // and we use file descriptors, so any changes to Rdb::m_files[] // should not hurt us // . WARNING: just make sure you don't lose control of cpu until after // you call RdbScan::set() // . we use hasMergeFile() instead of isMerging() because he may not // be merging cuz he got suspended or he restarted and // hasn't called attemptMerge() yet, but he may still contain it if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG, "net: msg3: " "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.", (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(), (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1, (int32_t)numFiles,base->m_dbname); int32_t pre = -10; if ( compensateForMerge && base->hasMergeFile() && startFileNum >= base->m_mergeStartFileNum - 1 && (startFileNum > 0 || numFiles != -1) ) { // now also include the file being merged into, but only // if we are reading from a file being merged... if ( startFileNum < base->m_mergeStartFileNum + base->m_numFilesToMerge - 1 ) //m_fileNums [ m_numFileNums++ ] = // base->m_mergeStartFileNum - 1; pre = base->m_mergeStartFileNum - 1; // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG, "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")", startFileNum,startFileNum+1,mergeFileNum); // if merge file was inserted before us, inc our file number startFileNum++; } // adjust num files if we need to, as well if ( compensateForMerge && base->hasMergeFile() && startFileNum < base->m_mergeStartFileNum - 1 && numFiles != -1 && startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) { // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG,"net: msg3: numFiles up one."); // if merge file was inserted before us, inc our file number numFiles++; } // . how many rdb files does this base have? // . IMPORTANT: this can change since files are unstable because they // might have all got merged into one! // . so do this check to make sure we're safe... 
especially if // there was an error before and we called readList() on ourselves max = base->getNumFiles(); // -1 means we should scan ALL the files in the base if ( numFiles == -1 ) numFiles = max; // limit it by startFileNum, however if ( numFiles > max - startFileNum ) numFiles = max - startFileNum; // set g_errno and return true if it is < 0 if ( numFiles < 0 ) { log(LOG_LOGIC, "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")", numFiles , max , startFileNum ); g_errno = EBADENGINEER; // force core dump char *xx=NULL;*xx=0; return true; } // . allocate buffer space // . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets, // m_fileNums, m_lists int32_t chunk = sizeof(RdbScan) + // m_scans 4 + // m_startpg 4 + // m_endpg //sizeof(key_t) + // m_hintKeys m_ks + // m_hintKeys 4 + // m_hintOffsets 4 + // m_fileNums sizeof(RdbList) ; // m_lists int32_t nn = numFiles; if ( pre != -10 ) nn++; m_numChunks = nn; int32_t need = nn * (chunk); m_alloc = m_buf; if ( need > (int32_t)MSG3_BUF_SIZE ) { m_allocSize = need; m_alloc = (char *)mcalloc ( need , "Msg3" ); if ( ! m_alloc ) { log("disk: Could not allocate %" PRId32" bytes read " "structures to read %s.",need,base->m_dbname); return true; } } char *p = m_alloc; m_scans = (RdbScan *)p; p += nn * sizeof(RdbScan); m_startpg = (int32_t *)p; p += nn * 4; m_endpg = (int32_t *)p; p += nn * 4; //m_hintKeys = (key_t *)p; p += nn * sizeof(key_t); m_hintKeys = (char *)p; p += nn * m_ks; m_hintOffsets = (int32_t *)p; p += nn * 4; m_fileNums = (int32_t *)p; p += nn * 4; m_lists = (RdbList *)p; p += nn * sizeof(RdbList); // sanity check if ( p - m_alloc != need ) { log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp."); char *xx = NULL; *xx = 0; } // call constructors for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor(); // make fix from up top if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre; // store them all for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ ) m_fileNums [ m_numFileNums++ ] = i; // . remove file nums that are being unlinked after a merge now // . keep it here (below skip: label) so sync point reads can use it int32_t n = 0; for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { // skip those that are being unlinked after the merge if ( base->m_isUnlinking && m_fileNums[i] >= base->m_mergeStartFileNum && m_fileNums[i] < base->m_mergeStartFileNum + base->m_numFilesToMerge ) continue; // otherwise, keep it m_fileNums[n++] = m_fileNums[i]; } m_numFileNums = n; // . if root file is being merged, he's file #0, & root file is file #1 // . this is a hack so caller gets what he wants //if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 ) // numFiles = 2; // remember the file range we should scan m_numScansStarted = 0; m_numScansCompleted = 0; //m_startKey = startKey; //m_endKey = endKey; //m_constrainKey = endKey; // set in case justGetEndKey is true KEYSET(m_startKey,startKeyArg,m_ks); KEYSET(m_endKey,endKeyArg,m_ks); KEYSET(m_constrainKey,endKeyArg,m_ks);//set incase justGetEndKey istrue m_minRecSizes = minRecSizes; m_compensateForMerge = compensateForMerge; // bail if 0 files to scan -- no! 
need to set startKey/endKey if ( numFiles == 0 ) return true; // don't read anything if endKey < startKey //if ( m_startKey > m_endKey ) return true; if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true; // keep the original in tact in case g_errno == ETRYAGAIN //m_endKeyOrig = endKey; KEYSET(m_endKeyOrig,endKeyArg,m_ks); m_minRecSizesOrig = minRecSizes; // start reading at this key m_fileStartKey = startKeyArg; // start the timer, keep it fast for clusterdb though if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds(); // translate base to an id, for the sake of m_msg0 //char baseId = m_msg0->getRdbId ( base ); // map ptrs RdbMap **maps = base->getMaps(); // . we now boost m_minRecSizes to account for negative recs // . but not if only reading one list, cuz it won't get merged and // it will be too big to send back if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base ); // . often endKey is too big for an efficient read of minRecSizes bytes // because we end up reading too much from all the files // . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile // to ensure we read "minRecSizes" worth of records, not much more // . returns the new endKey for all ranges // . now this just overwrites m_endKey //m_endKey = setPageRanges ( base , setPageRanges ( base , m_fileNums , m_numFileNums , m_fileStartKey , // start reading @ key m_endKey , // stop reading @ key m_minRecSizes ); // . NEVER let m_endKey be a negative key, because it will // always be unmatched, since delbit is cleared // . adjusting it here ensures our generated hints are valid // . we will use this key to call constrain() with //m_constrainKey = m_endKey; //if ( ( m_constrainKey.n0 & 0x01) == 0x00 ) // m_constrainKey -= (uint32_t)1; KEYSET(m_constrainKey,m_endKey,m_ks); if ( KEYNEG(m_constrainKey) ) KEYSUB(m_constrainKey,m_ks); // Msg5 likes to get the endkey for getting the list from the tree if ( justGetEndKey ) return true; // sanity check if ( m_numFileNums > nn ) { log(LOG_LOGIC,"disk: Failed sanity check in Msg3."); char *xx = NULL; *xx = 0; } // debug msg //log("msg3 getting list (msg5=%" PRIu32")",m_state); // . MDW removed this -- go ahead an end on a delete key // . RdbMerge might not pick it up this round, but oh well // . so we can have both positive and negative co-existing in same file // make sure the last bit is set so we don't end on a delete key //m_endKey.n0 |= 0x01LL; // . now start reading/scanning the files // . our m_scans array starts at 0 for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { // get the page range //int32_t p1 = m_startpg [ i ]; //int32_t p2 = m_endpg [ i ]; //#ifdef GBSANITYCHECK int32_t fn = m_fileNums[i]; // this can happen somehow! if ( fn < 0 ) { log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn); continue; } // sanity check if ( i > 0 && m_fileNums[i-1] >= fn ) { log(LOG_LOGIC, "net: msg3: files must be read in order " "from oldest to newest so RdbList::indexMerge_r " "works properly. Otherwise, corruption will " "result. "); char *xx = NULL; *xx = 0; return true; } // . sanity check? // . 
no, we must get again since we turn on endKey's last bit int32_t p1 , p2; maps[fn]->getPageRange ( m_fileStartKey , m_endKey , &p1 , &p2 , NULL ); //if ( p1 != p1c || p2 != p2c ) { // fprintf(stderr,"Msg3::bad page range\n"); // sleep(50000); //} // sanity check, each endpg's key should be > endKey //if ( p2 < maps[fn]->getNumPages() && // maps[fn]->getKey ( p2 ) <= m_endKey ) { // fprintf(stderr,"Msg3::bad page range 2\n"); // sleep(50000); //} //#endif //int32_t p1 , p2; //maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2); // now get some read info int64_t offset = maps[fn]->getAbsoluteOffset ( p1 ); int32_t bytesToRead = maps[fn]->getRecSizes ( p1, p2, false); // max out the endkey for this list // debug msg //#ifdef _DEBUG_ //if ( minRecSizes == 2000000 ) //log("Msg3:: reading %" PRId32" bytes from file #%" PRId32,bytesToRead,i); //#endif // inc our m_numScans m_numScansStarted++; // . keep stats on our disk accesses // . count disk seeks (assuming no fragmentation) // . count disk bytes read if ( bytesToRead > 0 ) { base->m_rdb->didSeek ( ); base->m_rdb->didRead ( bytesToRead ); } // . the startKey may be different for each RdbScan class // . RdbLists must have all keys within their [startKey,endKey] // . therefore set startKey individually from first page in map // . this endKey must be >= m_endKey // . this startKey must be < m_startKey //key_t startKey = maps[fn]->getKey ( p1 ); //key_t endKey = maps[fn]->getKey ( p2 ); char startKey2 [ MAX_KEY_BYTES ]; char endKey2 [ MAX_KEY_BYTES ]; maps[fn]->getKey ( p1 , startKey2 ); maps[fn]->getKey ( p2 , endKey2 ); //char *startKey = maps[fn]->getKeyPtr ( p1 ); //char *endKey = maps[fn]->getKeyPtr ( p2 ); // store in here m_startpg [ i ] = p1; m_endpg [ i ] = p2; // . we read UP TO that endKey, so reduce by 1 // . but iff p2 is NOT the last page in the map/file // . maps[fn]->getKey(lastPage) will return the LAST KEY // and maps[fn]->getOffset(lastPage) the length of the file //if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1; if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks); // otherwise, if we're reading all pages, then force the // endKey to virtual inifinite //else endKey.setMax(); else KEYMAX(endKey2,m_ks); // . set up the hints // . these are only used if we are only reading from 1 file // . these are used to call constrain() so we can constrain // the end of the list w/o looping through all the recs // in the list int32_t h2 = p2 ; // decrease by one page if we're on the last page if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--; // . decrease hint page until key is <= endKey on that page // AND offset is NOT -1 because the old way would give // us hints passed the endkey // . also decrease so we can constrain on minRecSizes in // case we're the only list being read // . use >= m_minRecSizes instead of >, otherwise we may // never be able to set "size" in RdbList::constrain() // because "p" could equal "maxPtr" right away while ( h2 > p1 && //( maps[fn]->getKey (h2) > m_constrainKey || (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0|| maps[fn]->getOffset(h2) == -1 || maps[fn]->getAbsoluteOffset(h2) - offset >= m_minRecSizes ) ) h2--; // now set the hint m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) - maps[fn]->getAbsoluteOffset ( p1 ) ; //m_hintKeys [ i ] = maps[fn]->getKey ( h2 ); KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks); // reset g_errno before calling setRead() g_errno = 0; // . this fix is now in RdbList::checklist_r() // . 
we can now have dup keys, so, we may read in // a rec with key "lastMinKey" even though we don't read // in the first key on the end page, so don't subtract 1... //if ( endKey != m_endKeyOrig ) // endKey += (uint32_t) 1; // timing debug if ( g_conf.m_logTimingDb ) log(LOG_TIMING, "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" " "(niceness=%" PRId32")", bytesToRead,base->m_dbname,i,m_niceness); // log huge reads, those hurt us if ( bytesToRead > 150000000 ) { logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" " "from %s.", bytesToRead,offset,base->m_dbname); } // if any keys in the map are the same report corruption char tmpKey [16]; char lastTmpKey[16]; int32_t ccount = 0; if ( bytesToRead > 10000000 && bytesToRead / 2 > m_minRecSizes && base->m_fixedDataSize >= 0 ) { for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) { maps[fn]->getKey ( pn , tmpKey ); if ( KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) ccount++; gbmemcpy(lastTmpKey,tmpKey,m_ks); } } if ( ccount > 10 ) { logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #" "%" PRId32" when min " "required is %" PRId32". Map is corrupt and has %" PRId32" " "identical consecutive page keys because the " "map was \"repaired\" because out of order keys " "in the index.", (int32_t)bytesToRead, base->m_dbname,fn, (int32_t)m_minRecSizes, (int32_t)ccount); m_numScansCompleted++; m_errno = ECORRUPTDATA; m_hadCorruption = true; //m_maxRetries = 0; break; } //////// // // try to get from PAGE CACHE // //////// BigFile *ff = base->getFile(m_fileNums[i]); RdbCache *rpc = getDiskPageCache ( m_rdbId ); if ( ! m_allowPageCache ) rpc = NULL; // . vfd is unique 64 bit file id // . if file is opened vfd is -1, only set in call to open() int64_t vfd = ff->getVfd(); key192_t ck = makeCacheKey ( vfd , offset, bytesToRead); char *rec; int32_t recSize; bool inCache = false; if ( rpc && vfd != -1 && ! m_validateCache ) inCache = rpc->getRecord ( (collnum_t)0 , // collnum (char *)&ck , &rec , &recSize , true , // copy? -1 , // maxAge, none true ); // inccounts? m_scans[i].m_inPageCache = false; if ( inCache ) { m_scans[i].m_inPageCache = true; m_numScansCompleted++; // now we have to store this value, 6 or 12 so // we can modify the hint appropriately m_scans[i].m_shifted = *rec; m_lists[i].set ( rec +1, recSize-1 , rec , // alloc recSize , // allocSize startKey2 , endKey2 , base->m_fixedDataSize , true , // owndata base->useHalfKeys() , getKeySizeFromRdbId ( m_rdbId ) ); continue; } // . do the scan/read of file #i // . this returns false if blocked, true otherwise // . this will set g_errno on error bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]), base->m_fixedDataSize , offset , bytesToRead , startKey2 , endKey2 , m_ks , &m_lists[i] , this , doneScanningWrapper , base->useHalfKeys() , m_rdbId, m_niceness , m_allowPageCache , m_hitDisk ) ; // . damn, usually the above will indirectly launch a thread // to do the reading, but it sets g_errno to EINTR, // "interrupted system call"! // . i guess the thread does the read w/o blocking and then // queues the signal on g_loop's queue before it exits // . try ignoring, and keep going if ( g_errno == EINTR ) { log("net: Interrupted system call while reading file. 
" "Ignoring."); g_errno = 0; } // debug msg //fprintf(stderr,"Msg3:: reading %" PRId32" bytes from file #%" PRId32"," // "done=%" PRId32",offset=%" PRId64",g_errno=%s," // "startKey=n1=%" PRIu32",n0=%" PRIu64", " // "endKey=n1=%" PRIu32",n0=%" PRIu64"\n", // bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno), // m_startKey,m_endKey); //if ( bytesToRead == 0 ) // fprintf(stderr,"shit\n"); // if it did not block then it completed, so count it if ( done ) m_numScansCompleted++; // break on an error, and remember g_errno in case we block if ( g_errno && g_errno != ENOTHREADSLOTS ) { int32_t tt = LOG_WARN; if ( g_errno == EFILECLOSED ) tt = LOG_INFO; log(tt,"disk: Reading %s had error: %s.", base->m_dbname, mstrerror(g_errno)); m_errno = g_errno; break; } } // debug test //if ( rand() % 100 <= 10 ) m_errno = EIO; // if we blocked, return false if ( m_numScansCompleted < m_numScansStarted ) return false; // . if all scans completed without blocking then wrap it up & ret true // . doneScanning may now block if it finds data corruption and must // get the list remotely return doneScanning(); }
bool RdbMerge::getAnotherList ( ) { log(LOG_DEBUG,"db: Getting another list for merge."); // clear it up in case it was already set g_errno = 0; // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true; // if merging titledb files, we must adjust m_endKey so we do // not have to read a huge 200MB+ tfndb list //key_t newEndKey = m_endKey; char newEndKey[MAX_KEY_BYTES]; KEYSET(newEndKey,m_endKey,m_ks); //CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); //char *coll = cr->m_coll; /* if ( m_rdbId == RDB_TITLEDB ) { // && m_rdbId == RDB_TFNDB ) { //long long docId1 = g_titledb.getDocIdFromKey ( m_startKey ); long long docId1=g_titledb.getDocIdFromKey((key_t *)m_startKey); //long long docId2 = g_titledb.getDocIdFromKey ( m_endKey ); // tfndb is pretty much uniformly distributed RdbBase *ubase = getRdbBase(RDB_TFNDB,m_coll); if ( ! ubase ) return true; long long space = ubase->getDiskSpaceUsed(); //long long readSize = (space * (docId2-docId1)) / DOCID_MASK; long long bufSize = g_conf.m_mergeBufSize; // for now force to 100k bufSize = 100000; if ( bufSize > space ) bufSize = space; long long docId3 = (long long) (((double)bufSize / (double)space) * (double)DOCID_MASK + docId1); // constrain newEndKey based on docId3 if ( docId3 < 0 ) docId3 = DOCID_MASK; //if ( docId3 >= DOCID_MASK ) newEndKey.setMax(); if ( docId3 >= DOCID_MASK ) KEYMAX(newEndKey,m_ks); //else newEndKey = g_titledb.makeLastKey ( docId3 ); else { key_t nk = g_titledb.makeLastKey(docId3); KEYSET(newEndKey,(char *)&nk,m_ks); } //log(LOG_DEBUG,"build: remapping endkey from %lx.%llx to " // "%lx.%llx to avoid big tfndb read.", // m_endKey.n1,m_endKey.n0, newEndKey.n1,newEndKey.n0); log(LOG_DEBUG,"build: remapping endkey from %llx.%llx to " "%llx.%llx to avoid big tfndb read.", KEY1(m_endKey,m_ks),KEY0(m_endKey), KEY1(newEndKey,m_ks),KEY0(newEndKey)); } */ // . this returns false if blocked, true otherwise // . sets g_errno on error // . we return false if it blocked // . m_maxBufSize may be exceeded by a rec, it's just a target size // . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0 // . this was a call to Msg3, but i made it call Msg5 since // we now do the merging in Msg5, not in msg3 anymore // . this will now handle truncation, dup and neg rec removal // . it remembers last termId and count so it can truncate even when // IndexList is split between successive reads // . IMPORTANT: when merging titledb we could be merging about 255 // files, so if we are limited to only X fds it can have a cascade // affect where reading from one file closes the fd of another file // in the read (since we call open before spawning the read thread) // and can therefore take 255 retries for the Msg3 to complete // because each read gives a EFILCLOSED error. // so to fix it we allow one retry for each file in the read plus // the original retry of 25 long nn = base->getNumFiles(); if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles; // don't access any biased page caches bool usePageCache = true; if ( m_rdbId == RDB_CLUSTERDB ) usePageCache = false; // . i don't trust page cache too much (mdw)... well, give it a shot // . see if ths helps fix WD corruption... i doubt it usePageCache = false; // for now force to 100k long bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes // get it return m_msg5.getList ( m_rdbId , m_collnum , &m_list , m_startKey , newEndKey , // usually is maxed! bufSize , false , // includeTree? 
false , // add to cache? 0 , // max cache age for lookup m_startFileNum , // startFileNum m_numFiles , this , // state gotListWrapper , // callback m_niceness , // niceness true , // do error correction? NULL , // cache key ptr 0 , // retry # nn + 75 , // max retries (mk it high) false , // compensate for merge? -1LL , // sync point &m_msg5b , true , // isRealMerge? absolutely! usePageCache ); }
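// . the maxRetries passed to Msg5::getList() above scales with the number of
//   files in the read: per the comment above, with ~255 titledb files and a
//   limited fd budget, opening one file can close another file's fd mid-read
//   (EFILECLOSED), so each file in the read may cost one extra retry.
// . a sketch of that sizing rule as used above (illustration only):
static long getMergeReadMaxRetries ( long numFilesInRead ) {
	// one retry per file in the read plus a generous fixed allowance
	return numFilesInRead + 75;
}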
// . buffer is used for reading and writing // . return false if blocked, true otherwise // . sets g_errno on error // . if niceness is 0 merge will block, otherwise will not block // . we now use niceness of 1 which should spawn threads that don't allow // niceness 2 threads to launch while they're running // . spider process now uses mostly niceness 2 // . we need the merge to take priority over spider processes on disk otherwise // there's too much contention from spider lookups on disk for the merge // to finish in a decent amount of time and we end up getting too many files! bool RdbMerge::merge ( char rdbId , //char *coll , //RdbBase *base , collnum_t collnum, BigFile *target , RdbMap *targetMap , long id2 , // target's secondary id long startFileNum , long numFiles , long niceness , class DiskPageCache *pc , long long maxTargetFileSize , char keySize ) { // reset ourselves reset(); // set it m_rdbId = rdbId; Rdb *rdb = getRdbFromId ( rdbId ); // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base; if (!(base=getRdbBase(m_rdbId,collnum))) return true; // don't breech the max //if ( numFiles > m_maxFilesToMerge ) numFiles = m_maxFilesToMerge; // reset this map! it's m_crcs needs to be reset //targetMap->reset(); // remember some parms //if ( ! coll && rdb->m_isCollectionLess ) // strcpy ( m_coll , rdb->m_dbname ); //else // strcpy ( m_coll , coll ); m_collnum = collnum; if ( rdb->m_isCollectionLess ) m_collnum = 0; m_target = target; m_targetMap = targetMap; m_id2 = id2; m_startFileNum = startFileNum; m_numFiles = numFiles; m_dedup = base->m_dedup; m_fixedDataSize = base->m_fixedDataSize; m_niceness = niceness; m_pc = pc; m_maxTargetFileSize = maxTargetFileSize; m_doneMerging = false; m_ks = keySize; // . set the key range we want to retrieve from the files // . just get from the files, not tree (not cache?) //m_startKey.setMin(); //m_endKey.setMax(); KEYMIN(m_startKey,m_ks); KEYMAX(m_endKey,m_ks); // if we're resuming a killed merge, set m_startKey to last // key the map knows about. // the dump will start dumping at the end of the targetMap's data file. if ( m_targetMap->getNumRecs() > 0 ) { log(LOG_INIT,"db: Resuming a killed merge."); //m_startKey = m_targetMap->getLastKey(); m_targetMap->getLastKey(m_startKey); //m_startKey += (unsigned long) 1; KEYADD(m_startKey,1,m_ks); // if power goes out and we are not doing synchronous writes // then we could have completely lost some data and unlinked // a part file from the file being merged, so that the data is // gone. to be able to resume merging, we must increment the // startKey until it references a valid offset in all the // files being merged. invalid offsets will reference parts // that have been chopped. /* RdbMap **maps = rdb->getMaps(); BigFile **files = rdb->getFiles(); for ( long i=m_startFileNum;i<m_startFileNum+m_numFiles;i++){ long long minOff = 0LL; long k = 0; while ( k < files[i]->m_maxParts && ! files[i]->m_files[k] ) { k++; minOff += MAX_PART_SIZE; } long pn0 = maps[i]->getPage ( m_startKey ); long pn = pn0; while ( maps[i]->getAbsoluteOffset(pn) < minOff ) pn++; if ( pn != pn0 ) { log("db: Lost data during merge. Starting " "merge at page number %li from %li for " "file.",pn,pn0); m_startKey = maps[i]->getKey ( pn ); } } */ } // free our list's memory, just in case //m_list.freeList(); // . we may have multiple hosts running on the same cpu/hardDrive // . therefore, to maximize disk space, we should only have 1 merge // at a time going on between these hosts // . 
now tfndb has own merge class since titledb merge writes url recs /* if ( s_isMergeLocked ) { //log("RdbMerge::merge: someone else merging sleeping."); log("RdbMerge::merge: someone else merging. bad engineer."); return false; // if it fails then sleep until it works //returng_loop.registerSleepCallback(5000,this,getLockWrapper); } */ return gotLock(); }
// . return false if blocked, true otherwise
// . sets g_errno on error
bool RdbMerge::getNextList ( ) {
	// return true if g_errno is set
	if ( g_errno || m_doneMerging ) return true;
	// it's suspended so we count this as blocking
	if ( m_isSuspended ) {
		m_isReadyToSave = true;
		return false;
	}
	// if the power is off, suspend the merging
	if ( ! g_process.m_powerIsOn ) {
		m_isReadyToSave = true;
		doSleep();
		return false;
	}
	// no chop threads
	m_numThreads = 0;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if ( ! (base = getRdbBase(m_rdbId,m_collnum)) ) return true;
	// . if a contributor has just surpassed a "part" in his BigFile
	//   then we can delete that part from the BigFile and the map
	for ( long i = m_startFileNum ; i < m_startFileNum + m_numFiles ; i++ ) {
		RdbMap   *map    = base->m_maps[i];
		long      page   = map->getPage ( m_startKey );
		long long offset = map->getAbsoluteOffset ( page );
		BigFile  *file   = base->m_files[i];
		long      part   = file->getPartNum ( offset );
		if ( part == 0 ) continue;
		// i've seen this bug happen if we chop a part off on our
		// last dump and the merge never completes for some reason...
		// so if we're in the last part then don't chop the part
		// before us
		if ( part >= file->m_maxParts - 1 ) continue;
		// if we already unlinked part # (part-1) then continue
		if ( ! file->doesPartExist ( part - 1 ) ) continue;
		// . otherwise, excise from the map
		// . we must be able to chop the mapped segments corresponding
		//   EXACTLY to the part file
		// . therefore, PAGES_PER_SEGMENT define'd in RdbMap.h must
		//   evenly divide MAX_PART_SIZE in BigFile.h
		// . i do this check in RdbMap.cpp
		if ( ! map->chopHead ( MAX_PART_SIZE ) ) {
			// we had an error!
			log("db: Failed to remove data from map for "
			    "%s.part%li.", file->getFilename(),part);
			return true;
		}
		// . also, unlink any part files BELOW part # "part"
		// . this returns false if it blocked, true otherwise
		// . this sets g_errno on error
		// . now we just unlink part file #(part-1) explicitly
		if ( ! file->chopHead ( part - 1 , chopWrapper , this ) )
			m_numThreads++;
		if ( ! g_errno ) continue;
		log("db: Failed to unlink file %s.part%li.",
		    file->getFilename(),part);
		return true;
	}
	// wait for file to be unlinked before getting list
	if ( m_numThreads > 0 ) return false;
	// otherwise, get it now
	return getAnotherList ( );
}
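// . getNextList() above decides, per file being merged, whether the part
//   file just behind the merge cursor can be unlinked.
// . the predicate below only restates that decision; "prevPartExists" stands
//   in for file->doesPartExist(part-1) and is an assumption of this sketch.
static bool canChopPreviousPart ( long part ,      // part holding m_startKey
				  long maxParts ,  // file->m_maxParts
				  bool prevPartExists ) {
	// nothing before part 0 to chop
	if ( part == 0 ) return false;
	// don't chop the part before us if we're already in the last part;
	// a chopped tail plus a merge that never completes has bitten before
	if ( part >= maxParts - 1 ) return false;
	// part (part-1) was already unlinked earlier
	if ( ! prevPartExists ) return false;
	return true;
}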
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool RdbMerge::gotLock ( ) {
	// get total recSizes of files we're merging
	//long totalSize = 0;
	//for ( long i=m_startFileNum ; i < m_startFileNum + m_numFiles ; i++ )
	//	totalSize += m_base->m_files[i]->getSize();
	// . grow the map now so it doesn't have to keep growing dynamically
	//   which wastes memory
	// . setMapSize() returns false and sets g_errno on error
	// . we return true if it had an error
	//if ( ! m_targetMap->setMapSizeFromFileSize ( totalSize ) ) {
	//	log("RdbMerge::getLockFile: targetMap setMapSize failed");
	//	return true;
	//}
	// . get last mapped offset
	// . this may actually be smaller than the file's actual size
	//   but the excess is not in the map, so we need to do it again
	long long startOffset = m_targetMap->getFileSize();
	// if startOffset is > 0 use the last key as RdbDump::m_prevLastKey
	// so it can compress the next key it dumps provided m_useHalfKeys
	// is true (key compression) and the next key has the same top 6 bytes
	// as m_prevLastKey
	//key_t prevLastKey;
	//if ( startOffset > 0 ) prevLastKey = m_targetMap->getLastKey();
	//else                   prevLastKey.setMin();
	char prevLastKey[MAX_KEY_BYTES];
	if ( startOffset > 0 ) m_targetMap->getLastKey(prevLastKey);
	else                   KEYMIN(prevLastKey,m_ks);
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if ( ! (base = getRdbBase(m_rdbId,m_collnum)) ) return true;
	// . set up a file to dump the records into
	// . returns false and sets g_errno on error
	// . this will open m_target as O_RDWR | O_NONBLOCK | O_ASYNC ...
	m_dump.set ( m_collnum ,
		     m_target ,
		     m_id2 ,
		     //m_startFileNum - 1 , // merge fileNum in Rdb::m_files[]
		     (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2) ,
		     NULL , // buckets to dump is NULL, we call dumpList
		     NULL , // tree to dump is NULL, we call dumpList
		     m_targetMap ,
		     NULL , // for caching dumped tree
		     0 ,    // m_maxBufSize. not needed if no tree!
		     true , // orderedDump?
		     m_dedup ,
		     m_niceness , // niceness of dump
		     this ,       // state
		     dumpListWrapper ,
		     base->useHalfKeys() ,
		     startOffset ,
		     prevLastKey ,
		     m_ks ,
		     m_pc ,
		     m_maxTargetFileSize ,
		     NULL );
	// set m_base::m_needsToSave? no.
	// what kind of error?
	if ( g_errno ) {
		log("db: gotLock: %s.", mstrerror(g_errno) );
		return true;
	}
	// . create a new msg3
	// . don't keep static because it contains a msg3, treeList & diskList
	// . these can take up many megs of mem
	// . yes, but we need to avoid fragmentation, so hold on to our mem!
	//m_msg3 = new (Msg3);
	//if ( ! m_msg3 ) return false;
	// we're now merging since the dump was set up successfully
	m_isMerging = true;
	// make it suspended for now
	m_isSuspended = true;
	// grab the lock
	//s_isMergeLocked = true;
	// . this unsuspends it
	// . this returns false on error and sets g_errno
	// . it returns true if blocked or merge completed successfully
	return resumeMerge ( );
}
void handleRequest22 ( UdpSlot *slot , long netnice ) { // shortcut UdpServer *us = &g_udpServer; // get the request Msg22Request *r = (Msg22Request *)slot->m_readBuf; // get this //char *coll = g_collectiondb.getCollName ( r->m_collnum ); // sanity check long requestSize = slot->m_readBufSize; if ( requestSize < r->getMinSize() ) { log("db: Got bad request size of %li bytes for title record. " "Need at least 28.", requestSize ); us->sendErrorReply ( slot , EBADREQUESTSIZE ); return; } // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *tbase; if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) { log("db: Could not get title rec in collection # %li " "because rdbbase is null.", (long)r->m_collnum); g_errno = EBADENGINEER; us->sendErrorReply ( slot , g_errno ); return; } // overwrite what is in there so niceness conversion algo works r->m_niceness = netnice; // if just checking tfndb, do not do the cache lookup in clusterdb if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0; // keep track of stats //if (r->m_justCheckTfndb) // g_tfndb.getRdb()->readRequestGet(requestSize); // else g_titledb.getRdb()->readRequestGet (requestSize); // breathe QUICKPOLL ( r->m_niceness); // sanity check if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; } // make the state now State22 *st ; try { st = new (State22); } catch ( ... ) { g_errno = ENOMEM; log("query: Msg22: new(%i): %s", sizeof(State22), mstrerror(g_errno)); us->sendErrorReply ( slot , g_errno ); return; } mnew ( st , sizeof(State22) , "Msg22" ); // store ptr to the msg22request st->m_r = r; // save for sending back reply st->m_slot = slot; // then tell slot not to free it since m_r references it! // so we'll have to free it when we destroy State22 st->m_slotAllocSize = slot->m_readBufMaxSize; st->m_slotReadBuf = slot->m_readBuf; slot->m_readBuf = NULL; // . make the keys for getting recs from tfndb // . url recs map docid to the title file # that contains the titleRec //key_t uk1 ; //key_t uk2 ; // . if docId was explicitly specified... // . we may get multiple tfndb recs if ( ! r->m_url[0] ) { // there are no del bits in tfndb //uk1 = g_tfndb.makeMinKey ( r->m_docId ); //uk2 = g_tfndb.makeMaxKey ( r->m_docId ); st->m_docId1 = r->m_docId; st->m_docId2 = r->m_docId; } // but if we are requesting an available docid, it might be taken // so try the range if ( r->m_getAvailDocIdOnly ) { long long pd = r->m_docId; long long d1 = g_titledb.getFirstProbableDocId ( pd ); long long d2 = g_titledb.getLastProbableDocId ( pd ); // sanity - bad url with bad subdomain? if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; } // make sure we get a decent sample in titledb then in // case the docid we wanted is not available st->m_docId1 = d1; st->m_docId2 = d2; } // . otherwise, url was given, like from Msg15 // . we may get multiple tfndb recs if ( r->m_url[0] ) { long dlen = 0; // this causes ip based urls to be inconsistent with the call // to getProbableDocId(url) below char *dom = getDomFast ( r->m_url , &dlen ); // bogus url? if ( ! dom ) { log("msg22: got bad url in request: %s",r->m_url); g_errno = EBADURL; us->sendErrorReply ( slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen); long long d1 = g_titledb.getFirstProbableDocId ( pd ); long long d2 = g_titledb.getLastProbableDocId ( pd ); // sanity - bad url with bad subdomain? 
	if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	// there are no del bits in tfndb
	//uk1 = g_tfndb.makeMinKey ( d1 );
	//uk2 = g_tfndb.makeMaxKey ( d2 );
	// store these
	st->m_pd     = pd;
	st->m_docId1 = d1;
	st->m_docId2 = d2;
	st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
	}

	QUICKPOLL ( r->m_niceness );

	/*
	// shortcut
	Rdb *tdb = g_titledb.getRdb();

	// init this
	st->m_tfn2 = -1;
	// skip tfndb lookup if we can. saves some time.
	if ( g_conf.m_readOnlyMode &&
	     // must not be a *url* lookup, it must be a docid lookup
	     ! r->m_url[0] &&
	     // tree must be empty too i guess
	     tdb->getTree()->getNumUsedNodes() == 0 ) {
		// the RdbBase contains the BigFiles for tfndb
		RdbBase *base = tdb->m_bases[r->m_collnum];
		// can only have one titledb file
		if ( base->getNumFiles() == 1 ) {
			// now we can get RdbBase
			st->m_tfn2 = base->m_fileIds2[0];
			// sanity check
			if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; }
		}
	}

	// check the tree for this docid
	RdbTree *tt = tdb->getTree();
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
	long n = tt->getNextNode ( r->m_collnum , startKey );
	// there should only be one match, one titlerec per docid!
	for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) {
		// break if collnum does not match. we exceeded our tree range.
		if ( tt->getCollnum ( n ) != r->m_collnum ) break;
		// get the key of this node
		key_t k = *(key_t *)tt->getKey(n);
		// if passed limit, break out, no match
		if ( k > endKey ) break;
		// if we had a url make sure uh48 matches
		if ( r->m_url[0] ) {
			// get it
			long long uh48 = g_titledb.getUrlHash48(&k);
			// sanity check
			if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
			// we must match this exactly
			if ( uh48 != st->m_uh48 ) continue;
		}
		// . if we matched a negative key, then skip
		// . just break out here and enter the normal logic
		// . it should load tfndb and find that it is not in tfndb
		//   because when you add a negative key to titledb in
		//   Rdb::addList, it adds a negative rec to tfndb immediately
		// . NO! because we add the negative key to the tree when we
		//   delete the old titledb rec, then we add the new one!
		//   when a negative key is added Rdb::addRecord() removes
		//   the positive key (and vice versa) from the tree.
		if ( KEYNEG((char *)&k) ) continue;
		// if just checking for its existence, we are done
		if ( r->m_justCheckTfndb ) {
			us->sendReply_ass ( NULL,0,NULL,0,slot);
			// don't forget to free the state
			mdelete ( st , sizeof(State22) , "Msg22" );
			delete ( st );
			return;
		}
		// ok, we got a match, return it
		char *data     = tt->getData     ( n );
		long  dataSize = tt->getDataSize ( n );
		// weird!
		if ( dataSize == 0 ) { char *xx=NULL;*xx=0; }
		// send the whole rec back
		long need = 12 + 4 + dataSize;
		// will this copy it? not!
		char *buf = (char *)mmalloc ( need , "msg22t" );
		if ( ! buf ) {
			us->sendErrorReply ( slot , g_errno );
			mdelete ( st , sizeof(State22) , "Msg22" );
			delete ( st );
			return;
		}
		// log it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: found %s in titledb tree",
			     r->m_url);
		// store in the buf for sending
		char *p = buf;
		// store key
		*(key_t *)p = k; p += sizeof(key_t);
		// then dataSize
		*(long *)p = dataSize; p += 4;
		// then the data
		memcpy ( p , data , dataSize ); p += dataSize;
		// send off the record
		us->sendReply_ass (buf, need,buf, need,slot);
		// don't forget to free the state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// if we did not need to consult tfndb cuz we only have one file
	if ( st->m_tfn2 >= 0 ) {
		gotUrlListWrapper ( st , NULL , NULL );
		return;
	}

	// . get the list of url recs for this docid range
	// . this should not block, tfndb SHOULD all be in memory all the time
	// . use 500 million for min recsizes to get all in range
	// . no, using 500MB causes problems for RdbTree::getList, so use
	//   100k. how many recs can there be?
	if ( ! st->m_msg5.getList ( RDB_TFNDB  ,
				    coll       ,
				    &st->m_ulist ,
				    uk1        , // startKey
				    uk2        , // endKey
				    // use 0x7fffffff precisely because it
				    // will determine exactly how long the
				    // tree list needs to allocate in Msg5.cpp
				    0x7fffffff , // minRecSizes
				    true       , // includeTree?
				    false      , // addToCache?
				    0          , // max cache age
				    0          , // startFileNum
				    -1         , // numFiles (-1 =all)
				    st         ,
				    gotUrlListWrapper ,
				    r->m_niceness ,
				    true       ))// error correction?
		return ;
	// we did not block
	gotUrlListWrapper ( st , NULL , NULL );
}

static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ;

void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
	// shortcuts
	State22   *st = (State22 *)state;
	UdpServer *us = &g_udpServer;

	// bail on error
	if ( g_errno ) {
		log("db: Had error getting info from tfndb: %s.",
		    mstrerror(g_errno));
		log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli "
		    "d1=%lli d2=%lli.",
		    ((key_t *)st->m_msg5.m_startKey)->n1 ,
		    ((key_t *)st->m_msg5.m_startKey)->n0 ,
		    ((key_t *)st->m_msg5.m_endKey)->n1 ,
		    ((key_t *)st->m_msg5.m_endKey)->n0 ,
		    st->m_docId1 ,
		    st->m_docId2 );
		us->sendErrorReply ( st->m_slot , g_errno );
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// shortcuts
	RdbList      *ulist = &st->m_ulist;
	Msg22Request *r     = st->m_r;
	char         *coll  = g_collectiondb.getCollName ( r->m_collnum );

	// point to top just in case
	ulist->resetListPtr();

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll);

	// set probable docid
	long long pd = 0LL;
	if ( r->m_url[0] ) {
		pd = g_titledb.getProbableDocId(r->m_url);
		// sanity
		if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
	}

	// . these are both meant to be available docids
	// . if ad2 gets exhausted we use ad1
	long long ad1 = st->m_docId1;
	long long ad2 = pd;

	long tfn = -1;

	// sanity check. make sure did not load from tfndb if did not need to
	if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) {char *xx=NULL;*xx=0;}

	// if only one titledb file and none in memory use it
	if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2;

	// we may have multiple tfndb recs but we should NEVER have to read
	// multiple titledb files...
	for ( ; ! ulist->isExhausted() ; ulist->skipCurrentRecord() ) {
		// breathe
		QUICKPOLL ( r->m_niceness );
		// get first rec
		key_t k = ulist->getCurrentKey();
		// . skip negative keys
		// . seems to happen when we have tfndb in the tree...
		if ( KEYNEG((char *)&k) ) continue;
		// if we have a url and no docid, we gotta check uh48!
		if ( r->m_url[0] && g_tfndb.getUrlHash48(&k)!=st->m_uh48){
			// get docid of that guy
			long long dd = g_tfndb.getDocId(&k);
			// if matches avail docid, inc it
			if ( dd == ad1 ) ad1++;
			if ( dd == ad2 ) ad2++;
			// try next tfndb key
			continue;
		}
		// . get file num this rec is stored in
		// . this is updated right after the file num is merged by
		//   scanning all records in tfndb. this is very quick if all
		//   of tfndb is in memory, otherwise, it might take a few
		//   seconds. update call done in RdbMerge::incorporateMerge().
		tfn = g_tfndb.getTfn ( &k );
		// i guess we got a good match!
		break;
	}

	// sanity check. 255 used to mean in spiderdb or in tree
	if ( tfn >= 255 ) { char *xx=NULL;*xx=0; }

	// maybe no available docid if we breached our range
	if ( ad1 >= pd           ) ad1 = 0LL;
	if ( ad2 >  st->m_docId2 ) ad2 = 0LL;
	// get best
	long long ad = ad2;
	// but wrap around if we need to
	if ( ad == 0LL ) ad = ad1;

	// breathe
	QUICKPOLL ( r->m_niceness);

	// . log if different
	// . if our url rec was in there, this could still be different
	//   if there was another url rec in there with the same docid and
	//   a different extension, but with a tfn of 255, meaning that it
	//   is just in spiderdb and not in titledb yet. so it hasn't been
	//   assigned a permanent docid...
	// . another way "ad" may be different now is from the old bug which
	//   did not chain the docid properly because it limited the docid
	//   chaining to one titleRec file. so conceivably we can have
	//   different docs sharing the same docids, but with different
	//   url hash extensions. for instance, on host #9 we have:
	//   00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0
	//   00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0
	// . Msg16 will only use the avail docid if the titleRec is not found
	if ( r->m_url[0] && pd != ad ) {
		//log(LOG_INFO,"build: Docid %lli collided. %s Changing "
		//
		// http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011
		//
		// COLLIDES WITH
		//
		// http://www.bbonline.com/i/chicago.html
		//
		// collision alert!
		log("spider: Docid %lli collided. %s Changing "
		    "to %lli.", r->m_docId , r->m_url , ad );
		// debug this for now
		//char *xx=NULL;*xx=0;
	}

	// remember it
	st->m_availDocId = ad;

	// if tfn is -1 then it was not in titledb
	if ( tfn == -1 ) {
		// store docid in reply
		char *p = st->m_slot->m_tmpBuf;
		// send back the available docid
		*(long long *)p = ad;
		// send it
		us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
		// don't forget to free state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// sanity
	if ( tfn < 0 ) { char *xx=NULL;*xx=0; }

	// breathe
	QUICKPOLL ( r->m_niceness );

	// ok, if just "checking tfndb" no need to go further
	if ( r->m_justCheckTfndb ) {
		// send back a good reply (empty means found!)
		us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
		// don't forget to free the state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// . compute the file scan range
	// . tfn is now equivalent to Rdb's id2, a secondary file id, it
	//   follows the hyphen in "titledb0001-023.dat"
	// . default to just scan the root file AND the tree, cuz we're
	//   assuming restrictToRoot was set to true so we did not get a tfndb
	//   list
	// . even if a file number is given, always check the tree in case
	//   it got re-spidered
	// . shit, but we can still miss it if it gets dumped right after
	//   our thread is spawned, in which case we'd fall back to the old
	//   version. no. because if its in the tree now we get it before
	//   spawning a thread. there is no blocking. TRICKY. so if it is in
	//   the tree at this point we'll get it, but may end up scanning the
	//   file with the older version of the doc... not too bad.
	long startFileNum = tbase->getFileNumFromId2 ( tfn );

	// if tfn refers to a missing titledb file...
	if ( startFileNum < 0 ) {
		if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url);
		else               log("db: titledb missing docid %lli",
				       r->m_docId);
		us->sendErrorReply ( st->m_slot,ENOTFOUND );
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return ;
	}

	// save this
	st->m_tfn = tfn;
	*/

	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB  ,
				    r->m_collnum ,
				    &st->m_tlist ,
				    startKey     , // startKey
				    endKey       , // endKey
				    500000000    , // minRecSizes
				    true         , // includeTree
				    false,//r->m_addToCache , // addToCache?
				    0,//r->m_maxCacheAge , // max cache age
				    0,//startFileNum ,
				    -1           , // numFiles
				    st           , // state ,
				    gotTitleList ,
				    r->m_niceness ,
				    true         , // do error correct?
				    &cacheKey    ,
				    0            , // retry num
				    -1           , // maxRetries
				    true         , // compensate for merge
				    -1LL         , // sync point
				    &st->m_msg5b ) )
		return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
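// . illustrative sketch -- NOT part of the original source
// . the getList callers above all follow the same convention: the call
//   returns false if it blocked (the callback will be invoked later with
//   the saved state), and true if the list was satisfied inline, in which
//   case the caller must invoke the callback itself so both paths funnel
//   through one completion routine
// . everything below (FakeState, fakeGetList, onListReady) is a
//   hypothetical stand-in used only to show the pattern; it is not a gb API
#include <cstdio>

struct FakeState { int id; };

static void onListReady ( void *state ) {
	FakeState *fs = (FakeState *)state;
	printf ( "list ready for state #%d\n" , fs->id );
}

// pretend read that never blocks; a real blocking read would return false
// here after queueing the i/o and remembering the callback + state
static bool fakeGetList ( void *state , void (*callback)(void *) ) {
	(void)state; (void)callback;
	return true; // true = did not block, list is ready now
}

static void exampleCaller ( FakeState *fs ) {
	// if it blocked, the i/o layer calls onListReady() later
	if ( ! fakeGetList ( fs , onListReady ) ) return;
	// we did not block, so call the completion routine ourselves
	onListReady ( fs );
}

int main () {
	FakeState fs = { 7 };
	exampleCaller ( &fs );
	return 0;
}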
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;

	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1    ,
				    0     ,
				    0     ,
				    0     , // max cache age
				    false , // add to cache?
				    RDB_DATEDB , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2 ,
				    (char *)&startKey ,
				    (char *)&endKey   ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree  , // include tree?
				    //st->m_useCache , // include cache?
				    //false          , // add to cache?
				    //0              , // startFileNum
				    //numFiles       , // numFiles
				    st , // state
				    gotIndexListWrapper2 ,
				    0 ) ) // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/

	// get the socket
	TcpSocket *s = st->m_socket;

	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;

	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/

	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf (
		      "<center>\n"
		      "<table cellpadding=2><tr><td colspan=4>"
		      "useDatedb:<input type=checkbox value=1 name=ub%s> "
		      "useTree:<input type=checkbox value=1 name=ut%s> "
		      "useDisk:<input type=checkbox value=1 name=ud%s> "
		      "useCache:<input type=checkbox value=1 name=uc%s> "
		      "ADD:<input type=checkbox value=1 name=add%s> "
		      "DELETE:<input type=checkbox value=1 name=del%s>"
		      "</td></tr><tr><td>"
		      "query:"
		      "</td><td>"
		      "<input type=text name=q value=\"%s\" size=20>"
		      "</td><td>"
		      "collection:"
		      "</td><td>"
		      "<input type=text name=c value=\"%s\" size=10>"
		      "</td></tr><tr><td>"
		      "termId:"
		      "</td><td>"
		      "<input type=text name=t value=%lli size=20>"
		      "</td><td>"
		      "numRecs:"
		      "</td><td>"
		      "<input type=text name=numRecs value=%li size=10> "
		      "</td></tr><tr><td>"
		      "docId:"
		      "</td><td>"
		      "<input type=text name=d value=%lli size=20> "
		      "</td><td>"
		      "score:"
		      "</td><td>"
		      "<input type=text name=score value=%li size=10> "
		      "</td><td>"
		      "<input type=submit value=ok border=0>"
		      "</td></tr>"
		      "<tr><td colspan=2>"
		      "term appears in about %lli docs +/- %li"
		      "</td></tr>"
		      //"<tr><td colspan=2>"
		      //"this indexlist held by host #%li and twins"
		      //"</td></tr>"
		      "</table>"
		      "</form><br><br>" ,
		      ubs, uts, uds, ucs, add, del,
		      st->m_query , st->m_coll , st->m_termId ,
		      st->m_numRecs , st->m_docId , (long)st->m_score ,
		      st->m_termFreq ,
		      2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * base->getNumFiles() );
		      //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s ,
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf (
		      "<table cellpadding=1 border=1>"
		      "<tr><td>#</td><td>score</td>"
		      "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 )
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf (
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll, docId ,
			  docId ,
			  (long)dh );
	}

	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() )
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>"
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p ,
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll, docId ,
			  docId );
		p += gbstrlen ( p );
	}
	*/

	if ( ! st->m_list.isEmpty() )
		pbuf->safePrintf ( "</table>" );

	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	pbuf->safePrintf ( "</center>\n");

	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s ,
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
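// . illustrative sketch -- NOT part of the original source
// . reconstructs the "+/- %li" slop printed by the admin page above:
//   2 * GB_INDEXDB_PAGE_SIZE / 6 * base->getNumFiles()
// . the interpretation below is an assumption, not stated in this file:
//   each on-disk indexdb key is presumably 6 bytes (a compressed half key),
//   so one page holds pageSize/6 keys and the term-frequency estimate can
//   be off by roughly two pages per indexdb file
#include <cstdint>
#include <cstdio>

static int64_t termFreqSlop ( int64_t pageSize , int32_t numFiles ) {
	// two pages of 6-byte keys per file, mirroring the expression above
	return 2 * pageSize / 6 * numFiles;
}

int main () {
	// hypothetical values: 16k pages, 4 indexdb files
	printf ( "term freq is +/- %lld keys\n" ,
		 (long long)termFreqSlop ( 16*1024 , 4 ) );
	return 0;
}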
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;

	// sanity check
	int32_t requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %" PRId32" bytes for title "
		    "record. Need at least 28.", requestSize );
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
	if ( ! tbase ) {
		log("db: Could not get title rec in collection # %" PRId32
		    " because rdbbase is null.", (int32_t)r->m_collnum);
		g_errno = EBADENGINEER;
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) {
		r->m_maxCacheAge = 0;
	}

	g_titledb.getRdb()->readRequestGet (requestSize);

	// breathe
	QUICKPOLL ( r->m_niceness);

	// sanity check
	if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }

	// make the state now
	State22 *st ;
	try { st = new (State22); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("query: Msg22: new(%" PRId32"): %s",
		    (int32_t)sizeof(State22), mstrerror(g_errno));
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( st , sizeof(State22) , "Msg22" );

	// store ptr to the msg22request
	st->m_r = r;
	// save for sending back reply
	st->m_slot = slot;

	// then tell slot not to free it since m_r references it!
	// so we'll have to free it when we destroy State22
	st->m_slotAllocSize = slot->m_readBufMaxSize;
	st->m_slotReadBuf   = slot->m_readBuf;
	slot->m_readBuf = NULL;

	// . if docId was explicitly specified...
	// . we may get multiple tfndb recs
	if ( ! r->m_url[0] ) {
		st->m_docId1 = r->m_docId;
		st->m_docId2 = r->m_docId;
	}

	// but if we are requesting an available docid, it might be taken
	// so try the range
	if ( r->m_getAvailDocIdOnly ) {
		int64_t pd = r->m_docId;
		int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
		int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
		// sanity - bad url with bad subdomain?
		if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
		// make sure we get a decent sample in titledb then in
		// case the docid we wanted is not available
		st->m_docId1 = d1;
		st->m_docId2 = d2;
	}

	// . otherwise, url was given, like from Msg15
	// . we may get multiple tfndb recs
	if ( r->m_url[0] ) {
		int32_t dlen = 0;
		// this causes ip based urls to be inconsistent with the call
		// to getProbableDocId(url) below
		char *dom = getDomFast ( r->m_url , &dlen );
		// bogus url?
		if ( ! dom ) {
			log("msg22: got bad url in request: %s from "
			    "hostid %" PRId32" for msg22 call ",
			    r->m_url,slot->m_host->m_hostId);
			g_errno = EBADURL;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( slot , g_errno );
			mdelete ( st , sizeof(State22) , "Msg22" );
			delete ( st );
			return;
		}
		int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
		int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
		int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
		// sanity - bad url with bad subdomain?
		if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
		// store these
		st->m_pd     = pd;
		st->m_docId1 = d1;
		st->m_docId2 = d2;
		st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
	}

	QUICKPOLL ( r->m_niceness );

	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB  ,
				    r->m_collnum ,
				    &st->m_tlist ,
				    startKey     , // startKey
				    endKey       , // endKey
				    500000000    , // minRecSizes
				    true         , // includeTree
				    false,//r->m_addToCache , // addToCache?
				    0,//r->m_maxCacheAge , // max cache age
				    0,//startFileNum ,
				    -1           , // numFiles
				    st           , // state ,
				    gotTitleList ,
				    r->m_niceness ,
				    true         , // do error correct?
				    &cacheKey    ,
				    0            , // retry num
				    -1           , // maxRetries
				    true         , // compensate for merge
				    -1LL         ) ) // sync point
		return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
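// . illustrative sketch -- NOT part of the original source
// . handleRequest22() brackets the probable docid with
//   getFirstProbableDocId()/getLastProbableDocId() and then reads every
//   title rec in that range, because a url's "probable" docid may already
//   be taken and the real (or available) docid can be any value inside the
//   bracket; the uh48 match then picks the right rec
// . the 8-bit bracket width below is a made-up illustration, not the
//   actual titledb key layout
#include <cstdint>
#include <cstdio>

static const int64_t kBracketMask = 0xffLL; // hypothetical low-bit span

static int64_t firstProbable ( int64_t pd ) { return pd & ~kBracketMask; }
static int64_t lastProbable  ( int64_t pd ) { return pd |  kBracketMask; }

int main () {
	int64_t pd = 261670033643LL; // sample docid from the collision log above
	int64_t d1 = firstProbable ( pd );
	int64_t d2 = lastProbable  ( pd );
	// same sanity check the handler performs before reading titledb
	if ( pd < d1 || pd > d2 ) return 1;
	printf ( "scan titledb docids [%lld,%lld] for pd=%lld\n" ,
		 (long long)d1 , (long long)d2 , (long long)pd );
	return 0;
}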
bool RdbMerge::getAnotherList ( ) {
	log(LOG_DEBUG,"db: Getting another list for merge.");

	// clear it up in case it was already set
	g_errno = 0;

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// if merging titledb files, we must adjust m_endKey so we do
	// not have to read a huge 200MB+ tfndb list
	//key_t newEndKey = m_endKey;
	char newEndKey[MAX_KEY_BYTES];
	KEYSET(newEndKey,m_endKey,m_ks);

	// . this returns false if blocked, true otherwise
	// . sets g_errno on error
	// . we return false if it blocked
	// . m_maxBufSize may be exceeded by a rec, it's just a target size
	// . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0
	// . this was a call to Msg3, but i made it call Msg5 since
	//   we now do the merging in Msg5, not in msg3 anymore
	// . this will now handle truncation, dup and neg rec removal
	// . it remembers last termId and count so it can truncate even when
	//   IndexList is split between successive reads
	// . IMPORTANT: when merging titledb we could be merging about 255
	//   files, so if we are limited to only X fds it can have a cascade
	//   effect where reading from one file closes the fd of another file
	//   in the read (since we call open before spawning the read thread)
	//   and can therefore take 255 retries for the Msg3 to complete
	//   because each read gives an EFILECLOSED error.
	//   so to fix it we allow one retry for each file in the read plus
	//   the original retry of 25
	int32_t nn = base->getNumFiles();
	if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles;

	// don't access any biased page caches
	bool usePageCache = true;
	if ( m_rdbId == RDB_CLUSTERDB )
		usePageCache = false;

	// . i don't trust page cache too much (mdw)... well, give it a shot
	// . see if this helps fix WD corruption... i doubt it
	usePageCache = false;

	// for now force to 100k
	int32_t bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes

	// get it
	return m_msg5.getList ( m_rdbId        ,
				m_collnum      ,
				&m_list        ,
				m_startKey     ,
				newEndKey      , // usually is maxed!
				bufSize        ,
				false          , // includeTree?
				false          , // add to cache?
				0              , // max cache age for lookup
				m_startFileNum , // startFileNum
				m_numFiles     ,
				this           , // state
				gotListWrapper , // callback
				m_niceness     , // niceness
				true           , // do error correction?
				NULL           , // cache key ptr
				0              , // retry #
				nn + 75        , // max retries (make it high)
				false          , // compensate for merge?
				-1LL           , // sync point
				true           , // isRealMerge? absolutely!
				usePageCache   );
}
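// . illustrative sketch -- NOT part of the original source
// . shows the retry-budget arithmetic used by the getList call above:
//   one retry per file in the read (each retry may hit EFILECLOSED when a
//   later open steals the fd) plus a fixed cushion of 75
// . mergeMaxRetries() is a hypothetical helper, not a gb API
#include <cstdint>
#include <cstdio>

static int32_t mergeMaxRetries ( int32_t filesInRead ) {
	const int32_t baseRetries = 75; // fixed cushion used in the call above
	return filesInRead + baseRetries;
}

int main () {
	// e.g. a titledb merge reading from 255 files
	printf ( "max retries = %d\n" , mergeMaxRetries ( 255 ) );
	return 0;
}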