// . return false if blocked, true otherwise // . set g_errno on error // . list should be truncated, possible have all negative keys removed, // and de-duped thanks to RdbList::indexMerge_r() and RdbList::merge_r() bool RdbMerge::dumpList ( ) { // return true on g_errno if ( g_errno ) return true; // . it's suspended so we count this as blocking // . resumeMerge() will call getNextList() again, not dumpList() so // don't advance m_startKey if ( m_isSuspended ) { m_isReadyToSave = true; return false; } // . set the list to only those records that should be in our group // . filter the records that don't belong in this group via groupId //filterList ( &m_list ); // . compute the new m_startKey to get the next list from disk // . m_list was formed via RdbList::merge() // . m_list may be empty because of negative/positive collisions // but there may still be data left //m_startKey = m_list.getLastKey() ; //m_list.getLastKey(m_startKey) ; // if we use getLastKey() for this the merge completes but then // tries to merge two empty lists and cores in the merge function // because of that. i guess it relies on endkey rollover only and // not on reading less than minRecSizes to determine when to stop // doing the merge. m_list.getEndKey(m_startKey) ; //m_startKey += (uint32_t)1; KEYADD(m_startKey,m_ks); ///// // // dedup for spiderdb before we dump it. try to save disk space. // ///// if ( m_rdbId == RDB_SPIDERDB ) // removeNegRecs? = false dedupSpiderdbList(&m_list, false); // if the startKey rolled over we're done //if ( m_startKey.n0 == 0LL && m_startKey.n1 == 0 ) m_doneMerging=true; if ( KEYCMP(m_startKey,KEYMIN(),m_ks)==0 ) m_doneMerging = true; // debug msg log(LOG_DEBUG,"db: Dumping list."); // debug msg //fprintf(stderr,"list startKey.n1=%" PRIu32",n0=%" PRIu64", endKey.n1=%" PRIu32",n0=%" PRIu64"," // " size=%" PRId32"\n", // m_list.getStartKey().n1, // m_list.getStartKey().n0, // m_list.getLastKey().n1, // m_list.getLastKey().n0, m_list.getListSize() ); // . send the whole list to the dump // . it returns false if blocked, true otherwise // . it sets g_errno on error // . it calls dumpListWrapper when done dumping // . return true if m_dump had an error or it did not block // . if it gets a EFILECLOSED error it will keep retrying forever return m_dump.dumpList ( &m_list , m_niceness , false/*recall?*/ ) ; }
// . returns a new, smaller endKey // . shrinks endKey while still preserving the minRecSizes requirement // . this is the most confusing subroutine in the project // . this now OVERWRITES endKey with the new one //key_t Msg3::setPageRanges ( RdbBase *base , void Msg3::setPageRanges ( RdbBase *base , int32_t *fileNums , int32_t numFileNums , const char *startKey , char *endKey , int32_t minRecSizes ) { // sanity check //if ( m_ks != 12 && m_ks != 16 ) { char *xx=NULL;*xx=0; } // get the file maps from the rdb RdbMap **maps = base->getMaps(); // . initialize the startpg/endpg for each file // . we read from the first offset on m_startpg to offset on m_endpg // . since we set them equal that means an empty range for each file for ( int32_t i = 0 ; i < numFileNums ; i++ ) { int32_t fn = fileNums[i]; if ( fn < 0 ) { char *xx = NULL; *xx = 0; } m_startpg[i] = maps[fn]->getPage( startKey ); m_endpg [i] = m_startpg[i]; } // just return if minRecSizes 0 (no reading needed) //if ( minRecSizes <= 0 ) return endKey ; if ( minRecSizes <= 0 ) return; // calculate minKey minus one //key_t lastMinKey ; char lastMinKey[MAX_KEY_BYTES]; char lastMinKeyIsValid = 0; // loop until we find the page ranges that barely satisfy "minRecSizes" loop: // find the map whose next page has the lowest key int32_t minpg = -1; //key_t minKey; char minKey[MAX_KEY_BYTES]; for ( int32_t i = 0 ; i < numFileNums ; i++ ) { int32_t fn = fileNums[i]; // this guy is out of race if his end key > "endKey" already //if ( maps[fn]->getKey ( m_endpg[i] ) > endKey ) continue; if(KEYCMP(maps[fn]->getKeyPtr(m_endpg[i]),endKey,m_ks)>0) continue; // get the next page after m_endpg[i] int32_t nextpg = m_endpg[i] + 1; // if endpg[i]+1 == m_numPages then we maxed out this range if ( nextpg > maps[fn]->getNumPages() ) continue; // . but this may have an offset of -1 // . which means the page has no key starting on it and // it's occupied by a rec which starts on a previous page while ( nextpg < maps[fn]->getNumPages() && maps[fn]->getOffset ( nextpg ) == -1 ) nextpg++; // . continue if his next page doesn't have the minimum key // . if nextpg == getNumPages() then it returns the LAST KEY // contained in the corresponding RdbFile //if ( minpg != -1 && maps[fn]->getKey ( nextpg ) > minKey ) if (minpg != -1 && KEYCMP(maps[fn]->getKeyPtr(nextpg),minKey,m_ks)>0)continue; // . we got a winner, his next page has the current min key // . if m_endpg[i]+1 == getNumPages() then getKey() returns the // last key in the mapped file // . minKey should never equal the key on m_endpg[i] UNLESS // it's on page #m_numPages //minKey = maps[fn]->getKey ( nextpg ); KEYSET(minKey,maps[fn]->getKeyPtr(nextpg),m_ks); minpg = i; // if minKey is same as the current key on this endpg, inc it // so we cause some advancement, otherwise, we'll loop forever //if ( minKey != maps[fn]->getKey ( m_endpg[i] ) ) continue; if ( KEYCMP(minKey,maps[fn]->getKeyPtr(m_endpg[i]),m_ks)!=0) continue; //minKey += (uint32_t) 1; KEYADD(minKey,m_ks); } // . we're done if we hit the end of all maps in the race // . return the max end key // key_t maxEndKey; maxEndKey.setMax(); return maxEndKey; } // . no, just the endKey //if ( minpg == -1 ) return endKey; if ( minpg == -1 ) return; // sanity check if ( lastMinKeyIsValid && KEYCMP(minKey,lastMinKey,m_ks)<=0 ) { g_errno = ECORRUPTDATA; log("db: Got corrupted map in memory for %s. This is almost " "always because of bad memory. Please replace your RAM.", base->m_dbname); // do not wait for any merge to complete... otherwise // Rdb.cpp will not close until the merge is done g_merge.m_isMerging = false; g_merge2.m_isMerging = false; // to complete // shutdown with urgent=true so threads are disabled. g_process.shutdown(true); //g_numCorrupt++; // sleep for now until we make sure this works //sleep(2000); return; } // don't let minKey exceed endKey, however //if ( minKey > endKey ) { if ( KEYCMP(minKey,endKey,m_ks)>0 ) { //minKey = endKey ; //minKey += (uint32_t) 1; //lastMinKey = endKey; KEYSET(minKey,endKey,m_ks); KEYADD(minKey,m_ks); KEYSET(lastMinKey,endKey,m_ks); } else { //lastMinKey = minKey ; //lastMinKey -= (uint32_t) 1; KEYSET(lastMinKey,minKey,m_ks); KEYSUB(lastMinKey,m_ks); } // it is now valid lastMinKeyIsValid = 1; // . advance m_endpg[i] so that next page < minKey // . we want to read UP TO the first key on m_endpg[i] for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { int32_t fn = fileNums[i]; m_endpg[i] = maps[fn]->getEndPage ( m_endpg[i], lastMinKey ); } // . if the minKey is BIGGER than the provided endKey we're done // . we don't necessarily include records whose key is "minKey" //if ( minKey > endKey ) return endKey; if ( KEYCMP(minKey,endKey,m_ks)>0) return; // . calculate recSizes per page within [startKey,minKey-1] // . compute bytes of records in [startKey,minKey-1] for each map // . this includes negative records so we may have annihilations // when merging into "diskList" and get less than what we wanted // but endKey should be shortened, so our caller will know to call // again if he wants more int32_t recSizes = 0; for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { int32_t fn = fileNums[i]; recSizes += maps[fn]->getMinRecSizes ( m_startpg[i] , m_endpg [i] , startKey , lastMinKey , false ); } // if we hit it then return minKey -1 so we only read UP TO "minKey" // not including "minKey" //if ( recSizes >= minRecSizes ) if ( recSizes >= minRecSizes ) { // . sanity check // . this sanity check fails sometimes, but leave it // out for now... causes the Illegal endkey msgs in // RdbList::indexMerge_r() //if ( KEYNEG(lastMinKey) ) { char *xx=NULL;*xx=0; } KEYSET(endKey,lastMinKey,m_ks); //return lastMinKey; return; } // keep on truckin' goto loop; }
// . buffer is used for reading and writing // . return false if blocked, true otherwise // . sets g_errno on error // . if niceness is 0 merge will block, otherwise will not block // . we now use niceness of 1 which should spawn threads that don't allow // niceness 2 threads to launch while they're running // . spider process now uses mostly niceness 2 // . we need the merge to take priority over spider processes on disk otherwise // there's too much contention from spider lookups on disk for the merge // to finish in a decent amount of time and we end up getting too many files! bool RdbMerge::merge ( char rdbId , //char *coll , //RdbBase *base , collnum_t collnum, BigFile *target , RdbMap *targetMap , long id2 , // target's secondary id long startFileNum , long numFiles , long niceness , class DiskPageCache *pc , long long maxTargetFileSize , char keySize ) { // reset ourselves reset(); // set it m_rdbId = rdbId; Rdb *rdb = getRdbFromId ( rdbId ); // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base; if (!(base=getRdbBase(m_rdbId,collnum))) return true; // don't breech the max //if ( numFiles > m_maxFilesToMerge ) numFiles = m_maxFilesToMerge; // reset this map! it's m_crcs needs to be reset //targetMap->reset(); // remember some parms //if ( ! coll && rdb->m_isCollectionLess ) // strcpy ( m_coll , rdb->m_dbname ); //else // strcpy ( m_coll , coll ); m_collnum = collnum; if ( rdb->m_isCollectionLess ) m_collnum = 0; m_target = target; m_targetMap = targetMap; m_id2 = id2; m_startFileNum = startFileNum; m_numFiles = numFiles; m_dedup = base->m_dedup; m_fixedDataSize = base->m_fixedDataSize; m_niceness = niceness; m_pc = pc; m_maxTargetFileSize = maxTargetFileSize; m_doneMerging = false; m_ks = keySize; // . set the key range we want to retrieve from the files // . just get from the files, not tree (not cache?) //m_startKey.setMin(); //m_endKey.setMax(); KEYMIN(m_startKey,m_ks); KEYMAX(m_endKey,m_ks); // if we're resuming a killed merge, set m_startKey to last // key the map knows about. // the dump will start dumping at the end of the targetMap's data file. if ( m_targetMap->getNumRecs() > 0 ) { log(LOG_INIT,"db: Resuming a killed merge."); //m_startKey = m_targetMap->getLastKey(); m_targetMap->getLastKey(m_startKey); //m_startKey += (unsigned long) 1; KEYADD(m_startKey,1,m_ks); // if power goes out and we are not doing synchronous writes // then we could have completely lost some data and unlinked // a part file from the file being merged, so that the data is // gone. to be able to resume merging, we must increment the // startKey until it references a valid offset in all the // files being merged. invalid offsets will reference parts // that have been chopped. /* RdbMap **maps = rdb->getMaps(); BigFile **files = rdb->getFiles(); for ( long i=m_startFileNum;i<m_startFileNum+m_numFiles;i++){ long long minOff = 0LL; long k = 0; while ( k < files[i]->m_maxParts && ! files[i]->m_files[k] ) { k++; minOff += MAX_PART_SIZE; } long pn0 = maps[i]->getPage ( m_startKey ); long pn = pn0; while ( maps[i]->getAbsoluteOffset(pn) < minOff ) pn++; if ( pn != pn0 ) { log("db: Lost data during merge. Starting " "merge at page number %li from %li for " "file.",pn,pn0); m_startKey = maps[i]->getKey ( pn ); } } */ } // free our list's memory, just in case //m_list.freeList(); // . we may have multiple hosts running on the same cpu/hardDrive // . therefore, to maximize disk space, we should only have 1 merge // at a time going on between these hosts // . now tfndb has own merge class since titledb merge writes url recs /* if ( s_isMergeLocked ) { //log("RdbMerge::merge: someone else merging sleeping."); log("RdbMerge::merge: someone else merging. bad engineer."); return false; // if it fails then sleep until it works //returng_loop.registerSleepCallback(5000,this,getLockWrapper); } */ return gotLock(); }
// . returns false if blocked, true otherwise // . sets g_errno on error // . dumps the RdbTree, m_tree, into m_file // . also sets and writes the RdbMap for m_file // . we methodically get RdbLists from the RdbTree // . dumped recs are ordered by key if "orderedDump" was true in call to set() // otherwise, lists are ordered by node # // . we write each list of recs to the file until the whole tree has been done // . we delete all records in list from the tree after we've written the list // . if a cache was provided we incorporate the list into the cache before // deleting it from the tree to keep the cache in sync. NO we do NOT! // . called again by writeBuf() when it's done writing the whole list bool RdbDump::dumpTree ( bool recall ) { // set up some vars //int32_t nextNode; //key_t maxEndKey; //maxEndKey.setMax(); char maxEndKey[MAX_KEY_BYTES]; KEYMAX(maxEndKey,m_ks); // if dumping statsdb, we can only dump records 30 seconds old or // more because Statsdb.cpp can "back modify" such records in the tree // because it may have a query that took 10 seconds come in then it // needs to add a partial stat to the last 10 stats for those 10 secs. // we use Global time at this juncture if ( m_rdb->m_rdbId == RDB_STATSDB ) { int32_t nowSecs = getTimeGlobal(); StatKey *sk = (StatKey *)maxEndKey; sk->m_zero = 0x01; sk->m_labelHash = 0xffffffff; // leave last 60 seconds in there just to be safe sk->m_time1 = nowSecs - 60; } // this list will hold the list of nodes/recs from m_tree m_list = &m_ourList; // convert coll to collnum //collnum_t collnum = g_collectiondb.getCollnum ( m_coll ); // a collnum of -1 is for collectionless rdbs //if ( collnum < 0 ) { // //if ( g_catdb->getRdb() == m_rdb ) // if ( ! m_rdb->m_isCollectionLess ) { // char *xx=NULL;*xx=0; //return true; // } // g_errno = 0; // collnum = 0; //} // getMemOccupiedForList2() can take some time, so breathe int32_t niceness = 1; loop: // if the lastKey was the max end key last time then we're done if ( m_rolledOver ) return true; // this is set to -1 when we're done with our unordered dump if ( m_nextNode == -1 ) return true; // . NOTE: list's buffer space should be re-used!! (TODO) // . "lastNode" is set to the last node # in the list bool status = true; //if ( ! m_orderedDump ) { // status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode , // m_maxBufSize , // m_list , // &nextNode ); // // this is -1 when no more nodes are left // m_nextNode = nextNode; //} // "lastKey" is set to the last key in the list //else { { // can we remove neg recs? // class RdbBase *base = m_rdb->getBase(m_collnum); // bool removeNegRecs = false; // if ( base->m_numFiles <= 0 ) removeNegRecs = true; if ( recall ) goto skip; // debug msg //log("RdbDump:: getting list"); m_t1 = gettimeofdayInMilliseconds(); if(m_tree) status = m_tree->getList ( m_collnum , m_nextKey , maxEndKey , m_maxBufSize , // max recSizes m_list , &m_numPosRecs , &m_numNegRecs , m_useHalfKeys , niceness ); else if(m_buckets) status = m_buckets->getList ( m_collnum, m_nextKey , maxEndKey , m_maxBufSize , // max recSizes m_list , &m_numPosRecs , &m_numNegRecs , m_useHalfKeys ); // don't dump out any neg recs if it is our first time dumping // to a file for this rdb/coll. TODO: implement this later. //if ( removeNegRecs ) // m_list.removeNegRecs(); // if(!m_list->checkList_r ( false , // removeNegRecs? // false , // sleep on problem? // m_rdb->m_rdbId )) { // log("db: list to dump is not sane!"); // char *xx=NULL;*xx=0; // } skip: int64_t t2; //key_t lastKey; char *lastKey; // if error getting list (out of memory?) if ( ! status ) goto hadError; // debug msg t2 = gettimeofdayInMilliseconds(); log(LOG_INFO,"db: Get list took %"INT64" ms. " "%"INT32" positive. %"INT32" negative.", t2 - m_t1 , m_numPosRecs , m_numNegRecs ); // keep a total count for reporting when done m_totalPosDumped += m_numPosRecs; m_totalNegDumped += m_numNegRecs; // . check the list we got from the tree for problems // . ensures keys are ordered from lowest to highest as well //#ifdef GBSANITYCHECK if ( g_conf.m_verifyWrites ) { char *s = "none"; if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId); log("dump: verifying list before dumping (rdb=%s)",s); m_list->checkList_r ( false , // removeNegRecs? false , // sleep on problem? m_rdb->m_rdbId ); } // if list is empty, we're done! if ( status && m_list->isEmpty() ) { // consider that a rollover? if ( m_rdb->m_rdbId == RDB_STATSDB ) m_rolledOver = true; return true; } // get the last key of the list lastKey = m_list->getLastKey(); // advance m_nextKey //m_nextKey = lastKey ; //m_nextKey += (uint32_t)1; //if ( m_nextKey < lastKey ) m_rolledOver = true; KEYSET(m_nextKey,lastKey,m_ks); KEYADD(m_nextKey,1,m_ks); if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true; // debug msg //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0); //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0); } // . return true on error, g_errno should have been set // . this is probably out of memory error if ( ! status ) { hadError: log("db: Had error getting data for dump: %s. Retrying.", mstrerror(g_errno)); // debug msg //log("RdbDump::getList: sleeping and retrying"); // retry for the remaining two types of errors if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){ log( "db: Retry failed. Could not register callback."); return true; } // wait for sleep return false; } // if list is empty, we're done! if ( m_list->isEmpty() ) return true; // . set m_firstKeyInQueue and m_lastKeyInQueue // . this doesn't work if you're doing an unordered dump, but we should // not allow adds when closing m_lastKeyInQueue = m_list->getLastKey(); //m_firstKeyInQueue = m_list->getCurrentKey(); m_list->getCurrentKey(m_firstKeyInQueue); // . write this list to disk // . returns false if blocked, true otherwise // . sets g_errno on error // . if this blocks it should call us (dumpTree() back) if ( ! dumpList ( m_list , m_niceness , false ) ) return false; // close up shop on a write/dumpList error if ( g_errno ) return true; // . if dumpList() did not block then keep on truckin' // . otherwise, wait for callback of dumpTree() goto loop; }