// scan all Rdb databases and ensure no recs (it was a clean delete)
bool checkRdbLists ( long *rdbId ) {
        CollectionRec *cr = g_collectiondb.getRec ( "qatest123" );
        if ( ! cr ) return true;
        collnum_t cn = cr->m_collnum;
        for ( ; *rdbId < RDB_END ; ) {
                // pre-inc it
                *rdbId = *rdbId + 1;
                char minKey[MAX_KEY_BYTES];
                char maxKey[MAX_KEY_BYTES];
                KEYMIN ( minKey , MAX_KEY_BYTES );
                KEYMAX ( maxKey , MAX_KEY_BYTES );
                if ( ! s_msg0.getList ( 0 ,     // hostid
                                        0 ,     // ip
                                        0 ,     // port
                                        0 ,     // cacheage
                                        false , // addtocache
                                        *rdbId , // rdbid
                                        cn ,     // collnum
                                        &s_list ,
                                        minKey ,
                                        maxKey ,
                                        1000 ,   // minRecSizes
                                        rdbId ,  // state
                                        gotList33 ,
                                        0 ) )    // niceness
                        return false;
        }
        return true;
}
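// The gotList33 callback itself is not shown in this excerpt. A minimal
// sketch of what it presumably does, given the comment above ("ensure no
// recs" after a clean delete): check that the returned list is empty, then
// resume the scan. The name gotList33Sketch and its logging are
// illustrative assumptions, not the original callback.
static void gotList33Sketch ( void *state ) {
        // state is the rdbId cursor we passed to Msg0::getList() above
        long *rdbId = (long *)state;
        // a clean delete means every rdb should return zero records
        if ( ! s_list.isEmpty() )
                log("qa: rdbid %li still has %li bytes of recs",
                    *rdbId , (long)s_list.getListSize() );
        // resume scanning the remaining rdbs where we left off
        checkRdbLists ( rdbId );
}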
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . list should be truncated, possibly have all negative keys removed,
//   and be de-duped thanks to RdbList::indexMerge_r() and RdbList::merge_r()
bool RdbMerge::dumpList ( ) {
        // return true on g_errno
        if ( g_errno ) return true;
        // . it's suspended so we count this as blocking
        // . resumeMerge() will call getNextList() again, not dumpList(), so
        //   don't advance m_startKey
        if ( m_isSuspended ) {
                m_isReadyToSave = true;
                return false;
        }
        // . set the list to only those records that should be in our group
        // . filter the records that don't belong in this group via groupId
        //filterList ( &m_list );
        // . compute the new m_startKey to get the next list from disk
        // . m_list was formed via RdbList::merge()
        // . m_list may be empty because of negative/positive collisions,
        //   but there may still be data left
        //m_startKey = m_list.getLastKey() ;
        //m_list.getLastKey(m_startKey) ;
        // if we use getLastKey() for this the merge completes but then
        // tries to merge two empty lists and cores in the merge function
        // because of that. i guess it relies on endKey rollover only and
        // not on reading less than minRecSizes to determine when to stop
        // doing the merge.
        m_list.getEndKey ( m_startKey );
        //m_startKey += (uint32_t)1;
        KEYADD ( m_startKey , m_ks );

        /////
        //
        // dedup spiderdb before we dump it. try to save disk space.
        //
        /////
        if ( m_rdbId == RDB_SPIDERDB )
                // removeNegRecs? = false
                dedupSpiderdbList ( &m_list , false );

        // if the startKey rolled over we're done
        //if ( m_startKey.n0 == 0LL && m_startKey.n1 == 0 ) m_doneMerging=true;
        if ( KEYCMP ( m_startKey , KEYMIN() , m_ks ) == 0 )
                m_doneMerging = true;

        // debug msg
        log(LOG_DEBUG,"db: Dumping list.");
        // debug msg
        //fprintf(stderr,"list startKey.n1=%"PRIu32",n0=%"PRIu64", "
        //      "endKey.n1=%"PRIu32",n0=%"PRIu64", size=%"PRId32"\n",
        //      m_list.getStartKey().n1, m_list.getStartKey().n0,
        //      m_list.getLastKey().n1,  m_list.getLastKey().n0,
        //      m_list.getListSize() );

        // . send the whole list to the dump
        // . it returns false if blocked, true otherwise
        // . it sets g_errno on error
        // . it calls dumpListWrapper when done dumping
        // . return true if m_dump had an error or it did not block
        // . if it gets an EFILECLOSED error it will keep retrying forever
        return m_dump.dumpList ( &m_list , m_niceness , false/*recall?*/ );
}
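// m_doneMerging above is detected purely by key rollover: KEYADD() of the
// maximum key wraps m_startKey back to all zero bytes, which then compares
// equal to KEYMIN(). A self-contained sketch of that mechanic, assuming
// keys are fixed-width unsigned integers stored least-significant byte
// first (as key_t's n0/n1 fields suggest); keyIncSketch is a hypothetical
// name, not the real KEYADD():
static bool keyIncSketch ( unsigned char *k , int ks ) {
        // add 1 with carry, starting at the least significant byte
        for ( int i = 0 ; i < ks ; i++ )
                if ( ++k[i] != 0 ) return false; // no carry; done
        return true; // every byte carried: key wrapped to all zeros
}
// usage: a 12-byte key of all 0xff bytes wraps and signals "done merging"
//   unsigned char k[12]; memset ( k , 0xff , 12 );
//   bool done = keyIncSketch ( k , 12 ); // true, and k is now all zeros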
// . buffer is used for reading and writing
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . if niceness is 0 the merge will block, otherwise it will not block
// . we now use a niceness of 1, which should spawn threads that don't allow
//   niceness 2 threads to launch while they're running
// . the spider process now uses mostly niceness 2
// . we need the merge to take priority over spider processes on disk,
//   otherwise there's too much contention from spider lookups on disk for
//   the merge to finish in a decent amount of time and we end up getting
//   too many files!
bool RdbMerge::merge ( char rdbId ,
                       //char *coll ,
                       //RdbBase *base ,
                       collnum_t collnum ,
                       BigFile *target ,
                       RdbMap *targetMap ,
                       long id2 ,          // target's secondary id
                       long startFileNum ,
                       long numFiles ,
                       long niceness ,
                       class DiskPageCache *pc ,
                       long long maxTargetFileSize ,
                       char keySize ) {
        // reset ourselves
        reset();
        // set it
        m_rdbId = rdbId;
        Rdb *rdb = getRdbFromId ( rdbId );
        // get base, returns NULL and sets g_errno to ENOCOLLREC on error
        RdbBase *base;
        if ( ! (base=getRdbBase(m_rdbId,collnum)) ) return true;
        // don't breach the max
        //if ( numFiles > m_maxFilesToMerge ) numFiles = m_maxFilesToMerge;
        // reset this map! its m_crcs need to be reset
        //targetMap->reset();
        // remember some parms
        //if ( ! coll && rdb->m_isCollectionLess )
        //      strcpy ( m_coll , rdb->m_dbname );
        //else
        //      strcpy ( m_coll , coll );
        m_collnum = collnum;
        if ( rdb->m_isCollectionLess ) m_collnum = 0;
        m_target            = target;
        m_targetMap         = targetMap;
        m_id2               = id2;
        m_startFileNum      = startFileNum;
        m_numFiles          = numFiles;
        m_dedup             = base->m_dedup;
        m_fixedDataSize     = base->m_fixedDataSize;
        m_niceness          = niceness;
        m_pc                = pc;
        m_maxTargetFileSize = maxTargetFileSize;
        m_doneMerging       = false;
        m_ks                = keySize;
        // . set the key range we want to retrieve from the files
        // . just get from the files, not tree (not cache?)
        //m_startKey.setMin();
        //m_endKey.setMax();
        KEYMIN ( m_startKey , m_ks );
        KEYMAX ( m_endKey   , m_ks );
        // if we're resuming a killed merge, set m_startKey to the last
        // key the map knows about.
        // the dump will start dumping at the end of the targetMap's
        // data file.
        if ( m_targetMap->getNumRecs() > 0 ) {
                log(LOG_INIT,"db: Resuming a killed merge.");
                //m_startKey = m_targetMap->getLastKey();
                m_targetMap->getLastKey ( m_startKey );
                //m_startKey += (unsigned long) 1;
                KEYADD ( m_startKey , m_ks );
                // if power goes out and we are not doing synchronous writes
                // then we could have completely lost some data and unlinked
                // a part file from the file being merged, so that the data is
                // gone. to be able to resume merging, we must increment the
                // startKey until it references a valid offset in all the
                // files being merged. invalid offsets will reference parts
                // that have been chopped.
                /*
                RdbMap  **maps  = rdb->getMaps();
                BigFile **files = rdb->getFiles();
                for ( long i=m_startFileNum;i<m_startFileNum+m_numFiles;i++){
                        long long minOff = 0LL;
                        long k = 0;
                        while ( k < files[i]->m_maxParts &&
                                ! files[i]->m_files[k] ) {
                                k++;
                                minOff += MAX_PART_SIZE;
                        }
                        long pn0 = maps[i]->getPage ( m_startKey );
                        long pn  = pn0;
                        while ( maps[i]->getAbsoluteOffset(pn) < minOff ) pn++;
                        if ( pn != pn0 ) {
                                log("db: Lost data during merge. Starting "
                                    "merge at page number %li from %li for "
                                    "file.",pn,pn0);
                                m_startKey = maps[i]->getKey ( pn );
                        }
                }
                */
        }
        // free our list's memory, just in case
        //m_list.freeList();
        // . we may have multiple hosts running on the same cpu/hard drive
        // . therefore, to maximize disk space, we should only have 1 merge
        //   at a time going on between these hosts
        // . now tfndb has its own merge class since the titledb merge
        //   writes url recs
        /*
        if ( s_isMergeLocked ) {
                //log("RdbMerge::merge: someone else merging. sleeping.");
                log("RdbMerge::merge: someone else merging. bad engineer.");
                return false;
                // if it fails then sleep until it works
                //return g_loop.registerSleepCallback(5000,this,
                //                                    getLockWrapper);
        }
        */
        return gotLock();
}
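// How resuming a killed merge works, in miniature: the target map is the
// durable record of what already reached disk, so the next read begins at
// the map's last key plus one and the next write begins at the map's
// mapped file size (see gotLock() below). A hedged sketch using a plain
// struct as a stand-in for the RdbMap state; these names are hypothetical,
// not the actual RdbMap API:
#include <cstring>

struct MapStateSketch {
        unsigned char lastKey[12]; // last key the map recorded
        long long     mappedBytes; // bytes of target file covered by map
        long          numRecs;     // recs mapped so far
};

static void resumeFromMapSketch ( const MapStateSketch &m ,
                                  unsigned char *startKey ,
                                  long long *writeOffset ) {
        if ( m.numRecs > 0 ) {
                // pick up one key past the last durable record
                memcpy ( startKey , m.lastKey , 12 );
                keyIncSketch ( startKey , 12 ); // from the sketch above
                *writeOffset = m.mappedBytes;
        }
        else {
                // fresh merge: start at the minimum key, offset 0
                memset ( startKey , 0 , 12 );
                *writeOffset = 0;
        }
}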
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool RdbMerge::gotLock ( ) {
        // get total recSizes of files we're merging
        //long totalSize = 0;
        //for ( long i=m_startFileNum ; i<m_startFileNum+m_numFiles ; i++ )
        //      totalSize += m_base->m_files[i]->getSize();
        // . grow the map now so it doesn't have to keep growing dynamically,
        //   which wastes memory
        // . setMapSize() returns false and sets g_errno on error
        // . we return true if it had an error
        //if ( ! m_targetMap->setMapSizeFromFileSize ( totalSize ) ) {
        //      log("RdbMerge::getLockFile: targetMap setMapSize failed");
        //      return true;
        //}
        // . get last mapped offset
        // . this may actually be smaller than the file's actual size,
        //   but the excess is not in the map, so we need to do it again
        long long startOffset = m_targetMap->getFileSize();
        // if startOffset is > 0 use the last key as RdbDump::m_prevLastKey
        // so it can compress the next key it dumps, provided m_useHalfKeys
        // is true (key compression) and the next key has the same top 6
        // bytes as m_prevLastKey
        //key_t prevLastKey;
        //if ( startOffset > 0 ) prevLastKey = m_targetMap->getLastKey();
        //else                   prevLastKey.setMin();
        char prevLastKey[MAX_KEY_BYTES];
        if ( startOffset > 0 ) m_targetMap->getLastKey ( prevLastKey );
        else                   KEYMIN ( prevLastKey , m_ks );
        // get base, returns NULL and sets g_errno to ENOCOLLREC on error
        RdbBase *base;
        if ( ! (base=getRdbBase(m_rdbId,m_collnum)) ) return true;
        // . set up a file to dump the records into
        // . returns false and sets g_errno on error
        // . this will open m_target as O_RDWR | O_NONBLOCK | O_ASYNC ...
        m_dump.set ( m_collnum ,
                     m_target ,
                     m_id2 ,
                     //m_startFileNum - 1 , // merge fileNum in Rdb::m_files[]
                     (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2) ,
                     NULL , // buckets to dump is NULL, we call dumpList
                     NULL , // tree to dump is NULL, we call dumpList
                     m_targetMap ,
                     NULL , // for caching dumped tree
                     0 ,    // m_maxBufSize. not needed if no tree!
                     true , // orderedDump?
                     m_dedup ,
                     m_niceness , // niceness of dump
                     this ,       // state
                     dumpListWrapper ,
                     base->useHalfKeys() ,
                     startOffset ,
                     prevLastKey ,
                     m_ks ,
                     m_pc ,
                     m_maxTargetFileSize ,
                     NULL );
        // set m_base::m_needsToSave? no.
        // what kind of error?
        if ( g_errno ) {
                log("db: gotLock: %s.", mstrerror(g_errno) );
                return true;
        }
        // . create a new msg3
        // . don't keep it static because it contains a msg3, treeList &
        //   diskList
        // . these can take up many megs of mem
        // . yes, but we need to avoid fragmentation, so hold on to our mem!
        //m_msg3 = new (Msg3);
        //if ( ! m_msg3 ) return false;
        // we're now merging since the dump was set up successfully
        m_isMerging = true;
        // make it suspended for now
        m_isSuspended = true;
        // grab the lock
        //s_isMergeLocked = true;
        // . this unsuspends it
        // . this returns false on error and sets g_errno
        // . it returns true if blocked or the merge completed successfully
        return resumeMerge ( );
}
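// prevLastKey above exists so RdbDump can keep half-key compression
// seamless across a resumed file: a key sharing its top 6 bytes with the
// key written just before it can be stored as only its low 6 bytes with a
// "half bit" set. A simplified sketch of that decision for 12-byte keys
// stored least-significant byte first; the exact bit position (bit 1 of
// byte 0, assumed reserved for this flag in every key) is an assumption
// here, not taken from this excerpt:
#include <cstring>

static int writeKeySketch ( const unsigned char *key ,
                            const unsigned char *prevKey ,
                            unsigned char *out ) {
        // same top 6 bytes as the previous key? emit a 6-byte half key
        if ( memcmp ( key + 6 , prevKey + 6 , 6 ) == 0 ) {
                memcpy ( out , key , 6 );
                out[0] |= 0x02;  // mark the record as a half key
                return 6;
        }
        // otherwise emit the full 12-byte key with the half bit clear
        memcpy ( out , key , 12 );
        out[0] &= ~0x02;
        return 12;
}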
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool RdbDump::set ( //char *coll ,
                    collnum_t collnum ,
                    BigFile *file ,
                    int32_t id2 ,         // in Rdb::m_files[] array
                    bool isTitledb ,
                    RdbBuckets *buckets , // optional buckets to dump
                    RdbTree *tree ,       // optional tree to dump
                    RdbMap *map ,
                    RdbCache *cache ,
                    int32_t maxBufSize ,
                    bool orderedDump ,    // dump in order of keys?
                    bool dedup ,          // for RdbCache::incorporateList()
                    int32_t niceness ,
                    void *state ,
                    void (* callback) ( void *state ) ,
                    bool useHalfKeys ,
                    int64_t startOffset ,
                    //key_t prevLastKey ,
                    char *prevLastKey ,
                    char keySize ,
                    //class DiskPageCache *pc ,
                    void *pc ,
                    int64_t maxFileSize ,
                    Rdb *rdb ) {

        if ( ! orderedDump ) {
                log(LOG_LOGIC,"db: RdbDump does not support non-ordered.");
                char *xx = NULL; *xx = 0;
        }

        //if ( ! coll && rdb->m_isCollectionLess )
        //      strcpy(m_coll,rdb->m_dbname);

        m_collnum = collnum;
        // use 0 for collectionless
        if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0;

        // are we like catdb/statsdb etc.?
        m_doCollCheck = true;
        if ( rdb && rdb->m_isCollectionLess ) m_doCollCheck = false;
        // RdbMerge also calls us, but rdb is always set to NULL then, and
        // it was causing a merge on catdb (collectionless) to screw up
        if ( ! rdb ) m_doCollCheck = false;

        /*
        if      ( ! coll && g_catdb.getRdb()    == rdb )
                strcpy(m_coll, "catdb");
        else if ( ! coll && g_statsdb.getRdb()  == rdb )
                strcpy(m_coll, "statsdb");
        else if ( ! coll && g_accessdb.getRdb() == rdb )
                strcpy(m_coll, "accessdb");
        */
        //else
        //      strcpy ( m_coll , coll );

        m_file        = file;
        m_id2         = id2;
        m_isTitledb   = isTitledb;
        m_buckets     = buckets;
        m_tree        = tree;
        m_map         = map;
        m_cache       = cache;
        m_orderedDump = orderedDump;
        m_dedup       = dedup;
        m_state       = state;
        m_callback    = callback;
        m_list        = NULL;
        m_niceness    = niceness;
        m_tried       = false;
        m_isSuspended = false;
        m_ks          = keySize;
        m_addToMap    = true;

        // reset this in case we run out of mem; it doesn't get set properly
        // and needs to be NULL for RdbMem's call to getLastKeyInQueue()
        m_lastKeyInQueue = NULL;
        KEYMIN ( m_firstKeyInQueue , m_ks );

        m_isDumping  = false;
        m_writing    = false;
        m_buf        = NULL;
        m_verifyBuf  = NULL;
        m_maxBufSize = maxBufSize;
        m_offset     = startOffset;
        m_rolledOver = false; // true if m_nextKey rolls over back to 0
        //m_nextKey = 0 ; // used in dumpTree()
        KEYMIN ( m_nextKey , m_ks );
        m_nextNode = 0; // used in dumpTree()
        // if we're dumping indexdb, allow half keys
        m_useHalfKeys = useHalfKeys;
        //m_prevLastKey = prevLastKey;
        KEYSET ( m_prevLastKey , prevLastKey , m_ks );
        // for setting m_rdb->m_needsSave after deleting the dump list
        m_rdb = rdb;

        // . don't dump to a pre-existing file
        // . seems like Rdb.cpp makes a new BigFile before calling this
        // . now that we can resume merges, we can indeed dump to the END
        //   of a pre-existing file, but not when dumping a tree!
        //if ( m_file->doesExist() > 0 ) {
        if ( (m_tree || m_buckets) && m_file->getFileSize() > 0 ) {
                g_errno = EEXIST;
                log("db: Could not dump to %s. File exists.",
                    m_file->getFilename());
                return true;
        }

        // . NOTE: MAX_PART_SIZE in BigFile must be defined to be bigger
        //   than anything we actually dump since we only anticipate
        //   spanning 1 file and so only register the first file's fd for
        //   write callbacks
        //if ( m_tree && m_tree->getMaxMem() > MAX_PART_SIZE )
        //      return log("RdbDump::dump: tree bigger than file part size");

        // . open the file nonblocking, sync with disk, read/write
        // . NOTE: O_SYNC doesn't work too well over NFS
        // . we need O_SYNC when dumping trees only because we delete the
        //   nodes/records as we dump them
        // . ensure this sets g_errno for us
        // . TODO: open might not block! fix that!
        int32_t flags = O_RDWR | O_CREAT;
        // a niceness bigger than 0 means to do non-blocking dumps
        if ( niceness > 0 ) flags |= O_ASYNC | O_NONBLOCK;
        if ( ! m_file->open ( flags , pc , maxFileSize ) ) return true;

        // . get the file descriptor of the first real file in BigFile
        // . we should only dump to the first file in BigFile, otherwise
        //   we'd have to juggle fd registration
        m_fd = m_file->getfd ( 0 , false /*for reading?*/ );
        if ( m_fd < 0 ) {
                log(LOG_LOGIC,"db: dump: Bad fd of first file in BigFile.");
                return true;
        }

        // debug test
        //char buf1[10*1024];
        //int32_t n1 = m_file->write ( buf1 , 10*1024 , 0 );
        //log("bytes written=%"INT32"\n",n1);

        // we're now considered to be in dumping state
        m_isDumping = true;

        // . if no tree or buckets were provided to dump, it must be
        //   RdbMerge calling us
        // . he'll want to call dumpList() on his own
        if ( ! m_tree && ! m_buckets ) return true;

        // how many recs are in the tree or buckets? initialize in case
        // neither is set, although the check above guarantees one is
        int32_t nr = 0;
        const char *structureName = "tree";
        if ( m_tree ) {
                nr = m_tree->getNumUsedNodes();
                structureName = "tree";
        }
        else if ( m_buckets ) {
                nr = m_buckets->getNumKeys();
                structureName = "buckets";
        }
        // debug msg
        log(LOG_INFO,"db: Dumping %"INT32" recs from %s to files.",
            nr , structureName );
        // nr , m_file->getFilename() );

        // keep a total count for reporting when done
        m_totalPosDumped = 0;
        m_totalNegDumped = 0;

        // we have our own flag here since m_dump::m_isDumping gets
        // set to true between collection dumps, and RdbMem.cpp needs
        // a flag that doesn't do that... see RdbDump.cpp.
        // this was in Rdb.cpp, but when threads were turned off it was
        // NEVER getting set and resulted in corruption in RdbMem.cpp.
        m_rdb->m_inDumpLoop = true;

        // . start dumping the tree
        // . return false if it blocked
        if ( ! dumpTree ( false ) ) return false;

        // no longer dumping
        doneDumping();

        // return true since we didn't block
        return true;
}
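// Nearly every routine in this file follows the "return false if blocked,
// true otherwise" convention seen in dumpTree()/dumpList() above: the same
// state machine is driven either by a direct call or re-entered from an
// i/o completion wrapper. A self-contained sketch of that idiom with
// hypothetical names, not the actual RdbDump members:
struct DumpLoopSketch {
        int step;

        // stub that pretends the write always completes inline; a real
        // version would return false while an async write is in flight
        bool startWrite ( ) { return true; }

        // returns false if blocked; wrapper() re-enters loop() later
        bool loop ( ) {
                while ( step < 3 ) {
                        if ( ! startWrite() ) return false; // blocked
                        step++; // write completed inline; keep going
                }
                return true; // finished without blocking
        }

        // registered as the i/o completion callback with state = this
        static void wrapper ( void *state ) {
                DumpLoopSketch *that = (DumpLoopSketch *)state;
                that->step++;                 // the blocked step finished
                if ( ! that->loop() ) return; // blocked again; wait
                // ... all done: invoke the caller's callback here ...
        }
};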