void cSystemIrd2::ProcessEMM(int pid, int caid, const unsigned char *data) { int prov=0; //XXX how to get provider here?? int len=SCT_LEN(data); unsigned char *emm=AUTOMEM(len); cPlainKey *pk; if(!(pk=keys.FindKey('I',caid,KEYSET(prov,TYPE_IV,0),16))) { PRINTF(L_SYS_EMM,"missing %04x %02x IV key",caid,prov); return; } unsigned char EMM_IV[16]; pk->Get(EMM_IV); for(int keyno=0; keyno<2; keyno++) { if(!(pk=keys.FindKey('I',caid,KEYSET(prov,TYPE_PMK,keyno),16))) { PRINTF(L_SYS_EMM,"missing %04x %02x MK%d key",caid,prov,keyno); continue; } unsigned char PMK[16]; pk->Get(PMK); if(!(pk=keys.FindKey('I',caid,KEYSET(prov,TYPE_SEED,1),16))) { PRINTF(L_SYS_EMM,"missing %04x %02x EMM key",caid,prov); return; } unsigned char EMM_Seed[16]; pk->Get(EMM_Seed); PrepareSeed(EMM_Seed,PMK); memcpy(emm,data,len); Decrypt(&emm[10],EMM_IV,EMM_Seed,len-10); NanoDecrypt(emm,16,len-8,PMK,EMM_IV); memmove(emm+6,emm+7,len-7); // removing padding byte if(CalculateHash(EMM_Seed,EMM_IV,emm+3,len-4)) { HEXDUMP(L_SYS_RAWEMM,emm,len,"Irdeto2 RAWEMM"); for(int i=15; i<len-9;) { int l=NANOLEN(emm[i+1]); switch(emm[i]) { case 0x10: case 0x50: if(l==0x13 && i+l<=len-9) { FoundKey(); if(keys.NewKey('I',caid,KEYSET(prov,TYPE_OP,emm[i+2]>>2),&emm[i+3],16)) NewKey(); } break; } i+=l; } break; }
bool cSystemIrd2::ProcessECM(const cEcmInfo *ecm, unsigned char *data) { int len=data[11]; if(len!=0x28 || SCT_LEN(data)<len+12) { if(doLog) PRINTF(L_SYS_ECM,"bad ECM length"); return false; } int prov=data[8]; cPlainKey *pk; unsigned char ECM_IV[16]; if(!(pk=keys.FindKey('I',ecm->caId,KEYSET(prov,TYPE_IV,0),16))) { if(doLog) PRINTF(L_SYS_KEY,"missing %04x %02x IV key",ecm->caId,prov); return false; } pk->Get(ECM_IV); unsigned char ECM_Seed[16]; if(!(pk=keys.FindKey('I',ecm->caId,KEYSET(prov,TYPE_SEED,0),16))) { if(doLog) PRINTF(L_SYS_KEY,"missing %04x %02x ECM key",ecm->caId,prov); return false; } pk->Get(ECM_Seed); cKeySnoop ks(this,'I',ecm->caId,KEYSET(prov,TYPE_OP,data[9])); unsigned char key[16]; if(!(pk=keys.FindKey('I',ecm->caId,KEYSET(prov,TYPE_OP,data[9]),16))) return false; pk->Get(key); PrepareSeed(ECM_Seed,key); data+=12; Decrypt(data,ECM_IV,ECM_Seed,len); int i=(data[0]&7)+1; NanoDecrypt(data,i,len-8,key,ECM_IV); if(CalculateHash(ECM_Seed,ECM_IV,data-6,len+6)) { HEXDUMP(L_SYS_RAWECM,data-12,len+12,"Irdeto2 RAWECM"); while(i<len-8) { int l=NANOLEN(data[i+1]); switch(data[i]) { case 0x78: memcpy(cw,&data[i+4],16); ks.OK(pk); return true; } i+=l; } } else { if(doLog) PRINTF(L_SYS_ECM,"hash failed"); } return false; }
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc. // . if list is stored locally this tries to get it locally // . otherwise tries to get the list from the network // . returns false if blocked, true otherwise // . sets g_errno on error // . NOTE: i was having problems with queries being cached too long, you // see the cache here is a NETWORK cache, so when the machines that owns // the list updates it on disk it can't flush our cache... so use a small // maxCacheAge of like , 30 seconds or so... bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none) int32_t ip , // info on hostId int16_t port , int32_t maxCacheAge , // max cached age in seconds bool addToCache , // add net recv'd list to cache? char rdbId , // specifies the rdb collnum_t collnum , RdbList *list , const char *startKey , const char *endKey , int32_t minRecSizes , // use -1 for no max void *state , void (* callback)(void *state ),//, RdbList *list ) , int32_t niceness , bool doErrorCorrection , bool includeTree , bool doMerge , int32_t firstHostId , int32_t startFileNum , int32_t numFiles , int64_t timeout , int64_t syncPoint , int32_t preferLocalReads , Msg5 *msg5 , bool isRealMerge , bool allowPageCache , bool forceLocalIndexdb , bool noSplit , int32_t forceParitySplit ) { logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId ); // warning if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0."); // reset the list they passed us list->reset(); // get keySize of rdb m_ks = getKeySizeFromRdbId ( rdbId ); // if( g_conf.m_logTraceMsg0 ) // { // log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId); // log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks); // log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId); // } // if startKey > endKey, don't read anything //if ( startKey > endKey ) return true; if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue // . 
reset hostid if it is dead // . this is causing UOR queries to take forever when we have a dead if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1; // no longer accept negative minrecsize if ( minRecSizes < 0 ) { g_errno = EBADENGINEER; logTrace( g_conf.m_logTraceMsg0, "END" ); log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported."); char *xx=NULL;*xx=0; } // remember these m_state = state; m_callback = callback; m_list = list; m_hostId = hostId; m_niceness = niceness; m_addToCache = addToCache; // . these define our request 100% KEYSET(m_startKey,startKey,m_ks); KEYSET(m_endKey,endKey,m_ks); m_minRecSizes = minRecSizes; m_rdbId = rdbId; m_collnum = collnum;// = coll; m_isRealMerge = isRealMerge; m_allowPageCache = allowPageCache; // . group to ask is based on the first key // . we only do 1 group per call right now // . groupMask must turn on higher bits first (count downwards kinda) // . titledb and spiderdb use special masks to get groupId // if diffbot.cpp is reading spiderdb from each shard we have to // get groupid from hostid here lest we core in getGroupId() below. // it does that for dumping spiderdb to the client browser. they // can download the whole enchilada. if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB ) m_shardNum = 0; // did they force it? core until i figure out what this is else if ( forceParitySplit >= 0 ) //m_groupId = g_hostdb.getGroupId ( forceParitySplit ); m_shardNum = forceParitySplit; else //m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit ); m_shardNum = getShardNum ( m_rdbId , startKey ); // if we are looking up a termlist in posdb that is split by termid and // not the usual docid then we have to set this posdb key bit that tells // us that ... if ( noSplit && m_rdbId == RDB_POSDB ) m_shardNum = g_hostdb.getShardNumByTermId ( startKey ); // how is this used? 
if ( forceLocalIndexdb ) m_shardNum = getMyShardNum(); // if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum); // . store these parameters // . get a handle to the rdb in case we can satisfy locally // . returns NULL and sets g_errno on error QUICKPOLL((m_niceness)); Rdb *rdb = getRdbFromId ( m_rdbId ); if ( ! rdb ) return true; // we need the fixedDataSize m_fixedDataSize = rdb->getFixedDataSize(); m_useHalfKeys = rdb->useHalfKeys(); // . debug msg // . Msg2 does this when checking for a cached compound list. // compound lists do not actually exist, they are merges of smaller // UOR'd lists. if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) { log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId ); } // set this here since we may not call msg5 if list not local //m_list->setFixedDataSize ( m_fixedDataSize ); // . now that we do load balancing we don't want to do a disk lookup // even if local if we are merging or dumping // . UNLESS g_conf.m_preferLocalReads is true if ( preferLocalReads == -1 ) preferLocalReads = g_conf.m_preferLocalReads; // . always prefer local for full split clusterdb // . and keep the tfndb/titledb lookups in the same stripe // . so basically we can't do biased caches if fully split //if ( g_conf.m_fullSplit ) preferLocalReads = true; preferLocalReads = true; // it it stored locally? bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId ); m_shardNum == getMyShardNum() ); // only do local lookups if this is true if ( ! preferLocalReads ) isLocal = false; /* int64_t singleDocIdQuery = 0LL; if ( rdbId == RDB_POSDB ) { int64_t d1 = g_posdb.getDocId(m_startKey); int64_t d2 = g_posdb.getDocId(m_endKey); if ( d1+1 == d2 ) singleDocIdQuery = d1; } // . try the LOCAL termlist cache // . 
so when msg2 is evaluating a gbdocid:| query and it has to // use msg0 to go across the network to get the same damn termlist // over and over again for the same docid, this will help alot. // . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to // send the same gbdocid:xxxx docids to the same hosts. maybe hash // based on docid into the list of hosts and if that host is busy // just chain until we find someone not busy. if ( singleDocIdQuery && getListFromTermListCache ( coll, m_startKey, m_endKey, maxCacheAge, list ) ) // found! return true; */ // but always local if only one host if ( g_hostdb.getNumHosts() == 1 ) isLocal = true; // . if the group is local then do it locally // . Msg5::getList() returns false if blocked, true otherwise // . Msg5::getList() sets g_errno on error // . don't do this if m_hostId was specified if ( isLocal ) { logTrace( g_conf.m_logTraceMsg0, "isLocal" ); if ( msg5 ) { m_msg5 = msg5; m_deleteMsg5 = false; } else { try { m_msg5 = new ( Msg5 ); } catch ( ... ) { g_errno = ENOMEM; log("net: Local alloc for disk read failed " "while tring to read data for %s. " "Trying remote request.", getDbnameFromId(m_rdbId)); goto skip; } mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" ); m_deleteMsg5 = true; } QUICKPOLL(m_niceness); if ( ! m_msg5->getList ( rdbId, m_collnum , m_list , m_startKey , m_endKey , m_minRecSizes , includeTree , // include Tree? addToCache , // addToCache? 
maxCacheAge , startFileNum , numFiles , this , gotListWrapper2 , niceness , doErrorCorrection , NULL , // cacheKeyPtr 0 , // retryNum -1 , // maxRetries true , // compensateForMerge syncPoint , m_isRealMerge , m_allowPageCache ) ) { logTrace( g_conf.m_logTraceMsg0, "END, return false" ); return false; } // nuke it reset(); logTrace( g_conf.m_logTraceMsg0, "END, return true" ); return true; } skip: // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG,"net: msg0: Sending request for data to " "shard=%" PRIu32" " "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" " //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")", "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")", //g_hostdb.makeHostId ( m_groupId ) , m_shardNum, (PTRTYPE)m_list, m_minRecSizes, g_posdb.getTermId(m_startKey) , //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness); KEY1(m_startKey,m_ks),KEY0(m_startKey), (int32_t)m_niceness); char *replyBuf = NULL; int32_t replyBufMaxSize = 0; bool freeReply = true; // . make a request with the info above (note: not in network order) // . IMPORTANT!!!!! if you change this change // Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!! 
// no, not anymore, we commented out that request peeking code char *p = m_request; *(int64_t *) p = syncPoint ; p += 8; //*(key_t *) p = m_startKey ; p += sizeof(key_t); //*(key_t *) p = m_endKey ; p += sizeof(key_t); *(int32_t *) p = m_minRecSizes ; p += 4; *(int32_t *) p = startFileNum ; p += 4; *(int32_t *) p = numFiles ; p += 4; *(int32_t *) p = maxCacheAge ; p += 4; if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; } *p = m_rdbId ; p++; *p = addToCache ; p++; *p = doErrorCorrection; p++; *p = includeTree ; p++; *p = (char)niceness ; p++; *p = (char)m_allowPageCache; p++; KEYSET(p,m_startKey,m_ks); ; p+=m_ks; KEYSET(p,m_endKey,m_ks); ; p+=m_ks; // NULL terminated collection name //strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0'; *(collnum_t *)p = m_collnum; p += sizeof(collnum_t); m_requestSize = p - m_request; // ask an individual host for this list if hostId is NOT -1 if ( m_hostId != -1 ) { // get Host Host *h = g_hostdb.getHost ( m_hostId ); if ( ! h ) { g_errno = EBADHOSTID; log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId); logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" ); return true; } // if niceness is 0, use the higher priority udpServer UdpServer *us ; uint16_t port; QUICKPOLL(m_niceness); us = &g_udpServer ; port = h->m_port ; // . returns false on error and sets g_errno, true otherwise // . calls callback when reply is received (or error) // . we return true if it returns false if ( ! us->sendRequest ( m_request , m_requestSize , 0x00 , // msgType h->m_ip , port , m_hostId , NULL , // the slotPtr this , gotSingleReplyWrapper , timeout , -1 , // backoff -1 , // maxwait replyBuf , replyBufMaxSize , m_niceness ) ) { // cback niceness logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" ); return true; } // return false cuz it blocked logTrace( g_conf.m_logTraceMsg0, "END, return false. 
sendRequest blocked" ); return false; } // timing debug if ( g_conf.m_logTimingNet ) m_startTime = gettimeofdayInMilliseconds(); else m_startTime = 0; // . get the top int32_t of the key // . i guess this will work for 128 bit keys... hmmmmm int32_t keyTop = hash32 ( (char *)startKey , m_ks ); // . otherwise, multicast to a host in group "groupId" // . returns false and sets g_errno on error // . calls callback on completion // . select first host to send to in group based on upper 32 bits // of termId (m_startKey.n1) // . need to send out to all the indexdb split hosts m_numRequests = 0; m_numReplies = 0; //for ( int32_t i = 0; i < m_numSplit; i++ ) { QUICKPOLL(m_niceness); //int32_t gr; char *buf; buf = replyBuf; // get the multicast Multicast *m = &m_mcast; if ( ! m->send ( m_request , m_requestSize, 0x00 , // msgType 0x00 false , // does multicast own request? m_shardNum , false , // send to whole group? //m_startKey.n1, // key is passed on startKey keyTop , // key is passed on startKey this , // state data NULL , // state data gotMulticastReplyWrapper0 , timeout*1000 , // timeout niceness , firstHostId , buf , replyBufMaxSize , freeReply , // free reply buf? true , // do disk load balancing? maxCacheAge , //(key_t *)cacheKey , // multicast uses it for determining the best // host to send the request to when doing // disk load balancing. if the host has our // data cached, then it will probably get to // handle the request. for now let's just assume // this is a 96-bit key. TODO: fix... 
0 , // *(key_t *)cacheKey , rdbId , minRecSizes ) ) { log(LOG_ERROR, "net: Failed to send request for data from %s in shard " "#%" PRIu32" over network: %s.", getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno)); // but speed it up m_errno = g_errno; m->reset(); if ( m_numRequests > 0 ) { logTrace( g_conf.m_logTraceMsg0, "END - returning false" ); return false; } logTrace( g_conf.m_logTraceMsg0, "END - returning true" ); return true; } m_numRequests++; // we blocked logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" ); return false; }
// . return false if blocked, true otherwise // . set g_errno on error // . read list of keys in [startKey,endKey] range // . read at least "minRecSizes" bytes of keys in that range // . the "m_endKey" of resulting, merged list may have a smaller endKey // than the argument, "endKey" due to limitation by "minRecSizes" // . resulting list will contain ALL keys between ITS [m_startKey,m_endKey] // . final merged list "should" try to have a size of at least "minRecSizes" // but due to negative/postive rec elimination may be less // . the endKey of the lists we read may be <= "endKey" provided // . we try to shrink the endKey if minRecSizes is >= 0 in order to // avoid excessive reading // . by shrinking the endKey we cannot take into account the size of deleted // records, so therefore we may fall short of "minRecSizes" in actuality, // in fact, the returned list may even be empty with a shrunken endKey // . we merge all lists read from disk into the provided "list" // . caller should call Msg3.getList(int32_t i) and Msg3:getNumLists() to retrieve // . this makes the query engine faster since we don't need to merge the docIds // and can just send them across the network separately and they will be // hashed into IndexTable's table w/o having to do time-wasting merging. // . caller can specify array of filenums to read from so incremental syncing // in Sync class can just read from titledb*.dat files that were formed // since the last sync point. bool Msg3::readList ( char rdbId , collnum_t collnum , const char *startKeyArg , const char *endKeyArg , int32_t minRecSizes , // max size of scan int32_t startFileNum , // first file to scan int32_t numFiles , // rel. 
to startFileNum void *state , // for callback void (* callback ) ( void *state ) , int32_t niceness , int32_t retryNum , int32_t maxRetries , bool compensateForMerge , bool justGetEndKey , bool allowPageCache , bool hitDisk ) { // set this to true to validate m_validateCache = false;//true; // clear, this MUST be done so if we return true g_errno is correct g_errno = 0; // assume lists are not checked for corruption m_listsChecked = false; // warn if ( minRecSizes < -1 ) { log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing " "to -1.",minRecSizes); minRecSizes = -1; } // reset m_alloc and data in all lists in case we are a re-call reset(); // warning if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3."); // remember the callback m_rdbId = rdbId; m_collnum = collnum; m_callback = callback; m_state = state; m_niceness = niceness; m_numScansCompleted = 0; m_retryNum = retryNum; m_maxRetries = maxRetries; m_compensateForMerge = compensateForMerge; m_allowPageCache = allowPageCache; m_hitDisk = hitDisk; m_hadCorruption = false; // get keySize of rdb m_ks = getKeySizeFromRdbId ( m_rdbId ); // reset the group error m_errno = 0; // . reset all our lists // . these are reset in call the RdbScan::setRead() below //for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset(); // . ensure startKey last bit clear, endKey last bit set // . no! this warning is now only in Msg5 // . if RdbMerge is merging some files, not involving the root // file, then we can expect to get a lot of unmatched negative recs. // . as a consequence, our endKeys may often be negative. This means // it may not annihilate with the positive key, but we should only // miss like this at the boundaries of the lists we fetch. // . 
so in that case RdbList::merge will stop merging once the // minRecSizes limit is reached even if it means ending on a negative // rec key //if ( (startKey.n0 & 0x01) == 0x01 ) if ( !KEYNEG(startKeyArg) ) log(LOG_REMIND,"net: msg3: StartKey lastbit set."); if ( KEYNEG(endKeyArg) ) log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); // declare vars here becaues of 'goto skip' below int32_t mergeFileNum = -1 ; int32_t max ; // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base = getRdbBase( m_rdbId, m_collnum ); if ( ! base ) { return true; } // store the file numbers in the array, these are the files we read m_numFileNums = 0; // save startFileNum here, just for recall m_startFileNum = startFileNum; m_numFiles = numFiles; // . if we have a merge going on, we may have to change startFileNum // . if some files get unlinked because merge completes then our // reads will detect the error and loop back here // . we launch are reads right after this without giving up the cpu // and we use file descriptors, so any changes to Rdb::m_files[] // should not hurt us // . WARNING: just make sure you don't lose control of cpu until after // you call RdbScan::set() // . we use hasMergeFile() instead of isMerging() because he may not // be merging cuz he got suspended or he restarted and // hasn't called attemptMerge() yet, but he may still contain it if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG, "net: msg3: " "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.", (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(), (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1, (int32_t)numFiles,base->m_dbname); int32_t pre = -10; if ( compensateForMerge && base->hasMergeFile() && startFileNum >= base->m_mergeStartFileNum - 1 && (startFileNum > 0 || numFiles != -1) ) { // now also include the file being merged into, but only // if we are reading from a file being merged... 
if ( startFileNum < base->m_mergeStartFileNum + base->m_numFilesToMerge - 1 ) //m_fileNums [ m_numFileNums++ ] = // base->m_mergeStartFileNum - 1; pre = base->m_mergeStartFileNum - 1; // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG, "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")", startFileNum,startFileNum+1,mergeFileNum); // if merge file was inserted before us, inc our file number startFileNum++; } // adjust num files if we need to, as well if ( compensateForMerge && base->hasMergeFile() && startFileNum < base->m_mergeStartFileNum - 1 && numFiles != -1 && startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) { // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG,"net: msg3: numFiles up one."); // if merge file was inserted before us, inc our file number numFiles++; } // . how many rdb files does this base have? // . IMPORTANT: this can change since files are unstable because they // might have all got merged into one! // . so do this check to make sure we're safe... especially if // there was an error before and we called readList() on ourselves max = base->getNumFiles(); // -1 means we should scan ALL the files in the base if ( numFiles == -1 ) numFiles = max; // limit it by startFileNum, however if ( numFiles > max - startFileNum ) numFiles = max - startFileNum; // set g_errno and return true if it is < 0 if ( numFiles < 0 ) { log(LOG_LOGIC, "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")", numFiles , max , startFileNum ); g_errno = EBADENGINEER; // force core dump char *xx=NULL;*xx=0; return true; } // . allocate buffer space // . 
m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets, // m_fileNums, m_lists int32_t chunk = sizeof(RdbScan) + // m_scans 4 + // m_startpg 4 + // m_endpg //sizeof(key_t) + // m_hintKeys m_ks + // m_hintKeys 4 + // m_hintOffsets 4 + // m_fileNums sizeof(RdbList) ; // m_lists int32_t nn = numFiles; if ( pre != -10 ) nn++; m_numChunks = nn; int32_t need = nn * (chunk); m_alloc = m_buf; if ( need > (int32_t)MSG3_BUF_SIZE ) { m_allocSize = need; m_alloc = (char *)mcalloc ( need , "Msg3" ); if ( ! m_alloc ) { log("disk: Could not allocate %" PRId32" bytes read " "structures to read %s.",need,base->m_dbname); return true; } } char *p = m_alloc; m_scans = (RdbScan *)p; p += nn * sizeof(RdbScan); m_startpg = (int32_t *)p; p += nn * 4; m_endpg = (int32_t *)p; p += nn * 4; //m_hintKeys = (key_t *)p; p += nn * sizeof(key_t); m_hintKeys = (char *)p; p += nn * m_ks; m_hintOffsets = (int32_t *)p; p += nn * 4; m_fileNums = (int32_t *)p; p += nn * 4; m_lists = (RdbList *)p; p += nn * sizeof(RdbList); // sanity check if ( p - m_alloc != need ) { log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp."); char *xx = NULL; *xx = 0; } // call constructors for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor(); // make fix from up top if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre; // store them all for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ ) m_fileNums [ m_numFileNums++ ] = i; // . remove file nums that are being unlinked after a merge now // . keep it here (below skip: label) so sync point reads can use it int32_t n = 0; for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { // skip those that are being unlinked after the merge if ( base->m_isUnlinking && m_fileNums[i] >= base->m_mergeStartFileNum && m_fileNums[i] < base->m_mergeStartFileNum + base->m_numFilesToMerge ) continue; // otherwise, keep it m_fileNums[n++] = m_fileNums[i]; } m_numFileNums = n; // . if root file is being merged, he's file #0, & root file is file #1 // . 
this is a hack so caller gets what he wants //if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 ) // numFiles = 2; // remember the file range we should scan m_numScansStarted = 0; m_numScansCompleted = 0; //m_startKey = startKey; //m_endKey = endKey; //m_constrainKey = endKey; // set in case justGetEndKey is true KEYSET(m_startKey,startKeyArg,m_ks); KEYSET(m_endKey,endKeyArg,m_ks); KEYSET(m_constrainKey,endKeyArg,m_ks);//set incase justGetEndKey istrue m_minRecSizes = minRecSizes; m_compensateForMerge = compensateForMerge; // bail if 0 files to scan -- no! need to set startKey/endKey if ( numFiles == 0 ) return true; // don't read anything if endKey < startKey //if ( m_startKey > m_endKey ) return true; if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true; // keep the original in tact in case g_errno == ETRYAGAIN //m_endKeyOrig = endKey; KEYSET(m_endKeyOrig,endKeyArg,m_ks); m_minRecSizesOrig = minRecSizes; // start reading at this key m_fileStartKey = startKeyArg; // start the timer, keep it fast for clusterdb though if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds(); // translate base to an id, for the sake of m_msg0 //char baseId = m_msg0->getRdbId ( base ); // map ptrs RdbMap **maps = base->getMaps(); // . we now boost m_minRecSizes to account for negative recs // . but not if only reading one list, cuz it won't get merged and // it will be too big to send back if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base ); // . often endKey is too big for an efficient read of minRecSizes bytes // because we end up reading too much from all the files // . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile // to ensure we read "minRecSizes" worth of records, not much more // . returns the new endKey for all ranges // . 
now this just overwrites m_endKey //m_endKey = setPageRanges ( base , setPageRanges ( base , m_fileNums , m_numFileNums , m_fileStartKey , // start reading @ key m_endKey , // stop reading @ key m_minRecSizes ); // . NEVER let m_endKey be a negative key, because it will // always be unmatched, since delbit is cleared // . adjusting it here ensures our generated hints are valid // . we will use this key to call constrain() with //m_constrainKey = m_endKey; //if ( ( m_constrainKey.n0 & 0x01) == 0x00 ) // m_constrainKey -= (uint32_t)1; KEYSET(m_constrainKey,m_endKey,m_ks); if ( KEYNEG(m_constrainKey) ) KEYSUB(m_constrainKey,m_ks); // Msg5 likes to get the endkey for getting the list from the tree if ( justGetEndKey ) return true; // sanity check if ( m_numFileNums > nn ) { log(LOG_LOGIC,"disk: Failed sanity check in Msg3."); char *xx = NULL; *xx = 0; } // debug msg //log("msg3 getting list (msg5=%" PRIu32")",m_state); // . MDW removed this -- go ahead an end on a delete key // . RdbMerge might not pick it up this round, but oh well // . so we can have both positive and negative co-existing in same file // make sure the last bit is set so we don't end on a delete key //m_endKey.n0 |= 0x01LL; // . now start reading/scanning the files // . our m_scans array starts at 0 for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { // get the page range //int32_t p1 = m_startpg [ i ]; //int32_t p2 = m_endpg [ i ]; //#ifdef GBSANITYCHECK int32_t fn = m_fileNums[i]; // this can happen somehow! if ( fn < 0 ) { log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn); continue; } // sanity check if ( i > 0 && m_fileNums[i-1] >= fn ) { log(LOG_LOGIC, "net: msg3: files must be read in order " "from oldest to newest so RdbList::indexMerge_r " "works properly. Otherwise, corruption will " "result. "); char *xx = NULL; *xx = 0; return true; } // . sanity check? // . 
no, we must get again since we turn on endKey's last bit int32_t p1 , p2; maps[fn]->getPageRange ( m_fileStartKey , m_endKey , &p1 , &p2 , NULL ); //if ( p1 != p1c || p2 != p2c ) { // fprintf(stderr,"Msg3::bad page range\n"); // sleep(50000); //} // sanity check, each endpg's key should be > endKey //if ( p2 < maps[fn]->getNumPages() && // maps[fn]->getKey ( p2 ) <= m_endKey ) { // fprintf(stderr,"Msg3::bad page range 2\n"); // sleep(50000); //} //#endif //int32_t p1 , p2; //maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2); // now get some read info int64_t offset = maps[fn]->getAbsoluteOffset ( p1 ); int32_t bytesToRead = maps[fn]->getRecSizes ( p1, p2, false); // max out the endkey for this list // debug msg //#ifdef _DEBUG_ //if ( minRecSizes == 2000000 ) //log("Msg3:: reading %" PRId32" bytes from file #%" PRId32,bytesToRead,i); //#endif // inc our m_numScans m_numScansStarted++; // . keep stats on our disk accesses // . count disk seeks (assuming no fragmentation) // . count disk bytes read if ( bytesToRead > 0 ) { base->m_rdb->didSeek ( ); base->m_rdb->didRead ( bytesToRead ); } // . the startKey may be different for each RdbScan class // . RdbLists must have all keys within their [startKey,endKey] // . therefore set startKey individually from first page in map // . this endKey must be >= m_endKey // . this startKey must be < m_startKey //key_t startKey = maps[fn]->getKey ( p1 ); //key_t endKey = maps[fn]->getKey ( p2 ); char startKey2 [ MAX_KEY_BYTES ]; char endKey2 [ MAX_KEY_BYTES ]; maps[fn]->getKey ( p1 , startKey2 ); maps[fn]->getKey ( p2 , endKey2 ); //char *startKey = maps[fn]->getKeyPtr ( p1 ); //char *endKey = maps[fn]->getKeyPtr ( p2 ); // store in here m_startpg [ i ] = p1; m_endpg [ i ] = p2; // . we read UP TO that endKey, so reduce by 1 // . but iff p2 is NOT the last page in the map/file // . 
maps[fn]->getKey(lastPage) will return the LAST KEY // and maps[fn]->getOffset(lastPage) the length of the file //if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1; if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks); // otherwise, if we're reading all pages, then force the // endKey to virtual inifinite //else endKey.setMax(); else KEYMAX(endKey2,m_ks); // . set up the hints // . these are only used if we are only reading from 1 file // . these are used to call constrain() so we can constrain // the end of the list w/o looping through all the recs // in the list int32_t h2 = p2 ; // decrease by one page if we're on the last page if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--; // . decrease hint page until key is <= endKey on that page // AND offset is NOT -1 because the old way would give // us hints passed the endkey // . also decrease so we can constrain on minRecSizes in // case we're the only list being read // . use >= m_minRecSizes instead of >, otherwise we may // never be able to set "size" in RdbList::constrain() // because "p" could equal "maxPtr" right away while ( h2 > p1 && //( maps[fn]->getKey (h2) > m_constrainKey || (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0|| maps[fn]->getOffset(h2) == -1 || maps[fn]->getAbsoluteOffset(h2) - offset >= m_minRecSizes ) ) h2--; // now set the hint m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) - maps[fn]->getAbsoluteOffset ( p1 ) ; //m_hintKeys [ i ] = maps[fn]->getKey ( h2 ); KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks); // reset g_errno before calling setRead() g_errno = 0; // . this fix is now in RdbList::checklist_r() // . we can now have dup keys, so, we may read in // a rec with key "lastMinKey" even though we don't read // in the first key on the end page, so don't subtract 1... 
//if ( endKey != m_endKeyOrig ) // endKey += (uint32_t) 1; // timing debug if ( g_conf.m_logTimingDb ) log(LOG_TIMING, "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" " "(niceness=%" PRId32")", bytesToRead,base->m_dbname,i,m_niceness); // log huge reads, those hurt us if ( bytesToRead > 150000000 ) { logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" " "from %s.", bytesToRead,offset,base->m_dbname); } // if any keys in the map are the same report corruption char tmpKey [16]; char lastTmpKey[16]; int32_t ccount = 0; if ( bytesToRead > 10000000 && bytesToRead / 2 > m_minRecSizes && base->m_fixedDataSize >= 0 ) { for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) { maps[fn]->getKey ( pn , tmpKey ); if ( KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) ccount++; gbmemcpy(lastTmpKey,tmpKey,m_ks); } } if ( ccount > 10 ) { logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #" "%" PRId32" when min " "required is %" PRId32". Map is corrupt and has %" PRId32" " "identical consecutive page keys because the " "map was \"repaired\" because out of order keys " "in the index.", (int32_t)bytesToRead, base->m_dbname,fn, (int32_t)m_minRecSizes, (int32_t)ccount); m_numScansCompleted++; m_errno = ECORRUPTDATA; m_hadCorruption = true; //m_maxRetries = 0; break; } //////// // // try to get from PAGE CACHE // //////// BigFile *ff = base->getFile(m_fileNums[i]); RdbCache *rpc = getDiskPageCache ( m_rdbId ); if ( ! m_allowPageCache ) rpc = NULL; // . vfd is unique 64 bit file id // . if file is opened vfd is -1, only set in call to open() int64_t vfd = ff->getVfd(); key192_t ck = makeCacheKey ( vfd , offset, bytesToRead); char *rec; int32_t recSize; bool inCache = false; if ( rpc && vfd != -1 && ! m_validateCache ) inCache = rpc->getRecord ( (collnum_t)0 , // collnum (char *)&ck , &rec , &recSize , true , // copy? -1 , // maxAge, none true ); // inccounts? 
m_scans[i].m_inPageCache = false; if ( inCache ) { m_scans[i].m_inPageCache = true; m_numScansCompleted++; // now we have to store this value, 6 or 12 so // we can modify the hint appropriately m_scans[i].m_shifted = *rec; m_lists[i].set ( rec +1, recSize-1 , rec , // alloc recSize , // allocSize startKey2 , endKey2 , base->m_fixedDataSize , true , // owndata base->useHalfKeys() , getKeySizeFromRdbId ( m_rdbId ) ); continue; } // . do the scan/read of file #i // . this returns false if blocked, true otherwise // . this will set g_errno on error bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]), base->m_fixedDataSize , offset , bytesToRead , startKey2 , endKey2 , m_ks , &m_lists[i] , this , doneScanningWrapper , base->useHalfKeys() , m_rdbId, m_niceness , m_allowPageCache , m_hitDisk ) ; // . damn, usually the above will indirectly launch a thread // to do the reading, but it sets g_errno to EINTR, // "interrupted system call"! // . i guess the thread does the read w/o blocking and then // queues the signal on g_loop's queue before it exits // . try ignoring, and keep going if ( g_errno == EINTR ) { log("net: Interrupted system call while reading file. 
" "Ignoring."); g_errno = 0; } // debug msg //fprintf(stderr,"Msg3:: reading %" PRId32" bytes from file #%" PRId32"," // "done=%" PRId32",offset=%" PRId64",g_errno=%s," // "startKey=n1=%" PRIu32",n0=%" PRIu64", " // "endKey=n1=%" PRIu32",n0=%" PRIu64"\n", // bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno), // m_startKey,m_endKey); //if ( bytesToRead == 0 ) // fprintf(stderr,"shit\n"); // if it did not block then it completed, so count it if ( done ) m_numScansCompleted++; // break on an error, and remember g_errno in case we block if ( g_errno && g_errno != ENOTHREADSLOTS ) { int32_t tt = LOG_WARN; if ( g_errno == EFILECLOSED ) tt = LOG_INFO; log(tt,"disk: Reading %s had error: %s.", base->m_dbname, mstrerror(g_errno)); m_errno = g_errno; break; } } // debug test //if ( rand() % 100 <= 10 ) m_errno = EIO; // if we blocked, return false if ( m_numScansCompleted < m_numScansStarted ) return false; // . if all scans completed without blocking then wrap it up & ret true // . doneScanning may now block if it finds data corruption and must // get the list remotely return doneScanning(); }
// . returns a new, smaller endKey // . shrinks endKey while still preserving the minRecSizes requirement // . this is the most confusing subroutine in the project // . this now OVERWRITES endKey with the new one //key_t Msg3::setPageRanges ( RdbBase *base , void Msg3::setPageRanges ( RdbBase *base , int32_t *fileNums , int32_t numFileNums , const char *startKey , char *endKey , int32_t minRecSizes ) { // sanity check //if ( m_ks != 12 && m_ks != 16 ) { char *xx=NULL;*xx=0; } // get the file maps from the rdb RdbMap **maps = base->getMaps(); // . initialize the startpg/endpg for each file // . we read from the first offset on m_startpg to offset on m_endpg // . since we set them equal that means an empty range for each file for ( int32_t i = 0 ; i < numFileNums ; i++ ) { int32_t fn = fileNums[i]; if ( fn < 0 ) { char *xx = NULL; *xx = 0; } m_startpg[i] = maps[fn]->getPage( startKey ); m_endpg [i] = m_startpg[i]; } // just return if minRecSizes 0 (no reading needed) //if ( minRecSizes <= 0 ) return endKey ; if ( minRecSizes <= 0 ) return; // calculate minKey minus one //key_t lastMinKey ; char lastMinKey[MAX_KEY_BYTES]; char lastMinKeyIsValid = 0; // loop until we find the page ranges that barely satisfy "minRecSizes" loop: // find the map whose next page has the lowest key int32_t minpg = -1; //key_t minKey; char minKey[MAX_KEY_BYTES]; for ( int32_t i = 0 ; i < numFileNums ; i++ ) { int32_t fn = fileNums[i]; // this guy is out of race if his end key > "endKey" already //if ( maps[fn]->getKey ( m_endpg[i] ) > endKey ) continue; if(KEYCMP(maps[fn]->getKeyPtr(m_endpg[i]),endKey,m_ks)>0) continue; // get the next page after m_endpg[i] int32_t nextpg = m_endpg[i] + 1; // if endpg[i]+1 == m_numPages then we maxed out this range if ( nextpg > maps[fn]->getNumPages() ) continue; // . but this may have an offset of -1 // . 
which means the page has no key starting on it and // it's occupied by a rec which starts on a previous page while ( nextpg < maps[fn]->getNumPages() && maps[fn]->getOffset ( nextpg ) == -1 ) nextpg++; // . continue if his next page doesn't have the minimum key // . if nextpg == getNumPages() then it returns the LAST KEY // contained in the corresponding RdbFile //if ( minpg != -1 && maps[fn]->getKey ( nextpg ) > minKey ) if (minpg != -1 && KEYCMP(maps[fn]->getKeyPtr(nextpg),minKey,m_ks)>0)continue; // . we got a winner, his next page has the current min key // . if m_endpg[i]+1 == getNumPages() then getKey() returns the // last key in the mapped file // . minKey should never equal the key on m_endpg[i] UNLESS // it's on page #m_numPages //minKey = maps[fn]->getKey ( nextpg ); KEYSET(minKey,maps[fn]->getKeyPtr(nextpg),m_ks); minpg = i; // if minKey is same as the current key on this endpg, inc it // so we cause some advancement, otherwise, we'll loop forever //if ( minKey != maps[fn]->getKey ( m_endpg[i] ) ) continue; if ( KEYCMP(minKey,maps[fn]->getKeyPtr(m_endpg[i]),m_ks)!=0) continue; //minKey += (uint32_t) 1; KEYADD(minKey,m_ks); } // . we're done if we hit the end of all maps in the race // . return the max end key // key_t maxEndKey; maxEndKey.setMax(); return maxEndKey; } // . no, just the endKey //if ( minpg == -1 ) return endKey; if ( minpg == -1 ) return; // sanity check if ( lastMinKeyIsValid && KEYCMP(minKey,lastMinKey,m_ks)<=0 ) { g_errno = ECORRUPTDATA; log("db: Got corrupted map in memory for %s. This is almost " "always because of bad memory. Please replace your RAM.", base->m_dbname); // do not wait for any merge to complete... otherwise // Rdb.cpp will not close until the merge is done g_merge.m_isMerging = false; g_merge2.m_isMerging = false; // to complete // shutdown with urgent=true so threads are disabled. 
g_process.shutdown(true); //g_numCorrupt++; // sleep for now until we make sure this works //sleep(2000); return; } // don't let minKey exceed endKey, however //if ( minKey > endKey ) { if ( KEYCMP(minKey,endKey,m_ks)>0 ) { //minKey = endKey ; //minKey += (uint32_t) 1; //lastMinKey = endKey; KEYSET(minKey,endKey,m_ks); KEYADD(minKey,m_ks); KEYSET(lastMinKey,endKey,m_ks); } else { //lastMinKey = minKey ; //lastMinKey -= (uint32_t) 1; KEYSET(lastMinKey,minKey,m_ks); KEYSUB(lastMinKey,m_ks); } // it is now valid lastMinKeyIsValid = 1; // . advance m_endpg[i] so that next page < minKey // . we want to read UP TO the first key on m_endpg[i] for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { int32_t fn = fileNums[i]; m_endpg[i] = maps[fn]->getEndPage ( m_endpg[i], lastMinKey ); } // . if the minKey is BIGGER than the provided endKey we're done // . we don't necessarily include records whose key is "minKey" //if ( minKey > endKey ) return endKey; if ( KEYCMP(minKey,endKey,m_ks)>0) return; // . calculate recSizes per page within [startKey,minKey-1] // . compute bytes of records in [startKey,minKey-1] for each map // . this includes negative records so we may have annihilations // when merging into "diskList" and get less than what we wanted // but endKey should be shortened, so our caller will know to call // again if he wants more int32_t recSizes = 0; for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { int32_t fn = fileNums[i]; recSizes += maps[fn]->getMinRecSizes ( m_startpg[i] , m_endpg [i] , startKey , lastMinKey , false ); } // if we hit it then return minKey -1 so we only read UP TO "minKey" // not including "minKey" //if ( recSizes >= minRecSizes ) if ( recSizes >= minRecSizes ) { // . sanity check // . this sanity check fails sometimes, but leave it // out for now... 
causes the Illegal endkey msgs in // RdbList::indexMerge_r() //if ( KEYNEG(lastMinKey) ) { char *xx=NULL;*xx=0; } KEYSET(endKey,lastMinKey,m_ks); //return lastMinKey; return; } // keep on truckin' goto loop; }
bool RdbMerge::getAnotherList ( ) { log(LOG_DEBUG,"db: Getting another list for merge."); // clear it up in case it was already set g_errno = 0; // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true; // if merging titledb files, we must adjust m_endKey so we do // not have to read a huge 200MB+ tfndb list //key_t newEndKey = m_endKey; char newEndKey[MAX_KEY_BYTES]; KEYSET(newEndKey,m_endKey,m_ks); //CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); //char *coll = cr->m_coll; /* if ( m_rdbId == RDB_TITLEDB ) { // && m_rdbId == RDB_TFNDB ) { //long long docId1 = g_titledb.getDocIdFromKey ( m_startKey ); long long docId1=g_titledb.getDocIdFromKey((key_t *)m_startKey); //long long docId2 = g_titledb.getDocIdFromKey ( m_endKey ); // tfndb is pretty much uniformly distributed RdbBase *ubase = getRdbBase(RDB_TFNDB,m_coll); if ( ! ubase ) return true; long long space = ubase->getDiskSpaceUsed(); //long long readSize = (space * (docId2-docId1)) / DOCID_MASK; long long bufSize = g_conf.m_mergeBufSize; // for now force to 100k bufSize = 100000; if ( bufSize > space ) bufSize = space; long long docId3 = (long long) (((double)bufSize / (double)space) * (double)DOCID_MASK + docId1); // constrain newEndKey based on docId3 if ( docId3 < 0 ) docId3 = DOCID_MASK; //if ( docId3 >= DOCID_MASK ) newEndKey.setMax(); if ( docId3 >= DOCID_MASK ) KEYMAX(newEndKey,m_ks); //else newEndKey = g_titledb.makeLastKey ( docId3 ); else { key_t nk = g_titledb.makeLastKey(docId3); KEYSET(newEndKey,(char *)&nk,m_ks); } //log(LOG_DEBUG,"build: remapping endkey from %lx.%llx to " // "%lx.%llx to avoid big tfndb read.", // m_endKey.n1,m_endKey.n0, newEndKey.n1,newEndKey.n0); log(LOG_DEBUG,"build: remapping endkey from %llx.%llx to " "%llx.%llx to avoid big tfndb read.", KEY1(m_endKey,m_ks),KEY0(m_endKey), KEY1(newEndKey,m_ks),KEY0(newEndKey)); } */ // . this returns false if blocked, true otherwise // . 
sets g_errno on error // . we return false if it blocked // . m_maxBufSize may be exceeded by a rec, it's just a target size // . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0 // . this was a call to Msg3, but i made it call Msg5 since // we now do the merging in Msg5, not in msg3 anymore // . this will now handle truncation, dup and neg rec removal // . it remembers last termId and count so it can truncate even when // IndexList is split between successive reads // . IMPORTANT: when merging titledb we could be merging about 255 // files, so if we are limited to only X fds it can have a cascade // affect where reading from one file closes the fd of another file // in the read (since we call open before spawning the read thread) // and can therefore take 255 retries for the Msg3 to complete // because each read gives a EFILCLOSED error. // so to fix it we allow one retry for each file in the read plus // the original retry of 25 long nn = base->getNumFiles(); if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles; // don't access any biased page caches bool usePageCache = true; if ( m_rdbId == RDB_CLUSTERDB ) usePageCache = false; // . i don't trust page cache too much (mdw)... well, give it a shot // . see if ths helps fix WD corruption... i doubt it usePageCache = false; // for now force to 100k long bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes // get it return m_msg5.getList ( m_rdbId , m_collnum , &m_list , m_startKey , newEndKey , // usually is maxed! bufSize , false , // includeTree? false , // add to cache? 0 , // max cache age for lookup m_startFileNum , // startFileNum m_numFiles , this , // state gotListWrapper , // callback m_niceness , // niceness true , // do error correction? NULL , // cache key ptr 0 , // retry # nn + 75 , // max retries (mk it high) false , // compensate for merge? -1LL , // sync point &m_msg5b , true , // isRealMerge? absolutely! usePageCache ); }
* * Pwr btn is active high, whereas vol up and down are active low, thus we need * to mask the input we read for active low buttons to ensure we interpret them * right. * */ #define KEYSET(pwr, vup, vdn) ((pwr) | (vup) | (vdn)) enum { VOL_DOWN_SHIFT, VOL_UP_SHIFT, PWR_BTN_SHIFT, VOL_DOWN = (1 << VOL_DOWN_SHIFT), VOL_UP = (1 << VOL_UP_SHIFT), PWR_BTN = (1 << PWR_BTN_SHIFT), NO_BTN_PRESSED = KEYSET(0,0,0), }; /* * Key definitions are as follows: * Buttons: * Pwr = Power button * Vup = Volume Up button * Vdn = Volume Down button * * Ctrl - D = Pwr + Vup * Ctrl - U = Pwr + Vdn * CR = Vup * Space = Vdn * Ctrl - L = Pwr -> Vup * Tab = Pwr -> Vdn
// . fetch the RdbList for [startKey,endKey] of rdb "rdbId", either from
//   local disk (via Msg5) or from another host over the network
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "callback" is stored in m_callback and "state" in m_state for use when
//   a blocked request completes
// . "ip", "port" and "doMerge" are not referenced by this function body
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     //char     *coll        ,
		     collnum_t collnum ,
		     RdbList  *list        ,
		     //key_t     startKey    ,
		     //key_t     endKey      ,
		     char     *startKey    ,
		     char     *endKey      ,
		     int32_t      minRecSizes ,  // must be >= 0, negative cores below
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int32_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     int32_t      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
	// show a "termlist" for a given query term in its entirety so you
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit )
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning (only logged, we keep going)
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey the caller has a bug: core on purpose
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 )
	//	log("Msg0::getList: warning startKey lastbit set");
	//if ( (endKey.n0   & 0x01) == 0x00 )
	//	log("Msg0::getList: warning endKey lastbit clear");
	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;// = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	// (takes precedence over every shard choice made above)
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	// . NOTE: this unconditional assignment overrides both the caller's
	//   value and the g_conf default picked up just above
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 &&
			 //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists()
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		// use the caller's Msg5 if given, otherwise allocate our own
		// and remember (m_deleteMsg5) that we did
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				// fall back to the network path below
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b
		// (it is only recorded here; NULL is what gets passed to
		//  Msg5::getList() below)
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		// a false return means the read blocked; gotListWrapper2 is
		// the callback handed to Msg5 with "this" as its state
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  ,
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%"UINT32" "
		    "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" "
		    //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")",
		    "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	// ("realtime" is never set to true in this function, so the
	//  pre-allocation block below does not run and replyBuf stays NULL)
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//   not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes
		// . ?Msg8? was having trouble. was int16_t 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
		/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
		*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %"INT32" bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . layout as written below: syncPoint(8) minRecSizes(4)
	//   startFileNum(4) numFiles(4) maxCacheAge(4), then six single
	//   bytes starting at RDBIDOFFSET, then startKey and endKey (m_ks
	//   bytes each), then the collnum
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	// the rdbId byte must land exactly at RDBIDOFFSET, else core
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		// (that selection is commented out: g_udpServer is always
		//  used now; note this local "port" shadows the parameter)
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) {
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else {
		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%"UINT64", "
	//			"groupNum=%"UINT32"",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	// . what we actually pass on is a 32-bit hash of all m_ks key bytes
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		int32_t  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( int32_t i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

	if ( ! m->send ( m_request    ,
//#else
//        if ( ! m_mcast.send ( m_request    ,
//#endif
			      m_requestSize,
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			      m_shardNum ,
//#ifdef SPLIT_INDEXDB
//			      gr           , // group + offset
//#else
//			      m_groupId    , // group to send to (groupKey)
//#endif
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout      , // timeout in seconds (was 30)
			      niceness     ,
			      realtime     ,
			      firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			      &replyBuf[i*replyBufMaxSize] ,
//#else
//			      replyBuf        ,
//#endif
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing
			      // disk load balancing. if the host has our
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			      0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%"UINT32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only bumped after a
		// successful send, so it is 0 when we get here
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;
//#endif
	// we blocked
	return false;
}
// . dumps the records held in m_tree (or m_buckets) to m_file, one list
//   (of up to m_maxBufSize bytes of recs) at a time
// . returns false if blocked, true otherwise (finished, or gave up on error)
// . sets g_errno on error
// . "recall": when true the first pass through the loop skips the getList()
//   call and goes straight to processing whatever m_list already holds
// . blocking happens in two places: when we register the retry sleep
//   callback after a failed getList(), and when dumpList() blocks
// . notes from the original author describing work that is NOT visible in
//   this function (presumably done by dumpList() and its callbacks):
//   . also sets and writes the RdbMap for m_file
//   . we delete all records in list from the tree after we've written the
//     list
//   . if a cache was provided we do NOT incorporate the list into it
//   . called again by writeBuf() when it's done writing the whole list
// . dumped recs are ordered by key; set() refuses non-ordered dumps
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t  nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	// upper bound of the key range we read; defaults to the max key
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	// NOTE: the comment above says 30 seconds but the code below actually
	// holds back the last 60 seconds.
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		// reinterpret the end key as a StatKey so we can cap its time
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	// NOTE: this local niceness (not m_niceness) is only handed to
	// m_tree->getList(); the m_buckets->getList() call does not take it
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver     ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . stays true if we skip the getList() call (recall) or if neither
	//   m_tree nor m_buckets is set
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list ,
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{
		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;

		// on a recall, do not fetch a new list; process m_list as is
		if ( recall ) goto skip;
		// debug msg
		//log("RdbDump:: getting list");
		// remember when we started so we can log how long it took
		m_t1 = gettimeofdayInMilliseconds();
		// read recs in [m_nextKey,maxEndKey] from whichever in-memory
		// structure we were given; also fills in the pos/neg rec counts
		if(m_tree)
			status = m_tree->getList ( m_collnum       ,
						   m_nextKey       ,
						   maxEndKey       ,
						   m_maxBufSize    , // max recSizes
						   m_list          ,
						   &m_numPosRecs   ,
						   &m_numNegRecs   ,
						   m_useHalfKeys   ,
						   niceness );
		else if(m_buckets)
			status = m_buckets->getList ( m_collnum,
						      m_nextKey     ,
						      maxEndKey     ,
						      m_maxBufSize  , // max recSizes
						      m_list        ,
						      &m_numPosRecs   ,
						      &m_numNegRecs   ,
						      m_useHalfKeys );
		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();

		// if(!m_list->checkList_r ( false , // removeNegRecs?
		// 			 false , // sleep on problem?
		// 			 m_rdb->m_rdbId )) {
		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
		// }
	skip:
		// declared without initializers so the "goto skip" above may
		// legally jump past them
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		// NOTE: on a recall m_t1 was not refreshed above, so the
		// elapsed time logged here is measured from an earlier call
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			// NOTE(review): m_rdb is NULL-checked here but is
			// dereferenced unconditionally just below and
			// elsewhere in this function -- confirm it can never
			// be NULL when dumping a tree/buckets
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			// the return value is ignored here
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		// ("status" is always true here because of the goto above)
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		// next read starts at lastKey+1; if that compares below
		// lastKey the add wrapped around and there is nothing left
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
	      // debug msg
	      //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
	      //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	// . when control falls out of the block above "status" is always
	//   true, so this body is only ever entered through "goto hadError"
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.",
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		// (1000 is presumably milliseconds -- TODO confirm against
		// Loop::registerSleepCallback)
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log(
			    "db: Retry failed. Could not register callback.");
			// give up; report "did not block" to the caller
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	// (an empty list already returned inside the block above)
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}
// . return false if blocked, true otherwise // . sets g_errno on error bool RdbDump::set ( //char *coll , collnum_t collnum , BigFile *file , int32_t id2 , // in Rdb::m_files[] array bool isTitledb , RdbBuckets *buckets , // optional buckets to dump RdbTree *tree , // optional tree to dump RdbMap *map , RdbCache *cache , int32_t maxBufSize , bool orderedDump , // dump in order of keys? bool dedup , // 4 RdbCache::incorporateList() int32_t niceness , void *state , void (* callback) ( void *state ) , bool useHalfKeys , int64_t startOffset , //key_t prevLastKey , char *prevLastKey , char keySize , //class DiskPageCache *pc , void *pc , int64_t maxFileSize , Rdb *rdb ) { if ( ! orderedDump ) { log(LOG_LOGIC,"db: RdbDump does not support non-ordered."); char *xx = NULL; *xx = 0; } //if ( ! coll && //if ( ! coll && rdb->m_isCollectionLess ) // strcpy(m_coll,rdb->m_dbname); m_collnum = collnum; // use 0 for collectionless if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0; // are we like catdb/statsdb etc.? m_doCollCheck = true; if ( rdb && rdb->m_isCollectionLess ) m_doCollCheck = false; // RdbMerge also calls us but rdb is always set to NULL and it was // causing a merge on catdb (collectionless) to screw up if ( ! rdb ) m_doCollCheck = false; /* if ( ! coll && g_catdb.getRdb() == rdb ) strcpy(m_coll, "catdb"); else if ( ! coll && g_statsdb.getRdb() == rdb ) strcpy(m_coll, "statsdb"); else if ( ! 
coll && g_accessdb.getRdb() == rdb ) strcpy(m_coll, "accessdb"); */ //else // strcpy ( m_coll , coll ); m_file = file; m_id2 = id2; m_isTitledb = isTitledb; m_buckets = buckets; m_tree = tree; m_map = map; m_cache = cache; m_orderedDump = orderedDump; m_dedup = dedup; m_state = state; m_callback = callback; m_list = NULL; m_niceness = niceness; m_tried = false; m_isSuspended = false; m_ks = keySize; m_addToMap = true; // reset this in case we run out of mem, it doesn't get set properly // and needs to be NULL for RdbMem's call to getLastKeyinQueue() m_lastKeyInQueue = NULL; KEYMIN(m_firstKeyInQueue,m_ks); m_isDumping = false; m_writing = false; m_buf = NULL; m_verifyBuf = NULL; m_maxBufSize = maxBufSize; m_offset = startOffset ; m_rolledOver = false; // true if m_nextKey rolls over back to 0 //m_nextKey = 0 ; // used in dumpTree() KEYMIN(m_nextKey,m_ks); m_nextNode = 0 ; // used in dumpTree() // if we're dumping indexdb, allow half keys m_useHalfKeys = useHalfKeys; //m_prevLastKey = prevLastKey; KEYSET(m_prevLastKey,prevLastKey,m_ks); // for setting m_rdb->m_needsSave after deleting the dump list m_rdb = rdb; // . don't dump to a pre-existing file // . seems like Rdb.cpp makes a new BigFile before calling this // . now we can resume merges, so we can indeed dump to the END // of a pre-exiting file, but not when dumping a tree! //if ( m_file->doesExist() > 0 ) { if ( (m_tree || m_buckets) && m_file->getFileSize() > 0 ) { g_errno = EEXIST; log("db: Could not dump to %s. File exists.", m_file->getFilename()); return true; } // . NOTE: MAX_PART_SIZE in BigFile must be defined to be bigger than // anything we actually dump since we only anticipate spanning 1 file // and so only register the first file's fd for write callbacks //if ( m_tree && m_tree->getMaxMem() > MAX_PART_SIZE ) //return log("RdbDump::dump: tree bigger than file part size"); // . open the file nonblocking, sync with disk, read/write // . NOTE: O_SYNC doesn't work too well over NFS // . 
we need O_SYNC when dumping trees only because we delete the // nodes/records as we dump them // . ensure this sets g_errno for us // . TODO: open might not block! fix that! int32_t flags = O_RDWR | O_CREAT ; // a niceness bigger than 0 means to do non-blocking dumps if ( niceness > 0 ) flags |= O_ASYNC | O_NONBLOCK ; if ( ! m_file->open ( flags , pc , maxFileSize ) ) return true; // . get the file descriptor of the first real file in BigFile // . we should only dump to the first file in BigFile otherwise, // we'd have to juggle fd registration m_fd = m_file->getfd ( 0 , false /*for reading?*/ ); if ( m_fd < 0 ) { log(LOG_LOGIC,"db: dump: Bad fd of first file in BigFile.") ; return true; } // debug test //char buf1[10*1024]; //int32_t n1 = m_file->write ( buf1 , 10*1024 , 0 ); //log("bytes written=%"INT32"\n",n1); // we're now considered to be in dumping state m_isDumping = true; // . if no tree was provided to dump it must be RdbMerge calling us // . he'll want to call dumpList() on his own if ( ! m_tree && !m_buckets ) return true; // how many recs in tree? int32_t nr; char *structureName; if(m_tree) { nr = m_tree->getNumUsedNodes(); structureName = "tree"; } else if(m_buckets){ nr = m_buckets->getNumKeys(); structureName = "buckets"; } // debug msg log(LOG_INFO,"db: Dumping %"INT32" recs from %s to files.", nr, structureName); // nr , m_file->getFilename() ); // keep a total count for reporting when done m_totalPosDumped = 0; m_totalNegDumped = 0; // we have our own flag here since m_dump::m_isDumping gets // set to true between collection dumps, RdbMem.cpp needs // a flag that doesn't do that... see RdbDump.cpp. // this was in Rdb.cpp but when threads were turned off it was // NEVER getting set and resulted in corruption in RdbMem.cpp. m_rdb->m_inDumpLoop = true; // . start dumping the tree // . return false if it blocked if ( ! dumpTree ( false ) ) return false; // no longer dumping doneDumping(); // return true since we didn't block return true; }
bool RdbMerge::getAnotherList ( ) { log(LOG_DEBUG,"db: Getting another list for merge."); // clear it up in case it was already set g_errno = 0; // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base = getRdbBase( m_rdbId, m_collnum ); if ( ! base ) { return true; } // if merging titledb files, we must adjust m_endKey so we do // not have to read a huge 200MB+ tfndb list //key_t newEndKey = m_endKey; char newEndKey[MAX_KEY_BYTES]; KEYSET(newEndKey,m_endKey,m_ks); // . this returns false if blocked, true otherwise // . sets g_errno on error // . we return false if it blocked // . m_maxBufSize may be exceeded by a rec, it's just a target size // . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0 // . this was a call to Msg3, but i made it call Msg5 since // we now do the merging in Msg5, not in msg3 anymore // . this will now handle truncation, dup and neg rec removal // . it remembers last termId and count so it can truncate even when // IndexList is split between successive reads // . IMPORTANT: when merging titledb we could be merging about 255 // files, so if we are limited to only X fds it can have a cascade // affect where reading from one file closes the fd of another file // in the read (since we call open before spawning the read thread) // and can therefore take 255 retries for the Msg3 to complete // because each read gives a EFILCLOSED error. // so to fix it we allow one retry for each file in the read plus // the original retry of 25 int32_t nn = base->getNumFiles(); if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles; // don't access any biased page caches bool usePageCache = true; if ( m_rdbId == RDB_CLUSTERDB ) usePageCache = false; // . i don't trust page cache too much (mdw)... well, give it a shot // . see if ths helps fix WD corruption... 
i doubt it usePageCache = false; // for now force to 100k int32_t bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes // get it return m_msg5.getList ( m_rdbId , m_collnum , &m_list , m_startKey , newEndKey , // usually is maxed! bufSize , false , // includeTree? false , // add to cache? 0 , // max cache age for lookup m_startFileNum , // startFileNum m_numFiles , this , // state gotListWrapper , // callback m_niceness , // niceness true , // do error correction? NULL , // cache key ptr 0 , // retry # nn + 75 , // max retries (mk it high) false , // compensate for merge? -1LL , // sync point true , // isRealMerge? absolutely! usePageCache ); }