/* get the tdb sequence number. Only makes sense if the writers opened
   with TDB_SEQNUM set. Note that this sequence number will wrap quite
   quickly, so it should only be used for a 'has something changed'
   test, not for code that relies on the count of the number of changes
   made. If you want a counter then use a tdb record.

   The aim of this sequence number is to allow for a very lightweight
   test of a possible tdb change.
*/
int tdb1_get_seqnum(struct tdb_context *tdb)
{
	tdb1_off_t seqnum=0;

	tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
	return seqnum;
}
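/* Illustrative only: a minimal sketch of the 'has something changed'
   test described above. This is not part of the tdb1 API; the helper
   name is hypothetical, and it assumes the caller keeps its own
   snapshot of the sequence number. Because the seqnum wraps quickly,
   equality only means "probably unchanged", which is exactly the
   lightweight test this counter is designed for. */
#if 0
static bool example_tdb_maybe_changed(struct tdb_context *tdb,
				      int *last_seqnum)
{
	int now = tdb1_get_seqnum(tdb);

	if (now == *last_seqnum) {
		return false;	/* very likely nothing changed */
	}
	*last_seqnum = now;
	return true;		/* something (probably) changed; re-read */
}
#endif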
/*
 * See if we have a dead record around with enough space
 */
static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
				 struct tdb1_record *r, tdb1_len_t length)
{
	tdb1_off_t rec_ptr;

	/* read in the hash top */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
		return 0;

	/* keep looking until we find the right record */
	while (rec_ptr) {
		if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
			return 0;

		if (TDB1_DEAD(r) && r->rec_len >= length) {
			/*
			 * First fit for simple coding, TODO: change to best
			 * fit
			 */
			return rec_ptr;
		}
		rec_ptr = r->next;
	}
	return 0;
}
/* actually delete an entry in the database given the offset */
int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
{
	tdb1_off_t last_ptr, i;
	struct tdb1_record lastrec;

	if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read)
		return -1;

	if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
	    tdb1_write_lock_record(tdb, rec_ptr) == -1) {
		/* Someone traversing here: mark it as dead */
		rec->magic = TDB1_DEAD_MAGIC;
		return tdb1_rec_write(tdb, rec_ptr, rec);
	}
	if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
		return -1;

	/* find previous record in hash chain */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
		return -1;
	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
		if (tdb1_rec_read(tdb, i, &lastrec) == -1)
			return -1;

	/* unlink it: next ptr is at start of record. */
	if (last_ptr == 0)
		last_ptr = TDB1_HASH_TOP(rec->full_hash);
	if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
		return -1;

	/* recover the space */
	if (tdb1_free(tdb, rec_ptr, rec) == -1)
		return -1;
	return 0;
}
/*
 * Purge all DEAD records from a hash chain
 */
static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
{
	int res = -1;
	struct tdb1_record rec;
	tdb1_off_t rec_ptr;

	if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
		return -1;
	}

	/* read in the hash top */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
		goto fail;

	while (rec_ptr) {
		tdb1_off_t next;

		if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
			goto fail;
		}

		next = rec.next;

		if (rec.magic == TDB1_DEAD_MAGIC
		    && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
			goto fail;
		}
		rec_ptr = next;
	}
	res = 0;
 fail:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return res;
}
/* Returns 0 on fail; last_error will be TDB_ERR_NOEXIST if it simply
 * wasn't there, otherwise a real error.
 * On success, return offset of record, and fills in rec */
static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
			    struct tdb1_record *r)
{
	tdb1_off_t rec_ptr;

	/* read in the hash top */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
		return 0;

	/* keep looking until we find the right record */
	while (rec_ptr) {
		if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
			return 0;
		tdb->stats.compares++;
		if (TDB1_DEAD(r)) {
			tdb->stats.compare_wrong_bucket++;
		} else if (key.dsize != r->key_len) {
			tdb->stats.compare_wrong_keylen++;
		} else if (hash != r->full_hash) {
			tdb->stats.compare_wrong_rechash++;
		} else {
			enum TDB_ERROR ecode;
			bool matches;

			ecode = tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
						r->key_len, tdb1_key_compare,
						&matches);
			if (ecode != TDB_SUCCESS) {
				tdb->last_error = ecode;
				return 0;
			}
			if (!matches) {
				tdb->stats.compare_wrong_keycmp++;
			} else {
				return rec_ptr;
			}
		}
		/* detect tight infinite loop */
		if (rec_ptr == r->next) {
			tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
						     TDB_LOG_ERROR,
						     "tdb1_find: loop detected.");
			return 0;
		}
		rec_ptr = r->next;
	}
	tdb->last_error = TDB_ERR_NOEXIST;
	return 0;
}
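/* Note on the loop check in tdb1_find() above: it only catches a record
   whose next pointer refers back to itself (the "tight" loop, a common
   corruption pattern). Longer cycles in a hash chain are not detected
   here, so the check is a cheap safeguard rather than full validation. */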
/* non-blocking increment of the tdb sequence number if the tdb has been opened
   using the TDB_SEQNUM flag */
void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
{
	tdb1_off_t seqnum=0;

	if (!(tdb->flags & TDB_SEQNUM)) {
		return;
	}

	/* we ignore errors from this, as we have no sane way of
	   dealing with them.
	*/
	tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
	seqnum++;
	tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
}
static size_t get_hash_length(struct tdb_context *tdb, unsigned int i)
{
	tdb1_off_t rec_ptr;
	size_t count = 0;

	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(i), &rec_ptr) == -1)
		return 0;

	/* walk the whole hash chain, counting every record */
	while (rec_ptr) {
		struct tdb1_record r;
		++count;
		if (tdb1_rec_read(tdb, rec_ptr, &r) == -1)
			return 0;
		rec_ptr = r.next;
	}
	return count;
}
static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
{
	int res = 0;
	tdb1_off_t rec_ptr;
	struct tdb1_record rec;

	/* read in the hash top */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
		return 0;

	while (rec_ptr) {
		if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
			return 0;

		if (rec.magic == TDB1_DEAD_MAGIC) {
			res += 1;
		}
		rec_ptr = rec.next;
	}
	return res;
}
/*
  wipe the entire database, deleting all records. This can be done
  very fast by using an allrecord lock. The entire data portion of the
  file becomes a single entry in the freelist.

  This code carefully steps around the recovery area, leaving it alone
 */
int tdb1_wipe_all(struct tdb_context *tdb)
{
	int i;
	tdb1_off_t offset = 0;
	ssize_t data_len;
	tdb1_off_t recovery_head;
	tdb1_len_t recovery_size = 0;

	if (tdb_lockall(tdb) != TDB_SUCCESS) {
		return -1;
	}

	/* see if the tdb has a recovery area, and remember its size
	   if so. We don't want to lose this as otherwise each
	   tdb1_wipe_all() in a transaction will increase the size of
	   the tdb by the size of the recovery area */
	if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_wipe_all: failed to read recovery head");
		goto failed;
	}

	if (recovery_head != 0) {
		struct tdb1_record rec;
		if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec,
					    sizeof(rec), TDB1_DOCONV()) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_wipe_all: failed to read recovery record");
			/* don't leak the allrecord lock on this error path */
			goto failed;
		}
		recovery_size = rec.rec_len + sizeof(rec);
	}

	/* wipe the hashes */
	for (i=0;i<tdb->tdb1.header.hash_size;i++) {
		if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_wipe_all: failed to write hash %d", i);
			goto failed;
		}
	}

	/* wipe the freelist */
	if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_wipe_all: failed to write freelist");
		goto failed;
	}

	/* add all the rest of the file to the freelist, possibly
	   leaving a gap for the recovery area */
	if (recovery_size == 0) {
		/* the simple case - the whole file can be used as a freelist */
		data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
		if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
			goto failed;
		}
	} else {
		/* we need to add two freelist entries - one on either
		   side of the recovery area

		   Note that we cannot shift the recovery area during
		   this operation. Only the transaction.c code may
		   move the recovery area or we risk subtle data
		   corruption
		*/
		data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
		if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
			goto failed;
		}
		/* and the 2nd free list entry after the recovery area - if any */
		data_len = tdb->file->map_size - (recovery_head+recovery_size);
		if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
			goto failed;
		}
	}

	tdb1_increment_seqnum_nonblock(tdb);
	tdb_unlockall(tdb);
	return 0;

failed:
	tdb_unlockall(tdb);
	return -1;
}
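/* For reference, the file layout that tdb1_wipe_all() leaves behind when
   a recovery area exists (regions shown in file order, not to scale):

	[header | hash table]	up to TDB1_DATA_START(hash_size)
	[free region 1]		up to recovery_head
	[recovery area]		recovery_head .. recovery_head+recovery_size
	[free region 2]		up to tdb->file->map_size

   With no recovery area, the whole span from TDB1_DATA_START() to the end
   of the map becomes a single freelist entry. */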
static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
		       TDB_DATA dbuf, int flag, uint32_t hash)
{
	struct tdb1_record rec;
	tdb1_off_t rec_ptr;
	char *p = NULL;
	int ret = -1;

	/* check for it existing, on insert. */
	if (flag == TDB_INSERT) {
		if (tdb1_exists_hash(tdb, key, hash)) {
			tdb->last_error = TDB_ERR_EXISTS;
			goto fail;
		}
		if (tdb->last_error != TDB_ERR_NOEXIST) {
			goto fail;
		}
	} else {
		/* first try in-place update, on modify or replace. */
		if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
			goto done;
		}
		if (tdb->last_error != TDB_SUCCESS) {
			if (tdb->last_error != TDB_ERR_NOEXIST) {
				goto fail;
			}
			if (flag == TDB_MODIFY) {
				/* if the record doesn't exist and we
				   are in TDB_MODIFY mode then we should
				   fail the store */
				goto fail;
			}
		}
	}
	/* reset the error code potentially set by the tdb1_update_hash() */
	tdb->last_error = TDB_SUCCESS;

	/* delete any existing record - if it doesn't exist we don't
	   care.  Doing this first reduces fragmentation, and avoids
	   coalescing with `allocated' block before it's updated. */
	if (flag != TDB_INSERT)
		tdb1_delete_hash(tdb, key, hash);

	/* Copy key+value *before* allocating free space in case malloc
	   fails and we are left with a dead spot in the tdb. */

	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
					     "tdb1_store: out of memory"
					     " allocating copy");
		goto fail;
	}

	memcpy(p, key.dptr, key.dsize);
	if (dbuf.dsize)
		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);

	if (tdb->tdb1.max_dead_records != 0) {
		/*
		 * Allow for some dead records per hash chain, look if we can
		 * find one that can hold the new record. We need enough space
		 * for key, data and tailer. If we find one, we don't have to
		 * consult the central freelist.
		 */
		rec_ptr = tdb1_find_dead(
			tdb, hash, &rec,
			key.dsize + dbuf.dsize + sizeof(tdb1_off_t));

		if (rec_ptr != 0) {
			rec.key_len = key.dsize;
			rec.data_len = dbuf.dsize;
			rec.full_hash = hash;
			rec.magic = TDB1_MAGIC;
			if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
			    || tdb->tdb1.io->tdb1_write(
				    tdb, rec_ptr + sizeof(rec),
				    p, key.dsize + dbuf.dsize) == -1) {
				goto fail;
			}
			goto done;
		}
	}

	/*
	 * We have to allocate some space from the freelist, so this means we
	 * have to lock it. Use the chance to purge all the DEAD records from
	 * the hash chain under the freelist lock.
	 */

	if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
		goto fail;
	}

	if ((tdb->tdb1.max_dead_records != 0)
	    && (tdb1_purge_dead(tdb, hash) == -1)) {
		tdb1_unlock(tdb, -1, F_WRLCK);
		goto fail;
	}

	/* we have to allocate some space */
	rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);

	tdb1_unlock(tdb, -1, F_WRLCK);

	if (rec_ptr == 0) {
		goto fail;
	}

	/* Read hash top into next ptr */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
		goto fail;

	rec.key_len = key.dsize;
	rec.data_len = dbuf.dsize;
	rec.full_hash = hash;
	rec.magic = TDB1_MAGIC;

	/* write out and point the top of the hash chain at it */
	if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
	    || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
	    || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
		/* Need to tdb1_unallocate() here */
		goto fail;
	}

 done:
	ret = 0;
 fail:
	if (ret == 0) {
		tdb1_increment_seqnum(tdb);
	}

	SAFE_FREE(p);
	return ret;
}
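/* Illustrative only: how the store flags behave from a caller's point of
   view. This sketch assumes a public tdb1_store() wrapper that hashes the
   key and calls _tdb1_store() with the result; the helper name below is
   hypothetical.

     TDB_INSERT  - fail with TDB_ERR_EXISTS if the key is already present
     TDB_MODIFY  - fail with TDB_ERR_NOEXIST if the key is not present
     otherwise   - try an in-place update first, else delete + re-add
*/
#if 0
static int example_upsert(struct tdb_context *tdb, TDB_DATA key, TDB_DATA val)
{
	/* create-only: a second insert of the same key fails */
	if (tdb1_store(tdb, key, val, TDB_INSERT) == 0) {
		return 0;
	}
	if (tdb->last_error != TDB_ERR_EXISTS) {
		return -1;	/* a real error, not just "already there" */
	}
	/* key exists: replace its value */
	return tdb1_store(tdb, key, val, TDB_REPLACE);
}
#endif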
/* Add an element into the freelist. Merge adjacent records if
   necessary. */
int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
{
	/* Allocation and tailer lock */
	if (tdb1_lock(tdb, -1, F_WRLCK) != 0)
		return -1;

	/* set an initial tailer, so if we fail we don't leave a bogus record */
	if (update_tailer(tdb, offset, rec) != 0) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_free: update_tailer failed!\n");
		goto fail;
	}

	tdb->stats.alloc_coalesce_tried++;
	/* Look left */
	if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
		tdb1_off_t left = offset - sizeof(tdb1_off_t);
		struct tdb1_record l;
		tdb1_off_t leftsize;

		/* Read in tailer and jump back to header */
		if (tdb1_ofs_read(tdb, left, &leftsize) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_free: left offset read failed at %u", left);
			goto update;
		}

		/* it could be uninitialised data */
		if (leftsize == 0 || leftsize == TDB1_PAD_U32) {
			goto update;
		}

		left = offset - leftsize;

		if (leftsize > offset ||
		    left < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
			goto update;
		}

		/* Now read in the left record */
		if (tdb->tdb1.io->tdb1_read(tdb, left, &l, sizeof(l), TDB1_DOCONV()) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_free: left read failed at %u (%u)",
				   left, leftsize);
			goto update;
		}

		/* If it's free, expand to include it. */
		if (l.magic == TDB1_FREE_MAGIC) {
			/* we now merge the new record into the left record,
			   rather than the other way around. This makes the
			   operation O(1) instead of O(n). This change
			   prevents traverse from being O(n^2) after a lot of
			   deletes */
			l.rec_len += sizeof(*rec) + rec->rec_len;
			if (tdb1_rec_write(tdb, left, &l) == -1) {
				tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
					   "tdb1_free: update_left failed at %u", left);
				goto fail;
			}
			if (update_tailer(tdb, left, &l) == -1) {
				tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
					   "tdb1_free: update_tailer failed at %u",
					   offset);
				goto fail;
			}
			tdb->stats.alloc_coalesce_succeeded++;
			tdb->stats.alloc_coalesce_num_merged++;
			tdb->stats.frees++;
			tdb1_unlock(tdb, -1, F_WRLCK);
			return 0;
		}
	}

update:

	/* Now, prepend to free list */
	rec->magic = TDB1_FREE_MAGIC;

	if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 ||
	    tdb1_rec_write(tdb, offset, rec) == -1 ||
	    tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_free record write failed at offset=%d",
			   offset);
		goto fail;
	}

	/* And we're done. */
	tdb->stats.frees++;
	tdb1_unlock(tdb, -1, F_WRLCK);
	return 0;

fail:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return -1;
}
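/* For reference, the on-disk shape that makes the "look left" merge in
   tdb1_free() possible. Every record ends with a tailer word holding the
   total length of that record (header + data), so the neighbour to the
   left of `offset` is found by reading the word just below it:

	[struct tdb1_record | key | data | ... | tailer]
	                                         ^ offset - sizeof(tdb1_off_t)

   left = offset - leftsize then points at the left neighbour's header,
   which is merged into when its magic is TDB1_FREE_MAGIC. */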
/* allocate some space from the free list. The offset returned points
   to an unconnected tdb1_record within the database with room for at
   least length bytes of total data

   0 is returned if the space could not be allocated
 */
tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length, struct tdb1_record *rec)
{
	tdb1_off_t rec_ptr, last_ptr, newrec_ptr;
	struct {
		tdb1_off_t rec_ptr, last_ptr;
		tdb1_len_t rec_len;
	} bestfit;
	float multiplier = 1.0;

	if (tdb1_lock(tdb, -1, F_WRLCK) == -1)
		return 0;

	/* over-allocate to reduce fragmentation */
	length *= 1.25;

	/* Extra bytes required for tailer */
	length += sizeof(tdb1_off_t);
	length = TDB1_ALIGN(length, TDB1_ALIGNMENT);

 again:
	last_ptr = TDB1_FREELIST_TOP;

	/* read in the freelist top */
	if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1)
		goto fail;

	bestfit.rec_ptr = 0;
	bestfit.last_ptr = 0;
	bestfit.rec_len = 0;

	/*
	   this is a best fit allocation strategy. Originally we used
	   a first fit strategy, but it suffered from massive
	   fragmentation issues when faced with a slowly increasing
	   record size.
	 */
	while (rec_ptr) {
		if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) {
			goto fail;
		}

		if (rec->rec_len >= length) {
			if (bestfit.rec_ptr == 0 ||
			    rec->rec_len < bestfit.rec_len) {
				bestfit.rec_len = rec->rec_len;
				bestfit.rec_ptr = rec_ptr;
				bestfit.last_ptr = last_ptr;
			}
		}

		/* move to the next record */
		last_ptr = rec_ptr;
		rec_ptr = rec->next;

		/* if we've found a record that is big enough, then
		   stop searching if it's also not too big. The
		   definition of 'too big' changes as we scan through */
		if (bestfit.rec_len > 0 &&
		    bestfit.rec_len < length * multiplier) {
			break;
		}

		/* this multiplier means we only extremely rarely
		   search more than 50 or so records. At 50 records we
		   accept records up to 11 times larger than what we
		   want */
		multiplier *= 1.05;
	}

	if (bestfit.rec_ptr != 0) {
		if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
			goto fail;
		}

		newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr,
					       rec, bestfit.last_ptr);
		tdb1_unlock(tdb, -1, F_WRLCK);
		return newrec_ptr;
	}

	/* we didn't find enough space. See if we can expand the
	   database and if we can then try again */
	if (tdb1_expand(tdb, length + sizeof(*rec)) == 0)
		goto again;
 fail:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return 0;
}
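/* A quick sanity check of the 'up to 11 times larger' claim above: the
   acceptance threshold after scanning n freelist records is
   length * 1.05^n, and 1.05^50 ~= 11.5, so by the 50th record almost any
   sufficiently large free record is accepted. Illustrative, standalone
   sketch only (not compiled into this file): */
#if 0
#include <math.h>
#include <stdio.h>

int main(void)
{
	/* threshold multiplier after scanning 50 freelist records */
	printf("1.05^50 = %.2f\n", pow(1.05, 50));	/* prints ~11.47 */
	return 0;
}
#endif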