/*
 * Delete an entry in the database given a key and its precomputed hash.
 *
 * Two strategies, selected by tdb->tdb1.max_dead_records:
 *  - dead-record mode (max_dead_records != 0): under the chain write
 *    lock only, mark the record TDB1_DEAD_MAGIC instead of freeing it;
 *    the chain is purged later under the freelist lock.  This is cheap
 *    for high create/delete churn (e.g. locking.tdb).
 *  - normal mode: find-and-lock the chain, then fully delete the record
 *    via tdb1_do_delete().
 *
 * In both paths the chain lock taken here is released at the end via
 * TDB1_BUCKET(rec.full_hash), which equals TDB1_BUCKET(hash) since
 * rec was found in that chain.
 *
 * Returns 0 on success, -1 on error.
 */
static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key,
			    uint32_t hash)
{
	tdb1_off_t rec_ptr;
	struct tdb1_record rec;
	int ret;

	if (tdb->tdb1.max_dead_records != 0) {
		/*
		 * Allow for some dead records per hash chain, mainly for
		 * tdb's with a very high create/delete rate like locking.tdb.
		 */
		if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
			return -1;

		if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
			/*
			 * Don't let the per-chain freelist grow too large,
			 * delete all existing dead records
			 */
			tdb1_purge_dead(tdb, hash);
		}

		if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
			/* Key not present: drop the lock we took above. */
			tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
			return -1;
		}

		/*
		 * Just mark the record as dead.
		 */
		rec.magic = TDB1_DEAD_MAGIC;
		ret = tdb1_rec_write(tdb, rec_ptr, &rec);
	} else {
		/* On success this leaves the chain write-locked. */
		if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
						    &rec)))
			return -1;

		ret = tdb1_do_delete(tdb, rec_ptr, &rec);
	}

	if (ret == 0) {
		tdb1_increment_seqnum(tdb);
	}

	if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_delete: WARNING tdb1_unlock failed!");
	return ret;
}
/*
 * Feed a record's data directly to a caller-supplied parser without
 * copying it out first.  The parser runs while the hash chain is held
 * under a read lock.  Returns the parser's result, or tdb->last_error
 * if the key could not be found/locked.
 */
enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
				 enum TDB_ERROR (*parser)(TDB_DATA key,
							  TDB_DATA data,
							  void *private_data),
				 void *private_data)
{
	struct tdb1_record rec;
	tdb1_off_t off;
	enum TDB_ERROR result;

	/* Locate the record and take a read lock on its hash chain. */
	off = tdb1_find_lock_hash(tdb, key,
				  tdb_hash(tdb, key.dptr, key.dsize),
				  F_RDLCK, &rec);
	if (off == 0) {
		return tdb->last_error;
	}

	/* The data portion starts after the record header and the key. */
	result = tdb1_parse_data(tdb, key, off + sizeof(rec) + rec.key_len,
				 rec.data_len, parser, private_data);

	tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
	return result;
}
/*
 * Purge all DEAD records from a hash chain.  Returns 0 on success,
 * -1 on failure.
 */
static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
{
	struct tdb1_record rec;
	tdb1_off_t cur;
	int result = -1;

	/* Deleting records touches the freelist, so take the global lock. */
	if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
		return -1;
	}

	/* Start at the top of this hash chain. */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &cur) == -1) {
		goto out;
	}

	/* Walk the chain, fully deleting every record marked dead. */
	while (cur != 0) {
		tdb1_off_t following;

		if (tdb1_rec_read(tdb, cur, &rec) == -1) {
			goto out;
		}
		/* Capture the link before the record may be freed. */
		following = rec.next;
		if (rec.magic == TDB1_DEAD_MAGIC &&
		    tdb1_do_delete(tdb, cur, &rec) == -1) {
			goto out;
		}
		cur = following;
	}

	result = 0;
out:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return result;
}
/* check if an entry in the database exists

   note that 1 is returned if the key is found and 0 is returned
   if not found this doesn't match the conventions in the rest of
   this module, but is compatible with gdbm
*/
static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key,
			    uint32_t hash)
{
	struct tdb1_record rec;

	/* A successful find leaves the chain read-locked; release it
	 * before reporting the hit. */
	if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) != 0) {
		tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
		return 1;
	}
	return 0;
}
/* As tdb1_find, but if you succeed, keep the lock */
tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key,
			       uint32_t hash, int locktype,
			       struct tdb1_record *rec)
{
	uint32_t found;

	if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1) {
		return 0;
	}

	found = tdb1_find(tdb, key, hash, rec);
	if (found == 0) {
		/* Not found: never hold the chain lock on failure. */
		tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
	}
	return found;
}
/*
 * Append to an entry. Create if not exist.
 *
 * Fetches the existing value (if any), grows the buffer, appends
 * new_dbuf and stores the result back — all under the chain write lock
 * so the read-modify-write is atomic w.r.t. other writers.
 *
 * Returns 0 on success, -1 on failure (tdb->last_error set on OOM).
 */
int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
{
	uint32_t hash;
	TDB_DATA dbuf;
	int ret = -1;

	assert(tdb->flags & TDB_VERSION1);

	/* find which hash bucket it is in */
	hash = tdb_hash(tdb, key.dptr, key.dsize);
	if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
		return -1;

	dbuf = _tdb1_fetch(tdb, key);

	if (dbuf.dptr == NULL) {
		/* Key absent (or fetch failed): start from a fresh buffer
		 * sized for the new data alone (dbuf.dsize is 0 here). */
		dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
	} else {
		unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
		unsigned char *new_dptr;

		/* realloc '0' is special: don't do that. */
		if (new_len == 0)
			new_len = 1;
		new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
		if (new_dptr == NULL) {
			/* realloc failure keeps the old block alive; free it
			 * so the NULL check below reports OOM without a leak
			 * (SAFE_FREE(NULL) at the end is then a no-op). */
			free(dbuf.dptr);
		}
		dbuf.dptr = new_dptr;
	}

	if (dbuf.dptr == NULL) {
		tdb->last_error = TDB_ERR_OOM;
		goto failed;
	}

	memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
	dbuf.dsize += new_dbuf.dsize;

	ret = _tdb1_store(tdb, key, dbuf, 0, hash);

failed:
	tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
	SAFE_FREE(dbuf.dptr);
	return ret;
}
/* If an entry doesn't exist tdb1_err will be set to
 * TDB_ERR_NOEXIST. If a key has no data attached
 * then the TDB_DATA will have zero length but
 * a non-zero pointer
 */
static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
{
	struct tdb1_record rec;
	tdb1_off_t off;
	TDB_DATA result;
	uint32_t hash;

	/* Locate the record under a read lock on its hash chain. */
	hash = tdb_hash(tdb, key.dptr, key.dsize);
	off = tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec);
	if (off == 0) {
		result.dptr = NULL;
		result.dsize = 0;
		return result;
	}

	/* The value lives right after the record header and the key. */
	result.dptr = tdb1_alloc_read(tdb, off + sizeof(rec) + rec.key_len,
				      rec.data_len);
	result.dsize = rec.data_len;
	tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
	return result;
}
/* store an element in the database, replacing any existing element
   with the same key

   return 0 on success, -1 on failure
*/
int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
{
	uint32_t hash;
	int result;

	assert(tdb->flags & TDB_VERSION1);

	/* Refuse writes on read-only handles and during a read-only
	 * traverse. */
	if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY,
					     TDB_LOG_USE_ERROR,
					     "tdb_store: read-only tdb");
		return -1;
	}

	/* Serialize against other writers of the same hash chain. */
	hash = tdb_hash(tdb, key.dptr, key.dsize);
	if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1) {
		return -1;
	}

	result = _tdb1_store(tdb, key, dbuf, flag, hash);
	tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
	return result;
}
/*
 * Insert/replace a record; the caller already holds the F_WRLCK on
 * TDB1_BUCKET(hash).
 *
 * flag is TDB_INSERT (fail with TDB_ERR_EXISTS if the key exists),
 * TDB_MODIFY (fail if it does not exist), or replace.  Space is taken
 * from a per-chain dead record when max_dead_records allows, otherwise
 * from the central freelist under the global (-1) lock.
 *
 * Returns 0 on success (seqnum bumped), -1 on failure with
 * tdb->last_error set.  Note the deliberate fallthrough from done:
 * into fail: at the bottom.
 */
static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
		       TDB_DATA dbuf, int flag, uint32_t hash)
{
	struct tdb1_record rec;
	tdb1_off_t rec_ptr;
	char *p = NULL;
	int ret = -1;

	/* check for it existing, on insert. */
	if (flag == TDB_INSERT) {
		if (tdb1_exists_hash(tdb, key, hash)) {
			tdb->last_error = TDB_ERR_EXISTS;
			goto fail;
		}
		/* exists==0 for any reason other than "not there"
		 * (e.g. an I/O error) is a real failure. */
		if (tdb->last_error != TDB_ERR_NOEXIST) {
			goto fail;
		}
	} else {
		/* first try in-place update, on modify or replace. */
		if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
			goto done;
		}
		if (tdb->last_error != TDB_SUCCESS) {
			if (tdb->last_error != TDB_ERR_NOEXIST) {
				goto fail;
			}
			if (flag == TDB_MODIFY) {
				/* if the record doesn't exist and we
				   are in TDB1_MODIFY mode then we should fail
				   the store */
				goto fail;
			}
		}
	}
	/* reset the error code potentially set by the tdb1_update() */
	tdb->last_error = TDB_SUCCESS;

	/* delete any existing record - if it doesn't exist we don't
	   care.  Doing this first reduces fragmentation, and avoids
	   coalescing with `allocated' block before it's updated. */
	if (flag != TDB_INSERT)
		tdb1_delete_hash(tdb, key, hash);

	/* Copy key+value *before* allocating free space in case malloc
	   fails and we are left with a dead spot in the tdb. */
	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
					     "tdb1_store: out of memory"
					     " allocating copy");
		goto fail;
	}

	memcpy(p, key.dptr, key.dsize);
	if (dbuf.dsize)
		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);

	if (tdb->tdb1.max_dead_records != 0) {
		/*
		 * Allow for some dead records per hash chain, look if we can
		 * find one that can hold the new record. We need enough space
		 * for key, data and tailer. If we find one, we don't have to
		 * consult the central freelist.
		 */
		rec_ptr = tdb1_find_dead(
			tdb, hash, &rec,
			key.dsize + dbuf.dsize + sizeof(tdb1_off_t));

		if (rec_ptr != 0) {
			/* Reuse the dead record in place: rewrite its
			 * header, then the key+data payload. */
			rec.key_len = key.dsize;
			rec.data_len = dbuf.dsize;
			rec.full_hash = hash;
			rec.magic = TDB1_MAGIC;
			if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
			    || tdb->tdb1.io->tdb1_write(
				    tdb, rec_ptr + sizeof(rec),
				    p, key.dsize + dbuf.dsize) == -1) {
				goto fail;
			}
			goto done;
		}
	}

	/*
	 * We have to allocate some space from the freelist, so this means we
	 * have to lock it. Use the chance to purge all the DEAD records from
	 * the hash chain under the freelist lock.
	 */
	if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
		goto fail;
	}

	if ((tdb->tdb1.max_dead_records != 0)
	    && (tdb1_purge_dead(tdb, hash) == -1)) {
		tdb1_unlock(tdb, -1, F_WRLCK);
		goto fail;
	}

	/* we have to allocate some space */
	rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);

	tdb1_unlock(tdb, -1, F_WRLCK);

	if (rec_ptr == 0) {
		goto fail;
	}

	/* Read hash top into next ptr */
	if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
		goto fail;

	rec.key_len = key.dsize;
	rec.data_len = dbuf.dsize;
	rec.full_hash = hash;
	rec.magic = TDB1_MAGIC;

	/* write out and point the top of the hash chain at it */
	if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
	    || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p,
					key.dsize+dbuf.dsize)==-1
	    || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
		/* Need to tdb1_unallocate() here */
		goto fail;
	}

 done:
	ret = 0;
 fail:
	/* Shared exit: done falls through with ret==0. */
	if (ret == 0) {
		tdb1_increment_seqnum(tdb);
	}
	SAFE_FREE(p);
	return ret;
}
/*
 * Add an element into the freelist. Merge adjacent records if
 * necessary.
 *
 * `offset` points at the record header; `rec` is its in-memory copy.
 * Takes the global (-1) allocation/tailer lock for the duration.
 * If the physically preceding record is free, the new record is merged
 * into it; otherwise the record is prepended to the freelist.
 *
 * Returns 0 on success, -1 on failure.
 */
int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
{
	/* Allocation and tailer lock */
	if (tdb1_lock(tdb, -1, F_WRLCK) != 0)
		return -1;

	/* set an initial tailer, so if we fail we don't leave a bogus record */
	if (update_tailer(tdb, offset, rec) != 0) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb_free: update_tailer failed!\n");
		goto fail;
	}

	tdb->stats.alloc_coalesce_tried++;
	/* Look left: every record is followed by a tailer word holding
	 * its total size, so the word just before `offset` is the
	 * previous record's tailer. */
	if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
		tdb1_off_t left = offset - sizeof(tdb1_off_t);
		struct tdb1_record l;
		tdb1_off_t leftsize;

		/* Read in tailer and jump back to header */
		if (tdb1_ofs_read(tdb, left, &leftsize) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_free: left offset read failed at %u", left);
			/* Non-fatal: skip coalescing, just freelist it. */
			goto update;
		}

		/* it could be uninitialised data */
		if (leftsize == 0 || leftsize == TDB1_PAD_U32) {
			goto update;
		}

		left = offset - leftsize;

		/* Sanity-check the computed header position. */
		if (leftsize > offset ||
		    left < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
			goto update;
		}

		/* Now read in the left record */
		if (tdb->tdb1.io->tdb1_read(tdb, left, &l, sizeof(l),
					    TDB1_DOCONV()) == -1) {
			tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
				   "tdb1_free: left read failed at %u (%u)",
				   left, leftsize);
			goto update;
		}

		/* If it's free, expand to include it. */
		if (l.magic == TDB1_FREE_MAGIC) {
			/* we now merge the new record into the left record,
			   rather than the other way around. This makes the
			   operation O(1) instead of O(n). This change prevents
			   traverse from being O(n^2) after a lot of deletes */
			l.rec_len += sizeof(*rec) + rec->rec_len;
			if (tdb1_rec_write(tdb, left, &l) == -1) {
				tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
					   "tdb1_free: update_left failed at %u",
					   left);
				goto fail;
			}
			if (update_tailer(tdb, left, &l) == -1) {
				tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
					   "tdb1_free: update_tailer failed at %u",
					   offset);
				goto fail;
			}
			tdb->stats.alloc_coalesce_succeeded++;
			tdb->stats.alloc_coalesce_num_merged++;
			tdb->stats.frees++;
			/* Merged record is already on the freelist via its
			 * left neighbour; nothing more to link in. */
			tdb1_unlock(tdb, -1, F_WRLCK);
			return 0;
		}
	}

update:
	/* Now, prepend to free list */
	rec->magic = TDB1_FREE_MAGIC;

	if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 ||
	    tdb1_rec_write(tdb, offset, rec) == -1 ||
	    tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
		tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
			   "tdb1_free record write failed at offset=%d",
			   offset);
		goto fail;
	}

	/* And we're done. */
	tdb->stats.frees++;
	tdb1_unlock(tdb, -1, F_WRLCK);
	return 0;

fail:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return -1;
}
/* allocate some space from the free list. The offset returned points
   to a unconnected tdb1_record within the database with room for at
   least length bytes of total data

   0 is returned if the space could not be allocated
 */
tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length,
			 struct tdb1_record *rec)
{
	tdb1_off_t rec_ptr, last_ptr, newrec_ptr;
	/* Best candidate seen so far plus its predecessor (needed to
	 * unlink it from the singly linked freelist). */
	struct {
		tdb1_off_t rec_ptr, last_ptr;
		tdb1_len_t rec_len;
	} bestfit;
	float multiplier = 1.0;

	if (tdb1_lock(tdb, -1, F_WRLCK) == -1)
		return 0;

	/* over-allocate to reduce fragmentation */
	length *= 1.25;

	/* Extra bytes required for tailer */
	length += sizeof(tdb1_off_t);
	length = TDB1_ALIGN(length, TDB1_ALIGNMENT);

 again:
	last_ptr = TDB1_FREELIST_TOP;

	/* read in the freelist top */
	if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1)
		goto fail;

	bestfit.rec_ptr = 0;
	bestfit.last_ptr = 0;
	bestfit.rec_len = 0;

	/*
	   this is a best fit allocation strategy. Originally we used
	   a first fit strategy, but it suffered from massive fragmentation
	   issues when faced with a slowly increasing record size.
	 */
	while (rec_ptr) {
		if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) {
			goto fail;
		}

		if (rec->rec_len >= length) {
			if (bestfit.rec_ptr == 0 ||
			    rec->rec_len < bestfit.rec_len) {
				bestfit.rec_len = rec->rec_len;
				bestfit.rec_ptr = rec_ptr;
				bestfit.last_ptr = last_ptr;
			}
		}

		/* move to the next record */
		last_ptr = rec_ptr;
		rec_ptr = rec->next;

		/* if we've found a record that is big enough, then
		   stop searching if its also not too big. The
		   definition of 'too big' changes as we scan
		   through */
		if (bestfit.rec_len > 0 &&
		    bestfit.rec_len < length * multiplier) {
			break;
		}

		/* this multiplier means we only extremely rarely
		   search more than 50 or so records. At 50 records we
		   accept records up to 11 times larger than what we
		   want */
		multiplier *= 1.05;
	}

	if (bestfit.rec_ptr != 0) {
		/* Re-read the chosen record: *rec was clobbered while
		 * scanning the rest of the list above. */
		if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
			goto fail;
		}

		newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr,
					       rec, bestfit.last_ptr);
		tdb1_unlock(tdb, -1, F_WRLCK);
		return newrec_ptr;
	}

	/* we didn't find enough space. See if we can expand the
	   database and if we can then try again */
	if (tdb1_expand(tdb, length + sizeof(*rec)) == 0)
		goto again;
 fail:
	tdb1_unlock(tdb, -1, F_WRLCK);
	return 0;
}