/*
 * Actually delete an entry in the database, given its offset.
 *
 * tdb      - database context
 * rec_ptr  - file offset of the record to remove
 * rec      - in-memory copy of that record's header
 *
 * Returns -1 straight away when the context is read-only or in a read
 * traverse.  If the record's write lock cannot be taken (someone is
 * traversing here) the record is only re-marked with TDB_DEAD_MAGIC and
 * the result of writing that header back is returned.  Otherwise the
 * record is unlinked from its hash chain and passed to tdb_free();
 * 0 is returned on success and -1 on any failure.
 */
int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec)
{
	tdb_off_t chain_top;
	tdb_off_t prev_ofs = 0;
	tdb_off_t cur_ofs;
	struct list_struct walk;

	if (tdb->read_only || tdb->traverse_read) {
		return -1;
	}

	if (tdb_write_lock_record(tdb, rec_ptr) == -1) {
		/* Someone traversing here: mark it as dead */
		rec->magic = TDB_DEAD_MAGIC;
		return tdb_rec_write(tdb, rec_ptr, rec);
	}
	if (tdb_write_unlock_record(tdb, rec_ptr) != 0) {
		return -1;
	}

	/* Locate the predecessor of rec_ptr in its hash chain. */
	chain_top = TDB_HASH_TOP(rec->full_hash);
	if (tdb_ofs_read(tdb, chain_top, &cur_ofs) == -1) {
		return -1;
	}
	while (cur_ofs != rec_ptr) {
		if (tdb_rec_read(tdb, cur_ofs, &walk) == -1) {
			return -1;
		}
		prev_ofs = cur_ofs;
		cur_ofs = walk.next;
	}

	/*
	 * Unlink: the next pointer sits at the start of a record, so the
	 * predecessor's offset (or the chain head slot when the record was
	 * first in the chain) is where the successor must be written.
	 */
	if (prev_ofs == 0) {
		prev_ofs = chain_top;
	}
	if (tdb_ofs_write(tdb, prev_ofs, &rec->next) == -1) {
		return -1;
	}

	/* Give the space back to the free list. */
	return (tdb_free(tdb, rec_ptr, rec) == -1) ? -1 : 0;
}
/*
 * Update a record tailer (caller must hold the allocation lock).
 *
 * The tailer is the last tdb_off_t of the record's space and stores the
 * record's total size (header plus rec_len), i.e. the distance from the
 * tailer's end back to the record header.
 *
 * Returns whatever tdb_ofs_write() returns.
 */
static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
			 const struct tdb_record *rec)
{
	/* Header size plus payload length: the value stored in the tailer. */
	tdb_off_t rec_span = sizeof(*rec) + rec->rec_len;

	return tdb_ofs_write(tdb, offset + rec_span - sizeof(tdb_off_t),
			     &rec_span);
}
/*
 * Non-blocking increment of the tdb sequence number.
 *
 * Does nothing unless the tdb was opened with the TDB_SEQNUM flag.
 * No lock is taken here.
 */
_PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
{
	tdb_off_t counter = 0;

	if (tdb->flags & TDB_SEQNUM) {
		/*
		 * Errors from the read and the write are deliberately
		 * ignored: there is no sane way of dealing with them.
		 * A failed read leaves counter at 0, so 1 gets written.
		 */
		tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &counter);
		counter += 1;
		tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &counter);
	}
}
/*
 * The core of tdb_allocate - called once the free list entry to use has
 * been chosen.
 *
 * Space is carved from the END of the free record rather than from its
 * beginning, so that the left merge in a later free is more likely to
 * be able to release the record without fragmentation.
 *
 * tdb      - database context
 * length   - number of payload bytes wanted
 * rec_ptr  - offset of the chosen free record
 * rec      - in/out header of that record; on success it holds the
 *            header of the record handed to the caller
 * last_ptr - offset holding the link that points at rec_ptr
 *
 * Returns the offset of the allocated record, or 0 on failure.
 */
static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb,
				  tdb_len_t length, tdb_off_t rec_ptr,
				  struct tdb_record *rec, tdb_off_t last_ptr)
{
#define MIN_REC_SIZE (sizeof(struct tdb_record) + sizeof(tdb_off_t) + 8)

	tdb_off_t carved;

	if (rec->rec_len >= length + MIN_REC_SIZE) {
		/*
		 * Enough room to split: shrink the free record in place
		 * and rewrite its header and tailer.
		 */
		rec->rec_len -= (length + sizeof(*rec));
		if (tdb_rec_write(tdb, rec_ptr, rec) == -1 ||
		    update_tailer(tdb, rec_ptr, rec) == -1) {
			return 0;
		}

		/* The new record starts right after the shortened one. */
		carved = rec_ptr + sizeof(*rec) + rec->rec_len;

		memset(rec, 0, sizeof(*rec));
		rec->rec_len = length;
		rec->magic = TDB_MAGIC;

		if (tdb_rec_write(tdb, carved, rec) == -1 ||
		    update_tailer(tdb, carved, rec) == -1) {
			return 0;
		}
		return carved;
	}

	/*
	 * Too small to split: take the whole record.  Unlink it from the
	 * previous entry first ...
	 */
	if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
		return 0;
	}

	/* ... then mark it as no longer free. */
	rec->magic = TDB_MAGIC;
	if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
		return 0;
	}
	return rec_ptr;
}
/*
 * Remove an element from the freelist.  Must have alloc lock.
 *
 * tdb  - database context
 * off  - offset of the free record to unlink
 * next - offset the predecessor's link should point at afterwards
 *
 * Walks the list starting at FREELIST_TOP.  When the link that points
 * at `off` is found it is overwritten with `next` and the result of
 * that write is returned.  If the walk ends (end of list or a failed
 * read) without finding `off`, the error code is set to
 * TDB_ERR_CORRUPT, a fatal message is logged and -1 is returned.
 */
static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
{
	tdb_off_t last_ptr, i;

	/* read in the freelist top */
	last_ptr = FREELIST_TOP;
	while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
		if (i == off) {
			/* We've found it! */
			return tdb_ofs_write(tdb, last_ptr, &next);
		}
		/* Follow chain (next offset is at start of record) */
		last_ptr = i;
	}

	tdb->ecode = TDB_ERR_CORRUPT;
	/* Offsets are logged with %u, as in the other freelist messages. */
	TDB_LOG((tdb, TDB_DEBUG_FATAL,
		 "remove_from_freelist: not on list at off=%u\n", off));
	return -1;
}
/*
 * Increment the tdb sequence number.
 *
 * Does nothing unless the tdb was opened with the TDB_SEQNUM flag.
 * The update is wrapped in a write lock requested with F_SETLKW at
 * TDB_SEQNUM_OFS; if that lock cannot be obtained the sequence number
 * is left untouched.
 */
static void tdb_increment_seqnum(struct tdb_context *tdb)
{
	tdb_off_t counter = 0;

	if ((tdb->flags & TDB_SEQNUM) == 0) {
		return;
	}

	if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) == 0) {
		/*
		 * Errors from the read and the write are deliberately
		 * ignored: there is no sane way of dealing with them.
		 */
		tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &counter);
		counter += 1;
		tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &counter);

		tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
	}
}
/*
 * Add an element into the freelist.  Merge adjacent records if necessary.
 *
 * tdb    - database context
 * offset - file offset of the record being freed
 * rec    - in-memory header of that record; rec_len, magic and next may
 *          be modified here
 *
 * The allocation lock (list -1, F_WRLCK) is held for the duration of
 * the call and released on every exit path.
 *
 * Steps:
 *   1. write a tailer for the record, so a later failure does not leave
 *      a bogus record behind;
 *   2. (only when USE_RIGHT_MERGES is set) absorb a free right-hand
 *      neighbour into this record;
 *   3. if the left-hand neighbour is free, grow it over this record and
 *      return - the left record is already on the freelist;
 *   4. otherwise mark the record free and prepend it to the freelist.
 *
 * Returns 0 on success, -1 on failure.
 */
int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
	/* Allocation and tailer lock */
	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
		return -1;

	/* set an initial tailer, so if we fail we don't leave a bogus record */
	if (update_tailer(tdb, offset, rec) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
		goto fail;
	}

#if USE_RIGHT_MERGES
	/* Look right first (I'm an Australian, dammit) */
	if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) {
		tdb_off_t right = offset + sizeof(*rec) + rec->rec_len;
		struct tdb_record r;

		if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
			goto left;
		}

		/* If it's free, expand to include it. */
		if (r.magic == TDB_FREE_MAGIC) {
			if (remove_from_freelist(tdb, right, r.next) == -1) {
				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
				goto left;
			}
			rec->rec_len += sizeof(r) + r.rec_len;
			if (update_tailer(tdb, offset, rec) == -1) {
				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
				goto fail;
			}
		}
	}

left:
#endif

	/* Look left */
	if (offset - sizeof(tdb_off_t) > TDB_DATA_START(tdb->header.hash_size)) {
		tdb_off_t left = offset - sizeof(tdb_off_t);
		struct tdb_record l;
		tdb_off_t leftsize;

		/* Read in tailer and jump back to header */
		if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left offset read failed at %u\n", left));
			goto update;
		}

		/* it could be uninitialised data */
		if (leftsize == 0 || leftsize == TDB_PAD_U32) {
			goto update;
		}

		left = offset - leftsize;

		/* Reject tailer values that would point outside the data area. */
		if (leftsize > offset ||
		    left < TDB_DATA_START(tdb->header.hash_size)) {
			goto update;
		}

		/* Now read in the left record */
		if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
			goto update;
		}

		/* If it's free, expand to include it. */
		if (l.magic == TDB_FREE_MAGIC) {
			/* we now merge the new record into the left record, rather than the other
			   way around. This makes the operation O(1) instead of O(n). This change
			   prevents traverse from being O(n^2) after a lot of deletes */
			l.rec_len += sizeof(*rec) + rec->rec_len;
			if (tdb_rec_write(tdb, left, &l) == -1) {
				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_left failed at %u\n", left));
				goto fail;
			}
			if (update_tailer(tdb, left, &l) == -1) {
				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
				goto fail;
			}
			tdb_unlock(tdb, -1, F_WRLCK);
			return 0;
		}
	}

update:

	/* Now, prepend to free list */
	rec->magic = TDB_FREE_MAGIC;

	if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
	    tdb_rec_write(tdb, offset, rec) == -1 ||
	    tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
		/* Offsets are logged with %u, matching the messages above. */
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%u\n", offset));
		goto fail;
	}

	/* And we're done. */
	tdb_unlock(tdb, -1, F_WRLCK);
	return 0;

fail:
	tdb_unlock(tdb, -1, F_WRLCK);
	return -1;
}
/*
 * Store an element in the database, replacing any existing element with
 * the same key.
 *
 * tdb  - database context
 * key  - key to store under
 * dbuf - value to store
 * flag - TDB_INSERT fails with TDB_ERR_EXISTS when the key is already
 *        present; TDB_MODIFY fails when the key does not exist; any
 *        other value replaces or creates as needed
 *
 * Returns 0 on success, -1 on failure.  On success the sequence number
 * is bumped through tdb_increment_seqnum().
 */
int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
{
	struct list_struct rec;
	u32 hash;
	tdb_off_t slot;
	char *payload = NULL;
	int status = -1;

	if (tdb->read_only || tdb->traverse_read) {
		tdb->ecode = TDB_ERR_RDONLY;
		return -1;
	}

	/* Work out the hash bucket and lock its chain for writing. */
	hash = tdb->hash_fn(&key);
	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1) {
		return -1;
	}

	if (flag == TDB_INSERT) {
		/* A pure insert must not find the key already present. */
		if (tdb_exists_hash(tdb, key, hash)) {
			tdb->ecode = TDB_ERR_EXISTS;
			goto fail;
		}
	} else if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
		/* Modify or replace was satisfied by an in-place update. */
		goto done;
	} else if (tdb->ecode == TDB_ERR_NOEXIST && flag == TDB_MODIFY) {
		/* TDB_MODIFY on a record that doesn't exist fails the store. */
		goto fail;
	}

	/* Reset the error code potentially set by the update attempt. */
	tdb->ecode = TDB_SUCCESS;

	/*
	 * Delete any existing record - if it doesn't exist we don't care.
	 * Doing this first reduces fragmentation, and avoids coalescing
	 * with an `allocated' block before it's updated.
	 */
	if (flag != TDB_INSERT) {
		tdb_delete_hash(tdb, key, hash);
	}

	/*
	 * Copy key+value *before* allocating free space, in case malloc
	 * fails and we would be left with a dead spot in the tdb.
	 */
	payload = (char *)malloc(key.dsize + dbuf.dsize);
	if (payload == NULL) {
		tdb->ecode = TDB_ERR_OOM;
		goto fail;
	}
	memcpy(payload, key.dptr, key.dsize);
	if (dbuf.dsize) {
		memcpy(payload + key.dsize, dbuf.dptr, dbuf.dsize);
	}

	if (tdb->max_dead_records != 0) {
		/*
		 * Some dead records are allowed per hash chain; look for
		 * one that can hold key, data and tailer.  If one is found
		 * the central freelist need not be consulted.
		 */
		slot = tdb_find_dead(tdb, hash, &rec,
				     key.dsize + dbuf.dsize + sizeof(tdb_off_t));
		if (slot != 0) {
			rec.key_len = key.dsize;
			rec.data_len = dbuf.dsize;
			rec.full_hash = hash;
			rec.magic = TDB_MAGIC;

			if (tdb_rec_write(tdb, slot, &rec) == -1) {
				goto fail;
			}
			if (tdb->methods->tdb_write(tdb, slot + sizeof(rec),
						    payload,
						    key.dsize + dbuf.dsize) == -1) {
				goto fail;
			}
			goto done;
		}
	}

	/*
	 * Space has to come from the freelist, which means locking it.
	 * Use the chance to purge all the DEAD records from the hash
	 * chain while the freelist lock is held.
	 */
	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
		goto fail;
	}

	if (tdb->max_dead_records != 0 && tdb_purge_dead(tdb, hash) == -1) {
		tdb_unlock(tdb, -1, F_WRLCK);
		goto fail;
	}

	/* Allocate the space, then drop the freelist lock again. */
	slot = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
	tdb_unlock(tdb, -1, F_WRLCK);

	if (slot == 0) {
		goto fail;
	}

	/* The current chain head becomes the new record's successor. */
	if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1) {
		goto fail;
	}

	rec.key_len = key.dsize;
	rec.data_len = dbuf.dsize;
	rec.full_hash = hash;
	rec.magic = TDB_MAGIC;

	/* Write header and payload, then point the chain head at the record. */
	if (tdb_rec_write(tdb, slot, &rec) == -1 ||
	    tdb->methods->tdb_write(tdb, slot + sizeof(rec), payload,
				    key.dsize + dbuf.dsize) == -1 ||
	    tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &slot) == -1) {
		/* Need to tdb_unallocate() here */
		goto fail;
	}

done:
	/* Only the success path reaches this label. */
	status = 0;
	tdb_increment_seqnum(tdb);

fail:
	SAFE_FREE(payload);
	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
	return status;
}
/* wipe the entire database, deleting all records. This can be done very fast by using a global lock. The entire data portion of the file becomes a single entry in the freelist. This code carefully steps around the recovery area, leaving it alone */ int tdb_wipe_all(struct tdb_context *tdb) { int i; tdb_off_t offset = 0; ssize_t data_len; tdb_off_t recovery_head; tdb_len_t recovery_size = 0; if (tdb_lockall(tdb) != 0) { return -1; } /* see if the tdb has a recovery area, and remember its size if so. We don't want to lose this as otherwise each tdb_wipe_all() in a transaction will increase the size of the tdb by the size of the recovery area */ if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n")); goto failed; } if (recovery_head != 0) { struct list_struct rec; if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n")); return -1; } recovery_size = rec.rec_len + sizeof(rec); } /* wipe the hashes */ for (i=0;i<tdb->header.hash_size;i++) { if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i)); goto failed; } } /* wipe the freelist */ if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) { TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n")); goto failed; } /* add all the rest of the file to the freelist, possibly leaving a gap for the recovery area */ if (recovery_size == 0) { /* the simple case - the whole file can be used as a freelist */ data_len = (tdb->map_size - TDB_DATA_START(tdb->header.hash_size)); if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) { goto failed; } } else { /* we need to add two freelist entries - one on either side of the recovery area Note that we cannot shift the recovery area during this operation. 
Only the transaction.c code may move the recovery area or we risk subtle data corruption */ data_len = (recovery_head - TDB_DATA_START(tdb->header.hash_size)); if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) { goto failed; } /* and the 2nd free list entry after the recovery area - if any */ data_len = tdb->map_size - (recovery_head+recovery_size); if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) { goto failed; } } if (tdb_unlockall(tdb) != 0) { TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n")); goto failed; } return 0; failed: tdb_unlockall(tdb); return -1; }