/*
 * Lock the byte range [off, off + len) piece by piece.
 *
 * Only individual bytes actually need locking, but Linux merges
 * consecutive locks, so contiguous ranges are locked instead.
 *
 * Strategy:
 *   - a range of at most 4 bytes (a single record) is locked directly
 *     using the caller's flags;
 *   - otherwise the whole range is first attempted with TDB_LOCK_WAIT
 *     cleared from the flags;
 *   - if that fails, the range is split in two and each half is locked
 *     recursively with the caller's original flags.
 *
 * Returns 0 on success and -1 on failure.  If the second half cannot
 * be locked, the first half is released before returning.
 */
static int tdb_chainlock_gradual(struct tdb_context *tdb,
				 int ltype, enum tdb_lock_flags flags,
				 size_t off, size_t len)
{
	const size_t first_len = len / 2;
	const size_t second_len = len - first_len;
	const enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);

	/* Single record: nothing to split, take the lock as requested. */
	if (len <= 4) {
		return tdb_brlock(tdb, ltype, off, len, flags);
	}

	/* Optimistic attempt on the whole range without waiting. */
	if (tdb_brlock(tdb, ltype, off, len, nb_flags) == 0) {
		return 0;
	}

	/* Fall back to locking the lower half ... */
	if (tdb_chainlock_gradual(tdb, ltype, flags, off, first_len) == -1) {
		return -1;
	}

	/* ... and then the upper half, undoing the lower half on failure. */
	if (tdb_chainlock_gradual(tdb, ltype, flags,
				  off + first_len, second_len) == -1) {
		tdb_brunlock(tdb, ltype, off, first_len);
		return -1;
	}

	return 0;
}
/*
 * Bump the tdb sequence number, but only when the database was opened
 * with the TDB_SEQNUM flag.
 *
 * The increment itself is delegated to tdb_increment_seqnum_nonblock()
 * and is bracketed by an F_WRLCK / F_UNLCK pair (both F_SETLKW) taken
 * at TDB_SEQNUM_OFS.  If the write lock cannot be obtained the function
 * returns without touching the sequence number.  The result of the
 * unlock call is not checked.
 */
static void tdb_increment_seqnum(struct tdb_context *tdb)
{
	if (tdb->flags & TDB_SEQNUM) {
		if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) == 0) {
			tdb_increment_seqnum_nonblock(tdb);

			tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
		}
	}
}
/*
 * Take a read lock on a single record so that it cannot be deleted
 * underneath the caller.
 *
 * Nothing is locked (and 0 is returned) when an all-record lock is
 * already held, or when off is 0.  Otherwise the result of a waiting
 * F_RDLCK request for one byte at off is returned.
 */
int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
{
	/* The all-record lock already covers every record. */
	if (tdb->allrecord_lock.count) {
		return 0;
	}

	/* Offset 0 means there is no record to protect. */
	if (!off) {
		return 0;
	}

	return tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT);
}
/*
 * Run transaction recovery while holding the locks it requires.
 *
 * Lock order must match the one used by transaction commit: first the
 * write lock at FREELIST_TOP (requested with length 0), then the write
 * lock on OPEN_LOCK.  Both are released in reverse order afterwards.
 *
 * Returns -1 if either lock cannot be taken, otherwise whatever
 * tdb_transaction_recover() returns.
 */
static int tdb_lock_and_recover(struct tdb_context *tdb)
{
	int ret = -1;

	if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT) != 0) {
		return -1;
	}

	if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT) != 0) {
		/* Only the first lock is held; ret is still -1. */
		goto unlock_freelist;
	}

	ret = tdb_transaction_recover(tdb);

	tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
unlock_freelist:
	tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
	return ret;
}
/*
 * Bump the tdb sequence number, but only when the database was opened
 * with the TDB_SEQNUM flag.
 *
 * The value stored at TDB_SEQNUM_OFS is read, incremented by one and
 * written back while an F_WRLCK (F_SETLKW) is held at that offset.  If
 * the lock cannot be obtained the function returns without modifying
 * anything.
 */
static void tdb_increment_seqnum(struct tdb_context *tdb)
{
	tdb_off_t current = 0;

	if ((tdb->flags & TDB_SEQNUM) == 0) {
		return;
	}

	if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) != 0) {
		return;
	}

	/*
	 * Read and write errors are deliberately ignored: there is no
	 * sane way of dealing with them here.  On a failed read the
	 * local keeps whatever tdb_ofs_read() left in it.
	 */
	tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &current);
	current += 1;
	tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &current);

	tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
}
/* lock an offset in the database. */ int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype, enum tdb_lock_flags flags) { struct tdb_lock_type *new_lck; if (offset >= lock_offset(tdb->hash_size)) { tdb->ecode = TDB_ERR_LOCK; TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n", offset, ltype)); return -1; } if (tdb->flags & TDB_NOLOCK) return 0; new_lck = find_nestlock(tdb, offset); if (new_lck) { /* * Just increment the in-memory struct, posix locks * don't stack. */ new_lck->count++; return 0; } if (tdb->num_lockrecs == tdb->lockrecs_array_length) { new_lck = (struct tdb_lock_type *)realloc( tdb->lockrecs, sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1)); if (new_lck == NULL) { errno = ENOMEM; return -1; } tdb->lockrecs_array_length = tdb->num_lockrecs+1; tdb->lockrecs = new_lck; } /* Since fcntl locks don't nest, we do a lock for the first one, and simply bump the count for future ones */ if (tdb_brlock(tdb, ltype, offset, 1, flags)) { return -1; } new_lck = &tdb->lockrecs[tdb->num_lockrecs]; new_lck->off = offset; new_lck->count = 1; new_lck->ltype = ltype; tdb->num_lockrecs++; return 0; }
/*
 * Try to take a write lock on a single record.
 *
 * A write lock would override this process's own fcntl read locks, so
 * the traverse lock list is checked first: if any entry refers to off,
 * the request is refused with -1.
 *
 * When an all-record lock is held the answer depends solely on its
 * type: 0 if it is F_WRLCK, -1 otherwise.
 *
 * Otherwise a non-waiting probe lock is attempted (F_SETLK rather than
 * F_SETLKW semantics, per the original note) because failing to get
 * the lock here is not an error.
 */
int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
{
	struct tdb_traverse_lock *trav = &tdb->travlocks;

	while (trav != NULL) {
		if (trav->off == off) {
			return -1;
		}
		trav = trav->next;
	}

	if (tdb->allrecord_lock.count) {
		return (tdb->allrecord_lock.ltype == F_WRLCK) ? 0 : -1;
	}

	return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
}
/*
 * Upgrade a read lock to a write lock.
 *
 * Some OSes (such as solaris) have overly conservative deadlock
 * detection and report EDEADLK even though progress can be made.  To
 * cope with that, the F_WRLCK request is retried up to 1000 times as
 * long as the failure is EDEADLK, with a minimal pause between tries.
 * Any other errno ends the retries immediately.
 *
 * Returns 0 once the lock is obtained; otherwise logs a trace message
 * and returns -1.
 */
int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len)
{
	int attempt;

	for (attempt = 0; attempt < 1000; attempt++) {
		struct timeval pause;

		if (tdb_brlock(tdb, offset, F_WRLCK, F_SETLKW, 1, len) == 0) {
			return 0;
		}

		if (errno != EDEADLK) {
			break;
		}

		/*
		 * Sleep for as short a time as possible; select() is
		 * used because it is more portable than usleep().
		 */
		pause.tv_sec = 0;
		pause.tv_usec = 1;
		select(0, NULL, NULL, NULL, &pause);
	}

	TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock_upgrade failed at offset %d\n", offset));
	return -1;
}
/*
 * Call tdb_brlock() with the given arguments, retrying on EDEADLK.
 *
 * The request is attempted at most 1000 times.  A retry only happens
 * when the failed attempt left errno set to EDEADLK; any other errno
 * ends the attempts immediately.  A minimal pause separates
 * consecutive attempts.
 *
 * Returns 0 as soon as an attempt succeeds, -1 otherwise.
 */
static int tdb_brlock_retry(struct tdb_context *tdb,
			    int rw_type, tdb_off_t offset, size_t len,
			    enum tdb_lock_flags flags)
{
	int remaining;

	for (remaining = 1000; remaining > 0; remaining--) {
		struct timeval pause;

		if (tdb_brlock(tdb, rw_type, offset, len, flags) == 0) {
			return 0;
		}

		if (errno != EDEADLK) {
			break;
		}

		/*
		 * Sleep for as short a time as possible; select() is
		 * used because it is more portable than usleep().
		 */
		pause.tv_sec = 0;
		pause.tv_usec = 1;
		select(0, NULL, NULL, NULL, &pause);
	}

	return -1;
}
/* lock/unlock entire database. It can only be upgradable if you have some * other way of guaranteeing exclusivity (ie. transaction write lock). * We do the locking gradually to avoid being starved by smaller locks. */ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, enum tdb_lock_flags flags, bool upgradable) { int ret; switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) { case -1: return -1; case 0: return 0; } /* We cover two kinds of locks: * 1) Normal chain locks. Taken for almost all operations. * 2) Individual records locks. Taken after normal or free * chain locks. * * It is (1) which cause the starvation problem, so we're only * gradual for that. */ if (tdb_have_mutexes(tdb)) { ret = tdb_mutex_allrecord_lock(tdb, ltype, flags); } else { ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, tdb->hash_size * 4); } if (ret == -1) { return -1; } /* Grab individual record locks. */ if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0, flags) == -1) { if (tdb_have_mutexes(tdb)) { tdb_mutex_allrecord_unlock(tdb); } else { tdb_brunlock(tdb, ltype, FREELIST_TOP, tdb->hash_size * 4); } return -1; } tdb->allrecord_lock.count = 1; /* If it's upgradable, it's actually exclusive so we can treat * it as a write lock. */ tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype; tdb->allrecord_lock.off = upgradable; if (tdb_needs_recovery(tdb)) { bool mark = flags & TDB_LOCK_MARK_ONLY; tdb_allrecord_unlock(tdb, ltype, mark); if (mark) { tdb->ecode = TDB_ERR_LOCK; TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall_mark cannot do recovery\n")); return -1; } if (tdb_lock_and_recover(tdb) == -1) { return -1; } return tdb_allrecord_lock(tdb, ltype, flags, upgradable); } return 0; }