/*! \brief * Called at initialization to figure out version of the dbase we really have. * * This routine is called after replaying the log; it reads the restored labels. */ static int InitializeDB(struct ubik_dbase *adbase) { afs_int32 code; code = (*adbase->getlabel) (adbase, 0, &adbase->version); if (code) { /* try setting the label to a new value */ UBIK_VERSION_LOCK; adbase->version.epoch = 1; /* value for newly-initialized db */ adbase->version.counter = 1; code = (*adbase->setlabel) (adbase, 0, &adbase->version); if (code) { /* failed, try to set it back */ adbase->version.epoch = 0; adbase->version.counter = 0; (*adbase->setlabel) (adbase, 0, &adbase->version); } #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&adbase->version_cond); #else LWP_NoYieldSignal(&adbase->version); #endif UBIK_VERSION_UNLOCK; } return 0; }
/** * low-level thread entry point. * * @param[in] rock opaque pointer to thread worker object * * @return opaque return pointer from pool entry function * * @internal */ static void * _afs_tp_worker_run(void * rock) { struct afs_thread_pool_worker * worker = rock; struct afs_thread_pool * pool = worker->pool; /* register worker with pool */ MUTEX_ENTER(&pool->lock); queue_Append(&pool->thread_list, worker); pool->nthreads++; MUTEX_EXIT(&pool->lock); /* call high-level entry point */ worker->ret = (*pool->entry)(pool, worker, pool->work_queue, pool->rock); /* adjust pool live thread count */ MUTEX_ENTER(&pool->lock); osi_Assert(pool->nthreads); queue_Remove(worker); pool->nthreads--; if (!pool->nthreads) { CV_BROADCAST(&pool->shutdown_cv); pool->state = AFS_TP_STATE_STOPPED; } MUTEX_EXIT(&pool->lock); _afs_tp_worker_free(worker); return NULL; }
afs_int32 canWrite(int fid) { #ifndef AFS_PTHREAD_ENV afs_int32 code = 0; #endif extern dumpSyncP dumpSyncPtr; ObtainWriteLock(&dumpSyncPtr->ds_lock); /* let the pipe drain */ while (dumpSyncPtr->ds_bytes > 0) { if (dumpSyncPtr->ds_readerStatus == DS_WAITING) { dumpSyncPtr->ds_readerStatus = 0; #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&dumpSyncPtr->ds_readerStatus_cond); #else code = LWP_SignalProcess(&dumpSyncPtr->ds_readerStatus); if (code) LogError(code, "canWrite: Signal delivery failed\n"); #endif } dumpSyncPtr->ds_writerStatus = DS_WAITING; ReleaseWriteLock(&dumpSyncPtr->ds_lock); #ifdef AFS_PTHREAD_ENV MUTEX_ENTER(&dumpSyncPtr->ds_writerStatus_mutex); CV_WAIT(&dumpSyncPtr->ds_writerStatus_cond, &dumpSyncPtr->ds_writerStatus_mutex); MUTEX_EXIT(&dumpSyncPtr->ds_writerStatus_mutex); #else LWP_WaitProcess(&dumpSyncPtr->ds_writerStatus); #endif ObtainWriteLock(&dumpSyncPtr->ds_lock); } return (1); }
/** * wakeup all threads waiting in dequeue. * * @param[in] list list object * * @return operation status * @retval 0 success * * @internal */ static int _afs_wq_node_list_shutdown(struct afs_work_queue_node_list * list) { int ret = 0; struct afs_work_queue_node *node, *nnode; MUTEX_ENTER(&list->lock); list->shutdown = 1; for (queue_Scan(&list->list, node, nnode, afs_work_queue_node)) { _afs_wq_node_state_change(node, AFS_WQ_NODE_STATE_ERROR); queue_Remove(node); node->qidx = AFS_WQ_NODE_LIST_NONE; node->queue = NULL; if (node->detached) { /* if we are detached, we hold the reference on the node; * otherwise, it is some other caller that holds the reference. * So don't put the node if we are not detached; the node will * get freed when someone else calls afs_wq_node_put */ afs_wq_node_put(node); } } CV_BROADCAST(&list->cv); MUTEX_EXIT(&list->lock); return ret; }
/**
 * drop queue->running_count by one, waking waiters when it matters.
 *
 * The broadcast on running_cv is only issued when the queue has been shut
 * down and the count has just reached zero.
 *
 * @param[in] queue  queue whose running count is decremented
 */
static void
_afs_wq_dec_running_count(struct afs_work_queue *queue)
{
    MUTEX_ENTER(&queue->lock);

    --queue->running_count;
    if (queue->running_count == 0 && queue->shutdown) {
        /* after shutdown, someone may be waiting for the last running
         * node to finish */
        CV_BROADCAST(&queue->running_cv);
    }

    MUTEX_EXIT(&queue->lock);
}
/**
 * change a node's state.
 *
 * Stores the new state and broadcasts on the node's state condition
 * variable (unconditionally, even if the state value did not change).
 *
 * @param[in] node       node object
 * @param[in] new_state  new object state
 *
 * @return old state
 *
 * @pre node->lock held
 *
 * @internal
 */
static afs_wq_work_state_t
_afs_wq_node_state_change(struct afs_work_queue_node * node,
                          afs_wq_work_state_t new_state)
{
    afs_wq_work_state_t previous = node->state;

    node->state = new_state;
    CV_BROADCAST(&node->state_cv);

    return previous;
}
static void * SalvageChildReaperThread(void * args) { int slot, pid, status; struct log_cleanup_node * cleanup; MUTEX_ENTER(&worker_lock); /* loop reaping our children */ while (1) { /* wait() won't block unless we have children, so * block on the cond var if we're childless */ while (current_workers == 0) { CV_WAIT(&worker_cv, &worker_lock); } MUTEX_EXIT(&worker_lock); cleanup = (struct log_cleanup_node *) malloc(sizeof(struct log_cleanup_node)); while (Reap_Child("salvageserver", &pid, &status) < 0) { /* try to prevent livelock if something goes wrong */ sleep(1); } VOL_LOCK; for (slot = 0; slot < Parallel; slot++) { if (child_slot[slot] == pid) break; } osi_Assert(slot < Parallel); child_slot[slot] = 0; VOL_UNLOCK; SALVSYNC_doneWorkByPid(pid, status); MUTEX_ENTER(&worker_lock); if (cleanup) { cleanup->pid = pid; queue_Append(&log_cleanup_queue, cleanup); CV_SIGNAL(&log_cleanup_queue.queue_change_cv); } /* ok, we've reaped a child */ current_workers--; CV_BROADCAST(&worker_cv); } return NULL; }
/**
 * shutdown a work queue.
 *
 * Marks the queue as shut down, shuts down its ready, blocked and done
 * node lists in that order, and then wakes every thread that could be
 * waiting on the pending or empty conditions.  Calling this on a queue
 * that is already shut down is a no-op.
 *
 * queue->lock is taken on entry and is released on every return path,
 * including the paths where shutting down one of the node lists fails.
 *
 * @param[in] queue  work queue object pointer
 *
 * @return operation status
 *    @retval 0 success (or queue was already shut down)
 *    @retval nonzero error returned by _afs_wq_node_list_shutdown(); the
 *            remaining lists are not processed and no waiters are woken
 */
int
afs_wq_shutdown(struct afs_work_queue * queue)
{
    int ret = 0;

    MUTEX_ENTER(&queue->lock);

    if (queue->shutdown) {
        /* already shutdown, do nothing */
        goto done;
    }
    queue->shutdown = 1;

    ret = _afs_wq_node_list_shutdown(&queue->ready_list);
    if (ret) {
        goto done;
    }

    ret = _afs_wq_node_list_shutdown(&queue->blocked_list);
    if (ret) {
        goto done;
    }

    ret = _afs_wq_node_list_shutdown(&queue->done_list);
    if (ret) {
        goto done;
    }

    /* signal everyone that could be waiting, since these conditions will
     * generally fail to signal on their own if we're shutdown, since no
     * progress is being made */
    CV_BROADCAST(&queue->pend_cv);
    CV_BROADCAST(&queue->empty_cv);

 done:
    /* single exit point so the queue lock can never be leaked */
    MUTEX_EXIT(&queue->lock);
    return ret;
}
/**
 * change VVGC partition state.
 *
 * Records the new state for the partition's cache entry and, only when
 * the state value actually changed, broadcasts on that entry's condition
 * variable.
 *
 * @param[in] part   disk partition object
 * @param[in] state  new state
 *
 * @pre VOL_LOCK is held
 *
 * @return old state
 *
 * @internal
 */
int
_VVGC_state_change(struct DiskPartition64 * part,
                   VVGCache_part_state_t state)
{
    VVGCache_part_state_t previous = VVGCache.part[part->index].state;

    VVGCache.part[part->index].state = state;

    if (previous == state) {
        /* nothing changed, so there is nobody to wake */
        return previous;
    }

    CV_BROADCAST(&VVGCache.part[part->index].cv);
    return previous;
}
/**
 * Account for bytes just written to the dump pipe and wake the reader.
 *
 * Adds nbytes to dumpSyncPtr->ds_bytes, wakes the reader if it is marked
 * DS_WAITING, and releases the write lock on dumpSyncPtr->ds_lock.
 *
 * @param[in] nbytes  number of bytes that were written
 *
 * @note the lock released here is not acquired in this routine;
 *       presumably it is the one canWrite() leaves held -- confirm
 *       against the callers
 */
void
haveWritten(afs_int32 nbytes)
{
#ifndef AFS_PTHREAD_ENV
    afs_int32 sigcode = 0;
#endif
    extern dumpSyncP dumpSyncPtr;

    dumpSyncPtr->ds_bytes += nbytes;

    if (dumpSyncPtr->ds_readerStatus != DS_WAITING) {
        /* reader is not asleep; just hand the lock back */
        ReleaseWriteLock(&dumpSyncPtr->ds_lock);
        return;
    }

    dumpSyncPtr->ds_readerStatus = 0;
#ifdef AFS_PTHREAD_ENV
    CV_BROADCAST(&dumpSyncPtr->ds_readerStatus_cond);
#else
    sigcode = LWP_SignalProcess(&dumpSyncPtr->ds_readerStatus);
    if (sigcode)
        LogError(sigcode, "haveWritten: Signal delivery failed\n");
#endif
    ReleaseWriteLock(&dumpSyncPtr->ds_lock);
}
void doneWriting(afs_int32 error) { #ifndef AFS_PTHREAD_ENV afs_int32 code = 0; #endif /* wait for the reader */ ObtainWriteLock(&dumpSyncPtr->ds_lock); while (dumpSyncPtr->ds_readerStatus != DS_WAITING) { LogDebug(4, "doneWriting: waiting for Reader\n"); dumpSyncPtr->ds_writerStatus = DS_WAITING; ReleaseWriteLock(&dumpSyncPtr->ds_lock); #ifdef AFS_PTHREAD_ENV MUTEX_ENTER(&dumpSyncPtr->ds_writerStatus_mutex); CV_WAIT(&dumpSyncPtr->ds_writerStatus_cond, &dumpSyncPtr->ds_writerStatus_mutex); MUTEX_EXIT(&dumpSyncPtr->ds_writerStatus_mutex); #else LWP_WaitProcess(&dumpSyncPtr->ds_writerStatus); #endif ObtainWriteLock(&dumpSyncPtr->ds_lock); } LogDebug(4, "doneWriting: setting done\n"); /* signal that we are done */ if (error) dumpSyncPtr->ds_writerStatus = DS_DONE_ERROR; else dumpSyncPtr->ds_writerStatus = DS_DONE; dumpSyncPtr->ds_readerStatus = 0; #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&dumpSyncPtr->ds_readerStatus_cond); #else code = LWP_NoYieldSignal(&dumpSyncPtr->ds_readerStatus); if (code) LogError(code, "doneWriting: Signal delivery failed\n"); #endif ReleaseWriteLock(&dumpSyncPtr->ds_lock); }
/*!
 * \brief Main interaction loop for the recovery manager
 *
 * The recovery light-weight process only runs when you're the
 * synchronization site.  It performs the following tasks, if and only
 * if the prerequisite tasks have been performed successfully (it
 * keeps track of which ones have been performed in its bit map,
 * \p urecovery_state).
 *
 * First, it is responsible for probing that all servers are up.  This
 * is the only operation that must be performed even if this is not
 * yet the sync site, since otherwise this site may not notice that
 * enough other machines are running to even elect this guy to be the
 * sync site.
 *
 * After that, the recovery process does nothing until the beacon and
 * voting modules manage to get this site elected sync site.
 *
 * After becoming sync site, recovery first attempts to find the best
 * database available in the network (it must do this in order to
 * ensure finding the latest committed data).  After finding the right
 * database, it must fetch this dbase to the sync site.
 *
 * After fetching the dbase, it relabels it with a new version number,
 * to ensure that everyone recognizes this dbase as the most recent
 * dbase.
 *
 * Once the dbase has been relabelled, this machine can start handling
 * requests.  However, the recovery module still has one more task:
 * propagating the dbase out to everyone who is up in the network.
 *
 * \param[in] dummy  unused
 *
 * \return NULL (not reached; the loop below never terminates)
 */
void *
urecovery_Interact(void *dummy)
{
    afs_int32 code, tcode;
    struct ubik_server *bestServer = NULL;
    struct ubik_server *ts;
    int dbok, doingRPC, now;
    afs_int32 lastProbeTime;
    /* if we're the sync site, the best db version we've found yet */
    static struct ubik_version bestDBVersion;
    struct ubik_version tversion;
    struct timeval tv;
    int length, tlen, offset, file, nbytes;
    struct rx_call *rxcall;
    char tbuffer[1024];
    struct ubik_stat ubikstat;
    struct in_addr inAddr;
    char hoststr[16];
    char pbuffer[1028];
    int fd = -1;
    afs_int32 pass;

    afs_pthread_setname_self("recovery");

    /* otherwise, begin interaction */
    urecovery_state = 0;
    lastProbeTime = 0;
    while (1) {
        /* Run through this loop every 4 seconds */
        tv.tv_sec = 4;
        tv.tv_usec = 0;
#ifdef AFS_PTHREAD_ENV
        select(0, 0, 0, 0, &tv);
#else
        IOMGR_Select(0, 0, 0, 0, &tv);
#endif

        ubik_dprint("recovery running in state %x\n", urecovery_state);

        /* Every 30 seconds, check all the down servers and mark them
         * as up if they respond. When a server comes up or is found to
         * not be current, then re-find the best database and
         * propagate it.
         */
        if ((now = FT_ApproxTime()) > 30 + lastProbeTime) {

            for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) {
                UBIK_BEACON_LOCK;
                if (!ts->up) {
                    /* the beacon lock is dropped before the probe RPC */
                    UBIK_BEACON_UNLOCK;
                    doingRPC = 1;
                    code = DoProbe(ts);
                    if (code == 0) {
                        UBIK_BEACON_LOCK;
                        ts->up = 1;
                        UBIK_BEACON_UNLOCK;
                        /* a server came back: force a new best-db search */
                        DBHOLD(ubik_dbase);
                        urecovery_state &= ~UBIK_RECFOUNDDB;
                        DBRELE(ubik_dbase);
                    }
                } else {
                    UBIK_BEACON_UNLOCK;
                    DBHOLD(ubik_dbase);
                    if (!ts->currentDB)
                        urecovery_state &= ~UBIK_RECFOUNDDB;
                    DBRELE(ubik_dbase);
                }
            }

            /* probing may have taken a while, so refresh the timestamp */
            if (doingRPC)
                now = FT_ApproxTime();
            lastProbeTime = now;
        }

        /* Mark whether we are the sync site */
        DBHOLD(ubik_dbase);
        if (!ubeacon_AmSyncSite()) {
            urecovery_state &= ~UBIK_RECSYNCSITE;
            DBRELE(ubik_dbase);
            continue;           /* nothing to do */
        }
        urecovery_state |= UBIK_RECSYNCSITE;

        /* If a server has just come up or if we have not found the
         * most current database, then go find the most current db.
         */
        if (!(urecovery_state & UBIK_RECFOUNDDB)) {
            /* the database is released while the other sites are polled */
            DBRELE(ubik_dbase);
            bestServer = (struct ubik_server *)0;
            bestDBVersion.epoch = 0;
            bestDBVersion.counter = 0;
            for (ts = ubik_servers; ts; ts = ts->next) {
                UBIK_BEACON_LOCK;
                if (!ts->up) {
                    UBIK_BEACON_UNLOCK;
                    continue;   /* don't bother with these guys */
                }
                UBIK_BEACON_UNLOCK;
                if (ts->isClone)
                    continue;
                UBIK_ADDR_LOCK;
                code = DISK_GetVersion(ts->disk_rxcid, &ts->version);
                UBIK_ADDR_UNLOCK;
                if (code == 0) {
                    /* perhaps this is the best version */
                    if (vcmp(ts->version, bestDBVersion) > 0) {
                        /* new best version */
                        bestDBVersion = ts->version;
                        bestServer = ts;
                    }
                }
            }
            /* take into consideration our version. Remember if we,
             * the sync site, have the best version. Also note that
             * we may need to send the best version out.
             */
            DBHOLD(ubik_dbase);
            if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) {
                bestDBVersion = ubik_dbase->version;
                bestServer = (struct ubik_server *)0;
                urecovery_state |= UBIK_RECHAVEDB;
            } else {
                /* Clear the flag only when we know we have to retrieve
                 * the db. Because urecovery_AllBetter() looks at it.
                 */
                urecovery_state &= ~UBIK_RECHAVEDB;
            }
            urecovery_state |= UBIK_RECFOUNDDB;
            urecovery_state &= ~UBIK_RECSENTDB;
        }
        /* NOTE(review): UBIK_RECFOUNDDB was either already set or has just
         * been set by the block above, so this test looks like it can only
         * fire if the flag is cleared elsewhere in between -- confirm. */
        if (!(urecovery_state & UBIK_RECFOUNDDB)) {
            DBRELE(ubik_dbase);
            continue;           /* not ready */
        }

        /* If we, the sync site, do not have the best db version, then
         * go and get it from the server that does.
         */
        if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) {
            urecovery_state |= UBIK_RECHAVEDB;
        } else {
            /* we don't have the best version; we should fetch it. */
            urecovery_AbortAll(ubik_dbase);

            /* Rx code to do the Bulk fetch */
            file = 0;
            offset = 0;
            UBIK_ADDR_LOCK;
            rxcall = rx_NewCall(bestServer->disk_rxcid);

            ubik_print("Ubik: Synchronize database with server %s\n",
                       afs_inet_ntoa_r(bestServer->addr[0], hoststr));
            UBIK_ADDR_UNLOCK;

            code = StartDISK_GetFile(rxcall, file);
            if (code) {
                ubik_dprint("StartDiskGetFile failed=%d\n", code);
                goto FetchEndCall;
            }
            /* the reply starts with the file length in network byte order */
            nbytes = rx_Read(rxcall, (char *)&length, sizeof(afs_int32));
            length = ntohl(length);
            if (nbytes != sizeof(afs_int32)) {
                /* BULK_ERROR is what gets logged; EIO is what is kept */
                ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR);
                code = EIO;
                goto FetchEndCall;
            }

            /* give invalid label during file transit */
            UBIK_VERSION_LOCK;
            tversion.epoch = 0;
            code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion);
            UBIK_VERSION_UNLOCK;
            if (code) {
                ubik_dprint("setlabel io error=%d\n", code);
                goto FetchEndCall;
            }
            /* the incoming data is written to a .TMP file next to the db */
            snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
                     ubik_dbase->pathName, (file<0)?"SYS":"",
                     (file<0)?-file:file);
            fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
            if (fd < 0) {
                code = errno;
                goto FetchEndCall;
            }
            /* skip HDRSIZE bytes; whence 0 is an absolute offset */
            code = lseek(fd, HDRSIZE, 0);
            if (code != HDRSIZE) {
                /* NOTE(review): code holds lseek()'s return value here,
                 * not an errno value. */
                close(fd);
                goto FetchEndCall;
            }

            pass = 0;
            while (length > 0) {
                /* copy in chunks of at most sizeof(tbuffer) bytes */
                tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
#ifndef AFS_PTHREAD_ENV
                if (pass % 4 == 0)
                    IOMGR_Poll();
#endif
                nbytes = rx_Read(rxcall, tbuffer, tlen);
                if (nbytes != tlen) {
                    ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR);
                    code = EIO;
                    close(fd);
                    goto FetchEndCall;
                }
                nbytes = write(fd, tbuffer, tlen);
                pass++;
                if (nbytes != tlen) {
                    code = UIOERROR;
                    close(fd);
                    goto FetchEndCall;
                }
                offset += tlen;
                length -= tlen;
            }
            code = close(fd);
            if (code)
                goto FetchEndCall;
            code = EndDISK_GetFile(rxcall, &tversion);
          FetchEndCall:
            /* the first error wins; otherwise take rx_EndCall's result */
            tcode = rx_EndCall(rxcall, code);
            if (!code)
                code = tcode;
            if (!code) {
                /* we got a new file, set up its header */
                urecovery_state |= UBIK_RECHAVEDB;
                UBIK_VERSION_LOCK;
                memcpy(&ubik_dbase->version, &tversion,
                       sizeof(struct ubik_version));
                /* tbuffer is reused here to hold the final db file name */
                snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
                         ubik_dbase->pathName, (file<0)?"SYS":"",
                         (file<0)?-file:file);
#ifdef AFS_NT40_ENV
                /* remove any .OLD file, move the current db to .OLD, then
                 * point pbuffer back at the .TMP file */
                snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
                         ubik_dbase->pathName, (file<0)?"SYS":"",
                         (file<0)?-file:file);
                code = unlink(pbuffer);
                if (!code)
                    code = rename(tbuffer, pbuffer);
                snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
                         ubik_dbase->pathName, (file<0)?"SYS":"",
                         (file<0)?-file:file);
#endif
                /* move the fetched .TMP file into place */
                if (!code)
                    code = rename(pbuffer, tbuffer);
                if (!code) {
                    (*ubik_dbase->open) (ubik_dbase, file);
                    /* after data is good, sync disk with correct label */
                    code = (*ubik_dbase->setlabel) (ubik_dbase, 0,
                                                    &ubik_dbase->version);
                }
                UBIK_VERSION_UNLOCK;
#ifdef AFS_NT40_ENV
                snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
                         ubik_dbase->pathName, (file<0)?"SYS":"",
                         (file<0)?-file:file);
                unlink(pbuffer);
#endif
            }
            if (code) {
                /* NOTE(review): the jumps to FetchEndCall that happen
                 * before pbuffer is filled in reach this unlink() with
                 * whatever pbuffer holds (uninitialized on the first pass,
                 * a name from an earlier pass afterwards) -- worth
                 * confirming and guarding. */
                unlink(pbuffer);
                /*
                 * We will effectively invalidate the old data forever now.
                 * Unclear if we *should* but we do.
                 */
                UBIK_VERSION_LOCK;
                ubik_dbase->version.epoch = 0;
                ubik_dbase->version.counter = 0;
                UBIK_VERSION_UNLOCK;
                ubik_print("Ubik: Synchronize database failed (error = %d)\n",
                           code);
            } else {
                ubik_print("Ubik: Synchronize database completed\n");
                urecovery_state |= UBIK_RECHAVEDB;
            }
            udisk_Invalidate(ubik_dbase, 0);    /* data has changed */
#ifdef AFS_PTHREAD_ENV
            CV_BROADCAST(&ubik_dbase->version_cond);
#else
            LWP_NoYieldSignal(&ubik_dbase->version);
#endif
        }
        if (!(urecovery_state & UBIK_RECHAVEDB)) {
            DBRELE(ubik_dbase);
            continue;           /* not ready */
        }

        /* If the database was newly initialized, then when we establish quorum, write
         * a new label. This allows urecovery_AllBetter() to allow access for reads.
         * Setting it to 2 also allows another site to come along with a newer
         * database and overwrite this one.
         */
        if (ubik_dbase->version.epoch == 1) {
            urecovery_AbortAll(ubik_dbase);
            UBIK_VERSION_LOCK;
            version_globals.ubik_epochTime = 2;
            ubik_dbase->version.epoch = version_globals.ubik_epochTime;
            ubik_dbase->version.counter = 1;
            /* NOTE(review): the result of this setlabel is stored in code
             * but not examined before code is next overwritten. */
            code =
                (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version);
            UBIK_VERSION_UNLOCK;
            udisk_Invalidate(ubik_dbase, 0);    /* data may have changed */
#ifdef AFS_PTHREAD_ENV
            CV_BROADCAST(&ubik_dbase->version_cond);
#else
            LWP_NoYieldSignal(&ubik_dbase->version);
#endif
        }

        /* Check the other sites and send the database to them if they
         * do not have the current db.
         */
        if (!(urecovery_state & UBIK_RECSENTDB)) {
            /* now propagate out new version to everyone else */
            dbok = 1;           /* start off assuming they all worked */

            /*
             * Check if a write transaction is in progress. We can't send the
             * db when a write is in progress here because the db would be
             * obsolete as soon as it goes there. Also, ops after the begin
             * trans would reach the recipient and wouldn't find a transaction
             * pending there.  Frankly, I don't think it's possible to get past
             * the write-lock above if there is a write transaction in progress,
             * but then, it won't hurt to check, will it?
             */
            if (ubik_dbase->flags & DBWRITING) {
                /* this tv shadows the function-level tv declared above */
                struct timeval tv;
                int safety = 0;
                long cur_usec = 50000;
                /* poll up to 500 times; the sleep starts at 50000 usec and
                 * grows by 10000 usec on each pass */
                while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) {
                    DBRELE(ubik_dbase);
                    /* sleep for a little while */
                    tv.tv_sec = 0;
                    tv.tv_usec = cur_usec;
#ifdef AFS_PTHREAD_ENV
                    select(0, 0, 0, 0, &tv);
#else
                    IOMGR_Select(0, 0, 0, 0, &tv);
#endif
                    cur_usec += 10000;
                    safety++;
                    DBHOLD(ubik_dbase);
                }
            }

            for (ts = ubik_servers; ts; ts = ts->next) {
                /* copy the address under the lock; it is used for logging */
                UBIK_ADDR_LOCK;
                inAddr.s_addr = ts->addr[0];
                UBIK_ADDR_UNLOCK;
                UBIK_BEACON_LOCK;
                if (!ts->up) {
                    UBIK_BEACON_UNLOCK;
                    ubik_dprint("recovery cannot send version to %s\n",
                                afs_inet_ntoa_r(inAddr.s_addr, hoststr));
                    dbok = 0;
                    continue;
                }
                UBIK_BEACON_UNLOCK;
                ubik_dprint("recovery sending version to %s\n",
                            afs_inet_ntoa_r(inAddr.s_addr, hoststr));
                if (vcmp(ts->version, ubik_dbase->version) != 0) {
                    ubik_dprint("recovery stating local database\n");

                    /* Rx code to do the Bulk Store */
                    code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat);
                    if (!code) {
                        length = ubikstat.size;
                        file = offset = 0;
                        UBIK_ADDR_LOCK;
                        rxcall = rx_NewCall(ts->disk_rxcid);
                        UBIK_ADDR_UNLOCK;
                        code =
                            StartDISK_SendFile(rxcall, file, length,
                                               &ubik_dbase->version);
                        if (code) {
                            ubik_dprint("StartDiskSendFile failed=%d\n",
                                        code);
                            goto StoreEndCall;
                        }
                        while (length > 0) {
                            /* send in chunks of at most sizeof(tbuffer) */
                            tlen =
                                (length >
                                 sizeof(tbuffer) ? sizeof(tbuffer) : length);
                            nbytes =
                                (*ubik_dbase->read) (ubik_dbase, file,
                                                     tbuffer, offset, tlen);
                            if (nbytes != tlen) {
                                ubik_dprint("Local disk read error=%d\n",
                                            code = UIOERROR);
                                goto StoreEndCall;
                            }
                            nbytes = rx_Write(rxcall, tbuffer, tlen);
                            if (nbytes != tlen) {
                                ubik_dprint("Rx-write bulk error=%d\n", code =
                                            BULK_ERROR);
                                goto StoreEndCall;
                            }
                            offset += tlen;
                            length -= tlen;
                        }
                        code = EndDISK_SendFile(rxcall);
                      StoreEndCall:
                        /* here code is replaced by rx_EndCall's result */
                        code = rx_EndCall(rxcall, code);
                    }
                    if (code == 0) {
                        /* we set a new file, process its header */
                        ts->version = ubik_dbase->version;
                        ts->currentDB = 1;
                    } else
                        dbok = 0;
                } else {
                    /* mark file up to date */
                    ts->currentDB = 1;
                }
            }
            /* only record "sent" if every site was reached successfully */
            if (dbok)
                urecovery_state |= UBIK_RECSENTDB;
        }
        DBRELE(ubik_dbase);
    }
    return NULL;
}
/** * execute a node on the queue. * * @param[in] queue work queue * @param[in] rock opaque pointer (passed as third arg to callback func) * @param[in] block allow blocking in dequeue * * @return operation status * @retval 0 completed a work unit * * @internal */ static int _afs_wq_do(struct afs_work_queue * queue, void * rock, int block) { int code, ret = 0; struct afs_work_queue_node * node; afs_wq_callback_func_t * cbf; afs_wq_work_state_t next_state; struct afs_work_queue_node_list * ql; void * node_rock; int detached = 0; /* We can inc queue->running_count before actually pulling the node off * of the ready_list, since running_count only really matters when we are * shut down. If we get shut down before we pull the node off of * ready_list, but after we inc'd running_count, * _afs_wq_node_list_dequeue should return immediately with EINTR, * in which case we'll dec running_count, so it's as if we never inc'd it * in the first place. */ MUTEX_ENTER(&queue->lock); if (queue->shutdown) { MUTEX_EXIT(&queue->lock); return EINTR; } queue->running_count++; MUTEX_EXIT(&queue->lock); ret = _afs_wq_node_list_dequeue(&queue->ready_list, &node, AFS_WQ_NODE_STATE_RUNNING, block); if (ret) { _afs_wq_dec_running_count(queue); goto error; } cbf = node->cbf; node_rock = node->rock; detached = node->detached; if (cbf != NULL) { MUTEX_EXIT(&node->lock); code = (*cbf)(queue, node, queue->rock, node_rock, rock); MUTEX_ENTER(&node->lock); if (code == 0) { next_state = AFS_WQ_NODE_STATE_DONE; ql = &queue->done_list; } else if (code == AFS_WQ_ERROR_RESCHEDULE) { if (node->error_count) { next_state = AFS_WQ_NODE_STATE_ERROR; ql = &queue->done_list; } else if (node->block_count) { next_state = AFS_WQ_NODE_STATE_BLOCKED; ql = &queue->blocked_list; } else { next_state = AFS_WQ_NODE_STATE_SCHEDULED; ql = &queue->ready_list; } } else { next_state = AFS_WQ_NODE_STATE_ERROR; ql = &queue->done_list; } } else { next_state = AFS_WQ_NODE_STATE_DONE; code = 0; ql = &queue->done_list; } 
_afs_wq_dec_running_count(queue); node->retcode = code; if ((next_state == AFS_WQ_NODE_STATE_DONE) || (next_state == AFS_WQ_NODE_STATE_ERROR)) { MUTEX_ENTER(&queue->lock); if (queue->drain && queue->pend_count == queue->opts.pend_lothresh) { /* signal other threads if we're about to below the low * pending-tasks threshold */ queue->drain = 0; CV_SIGNAL(&queue->pend_cv); } if (queue->pend_count == 1) { /* signal other threads if we're about to become 'empty' */ CV_BROADCAST(&queue->empty_cv); } queue->pend_count--; MUTEX_EXIT(&queue->lock); } ret = _afs_wq_node_state_wait_busy(node); if (ret) { goto error; } /* propagate scheduling changes down through dependencies */ ret = _afs_wq_dep_propagate(node, next_state); if (ret) { goto error; } ret = _afs_wq_node_state_wait_busy(node); if (ret) { goto error; } if (detached && ((next_state == AFS_WQ_NODE_STATE_DONE) || (next_state == AFS_WQ_NODE_STATE_ERROR))) { _afs_wq_node_state_change(node, next_state); _afs_wq_node_put_r(node, 1); } else { ret = _afs_wq_node_list_enqueue(ql, node, next_state); } error: return ret; }
static void SalvageServer(int argc, char **argv) { int pid, ret; struct SalvageQueueNode * node; pthread_t tid; pthread_attr_t attrs; int slot; VolumePackageOptions opts; /* All entries to the log will be appended. Useful if there are * multiple salvagers appending to the log. */ CheckLogFile((char *)AFSDIR_SERVER_SALSRVLOG_FILEPATH); #ifndef AFS_NT40_ENV #ifdef AFS_LINUX20_ENV fcntl(fileno(logFile), F_SETFL, O_APPEND); /* Isn't this redundant? */ #else fcntl(fileno(logFile), F_SETFL, FAPPEND); /* Isn't this redundant? */ #endif #endif setlinebuf(logFile); fprintf(logFile, "%s\n", cml_version_number); LogCommandLine(argc, argv, "Online Salvage Server", SalvageVersion, "Starting OpenAFS", Log); /* Get and hold a lock for the duration of the salvage to make sure * that no other salvage runs at the same time. The routine * VInitVolumePackage2 (called below) makes sure that a file server or * other volume utilities don't interfere with the salvage. */ /* even demand attach online salvager * still needs this because we don't want * a stand-alone salvager to conflict with * the salvager daemon */ ObtainSharedSalvageLock(); child_slot = (int *) malloc(Parallel * sizeof(int)); osi_Assert(child_slot != NULL); memset(child_slot, 0, Parallel * sizeof(int)); /* initialize things */ VOptDefaults(salvageServer, &opts); if (VInitVolumePackage2(salvageServer, &opts)) { Log("Shutting down: errors encountered initializing volume package\n"); Exit(1); } DInit(10); queue_Init(&pending_q); queue_Init(&log_cleanup_queue); MUTEX_INIT(&worker_lock, "worker", MUTEX_DEFAULT, 0); CV_INIT(&worker_cv, "worker", CV_DEFAULT, 0); CV_INIT(&log_cleanup_queue.queue_change_cv, "queuechange", CV_DEFAULT, 0); osi_Assert(pthread_attr_init(&attrs) == 0); /* start up the reaper and log cleaner threads */ osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); osi_Assert(pthread_create(&tid, &attrs, &SalvageChildReaperThread, NULL) == 0); osi_Assert(pthread_create(&tid, &attrs, 
&SalvageLogCleanupThread, NULL) == 0); osi_Assert(pthread_create(&tid, &attrs, &SalvageLogScanningThread, NULL) == 0); /* loop forever serving requests */ while (1) { node = SALVSYNC_getWork(); osi_Assert(node != NULL); Log("dispatching child to salvage volume %u...\n", node->command.sop.parent); VOL_LOCK; /* find a slot */ for (slot = 0; slot < Parallel; slot++) { if (!child_slot[slot]) break; } osi_Assert (slot < Parallel); do_fork: pid = Fork(); if (pid == 0) { VOL_UNLOCK; ret = DoSalvageVolume(node, slot); Exit(ret); } else if (pid < 0) { Log("failed to fork child worker process\n"); sleep(1); goto do_fork; } else { child_slot[slot] = pid; node->pid = pid; VOL_UNLOCK; MUTEX_ENTER(&worker_lock); current_workers++; /* let the reaper thread know another worker was spawned */ CV_BROADCAST(&worker_cv); /* if we're overquota, wait for the reaper */ while (current_workers >= Parallel) { CV_WAIT(&worker_cv, &worker_lock); } MUTEX_EXIT(&worker_lock); } } }
/** * acquire a lock on a file on local disk. * * @param[in] dl the VDiskLock structure corresponding to the file on disk * @param[in] locktype READ_LOCK if you want a read lock, or WRITE_LOCK if * you want a write lock * @param[in] nonblock 0 to wait for conflicting locks to clear before * obtaining the lock; 1 to fail immediately if a * conflicting lock is held by someone else * * @return operation status * @retval 0 success * @retval EBUSY someone else is holding a conflicting lock and nonblock=1 was * specified * @retval EIO error acquiring file lock * * @note DAFS only * * @note while normal fcntl-y locks on Unix systems generally only work per- * process, this interface also deals with locks between threads in the * process in addition to different processes acquiring the lock */ int VGetDiskLock(struct VDiskLock *dl, int locktype, int nonblock) { int code = 0; osi_Assert(locktype == READ_LOCK || locktype == WRITE_LOCK); if (nonblock) { if (locktype == READ_LOCK) { ObtainReadLockNoBlock(&dl->rwlock, code); } else { ObtainWriteLockNoBlock(&dl->rwlock, code); } if (code) { return EBUSY; } } else if (locktype == READ_LOCK) { ObtainReadLock(&dl->rwlock); } else { ObtainWriteLock(&dl->rwlock); } MUTEX_ENTER(&dl->mutex); if ((dl->flags & VDISKLOCK_ACQUIRING)) { /* Some other thread is waiting to acquire an fs lock. If nonblock=1, * we can return immediately, since we know we'll need to wait to * acquire. 
Otherwise, wait for the other thread to finish acquiring * the fs lock */ if (nonblock) { code = EBUSY; } else { while ((dl->flags & VDISKLOCK_ACQUIRING)) { CV_WAIT(&dl->cv, &dl->mutex); } } } if (code == 0 && !(dl->flags & VDISKLOCK_ACQUIRED)) { /* no other thread holds the lock on the actual file; so grab one */ /* first try, don't block on the lock to see if we can get it without * waiting */ code = VLockFileLock(dl->lockfile, dl->offset, locktype, 1); if (code == EBUSY && !nonblock) { /* mark that we are waiting on the fs lock */ dl->flags |= VDISKLOCK_ACQUIRING; MUTEX_EXIT(&dl->mutex); code = VLockFileLock(dl->lockfile, dl->offset, locktype, nonblock); MUTEX_ENTER(&dl->mutex); dl->flags &= ~VDISKLOCK_ACQUIRING; if (code == 0) { dl->flags |= VDISKLOCK_ACQUIRED; } CV_BROADCAST(&dl->cv); } } if (code) { if (locktype == READ_LOCK) { ReleaseReadLock(&dl->rwlock); } else { ReleaseWriteLock(&dl->rwlock); } } else { /* successfully got the lock, so inc the number of unlocks we need * to do before we can unlock the actual file */ ++dl->lockers; } MUTEX_EXIT(&dl->mutex); return code; }