/** * Finalize the insertion of a new rdev in a group. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to insert the new rdev in * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_insert_rdev(const struct VrtGroupInsertRdev *cmd) { int ret; exalog_debug("finish adding rdev in group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("You are trying to insert an rdev into a group " "while a dgcreate, dgstart or dgdiskadd is running."); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } ret = vrt_group_insert_rdev(pending_group, &cmd->uuid, &cmd->nbd_uuid, cmd->node_id, cmd->spof_id, cmd->local, cmd->old_sb_version, cmd->new_sb_version); os_free(pending_group); os_thread_mutex_unlock(&pending_group_lock); return ret; }
md_com_error_code_t md_com_send_msg(int connection_id, const md_com_msg_t* tx_msg) { int ret; int total_size = sizeof(tx_msg->size) + sizeof(tx_msg->type) + tx_msg->size; char* buffer = (char*) malloc(total_size); memcpy(buffer, &tx_msg->size, sizeof(tx_msg->size)); memcpy(buffer + sizeof(tx_msg->size), &tx_msg->type, sizeof(tx_msg->type)); memcpy(buffer + sizeof(tx_msg->size) + sizeof(tx_msg->type), tx_msg->payload, tx_msg->size); os_thread_mutex_lock(&send_lock); ret = send(connection_id, buffer, total_size, 0); os_thread_mutex_unlock(&send_lock); free(buffer); if (ret != total_size) goto tx_error; return COM_SUCCESS; tx_error: return (errno == EPIPE) ? COM_CONNECTION_CLOSED : COM_WRITE_ERROR; }
/**
 * Reply to 'from' with the index and size (in sectors) of an exported device.
 *
 * @param[in] uuid  UUID of the device to look up
 * @param[in] from  Recipient of the reply
 *
 * The lookup is done with nbd_server.mutex_edevs held. When the device is
 * unknown, the reply's status is -CMD_EXP_ERR_UNKNOWN_DEVICE and the other
 * fields are zero.
 */
void nbd_ndev_getinfo(const exa_uuid_t *uuid, ExamsgID from)
{
    exported_device_info_t reply;
    const device_t *found;

    /* Zero the reply so that no stack garbage is sent on error.
     * FIXME upper layers seem to expect this side effect; they SHOULD NOT
     * rely on the content of the answer when an error is returned. */
    memset(&reply, 0, sizeof(reply));

    os_thread_mutex_lock(&nbd_server.mutex_edevs);

    found = find_device_from_uuid(uuid);
    if (found != NULL)
    {
        reply.device_nb      = found->dev_index;
        reply.device_sectors = found->size_in_sectors;
        reply.status         = EXA_SUCCESS;
    }
    else
    {
        reply.status = -CMD_EXP_ERR_UNKNOWN_DEVICE;
    }

    os_thread_mutex_unlock(&nbd_server.mutex_edevs);

    admwrk_daemon_reply(nbd_server.mh, from, &reply, sizeof(reply));
}
/**
 * Account for the completion of a target request.
 *
 * Records the time elapsed since cmd->submit_date in the duration sensor
 * selected by 'rw', then decrements the I/O depth counter under
 * iodepth_mutex.
 *
 * @param[in] rw   Index into target_req_time[]
 * @param[in] cmd  Command whose submit_date was set at submission
 */
void iscsi_target_perf_end_request(int rw, TARGET_CMD_T *cmd)
{
    double elapsed = (double)(os_gettimeofday_msec() - cmd->submit_date);

    exaperf_duration_record(target_req_time[rw], elapsed);

    os_thread_mutex_lock(&iodepth_mutex);
    iodepth -= 1;
    os_thread_mutex_unlock(&iodepth_mutex);
}
/**
 * Release the pending group, if any.
 *
 * Used by exa_clstop: needed when a group creation failed and the user stops
 * the cluster right afterwards.
 *
 * FIXME It is doubtful that this cleans the layout correctly.
 *
 * @return EXA_SUCCESS, always.
 */
static int vrt_cmd_pending_group_cleanup(void)
{
    os_thread_mutex_lock(&pending_group_lock);
    {
        /* The pending group is only ever touched with the lock held. */
        os_free(pending_group);
    }
    os_thread_mutex_unlock(&pending_group_lock);

    return EXA_SUCCESS;
}
/**
 * Account for the submission of a target request.
 *
 * Stamps the command with the current time, records the request length in
 * the size distribution selected by 'rw', then - under iodepth_mutex -
 * records the current I/O depth and increments it.
 *
 * @param[in]     rw   Index into target_req_size_repart[] and target_iodepth[]
 * @param[in,out] cmd  Command being submitted; its submit_date is set
 * @param[in]     len  Value added to the request size distribution
 */
void iscsi_target_perf_make_request(int rw, TARGET_CMD_T *cmd, double len)
{
    cmd->submit_date = os_gettimeofday_msec();

    exaperf_repart_add_value(target_req_size_repart[rw], len);

    os_thread_mutex_lock(&iodepth_mutex);

    /* The depth is sampled before this request is counted. */
    exaperf_repart_add_value(target_iodepth[rw], iodepth);
    iodepth += 1;

    os_thread_mutex_unlock(&iodepth_mutex);
}
/**
 * Reset the rebuild context of a rain1 real device.
 *
 * The rebuild type becomes EXA_RDEV_REBUILD_NONE and the rebuild progress
 * (completion flag and number of slots rebuilt) is reset under the progress
 * lock.
 *
 * @param[in,out] lr  The rain1 real device to reset
 */
void rain1_rdev_clear_rebuild_context(struct rain1_realdev *lr)
{
    lr->rebuild_desc.type = EXA_RDEV_REBUILD_NONE;

    os_thread_mutex_lock(&lr->rebuild_progress.lock);

    lr->rebuild_progress.nb_slots_rebuilt = 0;
    lr->rebuild_progress.complete         = FALSE;

    os_thread_mutex_unlock(&lr->rebuild_progress.lock);
}
/**
 * Shut down and close a connection.
 *
 * Both calls are made with send_lock held. Only the result of close()
 * determines the returned code; the result of shutdown() is ignored.
 *
 * @param[in] connection_id  Descriptor to shut down and close
 *
 * @return COM_SUCCESS if close() succeeded, COM_UNKNOWN_ERROR otherwise.
 */
md_com_error_code_t md_com_close(int connection_id)
{
    int close_status;

    os_thread_mutex_lock(&send_lock);

    (void)shutdown(connection_id, SHUT_RDWR);
    close_status = close(connection_id);

    os_thread_mutex_unlock(&send_lock);

    return (close_status == 0) ? COM_SUCCESS : COM_UNKNOWN_ERROR;
}
/**
 * Wake up the waiter of a wait queue, or remember the event for later.
 *
 * With the queue lock held: if the 'wait' flag is set, the semaphore is
 * posted and the flag cleared; otherwise the 'ev' flag is raised so that the
 * next wq_wait() returns immediately.
 *
 * @param[in,out] wq  The wait queue
 */
static void wq_wake(wq_t *wq)
{
    os_thread_mutex_lock(&wq->lock);

    if (wq->wait == 0)
    {
        /* Nobody is waiting: leave a pending event. */
        wq->ev = 1;
    }
    else
    {
        os_sem_post(&wq->sem);
        wq->wait = 0;
    }

    os_thread_mutex_unlock(&wq->lock);
}
/** * Finalize group creation. This command must be called after all * group_add_rdev() commands in order to finalize the creation * of the group. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to create * - Group properties: slot width, chunk size, SU size, etc. * * @return EXA_SUCCESS on success, negative error code on failure */ static int vrt_cmd_group_create(const struct VrtGroupCreate *cmd, struct vrt_group_create *reply) { vrt_group_layout_info_t *layout_info; int ret; /* FIXME The newlines (\n) in the error messages have *NOTHING* to do here: leave the formatting to upper layers. */ os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); os_snprintf(reply->error_msg, EXA_MAXSIZE_LINE + 1, "Pending group not available\n"); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { os_thread_mutex_unlock(&pending_group_lock); os_snprintf(reply->error_msg, EXA_MAXSIZE_LINE + 1, "You are trying to create, start or delete two groups at" " the same time.\n"); return -EAGAIN; } os_snprintf(reply->error_msg, EXA_MAXSIZE_LINE + 1, "OK\n"); layout_info = &pending_group->layout_info; layout_info->is_set = true; layout_info->slot_width = cmd->slot_width; layout_info->chunk_size = KBYTES_2_SECTORS(cmd->chunk_size); layout_info->su_size = KBYTES_2_SECTORS(cmd->su_size); layout_info->dirty_zone_size = KBYTES_2_SECTORS(cmd->dirty_zone_size); layout_info->blended_stripes = cmd->blended_stripes != 0; layout_info->nb_spares = cmd->nb_spare; /* All sizes in 'cmd' are in KB. VRT internal functions want sizes in * sectors. */ ret = vrt_group_create(pending_group, reply->error_msg); os_free(pending_group); os_thread_mutex_unlock(&pending_group_lock); return ret; }
/**
 * Wait for an event on a wait queue.
 *
 * With the queue lock held: if an event is pending ('ev' set), it is
 * consumed and the function returns without blocking; otherwise the 'wait'
 * flag is raised. The semaphore wait itself happens after the lock has been
 * released.
 *
 * @param[in,out] wq  The wait queue
 */
static void wq_wait(wq_t *wq)
{
    int must_block;

    os_thread_mutex_lock(&wq->lock);

    must_block = (wq->ev == 0);
    if (!must_block)
        wq->ev = 0;     /* consume the pending event */

    wq->wait = must_block;

    os_thread_mutex_unlock(&wq->lock);

    if (must_block)
        os_sem_wait(&wq->sem);
}
/**
 * Initialize the rebuild context of a rain1 real device.
 *
 * @param[in,out] lr        The rain1 real device
 * @param[in]     type      Rebuild type; asserted to be either
 *                          EXA_RDEV_REBUILD_UPDATING or
 *                          EXA_RDEV_REBUILD_REPLICATING
 * @param[in]     sync_tag  Sync tag stored in the rebuild descriptor
 *
 * The rebuild progress (completion flag and number of slots rebuilt) is
 * reset under the progress lock.
 */
void rain1_rdev_init_rebuild_context(struct rain1_realdev *lr,
                                     rdev_rebuild_type_t type,
                                     sync_tag_t sync_tag)
{
    EXA_ASSERT(type == EXA_RDEV_REBUILD_UPDATING
               || type == EXA_RDEV_REBUILD_REPLICATING);

    lr->rebuild_desc.type     = type;
    lr->rebuild_desc.sync_tag = sync_tag;

    os_thread_mutex_lock(&lr->rebuild_progress.lock);

    lr->rebuild_progress.nb_slots_rebuilt = 0;
    lr->rebuild_progress.complete         = FALSE;

    os_thread_mutex_unlock(&lr->rebuild_progress.lock);
}
/**
 * Register or reset every peer according to a table of IP addresses.
 *
 * @param[in] ip_addresses  One address string per node id; an empty string
 *                          means the node has no address and its peer entry
 *                          is reset (such a peer is asserted not to be
 *                          connected).
 *
 * The whole table is processed with peers_lock held.
 */
static void set_peers(const char ip_addresses[][EXA_MAXSIZE_NICADDRESS + 1])
{
    exa_nodeid_t node_id;

    os_thread_mutex_lock(&peers_lock);

    for (node_id = 0; node_id < EXA_MAX_NODES_NUMBER; node_id++)
    {
        const char *addr = ip_addresses[node_id];

        if (addr[0] != '\0')
        {
            __set_peer(node_id, addr);
            continue;
        }

        /* No address for this node: it must not be connected. */
        EXA_ASSERT(!__peer_is_connected(node_id));
        __reset_peer(node_id);
    }

    os_thread_mutex_unlock(&peers_lock);
}
/**
 * Find the node id of the peer registered with a given IP address.
 *
 * @param[in] ip_addr  Address to look for (compared with strcmp())
 *
 * The peers table is scanned with peers_lock held.
 *
 * @return the id of the first peer whose address matches, or
 *         EXA_NODEID_NONE if there is none.
 */
static exa_nodeid_t get_peer_id_from_ip_addr(const char *ip_addr)
{
    exa_nodeid_t match = EXA_NODEID_NONE;
    exa_nodeid_t idx = 0;

    os_thread_mutex_lock(&peers_lock);

    while (idx < EXA_MAX_NODES_NUMBER)
    {
        if (strcmp(ip_addr, peers[idx].ip_addr) == 0)
        {
            match = idx;
            break;
        }
        idx++;
    }

    os_thread_mutex_unlock(&peers_lock);

    return match;
}
/** * Begin group creation/starting command. Such a command must be * followed by several group_add_rdev() commands to add the real * devices in the group, and finally by a group_create() or * group_start() command. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - Name of the group to create or start * - Its UUID * - Name of the layout to use * * @return EXA_SUCCESS on success, negative error code on failure */ static int vrt_cmd_group_begin(const struct VrtGroupBegin *cmd) { vrt_group_t *group; exalog_debug("begin group '%s': UUID='" UUID_FMT "' layout='%s'", cmd->group_name, UUID_VAL(& cmd->group_uuid), cmd->layout); /* Check if the group is already started */ group = vrt_get_group_from_uuid(&cmd->group_uuid); if(group) { EXA_ASSERT(strcmp(cmd->group_name, group->name)==0 ); vrt_group_unref(group); return -VRT_INFO_GROUP_ALREADY_STARTED; } /* check if the group name is already used (this should not happen if * admind XML parsing is correct... */ group = vrt_get_group_from_name(cmd->group_name); if (group != NULL) { vrt_group_unref(group); return -VRT_ERR_GROUPNAME_USED; } os_thread_mutex_lock(&pending_group_lock); if (pending_group) os_free(pending_group); pending_group = os_malloc(sizeof(vrt_group_info_t)); vrt_group_info_init(pending_group); os_strlcpy(pending_group->name, cmd->group_name, sizeof(pending_group->name)); os_strlcpy(pending_group->layout_name, cmd->layout, sizeof(pending_group->layout_name)); uuid_copy(&pending_group->uuid, &cmd->group_uuid); pending_group->sb_version = cmd->sb_version; os_thread_mutex_unlock(&pending_group_lock); return EXA_SUCCESS; }
/** * Command to register a real device in a group. * It must be called after group_begin() and before * group_create() or group_start(). * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group in which the real device has to be added * - UUID of the real device in the VRT * - UUID of the real device in the NBD * - Whether the disk is local or not * - Whether the disk is UP or not * - Whether the device properties must be loaded from disk * * @return EXA_SUCCESS on success, negative error code on failure. */ static int vrt_cmd_group_add_rdev(const struct VrtGroupAddRdev *cmd) { vrt_rdev_info_t *rdev_info; os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("Failed to edit group " UUID_FMT ", group " UUID_FMT " is already being edited.", UUID_VAL(&cmd->group_uuid), UUID_VAL(&pending_group->uuid)); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } if (pending_group->nb_rdevs == NBMAX_DISKS_PER_GROUP) { os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } rdev_info = &pending_group->rdevs[pending_group->nb_rdevs]; uuid_copy(&rdev_info->uuid, &cmd->uuid); uuid_copy(&rdev_info->nbd_uuid, &cmd->nbd_uuid); rdev_info->node_id = cmd->node_id; rdev_info->spof_id = cmd->spof_id; rdev_info->local = cmd->local; rdev_info->up = cmd->up; pending_group->nb_rdevs++; os_thread_mutex_unlock(&pending_group_lock); return EXA_SUCCESS; }
/**
 * Bring peer connections in line with a membership.
 *
 * For each node id, with peers_lock held:
 *  - a member that is not connected gets connected, but only when its id is
 *    lower than this node's id;
 *  - a non-member that is connected gets disconnected.
 *
 * @param[in] mship  Set of nodes that are part of the membership
 */
void algopr_update_client_connections(const exa_nodeset_t *mship)
{
    exa_nodeid_t node_id;

    os_thread_mutex_lock(&peers_lock);

    for (node_id = 0; node_id < EXA_MAX_NODES_NUMBER; node_id++)
    {
        int is_member = exa_nodeset_contains(mship, node_id) ? 1 : 0;
        int is_connected = __peer_is_connected(node_id) ? 1 : 0;

        /* Connection state already matches membership: nothing to do. */
        if (is_member == is_connected)
            continue;

        /* FIXME Handle errors */
        if (is_member)
        {
            /* Don't connect to self nor to nodes with a higher node id */
            if (node_id < this_node_id)
                __connect_to_peer(node_id);
        }
        else
        {
            __disconnect_from_peer(node_id);
        }
    }

    os_thread_mutex_unlock(&peers_lock);
}
/**
 * Record the socket of a peer.
 *
 * @param[in] node_id  Id of the peer (index into the peers table)
 * @param[in] ip_addr  Address the socket was received from; asserted to be
 *                     the one registered for this peer
 * @param[in] sock     Socket to record; the peer is asserted to have no
 *                     socket yet (sock == -1)
 *
 * The peer entry is checked and updated with peers_lock held.
 */
static void set_peer_socket(exa_nodeid_t node_id, const char *ip_addr, int sock)
{
    peer_t *entry;

    exalog_debug("setting socket of peer %"PRInodeid": %d '%s'",
                 node_id, sock, ip_addr);

    os_thread_mutex_lock(&peers_lock);

    entry = &peers[node_id];

    EXA_ASSERT(entry->sock == -1);

    /* A node's IP address is not supposed to change during the lifetime of
     * a cluster (ie, the node id <-> IP address mapping is bijective), so we
     * assert if the received IP doesn't match the one registered */
    EXA_ASSERT_VERBOSE(strcmp(ip_addr, entry->ip_addr) == 0,
                       "peer %"PRInodeid": received addr %s, expected %s",
                       node_id, ip_addr, entry->ip_addr);

    entry->sock = sock;

    os_thread_mutex_unlock(&peers_lock);
}
int unexport_device(const exa_uuid_t *uuid) { device_t *dev = find_device_from_uuid(uuid); if (dev == NULL) { exalog_error("can not remove unknown device with UUID = " UUID_FMT, UUID_VAL(uuid)); return -CMD_EXP_ERR_UNKNOWN_DEVICE; } os_thread_mutex_lock(&nbd_server.mutex_edevs); /* ask the thread to terminate */ dev->exit_thread = true; /* prevent any new IO to be put in device IO list */ nbd_server.devices[dev->dev_index] = NULL; os_thread_mutex_unlock(&nbd_server.mutex_edevs); /* now we can join, because with the nbd_close_list() * we can assume was the disk thread will reach a cancelation point */ os_thread_join(nbd_server.td_pid[dev->dev_index]); /* close the list used to disk queue */ nbd_close_list(&dev->disk_queue); /* get back all header in the kernel exa_rdev to the free list and close the device */ if (dev->handle != NULL) exa_rdev_handle_free(dev->handle); /* close the semaphore used by the disk */ os_sem_destroy(&dev->lock_sem_disk); /* free used memory for the device */ os_free(dev); return EXA_SUCCESS; }
/** * Finalize the start of a group. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to start * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_start(const struct VrtGroupStart *cmd) { vrt_group_t *group; int ret; exalog_debug("start group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("You are trying to create or start two groups at the same time."); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } ret = vrt_group_start(pending_group, &group); if (ret == EXA_SUCCESS) { ret = vrt_groups_list_add(group); if (ret != EXA_SUCCESS) vrt_group_stop(group); } os_free(pending_group); os_thread_mutex_unlock(&pending_group_lock); return ret; }
/* * thread responsible for receiving data for a client or a server * note when we add client, this client is effectively added in the receive queue * only few second later due to the select timeout of 3 seconds * and there are the same problem for the deleteion of a client */ static void algopr_receive_thread(void *unused) { struct pending_request pending_requests[EXA_MAX_NODES_NUMBER]; exa_select_handle_t *sh = exa_select_new_handle(); int i; int ret; payload_t *payload = NULL; struct nbd_root_list root_list_recv; /* FIXME: handle the case when we have more than 1024 open file (limit of fd_set) */ fd_set fds; exalog_as(EXAMSG_ISCSI_ID); nbd_init_root(EXA_MAX_NODES_NUMBER, sizeof(payload_t), &root_list_recv); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) request_reset(&pending_requests[i]); while (algopr_run) { int nfds = 0; FD_ZERO(&fds); /* if one node is added or deleted, this deletion or addition are effective after this */ os_thread_mutex_lock(&peers_lock); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) { int fd_act = __get_peer_socket(i); if (fd_act < 0) { payload_t *temp_payload; temp_payload = request_reset(&pending_requests[i]); if (temp_payload != NULL) { if (pending_requests[i].big_buffer) nbd_list_post(ð.root_list_big_recv.free, temp_payload->buffer, -1); nbd_list_post(&root_list_recv.free, temp_payload, -1); } temp_payload = NULL; continue; } FD_SET(fd_act, &fds); nfds = fd_act > nfds ? fd_act : nfds; } os_thread_mutex_unlock(&peers_lock); ret = exa_select_in(sh, nfds + 1, &fds); if (ret != 0 && ret != -EFAULT) exalog_error("Select upon receive failed: %s (%d)", os_strerror(-ret), ret); os_thread_mutex_lock(&peers_lock); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) { struct pending_request *req; int fd_act; fd_act = __get_peer_socket(i); if (fd_act < 0 || !FD_ISSET(fd_act, &fds)) continue; req = &pending_requests[i]; /* WARNING payload is kept from an iteration of while loop to * another, so the variable MUST be global. 
*/ /* FIXME Remove the nbdlist which is useless as we already know * that we NEED EXA_MAX_NODES_NUMBER payload_t elements to be able * to receive simultaneously from EXA_MAX_NODES_NUMBER nodes * FIXME the LISTWAIT flag below is WRONG because waiting here * would mean deadlock... hopefully there are enough elements, and * we never wait.... */ if (payload == NULL) { payload = nbd_list_remove(&root_list_recv.free, NULL, LISTWAIT); EXA_ASSERT(payload != NULL); } if (request_init_transfer(payload, req) == 1) payload = NULL; ret = request_receive(fd_act, req); if (ret == DATA_TRANSFER_NEED_BIG_BUFFER) { req->payload->buffer = nbd_list_remove(ð.root_list_big_recv.free, NULL, LISTWAIT); EXA_ASSERT(req->payload->buffer != NULL); req->big_buffer = true; /* here we just continue because it is forbidden to call * request_receive without passing into select (as sockets are * blocking, we may remain blocked on the recv of nothing) */ continue; } if (ret == DATA_TRANSFER_PENDING) continue; if (ret == DATA_TRANSFER_ERROR) { payload_t *temp_payload = request_reset(req); if (req->big_buffer) nbd_list_post(ð.root_list_big_recv.free, temp_payload->buffer, -1); nbd_list_post(&root_list_recv.free, temp_payload, -1); __disconnect_from_peer(i); if (!suspended) exalog_warning("Failed receiving from peer %" PRInodeid " (socket %d): transfer error.", i, fd_act); continue; } if (ret == DATA_TRANSFER_COMPLETE) { payload_t *_payload = request_reset(req); /* update data network checking data */ algopr_new_msg(_payload->payload, _payload->size1, _payload->buffer, _payload->size2); nbd_list_post(&root_list_recv.free, _payload, -1); } } os_thread_mutex_unlock(&peers_lock); } nbd_close_root(&root_list_recv); exa_select_delete_handle(sh); }
/* thread for asynchronously sending data for a client or a server */ static void algopr_send_thread(void *unused) { struct pending_request pending_requests[EXA_MAX_NODES_NUMBER]; int i; exa_select_handle_t *sh = exa_select_new_handle(); exalog_as(EXAMSG_ISCSI_ID); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) request_reset(&pending_requests[i]); while (algopr_run) { fd_set fds; int nfds = 0; bool active_sock = false; FD_ZERO(&fds); /* if one node is added or deleted, this deletion or addition are effective after this */ os_thread_mutex_lock(&peers_lock); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) { int fd_act = __get_peer_socket(i); if (fd_act < 0) { /* release all buffer of clients who's sockets were closed */ payload_t *payload = request_reset(&pending_requests[i]); if (payload != NULL) nbd_list_post(ð.send_list[i].root->free, payload, -1); /* release all pending messages for node i: connection is dead, * those messages will never be delivered anyway. */ drop_all_messages_for_node(i); continue; } if (!pending_requests[i].used) { /* pick a new request if no one is in progress for this peer */ payload_t *payload = nbd_list_remove(ð.send_list[i], NULL, LISTNOWAIT); if (payload) request_init_transfer(payload, &pending_requests[i]); } if (pending_requests[i].used) { /* if buffers are waiting to be sent, add peer to select list */ FD_SET(fd_act, &fds); nfds = fd_act > nfds ? fd_act : nfds; active_sock = true; } } os_thread_mutex_unlock(&peers_lock); if (!active_sock) { wq_wait(ð.wq_send); /* we were waiting for new requests to send, someone signaled us * so restart the loop and look for new request. 
*/ continue; } exa_select_out(sh, nfds + 1, &fds); os_thread_mutex_lock(&peers_lock); for (i = 0; i < EXA_MAX_NODES_NUMBER; i++) { struct pending_request *request = &pending_requests[i]; int fd_act = __get_peer_socket(i); if (fd_act >= 0 && pending_requests[i].used && FD_ISSET(fd_act, &fds)) { /* send remaining data if any */ int ret = request_send(fd_act, request); switch (ret) { case DATA_TRANSFER_COMPLETE: nbd_list_post(ð.send_list[i].root->free, request->payload, -1); request_reset(request); break; case DATA_TRANSFER_ERROR: nbd_list_post(ð.send_list[i].root->free, request->payload, -1); request_reset(request); break; case DATA_TRANSFER_PENDING: break; } } } os_thread_mutex_unlock(&peers_lock); } exa_select_delete_handle(sh); }