/**
 * Account for a request being submitted to a device: feed the time elapsed
 * since the previous request of the same direction (read or write) to the
 * matching inter-arrival repartition, then timestamp the request.
 *
 * @param disk_device  Device whose last_req_time[] and
 *                     inter_arrival_repart[] entries are used
 * @param req_header   Request being submitted; its rdev_submit_date is set
 *                     to the current time in milliseconds
 */
void __rdev_perf_make_request(device_t *disk_device, header_t *req_header)
{
    const uint64_t now_ms = os_gettimeofday_msec();
    int rw = -1;

    EXA_ASSERT(NBD_REQ_TYPE_IS_VALID(req_header->request_type));

    switch (req_header->request_type)
    {
    case NBD_REQ_TYPE_READ:
        rw = __READ;
        break;

    case NBD_REQ_TYPE_WRITE:
        rw = __WRITE;
        break;

    case NBD_REQ_TYPE_LOCK:
    case NBD_REQ_TYPE_UNLOCK:
        /* FIXME: formerly this case was not handled */
        EXA_ASSERT(false);
    }

    /* WARNING: although the very first call is excluded below (there is no
     * previous timestamp yet), the first exaperf log should still be left
     * out of any analysis: max and mean values are very large because of
     * the history of the IOs. This happens even if the cluster is
     * stopped/started between two experiments. */
    if (disk_device->last_req_time[rw] != 0)
    {
        const uint64_t inter_arrival = now_ms - disk_device->last_req_time[rw];

        exaperf_repart_add_value(disk_device->inter_arrival_repart[rw],
                                 inter_arrival);
    }

    disk_device->last_req_time[rw] = now_ms;
    req_header->rdev_submit_date   = now_ms;
}
/**
 * Send a message made of a fixed-size part and an optional data part to a
 * node.
 *
 * If the destination is the local node the message is handed directly to
 * algopr_new_msg(). Otherwise a free payload is taken from the send lists,
 * filled and posted on the destination node's send list, and the sending
 * wait queue is woken up.
 *
 * @param node_id  Destination node (must be a valid node id)
 * @param buffer1  First part of the message; copied into the payload
 * @param size1    Size of buffer1; must be SIZEOF_ALGOPR_MSGNETWORK_T
 * @param buffer2  Second part of the message; only the pointer is stored
 *                 in the payload, the data is not copied here
 * @param size2    Size of buffer2
 *
 * @return always 1
 */
int algopr_send_data(exa_nodeid_t node_id, void *buffer1, int size1,
                     void *buffer2, int size2)
{
    payload_t *payload;

    EXA_ASSERT(EXA_NODEID_VALID(node_id));

    /* FIXME see comment above definition of SIZEOF_ALGOPR_MSGNETWORK_T */
    EXA_ASSERT(size1 == SIZEOF_ALGOPR_MSGNETWORK_T);

    /* Send is local, just forward it */
    if (node_id == this_node_id)
    {
        algopr_new_msg(buffer1, size1, buffer2, size2);
        return 1;
    }

    /* Note: All nbd lists send_list[] share the same root so here we don't
     * care in which one is picked up the buffer */
    payload = nbd_list_remove(&eth.send_list[0].root->free, NULL, LISTWAIT);
    EXA_ASSERT(payload != NULL);

    memcpy(payload->payload, buffer1, size1);
    payload->size1  = size1;
    payload->size2  = size2;
    payload->buffer = buffer2;

    nbd_list_post(&eth.send_list[node_id], payload, -1);
    wq_wake(&eth.wq_send);

    return 1;
}
/**
 * Copy bytes into the ring buffer at the current write position, wrapping
 * around to the start of the buffer when the end would be reached.
 *
 * @param rng     Ring buffer to write to; its pWr index is advanced
 * @param buf     Bytes to write
 * @param nbytes  Number of bytes to write; must be strictly smaller than
 *                the ring size
 */
static void writewrap(exa_ringbuf_t *rng, const char *buf, size_t nbytes)
{
    char *ring = rng->data;

    EXA_ASSERT(nbytes < rng->size);

    if (rng->pWr + nbytes >= rng->size)
    {
        /* Wrap around: fill the tail of the ring first, then restart at 0 */
        int tail_len = rng->size - rng->pWr;

        memcpy(ring + rng->pWr, buf, tail_len);
        buf     += tail_len;
        nbytes  -= tail_len;
        rng->pWr = 0;
    }

    /* Direct write of whatever remains */
    memcpy(ring + rng->pWr, buf, nbytes);
    rng->pWr += nbytes;

    EXA_ASSERT(rng->pRd < rng->size);
    EXA_ASSERT(rng->pWr < rng->size);
}
/** * Create a volume in a given group * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group in which the volume has to be created * - Name of the volume to create * - UUID of the volume to create * - Size of the volume to create (in KB) * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_volume_create(const struct VrtVolumeCreate *cmd) { vrt_group_t *group; vrt_volume_t *volume; int ret; EXA_ASSERT(cmd->volume_size > 0); exalog_debug("create volume '%s': size %" PRIu64 " KB", cmd->volume_name, cmd->volume_size); group = vrt_get_group_from_uuid(&cmd->group_uuid); if (group == NULL) { exalog_debug("Unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } /* !!! All sizes in 'cmd' are in KB and VRT internal functions want sizes in * sectors. */ ret = vrt_group_create_volume(group, &volume, &cmd->volume_uuid, cmd->volume_name, KBYTES_2_SECTORS(cmd->volume_size)); if (ret != EXA_SUCCESS) { exalog_error("Can't create volume '%s' in group '%s': %s(%d)", cmd->volume_name, group->name, exa_error_msg(ret), ret); vrt_group_unref(group); return ret; } EXA_ASSERT(volume != NULL); /* wipe the newly created volume * * FIXME: This code is called from all the clients while it should be done * only once. To do so we should add a new RPC and trigger the wipping from * admind. */ /* Let only one node (the first one) do the wipe */ if (vrt_node_get_upnode_id() == 0) { ret = vrt_group_wipe_volume(group, volume); if (ret != EXA_SUCCESS) { exalog_error("Can't wipe volume '%s' in group '%s': %s(%d)", volume->name, group->name, exa_error_msg(ret), ret); /* Rollback volume creation */ vrt_group_delete_volume(group, volume); vrt_group_unref(group); return ret; } } vrt_group_unref(group); return EXA_SUCCESS; }
/**
 * Terminate an iteration over the realdevs of a storage.
 *
 * The iterator must have been started (its storage must be set); it is
 * detached from its storage.
 *
 * @param[in,out] iter  Iterator to end
 */
void storage_rdev_iterator_end(storage_rdev_iter_t *iter)
{
    EXA_ASSERT(iter != NULL);
    EXA_ASSERT(iter->storage != NULL);

    /* Detach the iterator so that it can't be used until begun again */
    iter->storage = NULL;
}
/**
 * Get the next realdev of a storage and advance the iterator.
 *
 * Realdevs are returned SPOF group by SPOF group; groups that are exhausted
 * (or empty) are skipped.
 *
 * @param[in,out] iter  Iterator, previously begun on a storage
 *
 * @return the next realdev, or NULL once all SPOF groups have been visited
 */
vrt_realdev_t *storage_rdev_iterator_get(storage_rdev_iter_t *iter)
{
    EXA_ASSERT(iter != NULL);
    EXA_ASSERT(iter->storage != NULL);

    while (iter->spof_group_index < iter->storage->num_spof_groups)
    {
        const spof_group_t *spof_group =
            &iter->storage->spof_groups[iter->spof_group_index];

        if (iter->realdev_index < spof_group->nb_realdevs)
            return spof_group->realdevs[iter->realdev_index++];

        /* Current group exhausted: move on to the start of the next one */
        iter->realdev_index = 0;
        iter->spof_group_index++;
    }

    return NULL;
}
/**
 * Record the duration of a request that just ended.
 *
 * The time elapsed since header_submit_date is recorded in the duration
 * sensor of the request's direction; for writes, the time elapsed since
 * data_submit_date is recorded as well.
 *
 * @param req_header  Header of the finished request (read or write)
 */
void __serverd_perf_end_request(header_t *req_header)
{
    double now_ms = os_gettimeofday_msec();
    int rw = -1;

    EXA_ASSERT(NBD_REQ_TYPE_IS_VALID(req_header->request_type));

    switch (req_header->request_type)
    {
    case NBD_REQ_TYPE_READ:
        rw = __READ;
        break;

    case NBD_REQ_TYPE_WRITE:
        rw = __WRITE;
        break;

    case NBD_REQ_TYPE_LOCK:
    case NBD_REQ_TYPE_UNLOCK:
        /* FIXME: formerly this case was not handled */
        EXA_ASSERT(false);
    }

    exaperf_duration_record(header_dur[rw],
                            now_ms - req_header->header_submit_date);

    /* Only writes carry data whose transfer time is measured */
    if (rw == __WRITE)
        exaperf_duration_record(data_dur,
                                now_ms - req_header->data_submit_date);
}
/**
 * Start an iteration over the SPOF groups of a storage.
 *
 * @param[out] iter     Iterator to initialize
 * @param[in]  storage  Storage to iterate on
 */
void storage_spof_iterator_begin(storage_spof_iter_t *iter,
                                 const storage_t *storage)
{
    EXA_ASSERT(iter != NULL);
    EXA_ASSERT(storage != NULL);

    /* Position the iterator on the first SPOF group */
    iter->index   = 0;
    iter->storage = storage;
}
/**
 * Start an iteration over the realdevs of a storage.
 *
 * @param[out] iter     Iterator to initialize
 * @param[in]  storage  Storage to iterate on
 */
void storage_rdev_iterator_begin(storage_rdev_iter_t *iter,
                                 const storage_t *storage)
{
    EXA_ASSERT(iter != NULL);
    EXA_ASSERT(storage != NULL);

    /* Position the iterator on the first realdev of the first SPOF group */
    iter->spof_group_index = 0;
    iter->realdev_index    = 0;
    iter->storage          = storage;
}
/**
 * Merge a lock/unlock header into the list of locked zones of a device.
 *
 * disk_device->locking_return is set according to the outcome of the merge:
 * 0 on success, -1 on error (no room left for a new locked zone, or no
 * locked zone matching the one to unlock).
 *
 * @param disk_device  Device whose locked zones are updated
 * @param header       Lock/unlock request to merge; must be of type
 *                     NBD_HEADER_LOCK
 */
static void td_merge_lock(device_t *disk_device, header_t *header)
{
    int i;

    /* This used to be an assignment ('='), which both overwrote the header
     * type and made the assertion meaningless. */
    EXA_ASSERT(header->type == NBD_HEADER_LOCK);
    EXA_ASSERT(header->lock.op == NBD_REQ_TYPE_LOCK
               || header->lock.op == NBD_REQ_TYPE_UNLOCK);

    switch (header->lock.op)
    {
    case NBD_REQ_TYPE_LOCK:
        /* NOTE(review): with '>' the zone at index NBMAX_DISK_LOCKED_ZONES
         * can be written; confirm locked_zone[] has at least
         * NBMAX_DISK_LOCKED_ZONES + 1 entries, otherwise this must be '>='. */
        if (disk_device->nb_locked_zone > NBMAX_DISK_LOCKED_ZONES)
        {
            disk_device->locking_return = -1;
            return;
        }
        else
        {
            struct locked_zone *locked_zone =
                &disk_device->locked_zone[disk_device->nb_locked_zone];

            locked_zone->sector       = header->lock.sector;
            locked_zone->sector_count = header->lock.sector_nb;

            disk_device->nb_locked_zone++;
            disk_device->locking_return = 0;
        }
        return;

    case NBD_REQ_TYPE_UNLOCK:
        for (i = 0; i < disk_device->nb_locked_zone; i++)
        {
            struct locked_zone *locked_zone = &disk_device->locked_zone[i];

            if (locked_zone->sector == header->lock.sector
                && locked_zone->sector_count == header->lock.sector_nb)
            {
                disk_device->locking_return = 0;
                disk_device->nb_locked_zone--;

                /* The array is not sorted but needs to stay contiguous,
                 * thus when removing an element which is not at the end,
                 * we fill the 'hole' by moving the last element to this
                 * place. */
                if (i < disk_device->nb_locked_zone)
                    disk_device->locked_zone[i] =
                        disk_device->locked_zone[disk_device->nb_locked_zone];

                return;
            }
        }

        /* No locked zone matches the requested unlock */
        disk_device->locking_return = -1;
        return;
    }
}
/**
 * Get the node within a cluster given its id.
 *
 * \param     cluster  Cluster to get the node from
 * \param[in] node_id  Id of node to get; must be lower than
 *                     EXA_MAX_NODES_NUMBER
 *
 * \return Node if found, NULL if the slot for this id is not in use
 */
sup_node_t *
sup_cluster_node(const sup_cluster_t *cluster, exa_nodeid_t node_id)
{
    const sup_node_t *slot;

    EXA_ASSERT(cluster);
    EXA_ASSERT(node_id < EXA_MAX_NODES_NUMBER);

    slot = &cluster->nodes[node_id];

    /* A slot whose id is EXA_NODEID_NONE holds no node */
    return slot->id == EXA_NODEID_NONE ? NULL : (sup_node_t *)slot;
}
/**
 * Submit one request to the device through exa_rdev.
 *
 * @param[in,out] header       IN: request to submit
 *                             OUT: whatever exa_rdev_make_request_new()
 *                             left in it (may be NULL); when not NULL its
 *                             io.desc.result is set to 0 or -EIO
 * @param[in]     disk_device  Device the request is submitted to
 *
 * @return RDEV_REQUEST_NOT_ENOUGH_FREE_REQ if there are not enough
 *         resources to submit a new request,
 *         RDEV_REQUEST_END_ERROR if exa_rdev_make_request_new() returned a
 *         negative value,
 *         otherwise the value returned by exa_rdev_make_request_new()
 */
static int exa_td_process_one_request(header_t **header,
                                      device_t *disk_device)
{
    header_t *req_header = *header;
    int retval;

    /* Snapshot the request description now: '*header' may be replaced by
     * the submission below. */
    void *buffer     = req_header->io.buf;
    int sector_nb    = req_header->io.desc.sector_nb;
    uint64_t sector  = req_header->io.desc.sector;

    /* FIXME this is an ugly hack to prevent the compiler from complaining
     * about an uninitialized variable. Actually, this is because the
     * request type itself is messed up (no type and the funky use of bit
     * masks...). Please remove this when reworking header_t content... */
    rdev_op_t op = (rdev_op_t)-1;

    EXA_ASSERT(NBD_REQ_TYPE_IS_VALID(req_header->io.desc.request_type));

    switch (req_header->io.desc.request_type)
    {
    case NBD_REQ_TYPE_READ:
        EXA_ASSERT(!req_header->io.desc.flush_cache);
        op = RDEV_OP_READ;
        break;

    case NBD_REQ_TYPE_WRITE:
        op = req_header->io.desc.flush_cache ? RDEV_OP_WRITE_BARRIER
                                             : RDEV_OP_WRITE;
        break;
    }

    /* Submit this new request to exa_rdev and so to the disk driver.
     * Be careful: the 'header' pointer can be modified. */
    retval = exa_rdev_make_request_new(op, (void *)header,
                                       sector + RDEV_RESERVED_AREA_IN_SECTORS,
                                       sector_nb, buffer,
                                       disk_device->handle);

    if (retval == RDEV_REQUEST_NOT_ENOUGH_FREE_REQ)
        return RDEV_REQUEST_NOT_ENOUGH_FREE_REQ;

    if (*header != NULL)
        (*header)->io.desc.result = retval == RDEV_REQUEST_END_OK ? 0 : -EIO;

    return retval < 0 ? RDEV_REQUEST_END_ERROR : retval;
}
/**
 * Get the free chunk count of a SPOF group of a storage.
 *
 * The SPOF group must exist in the storage.
 *
 * @param[in] storage  Storage holding the SPOF group
 * @param[in] spof_id  Id of the SPOF group (must be valid)
 *
 * @return the value of spof_group_free_chunk_count() for that group
 */
uint64_t storage_get_spof_group_free_chunk_count(const storage_t *storage,
                                                 spof_id_t spof_id)
{
    spof_group_t *sg;

    EXA_ASSERT(storage != NULL);
    EXA_ASSERT(SPOF_ID_IS_VALID(spof_id));

    sg = storage_get_spof_group_by_id(storage, spof_id);
    EXA_ASSERT(sg != NULL);

    return spof_group_free_chunk_count(sg);
}
/**
 * Record the IP address of a peer.
 *
 * If the peer is already known, this function doesn't do anything (apart
 * from checking that the IP address given is identical to the one already
 * known). Most notably, it leaves the peer's socket alone.
 *
 * @param node_id  Id of the peer; used as an index in peers[]
 * @param ip_addr  IP address of the peer; must fit in the peer's ip_addr
 *                 field
 */
static void __set_peer(exa_nodeid_t node_id, const char *ip_addr)
{
    peer_t *peer;
    size_t copied;

    exalog_debug("setting peer %"PRInodeid": '%s'", node_id, ip_addr);

    peer = &peers[node_id];

    /* ip addr of a node can't change */
    EXA_ASSERT(peer->ip_addr[0] == '\0'
               || strcmp(peer->ip_addr, ip_addr) == 0);

    /* The copy is done outside of the assertion so that it still happens
     * should assertions ever be compiled out. */
    copied = os_strlcpy(peer->ip_addr, ip_addr, sizeof(peer->ip_addr));
    EXA_ASSERT(copied < sizeof(peer->ip_addr));
}
/**
 * Set the chunk size of a storage.
 *
 * This must only be done once: setting a different size on a storage whose
 * chunk size is already set triggers an assertion.
 *
 * @param[in,out] storage     Storage in which we set the size
 * @param[in]     chunk_size  Size of chunks, in Kbytes
 *
 * @return 0 if successful, -EINVAL if the chunk size is zero
 */
static int storage_set_chunk_size(storage_t *storage, uint32_t chunk_size)
{
    EXA_ASSERT(storage);

    /* We don't allow changing the storage's chunk size once it's been set */
    EXA_ASSERT(storage->chunk_size == 0 || storage->chunk_size == chunk_size);

    if (chunk_size != 0)
    {
        storage->chunk_size = chunk_size;
        return 0;
    }

    return -EINVAL;
}
/**
 * Get the id of the next SPOF group of a storage and advance the iterator.
 *
 * @param[in,out] iter  Iterator, previously begun on a storage
 *
 * @return the id of the next SPOF group, or SPOF_ID_NONE once all the
 *         groups have been visited
 */
spof_id_t storage_spof_iterator_get(storage_spof_iter_t *iter)
{
    EXA_ASSERT(iter != NULL);
    EXA_ASSERT(iter->storage != NULL);

    if (iter->index < iter->storage->num_spof_groups)
    {
        const spof_group_t *spof_group =
            &iter->storage->spof_groups[iter->index++];

        return spof_group->spof_id;
    }

    return SPOF_ID_NONE;
}
/**
 * Cut an rdev into chunks.
 *
 * The number of chunks is the usable size of the rdev divided by the
 * storage's chunk size (converted from Kbytes to sectors); any remainder
 * is left unused.
 *
 * @param[in] storage  The storage; its chunk size must have been set
 * @param[in] rdev     The rdev to cut in chunks
 *
 * @return EXA_SUCCESS if successful, -VRT_ERR_RDEV_TOO_SMALL if the rdev
 *         can't even hold one chunk
 */
int storage_cut_rdev_in_chunks(storage_t *storage, vrt_realdev_t *rdev)
{
    /* presumably in sectors, like chunk_size below -- confirm against
     * vrt_realdev_get_usable_size() */
    uint64_t total_size = vrt_realdev_get_usable_size(rdev);
    uint32_t chunk_size = KBYTES_2_SECTORS(storage->chunk_size);

    EXA_ASSERT(total_size > 0);
    EXA_ASSERT(chunk_size > 0);

    if (total_size < chunk_size)
        return -VRT_ERR_RDEV_TOO_SMALL;

    storage_initialize_rdev_chunks_info(storage, rdev,
                                        total_size / chunk_size);

    return EXA_SUCCESS;
}
/**
 * Free a volume.
 *
 * The volume must already be detached: it must belong to no group and must
 * not be linked to a next volume. When built with WITH_FS, the filesystem
 * attached to the volume (if any) is freed too.
 *
 * @param volume  Volume to free (must not be NULL)
 */
void __adm_volume_free(struct adm_volume *volume)
{
    EXA_ASSERT(volume);
    EXA_ASSERT(volume->group == NULL);
    EXA_ASSERT(volume->next == NULL);

#ifdef WITH_FS
    if (volume->filesystem)
    {
        adm_fs_free(volume->filesystem);
        volume->filesystem = NULL;
    }
#endif

    os_free(volume);
}
/**
 * Add a node to a cluster.
 *
 * \param     cluster  Cluster to add the node to
 * \param[in] node_id  Id of node to add
 *
 * \return 0 if added, negative error code otherwise:
 *         -EINVAL if node id invalid
 *         -ENOSPC if cluster full
 *         -EEXIST if node already present in cluster
 */
int
sup_cluster_add_node(sup_cluster_t *cluster, exa_nodeid_t node_id)
{
    sup_node_t *slot;

    EXA_ASSERT(cluster);

    if (node_id >= EXA_MAX_NODES_NUMBER)
        return -EINVAL;

    if (cluster->num_nodes == EXA_MAX_NODES_NUMBER)
        return -ENOSPC;

    /* Nodes are stored at the index given by their id */
    slot = &cluster->nodes[node_id];
    if (sup_node_defined(slot))
        return -EEXIST;

    sup_node_init(slot);
    slot->id = node_id;

    exa_nodeset_add(&cluster->known_nodes, node_id);
    cluster->num_nodes++;

    return 0;
}
int vrt_group_sync_sb_versions(int thr_nb, struct adm_group *group) { admwrk_request_t rpc; exa_nodeid_t nid; sb_serialized_t sb_ser; int err; sb_version_local_recover(group->sb_version); sb_version_serialize(group->sb_version, &sb_ser); /* Exchange exports file version number */ admwrk_bcast(thr_nb, &rpc, EXAMSG_SERVICE_VRT_SB_SYNC, &sb_ser, sizeof(sb_ser)); while (admwrk_get_bcast(&rpc, &nid, &sb_ser, sizeof(sb_ser), &err)) { if (err == -ADMIND_ERR_NODE_DOWN) continue; sb_version_update_from(group->sb_version, &sb_ser); } /* After the synchronisation of the sb_versions, they can't be invalid * anymore. */ EXA_ASSERT(sb_version_is_valid(group->sb_version)); return err; }
/**
 * Set the goal of a volume on a set of nodes.
 *
 * Every node of the hostlist is first removed from all the goal sets of
 * the volume, then added to the set matching the new status. A status
 * other than EXA_VOLUME_STOPPED or EXA_VOLUME_STARTED leaves the node out
 * of both the stopped and started sets.
 *
 * @param volume    Volume to set the goal of (must not be NULL)
 * @param hostlist  Nodes on which the goal is set
 * @param status    New goal status for these nodes
 * @param readonly  If non-zero, the nodes are also added to the readonly
 *                  goal set
 */
void adm_volume_set_goal(struct adm_volume *volume, exa_nodeset_t *hostlist,
                         exa_volume_status_t status, int readonly)
{
    exa_nodeid_t node;

    EXA_ASSERT(volume != NULL);

    /* Iterate through the hostlist in order to remove every node from
     * all the status lists and finally add it to the right one
     * according to its new status. */
    exa_nodeset_foreach(hostlist, node)
    {
        exa_nodeset_del(&volume->goal_stopped, node);
        exa_nodeset_del(&volume->goal_started, node);
        exa_nodeset_del(&volume->goal_readonly, node);

        switch (status)
        {
        case EXA_VOLUME_STOPPED:
            exa_nodeset_add(&volume->goal_stopped, node);
            break;

        case EXA_VOLUME_STARTED:
            exa_nodeset_add(&volume->goal_started, node);
            break;
        }

        if (readonly)
            exa_nodeset_add(&volume->goal_readonly, node);
    }
}
/**
 * Set a numeric parameter of a sensor template.
 *
 * @param[in,out] sensor_template  Template to modify (must not be NULL)
 * @param[in]     key              Parameter to set
 * @param[in]     value            New value of the parameter
 *
 * @return EXAPERF_INVALID_PARAM if key is EXAPERF_PARAM_DISTRIBUTION or
 *         EXAPERF_PARAM_NONE, EXAPERF_SUCCESS otherwise
 */
exaperf_err_t
exaperf_sensor_template_param_set(exaperf_sensor_template_t *sensor_template,
                                  exaperf_sensor_param_t key, uint32_t value)
{
    EXA_ASSERT(sensor_template != NULL);

    switch (key)
    {
    case EXAPERF_PARAM_FLUSHING_PERIOD:
        sensor_template->flushing_period = value;
        return EXAPERF_SUCCESS;

    case EXAPERF_PARAM_SAMPLING_PERIOD:
        sensor_template->sampling_period = value;
        return EXAPERF_SUCCESS;

    case EXAPERF_PARAM_FLUSHING_FILTER:
        sensor_template->flushing_filter = value;
        return EXAPERF_SUCCESS;

    case EXAPERF_PARAM_SAMPLE_SIZE:
        sensor_template->sample_size = value;
        return EXAPERF_SUCCESS;

    /* These keys can't be set through a single numeric value */
    case EXAPERF_PARAM_DISTRIBUTION:
    case EXAPERF_PARAM_NONE:
        return EXAPERF_INVALID_PARAM;
    }

    /* Any key not listed above is silently accepted */
    return EXAPERF_SUCCESS;
}
/**
 * Fill in information about a ring buffer.
 *
 * @param[in]  rng       Ring buffer to describe
 * @param[out] rng_info  Receives the ring's statistics, its size and the
 *                       value of examsgRngBytes() for it (must not be NULL)
 */
void examsgRngGetInfo(const exa_ringbuf_t *rng, exa_ringinfo_t *rng_info)
{
    EXA_ASSERT(rng_info);

    rng_info->stats     = rng->stats;
    rng_info->ring_size = rng->size;
    rng_info->available = examsgRngBytes(rng);
}
/**
 * Deserialize a storage.
 *
 * @param[in,out] storage  The storage to deserialize into
 * @param[in]     stream   The stream to deserialize from
 *
 * @return 0 if successful, a negative error code otherwise:
 *         -VRT_ERR_SB_MAGIC if the header magic is wrong,
 *         -VRT_ERR_SB_FORMAT if the header format is not supported,
 *         -VRT_ERR_SB_CORRUPTION if the number of rdevs in the header
 *         differs from the storage's,
 *         or the error returned by storage_header_read(),
 *         storage_set_chunk_size() or storage_rdev_deserialize()
 *
 * @attention This deserializes only the chunk_size and minimal
 *            rdev information, the storage is expected to already contain
 *            SPOF groups and realdevs.
 */
int storage_deserialize(storage_t *storage, stream_t *stream)
{
    storage_header_t header;
    int err, i;

    err = storage_header_read(&header, stream);
    if (err != 0)
        return err;

    if (header.magic != STORAGE_HEADER_MAGIC)
        return -VRT_ERR_SB_MAGIC;

    if (header.format != STORAGE_HEADER_FORMAT)
        return -VRT_ERR_SB_FORMAT;

    if (header.nb_rdevs != storage_get_num_realdevs(storage))
        return -VRT_ERR_SB_CORRUPTION;

    /* The chunk size comes from the stream: don't carry on with a storage
     * whose chunk size was rejected. */
    err = storage_set_chunk_size(storage, header.chunk_size);
    if (err != 0)
        return err;

    for (i = 0; i < header.nb_rdevs; i++)
    {
        vrt_realdev_t *rdev;

        err = storage_rdev_deserialize(&rdev, storage, stream);
        if (err != 0)
            return err;

        EXA_ASSERT(rdev != NULL);
    }

    return 0;
}
static void local_exa_fscreate(int thr_nb, void *msg) { int ret = EXA_SUCCESS; struct fscreate_info *info = msg; const fs_definition_t *fs_definition; if (info->nodeid != adm_my_id) { ret = -ADMIND_ERR_NOTHINGTODO; goto local_exa_fscreate_end; } fs_definition = fs_get_definition(info->fs.fstype); if (!fs_definition) { ret = -EXA_ERR_INVALID_PARAM; goto local_exa_fscreate_end; } EXA_ASSERT(fs_definition->create_fs); ret = fs_definition->create_fs(thr_nb, &info->fs); local_exa_fscreate_end: exalog_debug("local_exa_fscreate() = %s", exa_error_msg(ret)); admwrk_ack(thr_nb, ret); }
/** * Resize a volume * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group in which is located the volume to resize * - UUID of the volume to resize * - New size of the volume (in KB) * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_volume_resize(const struct VrtVolumeResize *cmd) { struct vrt_group *group; struct vrt_volume *volume; int ret; EXA_ASSERT(cmd->volume_size > 0); group = vrt_get_group_from_uuid(&cmd->group_uuid); if (! group) return -VRT_ERR_UNKNOWN_GROUP_UUID; volume = vrt_group_find_volume(group, &cmd->volume_uuid); if (! volume) { vrt_group_unref(group); return -VRT_ERR_UNKNOWN_VOLUME_UUID; } /* !!! All sizes in 'cmd' are in KB and VRT internal functions want sizes in * sectors. */ ret = vrt_volume_resize(volume, KBYTES_2_SECTORS(cmd->volume_size), group->storage); if (ret != EXA_SUCCESS) { vrt_group_unref(group); return ret; } vrt_group_unref(group); return ret; }
/**
 * Tell whether two IQN filters are equal, i.e. whether they have equal
 * patterns and the same policy.
 *
 * @param[in] filter1  First filter (must not be NULL)
 * @param[in] filter2  Second filter (must not be NULL)
 *
 * @return true if the filters are equal, false otherwise
 */
bool iqn_filter_is_equal(const iqn_filter_t *filter1,
                         const iqn_filter_t *filter2)
{
    const iqn_t *pattern1, *pattern2;
    iqn_filter_policy_t policy1, policy2;

    EXA_ASSERT(filter1 != NULL);
    EXA_ASSERT(filter2 != NULL);

    pattern1 = iqn_filter_get_pattern(filter1);
    pattern2 = iqn_filter_get_pattern(filter2);

    policy1 = iqn_filter_get_policy(filter1);
    policy2 = iqn_filter_get_policy(filter2);

    if (!iqn_is_equal(pattern1, pattern2))
        return false;

    return policy1 == policy2;
}
/** * Check whether we're seen as coordinator by all the nodes in our clique. * NOTE: This function assumes that we see ourself as coord. * * \return true if the whole clique agrees that we are coord, false otherwise */ static bool clique_sees_self_as_coord(void) { exa_nodeid_t node_id; for (node_id = 0; node_id < self->view.num_seen; node_id++) if (exa_nodeset_contains(&self->view.clique, node_id)) { sup_node_t *node = sup_cluster_node(&cluster, node_id); EXA_ASSERT(node); if (!self_sees(node)) return false; if (!exa_nodeset_equals(&node->view.clique, &self->view.clique)) return false; if (node->view.coord != self->view.coord) return false; } __trace("clique_sees_self_as_coord -> YES"); return true; }
/**
 * Ask the local NBD to unlock writes on a range of sectors for a
 * given device. It only works on local devices.
 *
 * @param[in] rdev   The real device to unlock (must be local)
 * @param[in] start  The starting sector
 * @param[in] size   The size in sectors
 *
 * @return EXA_SUCCESS on success, an error code on failure
 */
int vrt_rdev_unlock_sectors(struct vrt_realdev *rdev,
                            unsigned long start, unsigned long size)
{
    EXA_ASSERT(rdev_is_local (rdev));

    /* FALSE selects unlocking (as opposed to locking) */
    return vrt_msg_nbd_lock(&rdev->nbd_uuid, start, size, FALSE);
}
/**
 * Release the SPC-2 reservation of a LUN on behalf of a session.
 *
 * The release is refused when SPC-2 reserve can't be used on the LUN by
 * this session, or when the LUN is reserved by another session. Releasing
 * a LUN that is not reserved succeeds.
 *
 * @param[in,out] context      Persistent reservation context
 * @param[in]     session_id   Session asking for the release; must be
 *                             lower than MAX_GLOBAL_SESSION
 * @param[in]     lun          LUN to release
 * @param[out]    scsi_status  Not used by this function
 *
 * @return true if the LUN is no longer reserved, false on conflict
 */
static bool persistent_spc2_release_lun(pr_context_t *context, int session_id,
                                        lun_t lun,
                                        scsi_command_status_t *scsi_status)
{
    pr_info_t *pr_info = &context->pr_info[lun];

    EXA_ASSERT(session_id < MAX_GLOBAL_SESSION);

    if (!can_use_spc2_reserve(context, lun, session_id))
    {
        exalog_warning("iSCSI PR conflict: session %i cannot release SPC-2 "
                       "reservation on LUN %" PRIlun ", SPC-2 reserve not "
                       "possible on this LUN", session_id, lun);
        return false;
    }

    /* Only the session holding the reservation may release it */
    if (pr_info->spc2_reserve != SPC2_RESERVE_NONE
        && pr_info->spc2_reserve != session_id)
    {
        exalog_warning("iSCSI PR conflict: session %i cannot release SPC-2 "
                       "reservation on LUN %" PRIlun ", LUN already reserved "
                       "by %i", session_id, lun, pr_info->spc2_reserve);
        return false;
    }

    pr_info->spc2_reserve = SPC2_RESERVE_NONE;

    exalog_debug("iSCSI PR: session %i released SPC-2 reservation on LUN %"
                 PRIlun, session_id, lun);

    return true;
}