/**
 * Move a rain1 group to its next synchronization tag.
 *
 * Each rdev that is UP and whose tag equals the group's current tag receives
 * the new tag as well. Afterwards, any rdev whose tag is not comparable with
 * the new tag is reset to SYNC_TAG_BLANK. The group tag is updated last.
 *
 * @param[in,out] rxg  The rain1 group whose tag is advanced
 */
static void rain1_update_sync_tag(rain1_group_t *rxg)
{
    sync_tag_t next_tag = sync_tag_inc(rxg->sync_tag);
    rain1_realdev_t *lr;
    int i;

    foreach_rainx_rdev(rxg, lr, i)
    {
        /* Only the devices that are UP and carry the group's current tag
         * follow the group to the new tag. */
        if (!rdev_is_up(lr->rdev)
            || !sync_tag_is_equal(lr->sync_tag, rxg->sync_tag))
        {
            exalog_debug("NOT marking rdev "UUID_FMT" with new sync_tag.",
                         UUID_VAL(&lr->uuid));
        }
        else
        {
            exalog_debug("Marking rdev "UUID_FMT" with new sync_tag %"PRIsync_tag,
                         UUID_VAL(&lr->uuid), next_tag);
            lr->sync_tag = next_tag;
        }

        /* A device so outdated that its tag cannot even be compared with the
         * new one is treated as blank; otherwise tag wrapping could make a
         * very old device look up to date. */
        if (!sync_tags_are_comparable(lr->sync_tag, next_tag))
            lr->sync_tag = SYNC_TAG_BLANK;
    }

    exalog_debug("Marking group with new sync_tag %"PRIsync_tag, next_tag);
    rxg->sync_tag = next_tag;
}
/** * Stop an active group * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to stop * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_stop(const struct VrtGroupStop *cmd) { struct vrt_group *group; int ret; exalog_debug("stop group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); group = vrt_get_group_from_uuid(&cmd->group_uuid); if (!group) { exalog_debug("unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_GROUP_NOT_STARTED; } /* removing group from group list prevent other user to be able to get * new references on this group */ vrt_groups_list_del(group); ret = vrt_group_stop(group); if (ret != EXA_SUCCESS) { vrt_group_unref(group); return ret; } /* No need to call (and can't anyway) vrt_group_unref() because the group has been freed */ return EXA_SUCCESS; }
/** * Start a volume. * * @param[in] cmd : A struct VrtVolumeStart * * @return EXA_SUCCESS or a negative error code */ static int vrt_cmd_volume_start(const struct VrtVolumeStart *cmd) { struct vrt_group *group; struct vrt_volume *volume; int error; group = vrt_get_group_from_uuid(&cmd->group_uuid); if (!group) return -VRT_ERR_GROUP_NOT_STARTED; volume = vrt_group_find_volume(group, &cmd->volume_uuid); if (! volume) { exalog_debug("volume '" UUID_FMT "' not found in group '" UUID_FMT "'", UUID_VAL(&cmd->volume_uuid), UUID_VAL(&group->uuid)); vrt_group_unref(group); return -VRT_ERR_UNKNOWN_VOLUME_UUID; } error = vrt_volume_start(volume); vrt_group_unref(group); return error; }
/* parts to handle devices events, cases REBUILD and RECOVER are missing */ static int vrt_cmd_device_event(const struct VrtDeviceEvent *event_msg) { int retval = -EINVAL; struct vrt_group *group; struct vrt_realdev *rdev; group = vrt_get_group_from_uuid(&event_msg->group_uuid); if (!group) { exalog_debug("group " UUID_FMT " not found", UUID_VAL(&event_msg->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } rdev = storage_get_rdev(group->storage, &event_msg->rdev_uuid); if (!rdev) { exalog_debug("rdev " UUID_FMT " not found", UUID_VAL(&event_msg->rdev_uuid)); return -VRT_ERR_OLD_RDEVS_MISSING; } switch(event_msg->event) { case VRT_DEVICE_DOWN: retval = vrt_group_rdev_down(group, rdev); break; case VRT_DEVICE_UP: retval = vrt_group_rdev_up(group, rdev); break; case VRT_DEVICE_REINTEGRATE: retval = vrt_group_reintegrate_rdev(group, rdev); break; case VRT_DEVICE_POST_REINTEGRATE: retval = vrt_group_post_reintegrate_rdev(group, rdev); break; default : EXA_ASSERT_VERBOSE(0, "struct VrtDeviceEvent: Unknown event type %d\n", event_msg->event); } vrt_group_unref(group); return retval; }
/** * Unfreeze a group: resume IO previously blocked by freeze * * @param[in] cmd The msg containing parameters for freezing the group * * @return 0 on success, a negative error code on failure */ int vrt_cmd_group_unfreeze(const struct VrtGroupUnfreeze *cmd) { struct vrt_group *group; int i; group = vrt_get_group_from_uuid(&cmd->group_uuid); if(!group) { exalog_debug("Unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } for (i = 0 ; i < NBMAX_VOLUMES_PER_GROUP ; i++) { struct vrt_volume *volume = group->volumes[i]; if (! volume) continue; volume->frozen = FALSE; wake_up_all(&volume->frozen_req_wq); } vrt_group_unref(group); return EXA_SUCCESS; }
/** * Freeze a group: block incoming IO and wait current IO to be finished * * @param[in] cmd The msg containing parameters for freezing the group * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_freeze(const struct VrtGroupFreeze *cmd) { struct vrt_group *group; int i; group = vrt_get_group_from_uuid(&cmd->group_uuid); if(!group) { exalog_debug("Unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } for (i = 0 ; i < NBMAX_VOLUMES_PER_GROUP ; i++) { struct vrt_volume *volume = group->volumes[i]; if (! volume) continue; volume->frozen = TRUE; wait_event(volume->cmd_wq, os_atomic_read(&volume->inprogress_request_count) == 0); } vrt_group_unref(group); return EXA_SUCCESS; }
/** * Create a volume in a given group * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group in which the volume has to be created * - Name of the volume to create * - UUID of the volume to create * - Size of the volume to create (in KB) * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_volume_create(const struct VrtVolumeCreate *cmd) { vrt_group_t *group; vrt_volume_t *volume; int ret; EXA_ASSERT(cmd->volume_size > 0); exalog_debug("create volume '%s': size %" PRIu64 " KB", cmd->volume_name, cmd->volume_size); group = vrt_get_group_from_uuid(&cmd->group_uuid); if (group == NULL) { exalog_debug("Unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } /* !!! All sizes in 'cmd' are in KB and VRT internal functions want sizes in * sectors. */ ret = vrt_group_create_volume(group, &volume, &cmd->volume_uuid, cmd->volume_name, KBYTES_2_SECTORS(cmd->volume_size)); if (ret != EXA_SUCCESS) { exalog_error("Can't create volume '%s' in group '%s': %s(%d)", cmd->volume_name, group->name, exa_error_msg(ret), ret); vrt_group_unref(group); return ret; } EXA_ASSERT(volume != NULL); /* wipe the newly created volume * * FIXME: This code is called from all the clients while it should be done * only once. To do so we should add a new RPC and trigger the wipping from * admind. */ /* Let only one node (the first one) do the wipe */ if (vrt_node_get_upnode_id() == 0) { ret = vrt_group_wipe_volume(group, volume); if (ret != EXA_SUCCESS) { exalog_error("Can't wipe volume '%s' in group '%s': %s(%d)", volume->name, group->name, exa_error_msg(ret), ret); /* Rollback volume creation */ vrt_group_delete_volume(group, volume); vrt_group_unref(group); return ret; } } vrt_group_unref(group); return EXA_SUCCESS; }
/** * Finalize the insertion of a new rdev in a group. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to insert the new rdev in * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_insert_rdev(const struct VrtGroupInsertRdev *cmd) { int ret; exalog_debug("finish adding rdev in group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("You are trying to insert an rdev into a group " "while a dgcreate, dgstart or dgdiskadd is running."); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } ret = vrt_group_insert_rdev(pending_group, &cmd->uuid, &cmd->nbd_uuid, cmd->node_id, cmd->spof_id, cmd->local, cmd->old_sb_version, cmd->new_sb_version); os_free(pending_group); os_thread_mutex_unlock(&pending_group_lock); return ret; }
static int vrt_cmd_device_replace(const struct VrtDeviceReplace *cmd) { struct vrt_group *group; struct vrt_realdev *rdev; int ret; group = vrt_get_group_from_uuid(&cmd->group_uuid); if (group == NULL) return -VRT_ERR_UNKNOWN_GROUP_UUID; rdev = storage_get_rdev(group->storage, &cmd->vrt_uuid); if (rdev == NULL) { exalog_error("Cannot find vrt UUID " UUID_FMT " in group '%s'", UUID_VAL(&cmd->vrt_uuid), group->name); vrt_group_unref(group); return -VRT_ERR_NO_SUCH_RDEV_IN_GROUP; } if (!vrt_group_supports_device_replacement(group)) { exalog_error("Group '%s' (layout '%s') does not support disk replacement", group->name, group->layout->name); vrt_group_unref(group); return -VRT_ERR_DISK_REPLACEMENT_NOT_SUPPORTED; } ret = vrt_group_rdev_replace(group, rdev, &cmd->rdev_uuid); vrt_group_unref(group); return ret; }
int vrt_rdev_open(struct vrt_realdev *rdev) { int err; rdev->blockdevice = client_get_blockdevice(&rdev->nbd_uuid); if (rdev->blockdevice == NULL) { exalog_error("Could not open device "UUID_FMT, UUID_VAL(&rdev->uuid)); return -ENODEV; } err = vrt_rdev_open_superblock_streams(rdev); if (err != 0) return err; if (rdev->up) { uint64_t size = __get_block_aligned_bdev_size(rdev->blockdevice); err = vrt_rdev_set_real_size(rdev, size); if (err != 0) return err; } return EXA_SUCCESS; }
/** * Synchronize the broken field of each disk with the content of the table * of broken disks. Request an NBD recovery if the status of one or several * disks changed. */ static void rdev_update_disks(void) { struct adm_node *node; adm_cluster_for_each_node(node) { struct adm_disk *disk; adm_node_for_each_disk(node, disk) { /* FIXME there is no check that the uuid in broken_disks are actually part * of the cluster */ bool broken = broken_disk_table_contains(broken_disks, &disk->uuid); bool missing = disk->local != NULL && !disk->local->reachable; if (broken == disk->broken) continue; exalog_info("%s:"UUID_FMT" (%s) is %s%s", node->name, UUID_VAL(&disk->uuid), disk->path, broken ? "broken" : "not broken", missing ? ", missing" : ""); disk->broken = broken; if (disk->broken) { rdev_stop_disk(disk, node); inst_set_resources_changed_down(&adm_service_nbd); } else inst_set_resources_changed_up(&adm_service_nbd); } }
int vrt_rdev_replace(struct vrt_realdev *rdev, const exa_uuid_t *new_rdev_uuid) { blockdevice_t *new_blockdevice; int err; if (rdev_is_ok(rdev)) { exalog_error("Bad rdev status %d", rdev_get_compound_status(rdev)); return -VRT_ERR_CANT_DGDISKRECOVER; } new_blockdevice = client_get_blockdevice(new_rdev_uuid); if (new_blockdevice == NULL) { exalog_error("Could not open device "UUID_FMT, UUID_VAL(new_rdev_uuid)); return -ENODEV; } if (__get_block_aligned_bdev_size(new_blockdevice) < __get_block_aligned_bdev_size(rdev->blockdevice)) return -VRT_ERR_RDEV_TOO_SMALL; rdev->blockdevice = new_blockdevice; /* re-open the superblock stream */ vrt_rdev_close_superblock_streams(rdev); err = vrt_rdev_open_superblock_streams(rdev); if (err != 0) return err; uuid_copy(&rdev->nbd_uuid, new_rdev_uuid); return EXA_SUCCESS; }
void rain1_compute_status(struct vrt_group *group) { rain1_group_t *lg = RAIN1_GROUP(group); struct assembly_group *ag = &lg->assembly_group; size_t nb_not_corrected_spofs; rain1_realdev_t *lr; int i; /* Compute the status of the group. * This status depends on the number of SPOF groups that are not working * fine (down or outdated). */ nb_not_corrected_spofs = 0; for (i = 0; i < group->storage->num_spof_groups; i++) { if (rain1_spof_group_has_defect(lg, &group->storage->spof_groups[i])) nb_not_corrected_spofs++; } if (nb_not_corrected_spofs == 0) { group->status = EXA_GROUP_OK; exalog_debug("Status of group '%s' is OK", group->name); #ifdef WITH_MONITORING md_client_notify_diskgroup_ok(vrt_msg_handle, &group->uuid, group->name); #endif /* FIXME: There could be no 'not_corrected' spof groups while an * updating operation is on-going. In this case, we should have * the group status set to EXA_GROUP_REBUILDING ... if we keep using * EXA_GROUP_REBUILDING. */ } else if (nb_not_corrected_spofs == 1) { group->status = EXA_GROUP_DEGRADED; exalog_debug("Status of group '%s' is DEGRADED", group->name); #ifdef WITH_MONITORING md_client_notify_diskgroup_degraded(vrt_msg_handle, &group->uuid, group->name); #endif } else { group->status = EXA_GROUP_OFFLINE; exalog_debug("Status of group '%s' is OFFLINE (%" PRIzu " SPOFs not corrected)", group->name, nb_not_corrected_spofs); #ifdef WITH_MONITORING md_client_notify_diskgroup_offline(vrt_msg_handle, &group->uuid, group->name); #endif } exalog_debug("Clearing Rebuild context for each of the group "UUID_FMT" rdevs", UUID_VAL(&group->uuid)); /* Cleanup previous rebuilding status */ foreach_rainx_rdev(lg, lr, i) { rain1_rdev_clear_rebuild_context(lr); }
/** * Command to register a real device in a group. * It must be called after group_begin() and before * group_create() or group_start(). * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group in which the real device has to be added * - UUID of the real device in the VRT * - UUID of the real device in the NBD * - Whether the disk is local or not * - Whether the disk is UP or not * - Whether the device properties must be loaded from disk * * @return EXA_SUCCESS on success, negative error code on failure. */ static int vrt_cmd_group_add_rdev(const struct VrtGroupAddRdev *cmd) { vrt_rdev_info_t *rdev_info; os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("Failed to edit group " UUID_FMT ", group " UUID_FMT " is already being edited.", UUID_VAL(&cmd->group_uuid), UUID_VAL(&pending_group->uuid)); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } if (pending_group->nb_rdevs == NBMAX_DISKS_PER_GROUP) { os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } rdev_info = &pending_group->rdevs[pending_group->nb_rdevs]; uuid_copy(&rdev_info->uuid, &cmd->uuid); uuid_copy(&rdev_info->nbd_uuid, &cmd->nbd_uuid); rdev_info->node_id = cmd->node_id; rdev_info->spof_id = cmd->spof_id; rdev_info->local = cmd->local; rdev_info->up = cmd->up; pending_group->nb_rdevs++; os_thread_mutex_unlock(&pending_group_lock); return EXA_SUCCESS; }
static int vrt_cmd_group_event(const struct VrtGroupEvent *event_msg) { int retval = -EINVAL; struct vrt_group *group; group = vrt_get_group_from_uuid(&event_msg->group_uuid); if (!group) { exalog_debug("group " UUID_FMT " not found", UUID_VAL(&event_msg->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } switch(event_msg->event) { case VRT_GROUP_RESUME: retval = vrt_group_resume(group); break; case VRT_GROUP_SUSPEND_METADATA_AND_REBUILD: vrt_group_metadata_thread_suspend(group); vrt_group_rebuild_thread_suspend(group); retval = 0; break; case VRT_GROUP_RESUME_METADATA_AND_REBUILD: vrt_group_metadata_thread_resume(group); vrt_group_rebuild_thread_resume(group); retval = 0; break; case VRT_GROUP_COMPUTESTATUS: retval = vrt_group_compute_status(group); break; case VRT_GROUP_WAIT_INITIALIZED_REQUESTS: vrt_group_wait_initialized_requests (group); retval = EXA_SUCCESS; break; case VRT_GROUP_POSTRESYNC: retval = vrt_group_post_resync(group); break; default : EXA_ASSERT_VERBOSE(0, "struct VrtGroupEvent: Unknown event type %d\n", event_msg->event); } vrt_group_unref(group); return retval; }
/**
 * Allocate and initialize a new real device.
 *
 * @param node_id   ID of the node where the device is attached
 * @param spof_id   ID of the SPOF the device belongs to
 * @param uuid      UUID of the real device in the VRT
 * @param nbd_uuid  UUID of the real device in the NBD
 * @param index     Index stored in the new real device
 * @param local     true if the rdev is local
 * @param up        true if the device is considered as UP
 *
 * @return the new vrt_realdev, or NULL if the allocation failed
 */
struct vrt_realdev *vrt_rdev_new(exa_nodeid_t node_id, spof_id_t spof_id,
                                 const exa_uuid_t *uuid,
                                 const exa_uuid_t *nbd_uuid,
                                 int index, bool local, bool up)
{
    struct vrt_realdev *rdev;

    exalog_debug("adding rdev " UUID_FMT ": status = %s",
                 UUID_VAL(uuid), up ? "UP" : "DOWN");

    rdev = os_malloc(sizeof(struct vrt_realdev));
    if (!rdev)
        return NULL;

    memset(rdev, 0, sizeof(struct vrt_realdev));

    /* FIXME: the fields should be initialized in the same order as they
     * appear in the structure definition. */

    /* Identity */
    uuid_copy(&rdev->uuid, uuid);
    uuid_copy(&rdev->nbd_uuid, nbd_uuid);

    /* Location */
    rdev->node_id = node_id;
    rdev->spof_id = spof_id;
    rdev->index = index;
    rdev->local = local;

    /* Device status */
    rdev->up = up;
    rdev->corrupted = FALSE;
    rdev->real_size = 0;

    /* Chunks */
    rdev->chunks.chunk_size = 0;
    rdev->chunks.total_chunks_count = 0;
    rdev->chunks.free_chunks_count = 0;
    rdev->chunks.free_chunks = NULL;

    /* Superblock streams: none is open yet */
    rdev->raw_sb_stream = NULL;
    rdev->sb_data_streams[0] = NULL;
    rdev->sb_data_streams[1] = NULL;
    rdev->checksum_sb_streams[0] = NULL;
    rdev->checksum_sb_streams[1] = NULL;

    return rdev;
}
/** * Sync metadata on disk. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_sync_sb(const struct VrtGroupSyncSb *cmd) { struct vrt_group *group; int ret; group = vrt_get_group_from_uuid(&cmd->group_uuid); if(!group) { exalog_error("Unknown group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); return -VRT_ERR_UNKNOWN_GROUP_UUID; } ret = vrt_group_sync_sb(group, cmd->old_sb_version, cmd->new_sb_version); vrt_group_unref(group); return ret; }
/**
 * Record the real size of a device.
 *
 * The device must be up and must have a block device (both are asserted).
 * The size is refused when __usable_size() reports nothing usable for it.
 *
 * @param[in,out] rdev  The real device
 * @param[in]     size  The size to record; logged as a number of sectors
 *
 * @return EXA_SUCCESS on success, -VRT_ERR_RDEV_TOO_SMALL if the size leaves
 *         no usable space
 */
int vrt_rdev_set_real_size(struct vrt_realdev *rdev, uint64_t size)
{
    EXA_ASSERT(rdev->up);
    EXA_ASSERT(rdev->blockdevice != NULL);

    if (__usable_size(size) > 0)
    {
        rdev->real_size = size;
        return EXA_SUCCESS;
    }

    exalog_error("rdev " UUID_FMT " is too small (%" PRIu64 " sectors) to store the superblocks",
                 UUID_VAL(&rdev->uuid), size);

    return -VRT_ERR_RDEV_TOO_SMALL;
}
/** * Begin group creation/starting command. Such a command must be * followed by several group_add_rdev() commands to add the real * devices in the group, and finally by a group_create() or * group_start() command. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - Name of the group to create or start * - Its UUID * - Name of the layout to use * * @return EXA_SUCCESS on success, negative error code on failure */ static int vrt_cmd_group_begin(const struct VrtGroupBegin *cmd) { vrt_group_t *group; exalog_debug("begin group '%s': UUID='" UUID_FMT "' layout='%s'", cmd->group_name, UUID_VAL(& cmd->group_uuid), cmd->layout); /* Check if the group is already started */ group = vrt_get_group_from_uuid(&cmd->group_uuid); if(group) { EXA_ASSERT(strcmp(cmd->group_name, group->name)==0 ); vrt_group_unref(group); return -VRT_INFO_GROUP_ALREADY_STARTED; } /* check if the group name is already used (this should not happen if * admind XML parsing is correct... */ group = vrt_get_group_from_name(cmd->group_name); if (group != NULL) { vrt_group_unref(group); return -VRT_ERR_GROUPNAME_USED; } os_thread_mutex_lock(&pending_group_lock); if (pending_group) os_free(pending_group); pending_group = os_malloc(sizeof(vrt_group_info_t)); vrt_group_info_init(pending_group); os_strlcpy(pending_group->name, cmd->group_name, sizeof(pending_group->name)); os_strlcpy(pending_group->layout_name, cmd->layout, sizeof(pending_group->layout_name)); uuid_copy(&pending_group->uuid, &cmd->group_uuid); pending_group->sb_version = cmd->sb_version; os_thread_mutex_unlock(&pending_group_lock); return EXA_SUCCESS; }
/** * Tells that a unusable device is now available again. We'll put it * in the EXA_REALDEV_UPDATING state, which doesn't mean we can safely * use it, but that a rebuilding process must take place before * changing the status to EXA_REALDEV_OK. * * @param[in] rdev The real device which is now available again * * @return always EXA_SUCCESS */ int vrt_rdev_up(struct vrt_realdev *rdev) { uint64_t size; uint64_t required_size; int err; /* Admind can send an up message even if the device is not down */ if (rdev_is_ok(rdev)) return EXA_SUCCESS; rdev->up = TRUE; size = __get_block_aligned_bdev_size(rdev->blockdevice); err = vrt_rdev_set_real_size(rdev, size); if (err != EXA_SUCCESS) return err; /* Check that the size of the device correspond to the real size of the * device. * This test is also done at group start (see vrt_group_start) * * FIXME: This kind of verification could be done by the service in * charge of the devices */ required_size = rdev_chunk_based_size(rdev); if (vrt_realdev_get_usable_size(rdev) < required_size) { /* XXX Duplicate: same error in vrt_group_start() */ exalog_error("Real size of device "UUID_FMT" is too small: %"PRIu64" < %"PRIu64, UUID_VAL(&rdev->uuid), vrt_realdev_get_usable_size(rdev), required_size); rdev->corrupted = TRUE; rdev->real_size = 0; return EXA_SUCCESS; } rdev->corrupted = FALSE; return EXA_SUCCESS; }
/** * Prepare an updating operation * * @param[in] ag assembly group * @param[in] lg rain1 group data * @param[in] spof_group The SPOF group to update */ static void rain1_group_prepare_updating(const struct assembly_group *ag, const rain1_group_t *lg, const spof_group_t *spof_group) { uint32_t i; for (i = 0; i < spof_group->nb_realdevs; i++) { struct vrt_realdev *rdev = spof_group->realdevs[i]; struct rain1_realdev *lr = RAIN1_REALDEV(lg, rdev); if (rdev_is_ok(rdev) && !rain1_rdev_is_uptodate(lr, lg->sync_tag)) { exalog_debug("New updating: update disk: index = %d, UUID = " UUID_FMT, rdev->index, UUID_VAL(&rdev->uuid)); rain1_rdev_init_rebuild_context(lr, EXA_RDEV_REBUILD_UPDATING, lg->sync_tag); } } }
int unexport_device(const exa_uuid_t *uuid) { device_t *dev = find_device_from_uuid(uuid); if (dev == NULL) { exalog_error("can not remove unknown device with UUID = " UUID_FMT, UUID_VAL(uuid)); return -CMD_EXP_ERR_UNKNOWN_DEVICE; } os_thread_mutex_lock(&nbd_server.mutex_edevs); /* ask the thread to terminate */ dev->exit_thread = true; /* prevent any new IO to be put in device IO list */ nbd_server.devices[dev->dev_index] = NULL; os_thread_mutex_unlock(&nbd_server.mutex_edevs); /* now we can join, because with the nbd_close_list() * we can assume was the disk thread will reach a cancelation point */ os_thread_join(nbd_server.td_pid[dev->dev_index]); /* close the list used to disk queue */ nbd_close_list(&dev->disk_queue); /* get back all header in the kernel exa_rdev to the free list and close the device */ if (dev->handle != NULL) exa_rdev_handle_free(dev->handle); /* close the semaphore used by the disk */ os_sem_destroy(&dev->lock_sem_disk); /* free used memory for the device */ os_free(dev); return EXA_SUCCESS; }
/** * Finalize the start of a group. * * @param[in] params The parsed command array * * The real parameters passed in the array are: * - UUID of the group to start * * @return 0 on success, a negative error code on failure */ static int vrt_cmd_group_start(const struct VrtGroupStart *cmd) { vrt_group_t *group; int ret; exalog_debug("start group " UUID_FMT, UUID_VAL(&cmd->group_uuid)); os_thread_mutex_lock(&pending_group_lock); if (!pending_group) { os_thread_mutex_unlock(&pending_group_lock); return -EPERM; } if (!uuid_is_equal(&cmd->group_uuid, &pending_group->uuid)) { exalog_error("You are trying to create or start two groups at the same time."); os_thread_mutex_unlock(&pending_group_lock); return -EAGAIN; } ret = vrt_group_start(pending_group, &group); if (ret == EXA_SUCCESS) { ret = vrt_groups_list_add(group); if (ret != EXA_SUCCESS) vrt_group_stop(group); } os_free(pending_group); os_thread_mutex_unlock(&pending_group_lock); return ret; }
static void disk_checking_thread(void *dummy) { exalog_as(EXAMSG_RDEV_ID); while (!quit) { int rdev_need_check = false; struct adm_disk *disk; adm_node_lock_disk_removal(); adm_node_for_each_disk(adm_myself(), disk) { if (disk->local->rdev_req != NULL) { int state, last_state; last_state = disk->local->state; state = exa_rdev_test(disk->local->rdev_req, rdev_check_buffer, RDEV_SUPERBLOCK_SIZE); /* if exa_rdev_test returns an error, the disk is considered in failure * as we have no mean to know what really happened. */ if (state < 0) { exalog_error("testing rdev '%s' " UUID_FMT " failed: %s (%d)", disk->path, UUID_VAL(&disk->uuid), exa_error_msg(state), state); state = EXA_RDEV_STATUS_FAIL; } if (state != last_state) { if (state == EXA_RDEV_STATUS_FAIL) rdev_need_check = true; disk->local->state = state; } } } adm_node_unlock_disk_removal(); if (quit) break; if (rdev_need_check) { instance_event_msg_t msg; int ret; msg.any.type = EXAMSG_EVMGR_INST_EVENT; msg.event.id = EXAMSG_RDEV_ID; msg.event.state = INSTANCE_CHECK_DOWN; msg.event.node_id = adm_myself()->id; exalog_info("... broadcasting action: rdev check down"); ret = examsgSend(mh, EXAMSG_ADMIND_EVMGR_ID, EXAMSG_ALLHOSTS, &msg, sizeof(msg)); EXA_ASSERT(ret == sizeof(msg)); } os_sleep(DISK_CHECK_INTERVAL); } }
void rebuild_helper_thread(void *p) { ExamsgHandle mh; int err; exalog_as(EXAMSG_NBD_SERVER_ID); /* initialize examsg framework */ mh = examsgInit(EXAMSG_NBD_LOCKING_ID); EXA_ASSERT(mh != NULL); err = examsgAddMbox(mh, EXAMSG_NBD_LOCKING_ID, 1, 5 * EXAMSG_MSG_MAX); EXA_ASSERT(err == 0); os_sem_post(&nbd_server.mailbox_sem); while (nbd_server.run) { device_t *device; ExamsgNbdLock nbd_lock_msg; ExamsgMID from; struct timeval timeout = { .tv_sec = 0, .tv_usec = 100000 }; exa_nodeset_t dest_nodes; err = examsgWaitTimeout(mh, &timeout); /* Just in order to check stopping the thread is required*/ if (err == -ETIME) continue; if (err != 0) { exalog_error("Locking thread encountered error %s (%d) while " "waiting in event loop.", exa_error_msg(err), err); continue; } err = examsgRecv(mh, &from, &nbd_lock_msg, sizeof(nbd_lock_msg)); /* No message */ if (err == 0) continue; if (err < 0) { exalog_error("Locking thread encountered error %s (%d) while " "receiving a messsage.", exa_error_msg(err), err); continue; } switch(nbd_lock_msg.any.type) { case EXAMSG_NBD_LOCK: /* find device from name */ /* FIXME devices lock is not held... it should */ device = find_device_from_uuid(&nbd_lock_msg.disk_uuid); if (device == NULL) { exalog_error("Unknown device with UUID " UUID_FMT, UUID_VAL(&nbd_lock_msg.disk_uuid)); err = -CMD_EXP_ERR_UNKNOWN_DEVICE; break; } if (nbd_lock_msg.lock) { err = exa_disk_lock_zone(device, nbd_lock_msg.locked_zone_start, nbd_lock_msg.locked_zone_size); EXA_ASSERT_VERBOSE(err == 0, "Trying to lock too many zone " "(>%d). 
Last zone not succesfully locked " "(start = %" PRId64 ", size = %" PRId64 " ) " "on device UUID " UUID_FMT, NBMAX_DISK_LOCKED_ZONES, nbd_lock_msg.locked_zone_start, nbd_lock_msg.locked_zone_size, UUID_VAL(&nbd_lock_msg.disk_uuid)); } else { err = exa_disk_unlock_zone(device, nbd_lock_msg.locked_zone_start, nbd_lock_msg.locked_zone_size); EXA_ASSERT_VERBOSE(err == 0, "Trying to unlock a never locked " "zone (unlocked zone start =%" PRId64 ", " "unlocked zone size = %" PRId64 ") on device" " UUID " UUID_FMT, nbd_lock_msg.locked_zone_start, nbd_lock_msg.locked_zone_size, UUID_VAL(&nbd_lock_msg.disk_uuid)); } break; default: /* error */ EXA_ASSERT_VERBOSE(false, "Locking thread got unknown message of" " type %d ", nbd_lock_msg.any.type); break; } exa_nodeset_single(&dest_nodes, from.netid.node); examsgAckReply(mh, (Examsg *)&nbd_lock_msg, err, from.id, &dest_nodes); } examsgDelMbox(mh, EXAMSG_NBD_LOCKING_ID); examsgExit(mh); } /** get the number of sector of the device * \param device_path the device to get the number of sector * \param nb_sectors64 the number of sectors of the device * \return nb_sectors the returned number of sector */ static int get_nb_sectors(const char *device_path, uint64_t *nb_sectors) { uint64_t device_size; /* in bytes */ int retval; int fd; /* We need the read access to get the size. 
*/ if ((fd = os_disk_open_raw(device_path, OS_DISK_READ)) < 0) { exalog_error("cannot open device '%s' error=%s ", device_path, exa_error_msg(-fd)); return -CMD_EXP_ERR_OPEN_DEVICE; } retval = os_disk_get_size(fd, &device_size); if (retval < 0) { exalog_error("os_disk_get_size() error=%s", exa_error_msg(retval)); if (close(fd) != 0) exalog_error("can't EVEN close dev '%s'", device_path); return -EXA_ERR_IOCTL; } retval = close(fd); if (retval < 0) { retval = -errno; exalog_error("cannot close device '%s' error=%s ", device_path, exa_error_msg(retval)); return -CMD_EXP_ERR_CLOSE_DEVICE; } *nb_sectors = device_size / SECTOR_SIZE; /* remove the size of the reserved area for storing admind info */ *nb_sectors -= RDEV_RESERVED_AREA_IN_SECTORS; /* Align the size on 1K * this is the best we can do to have the same size of devices on 2.4 and 2.6 kernels due to * the fact that kernel 2.4 rounds the size of devices with 1 K */ *nb_sectors -= *nb_sectors % (1024 / SECTOR_SIZE); return EXA_SUCCESS; }
int adm_vrt_group_sync_sb(int thr_nb, struct adm_group *group) { struct { bool group_is_started; bool can_write; bool have_disk_in_group; } info, reply; exa_nodeid_t nid; bool group_is_started_somewhere = false; int ret; int barrier_ret = EXA_SUCCESS; admwrk_request_t rpc; struct adm_disk *disk; int nb_nodes_with_writable_disks = 0; int nb_nodes_with_disks_in_group = 0; uint64_t old_sb_version, new_sb_version; COMPILE_TIME_ASSERT(sizeof(info) <= ADM_MAILBOX_PAYLOAD_PER_NODE); /* XXX maybe checking started is useless as administrable => started * and !administrable => return */ info.group_is_started = group->started; info.can_write = false; info.have_disk_in_group = false; adm_group_for_each_disk(group, disk) { if (disk->node_id == adm_my_id) { info.have_disk_in_group = true; if (disk->up_in_vrt) info.can_write = true; } } admwrk_bcast(thr_nb, &rpc, EXAMSG_SERVICE_VRT_SB_SYNC, &info, sizeof(info)); while (admwrk_get_bcast(&rpc, &nid, &reply, sizeof(reply), &ret)) { if (ret == -ADMIND_ERR_NODE_DOWN) { barrier_ret = -ADMIND_ERR_NODE_DOWN; continue; } EXA_ASSERT(ret == EXA_SUCCESS); if (reply.can_write) nb_nodes_with_writable_disks++; if (reply.have_disk_in_group) nb_nodes_with_disks_in_group++; if (reply.group_is_started) group_is_started_somewhere = true; } if (barrier_ret != EXA_SUCCESS) return barrier_ret; /* do not write superblocks if the group is stopped on all nodes */ if (!group_is_started_somewhere) return EXA_SUCCESS; if (nb_nodes_with_writable_disks < quotient_ceil64(nb_nodes_with_disks_in_group, 2)) return -VRT_ERR_GROUP_NOT_ADMINISTRABLE; old_sb_version = sb_version_get_version(group->sb_version); new_sb_version = sb_version_new_version_prepare(group->sb_version); if (group->started) { ret = vrt_client_group_sync_sb(adm_wt_get_localmb(), &group->uuid, old_sb_version, new_sb_version); EXA_ASSERT_VERBOSE(ret == EXA_SUCCESS || ret == -ADMIND_ERR_NODE_DOWN, "Synchronization of superblocks failed for group '%s' " "UUID=" UUID_FMT ": %s (%d)", group->name, 
UUID_VAL(&group->uuid), exa_error_msg(ret), ret); } else ret = EXA_SUCCESS; barrier_ret = admwrk_barrier(thr_nb, ret, "VRT: Preparing superblocks version"); if (barrier_ret != EXA_SUCCESS) return barrier_ret; sb_version_new_version_done(group->sb_version); barrier_ret = admwrk_barrier(thr_nb, EXA_SUCCESS, "VRT: Writing superblocks version"); /* Commit anyway, If we are here, we are sure that other nodes have done the * job too even if they crashed meanwhile */ sb_version_new_version_commit(group->sb_version); return barrier_ret; }
static void get_info_from_params(const struct dgcreate_params *params, struct dgcreate_info *info, cl_error_desc_t *err_desc) { xmlDocPtr config; xmlNodePtr diskgroup_ptr; xmlAttrPtr attr; int i; EXA_ASSERT(params); EXA_ASSERT(info); EXA_ASSERT(err_desc); config = params->config; memset(info, 0, sizeof(*info)); diskgroup_ptr = xml_conf_xpath_singleton(config, "/Exanodes/diskgroup"); uuid_generate(&info->uuid); /* 0 means that the slot width will be automagically computed */ info->slot_width = 0; info->chunk_size = adm_cluster_get_param_int("default_chunk_size"); info->su_size = adm_cluster_get_param_int("default_su_size"); info->dirty_zone_size = adm_cluster_get_param_int("default_dirty_zone_size"); info->blended_stripes = false; info->nb_disks = 0; info->nb_spare = VRT_DEFAULT_NB_SPARES; info->layout[0] = '\0'; for (attr = diskgroup_ptr->properties; attr != NULL; attr = attr->next) { if (xmlStrEqual(attr->name, BAD_CAST("name"))) strlcpy(info->name, xml_get_prop(diskgroup_ptr, "name"), EXA_MAXSIZE_GROUPNAME + 1); else if (xmlStrEqual(attr->name, BAD_CAST("layout"))) strlcpy(info->layout, xml_get_prop(diskgroup_ptr, "layout"), EXA_MAXSIZE_LAYOUTNAME + 1); else if (xmlStrEqual(attr->name, BAD_CAST("slot_width"))) { if (xml_get_uint_prop(diskgroup_ptr, "slot_width", &info->slot_width, err_desc) != 0) return; /* NOTE User can not give a zero value * If slot_width is not provided, we pass zero * to vrt so that it can calculate the proper slot_width */ if (info->slot_width == 0) { set_error(err_desc, -EXA_ERR_XML_GET, "slot_width must be greater than zero"); return; } } else if (xmlStrEqual(attr->name, BAD_CAST("chunk_size"))) { if (xml_get_uint_prop(diskgroup_ptr, "chunk_size", &info->chunk_size, err_desc) != 0) return; } else if (xmlStrEqual(attr->name, BAD_CAST("su_size"))) { if (xml_get_uint_prop(diskgroup_ptr, "su_size", &info->su_size, err_desc) != 0) return; } else if (xmlStrEqual(attr->name, BAD_CAST("dirty_zone_size"))) { if (xml_get_uint_prop(diskgroup_ptr, 
"dirty_zone_size", &info->dirty_zone_size, err_desc) != 0) return; } else if (xmlStrEqual(attr->name, BAD_CAST("blended_stripes"))) { if (xml_get_uint_prop(diskgroup_ptr, "blended_stripes", &info->blended_stripes, err_desc) != 0) return; } else if (xmlStrEqual(attr->name, BAD_CAST("nb_spare"))) { if (xml_get_uint_prop(diskgroup_ptr, "nb_spare", &info->nb_spare, err_desc) != 0) return; } else if (!xmlStrEqual(attr->name, BAD_CAST("cluster"))) { set_error(err_desc, -EXA_ERR_XML_GET, "Unknown group property '%s'", (char *)attr->name); return; } } /* Check the group name */ if (info->name == NULL || info->name[0] == '\0') { set_error(err_desc, -EXA_ERR_INVALID_PARAM, NULL); return; } /* Check if a group with that name already exist */ if (adm_group_get_group_by_name(info->name) != NULL) { set_error(err_desc, -VRT_ERR_GROUPNAME_USED, NULL); return; } if (info->layout[0] == '\0') { set_error(err_desc, -EXA_ERR_XML_GET, NULL); return; } if (params->alldisks) { struct adm_node *node; adm_cluster_for_each_node(node) { struct adm_disk *disk; adm_node_for_each_disk(node, disk) { if (uuid_is_zero(&disk->group_uuid)) { if (disk->path[0] == '\0') { set_error(err_desc, -ADMIND_ERR_UNKNOWN_DISK, "disk " UUID_FMT " is unknown", UUID_VAL(&disk->uuid)); return; } if (info->nb_disks >= NBMAX_DISKS_PER_GROUP) { set_error(err_desc, -ADMIND_ERR_TOO_MANY_DISKS_IN_GROUP, "too many disks in group (> %d)", NBMAX_DISKS_PER_GROUP); return; } uuid_copy(&info->disks[info->nb_disks], &disk->uuid); info->nb_disks++; } } } } else {