static void get_data_oid(char *sheep, uint64_t oid, struct sd_obj_rsp *rsp, char *buf, void *data) { struct get_data_oid_info *info = data; struct sheepdog_inode *inode = (struct sheepdog_inode *)buf; switch (rsp->result) { case SD_RES_SUCCESS: if (info->success) break; info->success = 1; if (inode->data_vdi_id[info->idx]) info->data_oid = vid_to_data_oid(inode->data_vdi_id[info->idx], info->idx); break; case SD_RES_NO_OBJ: break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: printf("the node list has changed, try again\n"); break; default: printf("%s: hit an expected error, %d\n", sheep, rsp->result); break; } }
/*
 * Write 'bnode' to the data object at slot 'idx' of 'inode' and, when a new
 * object is being created, record the slot in the inode as well.
 *
 * @bnode:  bucket node to store; its 'oid' field is filled in here
 * @inode:  inode whose vdi id owns the data object
 * @idx:    slot index inside the inode
 * @create: passed through to sd_write_object(); when false the inode is
 *          left untouched
 *
 * Returns SD_RES_SUCCESS or the error code of the failing write.
 */
static int bnode_do_create(struct kv_bnode *bnode, struct sd_inode *inode,
			   uint32_t idx, bool create)
{
	uint32_t owner_vid = inode->vdi_id;
	uint64_t data_oid = vid_to_data_oid(owner_vid, idx);
	int err;

	bnode->oid = data_oid;

	err = sd_write_object(data_oid, (char *)bnode, sizeof(*bnode), 0,
			      create);
	if (err != SD_RES_SUCCESS) {
		sd_err("failed to create object, %" PRIx64, data_oid);
		return err;
	}

	/* Overwriting an existing object: the inode already has the slot. */
	if (!create)
		return err;

	sd_inode_set_vid(inode, idx, owner_vid);
	err = sd_inode_write_vid(inode, idx, owner_vid, owner_vid, 0, false,
				 false);
	if (err != SD_RES_SUCCESS)
		sd_err("failed to update inode, %" PRIx64, vid_to_vdi_oid(owner_vid));

	return err;
}
static void bucket_iterater(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct bucket_iterater_arg *biarg = arg; struct kv_bnode bnode; uint64_t oid; int ret; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (!ext->vdi_id) return; oid = vid_to_data_oid(ext->vdi_id, ext->idx); ret = sd_read_object(oid, (char *)&bnode, sizeof(bnode), 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read data object %"PRIx64, oid); return; } if (bnode.name[0] == 0) return; if (biarg->cb) biarg->cb(bnode.name, biarg->opaque); biarg->bucket_count++; biarg->object_count += bnode.object_count; biarg->bytes_used += bnode.bytes_used; } }
static void bucket_iterater(struct sd_index *idx, void *arg, int ignore) { struct bucket_iterater_arg *biarg = arg; struct kv_bnode bnode; uint64_t oid; int ret; if (!idx->vdi_id) return; oid = vid_to_data_oid(idx->vdi_id, idx->idx); ret = sd_read_object(oid, (char *)&bnode, sizeof(bnode), 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read data object %"PRIx64, oid); return; } if (bnode.name[0] == 0) return; if (biarg->cb) biarg->cb(bnode.name, biarg->opaque); biarg->bucket_count++; biarg->object_count += bnode.object_count; biarg->bytes_used += bnode.bytes_used; }
static int onode_do_create(struct kv_onode *onode, struct sd_inode *inode, uint32_t idx, bool create) { uint32_t vid = inode->vdi_id; uint64_t oid = vid_to_data_oid(vid, idx), len; int ret; onode->oid = oid; if (onode->inlined) len = onode->size; else len = sizeof(struct onode_extent) * onode->nr_extent; ret = sd_write_object(oid, (char *)onode, BLOCK_SIZE + len, 0, create); if (ret != SD_RES_SUCCESS) { sd_err("failed to create object, %" PRIx64, oid); goto out; } if (!create) goto out; INODE_SET_VID(inode, idx, vid); ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, vid, vid, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64, vid_to_vdi_oid(vid)); goto out; } out: return ret; }
static void object_iterater(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct object_iterater_arg *oiarg = arg; struct kv_onode *onode = NULL; uint64_t oid; int ret; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (!ext->vdi_id) goto out; onode = xmalloc(SD_DATA_OBJ_SIZE); oid = vid_to_data_oid(ext->vdi_id, ext->idx); ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read data object %"PRIx64, oid); goto out; } if (onode->name[0] == '\0') goto out; if (oiarg->cb) oiarg->cb(onode->name, oiarg->opaque); oiarg->count++; } out: free(onode); }
static int get_data_oid(char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data) { struct get_data_oid_info *info = data; struct sheepdog_inode *inode = (struct sheepdog_inode *)buf; switch (rsp->result) { case SD_RES_SUCCESS: if (info->success) break; info->success = 1; if (inode->data_vdi_id[info->idx]) { info->data_oid = vid_to_data_oid(inode->data_vdi_id[info->idx], info->idx); return 1; } break; case SD_RES_NO_OBJ: break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: fprintf(stderr, "The node list has changed: please try again\n"); break; default: fprintf(stderr, "%s: hit an unexpected error (%d)\n", sheep, rsp->result); break; } return 0; }
static int onode_create(struct kv_onode *onode, uint32_t bucket_vid) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); uint32_t tmp_vid, idx; uint64_t hval, i; int ret; bool create = true; sys->cdrv->lock(bucket_vid); ret = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read %" PRIx32 " %s", bucket_vid, sd_strerror(ret)); goto out; } hval = sd_hash(onode->name, strlen(onode->name)); for (i = 0; i < MAX_DATA_OBJS; i++) { idx = (hval + i) % MAX_DATA_OBJS; tmp_vid = INODE_GET_VID(inode, idx); if (tmp_vid) { uint64_t oid = vid_to_data_oid(bucket_vid, idx); char name[SD_MAX_OBJECT_NAME] = { }; ret = sd_read_object(oid, name, sizeof(name), 0); if (ret != SD_RES_SUCCESS) goto out; if (name[0] == 0) { create = false; goto create; } } else break; } if (i == MAX_DATA_OBJS) { ret = SD_RES_NO_SPACE; goto out; } create: ret = onode_do_create(onode, inode, idx, create); out: free(inode); sys->cdrv->unlock(bucket_vid); return ret; }
/* * Check if object by name exists in a bucket and init 'onode' if it exists. * * Return SD_RES_SUCCESS if found, SD_RES_NO_OBJ if not found. * * We check adjacent objects one by one once we get a start index by hashing * name. Unallocated slot marks the end of the check window. * * For e.g, if we are going to check if fish in the following bucket, assume * fish hashes to 'sheep', so we compare the name one by one from 'sheep' to * 'fish'. '\0' indicates that object was deleted before checking. * * [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ] */ static int onode_lookup(struct kv_onode *onode, uint32_t ovid, const char *name) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); uint32_t tmp_vid, idx; uint64_t hval, i; int ret; sys->cdrv->lock(ovid); ret = sd_read_object(vid_to_vdi_oid(ovid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read %" PRIx32 " %s", ovid, sd_strerror(ret)); goto out; } hval = sd_hash(name, strlen(name)); for (i = 0; i < MAX_DATA_OBJS; i++) { idx = (hval + i) % MAX_DATA_OBJS; tmp_vid = INODE_GET_VID(inode, idx); if (tmp_vid) { uint64_t oid = vid_to_data_oid(ovid, idx); ret = sd_read_object(oid, (char *)onode, sizeof(*onode), 0); if (ret != SD_RES_SUCCESS) goto out; if (strcmp(onode->name, name) == 0) break; } else { ret = SD_RES_NO_OBJ; break; } } if (i == MAX_DATA_OBJS) { ret = SD_RES_NO_OBJ; goto out; } out: free(inode); sys->cdrv->unlock(ovid); return ret; }
static int vdi_read_write(uint32_t vid, char *data, size_t length, off_t offset, bool is_read) { struct sd_req hdr; uint32_t idx = offset / SD_DATA_OBJ_SIZE; uint64_t done = 0; struct request_iocb *iocb; int ret; iocb = local_req_init(); if (!iocb) return SD_RES_SYSTEM_ERROR; offset %= SD_DATA_OBJ_SIZE; while (done < length) { size_t len = min(length - done, SD_DATA_OBJ_SIZE - offset); if (is_read) { sd_init_req(&hdr, SD_OP_READ_OBJ); } else { sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; } hdr.data_length = len; hdr.obj.oid = vid_to_data_oid(vid, idx); hdr.obj.offset = offset; ret = exec_local_req_async(&hdr, data, iocb); if (ret != SD_RES_SUCCESS) sd_err("failed to write object %" PRIx64 ", %s", hdr.obj.oid, sd_strerror(ret)); offset += len; if (offset == SD_DATA_OBJ_SIZE) { offset = 0; idx++; } done += len; data += len; } return local_req_wait(iocb); }
/*
 * Find the bucket node called 'name' in the vdi 'vid' and load it into
 * 'bnode'.
 *
 * Starting at the slot obtained by hashing the name, up to MAX_DATA_OBJS
 * data objects are read one after another until a matching name is found.
 *
 * Returns SD_RES_SUCCESS on a match, SD_RES_NO_OBJ when no slot matches, or
 * the error code of the first failed read.
 */
static int bnode_lookup(struct kv_bnode *bnode, uint32_t vid, const char *name)
{
	uint64_t hval = sd_hash(name, strlen(name));
	uint64_t probe;

	for (probe = 0; probe < MAX_DATA_OBJS; probe++) {
		uint32_t idx = (hval + probe) % MAX_DATA_OBJS;
		uint64_t oid = vid_to_data_oid(vid, idx);
		int ret = sd_read_object(oid, (char *)bnode, sizeof(*bnode), 0);

		if (ret != SD_RES_SUCCESS)
			return ret;
		if (strcmp(bnode->name, name) == 0)
			return ret;
	}

	return SD_RES_NO_OBJ;
}
static int vdi_create(int argc, char **argv) { char *vdiname = argv[optind++]; uint64_t size; uint32_t vid; uint64_t oid; int idx, max_idx, ret; struct sheepdog_inode *inode = NULL; char *buf = NULL; if (!argv[optind]) { fprintf(stderr, "please specify the size of vdi\n"); return EXIT_USAGE; } ret = parse_option_size(argv[optind], &size); if (ret < 0) return EXIT_USAGE; if (size > SD_MAX_VDI_SIZE) { fprintf(stderr, "too big image size, %s\n", argv[optind]); return EXIT_USAGE; } ret = do_vdi_create(vdiname, size, 0, &vid, 0); if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc) goto out; inode = malloc(sizeof(*inode)); buf = zalloc(SD_DATA_OBJ_SIZE); if (!inode || !buf) { fprintf(stderr, "oom\n"); ret = EXIT_SYSFAIL; goto out; } ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { fprintf(stderr, "failed to read a newly created vdi object\n"); ret = EXIT_FAILURE; goto out; } max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE); for (idx = 0; idx < max_idx; idx++) { oid = vid_to_data_oid(vid, idx); ret = sd_write_object(oid, 0, buf, SD_DATA_OBJ_SIZE, 0, 0, inode->nr_copies, 1); if (ret != SD_RES_SUCCESS) { ret = EXIT_FAILURE; goto out; } inode->data_vdi_id[idx] = vid; ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), SD_INODE_HEADER_SIZE + sizeof(vid) * idx, 0, inode->nr_copies, 0); if (ret) { ret = EXIT_FAILURE; goto out; } } ret = EXIT_SUCCESS; out: free(inode); free(buf); return ret; }
/* * Initialize the data vdi * * @vid: the vdi where the allocator resides */ int oalloc_init(uint32_t vid) { struct strbuf buf = STRBUF_INIT; struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); struct header hd = { .nr_free = 1, }; struct free_desc fd = { .start = 1, /* Use first object as the meta object */ .count = MAX_DATA_OBJS - 1, }; int ret; strbuf_add(&buf, &hd, sizeof(hd)); strbuf_add(&buf, &fd, sizeof(fd)); ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode, %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf, buf.len, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("failed to create meta object for %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } sd_inode_set_vid(inode, 0, vid); ret = sd_inode_write_vid(inode, 0, vid, vid, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } out: strbuf_release(&buf); free(inode); return ret; } /* * Allocate the objects and update the free list. * * Callers are expected to call oalloc_new_finish() to update the inode bitmap * after filling up the data. 
* * @vid: the vdi where the allocator resides * @start: start index of the objects to allocate * @count: number of the objects to allocate */ int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count) { char *meta = xvalloc(SD_DATA_OBJ_SIZE); struct header *hd; struct free_desc *fd; uint64_t oid = vid_to_data_oid(vid, 0), i; int ret; ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read meta %" PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } hd = (struct header *)meta; fd = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); for (i = 0; i < hd->nr_free; i++, fd--) { sd_debug("start %"PRIu64", count %"PRIu64, fd->start, fd->count); if (fd->count > count) break; } if (i == hd->nr_free) { ret = SD_RES_NO_SPACE; goto out; } *start = fd->start; fd->start += count; fd->count -= count; hd->used += count; /* Update the meta object */ ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); if (ret != SD_RES_SUCCESS) sd_err("failed to update meta %"PRIx64 ", %s", oid, sd_strerror(ret)); out: free(meta); return ret; } /* * Update the inode map of the vid * * @vid: the vdi where the allocator resides * @start: start index of the objects to update * @count: number of the objects to update */ int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); int ret; ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode, %" PRIx64 ", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } sd_debug("start %"PRIu64" end %"PRIu64, start, start + count - 1); sd_inode_set_vid_range(inode, start, (start + count - 1), vid); ret = sd_inode_write(inode, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } 
out: free(inode); return ret; } static int free_desc_cmp(struct free_desc *a, struct free_desc *b) { return -intcmp(a->start, b->start); } static inline int update_and_merge_free_desc(char *meta, uint64_t start, uint64_t count, uint32_t vid) { struct header *hd = (struct header *)meta; struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd); uint64_t i, j; /* Try our best to merge it in place, or append it to tail */ for (i = 0; i < hd->nr_free; i++) { if (start + count == fd->start) { fd->start = start; fd->count += count; break; } else if(fd->start + fd->count == start) { fd->count +=count; break; } fd++; } if (i == hd->nr_free) { if (hd->nr_free >= MAX_FREE_DESC) return SD_RES_NO_SPACE; tail = (struct free_desc *)(meta + oalloc_meta_length(hd)); tail->start = start; tail->count = count; hd->nr_free++; } hd->used -= count; xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp); /* Merge as hard as we can */ j = hd->nr_free - 1; tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; for (i = 0; i < j; i++, tail--) { struct free_desc *front = tail - 1; sd_debug("start %"PRIu64", count %"PRIu64, tail->start, tail->count); if (tail->start + tail->count > front->start) sd_emerg("bad free descriptor found at %"PRIx32, vid); if (tail->start + tail->count == front->start) { front->start = tail->start; front->count += tail->count; memmove(tail, tail + 1, sizeof(*tail) * i); hd->nr_free--; } } return SD_RES_SUCCESS; } /* * Discard the allocated objects and update the free list of the allocator * * Caller should check the return value since it might fail. 
* * @vid: the vdi where the allocator resides * @start: start index of the objects to free * @count: number of the objects to free */ int oalloc_free(uint32_t vid, uint64_t start, uint64_t count) { char *meta = xvalloc(SD_DATA_OBJ_SIZE); struct header *hd; uint64_t oid = vid_to_data_oid(vid, 0), i; struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); int ret; ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode, %" PRIx64 ", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } sd_debug("discard start %"PRIu64" end %"PRIu64, start, start + count - 1); sd_inode_set_vid_range(inode, start, (start + count - 1), 0); ret = sd_inode_write(inode, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read meta %" PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } ret = update_and_merge_free_desc(meta, start, count, vid); if (ret != SD_RES_SUCCESS) goto out; /* XXX use aio to speed up remove of objects */ for (i = 0; i < count; i++) { struct sd_req hdr; int res; sd_init_req(&hdr, SD_OP_REMOVE_OBJ); hdr.obj.oid = vid_to_data_oid(vid, start + i); res = exec_local_req(&hdr, NULL); /* * return the error code if it does not * success or can't find obj. */ if (res != SD_RES_SUCCESS && res != SD_RES_NO_OBJ) ret = res; } hd = (struct header *)meta; ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update meta %"PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); out: free(meta); free(inode); return ret; }