/*
 * Sanity-check process resource limits and tune glibc's allocator.
 *
 * Warns when RLIMIT_NOFILE is left at the common 1024 default (too low for
 * a Sheepdog cluster) or below SD_RLIM_NOFILE, notes a bounded core-dump
 * size, then pins glibc's mmap threshold so large allocations are returned
 * to the OS on free().
 *
 * Note: rlim_t has no fixed width across platforms, so values are cast to
 * uint64_t and printed with PRIu64 instead of assuming "%lu" matches.
 */
static void check_host_env(void)
{
	struct rlimit r;

	if (getrlimit(RLIMIT_NOFILE, &r) < 0)
		sd_err("failed to get nofile %m");
	/*
	 * 1024 is default for NOFILE on most distributions, which is very
	 * dangerous to run Sheepdog cluster.
	 */
	else if (r.rlim_cur == 1024)
		sd_warn("Allowed open files 1024 too small, suggested %u",
			SD_RLIM_NOFILE);
	else if (r.rlim_cur < SD_RLIM_NOFILE)
		sd_info("Allowed open files %"PRIu64", suggested %u",
			(uint64_t)r.rlim_cur, SD_RLIM_NOFILE);

	if (getrlimit(RLIMIT_CORE, &r) < 0)
		sd_debug("failed to get core %m");
	else if (r.rlim_cur < RLIM_INFINITY)
		sd_debug("Allowed core file size %"PRIu64", "
			 "suggested unlimited", (uint64_t)r.rlim_cur);

	/*
	 * Disable glibc's dynamic mmap threshold and set it as 512k.
	 *
	 * We have to disable dynamic threshold because its inefficiency to
	 * release freed memory back to OS. Setting it as 512k practically means
	 * allocation larger than or equal to 512k will use mmap() for malloc()
	 * and munmap() for free(), guaranteeing allocated memory will not be
	 * cached in the glibc's ptmalloc internal pool.
	 *
	 * 512k is not a well tested optimal value for IO request size, I choose
	 * it because it is default value for disk drive that it can transfer at
	 * a time. So default installation of guest will issue at most 512K
	 * sized request.
	 */
	mallopt(M_MMAP_THRESHOLD, 512 * 1024);
}
/*
 * Wait for all forward requests completion.
 *
 * Even if something goes wrong, we have to wait forward requests completion to
 * avoid interleaved requests.
 *
 * Return error code if any one request fails.
 *
 * Flow: poll the outstanding forwarded sockets, reap one completed response
 * per iteration, and loop (goto again) until wi->nr_sent drops to zero.
 * The pollfd array is rebuilt each iteration because finish_one_write()
 * compacts wi's entries. On unrecoverable timeout, every cached connection
 * is dropped and SD_RES_NETWORK_ERROR is returned.
 */
static int wait_forward_request(struct write_info *wi, struct request *req)
{
	int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i,
	    repeat = MAX_RETRY_COUNT;
	struct pfd_info pi;
	struct sd_rsp *rsp = &req->rp;
again:
	/* Rebuild pollfds from wi: earlier iterations may have compacted it. */
	pfd_info_init(wi, &pi);
	pollret = poll(pi.pfds, pi.nr, 1000 * POLL_TIMEOUT);
	if (pollret < 0) {
		/* Restart the wait when interrupted by a signal. */
		if (errno == EINTR)
			goto again;
		panic("%m");
	} else if (pollret == 0) {
		/*
		 * If IO NIC is down, epoch isn't incremented, so we can't retry
		 * for ever.
		 */
		if (sheep_need_retry(req->rq.epoch) && repeat) {
			repeat--;
			sd_warn("poll timeout %d, disks of some nodes or "
				"network is busy. Going to poll-wait again",
				wi->nr_sent);
			goto again;
		}

		/* Retries exhausted: give up on all outstanding peers. */
		nr_sent = wi->nr_sent;
		/* XXX Blindly close all the connections */
		for (i = 0; i < nr_sent; i++)
			sockfd_cache_del(wi->ent[i].nid, wi->ent[i].sfd);

		return SD_RES_NETWORK_ERROR;
	}

	nr_sent = wi->nr_sent;
	/*
	 * Find the first socket with a readable response; only one completion
	 * is consumed per poll round.
	 *
	 * NOTE(review): a socket reporting only POLLERR/POLLHUP without POLLIN
	 * is not matched here, so such an event leads back to poll() via the
	 * nr_sent > 0 re-loop below — confirm the kernel always pairs these
	 * with POLLIN for this workload.
	 */
	for (i = 0; i < nr_sent; i++)
		if (pi.pfds[i].revents & POLLIN)
			break;
	if (i < nr_sent) {
		int re = pi.pfds[i].revents;
		sd_debug("%d, revents %x", i, re);
		if (re & (POLLERR | POLLHUP | POLLNVAL)) {
			/* Peer error: record failure and drop this entry. */
			err_ret = SD_RES_NETWORK_ERROR;
			finish_one_write_err(wi, i);
			goto finish_write;
		}
		/* Read the response header; failure means the peer vanished. */
		if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp),
			    sheep_need_retry, req->rq.epoch,
			    MAX_RETRY_COUNT)) {
			sd_err("remote node might have gone away");
			err_ret = SD_RES_NETWORK_ERROR;
			finish_one_write_err(wi, i);
			goto finish_write;
		}

		/*
		 * Remember the first/latest failure code but keep waiting for
		 * the remaining peers to avoid interleaved requests.
		 */
		ret = rsp->result;
		if (ret != SD_RES_SUCCESS) {
			sd_err("fail %"PRIx64", %s", req->rq.obj.oid,
			       sd_strerror(ret));
			err_ret = ret;
		}
		finish_one_write(wi, i);
	}
finish_write:
	/* Keep polling until every forwarded request has been reaped. */
	if (wi->nr_sent > 0)
		goto again;

	return err_ret;
}