static int
sli_open_backing_file(struct fidc_membh *f)
{
	int lvl = PLL_DIAG, incr, rc = 0;
	char fidfn[PATH_MAX];

	/*
	 * XXX hit setrlimit: operation not permitted, but no open-fail.
	 * This per-open system call should go!  Hit again due to the
	 * fs.nr_open sysctl limit.
	 */
	incr = psc_rlim_adj(RLIMIT_NOFILE, 1);
	sli_fg_makepath(&f->fcmh_fg, fidfn);
	fcmh_2_fd(f) = open(fidfn, O_CREAT | O_RDWR, 0600);
	if (fcmh_2_fd(f) == -1) {
		rc = errno;
		if (incr)
			psc_rlim_adj(RLIMIT_NOFILE, -1);
		OPSTAT_INCR("open-fail");
		lvl = PLL_WARN;
	} else
		OPSTAT_INCR("open-succeed");
	psclog(lvl, "opened backing file path=%s fd=%d rc=%d",
	    strstr(fidfn, SL_RPATH_FIDNS_DIR), fcmh_2_fd(f), rc);
	return (rc);
}
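/*
 * Example (illustrative, not from this codebase): psc_rlim_adj()
 * above appears to bump the RLIMIT_NOFILE soft limit by one before
 * each open and to roll the bump back when the open fails.  Below is
 * a minimal sketch of that adjust-by-delta pattern using only plain
 * getrlimit()/setrlimit(); the helper name rlim_adj() is invented.
 * Callers pair a successful +1 with a -1 on their failure path,
 * exactly as sli_open_backing_file() does.
 */
#include <sys/resource.h>

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int
rlim_adj(int resource, int adj)
{
	struct rlimit rl;

	if (getrlimit(resource, &rl) == -1)
		return (0);
	rl.rlim_cur += adj;
	if (rl.rlim_max != RLIM_INFINITY && rl.rlim_cur > rl.rlim_max)
		rl.rlim_cur = rl.rlim_max;
	if (setrlimit(resource, &rl) == -1) {
		/* e.g. EPERM, or capped by the fs.nr_open sysctl */
		fprintf(stderr, "setrlimit: %s\n", strerror(errno));
		return (0);
	}
	return (1);	/* caller may need to undo this later */
}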
int
bcr_update_inodeinfo(struct bcrcupd *bcr)
{
	struct fidc_membh *f;
	struct stat stb;
	struct bmap *b;

	b = bcr_2_bmap(bcr);
	f = b->bcm_fcmh;

	if (bcr->bcr_crcup.fg.fg_fid == FID_ANY)
		return (EINVAL);

	psc_assert(bcr->bcr_crcup.fg.fg_fid == f->fcmh_fg.fg_fid);

	if (bcr->bcr_crcup.fg.fg_gen != f->fcmh_fg.fg_gen) {
		OPSTAT_INCR("brcupdate-stale");
		return (ESTALE);
	}
	if ((f->fcmh_flags & FCMH_IOD_BACKFILE) == 0)
		return (EBADF);
	if (fstat(fcmh_2_fd(f), &stb) == -1)
		return (errno);

	/* Used by mds_bmap_crc_update() */
	bcr->bcr_crcup.fsize = stb.st_size;
	bcr->bcr_crcup.nblks = stb.st_blocks;
	bcr->bcr_crcup.utimgen = f->fcmh_sstb.sst_utimgen;
	return (0);
}
uint64_t
mds_bmap_timeotbl_getnextseq(void)
{
	int locked;
	uint64_t hwm;

	locked = reqlock(&mdsBmapTimeoTbl.btt_lock);

	/*
	 * Skip the zero sequence number because the client does not
	 * like it.  More work is needed for when an IOS must decide
	 * whether a smaller sequence number is actually ahead of a
	 * larger one after a wraparound happens.
	 */
	mdsBmapTimeoTbl.btt_maxseq++;
	if (mdsBmapTimeoTbl.btt_maxseq == BMAPSEQ_ANY) {
		OPSTAT_INCR("seqno-wrap");
		mdsBmapTimeoTbl.btt_maxseq = 1;
	}

	hwm = mdsBmapTimeoTbl.btt_maxseq;
	mds_bmap_timeotbl_journal_seqno();

	ureqlock(&mdsBmapTimeoTbl.btt_lock, locked);
	return (hwm);
}
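/*
 * Example (illustrative, not from this codebase): the wraparound
 * caveat in the comment above is the classic serial-number-arithmetic
 * problem.  A common fix is an RFC 1982-style circular comparison,
 * valid as long as live sequence numbers never diverge by 2^63 or
 * more; seq_cmp() below is a hypothetical helper, not part of the
 * codebase.
 */
#include <stdint.h>
#include <stdio.h>

static int
seq_cmp(uint64_t a, uint64_t b)
{
	if (a == b)
		return (0);
	/*
	 * The unsigned-to-signed conversion is two's complement on
	 * every platform this style of code targets.
	 */
	return ((int64_t)(a - b) > 0 ? 1 : -1);
}

int
main(void)
{
	/* After a wrap, 2 is "ahead of" UINT64_MAX - 1: prints 1. */
	printf("%d\n", seq_cmp(2, UINT64_MAX - 1));
	/* Ordinary case: prints -1. */
	printf("%d\n", seq_cmp(5, 9));
	return (0);
}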
size_t
pfl_odt_allocslot(struct pfl_odt *t)
{
	struct pfl_odt_hdr *h;
	size_t item;

	h = t->odt_hdr;
	spinlock(&t->odt_lock);
	if (psc_vbitmap_next(t->odt_bitmap, &item) <= 0) {
		ODT_STAT_INCR(t, full);
		freelock(&t->odt_lock);
		return (-1);
	}
	if (item >= h->odth_nitems) {
		ODT_STAT_INCR(t, extend);
		OPSTAT_INCR("pfl.odtable-resize");
		/*
		 * psc_vbitmap_next() has enlarged the bitmap.  Update
		 * the number of items accordingly and write it to
		 * disk.
		 */
		h->odth_nitems = psc_vbitmap_getsize(t->odt_bitmap);
		t->odt_ops.odtop_resize(t);	/* slm_odt_resize() */
		PFLOG_ODT(PLL_WARN, t,
		    "odtable now has %u items (used to be %zd)",
		    h->odth_nitems, item);
	}
	freelock(&t->odt_lock);
	return (item);
}
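/*
 * Example (illustrative, not from this codebase): a toy version of
 * the find-first-free-slot-and-grow pattern that psc_vbitmap_next()
 * implements above -- on exhaustion the map is enlarged and the
 * first index of the new region is handed back, which the caller
 * detects (item >= old size) and persists.  All names are invented.
 */
#include <stdlib.h>
#include <string.h>

struct toy_bitmap {
	unsigned char	*used;		/* one byte per slot, for clarity */
	size_t		 nslots;
};

static size_t
toy_bitmap_next(struct toy_bitmap *bm)
{
	size_t i, old;

	for (i = 0; i < bm->nslots; i++)
		if (!bm->used[i]) {
			bm->used[i] = 1;
			return (i);
		}
	/* Full: double the map and take the first new slot. */
	old = bm->nslots;
	bm->nslots = old ? old * 2 : 8;
	bm->used = realloc(bm->used, bm->nslots);
	if (bm->used == NULL)
		abort();		/* toy code: no recovery path */
	memset(bm->used + old, 0, bm->nslots - old);
	bm->used[old] = 1;
	return (old);
}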
void
sli_fcmh_dtor(__unusedx struct fidc_membh *f)
{
	struct fcmh_iod_info *fii;

	if (f->fcmh_flags & FCMH_IOD_BACKFILE) {
		if (close(fcmh_2_fd(f)) == -1) {
			OPSTAT_INCR("close-fail");
			DEBUG_FCMH(PLL_ERROR, f,
			    "dtor/close errno=%d", errno);
		} else
			OPSTAT_INCR("close-succeed");
		psc_rlim_adj(RLIMIT_NOFILE, -1);
		f->fcmh_flags &= ~FCMH_IOD_BACKFILE;
	}
	if (f->fcmh_flags & FCMH_IOD_DIRTYFILE) {
		fii = fcmh_2_fii(f);
		lc_remove(&sli_fcmh_dirty, fii);
		f->fcmh_flags &= ~FCMH_IOD_DIRTYFILE;
	}
}
int
slm_rmc_handle_getbmap(struct pscrpc_request *rq)
{
	const struct srm_leasebmap_req *mq;
	struct srm_leasebmap_rep *mp;
	struct fidc_membh *f;
	int rc = 0;

	SL_RSX_ALLOCREP(rq, mq, mp);
	if (mq->rw == SL_WRITE)
		OPSTAT_INCR("getbmap-lease-write");
	else if (mq->rw == SL_READ)
		OPSTAT_INCR("getbmap-lease-read");
	else {
		mp->rc = -EINVAL;
		return (0);
	}

	mp->rc = -slm_fcmh_get(&mq->fg, &f);
	if (mp->rc)
		return (0);

	mp->flags = mq->flags;

	mp->rc = mds_bmap_load_cli(f, mq->bmapno, mq->flags, mq->rw,
	    mq->prefios[0], &mp->sbd, rq->rq_export, mp->repls, 0);
	if (mp->rc)
		PFL_GOTOERR(out, mp->rc);

	if (mp->flags & SRM_LEASEBMAPF_GETINODE)
		slm_pack_inode(f, &mp->ino);

 out:
	fcmh_op_done(f);
	return (rc ? rc : mp->rc);
}
/*
 * Get the specified bmap.
 * @f: fcmh.
 * @n: bmap number.
 * @rw: access mode.
 * @flags: retrieval parameters.
 * @bp: value-result bmap pointer.
 * Notes: returns the bmap referenced and locked.
 */
int
_bmap_get(const struct pfl_callerinfo *pci, struct fidc_membh *f,
    sl_bmapno_t n, enum rw rw, int flags, struct bmap **bp)
{
	int rc = 0, new_bmap, bmaprw = 0;
	struct bmap *b;

	if (bp)
		*bp = NULL;

	if (rw)
		bmaprw = rw == SL_WRITE ? BMAPF_WR : BMAPF_RD;

	new_bmap = flags & BMAPGETF_CREATE;
	b = bmap_lookup_cache(f, n, bmaprw, &new_bmap);
	if (b == NULL) {
		rc = ENOENT;
		goto out;
	}
	if (flags & BMAPGETF_NONBLOCK) {
		if (b->bcm_flags & BMAPF_LOADING)
			goto out;
	} else
		bmap_wait_locked(b, b->bcm_flags & BMAPF_LOADING);

	if (b->bcm_flags & BMAPF_LOADED)
		goto loaded;

	if (flags & BMAPGETF_NORETRIEVE) {
		if (b->bcm_flags & BMAPF_LOADED)
			OPSTAT_INCR("bmap-already-loaded");
		else
			OPSTAT_INCR("bmap-not-yet-loaded");
		goto out;
	}

	b->bcm_flags |= BMAPF_LOADING;
	DEBUG_BMAP(PLL_DIAG, b, "loading bmap; flags=%d", flags);
	BMAP_ULOCK(b);

	/* msl_bmap_retrieve(), iod_bmap_retrieve(), mds_bmap_read() */
	rc = sl_bmap_ops.bmo_retrievef(b, flags);

	BMAP_LOCK(b);
	if (flags & BMAPGETF_NONBLOCK) {
		if (rc)
			b->bcm_flags &= ~BMAPF_LOADING;
		goto out;
	}
	b->bcm_flags &= ~BMAPF_LOADING;
	if (!rc) {
		b->bcm_flags |= BMAPF_LOADED;
		bmap_wake_locked(b);
	}

 loaded:
	/*
	 * Early bailout should be safe.  There is only one place the
	 * client will do a bmap lookup, and in that code path we just
	 * add the DIO flag to the bmap.  See msrcm_handle_bmapdio().
	 */
	if (rc || !bmaprw)
		goto out;

	/*
	 * Others wishing to access this bmap in the same mode must
	 * wait until MODECHNG ops have completed.  If the desired
	 * mode is present then a thread may proceed without blocking
	 * here so long as it only accesses structures which pertain
	 * to its mode.
	 */
	if (flags & BMAPGETF_NONBLOCK) {
		if (b->bcm_flags & BMAPF_MODECHNG)
			goto out;
	} else
		bmap_wait_locked(b, b->bcm_flags & BMAPF_MODECHNG);

	/*
	 * Not all lookups are done with the intent of changing the
	 * bmap mode, i.e. bmap_lookup() does not specify a rw value.
	 */
	if (!(bmaprw & b->bcm_flags) && sl_bmap_ops.bmo_mode_chngf) {
		psc_assert(!(b->bcm_flags & BMAPF_MODECHNG));
		b->bcm_flags |= BMAPF_MODECHNG;
		DEBUG_BMAP(PLL_DIAG, b, "mode change (rw=%d)", rw);
		BMAP_ULOCK(b);
		psc_assert(rw == SL_WRITE || rw == SL_READ);
		/* client only: call msl_bmap_modeset() */
		rc = sl_bmap_ops.bmo_mode_chngf(b, rw, flags);
		BMAP_LOCK(b);
	}

 out:
	if (b) {
		DEBUG_BMAP(rc && (rc != SLERR_BMAP_INVALID ||
		    (flags & BMAPGETF_NOAUTOINST) == 0) ?
		    PLL_ERROR : PLL_DIAG, b, "grabbed rc=%d", rc);
		if (rc)
			bmap_op_done(b);
		else
			*bp = b;
	}
	return (rc);
}
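/*
 * Example (illustrative, not from this codebase): the
 * BMAPF_LOADING/BMAPF_LOADED dance above is a load-once gate -- the
 * first thread marks the object as loading, drops the lock for the
 * slow retrieval, then wakes all waiters.  A minimal pthreads sketch
 * of the same pattern; every name here is invented.
 */
#include <pthread.h>
#include <stdbool.h>

struct cached_obj {
	pthread_mutex_t	 mtx;
	pthread_cond_t	 cv;
	bool		 loading;
	bool		 loaded;
	int		 data;
};

static void
slow_retrieve(struct cached_obj *o)
{
	o->data = 42;			/* stands in for remote I/O */
}

static void
obj_get(struct cached_obj *o)
{
	pthread_mutex_lock(&o->mtx);
	while (o->loading)		/* someone else is loading */
		pthread_cond_wait(&o->cv, &o->mtx);
	if (!o->loaded) {
		o->loading = true;
		pthread_mutex_unlock(&o->mtx);	/* drop lock for slow I/O */
		slow_retrieve(o);
		pthread_mutex_lock(&o->mtx);
		o->loading = false;
		o->loaded = true;
		pthread_cond_broadcast(&o->cv);
	}
	pthread_mutex_unlock(&o->mtx);
}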
/*
 * Look up and optionally create a new bmap structure.
 * @f: file's bmap tree to search.
 * @n: bmap index number to search for.
 * @new_bmap: whether to allow creation and also value-result of
 *	whether it was newly created or not.
 */
struct bmap *
bmap_lookup_cache(struct fidc_membh *f, sl_bmapno_t n, int bmaprw,
    int *new_bmap)
{
	struct bmap lb, *b, *bnew = NULL;
	int doalloc;

	doalloc = *new_bmap;
	lb.bcm_bmapno = n;

 restart:
	if (bnew)
		pfl_rwlock_wrlock(&f->fcmh_rwlock);
	else
		pfl_rwlock_rdlock(&f->fcmh_rwlock);
	b = RB_FIND(bmaptree, &f->fcmh_bmaptree, &lb);
	if (b) {
		if (!BMAP_TRYLOCK(b)) {
			pfl_rwlock_unlock(&f->fcmh_rwlock);
			usleep(10);
			goto restart;
		}
		if (b->bcm_flags & BMAPF_TOFREE) {
			/*
			 * This bmap is going away; wait for it so we
			 * can reload it.
			 */
			DEBUG_BMAP(PLL_DIAG, b, "wait on to-free bmap");
			BMAP_ULOCK(b);
			/*
			 * We don't want to spin if we are waiting for
			 * a flush to clear.
			 */
			psc_waitq_waitrelf_us(&f->fcmh_waitq,
			    PFL_LOCKPRIMT_RWLOCK, &f->fcmh_rwlock, 100);
			goto restart;
		}
		bmap_op_start_type(b, BMAP_OPCNT_LOOKUP);
	}
	if (doalloc == 0 || b) {
		pfl_rwlock_unlock(&f->fcmh_rwlock);
		if (bnew)
			psc_pool_return(bmap_pool, bnew);
		*new_bmap = 0;
		OPSTAT_INCR("bmapcache.hit");
		return (b);
	}
	if (bnew == NULL) {
		pfl_rwlock_unlock(&f->fcmh_rwlock);
		if (sl_bmap_ops.bmo_reapf)
			sl_bmap_ops.bmo_reapf();
		bnew = psc_pool_get(bmap_pool);
		goto restart;
	}
	b = bnew;

	OPSTAT_INCR("bmapcache.miss");
	*new_bmap = 1;
	memset(b, 0, bmap_pool->ppm_master->pms_entsize);
	INIT_PSC_LISTENTRY(&b->bcm_lentry);
	INIT_SPINLOCK(&b->bcm_lock);
	psc_atomic32_set(&b->bcm_opcnt, 0);
	b->bcm_fcmh = f;
	b->bcm_bmapno = n;

	/*
	 * Signify that the bmap is newly initialized and therefore
	 * may not contain certain structures.
	 */
	psc_assert(bmaprw == BMAPF_RD || bmaprw == BMAPF_WR);
	b->bcm_flags = bmaprw;

	bmap_op_start_type(b, BMAP_OPCNT_LOOKUP);

	/*
	 * Perform app-specific substructure initialization, which is
	 * msl_bmap_init(), iod_bmap_init(), or mds_bmap_init().
	 */
	sl_bmap_ops.bmo_init_privatef(b);

	/* Add to the fcmh's bmap cache. */
	PSC_RB_XINSERT(bmaptree, &f->fcmh_bmaptree, b);
	pfl_rwlock_unlock(&f->fcmh_rwlock);

	fcmh_op_start_type(f, FCMH_OPCNT_BMAP);

	BMAP_LOCK(b);
	return (b);
}
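/*
 * Example (illustrative, not from this codebase): the function above
 * looks up under a read lock, allocates a new entry outside any
 * lock, then retakes the lock as a writer and re-checks so that a
 * racing inserter wins cleanly and the loser returns its allocation.
 * A compact sketch of that shape with a pthreads rwlock and a linked
 * list standing in for the red-black tree; all names are invented.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	int		 key;
	struct entry	*next;
};

struct cache {
	pthread_rwlock_t lock;
	struct entry	*head;
};

static struct entry *
cache_find(struct cache *c, int key)
{
	struct entry *e;

	for (e = c->head; e; e = e->next)
		if (e->key == key)
			return (e);
	return (NULL);
}

static struct entry *
cache_lookup_create(struct cache *c, int key)
{
	struct entry *e, *enew = NULL;

 restart:
	if (enew)
		pthread_rwlock_wrlock(&c->lock);
	else
		pthread_rwlock_rdlock(&c->lock);
	e = cache_find(c, key);
	if (e) {
		pthread_rwlock_unlock(&c->lock);
		free(enew);		/* lost the race; drop our copy */
		return (e);
	}
	if (enew == NULL) {
		/* Allocate outside the lock, then retry as writer. */
		pthread_rwlock_unlock(&c->lock);
		enew = calloc(1, sizeof(*enew));
		if (enew == NULL)
			abort();	/* toy code: no recovery path */
		goto restart;
	}
	enew->key = key;
	enew->next = c->head;
	c->head = enew;
	pthread_rwlock_unlock(&c->lock);
	return (enew);
}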
/*
 * Update the high-level app stat(2)-like attribute buffer for a FID
 * cache member.
 * @f: FID cache member to update.
 * @sstb: incoming stat attributes.
 * @flags: behavioral flags.
 * Notes:
 * (1) If SAVELOCAL has been specified, save local field values:
 *	(o) file size
 *	(o) mtime
 * (2) This function should only be used by a client.
 */
void
slc_fcmh_setattrf(struct fidc_membh *f, struct srt_stat *sstb,
    int flags)
{
	uidmap_int_stat(sstb);

	if (flags & FCMH_SETATTRF_HAVELOCK)
		FCMH_LOCK_ENSURE(f);
	else
		FCMH_LOCK(f);
	if (fcmh_2_gen(f) == FGEN_ANY)
		fcmh_2_gen(f) = sstb->sst_gen;

	if ((FID_GET_INUM(fcmh_2_fid(f))) != SLFID_ROOT &&
	    fcmh_2_gen(f) > sstb->sst_gen) {
		OPSTAT_INCR("msl.generation-backwards");
		DEBUG_FCMH(PLL_DIAG, f, "attempt to set attr with "
		    "gen %"PRIu64" from old gen %"PRIu64,
		    fcmh_2_gen(f), sstb->sst_gen);
		goto out;
	}

	/*
	 * If we don't have stat attributes, how can we save our local
	 * updates?
	 */
	if ((f->fcmh_flags & FCMH_HAVE_ATTRS) == 0)
		flags |= FCMH_SETATTRF_CLOBBER;

	/*
	 * Always update for roots because we might have faked them
	 * with readdir at the super root.
	 */
	if ((FID_GET_INUM(fcmh_2_fid(f))) == SLFID_ROOT)
		flags |= FCMH_SETATTRF_CLOBBER;

	psc_assert(sstb->sst_gen != FGEN_ANY);
	psc_assert(f->fcmh_fg.fg_fid == sstb->sst_fid);

	/*
	 * The default behavior is to save st_size and st_mtim since
	 * we might have done I/O that the MDS does not know about.
	 */
	if ((flags & FCMH_SETATTRF_CLOBBER) == 0 && fcmh_isreg(f)) {
		/*
		 * If generation numbers match, take the highest of
		 * the values.  Otherwise, disregard local values and
		 * blindly accept whatever the MDS tells us.
		 */
		if (fcmh_2_ptruncgen(f) == sstb->sst_ptruncgen &&
		    fcmh_2_gen(f) == sstb->sst_gen &&
		    fcmh_2_fsz(f) > sstb->sst_size)
			sstb->sst_size = fcmh_2_fsz(f);
		if (fcmh_2_utimgen(f) == sstb->sst_utimgen)
			sstb->sst_mtim = f->fcmh_sstb.sst_mtim;
	}

	COPY_SSTB(sstb, &f->fcmh_sstb);
	f->fcmh_flags |= FCMH_HAVE_ATTRS;
	f->fcmh_flags &= ~FCMH_GETTING_ATTRS;

	if (sl_fcmh_ops.sfop_postsetattr)
		sl_fcmh_ops.sfop_postsetattr(f);

	DEBUG_FCMH(PLL_DEBUG, f, "attr set");

 out:
	if (!(flags & FCMH_SETATTRF_HAVELOCK))
		FCMH_ULOCK(f);
}
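/*
 * Example (illustrative, not from this codebase): the size merge
 * rule above distilled into a pure function -- the local value wins
 * only when every generation counter agrees and the local size is
 * larger (i.e. we wrote past what the MDS knows about); otherwise
 * the server value stands.  The helper name is invented.
 */
#include <stdint.h>

static uint64_t
merge_size(uint64_t local_size, uint64_t srv_size,
    uint64_t local_gen, uint64_t srv_gen,
    uint32_t local_ptruncgen, uint32_t srv_ptruncgen)
{
	if (local_gen == srv_gen &&
	    local_ptruncgen == srv_ptruncgen &&
	    local_size > srv_size)
		return (local_size);
	return (srv_size);
}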
/*
 * If the generation number changes, we assume a full truncation has
 * happened.  We need to open a new backing file and attach it to the
 * fcmh.
 */
int
sli_fcmh_reopen(struct fidc_membh *f, slfgen_t fgen)
{
	int rc = 0;

	FCMH_LOCK_ENSURE(f);

	OPSTAT_INCR("reopen");

	if (fgen == FGEN_ANY) {
		OPSTAT_INCR("generation-bogus");
		return (EBADF);
	}
	if (fgen < fcmh_2_gen(f)) {
		OPSTAT_INCR("generation-stale");
		return (ESTALE);
	}

	/*
	 * If our generation number is still unknown, try to set it
	 * here.
	 */
	if (fcmh_2_gen(f) == FGEN_ANY && fgen != FGEN_ANY) {
		OPSTAT_INCR("generation-fix");
		fcmh_2_gen(f) = fgen;
	}

	if (fgen > fcmh_2_gen(f)) {
		struct sl_fidgen oldfg;
		char fidfn[PATH_MAX];

		DEBUG_FCMH(PLL_DIAG, f, "reopening new backing file");
		OPSTAT_INCR("slvr-remove-reopen");
		slvr_remove_all(f);

		/*
		 * It's possible that pruning all slivers and bmaps
		 * ended up in fcmh_op_done() on our fcmh, so ensure
		 * it is locked upon finishing.
		 */
		FCMH_RLOCK(f);

		/*
		 * Need to reopen the backing file and possibly remove
		 * the old one.
		 */
		if (f->fcmh_flags & FCMH_IOD_BACKFILE) {
			if (close(fcmh_2_fd(f)) == -1) {
				OPSTAT_INCR("close-fail");
				DEBUG_FCMH(PLL_ERROR, f,
				    "reopen/close errno=%d", errno);
			} else
				OPSTAT_INCR("close-succeed");
			fcmh_2_fd(f) = -1;
			psc_rlim_adj(RLIMIT_NOFILE, -1);
			f->fcmh_flags &= ~FCMH_IOD_BACKFILE;
		}

		oldfg.fg_fid = fcmh_2_fid(f);
		oldfg.fg_gen = fcmh_2_gen(f);

		fcmh_2_gen(f) = fgen;

		rc = sli_open_backing_file(f);
		/*
		 * Leave FCMH_IOD_BACKFILE unset on failure so upper
		 * layers notice that open() failed.
		 */
		if (!rc)
			f->fcmh_flags |= FCMH_IOD_BACKFILE;

		/* Do some upfront garbage collection. */
		sli_fg_makepath(&oldfg, fidfn);

		errno = 0;
		unlink(fidfn);
		DEBUG_FCMH(PLL_INFO, f, "upfront unlink(), errno=%d",
		    errno);
	} else if (!(f->fcmh_flags & FCMH_IOD_BACKFILE)) {
		rc = sli_open_backing_file(f);
		if (!rc)
			f->fcmh_flags |= FCMH_IOD_BACKFILE;
		OPSTAT_INCR("generation-same");
	}
	return (rc);
}
void
slm_repl_upd_write(struct bmap *b, int rel)
{
	struct {
		sl_replica_t	 iosv[SL_MAX_REPLICAS];
		char		*stat[SL_MAX_REPLICAS];
		unsigned	 nios;
	} add, del, chg;
	int off, vold, vnew, sprio, uprio, rc;
	struct sl_mds_iosinfo *si;
	struct bmap_mds_info *bmi;
	struct fidc_membh *f;
	struct sl_resource *r;
	sl_ios_id_t resid;
	unsigned n, nrepls;

	bmi = bmap_2_bmi(b);
	f = b->bcm_fcmh;
	sprio = bmi->bmi_sys_prio;
	uprio = bmi->bmi_usr_prio;

	add.nios = 0;
	del.nios = 0;
	chg.nios = 0;
	nrepls = fcmh_2_nrepls(f);
	for (n = 0, off = 0; n < nrepls;
	    n++, off += SL_BITS_PER_REPLICA) {
		if (n == SL_DEF_REPLICAS)
			mds_inox_ensure_loaded(fcmh_2_inoh(f));
		resid = fcmh_2_repl(f, n);
		vold = SL_REPL_GET_BMAP_IOS_STAT(bmi->bmi_orepls, off);
		vnew = SL_REPL_GET_BMAP_IOS_STAT(bmi->bmi_repls, off);

		r = libsl_id2res(resid);
		si = r ? res2iosinfo(r) : &slm_null_iosinfo;

		if (vold == vnew)
			;
		/* Work was added. */
		else if ((vold != BREPLST_REPL_SCHED &&
		    vold != BREPLST_GARBAGE_QUEUED &&
		    vold != BREPLST_GARBAGE_SCHED &&
		    vnew == BREPLST_REPL_QUEUED) ||
		    (vold != BREPLST_GARBAGE_SCHED &&
		     vnew == BREPLST_GARBAGE_QUEUED &&
		     (si->si_flags & SIF_PRECLAIM_NOTSUP) == 0)) {
			OPSTAT_INCR("repl-work-add");
			PUSH_IOS(b, &add, resid, NULL);
		}
		/* Work has finished. */
		else if ((vold == BREPLST_REPL_QUEUED ||
		    vold == BREPLST_REPL_SCHED ||
		    vold == BREPLST_TRUNC_SCHED ||
		    vold == BREPLST_TRUNC_QUEUED ||
		    vold == BREPLST_GARBAGE_SCHED ||
		    vold == BREPLST_VALID) &&
		    (((si->si_flags & SIF_PRECLAIM_NOTSUP) &&
		      vnew == BREPLST_GARBAGE_QUEUED) ||
		     vnew == BREPLST_VALID ||
		     vnew == BREPLST_INVALID)) {
			OPSTAT_INCR("repl-work-del");
			PUSH_IOS(b, &del, resid, NULL);
		}
		/*
		 * Work that was previously scheduled failed, so
		 * requeue it.
		 */
		else if (vold == BREPLST_REPL_SCHED ||
		    vold == BREPLST_GARBAGE_SCHED ||
		    vold == BREPLST_TRUNC_SCHED)
			PUSH_IOS(b, &chg, resid, "Q");
		/* Work was scheduled. */
		else if (vnew == BREPLST_REPL_SCHED ||
		    vnew == BREPLST_GARBAGE_SCHED ||
		    vnew == BREPLST_TRUNC_SCHED)
			PUSH_IOS(b, &chg, resid, "S");
		/* Work was reprioritized. */
		else if (sprio != -1 || uprio != -1)
			PUSH_IOS(b, &chg, resid, NULL);
	}

	for (n = 0; n < add.nios; n++) {
		rc = slm_upsch_insert(b, add.iosv[n].bs_id, sprio,
		    uprio);
		if (!rc)
			continue;
		psclog_warnx("upsch insert failed: bno=%d "
		    "fid=%"PRId64" ios=%d rc=%d",
		    b->bcm_bmapno, bmap_2_fid(b), add.iosv[n].bs_id,
		    rc);
	}

	for (n = 0; n < del.nios; n++) {
		spinlock(&slm_upsch_lock);
		dbdo(NULL, NULL,
		    " DELETE FROM upsch"
		    " WHERE resid = ?"
		    "   AND fid = ?"
		    "   AND bno = ?",
		    SQLITE_INTEGER, del.iosv[n].bs_id,
		    SQLITE_INTEGER64, bmap_2_fid(b),
		    SQLITE_INTEGER, b->bcm_bmapno);
		freelock(&slm_upsch_lock);
	}

	for (n = 0; n < chg.nios; n++) {
		spinlock(&slm_upsch_lock);
		dbdo(NULL, NULL,
		    " UPDATE upsch"
		    " SET status = IFNULL(?, status),"
		    "     sys_prio = IFNULL(?, sys_prio),"
		    "     usr_prio = IFNULL(?, usr_prio)"
		    " WHERE resid = ?"
		    "   AND fid = ?"
		    "   AND bno = ?",
		    chg.stat[n] ? SQLITE_TEXT : SQLITE_NULL,
		    chg.stat[n] ? chg.stat[n] : 0,
		    sprio == -1 ? SQLITE_NULL : SQLITE_INTEGER,
		    sprio == -1 ? 0 : sprio,
		    uprio == -1 ? SQLITE_NULL : SQLITE_INTEGER,
		    uprio == -1 ? 0 : uprio,
		    SQLITE_INTEGER, chg.iosv[n].bs_id,
		    SQLITE_INTEGER64, bmap_2_fid(b),
		    SQLITE_INTEGER, b->bcm_bmapno);
		freelock(&slm_upsch_lock);
	}

	bmap_2_bmi(b)->bmi_sys_prio = -1;
	bmap_2_bmi(b)->bmi_usr_prio = -1;

	if (rel) {
		BMAP_LOCK(b);
		b->bcm_flags &= ~BMAPF_REPLMODWR;
		bmap_wake_locked(b);
		bmap_op_done_type(b, BMAP_OPCNT_WORK);
	}
}
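/*
 * Example (illustrative, not from this codebase): the UPDATE above
 * relies on IFNULL(?, column) so that binding SQL NULL for a
 * parameter leaves that column unchanged, letting one statement act
 * as a partial update.  A standalone demonstration with the stock
 * sqlite3 C API; the demo table and values are invented.
 */
#include <sqlite3.h>
#include <stdio.h>

int
main(void)
{
	sqlite3 *db;
	sqlite3_stmt *st;

	sqlite3_open(":memory:", &db);
	sqlite3_exec(db,
	    "CREATE TABLE upsch_demo (id INTEGER, sys_prio INTEGER);"
	    "INSERT INTO upsch_demo VALUES (1, 5);", NULL, NULL, NULL);

	sqlite3_prepare_v2(db,
	    "UPDATE upsch_demo SET sys_prio = IFNULL(?, sys_prio)"
	    " WHERE id = ?", -1, &st, NULL);
	sqlite3_bind_null(st, 1);	/* NULL: keep the old sys_prio */
	sqlite3_bind_int(st, 2, 1);
	sqlite3_step(st);
	sqlite3_finalize(st);

	sqlite3_prepare_v2(db,
	    "SELECT sys_prio FROM upsch_demo WHERE id = 1",
	    -1, &st, NULL);
	if (sqlite3_step(st) == SQLITE_ROW)
		/* Prints 5: the NULL bind left the column untouched. */
		printf("sys_prio=%d\n", sqlite3_column_int(st, 0));
	sqlite3_finalize(st);
	sqlite3_close(db);
	return (0);
}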
/*
 * Return the index of the given IOS ID or a negative error code on
 * failure.
 */
int
_mds_repl_ios_lookup(int vfsid, struct slash_inode_handle *ih,
    sl_ios_id_t ios, int flag)
{
	int locked, rc;
	struct slm_inox_od *ix = NULL;
	struct sl_resource *res;
	struct fidc_membh *f;
	sl_replica_t *repl;
	uint32_t i, j, nr;
	char buf[LINE_MAX];

	switch (flag) {
	case IOSV_LOOKUPF_ADD:
		OPSTAT_INCR("replicate-add");
		break;
	case IOSV_LOOKUPF_DEL:
		OPSTAT_INCR("replicate-del");
		break;
	case IOSV_LOOKUPF_LOOKUP:
		OPSTAT_INCR("replicate-lookup");
		break;
	default:
		psc_fatalx("Invalid IOS lookup flag %d", flag);
	}

	/*
	 * Can I assume that IOS IDs are nonzero?  If so, I can use
	 * zero to mark a free slot.  See sl_global_id_build().
	 */
	f = inoh_2_fcmh(ih);
	nr = ih->inoh_ino.ino_nrepls;
	repl = ih->inoh_ino.ino_repls;
	locked = INOH_RLOCK(ih);

	psc_assert(nr <= SL_MAX_REPLICAS);
	if (nr == SL_MAX_REPLICAS && flag == IOSV_LOOKUPF_ADD) {
		DEBUG_INOH(PLL_WARN, ih, buf, "too many replicas");
		PFL_GOTOERR(out, rc = -ENOSPC);
	}

	res = libsl_id2res(ios);
	if (res == NULL || !RES_ISFS(res))
		PFL_GOTOERR(out, rc = -SLERR_RES_BADTYPE);

	/* 09/29/2016: hit SLERR_SHORTIO here; needs more investigation. */

	/*
	 * Return ENOENT by default for IOSV_LOOKUPF_DEL and
	 * IOSV_LOOKUPF_LOOKUP.
	 */
	rc = -ENOENT;

	/*
	 * Search the existing replicas to see if the given IOS is
	 * already there.  The following code can step through zero
	 * IOS IDs just fine.
	 */
	for (i = 0, j = 0; i < nr; i++, j++) {
		if (i == SL_DEF_REPLICAS) {
			/*
			 * The first few replicas are in the inode
			 * itself; the rest are in the extra inode
			 * block.
			 */
			rc = mds_inox_ensure_loaded(ih);
			if (rc)
				goto out;
			ix = ih->inoh_extras;
			repl = ix->inox_repls;
			j = 0;
		}
		DEBUG_INOH(PLL_DEBUG, ih, buf, "is rep[%u](=%u) == %u ?",
		    j, repl[j].bs_id, ios);
		if (repl[j].bs_id == ios) {
			/*
			 * Luckily, this code is only called by
			 * mds_repl_delrq() for directories.
			 *
			 * Make sure that the logic works for at least
			 * the following edge cases:
			 *
			 *  (1) There is only one item in the basic
			 *	array.
			 *  (2) There is only one item in the extra
			 *	array.
			 *  (3) The number of items is SL_DEF_REPLICAS.
			 *  (4) The number of items is SL_MAX_REPLICAS.
			 */
			if (flag == IOSV_LOOKUPF_DEL) {
				/*
				 * Compact the array if the IOS is not
				 * the last one.  The last one will be
				 * either overwritten or zeroed.  Note
				 * that we might move extra garbage at
				 * the end if the total number is less
				 * than SL_DEF_REPLICAS.
				 */
				if (i < SL_DEF_REPLICAS - 1)
					memmove(&repl[j], &repl[j + 1],
					    (SL_DEF_REPLICAS - j - 1) *
					    sizeof(*repl));
				/*
				 * If all items fit in the basic
				 * array, zero the last one and we are
				 * done.
				 */
				if (nr <= SL_DEF_REPLICAS) {
					repl[nr - 1].bs_id = 0;
					goto syncit;
				}
				/*
				 * Now we know we have more than
				 * SL_DEF_REPLICAS items.  However, if
				 * we are in the basic array, we have
				 * not read the extra array yet.  In
				 * this case, we should also move the
				 * first item of the extra array into
				 * the last slot of the basic array
				 * (overwrite).
				 */
				if (i < SL_DEF_REPLICAS) {
					rc = mds_inox_ensure_loaded(ih);
					if (rc)
						goto out;
					ix = ih->inoh_extras;
					repl[SL_DEF_REPLICAS - 1].bs_id =
					    ix->inox_repls[0].bs_id;
					repl = ix->inox_repls;
					j = 0;
				}
				/*
				 * Compact the extra array unless the
				 * IOS is the last one, which will be
				 * zeroed.
				 */
				if (i < SL_MAX_REPLICAS - 1)
					memmove(&repl[j], &repl[j + 1],
					    (SL_INOX_NREPLICAS - j - 1) *
					    sizeof(*repl));
				repl[nr - SL_DEF_REPLICAS - 1].bs_id = 0;
 syncit:
				ih->inoh_ino.ino_nrepls = nr - 1;
				rc = mds_inodes_odsync(vfsid, f,
				    mdslog_ino_repls);
				if (rc)
					goto out;
			}
			/* XXX EEXIST for IOSV_LOOKUPF_ADD? */
			rc = i;
			goto out;
		}
	}

	/*
	 * It doesn't exist; add it to the inode replica table if
	 * requested.
	 */
	if (flag == IOSV_LOOKUPF_ADD) {
		/* paranoid */
		psc_assert(i == nr);
		if (nr >= SL_DEF_REPLICAS) {
			/* careful with the nr == SL_DEF_REPLICAS case */
			rc = mds_inox_ensure_loaded(ih);
			if (rc)
				goto out;
			repl = ih->inoh_extras->inox_repls;
			j = i - SL_DEF_REPLICAS;
		} else {
			repl = ih->inoh_ino.ino_repls;
			j = i;
		}

		repl[j].bs_id = ios;
		DEBUG_INOH(PLL_DIAG, ih, buf, "add IOS(%u) at idx %d",
		    ios, i);

		ih->inoh_ino.ino_nrepls = nr + 1;
		rc = mds_inodes_odsync(vfsid, f, mdslog_ino_repls);
		if (!rc)
			rc = i;
	}
 out:
	INOH_URLOCK(ih, locked);
	return (rc);
}
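/*
 * Example (illustrative, not from this codebase): the deletion path
 * above compacts an ID list that is split between an inline array of
 * SL_DEF_REPLICAS slots and an overflow array.  The toy below
 * isolates that shuffle -- shift within the segment holding the
 * victim, pull the first overflow entry into the last inline slot
 * when both segments are populated, and zero the vacated tail.  All
 * names and sizes are invented.
 */
#include <stdio.h>
#include <string.h>

#define DEF_SLOTS	4		/* stand-in for SL_DEF_REPLICAS */
#define MAX_SLOTS	8		/* stand-in for SL_MAX_REPLICAS */

static unsigned basic[DEF_SLOTS];
static unsigned extra[MAX_SLOTS - DEF_SLOTS];

static void
two_tier_delete(unsigned i, unsigned nr)
{
	if (i < DEF_SLOTS) {
		memmove(&basic[i], &basic[i + 1],
		    (DEF_SLOTS - i - 1) * sizeof(basic[0]));
		if (nr <= DEF_SLOTS) {
			basic[nr - 1] = 0;
			return;
		}
		/* Promote the first overflow entry, then compact extra[]. */
		basic[DEF_SLOTS - 1] = extra[0];
		i = 0;
	} else
		i -= DEF_SLOTS;
	memmove(&extra[i], &extra[i + 1],
	    (MAX_SLOTS - DEF_SLOTS - i - 1) * sizeof(extra[0]));
	extra[nr - DEF_SLOTS - 1] = 0;
}

int
main(void)
{
	unsigned n;

	for (n = 0; n < DEF_SLOTS; n++)
		basic[n] = n + 1;		/* 1 2 3 4 */
	extra[0] = 5;
	extra[1] = 6;

	two_tier_delete(1, 6);			/* remove the ID "2" */
	for (n = 0; n < DEF_SLOTS; n++)
		printf("%u ", basic[n]);	/* 1 3 4 5 */
	printf("| %u %u\n", extra[0], extra[1]);	/* 6 0 */
	return (0);
}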