/*
 * afs_create
 *
 * Create a file named aname in the directory adp, or, when an entry with
 * that name is already present in the cached directory, hand back the
 * existing file (optionally truncating it).
 *
 * Parameters:
 *   adp     directory vcache (declared via OSI_VC_DECL / OSI_VC_CONVERT)
 *   aname   name of the entry to create
 *   attrs   requested attributes; va_type, va_mode and va_size are read
 *           here, and on success the resulting status is copied back
 *           with afs_CopyOutAttrs()
 *   flags / aexcl
 *           exclusive-create indicator (VEXCL bit on AFS_SGI64_ENV,
 *           enum vcexcl elsewhere)
 *   amode   access mode bits (VREAD / VWRITE) checked against an
 *           already-existing file
 *   avcp    out: vcache of the created or found file
 *   acred   caller credentials
 *
 * Error codes set directly in this function before the final
 * afs_CheckCode() translation:
 *   ENAMETOOLONG  strlen(aname) > AFSNAMEMAX
 *   EINVAL        afs_ENameOK() rejects the name, or va_type is a
 *                 block/char device, socket or FIFO
 *   EROFS         the directory has CRO set
 *   ENETDOWN      disconnected and not in disconnected read/write mode
 *   EEXIST        name already exists and an exclusive create was asked
 *   EACCES        existing file fails the read or write access check
 *   EISDIR        truncation requested on something that is not VREG
 *   ENOENT        no vcache could be obtained for the file
 *
 * The two AFSFetchStatus scratch buffers allocated at entry are released
 * at done2 on every exit path.
 *
 * question: does afs_create need to set CDirty in the adp or the avc?
 * I think we can get away without it, but I'm not sure.  Note that
 * afs_setattr is called in here for truncation.
 */
#ifdef AFS_SGI64_ENV
int
afs_create(OSI_VC_DECL(adp), char *aname, struct vattr *attrs, int flags,
           int amode, struct vcache **avcp, afs_ucred_t *acred)
#else /* AFS_SGI64_ENV */
int
afs_create(OSI_VC_DECL(adp), char *aname, struct vattr *attrs,
           enum vcexcl aexcl, int amode, struct vcache **avcp,
           afs_ucred_t *acred)
#endif /* AFS_SGI64_ENV */
{
    afs_int32 origCBs, origZaps, finalZaps;
    struct vrequest *treq = NULL;
    afs_int32 code;
    struct afs_conn *tc;
    struct VenusFid newFid;
    struct AFSStoreStatus InStatus;
    struct AFSFetchStatus *OutFidStatus, *OutDirStatus;
    struct AFSVolSync tsync;
    struct AFSCallBack CallBack;
    afs_int32 now;
    struct dcache *tdc;
    afs_size_t offset, len;
    struct server *hostp = 0;
    struct vcache *tvc;
    struct volume *volp = 0;
    struct afs_fakestat_state fakestate;
    struct rx_connection *rxconn;
    XSTATS_DECLS;
    OSI_VC_CONVERT(adp);

    AFS_STATCNT(afs_create);

    OutFidStatus = osi_AllocSmallSpace(sizeof(struct AFSFetchStatus));
    OutDirStatus = osi_AllocSmallSpace(sizeof(struct AFSFetchStatus));
    memset(&InStatus, 0, sizeof(InStatus));

    if ((code = afs_CreateReq(&treq, acred)))
        goto done2;

    afs_Trace3(afs_iclSetp, CM_TRACE_CREATE, ICL_TYPE_POINTER, adp,
               ICL_TYPE_STRING, aname, ICL_TYPE_INT32, amode);

    afs_InitFakeStat(&fakestate);

#ifdef AFS_SGI65_ENV
    /* If avcp is passed not null, it's the old reference to this file.
     * We can use this to avoid create races. For now, just decrement
     * the reference count on it.
     */
    if (*avcp) {
        AFS_RELE(AFSTOV(*avcp));
        *avcp = NULL;
    }
#endif

    if (strlen(aname) > AFSNAMEMAX) {
        code = ENAMETOOLONG;
        goto done3;
    }

    if (!afs_ENameOK(aname)) {
        code = EINVAL;
        goto done3;
    }
    switch (attrs->va_type) {
    case VBLK:
    case VCHR:
#if !defined(AFS_SUN5_ENV)
    case VSOCK:
#endif
    case VFIFO:
        /* We don't support special devices or FIFOs */
        code = EINVAL;
        goto done3;
    default:
        ;
    }
    AFS_DISCON_LOCK();

    code = afs_EvalFakeStat(&adp, &fakestate, treq);
    if (code)
        goto done;
  tagain:
    /* Retry point: re-entered whenever the directory state turns out to
     * be stale after the locks below have been taken. */
    code = afs_VerifyVCache(adp, treq);
    if (code)
        goto done;

    /** If the volume is read-only, return error without making an RPC to the
      * fileserver
      */
    if (adp->f.states & CRO) {
        code = EROFS;
        goto done;
    }

    if (AFS_IS_DISCONNECTED && !AFS_IS_DISCON_RW) {
        code = ENETDOWN;
        goto done;
    }

    tdc = afs_GetDCache(adp, (afs_size_t) 0, treq, &offset, &len, 1);
    ObtainWriteLock(&adp->lock, 135);
    if (tdc)
        ObtainSharedLock(&tdc->lock, 630);

    /*
     * Make sure that the data in the cache is current. We may have
     * received a callback while we were waiting for the write lock.
     */
    if (!(adp->f.states & CStatd)
        || (tdc && !hsame(adp->f.m.DataVersion, tdc->f.versionNo))) {
        ReleaseWriteLock(&adp->lock);
        if (tdc) {
            ReleaseSharedLock(&tdc->lock);
            afs_PutDCache(tdc);
        }
        goto tagain;
    }
    if (tdc) {
        /* see if file already exists.  If it does, we only set
         * the size attributes (to handle O_TRUNC) */
        code = afs_dir_Lookup(tdc, aname, &newFid.Fid); /* use dnlc first xxx */
        if (code == 0) {
            ReleaseSharedLock(&tdc->lock);
            afs_PutDCache(tdc);
            ReleaseWriteLock(&adp->lock);
#ifdef AFS_SGI64_ENV
            if (flags & VEXCL) {
#else
            if (aexcl != NONEXCL) {
#endif
                code = EEXIST;  /* file exists in excl mode open */
                goto done;
            }
            /* found the file, so use it */
            newFid.Cell = adp->f.fid.Cell;
            newFid.Fid.Volume = adp->f.fid.Fid.Volume;
            tvc = NULL;
            if (newFid.Fid.Unique == 0) {
                tvc = afs_LookupVCache(&newFid, treq, NULL, adp, aname);
            }
            if (!tvc)           /* lookup failed or wasn't called */
                tvc = afs_GetVCache(&newFid, treq, NULL, NULL);

            if (tvc) {
                /* if the thing exists, we need the right access to open it.
                 * we must check that here, since no other checks are
                 * made by the open system call */
                len = attrs->va_size;   /* only do the truncate */
                /*
                 * We used to check always for READ access before; the
                 * problem is that we will fail if the existing file
                 * has mode -w-w-w, which is wrong.
                 */
                if ((amode & VREAD)
                    && !afs_AccessOK(tvc, PRSFS_READ, treq, CHECK_MODE_BITS)) {
                    afs_PutVCache(tvc);
                    code = EACCES;
                    goto done;
                }
#if defined(AFS_DARWIN80_ENV)
                if ((amode & VWRITE) || VATTR_IS_ACTIVE(attrs, va_data_size))
#elif defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
                if ((amode & VWRITE) || (attrs->va_mask & AT_SIZE))
#else
                if ((amode & VWRITE) || len != 0xffffffff)
#endif
                {
                    /* needed for write access check */
                    tvc->f.parent.vnode = adp->f.fid.Fid.Vnode;
                    tvc->f.parent.unique = adp->f.fid.Fid.Unique;
                    /* need write mode for these guys */
                    if (!afs_AccessOK
                        (tvc, PRSFS_WRITE, treq, CHECK_MODE_BITS)) {
                        afs_PutVCache(tvc);
                        code = EACCES;
                        goto done;
                    }
                }
#if defined(AFS_DARWIN80_ENV)
                if (VATTR_IS_ACTIVE(attrs, va_data_size))
#elif defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
                if (attrs->va_mask & AT_SIZE)
#else
                if (len != 0xffffffff)
#endif
                {
                    if (vType(tvc) != VREG) {
                        afs_PutVCache(tvc);
                        code = EISDIR;
                        goto done;
                    }
                    /* do a truncate */
#if defined(AFS_DARWIN80_ENV)
                    VATTR_INIT(attrs);
                    VATTR_SET_SUPPORTED(attrs, va_data_size);
                    VATTR_SET_ACTIVE(attrs, va_data_size);
#elif defined(UKERNEL)
                    attrs->va_mask = ATTR_SIZE;
#elif defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
                    attrs->va_mask = AT_SIZE;
#else
                    VATTR_NULL(attrs);
#endif
                    attrs->va_size = len;
                    /* CCreating is set only for the duration of the
                     * afs_setattr() call below and cleared right after. */
                    ObtainWriteLock(&tvc->lock, 136);
                    tvc->f.states |= CCreating;
                    ReleaseWriteLock(&tvc->lock);
#if defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
#if defined(AFS_SGI64_ENV)
                    code =
                        afs_setattr(VNODE_TO_FIRST_BHV((vnode_t *) tvc),
                                    attrs, 0, acred);
#else
                    code = afs_setattr(tvc, attrs, 0, acred);
#endif /* AFS_SGI64_ENV */
#else /* SUN5 || SGI */
                    code = afs_setattr(tvc, attrs, acred);
#endif /* SUN5 || SGI */
                    ObtainWriteLock(&tvc->lock, 137);
                    tvc->f.states &= ~CCreating;
                    ReleaseWriteLock(&tvc->lock);
                    if (code) {
                        afs_PutVCache(tvc);
                        goto done;
                    }
                }
                *avcp = tvc;
            } else
                code = ENOENT;  /* shouldn't get here */
            /* make sure vrefCount bumped only if code == 0 */
            goto done;
        }
    }

    /* if we create the file, we don't do any access checks, since
     * that's how O_CREAT is supposed to work */
    if (adp->f.states & CForeign) {
        origCBs = afs_allCBs;
        origZaps = afs_allZaps;
    } else {
        origCBs = afs_evenCBs;  /* if changes, we don't really have a callback */
        origZaps = afs_evenZaps;        /* number of even numbered vnodes discarded */
    }
    InStatus.Mask = AFS_SETMODTIME | AFS_SETMODE | AFS_SETGROUP;
    InStatus.ClientModTime = osi_Time();
    InStatus.Group = (afs_int32) afs_cr_gid(acred);
    if (AFS_NFSXLATORREQ(acred)) {
        /*
         * XXX The following is mainly used to fix a bug in the HP-UX
         * nfs client where they create files with mode of 0 without
         * doing any setattr later on to fix it.  * XXX
         */
#if defined(AFS_AIX_ENV)
        if (attrs->va_mode != -1) {
#else
#if defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
        if (attrs->va_mask & AT_MODE) {
#else
        if (attrs->va_mode != ((unsigned short)-1)) {
#endif
#endif
            if (!attrs->va_mode)
                attrs->va_mode = 0x1b6; /* XXX default mode: rw-rw-rw XXX */
        }
    }

    if (!AFS_IS_DISCONNECTED) {
        /* If not disconnected, connect to the server.*/

        InStatus.UnixModeBits = attrs->va_mode & 0xffff; /* only care about protection bits */
        do {
            tc = afs_Conn(&adp->f.fid, treq, SHARED_LOCK, &rxconn);
            if (tc) {
                hostp = tc->srvr->server; /* remember for callback processing */
                now = osi_Time();
                XSTATS_START_TIME(AFS_STATS_FS_RPCIDX_CREATEFILE);
                RX_AFS_GUNLOCK();
                code =
                    RXAFS_CreateFile(rxconn, (struct AFSFid *)&adp->f.fid.Fid,
                                     aname, &InStatus, (struct AFSFid *)
                                     &newFid.Fid, OutFidStatus, OutDirStatus,
                                     &CallBack, &tsync);
                RX_AFS_GLOCK();
                XSTATS_END_TIME;
                /* The value returned by the RPC is offset by the time
                 * sampled just before the call was issued. */
                CallBack.ExpirationTime += now;
            } else
                code = -1;
        } while (afs_Analyze
                 (tc, rxconn, code, &adp->f.fid, treq, AFS_STATS_FS_RPCIDX_CREATEFILE,
                  SHARED_LOCK, NULL));

        if ((code == EEXIST || code == UAEEXIST) &&
#ifdef AFS_SGI64_ENV
            !(flags & VEXCL)
#else /* AFS_SGI64_ENV */
            aexcl == NONEXCL
#endif
            ) {
            /* if we get an EEXIST in nonexcl mode, just do a lookup */
            if (tdc) {
                ReleaseSharedLock(&tdc->lock);
                afs_PutDCache(tdc);
            }
            ReleaseWriteLock(&adp->lock);

            /* NOTE(review): when AFS_DARWIN_ENV is defined none of the
             * branches below is compiled, so code keeps the EEXIST value
             * from the RPC -- confirm this is intended. */
#if defined(AFS_SGI64_ENV)
            code = afs_lookup(VNODE_TO_FIRST_BHV((vnode_t *) adp), aname, avcp,
                              NULL, 0, NULL, acred);
#elif defined(AFS_SUN5_ENV) || defined(AFS_SGI_ENV)
            code = afs_lookup(adp, aname, avcp, NULL, 0, NULL, acred);
#elif defined(UKERNEL)
            code = afs_lookup(adp, aname, avcp, acred, 0);
#elif !defined(AFS_DARWIN_ENV)
            code = afs_lookup(adp, aname, avcp, acred);
#endif
            goto done;
        }

        if (code) {
            if (code < 0) {
                ObtainWriteLock(&afs_xcbhash, 488);
                afs_DequeueCallback(adp);
                adp->f.states &= ~CStatd;
                ReleaseWriteLock(&afs_xcbhash);
                osi_dnlc_purgedp(adp);
            }
            ReleaseWriteLock(&adp->lock);
            if (tdc) {
                ReleaseSharedLock(&tdc->lock);
                afs_PutDCache(tdc);
            }
            goto done;
        }

    } else {
        /* Generate a fake FID for disconnected mode. */
        newFid.Cell = adp->f.fid.Cell;
        newFid.Fid.Volume = adp->f.fid.Fid.Volume;
        afs_GenFakeFid(&newFid, VREG, 1);
    }                           /* if (!AFS_IS_DISCON_RW) */

    /* otherwise, we should see if we can make the change to the dir locally */
    if (tdc)
        UpgradeSToWLock(&tdc->lock, 631);
    if (AFS_IS_DISCON_RW || afs_LocalHero(adp, tdc, OutDirStatus, 1)) {
        /* we can do it locally */
        ObtainWriteLock(&afs_xdcache, 291);
        code = afs_dir_Create(tdc, aname, &newFid.Fid);
        ReleaseWriteLock(&afs_xdcache);
        if (code) {
            ZapDCE(tdc);
            DZap(tdc);
        }
    }
    if (tdc) {
        ReleaseWriteLock(&tdc->lock);
        afs_PutDCache(tdc);
    }
    if (AFS_IS_DISCON_RW)
        adp->f.m.LinkCount++;

    newFid.Cell = adp->f.fid.Cell;
    newFid.Fid.Volume = adp->f.fid.Fid.Volume;
    ReleaseWriteLock(&adp->lock);
    volp = afs_FindVolume(&newFid, READ_LOCK);

    /* New tricky optimistic callback handling algorithm for file creation works
     * as follows.  We create the file essentially with no locks set at all.  File
     * server may thus handle operations from others cache managers as well as from
     * this very own cache manager that reference the file in question before
     * we managed to create the cache entry.  However, if anyone else changes
     * any of the status information for a file, we'll see afs_evenCBs increase
     * (files always have even fids).  If someone on this workstation manages
     * to do something to the file, they'll end up having to create a cache
     * entry for the new file.  Either we'll find it once we've got the afs_xvcache
     * lock set, or it was also *deleted* the vnode before we got there, in which case
     * we will find evenZaps has changed, too.  Thus, we only assume we have the right
     * status information if no callbacks or vnode removals have occurred to even
     * numbered files from the time the call started until the time that we got the xvcache
     * lock set.  Of course, this also assumes that any call that modifies a file first
     * gets a write lock on the file's vnode, but if that weren't true, the whole cache manager
     * would fail, since no call would be able to update the local vnode status after modifying
     * a file on a file server. */
    ObtainWriteLock(&afs_xvcache, 138);
    if (adp->f.states & CForeign)
        finalZaps = afs_allZaps;        /* do this before calling newvcache */
    else
        finalZaps = afs_evenZaps;       /* do this before calling newvcache */
    /* don't need to call RemoveVCB, since only path leaving a callback is the
     * one where we pass through afs_NewVCache.  Can't have queued a VCB unless
     * we created and freed an entry between file creation time and here, and the
     * freeing of the vnode will change evenZaps.  Don't need to update the VLRU
     * queue, since the find will only succeed in the event of a create race, and
     * then the vcache will be at the front of the VLRU queue anyway...  */
    if (!(tvc = afs_FindVCache(&newFid, 0, DO_STATS))) {
        tvc = afs_NewVCache(&newFid, hostp);
        if (tvc) {
            int finalCBs;
            ObtainWriteLock(&tvc->lock, 139);

            ObtainWriteLock(&afs_xcbhash, 489);
            finalCBs = afs_evenCBs;
            /* add the callback in */
            if (adp->f.states & CForeign) {
                tvc->f.states |= CForeign;
                finalCBs = afs_allCBs;
            }
            if (origCBs == finalCBs && origZaps == finalZaps) {
                tvc->f.states |= CStatd;        /* we've fake entire thing, so don't stat */
                tvc->f.states &= ~CBulkFetching;
                if (!AFS_IS_DISCON_RW) {
                    tvc->cbExpires = CallBack.ExpirationTime;
                    afs_QueueCallback(tvc, CBHash(CallBack.ExpirationTime), volp);
                }
            } else {
                afs_DequeueCallback(tvc);
                tvc->f.states &= ~(CStatd | CUnique);
                tvc->callback = 0;
                if (tvc->f.fid.Fid.Vnode & 1 || (vType(tvc) == VDIR))
                    osi_dnlc_purgedp(tvc);
            }
            ReleaseWriteLock(&afs_xcbhash);
            if (AFS_IS_DISCON_RW) {
                afs_DisconAddDirty(tvc, VDisconCreate, 0);
                afs_GenDisconStatus(adp, tvc, &newFid, attrs, treq, VREG);
            } else {
                afs_ProcessFS(tvc, OutFidStatus, treq);
            }

            tvc->f.parent.vnode = adp->f.fid.Fid.Vnode;
            tvc->f.parent.unique = adp->f.fid.Fid.Unique;
#if !defined(UKERNEL)
            if (volp && (volp->states & VPartVisible))
                tvc->f.states |= CPartVisible;
#endif
            ReleaseWriteLock(&tvc->lock);
            *avcp = tvc;
            code = 0;
        } else
            code = ENOENT;
    } else {
        /* otherwise cache entry already exists, someone else must
         * have created it.  Comments used to say:  "don't need write
         * lock to *clear* these flags" but we should do it anyway.
         * Code used to clear stat bit and callback, but I don't see
         * the point -- we didn't have a create race, somebody else just
         * snuck into NewVCache before we got here, probably a racing
         * lookup.
         */
        *avcp = tvc;
        code = 0;
    }
    ReleaseWriteLock(&afs_xvcache);

  done:
    AFS_DISCON_UNLOCK();

  done3:
    if (volp)
        afs_PutVolume(volp, READ_LOCK);

    if (code == 0) {
        if (afs_mariner)
            afs_AddMarinerName(aname, *avcp);
        /* return the new status in vattr */
        afs_CopyOutAttrs(*avcp, attrs);
        if (afs_mariner)
            afs_MarinerLog("store$Creating", *avcp);
    }

    afs_PutFakeStat(&fakestate);
    code = afs_CheckCode(code, treq, 20);
    afs_DestroyReq(treq);

  done2:
    osi_FreeSmallSpace(OutFidStatus);
    osi_FreeSmallSpace(OutDirStatus);
    return code;
}


/*
 * Check to see if we can track the change locally: requires that
 * we have sufficiently recent info in data cache.  If so, we
 * know the new DataVersion number, and place it correctly in both the
 * data and stat cache entries.  This routine returns 1 if we should
 * do the operation locally, and 0 otherwise.
 *
 * This routine must be called with the stat cache entry write-locked,
 * and dcache entry write-locked.
 *
 * Parameters:
 *   avc    vcache of the directory being modified
 *   adc    dcache entry for that directory, or NULL; with NULL the
 *          function always returns 0
 *   astat  status used as the source of the new DataVersion, Length
 *          and ClientModTime
 *   aincr  amount by which the cached data version is expected to have
 *          advanced
 *
 * When 0 is returned and adc is non-NULL, the dcache entry has been
 * discarded via ZapDCE()/DZap().
 */
int
afs_LocalHero(struct vcache *avc, struct dcache *adc,
              AFSFetchStatus * astat, int aincr)
{
    afs_int32 ok;
    afs_hyper_t avers;

    AFS_STATCNT(afs_LocalHero);
    hset64(avers, astat->dataVersionHigh, astat->DataVersion);
    /* avers *is* the version number now, no matter what */

    if (adc) {
        /* does what's in the dcache *now* match what's in the vcache *now*,
         * and do we have a valid callback? if not, our local copy is not "ok" */
        ok = (hsame(avc->f.m.DataVersion, adc->f.versionNo) && avc->callback
              && (avc->f.states & CStatd) && avc->cbExpires >= osi_Time());
    } else {
        ok = 0;
    }
    if (ok) {
        /* check that the DV on the server is what we expect it to be */
        afs_hyper_t newDV;
        hset(newDV, adc->f.versionNo);
        hadd32(newDV, aincr);
        if (!hsame(avers, newDV)) {
            ok = 0;
        }
    }
#if defined(AFS_SGI_ENV)
    osi_Assert(avc->v.v_type == VDIR);
#endif
    /* The bulk status code used the length as a sequence number.  */
    /* Don't update the vcache entry unless the stats are current. */
    if (avc->f.states & CStatd) {
        hset(avc->f.m.DataVersion, avers);
#ifdef AFS_64BIT_CLIENT
        FillInt64(avc->f.m.Length, astat->Length_hi, astat->Length);
#else /* AFS_64BIT_CLIENT */
        avc->f.m.Length = astat->Length;
#endif /* AFS_64BIT_CLIENT */
        avc->f.m.Date = astat->ClientModTime;
    }
    if (ok) {
        /* we've been tracking things correctly */
        adc->dflags |= DFEntryMod;
        adc->f.versionNo = avers;
        return 1;
    } else {
        if (adc) {
            ZapDCE(adc);
            DZap(adc);
        }
        if (avc->f.states & CStatd) {
            osi_dnlc_purgedp(avc);
        }
        return 0;
    }
}
/*
 * map_fd_funneled()
 *
 * Map `size` bytes of the regular file open on descriptor `fd`, starting at
 * the page-aligned file `offset`, into the current task's address map.
 *
 *   findspace != 0: the kernel picks the address and copies it out to the
 *                   user location `va`.
 *   findspace == 0: the page-aligned destination address is copied in from
 *                   the user location `va`, and the freshly created mapping
 *                   is moved on top of it.
 *
 * Returns 0 / KERN_SUCCESS when the mapping is established (or when `size`
 * is zero), the fp_lookup() / vnode_getwithref() error if either of those
 * fails, or a KERN_* code describing the failure.
 *
 * Cleanup is layered: everything that fails after the vnode reference has
 * been taken leaves through `put_vnode`, which falls through into `drop_fp`.
 */
kern_return_t
map_fd_funneled(
	int			fd,
	vm_object_offset_t	offset,
	vm_offset_t		*va,
	boolean_t		findspace,
	vm_size_t		size)
{
	proc_t			caller = current_proc();
	struct fileproc		*fp;
	struct vnode		*vp;
	struct vnode_attr	atime_attr;
	vm_map_t		task_map;
	vm_offset_t		mapped_at = 0;
	vm_size_t		mapped_len;
	void			*pager;
	kern_return_t		kr;
	int			err = 0;

	/* Resolve the descriptor; nothing to release if this fails. */
	err = fp_lookup(caller, fd, &fp, 0);
	if (err)
		return (err);

	/* Only vnode-backed descriptors opened for reading qualify. */
	if (fp->f_fglob->fg_type != DTYPE_VNODE) {
		err = KERN_INVALID_ARGUMENT;
		goto drop_fp;
	}
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		err = KERN_PROTECTION_FAILURE;
		goto drop_fp;
	}

	vp = (struct vnode *)fp->f_fglob->fg_data;
	err = vnode_getwithref(vp);
	if (err != 0)
		goto drop_fp;

	/* From here on a vnode iocount is held: leave via put_vnode. */
	if (vp->v_type != VREG) {
		err = KERN_INVALID_ARGUMENT;
		goto put_vnode;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* POSIX: mapping a file updates its access time (unless noatime). */
	if ((vnode_vfsvisflags(vp) & MNT_NOATIME) == 0) {
		VATTR_INIT(&atime_attr);
		nanotime(&atime_attr.va_access_time);
		VATTR_SET_ACTIVE(&atime_attr, va_access_time);
		vnode_setattr(vp, &atime_attr, vfs_context_current());
	}

	if (offset & PAGE_MASK_64) {
		printf("map_fd: file offset not page aligned(%d : %s)\n",
		       caller->p_pid, caller->p_comm);
		err = KERN_INVALID_ARGUMENT;
		goto put_vnode;
	}
	mapped_len = round_page(size);

	/* A zero-length request succeeds without mapping anything. */
	if (size == 0) {
		err = KERN_SUCCESS;
		goto put_vnode;
	}

	/* Map the file's pager anywhere in the current map. */
	pager = (void *)ubc_getpager(vp);
	if (pager == NULL) {
		err = KERN_FAILURE;
		goto put_vnode;
	}

	task_map = current_map();
	kr = vm_map_64(task_map,
		       &mapped_at, mapped_len, (vm_offset_t)0,
		       VM_FLAGS_ANYWHERE, pager, offset, TRUE,
		       VM_PROT_DEFAULT, VM_PROT_ALL,
		       VM_INHERIT_DEFAULT);
	if (kr != KERN_SUCCESS) {
		err = kr;
		goto put_vnode;
	}

	if (findspace) {
		/* Report the kernel-chosen address back to the caller. */
		if (copyout(&mapped_at, CAST_USER_ADDR_T(va), sizeof (mapped_at))) {
			(void) vm_map_remove(task_map,
					     vm_map_trunc_page(mapped_at),
					     vm_map_round_page(mapped_at + mapped_len),
					     VM_MAP_NO_FLAGS);
			err = KERN_INVALID_ADDRESS;
			goto put_vnode;
		}
	} else {
		vm_offset_t	dst_addr;
		vm_map_copy_t	tmp;

		/* Fetch the caller's destination; it must be page aligned. */
		if (copyin(CAST_USER_ADDR_T(va), &dst_addr, sizeof (dst_addr)) ||
		    trunc_page_32(dst_addr) != dst_addr) {
			(void) vm_map_remove(task_map,
					     mapped_at, mapped_at + mapped_len,
					     VM_MAP_NO_FLAGS);
			err = KERN_INVALID_ADDRESS;
			goto put_vnode;
		}

		/* Lift the temporary mapping out of the map ... */
		kr = vm_map_copyin(task_map, (vm_map_address_t)mapped_at,
				   (vm_map_size_t)mapped_len, TRUE, &tmp);
		if (kr != KERN_SUCCESS) {
			(void) vm_map_remove(task_map,
					     vm_map_trunc_page(mapped_at),
					     vm_map_round_page(mapped_at + mapped_len),
					     VM_MAP_NO_FLAGS);
			err = kr;
			goto put_vnode;
		}

		/* ... and drop it over the requested destination. */
		kr = vm_map_copy_overwrite(task_map,
					   (vm_map_address_t)dst_addr, tmp, FALSE);
		if (kr != KERN_SUCCESS) {
			vm_map_copy_discard(tmp);
			err = kr;
			goto put_vnode;
		}
	}

	ubc_setthreadcred(vp, current_proc(), current_thread());
	(void)ubc_map(vp, (PROT_READ | PROT_EXEC));
	err = 0;

put_vnode:
	(void)vnode_put(vp);
drop_fp:
	fp_drop(caller, fd, fp, 0);
	return (err);
}
/*
 * mmap()
 *
 * BSD mmap system call: map anonymous memory (MAP_ANON) or the file open
 * on uap->fd into the current task's address map.
 *
 * Parameters:
 *   p       calling process
 *   uap     user arguments: addr, len, prot, flags, fd, pos
 *   retval  out: on success, the mapped address plus the sub-page offset
 *           of uap->pos
 *
 * Returns 0 on success or an errno value.  Errors set directly in this
 * function:
 *   EINVAL  pos + len exceeds the representable range; MAP_FIXED address
 *           not congruent with pos modulo the page size; MAP_ANON with fd
 *           bits outside VM_FLAGS_ALIAS_MASK | VM_FLAGS_PURGABLE; fd is
 *           neither POSIX shared memory nor a vnode; vnode is neither
 *           VREG nor VCHR
 *   ENODEV  vnode is VCHR or VSTR
 *   EACCES  requested protection not allowed by the way the file was opened
 *   ENOMEM  no pager for the vnode
 * A Mach failure from vm_map_enter_mem_object() is translated at `out`.
 *
 * DTYPE_PSXSHM descriptors are handed off to pshm_mmap().
 *
 * Exit labels: `out` converts `result` (kern_return_t) into an errno;
 * `bad` drops the fileproc reference if one was taken and emits the
 * debug trace.  Every path that jumps to either label has already
 * released its vnode reference.
 *
 * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct
 * XXX usage is PROT_* from an interface perspective.  Thus the values of
 * XXX VM_PROT_* and PROT_* need to correspond.
 */
int
mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
{
	/*
	 *	Map in special device (must be SHARED) or file
	 */
	struct fileproc *fp;
	register struct		vnode *vp;
	int			flags;
	int			prot, file_prot;
	int			err=0;
	vm_map_t		user_map;
	kern_return_t		result;
	mach_vm_offset_t	user_addr;
	mach_vm_size_t		user_size;
	vm_object_offset_t	pageoff;
	vm_object_offset_t	file_pos;
	int			alloc_flags=0;
	boolean_t		docow;
	vm_prot_t		maxprot;
	void 			*handle;
	vm_pager_t		pager;
	int 			mapanon=0;
	int 			fpref=0;
	int error =0;
	int fd = uap->fd;

	user_addr = (mach_vm_offset_t)uap->addr;
	user_size = (mach_vm_size_t) uap->len;

	AUDIT_ARG(addr, user_addr);
	AUDIT_ARG(len, user_size);
	AUDIT_ARG(fd, uap->fd);

	prot = (uap->prot & VM_PROT_ALL);
#if 3777787
	/*
	 * Since the hardware currently does not support writing without
	 * read-before-write, or execution-without-read, if the request is
	 * for write or execute access, we must imply read access as well;
	 * otherwise programs expecting this to work will fail to operate.
	 */
	if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
		prot |= VM_PROT_READ;
#endif	/* radar 3777787 */

	flags = uap->flags;
	vp = NULLVP;

	/*
	 * The vm code does not have prototypes & the compiler doesn't do
	 * the right thing when you cast a 64bit value and pass it in a
	 * function call. So here it is.
	 */
	file_pos = (vm_object_offset_t)uap->pos;


	/* make sure mapping fits into numeric range etc */
	if (file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (file_pos & PAGE_MASK);
	file_pos -= (vm_object_offset_t)pageoff;


	/* Adjust size for rounding (on both ends). */
	user_size += pageoff;			/* low end... */
	user_size = mach_vm_round_page(user_size);	/* hi end */


	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		user_addr -= pageoff;
		if (user_addr & PAGE_MASK)
			return (EINVAL);
	}
#ifdef notyet
	/* DO not have apis to get this info, need to wait till then*/
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr < mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ))
		addr = mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ);

#endif

	alloc_flags = 0;

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.  Use positive fds as the alias
		 * value for memory tracking.
		 */
		if (fd != -1) {
			/*
			 * Use "fd" to pass (some) Mach VM allocation flags,
			 * (see the VM_FLAGS_* definitions).
			 */
			alloc_flags = fd & (VM_FLAGS_ALIAS_MASK |
					    VM_FLAGS_PURGABLE);
			if (alloc_flags != fd) {
				/* reject if there are any extra flags */
				return EINVAL;
			}
		}

		handle = NULL;
		maxprot = VM_PROT_ALL;
		file_pos = 0;
		mapanon = 1;
	} else {
		struct vnode_attr va;
		vfs_context_t ctx = vfs_context_current();

		/*
		 * Mapping file, get fp for validation. Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		err = fp_lookup(p, fd, &fp, 0);
		if (err)
			return(err);
		/* From here on `bad` must drop the fileproc reference. */
		fpref = 1;
		if(fp->f_fglob->fg_type == DTYPE_PSXSHM) {
			/* Pass the already-adjusted values on to pshm_mmap(). */
			uap->addr = (user_addr_t)user_addr;
			uap->len = (user_size_t)user_size;
			uap->prot = prot;
			uap->flags = flags;
			uap->pos = file_pos;
			error = pshm_mmap(p, uap, retval, fp, (off_t)pageoff);
			goto bad;
		}

		if (fp->f_fglob->fg_type != DTYPE_VNODE) {
			error = EINVAL;
			goto bad;
		}
		vp = (struct vnode *)fp->f_fglob->fg_data;
		error = vnode_getwithref(vp);
		if(error != 0)
			goto bad;

		if (vp->v_type != VREG && vp->v_type != VCHR) {
			(void)vnode_put(vp);
			error = EINVAL;
			goto bad;
		}

		AUDIT_ARG(vnpath, vp, ARG_VNODE1);

		/*
		 * POSIX: mmap needs to update access time for mapped files
		 */
		if ((vnode_vfsvisflags(vp) & MNT_NOATIME) == 0) {
			VATTR_INIT(&va);
			nanotime(&va.va_access_time);
			VATTR_SET_ACTIVE(&va, va_access_time);
			vnode_setattr(vp, &va, ctx);
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR || vp->v_type == VSTR) {
			(void)vnode_put(vp);
			error = ENODEV;
			goto bad;
		} else {
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination? What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_fglob->fg_flag & FREAD)
				maxprot |= VM_PROT_READ;
			else if (prot & PROT_READ) {
				(void)vnode_put(vp);
				error = EACCES;
				goto bad;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.
			 */

			if ((flags & MAP_SHARED) != 0) {
				if ((fp->f_fglob->fg_flag & FWRITE) != 0) {
					/*
					 * check for write access
					 *
					 * Note that we already made this check when granting FWRITE
					 * against the file, so it seems redundant here.
					 */
					error = vnode_authorize(vp, NULL, KAUTH_VNODE_CHECKIMMUTABLE, ctx);

					/* if not granted for any reason, but we wanted it, bad */
					if ((prot & PROT_WRITE) && (error != 0)) {
						vnode_put(vp);
						goto bad;
					}

					/* if writable, remember */
					if (error == 0)
						maxprot |= VM_PROT_WRITE;

				} else if ((prot & PROT_WRITE) != 0) {
					(void)vnode_put(vp);
					error = EACCES;
					goto bad;
				}
			} else
				maxprot |= VM_PROT_WRITE;

			handle = (void *)vp;
#if CONFIG_MACF
			error = mac_file_check_mmap(vfs_context_ucred(ctx),
			    fp->f_fglob, prot, flags, &maxprot);
			if (error) {
				(void)vnode_put(vp);
				goto bad;
			}
#endif /* MAC */
		}
	}

	/* Nothing to map: report success without touching *retval. */
	if (user_size == 0)  {
		if (!mapanon)
			(void)vnode_put(vp);
		error = 0;
		goto bad;
	}

	/*
	 *	We bend a little - round the start and end addresses
	 *	to the nearest page boundary.
	 */
	user_size = mach_vm_round_page(user_size);

	if (file_pos & PAGE_MASK_64) {
		if (!mapanon)
			(void)vnode_put(vp);
		error = EINVAL;
		goto bad;
	}

	user_map = current_map();

	if ((flags & MAP_FIXED) == 0) {
		alloc_flags |= VM_FLAGS_ANYWHERE;
		user_addr = mach_vm_round_page(user_addr);
	} else {
		if (user_addr != mach_vm_trunc_page(user_addr)) {
		        if (!mapanon)
			        (void)vnode_put(vp);
			error = EINVAL;
			goto bad;
		}
		/*
		 * mmap(MAP_FIXED) will replace any existing mappings in the
		 * specified range, if the new mapping is successful.
		 * If we just deallocate the specified address range here,
		 * another thread might jump in and allocate memory in that
		 * range before we get a chance to establish the new mapping,
		 * and we won't have a chance to restore the old mappings.
		 * So we use VM_FLAGS_OVERWRITE to let Mach VM know that it
		 * has to deallocate the existing mappings and establish the
		 * new ones atomically.
		 */
		alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
	}

	if (flags & MAP_NOCACHE)
		alloc_flags |= VM_FLAGS_NO_CACHE;

	/*
	 * Lookup/allocate object.
	 */
	if (handle == NULL) {
		pager = NULL;
#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
		if (prot & VM_PROT_READ)
			prot |= VM_PROT_EXECUTE;
		if (maxprot & VM_PROT_READ)
			maxprot |= VM_PROT_EXECUTE;
#endif
#endif

#if 3777787
		if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			prot |= VM_PROT_READ;
		if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			maxprot |= VM_PROT_READ;
#endif	/* radar 3777787 */

		result = vm_map_enter_mem_object(user_map,
						 &user_addr, user_size,
						 0, alloc_flags,
						 IPC_PORT_NULL, 0, FALSE,
						 prot, maxprot,
						 (flags & MAP_SHARED) ?
						 VM_INHERIT_SHARE :
						 VM_INHERIT_DEFAULT);
		if (result != KERN_SUCCESS)
				goto out;
	} else {
		pager = (vm_pager_t)ubc_getpager(vp);

		if (pager == NULL) {
			(void)vnode_put(vp);
			error = ENOMEM;
			goto bad;
		}

		/*
		 *  Set credentials:
		 *	FIXME: if we're writing the file we need a way to
		 *      ensure that someone doesn't replace our R/W creds
		 * 	with ones that only work for read.
		 */

		ubc_setthreadcred(vp, p, current_thread());
		/* Copy-on-write only when neither MAP_ANON nor MAP_SHARED is set. */
		docow = FALSE;
		if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
			docow = TRUE;
		}

#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
		if (prot & VM_PROT_READ)
			prot |= VM_PROT_EXECUTE;
		if (maxprot & VM_PROT_READ)
			maxprot |= VM_PROT_EXECUTE;
#endif
#endif /* notyet */

#if 3777787
		if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			prot |= VM_PROT_READ;
		if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			maxprot |= VM_PROT_READ;
#endif	/* radar 3777787 */

		result = vm_map_enter_mem_object(user_map,
						 &user_addr, user_size,
						 0, alloc_flags,
						 (ipc_port_t)pager, file_pos,
						 docow, prot, maxprot,
						 (flags & MAP_SHARED) ?
						 VM_INHERIT_SHARE :
						 VM_INHERIT_DEFAULT);

		if (result != KERN_SUCCESS)  {
				(void)vnode_put(vp);
				goto out;
		}

		file_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
		if (docow) {
			/* private mapping: won't write to the file */
			file_prot &= ~PROT_WRITE;
		}
		(void) ubc_map(vp, file_prot);
	}

	if (!mapanon)
		(void)vnode_put(vp);

out:
	/* Translate the Mach result into an errno for the BSD caller. */
	switch (result) {
	case KERN_SUCCESS:
		*retval = user_addr + pageoff;
		error = 0;
		break;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		error =  ENOMEM;
		break;
	case KERN_PROTECTION_FAILURE:
		error =  EACCES;
		break;
	default:
		error =  EINVAL;
		break;
	}
bad:
	if (fpref)
		fp_drop(p, fd, fp, 0);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE), fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0);
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE), (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32),
			      (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0);

	return(error);
}
/*
 * shared_region_map_np()
 *
 * This system call is intended for dyld.
 *
 * dyld uses this to map a shared cache file into a shared region.
 * This is usually done only the first time a shared cache is needed.
 * Subsequent processes will just use the populated shared region without
 * requiring any further setup.
 *
 * Parameters:
 *   p        calling process
 *   uap      user arguments: fd (shared cache file), count (number of
 *            mappings) and mappings (user address of the mapping array)
 *   retvalp  unused
 *
 * Returns 0 on success, or:
 *   EINVAL  fd is not a vnode, the vnode is not a regular file, it has no
 *           memory object, more than SFM_MAX_STACK mappings were requested,
 *           the task has no shared region, or the VM layer rejected the
 *           mappings
 *   EPERM   fd is not open for reading, the file is not on the process's
 *           root volume, it is not owned by uid 0, or the VM layer
 *           reported a protection failure
 *   EFAULT  the VM layer reported an invalid address
 *   ENOMEM  the VM layer ran out of space
 *   other   errors passed through from fp_lookup(), vnode_getwithref(),
 *           vnode_getattr(), vnode_size() or copyin()
 *
 * A request with count == 0 succeeds without mapping anything.
 *
 * Every exit goes through `done`, which releases the vnode iocount, the
 * fileproc reference and the shared region reference if they were taken.
 */
int
shared_region_map_np(
	struct proc				*p,
	struct shared_region_map_np_args	*uap,
	__unused int				*retvalp)
{
	int				error;
	kern_return_t			kr;
	int				fd;
	struct fileproc			*fp;
	struct vnode			*vp, *root_vp;
	struct vnode_attr		va;
	off_t				fs;
	memory_object_size_t		file_size;
	user_addr_t			user_mappings;
	struct shared_file_mapping_np	*mappings;
#define SFM_MAX_STACK	8
	struct shared_file_mapping_np	stack_mappings[SFM_MAX_STACK];
	unsigned int			mappings_count;
	vm_size_t			mappings_size;
	memory_object_control_t		file_control;
	struct vm_shared_region		*shared_region;

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map\n",
		 current_thread(), p->p_pid, p->p_comm));

	shared_region = NULL;
	mappings_count = 0;
	mappings_size = 0;
	mappings = NULL;
	fp = NULL;
	vp = NULL;

	/* get file descriptor for shared region cache file */
	fd = uap->fd;

	/* get file structure from file descriptor */
	error = fp_lookup(p, fd, &fp, 0);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map: "
			 "fd=%d lookup failed (error=%d)\n",
			 current_thread(), p->p_pid, p->p_comm, fd, error));
		goto done;
	}

	/* make sure we're attempting to map a vnode */
	if (fp->f_fglob->fg_type != DTYPE_VNODE) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map: "
			 "fd=%d not a vnode (type=%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 fd, fp->f_fglob->fg_type));
		error = EINVAL;
		goto done;
	}

	/* we need at least read permission on the file */
	if (! (fp->f_fglob->fg_flag & FREAD)) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map: "
			 "fd=%d not readable\n",
			 current_thread(), p->p_pid, p->p_comm, fd));
		error = EPERM;
		goto done;
	}

	/*
	 * get vnode from file structure; "vp" is only set once the
	 * reference has actually been obtained, so "done" never puts a
	 * vnode we do not hold.
	 */
	error = vnode_getwithref((vnode_t) fp->f_fglob->fg_data);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map: "
			 "fd=%d getwithref failed (error=%d)\n",
			 current_thread(), p->p_pid, p->p_comm, fd, error));
		goto done;
	}
	vp = (struct vnode *) fp->f_fglob->fg_data;

	/* make sure the vnode is a regular file */
	if (vp->v_type != VREG) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "not a file (type=%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, vp->v_type));
		error = EINVAL;
		goto done;
	}

	/* make sure vnode is on the process's root volume */
	root_vp = p->p_fd->fd_rdir;
	if (root_vp == NULL) {
		root_vp = rootvnode;
	}
	if (vp->v_mount != root_vp->v_mount) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "not on process's root volume\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name));
		error = EPERM;
		goto done;
	}

	/* make sure vnode is owned by "root" */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	error = vnode_getattr(vp, &va, vfs_context_current());
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "vnode_getattr(%p) failed (error=%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, vp, error));
		goto done;
	}
	if (va.va_uid != 0) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "owned by uid=%d instead of 0\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, va.va_uid));
		error = EPERM;
		goto done;
	}

	/* get vnode size */
	error = vnode_size(vp, &fs, vfs_context_current());
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "vnode_size(%p) failed (error=%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, vp, error));
		goto done;
	}
	file_size = fs;

	/* get the file's memory object handle */
	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "no memory object\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name));
		error = EINVAL;
		goto done;
	}

	/* get the list of mappings the caller wants us to establish */
	mappings_count = uap->count;	/* number of mappings */
	mappings_size = (vm_size_t) (mappings_count * sizeof (mappings[0]));
	if (mappings_count == 0) {
		SHARED_REGION_TRACE_INFO(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "no mappings\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name));
		error = 0;	/* no mappings: we're done ! */
		goto done;
	} else if (mappings_count <= SFM_MAX_STACK) {
		/* small enough to stage in the on-stack array */
		mappings = &stack_mappings[0];
	} else {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "too many mappings (%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, mappings_count));
		error = EINVAL;
		goto done;
	}

	user_mappings = uap->mappings;	/* the mappings, in user space */
	error = copyin(user_mappings,
		       mappings,
		       mappings_size);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "copyin(0x%llx, %d) failed (error=%d)\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, (uint64_t)user_mappings, mappings_count, error));
		goto done;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_get(current_task());
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "no shared region\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name));
		/*
		 * "error" is still 0 from the successful copyin() above;
		 * without an explicit failure code the caller would be told
		 * the mapping succeeded although nothing was mapped.
		 */
		error = EINVAL;
		goto done;
	}

	/* map the file into that shared region's submap */
	kr = vm_shared_region_map_file(shared_region,
				       mappings_count,
				       mappings,
				       file_control,
				       file_size,
				       (void *) p->p_fd->fd_rdir);
	if (kr != KERN_SUCCESS) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(%p:'%s'): "
			 "vm_shared_region_map_file() failed kr=0x%x\n",
			 current_thread(), p->p_pid, p->p_comm,
			 vp, vp->v_name, kr));
		/* translate the Mach result into an errno */
		switch (kr) {
		case KERN_INVALID_ADDRESS:
			error = EFAULT;
			break;
		case KERN_PROTECTION_FAILURE:
			error = EPERM;
			break;
		case KERN_NO_SPACE:
			error = ENOMEM;
			break;
		case KERN_FAILURE:
		case KERN_INVALID_ARGUMENT:
		default:
			error = EINVAL;
			break;
		}
		goto done;
	}

	/*
	 * The mapping was successful.  Let the buffer cache know
	 * that we've mapped that file with these protections.  This
	 * prevents the vnode from getting recycled while it's mapped.
	 */
	(void) ubc_map(vp, VM_PROT_READ);
	error = 0;

	/* update the vnode's access time */
	if (! (vnode_vfsvisflags(vp) & MNT_NOATIME)) {
		VATTR_INIT(&va);
		nanotime(&va.va_access_time);
		VATTR_SET_ACTIVE(&va, va_access_time);
		vnode_setattr(vp, &va, vfs_context_current());
	}

	if (p->p_flag & P_NOSHLIB) {
		/* signal that this process is now using split libraries */
		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), (UInt32 *)&p->p_flag);
	}

done:
	if (vp != NULL) {
		/*
		 * release the vnode...
		 * ubc_map() still holds it for us in the non-error case
		 */
		(void) vnode_put(vp);
		vp = NULL;
	}
	if (fp != NULL) {
		/* release the file descriptor */
		fp_drop(p, fd, fp, 0);
		fp = NULL;
	}

	if (shared_region != NULL) {
		vm_shared_region_deallocate(shared_region);
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] <- map\n",
		 current_thread(), p->p_pid, p->p_comm));

	return error;
}