static int spigen_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { device_t dev = cdev->si_drv1; struct spigen_softc *sc = device_get_softc(dev); vm_page_t *m; size_t n, pages; if (size == 0 || (nprot & (PROT_EXEC | PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)) return (EINVAL); size = roundup2(size, PAGE_SIZE); pages = size / PAGE_SIZE; mtx_lock(&sc->sc_mtx); if (sc->sc_mmap_buffer != NULL) { mtx_unlock(&sc->sc_mtx); return (EBUSY); } else if (size > sc->sc_command_length_max + sc->sc_data_length_max) { mtx_unlock(&sc->sc_mtx); return (E2BIG); } sc->sc_mmap_buffer_size = size; *offset = 0; sc->sc_mmap_buffer = *object = vm_pager_allocate(OBJT_PHYS, 0, size, nprot, *offset, curthread->td_ucred); m = malloc(sizeof(*m) * pages, M_TEMP, M_WAITOK); VM_OBJECT_WLOCK(*object); vm_object_reference_locked(*object); // kernel and userland both for (n = 0; n < pages; n++) { m[n] = vm_page_grab(*object, n, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_WIRED); m[n]->valid = VM_PAGE_BITS_ALL; } VM_OBJECT_WUNLOCK(*object); sc->sc_mmap_kvaddr = kva_alloc(size); pmap_qenter(sc->sc_mmap_kvaddr, m, pages); free(m, M_TEMP); mtx_unlock(&sc->sc_mtx); if (*object == NULL) return (EINVAL); return (0); }
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. */ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
static int link_elf_obj_load_file(const char *filename, linker_file_t * result) { struct nlookupdata nd; struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; char *pathname; struct vnode *vp; Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; int resid; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; /* XXX Hack for firmware loading where p == NULL */ if (p == NULL) { p = &proc0; } KKASSERT(p != NULL); if (p->p_ucred == NULL) { kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem" " this early\n", filename); return ENOENT; } shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; pathname = linker_search_path(filename); if (pathname == NULL) return ENOENT; error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP); if (error == 0) error = vn_open(&nd, NULL, FREAD, 0); kfree(pathname, M_LINKER); if (error) { nlookup_done(&nd); return error; } vp = nd.nl_open_vp; nd.nl_open_vp = NULL; nlookup_done(&nd); /* * Read the elf header from the file. */ hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_obj_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_obj_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_obj_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO); lf = linker_make_file(filename, ef, &link_elf_obj_file_ops); if (lf == NULL) { kfree(ef, M_LINKER); error = ENOMEM; goto out; } ef->nprogtab = 0; ef->e_shdr = NULL; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = kmalloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. */ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_obj_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_obj_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_obj_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } if (symtabindex == -1) panic("lost symbol table index"); /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } if (symstrindex == -1) panic("lost symbol string index"); /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. This * stuff needs to be in a single chunk so that profiling etc can get * the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } vm_object_hold(ef->object); vm_object_reference_locked(ef->object); ef->address = (caddr_t) vm_map_min(&kernel_map); ef->bytes = 0; /* * In order to satisfy x86_64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. * * vkernel64's text+data is outside the managed VM space entirely. */ #if defined(__x86_64__) && defined(_KERNEL_VIRTUAL) error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize)); vm_object_drop(ef->object); #else mapbase = KERNBASE; error = vm_map_find(&kernel_map, ef->object, NULL, 0, &mapbase, round_page(mapsize), PAGE_SIZE, TRUE, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, FALSE); vm_object_drop(ef->object); if (error) { vm_object_deallocate(ef->object); ef->object = NULL; goto out; } /* Wire the pages */ error = vm_map_wire(&kernel_map, mapbase, mapbase + round_page(mapsize), 0); #endif if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t) mapbase; lf->size = round_page(mapsize); ef->bytes = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; else ef->progtab[pb].name = "<<NOBITS>>"; #if 0 if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, "set_pcpu")) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else #endif ef->progtab[pb].addr = (void *)(uintptr_t) mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS) { error = vn_rdwr(UIO_READ, vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } #if 0 /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, "set_pcpu")) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr) ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) panic("lost progbits"); if (rl != ef->nreltab) panic("lost reltab"); if (ra != ef->nrelatab) panic("lost relatab"); if (mapbase != (vm_offset_t) ef->address + mapsize) panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)", mapbase, ef->address, mapsize, (vm_offset_t) ef->address + mapsize); /* Local intra-module relocations */ link_elf_obj_reloc_local(lf); /* Pull in dependencies */ error = linker_load_dependencies(lf); if (error) goto out; /* External relocations */ error = relocate_file(lf); if (error) goto out; *result = lf; out: if (error && lf) linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */); if (hdr) kfree(hdr, M_LINKER); vn_unlock(vp); vn_close(vp, FREAD, NULL); return error; }
vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp, struct cdev_pager_ops *ops, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { cdev_t dev; vm_object_t object; u_short color; /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); size = round_page64(size); if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0) return (NULL); /* * Look up pager, creating as necessary. */ mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. */ object = vm_object_allocate_hold(tp, OFF_TO_IDX(foff + size)); object->handle = handle; object->un_pager.devp.ops = ops; object->un_pager.devp.dev = handle; TAILQ_INIT(&object->un_pager.devp.devp_pglist); /* * handle is only a device for old_dev_pager_ctor. */ if (ops->cdev_pg_ctor == old_dev_pager_ctor) { dev = handle; dev->si_object = object; } TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); vm_object_drop(object); } else { /* * Gain a reference to the object. */ vm_object_hold(object); vm_object_reference_locked(object); if (OFF_TO_IDX(foff + size) > object->size) object->size = OFF_TO_IDX(foff + size); vm_object_drop(object); } mtx_unlock(&dev_pager_mtx); return (object); }
static int link_elf_load_file(const char* filename, linker_file_t* result) { struct nlookupdata nd; struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; struct vnode *vp; Elf_Ehdr *hdr; caddr_t firstpage; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[2]; int nsegs; Elf_Phdr *phdyn; Elf_Phdr *phphdr; caddr_t mapbase; size_t mapsize; Elf_Off base_offset; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; int resid; elf_file_t ef; linker_file_t lf; char *pathname; Elf_Shdr *shdr; int symtabindex; int symstrindex; int symcnt; int strcnt; /* XXX Hack for firmware loading where p == NULL */ if (p == NULL) { p = &proc0; } KKASSERT(p != NULL); if (p->p_ucred == NULL) { kprintf("link_elf_load_file: cannot load '%s' from filesystem" " this early\n", filename); return ENOENT; } shdr = NULL; lf = NULL; pathname = linker_search_path(filename); if (pathname == NULL) return ENOENT; error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW|NLC_LOCKVP); if (error == 0) error = vn_open(&nd, NULL, FREAD, 0); kfree(pathname, M_LINKER); if (error) { nlookup_done(&nd); return error; } vp = nd.nl_open_vp; nd.nl_open_vp = NULL; nlookup_done(&nd); /* * Read the elf header from the file. */ firstpage = kmalloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); nbytes = PAGE_SIZE - resid; if (error) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error("Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error("Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error("Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. This is * not strictly required by the ABI specification, but it seems to * always true in practice. And, it simplifies things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error("Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; phphdr = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == 2) { link_elf_error("Too many sections"); error = ENOEXEC; goto out; } segs[nsegs] = phdr; ++nsegs; break; case PT_PHDR: phphdr = phdr; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error("Object is not dynamically-linked"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake out our * contiguous region, and to establish the base address for relocation. */ base_offset = trunc_page(segs[0]->p_offset); base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); mapsize = base_vlimit - base_vaddr; ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO); #ifdef SPARSE_MAPPING ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); if (ef->object == NULL) { kfree(ef, M_LINKER); error = ENOMEM; goto out; } vm_object_hold(ef->object); vm_object_reference_locked(ef->object); ef->address = (caddr_t)vm_map_min(&kernel_map); error = vm_map_find(&kernel_map, ef->object, 0, (vm_offset_t *)&ef->address, mapsize, PAGE_SIZE, 1, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); vm_object_drop(ef->object); if (error) { vm_object_deallocate(ef->object); kfree(ef, M_LINKER); goto out; } #else ef->address = kmalloc(mapsize, M_LINKER, M_WAITOK); #endif mapbase = ef->address; /* * Read the text and data sections and zero the bss. */ for (i = 0; i < 2; i++) { caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vn_rdwr(UIO_READ, vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) { #ifdef SPARSE_MAPPING vm_map_remove(&kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); vm_object_deallocate(ef->object); #else kfree(ef->address, M_LINKER); #endif kfree(ef, M_LINKER); goto out; } bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); #ifdef SPARSE_MAPPING /* * Wire down the pages */ vm_map_wire(&kernel_map, (vm_offset_t) segbase, (vm_offset_t) segbase + segs[i]->p_memsz, 0); #endif } ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf = linker_make_file(filename, ef, &link_elf_file_ops); if (lf == NULL) { #ifdef SPARSE_MAPPING vm_map_remove(&kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); vm_object_deallocate(ef->object); #else kfree(ef->address, M_LINKER); #endif kfree(ef, M_LINKER); error = ENOMEM; goto out; } lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(lf); if (error) goto out; link_elf_reloc_local(lf); error = linker_load_dependencies(lf); if (error) goto out; error = relocate_file(lf); if (error) goto out; /* Try and load the symbol table if it's present. (you can strip it!) */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = kmalloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = kmalloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = kmalloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; error = vn_rdwr(UIO_READ, vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: *result = lf; out: if (error && lf) linker_file_unload(lf); if (shdr) kfree(shdr, M_LINKER); if (firstpage) kfree(firstpage, M_LINKER); vn_unlock(vp); vn_close(vp, FREAD); return error; }