void cpu_set_upcall(struct thread *td, struct thread *td0) { struct pcb *pcb; struct trapframe *tf; ia64_highfp_save(td0); tf = td->td_frame; KASSERT(tf != NULL, ("foo")); bcopy(td0->td_frame, tf, sizeof(*tf)); tf->tf_length = sizeof(struct trapframe); tf->tf_flags = FRAME_SYSCALL; tf->tf_special.ndirty = 0; tf->tf_special.bspstore &= ~0x1ffUL; tf->tf_scratch.gr8 = 0; tf->tf_scratch.gr9 = 1; tf->tf_scratch.gr10 = 0; pcb = td->td_pcb; KASSERT(pcb != NULL, ("foo")); bcopy(td0->td_pcb, pcb, sizeof(*pcb)); pcb->pcb_special.bspstore = td->td_kstack; pcb->pcb_special.pfs = 0; pcb->pcb_current_pmap = vmspace_pmap(td->td_proc->p_vmspace); pcb->pcb_special.sp = (uintptr_t)tf - 16; pcb->pcb_special.rp = FDESC_FUNC(fork_trampoline); cpu_set_fork_handler(td, (void (*)(void*))fork_return, td); /* Setup to release the spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_intr = 1; }
/*
 * vm_fault_prefault provides a quick way of clustering
 * pagefaults into a processes address space.  It is a "cousin"
 * of vm_map_pmap_enter, except it runs at page fault time instead
 * of mmap time.
 */
static void
vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
{
	int i;
	vm_offset_t addr, starta;
	vm_pindex_t pindex;
	vm_page_t m;
	vm_object_t object;

	/* Only prefault into the faulting (current) process's pmap. */
	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
		return;

	object = entry->object.vm_object;

	/* Clamp the backward prefault window to the map entry's start. */
	starta = addra - PFBAK * PAGE_SIZE;
	if (starta < entry->start) {
		starta = entry->start;
	} else if (starta > addra) {
		/* addra - PFBAK*PAGE_SIZE wrapped below zero. */
		starta = 0;
	}

	for (i = 0; i < PAGEORDER_SIZE; i++) {
		vm_object_t backing_object, lobject;

		/* prefault_pageorder[] spreads probes around addra. */
		addr = addra + prefault_pageorder[i];
		if (addr > addra + (PFFOR * PAGE_SIZE))
			addr = 0;

		if (addr < starta || addr >= entry->end)
			continue;

		/* Skip addresses the pmap layer says are not worth it. */
		if (!pmap_is_prefaultable(pmap, addr))
			continue;

		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
		lobject = object;
		VM_OBJECT_WLOCK(lobject);
		/*
		 * Walk down the backing-object chain looking for the page,
		 * hand-over-hand locking each object before dropping the
		 * previous one.
		 */
		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
		    lobject->type == OBJT_DEFAULT &&
		    (backing_object = lobject->backing_object) != NULL) {
			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
			    0, ("vm_fault_prefault: unaligned object offset"));
			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
			VM_OBJECT_WLOCK(backing_object);
			VM_OBJECT_WUNLOCK(lobject);
			lobject = backing_object;
		}

		/*
		 * give-up when a page is not in memory
		 */
		if (m == NULL) {
			VM_OBJECT_WUNLOCK(lobject);
			break;
		}

		/* Only enter fully-valid, non-fictitious pages. */
		if (m->valid == VM_PAGE_BITS_ALL &&
		    (m->flags & PG_FICTITIOUS) == 0)
			pmap_enter_quick(pmap, addr, m, entry->protection);
		VM_OBJECT_WUNLOCK(lobject);
	}
}
/*
 * Passively intercepts the thread switch function to increase
 * the thread priority from a user priority to a kernel priority, reducing
 * syscall and trap overhead for the case where no switch occurs.
 *
 * Synchronizes td_ucred with p_ucred.  This is used by system calls,
 * signal handling, faults, AST traps, and anything else that enters the
 * kernel from userland and provides the kernel with a stable read-only
 * copy of the process ucred.
 *
 * To avoid races with another thread updating p_ucred we obtain p_spin.
 * The other thread doing the update will obtain both p_token and p_spin.
 * In the case where the cached cred pointer matches, we will already have
 * the ref and we don't have to do one blessed thing.
 */
static __inline void
userenter(struct thread *curtd, struct proc *curp)
{
	struct ucred *ocred;
	struct ucred *ncred;

	curtd->td_release = lwkt_passive_release;

	if (curtd->td_ucred != curp->p_ucred) {
		/*
		 * Take a reference on the new cred under p_spin, swap it
		 * into td_ucred, then drop the old reference (if any)
		 * outside the spinlock.
		 */
		spin_lock(&curp->p_spin);
		ncred = crhold(curp->p_ucred);
		spin_unlock(&curp->p_spin);
		ocred = curtd->td_ucred;
		curtd->td_ucred = ncred;
		if (ocred)
			crfree(ocred);
	}

#ifdef DDB
	/*
	 * Debugging, remove top two user stack pages to catch kernel faults
	 */
	if (freeze_on_seg_fault > 1 && curtd->td_lwp) {
		pmap_remove(vmspace_pmap(curtd->td_lwp->lwp_vmspace),
			    0x00007FFFFFFFD000LU, 0x0000800000000000LU);
	}
#endif
}
/*
 * Get user stack from the thread.
 * This assumes the thread is unlocked, idle,
 * and 64-bit.
 *
 * Returns a malloc'ed (M_TEMP) ksample_stack; the caller frees it.
 */
static struct ksample_stack *
stack_capture_user(struct thread *thread)
{
	struct ksample_stack *retval = NULL;
	struct amd64_frame frame = { 0 };
	size_t depth = 0;
	/* Cap the walk at one page worth of saved PCs. */
	static const size_t MAXDEPTH = 4096 / sizeof(vm_offset_t);
	caddr_t *pcs = NULL;
	int error = 0;
	pmap_t pmap = vmspace_pmap(thread->td_proc->p_vmspace);

	/* Begin the frame walk at the thread's saved user %rbp. */
	frame.f_frame = (void*)thread->td_frame->tf_rbp;
	pcs = malloc(sizeof(*pcs) * MAXDEPTH, M_TEMP, M_WAITOK | M_ZERO);
	/* The interrupted %rip is always the first sample. */
	pcs[depth++] = (caddr_t)thread->td_frame->tf_rip;
	while (frame.f_frame && depth < MAXDEPTH) {
		struct iovec iov;
		struct uio uio;

		/* Read the next struct amd64_frame from user memory. */
		iov.iov_base = (caddr_t)&frame;
		iov.iov_len = sizeof(frame);
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_offset = (off_t)(uintptr_t)frame.f_frame;
		uio.uio_resid = sizeof(frame);
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_rw = UIO_READ;
		uio.uio_td = curthread;

		// If it's not mapped in, just stop
		if (pmap_extract(pmap, (vm_offset_t)frame.f_frame) == 0) {
			break;
		}
		error = proc_rwmem(thread->td_proc, &uio);
		if (error) {
			break;
		}
		/*
		 * NOTE(review): a corrupt frame chain that points at itself
		 * terminates only via the MAXDEPTH bound above; there is no
		 * monotonicity check on f_frame here.
		 */
		pcs[depth++] = (caddr_t)frame.f_retaddr;
	}
	retval = malloc(sizeof(struct ksample_stack) + depth * sizeof(caddr_t),
	    M_TEMP, M_WAITOK);
	/* M_WAITOK allocations cannot fail; this check is purely defensive. */
	if (retval) {
		retval->depth = depth;
		bcopy(pcs, retval->pcs, depth * sizeof(pcs[0]));
	}
	free(pcs, M_TEMP);
	return retval;
}
/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the pcb, set up the stack so that the child
 * ready to run and return to user mode.
 */
void
cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
{
	struct pcb *pcb2;
	struct trapframe *tf;

	/* Only full process forks need machine-dependent setup. */
	if ((flags & RFPROC) == 0)
		return;

	if (td1 == curthread) {
		/*
		 * Save the tpidr_el0 and the vfp state, these normally happen
		 * in cpu_switch, but if userland changes these then forks
		 * this may not have happened.
		 */
		td1->td_pcb->pcb_tpidr_el0 = READ_SPECIALREG(tpidr_el0);
		td1->td_pcb->pcb_tpidrro_el0 = READ_SPECIALREG(tpidrro_el0);
#ifdef VFP
		if ((td1->td_pcb->pcb_fpflags & PCB_FP_STARTED) != 0)
			vfp_save_state(td1, td1->td_pcb);
#endif
	}

	/* The PCB occupies the top of the child's kernel stack. */
	pcb2 = (struct pcb *)(td2->td_kstack +
	    td2->td_kstack_pages * PAGE_SIZE) - 1;

	td2->td_pcb = pcb2;
	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));

	/* Record the physical address of the child's level-0 page table. */
	td2->td_proc->p_md.md_l0addr =
	    vtophys(vmspace_pmap(td2->td_proc->p_vmspace)->pm_l0);

	/* The trapframe sits just below the PCB, stack-aligned. */
	tf = (struct trapframe *)STACKALIGN((struct trapframe *)pcb2 - 1);
	bcopy(td1->td_frame, tf, sizeof(*tf));

	/* Child's x0/x1 are zeroed; keep only PSR_M_32 from parent's spsr. */
	tf->tf_x[0] = 0;
	tf->tf_x[1] = 0;
	tf->tf_spsr = td1->td_frame->tf_spsr & PSR_M_32;

	td2->td_frame = tf;

	/* Set the return value registers for fork() */
	td2->td_pcb->pcb_x[8] = (uintptr_t)fork_return;
	td2->td_pcb->pcb_x[9] = (uintptr_t)td2;
	td2->td_pcb->pcb_x[PCB_LR] = (uintptr_t)fork_trampoline;
	td2->td_pcb->pcb_sp = (uintptr_t)td2->td_frame;
	td2->td_pcb->pcb_fpusaved = &td2->td_pcb->pcb_fpustate;
	/* No CPU currently owns this thread's VFP context. */
	td2->td_pcb->pcb_vfpcpu = UINT_MAX;

	/* Setup to release spin count in fork_exit(). */
	td2->td_md.md_spinlock_count = 1;
	td2->td_md.md_saved_daif = td1->td_md.md_saved_daif & ~DAIF_I_MASKED;
}
/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the pcb, set up the stack so that the child
 * ready to run and return to user mode.
 */
void
cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
{
	struct pcb *pcb2;
	struct trapframe *tf;

	/* Only full process forks need machine-dependent setup. */
	if ((flags & RFPROC) == 0)
		return;

	/* The PCB occupies the top of the child's kernel stack. */
	pcb2 = (struct pcb *)(td2->td_kstack +
	    td2->td_kstack_pages * PAGE_SIZE) - 1;

	td2->td_pcb = pcb2;
	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));

	/* Record the physical address of the child's L1 page table. */
	td2->td_pcb->pcb_l1addr =
	    vtophys(vmspace_pmap(td2->td_proc->p_vmspace)->pm_l1);

	/* The trapframe sits just below the PCB, stack-aligned. */
	tf = (struct trapframe *)STACKALIGN((struct trapframe *)pcb2 - 1);
	bcopy(td1->td_frame, tf, sizeof(*tf));

	/* Clear syscall error flag */
	tf->tf_t[0] = 0;

	/* Arguments for child */
	tf->tf_a[0] = 0;
	tf->tf_a[1] = 0;

	tf->tf_sstatus |= (SSTATUS_SPIE);	/* Enable interrupts. */
	tf->tf_sstatus |= (SSTATUS_SUM);	/* Supervisor can access userspace. */
	tf->tf_sstatus &= ~(SSTATUS_SPP);	/* User mode. */

	td2->td_frame = tf;

	/* Set the return value registers for fork() */
	td2->td_pcb->pcb_s[0] = (uintptr_t)fork_return;
	td2->td_pcb->pcb_s[1] = (uintptr_t)td2;
	/* Resume in fork_trampoline with sp at the trapframe. */
	td2->td_pcb->pcb_ra = (uintptr_t)fork_trampoline;
	td2->td_pcb->pcb_sp = (uintptr_t)td2->td_frame;

	/* Setup to release spin count in fork_exit(). */
	td2->td_md.md_spinlock_count = 1;
	td2->td_md.md_saved_sstatus_ie = (SSTATUS_SIE);
}
/*
 * Try to remove FS references in the specified process.  This function
 * is used during shutdown
 */
static void
shutdown_cleanup_proc(struct proc *p)
{
	struct filedesc *fdp;
	struct vmspace *vm;

	if (p == NULL)
		return;
	if ((fdp = p->p_fd) != NULL) {
		/*
		 * NOTE(review): kern_closefrom() operates on the current
		 * thread's descriptor table, while the cwd/root/jail vnode
		 * cleanup below uses p's fdp -- confirm p == curproc at all
		 * call sites.
		 */
		kern_closefrom(0);
		/* Release cwd, root dir, and jail dir vnodes + name cache. */
		if (fdp->fd_cdir) {
			cache_drop(&fdp->fd_ncdir);
			vrele(fdp->fd_cdir);
			fdp->fd_cdir = NULL;
		}
		if (fdp->fd_rdir) {
			cache_drop(&fdp->fd_nrdir);
			vrele(fdp->fd_rdir);
			fdp->fd_rdir = NULL;
		}
		if (fdp->fd_jdir) {
			cache_drop(&fdp->fd_njdir);
			vrele(fdp->fd_jdir);
			fdp->fd_jdir = NULL;
		}
	}
	if (p->p_vkernel)
		vkernel_exit(p);
	/* Drop the reference on the text vnode, if any. */
	if (p->p_textvp) {
		vrele(p->p_textvp);
		p->p_textvp = NULL;
	}
	vm = p->p_vmspace;
	if (vm != NULL) {
		/* Tear down all user mappings and the user map range. */
		pmap_remove_pages(vmspace_pmap(vm),
				  VM_MIN_USER_ADDRESS,
				  VM_MAX_USER_ADDRESS);
		vm_map_remove(&vm->vm_map,
			      VM_MIN_USER_ADDRESS,
			      VM_MAX_USER_ADDRESS);
	}
}
int vm_create(const char *name, struct vm **retvm) { int i; struct vm *vm; struct vmspace *vmspace; const int BSP = 0; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->cookie = VMINIT(vm, vmspace_pmap(vmspace)); vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); for (i = 0; i < VM_MAXCPU; i++) { vcpu_init(vm, i); guest_msrs_init(vm, i); } vm_activate_cpu(vm, BSP); *retvm = vm; return (0); }
/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 *
 * No requirements.
 */
void
vm_fork(struct proc *p1, struct proc *p2, int flags)
{
	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory, if it is shared, essentially
		 * this changes shared memory amongst threads, into
		 * COW locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_sysref.refcnt > 1) {
				vmspace_unshare(p1);
			}
		}
		/* No new process: only the current lwp needs MD setup. */
		cpu_fork(ONLY_LWP_IN_PROC(p1), NULL, flags);
		return;
	}

	if (flags & RFMEM) {
		/* Share the parent's address space outright. */
		vmspace_ref(p1->p_vmspace);
		p2->p_vmspace = p1->p_vmspace;
	}

	/* Throttle forks while free memory is critically low. */
	while (vm_page_count_severe()) {
		vm_wait(0);
	}

	if ((flags & RFMEM) == 0) {
		/* COW-copy the parent's vmspace into the child. */
		p2->p_vmspace = vmspace_fork(p1->p_vmspace);

		pmap_pinit2(vmspace_pmap(p2->p_vmspace));

		/* Propagate SysV shared memory attachments. */
		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	pmap_init_proc(p2);
}
/*
 * Load a uio.
 *
 * Walks the uio's iovec array and hands each non-empty chunk to
 * _bus_dmamap_load_buffer() until the residual count is consumed or
 * an error occurs.  User-space transfers are resolved through the
 * originating thread's pmap; everything else uses kernel_pmap.
 */
static int
_bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
    int *nsegs, int flags)
{
	bus_size_t bytes_left;
	bus_size_t chunk;
	struct iovec *iov;
	pmap_t pmap;
	caddr_t kva;
	int error, iovnr;

	if (uio->uio_segflg == UIO_USERSPACE) {
		KASSERT(uio->uio_td != NULL,
			("bus_dmamap_load_uio: USERSPACE but no proc"));
		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
	} else {
		pmap = kernel_pmap;
	}

	bytes_left = uio->uio_resid;
	iov = uio->uio_iov;
	error = 0;

	/* Load iovecs in order until the residual is exhausted. */
	for (iovnr = 0; iovnr < uio->uio_iovcnt; iovnr++) {
		if (bytes_left == 0 || error != 0)
			break;
		kva = (caddr_t)iov[iovnr].iov_base;
		chunk = (bytes_left < iov[iovnr].iov_len) ?
		    bytes_left : iov[iovnr].iov_len;
		if (chunk > 0) {
			error = _bus_dmamap_load_buffer(dmat, map, kva,
			    chunk, pmap, flags, NULL, nsegs);
			bytes_left -= chunk;
		}
	}

	return (error);
}
/*
 * Handle a nested-page-table fault taken by a vcpu.
 *
 * Read/write faults are first offered to the pmap layer's
 * accessed/dirty bit emulation; if that resolves the fault, no VM
 * fault is needed.  Otherwise a normal vm_fault() is performed on the
 * guest physical address.  On success the vcpu restarts the faulting
 * instruction (inst_length forced to 0); on failure EFAULT is
 * returned.
 */
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	struct vcpu *vcpu;
	struct vm_exit *vme;
	struct vm_map *map;
	int ftype, rv;
	bool handled;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	/* A/D bit emulation can only satisfy read or write faults. */
	handled = false;
	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		handled = (rv == 0);
	}

	if (!handled) {
		map = &vm->vmspace->vm_map;
		rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

		VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
		    "ftype = %d", rv, vme->u.paging.gpa, ftype);

		if (rv != KERN_SUCCESS)
			return (EFAULT);
	}

	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}
/*
 * Run a vcpu until it exits for a reason that must be handled in
 * userland.  In-kernel-handleable exits (EOI, rendezvous, HLT, paging,
 * instruction emulation) are serviced here and the guest is resumed;
 * anything else is reported back via vmrun->vm_exit.
 */
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	/* No preemption between state restore, VMRUN, and state save. */
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		/* retu: exit must be returned to userland for handling. */
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled,
			    &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	/* Exit handled in-kernel: resume the guest after the instruction. */
	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
size_t md_stack_capture_curthread(caddr_t *pcs, size_t size) { struct trapframe *tf = curthread->td_intr_frame; pmap_t pmap = vmspace_pmap(curthread->td_proc->p_vmspace); size_t num_kstacks = 0, num_ustacks = 0; #if SAMPLE_DEBUG > 2 printf("%s(%d): curthread pid %u, tid %u, name %s\n", __FUNCTION__, __LINE__, curthread->td_proc->p_pid, curthread->td_tid, curthread->td_name); #endif if (tf == NULL) { #if SAMPLE_DEBUG > 5 printf("%s(%d): intr frame is NULL?\n", __FUNCTION__, __LINE__); #endif // tf = curthread->td_frame; /* * I'm not sure what this means. This should be done via interrupt, * so if curthread wasn't interrupted, then I don't think we can get * anything here? Please tell me what I'm doing wrong. */ return 0; } if (!TRAPF_USERMODE(tf)) { // Start with kernel mode unsigned long callpc; struct amd64_frame *frame; frame = (struct amd64_frame*)tf->tf_rbp; while (num_kstacks < size) { if (!INKERNEL((long)frame)) break; callpc = frame->f_retaddr; if (!INKERNEL(callpc)) break; pcs[num_kstacks++] = (caddr_t)callpc; if (frame->f_frame <= frame || (vm_offset_t)frame->f_frame >= (vm_offset_t)tf->tf_rbp + KSTACK_PAGES * PAGE_SIZE) break; frame = frame->f_frame; } tf = curthread->td_frame; } if (TRAPF_USERMODE(tf)) { caddr_t *start_pc = pcs + num_kstacks; struct amd64_frame frame; #if SAMPLE_DEBUG > 3 printf("%s(%d): doing user mode crawl\n", __FUNCTION__, __LINE__); #endif frame.f_frame = (struct amd64_frame*)tf->tf_rbp; if (tf != curthread->td_intr_frame) { start_pc[num_ustacks++] = (caddr_t)tf->tf_rip; } while ((num_kstacks + num_ustacks) < size) { void *bp = frame.f_frame; /* * The calls to GET_WORD replace the non-functional: * copyin_nofault((void*)frame.f_frame, &frame, sizeof(frame)); */ frame.f_frame = (struct amd64_frame*)GET_WORD(pmap, (caddr_t)frame.f_frame); if (frame.f_frame == 0) { #if SAMPLE_DEBUG > 4 printf("%s(%d): %s: frame is 0\n", __FUNCTION__, __LINE__, curthread->td_name); #endif break; } frame.f_retaddr = (long)GET_WORD(pmap, 
(caddr_t)frame.f_frame + offsetof(struct amd64_frame, f_retaddr)); if (frame.f_retaddr == 0) { #if SAMPLE_DEBUG > 4 printf("%s(%d): %s: retaddr from frame %p is 0\n", __FUNCTION__, __LINE__, curthread->td_name, (void*)frame.f_frame); #endif break; } #if SAMPLE_DEBUG > 2 printf("%s(%d): f_frame = %ld, f_retaddr = %ld\n", __FUNCTION__, __LINE__, (long)frame.f_frame, (long)frame.f_retaddr); #endif start_pc[num_ustacks++] = (caddr_t)frame.f_retaddr; if ((void*)frame.f_frame < bp) { break; } } } else {
/*
 * The map entries can *almost* be read with programs like cat.  However,
 * large maps need special programs to read.  It is not easy to implement
 * a program that can sense the required size of the buffer, and then
 * subsequently do a read with the appropriate size.  This operation cannot
 * be atomic.  The best that we can do is to allow the program to do a read
 * with an arbitrarily large buffer, and return as much as we can.  We can
 * return an error code if the buffer is too small (EFBIG), then the program
 * can try a bigger buffer.
 */
int
procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
	     struct uio *uio)
{
	struct proc *p = lp->lwp_proc;
	int len;
	struct vnode *vp;
	char *fullpath, *freepath;
	int error;
	vm_map_t map = &p->p_vmspace->vm_map;
	pmap_t pmap = vmspace_pmap(p->p_vmspace);
	vm_map_entry_t entry;
	char mebuffer[MEBUFFERSIZE];

	/* Read-only node; only whole reads starting at offset 0. */
	if (uio->uio_rw != UIO_READ)
		return (EOPNOTSUPP);

	if (uio->uio_offset != 0)
		return (0);

	error = 0;
	vm_map_lock_read(map);
	for (entry = map->header.next;
	    ((uio->uio_resid > 0) && (entry != &map->header));
	    entry = entry->next) {
		vm_object_t obj, tobj, lobj;
		int ref_count, shadow_count, flags;
		vm_offset_t addr;
		vm_offset_t ostart;
		int resident, privateresident;
		char *type;

		/* Only normal and vpagetable mappings are reported. */
		if (entry->maptype != VM_MAPTYPE_NORMAL &&
		    entry->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}

		obj = entry->object.vm_object;
		if (obj)
			vm_object_hold(obj);

		/* Pages are "private" only if nothing else shadows obj. */
		if (obj && (obj->shadow_count == 1))
			privateresident = obj->resident_page_count;
		else
			privateresident = 0;

		/*
		 * Use map->hint as a poor man's ripout detector.
		 */
		map->hint = entry;
		ostart = entry->start;

		/*
		 * Count resident pages (XXX can be horrible on 64-bit)
		 */
		resident = 0;
		addr = entry->start;
		while (addr < entry->end) {
			if (pmap_extract(pmap, addr))
				resident++;
			addr += PAGE_SIZE;
		}

		if (obj) {
			/*
			 * Descend to the bottom-most backing object,
			 * hand-over-hand holding; re-check the link after
			 * acquiring since it can change under us.
			 */
			lobj = obj;
			while ((tobj = lobj->backing_object) != NULL) {
				KKASSERT(tobj != obj);
				vm_object_hold(tobj);
				if (tobj == lobj->backing_object) {
					if (lobj != obj) {
						vm_object_lock_swap();
						vm_object_drop(lobj);
					}
					lobj = tobj;
				} else {
					/* Link changed; retry this level. */
					vm_object_drop(tobj);
				}
			}
		} else {
			lobj = NULL;
		}

		freepath = NULL;
		fullpath = "-";
		if (lobj) {
			switch(lobj->type) {
			default:
			case OBJT_DEFAULT:
				type = "default";
				vp = NULL;
				break;
			case OBJT_VNODE:
				type = "vnode";
				vp = lobj->handle;
				vref(vp);
				break;
			case OBJT_SWAP:
				type = "swap";
				vp = NULL;
				break;
			case OBJT_DEVICE:
				type = "device";
				vp = NULL;
				break;
			}

			flags = obj->flags;
			ref_count = obj->ref_count;
			shadow_count = obj->shadow_count;
			if (vp != NULL) {
				vn_fullpath(p, vp, &fullpath, &freepath, 1);
				vrele(vp);
			}
			if (lobj != obj)
				vm_object_drop(lobj);
		} else {
			type = "none";
			flags = 0;
			ref_count = 0;
			shadow_count = 0;
		}

		/*
		 * format:
		 *  start, end, res, priv res, cow, access, type, (fullpath).
		 */
		ksnprintf(mebuffer, sizeof(mebuffer),
#if LONG_BIT == 64
			  "0x%016lx 0x%016lx %d %d %p %s%s%s %d %d "
#else
			  "0x%08lx 0x%08lx %d %d %p %s%s%s %d %d "
#endif
			  "0x%04x %s %s %s %s\n",
			(u_long)entry->start, (u_long)entry->end,
			resident, privateresident, obj,
			(entry->protection & VM_PROT_READ)?"r":"-",
			(entry->protection & VM_PROT_WRITE)?"w":"-",
			(entry->protection & VM_PROT_EXECUTE)?"x":"-",
			ref_count, shadow_count, flags,
			(entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW",
			(entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC",
			type, fullpath);

		if (obj)
			vm_object_drop(obj);

		if (freepath != NULL) {
			kfree(freepath, M_TEMP);
			freepath = NULL;
		}

		/* Entries are all-or-nothing; a partial line is an error. */
		len = strlen(mebuffer);
		if (len > uio->uio_resid) {
			error = EFBIG;
			break;
		}

		/*
		 * We cannot safely hold the map locked while accessing
		 * userspace as a VM fault might recurse the locked map.
		 */
		vm_map_unlock_read(map);
		error = uiomove(mebuffer, len, uio);
		vm_map_lock_read(map);
		if (error)
			break;

		/*
		 * We use map->hint as a poor man's ripout detector.  If
		 * it does not match the entry we set it to prior to
		 * unlocking the map the entry MIGHT now be stale.  In
		 * this case we do an expensive lookup to find our place
		 * in the iteration again.
		 */
		if (map->hint != entry) {
			vm_map_entry_t reentry;

			vm_map_lookup_entry(map, ostart, &reentry);
			entry = reentry;
		}
	}
	vm_map_unlock_read(map);
	return error;
}
/*
 * ARM data-abort (MMU fault) handler.  Classifies the fault via the
 * Fault Status Register, resolves it through fixup handlers, pmap
 * ref/mod emulation, or vm_fault(), and delivers a signal or panics
 * when the fault cannot be resolved.
 */
void
abort_handler(struct trapframe *tf, int type)
{
	struct vm_map *map;
	struct pcb *pcb;
	struct thread *td;
	u_int user, far, fsr;
	vm_prot_t ftype;
	void *onfault;
	vm_offset_t va;
	int error = 0;
	struct ksig ksig;
	struct proc *p;

	/* Prefetch aborts (type 1) are handled by their own routine. */
	if (type == 1)
		return (prefetch_abort_handler(tf));

	/* Grab FAR/FSR before enabling interrupts */
	far = cpu_faultaddress();
	fsr = cpu_faultstatus();
#if 0
	printf("data abort: fault address=%p (from pc=%p lr=%p)\n",
	    (void*)far, (void*)tf->tf_pc, (void*)tf->tf_svc_lr);
#endif

	/* Update vmmeter statistics */
#if 0
	vmexp.traps++;
#endif

	td = curthread;
	p = td->td_proc;

	PCPU_INC(cnt.v_trap);
	/* Data abort came from user mode? */
	user = TRAP_USERMODE(tf);

	if (user) {
		td->td_pticks = 0;
		td->td_frame = tf;
		if (td->td_cowgen != td->td_proc->p_cowgen)
			thread_cow_update(td);
	}
	/* Grab the current pcb */
	pcb = td->td_pcb;
	/* Re-enable interrupts if they were enabled previously */
	if (td->td_md.md_spinlock_count == 0) {
		/*
		 * NOTE(review): __predict_true() wraps only the mask, so
		 * the hint applies to (spsr & PSR_x) rather than the == 0
		 * comparison; behavior is unchanged but the branch hint is
		 * effectively inverted.
		 */
		if (__predict_true(tf->tf_spsr & PSR_I) == 0)
			enable_interrupts(PSR_I);
		if (__predict_true(tf->tf_spsr & PSR_F) == 0)
			enable_interrupts(PSR_F);
	}
	/* Invoke the appropriate handler, if necessary */
	if (__predict_false(data_aborts[fsr & FAULT_TYPE_MASK].func != NULL)) {
		if ((data_aborts[fsr & FAULT_TYPE_MASK].func)(tf, fsr, far,
		    td, &ksig)) {
			goto do_trapsignal;
		}
		goto out;
	}

	/*
	 * At this point, we're dealing with one of the following data aborts:
	 *
	 *  FAULT_TRANS_S  - Translation -- Section
	 *  FAULT_TRANS_P  - Translation -- Page
	 *  FAULT_DOMAIN_S - Domain     -- Section
	 *  FAULT_DOMAIN_P - Domain     -- Page
	 *  FAULT_PERM_S   - Permission -- Section
	 *  FAULT_PERM_P   - Permission -- Page
	 *
	 * These are the main virtual memory-related faults signalled by
	 * the MMU.
	 */

	/*
	 * Make sure the Program Counter is sane.  We could fall foul of
	 * someone executing Thumb code, in which case the PC might not
	 * be word-aligned.  This would cause a kernel alignment fault
	 * further down if we have to decode the current instruction.
	 * XXX: It would be nice to be able to support Thumb at some point.
	 */
	if (__predict_false((tf->tf_pc & 3) != 0)) {
		if (user) {
			/*
			 * Give the user an illegal instruction signal.
			 */
			/* Deliver a SIGILL to the process */
			ksig.signb = SIGILL;
			ksig.code = 0;
			goto do_trapsignal;
		}

		/*
		 * The kernel never executes Thumb code.
		 */
		printf("\ndata_abort_fault: Misaligned Kernel-mode "
		    "Program Counter\n");
		dab_fatal(tf, fsr, far, td, &ksig);
	}

	va = trunc_page((vm_offset_t)far);

	/*
	 * It is only a kernel address space fault iff:
	 *	1. user == 0  and
	 *	2. pcb_onfault not set or
	 *	3. pcb_onfault set and not LDRT/LDRBT/STRT/STRBT instruction.
	 */
	if (user == 0 && (va >= VM_MIN_KERNEL_ADDRESS ||
	    (va < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW)) &&
	    __predict_true((pcb->pcb_onfault == NULL ||
	     (ReadWord(tf->tf_pc) & 0x05200000) != 0x04200000))) {
		map = kernel_map;

		/* Was the fault due to the FPE/IPKDB ? */
		if (__predict_false((tf->tf_spsr & PSR_MODE)==PSR_UND32_MODE)) {

			/*
			 * Force exit via userret()
			 * This is necessary as the FPE is an extension to
			 * userland that actually runs in a priveledged mode
			 * but uses USR mode permissions for its accesses.
			 */
			user = 1;
			ksig.signb = SIGSEGV;
			ksig.code = 0;
			goto do_trapsignal;
		}
	} else {
		map = &td->td_proc->p_vmspace->vm_map;
	}

	/*
	 * We need to know whether the page should be mapped as R or R/W.
	 * On armv4, the fault status register does not indicate whether
	 * the access was a read or write.  We know that a permission fault
	 * can only be the result of a write to a read-only location, so we
	 * can deal with those quickly.  Otherwise we need to disassemble
	 * the faulting instruction to determine if it was a write.
	 */
	if (IS_PERMISSION_FAULT(fsr))
		ftype = VM_PROT_WRITE;
	else {
		u_int insn = ReadWord(tf->tf_pc);

		if (((insn & 0x0c100000) == 0x04000000) ||	/* STR/STRB */
		    ((insn & 0x0e1000b0) == 0x000000b0) ||	/* STRH/STRD */
		    ((insn & 0x0a100000) == 0x08000000)) {	/* STM/CDT */
			ftype = VM_PROT_WRITE;
		} else {
			if ((insn & 0x0fb00ff0) == 0x01000090)	/* SWP */
				ftype = VM_PROT_READ | VM_PROT_WRITE;
			else
				ftype = VM_PROT_READ;
		}
	}

	/*
	 * See if the fault is as a result of ref/mod emulation,
	 * or domain mismatch.
	 */
#ifdef DEBUG
	last_fault_code = fsr;
#endif
	/* vm_fault() can sleep; never attempt it in a critical section. */
	if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK,
	    NULL, "Kernel page fault") != 0)
		goto fatal_pagefault;

	if (pmap_fault_fixup(vmspace_pmap(td->td_proc->p_vmspace), va, ftype,
	    user)) {
		goto out;
	}

	/* Clear pcb_onfault around vm_fault so a recursive fault is fatal. */
	onfault = pcb->pcb_onfault;
	pcb->pcb_onfault = NULL;
	error = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	pcb->pcb_onfault = onfault;
	if (__predict_true(error == 0))
		goto out;
fatal_pagefault:
	if (user == 0) {
		/* Kernel copyin/copyout style recovery via pcb_onfault. */
		if (pcb->pcb_onfault) {
			tf->tf_r0 = error;
			tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault;
			return;
		}

		printf("\nvm_fault(%p, %x, %x, 0) -> %x\n", map, va, ftype,
		    error);
		dab_fatal(tf, fsr, far, td, &ksig);
	}

	if (error == ENOMEM) {
		printf("VM: pid %d (%s), uid %d killed: "
		    "out of swap\n", td->td_proc->p_pid, td->td_name,
		    (td->td_proc->p_ucred) ?
		     td->td_proc->p_ucred->cr_uid : -1);
		ksig.signb = SIGKILL;
	} else {
		ksig.signb = SIGSEGV;
	}
	ksig.code = 0;
do_trapsignal:
	call_trapsignal(td, ksig.signb, ksig.code);
out:
	/* If returning to user mode, make sure to invoke userret() */
	if (user)
		userret(td, tf);
}
/*
 * Utility function to load a linear buffer.  lastaddrp holds state
 * between invocations (for multiple-buffer loads).  segp contains
 * the starting segment on entrance, and the ending segment on exit.
 * first indicates if this is the first invocation of this function.
 */
static int
bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dma_segment_t segs[],
    void *buf, bus_size_t buflen, struct thread *td, int flags,
    vm_offset_t *lastaddrp, int *segp, int first)
{
	bus_size_t sgsize;
	bus_addr_t curaddr, lastaddr, baddr, bmask;
	vm_offset_t vaddr = (vm_offset_t)buf;
	int seg;
	pmap_t pmap;

	/* User buffers translate through the owning process's pmap. */
	if (td != NULL)
		pmap = vmspace_pmap(td->td_proc->p_vmspace);
	else
		pmap = NULL;

	lastaddr = *lastaddrp;
	bmask = ~(dmat->boundary - 1);

	for (seg = *segp; buflen > 0 ; ) {
		/*
		 * Get the physical address for this segment.
		 */
		if (pmap)
			curaddr = pmap_extract(pmap, vaddr);
		else
			curaddr = pmap_kextract(vaddr);

		/*
		 * Compute the segment size, and adjust counts.
		 * Limited by: remainder of the current page, the tag's
		 * maximum segment size, and the bytes left in the buffer.
		 */
		sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK);
		if (sgsize > dmat->maxsegsz)
			sgsize = dmat->maxsegsz;
		if (buflen < sgsize)
			sgsize = buflen;

		/*
		 * Make sure we don't cross any boundaries.
		 */
		if (dmat->boundary > 0) {
			baddr = (curaddr + dmat->boundary) & bmask;
			if (sgsize > (baddr - curaddr))
				sgsize = (baddr - curaddr);
		}

		/*
		 * Insert chunk into a segment, coalescing with
		 * the previous segment if possible.  Coalescing requires
		 * physical contiguity, staying under maxsegsz, and not
		 * crossing a boundary window.
		 */
		if (first) {
			segs[seg].ds_addr = curaddr;
			segs[seg].ds_len = sgsize;
			first = 0;
		} else {
			if (curaddr == lastaddr &&
			    (segs[seg].ds_len + sgsize) <= dmat->maxsegsz &&
			    (dmat->boundary == 0 ||
			     (segs[seg].ds_addr & bmask) ==
			     (curaddr & bmask)))
				segs[seg].ds_len += sgsize;
			else {
				if (++seg >= dmat->nsegments)
					break;
				segs[seg].ds_addr = curaddr;
				segs[seg].ds_len = sgsize;
			}
		}

		lastaddr = curaddr + sgsize;
		vaddr += sgsize;
		buflen -= sgsize;
	}

	*segp = seg;
	*lastaddrp = lastaddr;

	/*
	 * Did we fit?
	 */
	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
}
/* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(register struct thread *td1, register struct proc *p2, struct thread *td2, int flags) { struct pcb *pcb2; struct trapframe *tf; struct mdproc *mdp2; if ((flags & RFPROC) == 0) return; /* Point the pcb to the top of the stack */ pcb2 = (struct pcb *) (td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE) - 1; #ifdef __XSCALE__ #ifndef CPU_XSCALE_CORE3 pmap_use_minicache(td2->td_kstack, td2->td_kstack_pages * PAGE_SIZE); #endif #endif td2->td_pcb = pcb2; /* Clone td1's pcb */ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); /* Point to mdproc and then copy over td1's contents */ mdp2 = &p2->p_md; bcopy(&td1->td_proc->p_md, mdp2, sizeof(*mdp2)); /* Point the frame to the stack in front of pcb and copy td1's frame */ td2->td_frame = (struct trapframe *)pcb2 - 1; *td2->td_frame = *td1->td_frame; /* * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ pmap_set_pcb_pagedir(vmspace_pmap(p2->p_vmspace), pcb2); pcb2->pcb_regs.sf_r4 = (register_t)fork_return; pcb2->pcb_regs.sf_r5 = (register_t)td2; pcb2->pcb_regs.sf_lr = (register_t)fork_trampoline; pcb2->pcb_regs.sf_sp = STACKALIGN(td2->td_frame); pcb2->pcb_vfpcpu = -1; pcb2->pcb_vfpstate.fpscr = VFPSCR_DN | VFPSCR_FZ; tf = td2->td_frame; tf->tf_spsr &= ~PSR_C; tf->tf_r0 = 0; tf->tf_r1 = 0; /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_cspr = PSR_SVC32_MODE;; #ifdef ARM_TP_ADDRESS td2->td_md.md_tp = *(register_t *)ARM_TP_ADDRESS; #else td2->td_md.md_tp = td1->td_md.md_tp; #endif }
int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; vm_offset_t oldphys; struct pmap *pm; vm_page_t page; static vm_page_t opage = NULL; int ret = 0; int pteobj_allocated = 0; u_long ef; struct proc *p; if (sc->acpi_wakeaddr == 0) { return (0); } AcpiSetFirmwareWakingVector(sc->acpi_wakephys); ef = read_eflags(); disable_intr(); /* Create Identity Mapping */ if ((p = curproc) == NULL) p = &proc0; pm = vmspace_pmap(p->p_vmspace); if (pm->pm_pteobj == NULL) { pm->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1); pteobj_allocated = 1; } oldphys = pmap_extract(pm, sc->acpi_wakephys); if (oldphys) { opage = PHYS_TO_VM_PAGE(oldphys); } page = PHYS_TO_VM_PAGE(sc->acpi_wakephys); pmap_enter(pm, sc->acpi_wakephys, page, VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, 1); ret_addr = 0; if (acpi_savecpu()) { /* Execute Sleep */ p_gdt = (struct region_descriptor *)(sc->acpi_wakeaddr + physical_gdt); p_gdt->rd_limit = r_gdt.rd_limit; p_gdt->rd_base = vtophys(r_gdt.rd_base); WAKECODE_FIXUP(physical_esp, u_int32_t, vtophys(r_esp)); WAKECODE_FIXUP(previous_cr0, u_int32_t, r_cr0); WAKECODE_FIXUP(previous_cr2, u_int32_t, r_cr2); WAKECODE_FIXUP(previous_cr3, u_int32_t, r_cr3); WAKECODE_FIXUP(previous_cr4, u_int32_t, r_cr4); WAKECODE_FIXUP(previous_tr, u_int16_t, r_tr); WAKECODE_BCOPY(previous_gdt, struct region_descriptor, r_gdt); WAKECODE_FIXUP(previous_ldt, u_int16_t, r_ldt); WAKECODE_BCOPY(previous_idt, struct region_descriptor, r_idt); WAKECODE_FIXUP(where_to_recover, void, acpi_restorecpu); WAKECODE_FIXUP(previous_ds, u_int16_t, r_ds); WAKECODE_FIXUP(previous_es, u_int16_t, r_es); WAKECODE_FIXUP(previous_fs, u_int16_t, r_fs); WAKECODE_FIXUP(previous_gs, u_int16_t, r_gs); WAKECODE_FIXUP(previous_ss, u_int16_t, r_ss); if (acpi_get_verbose(sc)) { acpi_printcpu(); } wbinvd(); if (state == ACPI_STATE_S4 && sc->acpi_s4bios) { status = AcpiEnterSleepStateS4Bios(); } else { status = AcpiEnterSleepState(state); } if (status != AE_OK) { 
device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); ret = -1; goto out; } for (;;) ; } else { /* Execute Wakeup */ #if 0 initializecpu(); #endif icu_reinit(); if (acpi_get_verbose(sc)) { acpi_savecpu(); acpi_printcpu(); } } out: vm_page_lock_queues(); pmap_remove(pm, sc->acpi_wakephys, sc->acpi_wakephys + PAGE_SIZE); vm_page_unlock_queues(); if (opage) { pmap_enter(pm, sc->acpi_wakephys, page, VM_PROT_READ | VM_PROT_WRITE, 0); } if (pteobj_allocated) { vm_object_deallocate(pm->pm_pteobj); pm->pm_pteobj = NULL; } write_eflags(ef); return (ret); }
/*
 * Destroy old address space, and allocate a new stack.
 *	The new stack is only sgrowsiz large because it is grown
 *	automatically on a page fault.
 *
 * Called during exec.  Either wipes and reuses the current vmspace
 * (when it is unshared and its bounds match the new sysentvec) or
 * replaces it via vmspace_exec().  Then maps the optional shared page
 * and creates the user stack sized from the image, the stack rlimit,
 * or maxssiz.
 *
 * Returns 0 on success or an errno from vmspace_exec()/vm_map_*.
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
	int error;
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	vm_object_t obj;
	struct rlimit rlim_stack;
	vm_offset_t sv_minuser, stack_addr;
	vm_map_t map;
	u_long ssiz;

	imgp->vmspace_destroyed = 1;
	imgp->sysent = sv;

	/* May be called with Giant held */
	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);

	/*
	 * Blow away entire process VM, if address space not shared,
	 * otherwise, create a new VM space so that other threads are
	 * not disrupted
	 */
	map = &vmspace->vm_map;
	/* map_at_zero is the sysctl allowing mappings at VA 0. */
	if (map_at_zero)
		sv_minuser = sv->sv_minuser;
	else
		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
	    vm_map_max(map) == sv->sv_maxuser &&
	    cpu_exec_vmspace_reuse(p, map)) {
		/* Sole user of the vmspace: strip it and reuse in place. */
		shmexit(vmspace);
		pmap_remove_pages(vmspace_pmap(vmspace));
		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
		/*
		 * An exec terminates mlockall(MCL_FUTURE), ASLR state
		 * must be re-evaluated.
		 */
		vm_map_lock(map);
		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
		    MAP_ASLR_IGNSTART);
		vm_map_unlock(map);
	} else {
		/* Shared or mismatched vmspace: allocate a fresh one. */
		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
		if (error)
			return (error);
		vmspace = p->p_vmspace;
		map = &vmspace->vm_map;
	}
	map->flags |= imgp->map_flags;

	/* Map a shared page */
	obj = sv->sv_shared_page_obj;
	if (obj != NULL) {
		vm_object_reference(obj);
		error = vm_map_fixed(map, obj, 0,
		    sv->sv_shared_page_base, sv->sv_shared_page_len,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
		if (error != KERN_SUCCESS) {
			/* Drop the reference taken above on failure. */
			vm_object_deallocate(obj);
			return (vm_mmap_to_errno(error));
		}
	}

	/* Allocate a new stack */
	if (imgp->stack_sz != 0) {
		/*
		 * Image requests a specific stack size: clamp it to the
		 * hard limit and raise the soft limit to match if needed.
		 */
		ssiz = trunc_page(imgp->stack_sz);
		PROC_LOCK(p);
		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
		PROC_UNLOCK(p);
		if (ssiz > rlim_stack.rlim_max)
			ssiz = rlim_stack.rlim_max;
		if (ssiz > rlim_stack.rlim_cur) {
			rlim_stack.rlim_cur = ssiz;
			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
		}
	} else if (sv->sv_maxssiz != NULL) {
		ssiz = *sv->sv_maxssiz;
	} else {
		ssiz = maxssiz;
	}
	stack_addr = sv->sv_usrstack - ssiz;
	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
	if (error != KERN_SUCCESS)
		return (vm_mmap_to_errno(error));

	/*
	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
	 * are still used to enforce the stack rlimit on the process stack.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)stack_addr;

	return (0);
}
/* * Abort handler. * * FAR, FSR, and everything what can be lost after enabling * interrupts must be grabbed before the interrupts will be * enabled. Note that when interrupts will be enabled, we * could even migrate to another CPU ... * * TODO: move quick cases to ASM */ void abort_handler(struct trapframe *tf, int prefetch) { struct thread *td; vm_offset_t far, va; int idx, rv; uint32_t fsr; struct ksig ksig; struct proc *p; struct pcb *pcb; struct vm_map *map; struct vmspace *vm; vm_prot_t ftype; bool usermode; #ifdef INVARIANTS void *onfault; #endif td = curthread; fsr = (prefetch) ? cp15_ifsr_get(): cp15_dfsr_get(); #if __ARM_ARCH >= 7 far = (prefetch) ? cp15_ifar_get() : cp15_dfar_get(); #else far = (prefetch) ? TRAPF_PC(tf) : cp15_dfar_get(); #endif idx = FSR_TO_FAULT(fsr); usermode = TRAPF_USERMODE(tf); /* Abort came from user mode? */ if (usermode) td->td_frame = tf; CTR6(KTR_TRAP, "%s: fsr %#x (idx %u) far %#x prefetch %u usermode %d", __func__, fsr, idx, far, prefetch, usermode); /* * Firstly, handle aborts that are not directly related to mapping. */ if (__predict_false(idx == FAULT_EA_IMPREC)) { abort_imprecise(tf, fsr, prefetch, usermode); return; } if (__predict_false(idx == FAULT_DEBUG)) { abort_debug(tf, fsr, prefetch, usermode, far); return; } /* * ARM has a set of unprivileged load and store instructions * (LDRT/LDRBT/STRT/STRBT ...) which are supposed to be used in other * than user mode and OS should recognize their aborts and behave * appropriately. However, there is no way how to do that reasonably * in general unless we restrict the handling somehow. * * For now, these instructions are used only in copyin()/copyout() * like functions where usermode buffers are checked in advance that * they are not from KVA space. Thus, no action is needed here. 
*/ #ifdef ARM_NEW_PMAP rv = pmap_fault(PCPU_GET(curpmap), far, fsr, idx, usermode); if (rv == 0) { return; } else if (rv == EFAULT) { call_trapsignal(td, SIGSEGV, SEGV_MAPERR, far); userret(td, tf); return; } #endif /* * Now, when we handled imprecise and debug aborts, the rest of * aborts should be really related to mapping. */ PCPU_INC(cnt.v_trap); #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != far || (td->td_pflags & TDP_RESETSPUR) != 0) { td->td_md.md_spurflt_addr = far; td->td_pflags &= ~TDP_RESETSPUR; tlb_flush_local(far & ~PAGE_MASK); return; } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. 
*/ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } /* Re-enable interrupts if they were enabled previously. */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } p = td->td_proc; if (usermode) { td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); } /* Invoke the appropriate handler, if necessary. */ if (__predict_false(aborts[idx].func != NULL)) { if ((aborts[idx].func)(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } /* * Don't pass faulting cache operation to vm_fault(). We don't want * to handle all vm stuff at this moment. */ pcb = td->td_pcb; if (__predict_false(pcb->pcb_onfault == cachebailout)) { tf->tf_r0 = far; /* return failing address */ tf->tf_pc = (register_t)pcb->pcb_onfault; return; } /* Handle remaining I-cache aborts. */ if (idx == FAULT_ICACHE) { if (abort_icache(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } /* * At this point, we're dealing with one of the following aborts: * * FAULT_TRAN_xx - Translation * FAULT_PERM_xx - Permission * * These are the main virtual memory-related faults signalled by * the MMU. */ /* fusubailout is used by [fs]uswintr to avoid page faulting. */ pcb = td->td_pcb; if (__predict_false(pcb->pcb_onfault == fusubailout)) { tf->tf_r0 = EFAULT; tf->tf_pc = (register_t)pcb->pcb_onfault; return; } va = trunc_page(far); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. If curproc * is NULL or curproc->p_vmspace is NULL the fault is fatal. */ vm = (p != NULL) ? 
p->p_vmspace : NULL; if (vm == NULL) goto nogo; map = &vm->vm_map; if (!usermode && (td->td_intr_nesting_level != 0 || pcb->pcb_onfault == NULL)) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } ftype = (fsr & FSR_WNR) ? VM_PROT_WRITE : VM_PROT_READ; if (prefetch) ftype |= VM_PROT_EXECUTE; #ifdef DEBUG last_fault_code = fsr; #endif #ifndef ARM_NEW_PMAP if (pmap_fault_fixup(vmspace_pmap(td->td_proc->p_vmspace), va, ftype, usermode)) { goto out; } #endif #ifdef INVARIANTS onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; #endif /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); #ifdef INVARIANTS pcb->pcb_onfault = onfault; #endif if (__predict_true(rv == KERN_SUCCESS)) goto out; nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != NULL) { tf->tf_r0 = rv; tf->tf_pc = (int)pcb->pcb_onfault; return; } CTR2(KTR_TRAP, "%s: vm_fault() failed with %d", __func__, rv); abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } ksig.sig = SIGSEGV; ksig.code = (rv == KERN_PROTECTION_FAILURE) ? SEGV_ACCERR : SEGV_MAPERR; ksig.addr = far; do_trapsignal: call_trapsignal(td, ksig.sig, ksig.code, ksig.addr); out: if (usermode) userret(td, tf); }
/* * mincore system call handler * * mincore_args(const void *addr, size_t len, char *vec) * * No requirements */ int sys_mincore(struct mincore_args *uap) { struct proc *p = curproc; vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (end < addr) return (EINVAL); if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; map = &p->p_vmspace->vm_map; pmap = vmspace_pmap(p->p_vmspace); lwkt_gettoken(&map->token); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for(current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if (current->maptype != VM_MAPTYPE_NORMAL && current->maptype != VM_MAPTYPE_VPAGETABLE) { continue; } if (current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. * * If we have to check the VM object, only mess * around with normal maps. Do not mess around * with virtual page tables (XXX). 
*/ mincoreinfo = pmap_mincore(pmap, addr); if (mincoreinfo == 0 && current->maptype == VM_MAPTYPE_NORMAL) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); /* * if the page is resident, then gather * information about it. spl protection is * required to maintain the object * association. And XXX what if the page is * busy? What's the deal with that? * * XXX vm_token - legacy for pmap_ts_referenced * in i386 and vkernel pmap code. */ lwkt_gettoken(&vm_token); vm_object_hold(current->object.vm_object); m = vm_page_lookup(current->object.vm_object, pindex); if (m && m->valid) { mincoreinfo = MINCORE_INCORE; if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } } vm_object_drop(current->object.vm_object); lwkt_reltoken(&vm_token); } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. 
*/ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); error = 0; done: lwkt_reltoken(&map->token); return (error); }
/*
 * Finish a fork operation, with lwp lp2 nearly set up.
 * Copy and update the pcb, set up the stack so that the child
 * ready to run and return to user mode.
 */
void
cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
{
	struct pcb *pcb2;

	if ((flags & RFPROC) == 0) {
		if ((flags & RFMEM) == 0) {
			/* unshare user LDT */
			struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
			struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;
			if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
				pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len);
				user_ldt_free(pcb1);
				pcb1->pcb_ldt = pcb_ldt;
				set_user_ldt(pcb1);
			}
		}
		return;
	}

#if NNPX > 0
	/* Ensure that lp1's pcb is up to date. */
	if (mdcpu->gd_npxthread == lp1->lwp_thread)
		npxsave(lp1->lwp_thread->td_savefpu);
#endif

	/*
	 * Copy lp1's PCB.  This really only applies to the
	 * debug registers and FP state, but it's faster to just copy the
	 * whole thing.  Because we only save the PCB at switchout time,
	 * the register state may not be current.
	 */
	pcb2 = lp2->lwp_thread->td_pcb;
	*pcb2 = *lp1->lwp_thread->td_pcb;

	/*
	 * Create a new fresh stack for the new process.
	 * Copy the trap frame for the return to user mode as if from a
	 * syscall.  This copies the user mode register values.  The
	 * 16 byte offset saves space for vm86, and must match
	 * common_tss.esp0 (kernel stack pointer on entry from user mode)
	 *
	 * pcb_esp must allocate an additional call-return pointer below
	 * the trap frame which will be restored by cpu_restore from
	 * PCB_EIP, and the thread's td_sp pointer must allocate an
	 * additional two words below the pcb_esp call-return pointer to
	 * hold the LWKT restore function pointer and eflags.
	 *
	 * The LWKT restore function pointer must be set to cpu_restore,
	 * which is our standard heavy weight process switch-in function.
	 * YYY eventually we should shortcut fork_return and fork_trampoline
	 * to use the LWKT restore function directly so we can get rid of
	 * all the extra crap we are setting up.
	 */
	lp2->lwp_md.md_regs = (struct trapframe *)((char *)pcb2 - 16) - 1;
	bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs,
	    sizeof(*lp2->lwp_md.md_regs));

	/*
	 * Set registers for trampoline to user mode.  Leave space for the
	 * return address on stack.  These are the kernel mode register values.
	 */
	pcb2->pcb_cr3 = vtophys(vmspace_pmap(lp2->lwp_proc->p_vmspace)->pm_pdir);
	pcb2->pcb_edi = 0;
	pcb2->pcb_esi = (int)fork_return;	/* fork_trampoline argument */
	pcb2->pcb_ebp = 0;
	pcb2->pcb_esp = (int)lp2->lwp_md.md_regs - sizeof(void *);
	pcb2->pcb_ebx = (int)lp2;		/* fork_trampoline argument */
	pcb2->pcb_eip = (int)fork_trampoline;
	/* LWKT frame: saved eflags (PSL_USER) + restore function pointer. */
	lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_esp - sizeof(void *));
	*(u_int32_t *)lp2->lwp_thread->td_sp = PSL_USER;
	lp2->lwp_thread->td_sp -= sizeof(void *);
	*(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;

	/*
	 * pcb2->pcb_ldt:	duplicated below, if necessary.
	 * pcb2->pcb_savefpu:	cloned above.
	 * pcb2->pcb_flags:	cloned above (always 0 here).
	 * pcb2->pcb_onfault:	cloned above (always NULL here).
	 * pcb2->pcb_onfault_sp: cloned above (don't care)
	 */

	/*
	 * XXX don't copy the i/o pages.  this should probably be fixed.
	 */
	pcb2->pcb_ext = NULL;

	/* Copy the LDT, if necessary. */
	if (pcb2->pcb_ldt != NULL) {
		if (flags & RFMEM) {
			pcb2->pcb_ldt->ldt_refcnt++;
		} else {
			pcb2->pcb_ldt = user_ldt_alloc(pcb2,
			    pcb2->pcb_ldt->ldt_len);
		}
	}
	bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
	    sizeof(lp2->lwp_thread->td_tls));
	/*
	 * Now, cpu_switch() can schedule the new lwp.
	 * pcb_esp is loaded pointing to the cpu_switch() stack frame
	 * containing the return address when exiting cpu_switch.
	 * This will normally be to fork_trampoline(), which will have
	 * %ebx loaded with the new lwp's pointer.  fork_trampoline()
	 * will set up a stack to call fork_return(lp, frame); to complete
	 * the return to user-mode.
	 */
}
static int dma_memcpy(void *dst, void *src, int len, int flags) { struct i80321_dma_softc *sc; i80321_dmadesc_t *desc; int ret; int csr; int descnb = 0; int tmplen = len; int to_nextpagesrc, to_nextpagedst; int min_hop; vm_paddr_t pa, pa2, tmppa; pmap_t pmap = vmspace_pmap(curthread->td_proc->p_vmspace); if (!softcs[0] || !softcs[1]) return (-1); mtx_lock_spin(&softcs[0]->mtx); if (softcs[0]->flags & BUSY) { mtx_unlock_spin(&softcs[0]->mtx); mtx_lock_spin(&softcs[1]->mtx); if (softcs[1]->flags & BUSY) { mtx_unlock(&softcs[1]->mtx); return (-1); } sc = softcs[1]; } else sc = softcs[0]; sc->flags |= BUSY; mtx_unlock_spin(&sc->mtx); desc = sc->dmaring[0].desc; if (flags & IS_PHYSICAL) { desc->next_desc = 0; desc->low_pciaddr = (vm_paddr_t)src; desc->high_pciaddr = 0; desc->local_addr = (vm_paddr_t)dst; desc->count = len; desc->descr_ctrl = 1 << 6; /* Local memory to local memory. */ bus_dmamap_sync(sc->dmatag, sc->dmaring[0].map, BUS_DMASYNC_PREWRITE); } else { if (!virt_addr_is_valid(dst, len, 1, !(flags & DST_IS_USER)) || !virt_addr_is_valid(src, len, 0, !(flags & SRC_IS_USER))) { mtx_lock_spin(&sc->mtx); sc->flags &= ~BUSY; mtx_unlock_spin(&sc->mtx); return (-1); } cpu_dcache_wb_range((vm_offset_t)src, len); if ((vm_offset_t)dst & (31)) cpu_dcache_wb_range((vm_offset_t)dst & ~31, 32); if (((vm_offset_t)dst + len) & 31) cpu_dcache_wb_range(((vm_offset_t)dst + len) & ~31, 32); cpu_dcache_inv_range((vm_offset_t)dst, len); while (tmplen > 0) { pa = (flags & SRC_IS_USER) ? pmap_extract(pmap, (vm_offset_t)src) : vtophys(src); pa2 = (flags & DST_IS_USER) ? pmap_extract(pmap, (vm_offset_t)dst) : vtophys(dst); to_nextpagesrc = ((vm_offset_t)src & ~PAGE_MASK) + PAGE_SIZE - (vm_offset_t)src; to_nextpagedst = ((vm_offset_t)dst & ~PAGE_MASK) + PAGE_SIZE - (vm_offset_t)dst; while (to_nextpagesrc < tmplen) { tmppa = (flags & SRC_IS_USER) ? 
pmap_extract(pmap, (vm_offset_t)src + to_nextpagesrc) : vtophys((vm_offset_t)src + to_nextpagesrc); if (tmppa != pa + to_nextpagesrc) break; to_nextpagesrc += PAGE_SIZE; } while (to_nextpagedst < tmplen) { tmppa = (flags & DST_IS_USER) ? pmap_extract(pmap, (vm_offset_t)dst + to_nextpagedst) : vtophys((vm_offset_t)dst + to_nextpagedst); if (tmppa != pa2 + to_nextpagedst) break; to_nextpagedst += PAGE_SIZE; } min_hop = to_nextpagedst > to_nextpagesrc ? to_nextpagesrc : to_nextpagedst; if (min_hop < 64) { tmplen -= min_hop; memcpy(dst, src, min_hop); cpu_dcache_wbinv_range((vm_offset_t)dst, min_hop); src = (void *)((vm_offset_t)src + min_hop); dst = (void *)((vm_offset_t)dst + min_hop); if (tmplen <= 0 && descnb > 0) { sc->dmaring[descnb - 1].desc->next_desc = 0; bus_dmamap_sync(sc->dmatag, sc->dmaring[descnb - 1].map, BUS_DMASYNC_PREWRITE); } continue; } desc->low_pciaddr = pa; desc->high_pciaddr = 0; desc->local_addr = pa2; desc->count = tmplen > min_hop ? min_hop : tmplen; desc->descr_ctrl = 1 << 6; if (min_hop < tmplen) { tmplen -= min_hop; src = (void *)((vm_offset_t)src + min_hop); dst = (void *)((vm_offset_t)dst + min_hop); } else tmplen = 0; if (descnb + 1 >= DMA_RING_SIZE) { mtx_lock_spin(&sc->mtx); sc->flags &= ~BUSY; mtx_unlock_spin(&sc->mtx); return (-1); } if (tmplen > 0) { desc->next_desc = sc->dmaring[descnb + 1]. phys_addr; bus_dmamap_sync(sc->dmatag, sc->dmaring[descnb].map, BUS_DMASYNC_PREWRITE); desc = sc->dmaring[descnb + 1].desc; descnb++; } else { desc->next_desc = 0; bus_dmamap_sync(sc->dmatag, sc->dmaring[descnb].map, BUS_DMASYNC_PREWRITE); } } } DMA_REG_WRITE(sc, 4 /* Status register */, DMA_REG_READ(sc, 4) | DMA_CLEAN_MASK); DMA_REG_WRITE(sc, 0x10 /* Descriptor addr */, sc->dmaring[0].phys_addr); DMA_REG_WRITE(sc, 0 /* Control register */, 1 | 2/* Start transfer */); while ((csr = DMA_REG_READ(sc, 0x4)) & (1 << 10)); /* Wait until it's done. 
*/ if (csr & 0x2e) /* error */ ret = -1; else ret = 0; DMA_REG_WRITE(sc, 0, 0); mtx_lock_spin(&sc->mtx); sc->flags &= ~BUSY; mtx_unlock_spin(&sc->mtx); return (ret); }