/* Pretty sick eh? */ int prom_callback(long *args) { struct console *cons, *saved_console = NULL; unsigned long flags; char *cmd; if (!args) return -1; if (!(cmd = (char *)args[0])) return -1; save_and_cli(flags); cons = console_drivers; while (cons) { unregister_console(cons); cons->flags &= ~(CON_PRINTBUFFER); cons->next = saved_console; saved_console = cons; cons = console_drivers; } register_console(&prom_console); if (!strcmp(cmd, "sync")) { prom_printf("PROM `%s' command...\n", cmd); show_free_areas(); if(current->pid != 0) { sti(); sys_sync(); cli(); } args[2] = 0; args[args[1] + 3] = -1; prom_printf("Returning to PROM\n"); } else if (!strcmp(cmd, "va>tte-data")) { unsigned long ctx, va; unsigned long tte = 0; long res = PROM_FALSE; ctx = args[3]; va = args[4]; if (ctx) { /* * Find process owning ctx, lookup mapping. */ struct task_struct *p; pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; for_each_task(p) if (p->tss.ctx == ctx) break; if (p->tss.ctx != ctx) goto done; pgdp = pgd_offset(p->mm, va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; ptep = pte_offset(pmdp, va); if (!pte_present(*ptep)) goto done; tte = pte_val(*ptep); res = PROM_TRUE; goto done; } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { /* * Locked down tlb entry 63. */ tte = spitfire_get_dtlb_data(63); res = PROM_TRUE; goto done; } if (va < PGDIR_SIZE) { /* * vmalloc or prom_inherited mapping. */ pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; ptep = pte_offset(pmdp, va); if (!pte_present(*ptep)) goto done; tte = pte_val(*ptep); res = PROM_TRUE; goto done; } if (va < PAGE_OFFSET) { /* * No mappings here. */ goto done; } if (va & (1UL << 40)) { /* * I/O page. */ tte = (__pa(va) & _PAGE_PADDR) | _PAGE_VALID | _PAGE_SZ4MB | _PAGE_E | _PAGE_P | _PAGE_W; res = PROM_TRUE; goto done; } /* * Normal page. */ tte = (__pa(va) & _PAGE_PADDR) | _PAGE_VALID | _PAGE_SZ4MB | _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W; res = PROM_TRUE; done: if (res == PROM_TRUE) { args[2] = 3; args[args[1] + 3] = 0; args[args[1] + 4] = res; args[args[1] + 5] = tte; } else { args[2] = 2; args[args[1] + 3] = 0; args[args[1] + 4] = res; } } else if (!strcmp(cmd, ".soft1")) {
/* * This routine is responsible for faulting in user pages. * It passes the work off to one of the appropriate routines. * It returns true if the fault was successfully handled. */ static int handle_page_fault(struct pt_regs *regs, int fault_num, int is_page_fault, unsigned long address, int write) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long stack_offset; int fault; int si_code; int is_kernel_mode; pgd_t *pgd; /* on TILE, protection faults are always writes */ if (!is_page_fault) write = 1; flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL); tsk = validate_current(); /* * Check to see if we might be overwriting the stack, and bail * out if so. The page fault code is a relatively likely * place to get trapped in an infinite regress, and once we * overwrite the whole stack, it becomes very hard to recover. */ stack_offset = stack_pointer & (THREAD_SIZE-1); if (stack_offset < THREAD_SIZE / 8) { pr_alert("Potential stack overrun: sp %#lx\n", stack_pointer); show_regs(regs); pr_alert("Killing current process %d/%s\n", tsk->pid, tsk->comm); do_group_exit(SIGKILL); } /* * Early on, we need to check for migrating PTE entries; * see homecache.c. If we find a migrating PTE, we wait until * the backing page claims to be done migrating, then we proceed. * For kernel PTEs, we rewrite the PTE and return and retry. * Otherwise, we treat the fault like a normal "no PTE" fault, * rather than trying to patch up the existing PTE. */ pgd = get_current_pgd(); if (handle_migrating_pte(pgd, fault_num, address, regs->pc, is_kernel_mode, write)) return 1; si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * This verifies that the fault happens in kernel space * and that the fault was not a protection fault. */ if (unlikely(address >= TASK_SIZE && !is_arch_mappable_range(address, 0))) { if (is_kernel_mode && is_page_fault && vmalloc_fault(pgd, address) >= 0) return 1; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ mm = NULL; /* happy compiler */ vma = NULL; goto bad_area_nosemaphore; } /* * If we're trying to touch user-space addresses, we must * be either at PL0, or else with interrupts enabled in the * kernel, so either way we can re-enable interrupts here * unless we are doing atomic access to user space with * interrupts disabled. */ if (!(regs->flags & PT_FLAGS_DISABLE_IRQ)) local_irq_enable(); mm = tsk->mm; /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ if (in_atomic() || !mm) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } if (!is_kernel_mode) flags |= FAULT_FLAG_USER; /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the * exceptions table. 
* * As the vast majority of faults will be valid we will only perform * the source reference check when there is a possibility of a deadlock. * Attempt to lock the address space, if we cannot we then validate the * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ if (!down_read_trylock(&mm->mmap_sem)) { if (is_kernel_mode && !search_exception_tables(regs->pc)) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (regs->sp < PAGE_OFFSET) { /* * accessing the stack below sp is always a bug. */ if (address < regs->sp) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: si_code = SEGV_ACCERR; if (fault_num == INT_ITLB_MISS) { if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else if (write) { #ifdef TEST_VERIFY_AREA if (!is_page_fault && regs->cs == KERNEL_CS) pr_err("WP fault at "REGFMT"\n", regs->eip); #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags |= FAULT_FLAG_WRITE; } else { if (!is_page_fault || !(vma->vm_flags & VM_READ)) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() /* * If this was an asynchronous fault, * restart the appropriate engine. */ switch (fault_num) { #if CHIP_HAS_TILE_DMA() case INT_DMATLB_MISS: case INT_DMATLB_MISS_DWNCL: case INT_DMATLB_ACCESS: case INT_DMATLB_ACCESS_DWNCL: __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK); break; #endif #if CHIP_HAS_SN_PROC() case INT_SNITLB_MISS: case INT_SNITLB_MISS_DWNCL: __insn_mtspr(SPR_SNCTL, __insn_mfspr(SPR_SNCTL) & ~SPR_SNCTL__FRZPROC_MASK); break; #endif } #endif up_read(&mm->mmap_sem); return 1; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (!is_kernel_mode) { /* * It's possible to have interrupts off here. */ local_irq_enable(); force_sig_info_fault("segfault", SIGSEGV, si_code, address, fault_num, tsk, regs); return 0; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return 0; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ bust_spinlocks(1); /* FIXME: no lookup_address() yet */ #ifdef SUPPORT_LOOKUP_ADDRESS if (fault_num == INT_ITLB_MISS) { pte_t *pte = lookup_address(address); if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) pr_crit("kernel tried to execute" " non-executable page - exploit attempt?" " (uid: %d)\n", current->uid); } #endif if (address < PAGE_SIZE) pr_alert("Unable to handle kernel NULL pointer dereference\n"); else pr_alert("Unable to handle kernel paging request\n"); pr_alert(" at virtual address "REGFMT", pc "REGFMT"\n", address, regs->pc); show_regs(regs); if (unlikely(tsk->pid < 2)) { panic("Kernel page fault running %s!", is_idle_task(tsk) ? 
"the idle task" : "init"); } /* * More FIXME: we should probably copy the i386 here and * implement a generic die() routine. Not today. */ #ifdef SUPPORT_DIE die("Oops", regs); #endif bust_spinlocks(1); do_group_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (is_kernel_mode) goto no_context; pagefault_out_of_memory(); return 0; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (is_kernel_mode) goto no_context; force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, fault_num, tsk, regs); return 0; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, unsigned long address) { struct vm_area_struct *vma = NULL; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; int code = SEGV_MAPERR; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; cause >>= 2; /* Restart the instruction */ regs->ea -= 4; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) { if (user_mode(regs)) goto bad_area_nosemaphore; else goto vmalloc_fault; } if (unlikely(address >= TASK_SIZE)) goto bad_area_nosemaphore; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (faulthandler_disabled() || !mm) goto bad_area_nosemaphore; if (user_mode(regs)) flags |= FAULT_FLAG_USER; if (!down_read_trylock(&mm->mmap_sem)) { if (!user_mode(regs) && !search_exception_tables(regs->ea)) goto bad_area_nosemaphore; retry: down_read(&mm->mmap_sem); } vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: code = SEGV_ACCERR; switch (cause) { case EXC_SUPERV_INSN_ACCESS: goto bad_area; case EXC_SUPERV_DATA_ACCESS: goto bad_area; case EXC_X_PROTECTION_FAULT: if (!(vma->vm_flags & VM_EXEC)) goto bad_area; break; case EXC_R_PROTECTION_FAULT: if (!(vma->vm_flags & VM_READ)) goto bad_area; break; case EXC_W_PROTECTION_FAULT: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags = FAULT_FLAG_WRITE; break; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely * likely that the page will be found in page cache at that point. */ if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; if (fault & VM_FAULT_RETRY) { /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ goto retry; } } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. 
*/ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { if (unhandled_signal(current, SIGSEGV) && printk_ratelimit()) { pr_info("%s: unhandled page fault (%d) at 0x%08lx, " "cause %ld\n", current->comm, SIGSEGV, address, cause); show_regs(regs); } _exception(SIGSEGV, regs, code, address); return; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ bust_spinlocks(1); pr_alert("Unable to handle kernel %s at virtual address %08lx", address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", address); pr_alert("ea = %08lx, ra = %08lx, cause = %ld\n", regs->ea, regs->ra, cause); panic("Oops"); return; /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; _exception(SIGBUS, regs, BUS_ADRERR, address); return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "tsk" here. We might be inside * an interrupt in the middle of a task switch.. */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = pgd_current + offset; pgd_k = init_mm.pgd + offset; if (!pgd_present(*pgd_k)) goto no_context; set_pgd(pgd, *pgd_k); pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; flush_tlb_one(address); return; } }
int VirtToPhys(void *vaddr, int *paddrp) { #if defined(__KERNEL__) unsigned long addr = (unsigned long) vaddr; if (addr < P1SEG || ((addr >= VMALLOC_START) && (addr < VMALLOC_END))) { /* * Find the virtual address of either a user page (<P1SEG) or VMALLOC (P3SEG) * * This code is based on vmalloc_to_page() in mm/memory.c */ struct mm_struct *mm; pgd_t *pgd; #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) pud_t *pud; #endif pmd_t *pmd; pte_t *ptep, pte; /* Must use the correct mm based on whether this is a kernel or a userspace address */ if (addr >= VMALLOC_START) mm = &init_mm; else mm = current->mm; /* Safety first! */ if (mm == NULL) return VTOP_INVALID_ARG; spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) pud = pud_offset(pgd, addr); if (pud_none(*pud) || pud_bad(*pud)) goto out; pmd = pmd_offset(pud, addr); #else pmd = pmd_offset(pgd, addr); #endif if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ptep = pte_offset(pmd, addr); #else ptep = pte_offset_map(pmd, addr); #endif if (!ptep) goto out; pte = *ptep; if (pte_present(pte)) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) pte_unmap(ptep); #endif spin_unlock(&mm->page_table_lock); /* pte_page() macro is broken for SH in linux 2.6.20 and later */ *paddrp = page_to_phys(pfn_to_page(pte_pfn(pte))) | (addr & (PAGE_SIZE-1)); /* INSbl28636: P3 segment pages cannot be looked up with pmb_virt_to_phys() * instead we need to examine the _PAGE_CACHABLE bit in the pte */ return ((pte_val(pte) & _PAGE_CACHABLE) ? VTOP_INCOHERENT_MEM : VTOP_SUCCESS); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) pte_unmap(ptep); #endif out: spin_unlock(&mm->page_table_lock); /* Failed to find a pte */ return VTOP_INVALID_ARG; } else #if defined(CONFIG_32BIT) { unsigned long flags; /* Try looking for an ioremap() via the PMB */ if (pmb_virt_to_phys(vaddr, (unsigned long *)paddrp, &flags) == 0) { /* Success: Test the returned PMB flags */ return ((flags & PMB_C) ? VTOP_INCOHERENT_MEM : VTOP_SUCCESS); } /* Failed to find a mapping */ return VTOP_INVALID_ARG; } #else { unsigned long addr = (unsigned long) vaddr; /* Assume 29-bit SH4 Linux */ *(paddrp)= PHYSADDR(addr); /* only the P2SEG is uncached (doubt we will see P4SEG addresses) */ return ((PXSEG(addr) == P2SEG) ? VTOP_SUCCESS: VTOP_INCOHERENT_MEM); } #endif /* CONFIG_32BIT */ #endif /* __KERNEL__ */ /* Not implemented */ return VTOP_INVALID_ARG; }
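/*
 * Minimal sketch of the same lookup on a kernel that provides the p4d
 * level, without the version #ifdefs used above. The generic pgtable
 * helpers are real; the function name, the locking assumption (caller
 * holds the mm's mmap semaphore so the tables cannot go away) and the
 * error convention are assumptions made for the example.
 */
#include <linux/mm.h>
#include <linux/pfn.h>

static int example_virt_to_phys(struct mm_struct *mm, unsigned long addr,
                                phys_addr_t *pa)
{
    pgd_t *pgd;
    p4d_t *p4d;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *ptep, pte;

    pgd = pgd_offset(mm, addr);
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return -EFAULT;

    p4d = p4d_offset(pgd, addr);
    if (p4d_none(*p4d) || p4d_bad(*p4d))
        return -EFAULT;

    pud = pud_offset(p4d, addr);
    if (pud_none(*pud) || pud_bad(*pud))
        return -EFAULT;

    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return -EFAULT;

    ptep = pte_offset_map(pmd, addr);
    if (!ptep)
        return -EFAULT;
    pte = *ptep;
    pte_unmap(ptep);

    if (!pte_present(pte))
        return -EFAULT;

    /* physical page frame plus the offset within the page */
    *pa = PFN_PHYS(pte_pfn(pte)) | (addr & ~PAGE_MASK);
    return 0;
}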
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long vector, int write_acc) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * NOTE2: This is done so that, when updating the vmalloc * mappings we don't have to walk all processes pgdirs and * add the high mappings all at once. Instead we do it as they * are used. However vmalloc'ed page entries have the PAGE_GLOBAL * bit set so sometimes the TLB can use a lingering entry. * * This verifies that the fault happens in kernel space * and that the fault was not a protection error. */ if (address >= VMALLOC_START && (vector != 0x300 && vector != 0x400) && !user_mode(regs)) goto vmalloc_fault; /* If exceptions were enabled, we can reenable them here */ if (user_mode(regs)) { /* Exception was in userspace: reenable interrupts */ local_irq_enable(); flags |= FAULT_FLAG_USER; } else { /* If exception was in a syscall, then IRQ's may have * been enabled or disabled. If they were enabled, * reenable them. */ if (regs->sr && (SPR_SR_IEE | SPR_SR_TEE)) local_irq_enable(); } mm = tsk->mm; info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_interrupt() || !mm) goto no_context; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (user_mode(regs)) { /* * accessing the stack below usp is always a bug. * we get page-aligned addresses so we can only check * if we're within a page from usp, but that might be * enough to catch brutal errors at least. */ if (address + PAGE_SIZE < regs->sp) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; /* first do some preliminary protection checks */ if (write_acc) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags |= FAULT_FLAG_WRITE; } else { /* not present */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } /* are we trying to execute nonexecutable area */ if ((vector == 0x400) && !(vma->vm_page_prot.pgprot & _PAGE_EXEC)) goto bad_area; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { /*RGD modeled on Cris */ if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. 
*/ goto retry; } } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, tsk); return; } no_context: /* Are we prepared to handle this kernel fault? * * (The kernel has valid exception-points in the source * when it acesses user-memory. When it fails in one * of those points, we find it in a table and do a jump * to some fixup code that loads an appropriate error * code) */ { const struct exception_table_entry *entry; __asm__ __volatile__("l.nop 42"); if ((entry = search_exception_tables(regs->pc)) != NULL) { /* Adjust the instruction pointer in the stackframe */ regs->pc = entry->fixup; return; } } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ if ((unsigned long)(address) < PAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel access"); printk(" at virtual address 0x%08lx\n", address); die("Oops", regs, write_acc); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: __asm__ __volatile__("l.nop 42"); __asm__ __volatile__("l.nop 1"); up_read(&mm->mmap_sem); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *)address; force_sig_info(SIGBUS, &info, tsk); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Use current_pgd instead of tsk->active_mm->pgd * since the latter might be unavailable if this * code is executed in a misfortunately run irq * (like inside schedule() between switch_mm and * switch_to...). */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; /* phx_warn("do_page_fault(): vmalloc_fault will not work, " "since current_pgd assign a proper value somewhere\n" "anyhow we don't need this at the moment\n"); phx_mmu("vmalloc_fault"); */ pgd = (pgd_t *)current_pgd + offset; pgd_k = init_mm.pgd + offset; /* Since we're two-level, we don't need to do both * set_pgd and set_pmd (they do the same thing). If * we go three-level at some point, do the right thing * with pgd_present and set_pgd here. * * Also, since the vmalloc area is global, we don't * need to copy individual PTE's, it is enough to * copy the pgd pointer into the pte page of the * root task. If that is there, we'll find our pte if * it exists. */ pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto bad_area_nosemaphore; set_pmd(pmd, *pmd_k); /* Make sure the actual PTE exists as well to * catch kernel vmalloc-area accesses to non-mapped * addresses. If we don't do this, this will just * silently loop forever. 
*/ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } }
unsigned int m4u_user_v2p(unsigned int va) { unsigned int pageOffset = (va & (PAGE_SIZE - 1)); pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; unsigned int pa; //M4UMSG("Enter m4u_user_v2p()! \n", va); if(NULL==current) { M4UMSG("warning: m4u_user_v2p, current is NULL! \n"); return 0; } if(NULL==current->mm) { M4UMSG("warning: m4u_user_v2p, current->mm is NULL! tgid=0x%x, name=%s \n", current->tgid, current->comm); return 0; } pgd = pgd_offset(current->mm, va); /* what is tsk->mm */ if(pgd_none(*pgd)||pgd_bad(*pgd)) { M4UMSG("m4u_user_v2p(), va=0x%x, pgd invalid! \n", va); return 0; } pud = pud_offset(pgd, va); if(pud_none(*pud)||pud_bad(*pud)) { M4UDBG("m4u_user_v2p(), va=0x%x, pud invalid! \n", va); return 0; } pmd = pmd_offset(pud, va); if(pmd_none(*pmd)||pmd_bad(*pmd)) { M4UDBG("m4u_user_v2p(), va=0x%x, pmd invalid! \n", va); return 0; } pte = pte_offset_map(pmd, va); if(pte_present(*pte)) { /* if((long long)pte_val(pte[PTE_HWTABLE_PTRS]) == (long long)0) { M4UMSG("user_v2p, va=0x%x, *ppte=%08llx", va, (long long)pte_val(pte[PTE_HWTABLE_PTRS])); pte_unmap(pte); return 0; } */ pa=(pte_val(*pte) & (PAGE_MASK)) | pageOffset; pte_unmap(pte); return pa; } pte_unmap(pte); M4UDBG("m4u_user_v2p(), va=0x%x, pte invalid! \n", va); // m4u_dump_maps(va); return 0; }
static void __init map_node(int node) { #define PTRTREESIZE (256*1024) #define ROOTTREESIZE (32*1024*1024) unsigned long physaddr, virtaddr, size; pgd_t *pgd_dir; pmd_t *pmd_dir; pte_t *pte_dir; size = m68k_memory[node].size; physaddr = m68k_memory[node].addr; virtaddr = (unsigned long)phys_to_virt(physaddr); physaddr |= m68k_supervisor_cachemode | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY; if (CPU_IS_040_OR_060) physaddr |= _PAGE_GLOBAL040; while (size > 0) { #ifdef DEBUG if (!(virtaddr & (PTRTREESIZE-1))) printk ("\npa=%#lx va=%#lx ", physaddr & PAGE_MASK, virtaddr); #endif pgd_dir = pgd_offset_k(virtaddr); if (virtaddr && CPU_IS_020_OR_030) { if (!(virtaddr & (ROOTTREESIZE-1)) && size >= ROOTTREESIZE) { #ifdef DEBUG printk ("[very early term]"); #endif pgd_val(*pgd_dir) = physaddr; size -= ROOTTREESIZE; virtaddr += ROOTTREESIZE; physaddr += ROOTTREESIZE; continue; } } if (!pgd_present(*pgd_dir)) { pmd_dir = kernel_ptr_table(); #ifdef DEBUG printk ("[new pointer %p]", pmd_dir); #endif pgd_set(pgd_dir, pmd_dir); } else pmd_dir = pmd_offset(pgd_dir, virtaddr); if (CPU_IS_020_OR_030) { if (virtaddr) { #ifdef DEBUG printk ("[early term]"); #endif pmd_dir->pmd[(virtaddr/PTRTREESIZE) & 15] = physaddr; physaddr += PTRTREESIZE; } else { int i; #ifdef DEBUG printk ("[zero map]"); #endif zero_pgtable = kernel_ptr_table(); pte_dir = (pte_t *)zero_pgtable; pmd_dir->pmd[0] = virt_to_phys(pte_dir) | _PAGE_TABLE | _PAGE_ACCESSED; pte_val(*pte_dir++) = 0; physaddr += PAGE_SIZE; for (i = 1; i < 64; physaddr += PAGE_SIZE, i++) pte_val(*pte_dir++) = physaddr; } size -= PTRTREESIZE; virtaddr += PTRTREESIZE; } else { if (!pmd_present(*pmd_dir)) { #ifdef DEBUG printk ("[new table]"); #endif pte_dir = kernel_page_table(); pmd_set(pmd_dir, pte_dir); } pte_dir = pte_offset_kernel(pmd_dir, virtaddr); if (virtaddr) { if (!pte_present(*pte_dir)) pte_val(*pte_dir) = physaddr; } else pte_val(*pte_dir) = 0; size -= PAGE_SIZE; virtaddr += PAGE_SIZE; physaddr += PAGE_SIZE; } } #ifdef DEBUG printk("\n"); #endif }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long address) { struct vm_area_struct * vma = NULL; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; siginfo_t info; #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", smp_processor_id(), current->comm, current->pid, field, address, write, field, regs->cp0_epc); #endif #ifdef CONFIG_CAVIUM_OCTEON_HW_FIX_UNALIGNED /* * Normally the FPU emulator uses a load word from address one to retake * control of the CPU after executing the instruction in the delay slot * of an emulated branch. The Octeon hardware unaligned access fix changes * this from an address exception into a TLB exception. This code checks * to see if this page fault was caused by an FPU emulation. * * Terminate if exception was recognized as a delay slot return */ extern int do_dsemulret(struct pt_regs *); if (do_dsemulret(regs)) return; #endif info.si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (unlikely(address >= VMALLOC_START)) goto vmalloc_fault; ltt_ev_trap_entry(CAUSE_EXCCODE(regs), CAUSE_EPC(regs)); /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_atomic() || !mm) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, write)) { case VM_FAULT_MINOR: tsk->min_flt++; break; case VM_FAULT_MAJOR: tsk->maj_flt++; break; case VM_FAULT_SIGBUS: goto do_sigbus; case VM_FAULT_OOM: goto out_of_memory; default: BUG(); } up_read(&mm->mmap_sem); ltt_ev_trap_exit(); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { tsk->thread.cp0_badvaddr = address; tsk->thread.error_code = write; #if 0 printk("do_page_fault() #2: sending SIGSEGV to %s for " "invalid %s\n%0*lx (epc == %0*lx, ra == %0*lx)\n", tsk->comm, write ? "write access to" : "read access from", field, address, field, (unsigned long) regs->cp0_epc, field, (unsigned long) regs->regs[31]); #endif info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *) address; force_sig_info(SIGSEGV, &info, tsk); ltt_ev_trap_exit(); return; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { current->thread.cp0_baduaddr = address; return; } /* * Oops. The kernel tried to access some bad page. 
We'll have to * terminate things with extreme prejudice. */ bust_spinlocks(1); printk(KERN_ALERT "CPU %d Unable to handle kernel paging request at " "virtual address %0*lx, epc == %0*lx, ra == %0*lx\n", smp_processor_id(), field, address, field, regs->cp0_epc, field, regs->regs[31]); die("Oops", regs); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (tsk->pid == 1) { yield(); down_read(&mm->mmap_sem); goto survive; } printk("VM: killing process %s\n", tsk->comm); if (user_mode(regs)) do_exit(SIGKILL); goto no_context; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.cp0_badvaddr = address; info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *) address; force_sig_info(SIGBUS, &info, tsk); ltt_ev_trap_exit(); return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "tsk" here. We might be inside * an interrupt in the middle of a task switch.. */ int offset = __pgd_offset(address); pgd_t *pgd, *pgd_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = (pgd_t *) pgd_current[smp_processor_id()] + offset; pgd_k = init_mm.pgd + offset; if (!pgd_present(*pgd_k)) goto no_context; set_pgd(pgd, *pgd_k); pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } ltt_ev_trap_exit(); }
static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; /* * KSM's break_ksm() relies upon recognizing a ksm page * even while it is being migrated, so for that case we * need migration_entry_wait(). */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; if (pte_none(pte) || pte_file(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); goto retry; } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); return NULL; } page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { if ((flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(pte))) goto bad_page; page = pte_page(pte); } if (flags & FOLL_GET) get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */ if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* * Because we lock page here, and migration is * blocked by the pte's page reference, and we * know the page is still mapped, we don't even * need to check for file-cache page truncation. */ mlock_vma_page(page); unlock_page(page); } } pte_unmap_unlock(ptep, ptl); return page; bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT); no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); }
static int pin_page_for_write(const void __user *_addr, pte_t **ptep,
                              spinlock_t **ptlp)
{
    unsigned long addr = (unsigned long)_addr;
    pgd_t *pgd;
    pmd_t *pmd;
    pte_t *pte;
    pud_t *pud;
    spinlock_t *ptl;

    pgd = pgd_offset(current->mm, addr);
    if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
        return 0;

    pud = pud_offset(pgd, addr);
    if (unlikely(pud_none(*pud) || pud_bad(*pud)))
        return 0;

    pmd = pmd_offset(pud, addr);
    if (unlikely(pmd_none(*pmd)))
        return 0;

    /*
     * A pmd can be bad if it refers to a HugeTLB or THP page.
     *
     * Both THP and HugeTLB pages have the same pmd layout
     * and should not be manipulated by the pte functions.
     *
     * Lock the page table for the destination and check
     * to see that it's still huge and whether or not we will
     * need to fault on write.
     */
    if (unlikely(pmd_thp_or_huge(*pmd))) {
        ptl = &current->mm->page_table_lock;
        spin_lock(ptl);
        if (unlikely(!pmd_thp_or_huge(*pmd) || pmd_hugewillfault(*pmd))) {
            spin_unlock(ptl);
            return 0;
        }

        *ptep = NULL;
        *ptlp = ptl;
        return 1;
    }

    if (unlikely(pmd_bad(*pmd)))
        return 0;

    pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
    if (unlikely(!pte_present(*pte) || !pte_young(*pte) ||
                 !pte_write(*pte) || !pte_dirty(*pte))) {
        pte_unmap_unlock(pte, ptl);
        return 0;
    }

    *ptep = pte;
    *ptlp = ptl;

    return 1;
}
/* * The swap-out functions return 1 if they successfully * threw something out, and we got a free page. It returns * zero if it couldn't do anything, and any other value * indicates it decreased rss, but the page was shared. * * NOTE! If it sleeps, it *must* return 1 to make sure we * don't continue with the swap-out. Otherwise we may be * using a process that no longer actually exists (it might * have died while we slept). */ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; swp_entry_t entry; struct page * page; int onlist; pte = *page_table; if (!pte_present(pte)) goto out_failed; page = pte_page(pte); if ((!VALID_PAGE(page)) || PageReserved(page)) goto out_failed; if (mm->swap_cnt) mm->swap_cnt--; onlist = PageActive(page); /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { age_page_up(page); goto out_failed; } if (!onlist) /* The page is still mapped, so it can't be freeable... */ age_page_down_ageonly(page); /* * If the page is in active use by us, or if the page * is in active use by others, don't unmap it or * (worse) start unneeded IO. */ if (page->age > 0) goto out_failed; if (TryLockPage(page)) goto out_failed; /* From this point on, the odds are that we're going to * nuke this pte, so read and clear the pte. This hook * is needed on CPUs which update the accessed and dirty * bits in hardware. */ pte = ptep_get_and_clear(page_table); /* * Is the page already in the swap cache? If so, then * we can just drop our reference to it without doing * any IO - it's already up-to-date on disk. * * Return 0, as we didn't actually free any real * memory, and we should just continue our scan. */ if (PageSwapCache(page)) { entry.val = page->index; if (pte_dirty(pte)) set_page_dirty(page); set_swap_pte: swap_duplicate(entry); set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: UnlockPage(page); mm->rss--; flush_tlb_page(vma, address); deactivate_page(page); page_cache_release(page); out_failed: return 0; } /* * Is it a clean page? Then it must be recoverable * by just paging it in again, and we can just drop * it.. * * However, this won't actually free any real * memory, as the page will just be in the page cache * somewhere, and as such we should just continue * our scan. * * Basically, this just makes it possible for us to do * some real work in the future in "refill_inactive()". */ flush_cache_page(vma, address); if (!pte_dirty(pte)) goto drop_pte; /* * Ok, it's really dirty. That means that * we should either create a new swap cache * entry for it, or we should write it back * to its own backing store. */ if (page->mapping) { set_page_dirty(page); goto drop_pte; } /* * This is a dirty, swappable page. First of all, * get a suitable swap entry for it, and make sure * we have the swap cache set up to associate the * page with that swap entry. */ entry = get_swap_page(); if (!entry.val) goto out_unlock_restore; /* No swap space left */ /* Add it to the swap cache and mark it dirty */ add_to_swap_cache(page, entry); set_page_dirty(page); goto set_swap_pte; out_unlock_restore: set_pte(page_table, pte); UnlockPage(page); return 0; }
void do_page_fault(struct pt_regs *regs)
{
    struct vm_area_struct *vma;
    struct mm_struct *mm = current->mm;
    unsigned int exccause = regs->exccause;
    unsigned int address = regs->excvaddr;
    siginfo_t info;
    int is_write, is_exec;
    int fault;

    info.si_code = SEGV_MAPERR;

    /* Kernel-mode faults in kernel virtual space go to the sync path. */
    if (address >= TASK_SIZE && !user_mode(regs))
        goto vmalloc_fault;

    /* If we're in an atomic context or have no user context,
     * we must not take the fault.
     */
    if (in_atomic() || !mm) {
        bad_page_fault(regs, address, SIGSEGV);
        return;
    }

    is_write = (exccause == EXCCAUSE_STORE_CACHE_ATTRIBUTE) ? 1 : 0;
    is_exec = (exccause == EXCCAUSE_ITLB_PRIVILEGE ||
               exccause == EXCCAUSE_ITLB_MISS ||
               exccause == EXCCAUSE_FETCH_CACHE_ATTRIBUTE) ? 1 : 0;

#ifdef DEBUG_PAGE_FAULT
    printk("[%s:%d:%08x:%d:%08x:%s%s]\n", current->comm, current->pid,
           address, exccause, regs->pc,
           is_write ? "w" : "", is_exec ? "x" : "");
#endif

    down_read(&mm->mmap_sem);
    vma = find_vma(mm, address);

    if (!vma)
        goto bad_area;
    if (vma->vm_start <= address)
        goto good_area;
    if (!(vma->vm_flags & VM_GROWSDOWN))
        goto bad_area;
    if (expand_stack(vma, address))
        goto bad_area;

good_area:
    info.si_code = SEGV_ACCERR;

    if (is_write) {
        if (!(vma->vm_flags & VM_WRITE))
            goto bad_area;
    } else if (is_exec) {
        if (!(vma->vm_flags & VM_EXEC))
            goto bad_area;
    } else if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
        goto bad_area;

    fault = handle_mm_fault(mm, vma, address,
                            is_write ? FAULT_FLAG_WRITE : 0);
    if (unlikely(fault & VM_FAULT_ERROR)) {
        if (fault & VM_FAULT_OOM)
            goto out_of_memory;
        else if (fault & VM_FAULT_SIGBUS)
            goto do_sigbus;
        BUG();
    }
    if (fault & VM_FAULT_MAJOR)
        current->maj_flt++;
    else
        current->min_flt++;

    up_read(&mm->mmap_sem);
    return;

bad_area:
    up_read(&mm->mmap_sem);
    if (user_mode(regs)) {
        current->thread.bad_vaddr = address;
        current->thread.error_code = is_write;
        info.si_signo = SIGSEGV;
        info.si_errno = 0;
        /* info.si_code has been set above */
        info.si_addr = (void *) address;
        force_sig_info(SIGSEGV, &info, current);
        return;
    }
    bad_page_fault(regs, address, SIGSEGV);
    return;

out_of_memory:
    up_read(&mm->mmap_sem);
    if (!user_mode(regs))
        bad_page_fault(regs, address, SIGKILL);
    else
        pagefault_out_of_memory();
    return;

do_sigbus:
    up_read(&mm->mmap_sem);

    current->thread.bad_vaddr = address;
    info.si_signo = SIGBUS;    /* set the signal number, not si_code twice */
    info.si_errno = 0;
    info.si_code = BUS_ADRERR;
    info.si_addr = (void *) address;
    force_sig_info(SIGBUS, &info, current);

    if (!user_mode(regs))
        bad_page_fault(regs, address, SIGBUS);
    return;    /* do not fall through into the vmalloc sync path */

vmalloc_fault:
    {
        struct mm_struct *act_mm = current->active_mm;
        int index = pgd_index(address);
        pgd_t *pgd, *pgd_k;
        pmd_t *pmd, *pmd_k;
        pte_t *pte_k;

        if (act_mm == NULL)
            goto bad_page_fault;

        pgd = act_mm->pgd + index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
            goto bad_page_fault;

        pgd_val(*pgd) = pgd_val(*pgd_k);

        pmd = pmd_offset(pgd, address);
        pmd_k = pmd_offset(pgd_k, address);
        if (!pmd_present(*pmd) || !pmd_present(*pmd_k))
            goto bad_page_fault;

        pmd_val(*pmd) = pmd_val(*pmd_k);
        pte_k = pte_offset_kernel(pmd_k, address);

        if (!pte_present(*pte_k))
            goto bad_page_fault;
        return;
    }

bad_page_fault:
    bad_page_fault(regs, address, SIGKILL);
    return;
}
/* Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; int err = -EFAULT; *code_out = SEGV_MAPERR; /* If the fault was during atomic operation, don't take the fault, just * fail. */ if (in_atomic()) goto out_nosemaphore; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if(!vma) goto out; else if(vma->vm_start <= address) goto good_area; else if(!(vma->vm_flags & VM_GROWSDOWN)) goto out; else if(is_user && !ARCH_IS_STACKGROW(address)) goto out; else if(expand_stack(vma, address)) goto out; good_area: *code_out = SEGV_ACCERR; if(is_write && !(vma->vm_flags & VM_WRITE)) goto out; /* Don't require VM_READ|VM_EXEC for write faults! */ if(!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) goto out; do { survive: switch (handle_mm_fault(mm, vma, address, is_write)){ case VM_FAULT_MINOR: current->min_flt++; break; case VM_FAULT_MAJOR: current->maj_flt++; break; case VM_FAULT_SIGBUS: err = -EACCES; goto out; case VM_FAULT_OOM: err = -ENOMEM; goto out_of_memory; default: BUG(); } pgd = pgd_offset(mm, address); pud = pud_offset(pgd, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); } while(!pte_present(*pte)); err = 0; /* The below warning was added in place of * pte_mkyoung(); if (is_write) pte_mkdirty(); * If it's triggered, we'd see normally a hang here (a clean pte is * marked read-only to emulate the dirty bit). * However, the generic code can mark a PTE writable but clean on a * concurrent read fault, triggering this harmlessly. So comment it out. */ #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif flush_tlb_page(vma, address); out: up_read(&mm->mmap_sem); out_nosemaphore: return(err); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: if (is_init(current)) { up_read(&mm->mmap_sem); yield(); down_read(&mm->mmap_sem); goto survive; } goto out; }
/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. * * 08Jan98 Merged into one routine from several inline routines to reduce * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, * but may be dropped within pmd_alloc() and pte_alloc(). */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; for (;;) { pmd_t * src_pmd, * dst_pmd; src_pgd++; dst_pgd++; /* copy_pmd_range */ if (pgd_none(*src_pgd)) goto skip_copy_pmd_range; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; if (!address || (address >= end)) goto out; continue; } src_pmd = pmd_offset(src_pgd, address); dst_pmd = pmd_alloc(dst, dst_pgd, address); if (!dst_pmd) goto nomem; do { pte_t * src_pte, * dst_pte; /* copy_pte_range */ if (pmd_none(*src_pmd)) goto skip_copy_pte_range; if (pmd_bad(*src_pmd)) { pmd_ERROR(*src_pmd); pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; if (address >= end) goto out; goto cont_copy_pmd_range; } src_pte = pte_offset(src_pmd, address); dst_pte = pte_alloc(dst, dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); do { pte_t pte = *src_pte; struct page *ptepage; /* copy_one_pte */ if (pte_none(pte)) goto cont_copy_pte_range_noset; if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); goto cont_copy_pte_range; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage)) goto cont_copy_pte_range; /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow) { ptep_set_wrprotect(src_pte); pte = *src_pte; } /* If it's a shared mapping, mark it clean in the child */ if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(ptepage); dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); spin_unlock(&src->page_table_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } out_unlock: spin_unlock(&src->page_table_lock); out: return 0; nomem: return -ENOMEM; }
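/*
 * Side note to the 2.4-era copy_page_range() above: the "cow" value it
 * computes is simply "mapping is private and writable". A hedged sketch
 * of the same predicate as a helper (the name is an assumption; later
 * kernels carry a similar is_cow_mapping() based on VM_MAYWRITE):
 */
static inline int example_is_cow_mapping(unsigned long vm_flags)
{
    /* shared mappings are never COW; private, writable mappings are */
    return (vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
}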
asmlinkage void do_page_fault(unsigned long address, struct pt_regs *regs, int protection, int writeaccess) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; siginfo_t info; D(printk("Page fault for %X at %X, prot %d write %d\n", address, regs->erp, protection, writeaccess)); tsk = current; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * NOTE2: This is done so that, when updating the vmalloc * mappings we don't have to walk all processes pgdirs and * add the high mappings all at once. Instead we do it as they * are used. However vmalloc'ed page entries have the PAGE_GLOBAL * bit set so sometimes the TLB can use a lingering entry. * * This verifies that the fault happens in kernel space * and that the fault was not a protection error (error_code & 1). */ if (address >= VMALLOC_START && !protection && !user_mode(regs)) goto vmalloc_fault; /* we can and should enable interrupts at this point */ sti(); mm = tsk->mm; info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_interrupt() || !mm) goto no_context; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (user_mode(regs)) { /* * accessing the stack below usp is always a bug. * we get page-aligned addresses so we can only check * if we're within a page from usp, but that might be * enough to catch brutal errors at least. */ if (address + PAGE_SIZE < rdusp()) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; /* first do some preliminary protection checks */ if (writeaccess) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, writeaccess)) { case 1: tsk->min_flt++; break; case 2: tsk->maj_flt++; break; case 0: goto do_sigbus; default: goto out_of_memory; } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: DPG(show_registers(regs)); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, tsk); return; } no_context: /* Are we prepared to handle this kernel fault? * * (The kernel has valid exception-points in the source * when it acesses user-memory. When it fails in one * of those points, we find it in a table and do a jump * to some fixup code that loads an appropriate error * code) */ if (find_fixup_code(regs)) return; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ if ((unsigned long) (address) < PAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel access"); printk(" at virtual address %08lx\n",address); die_if_kernel("Oops", regs, (writeaccess << 1) | protection); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); printk("VM: killing process %s\n", tsk->comm); if (user_mode(regs)) do_exit(SIGKILL); goto no_context; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *)address; force_sig_info(SIGBUS, &info, tsk); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Use current_pgd instead of tsk->active_mm->pgd * since the latter might be unavailable if this * code is executed in a misfortunately run irq * (like inside schedule() between switch_mm and * switch_to...). */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = (pgd_t *)current_pgd + offset; pgd_k = init_mm.pgd + offset; /* Since we're two-level, we don't need to do both * set_pgd and set_pmd (they do the same thing). If * we go three-level at some point, do the right thing * with pgd_present and set_pgd here. * * Also, since the vmalloc area is global, we don't * need to copy individual PTE's, it is enough to * copy the pgd pointer into the pte page of the * root task. If that is there, we'll find our pte if * it exists. */ pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); if (!pmd_present(*pmd_k)) goto bad_area_nosemaphore; set_pmd(pmd, *pmd_k); /* Make sure the actual PTE exists as well to * catch kernel vmalloc-area accesses to non-mapped * addresses. If we don't do this, this will just * silently loop forever. */ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } }
/* * Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by * segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; int err = -EFAULT; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; *code_out = SEGV_MAPERR; /* * If the fault was with pagefaults disabled, don't take the fault, just * fail. */ if (faulthandler_disabled()) goto out_nosemaphore; if (is_user) flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto out; else if (vma->vm_start <= address) goto good_area; else if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; else if (is_user && !ARCH_IS_STACKGROW(address)) goto out; else if (expand_stack(vma, address)) goto out; good_area: *code_out = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto out; flags |= FAULT_FLAG_WRITE; } else { /* Don't require VM_READ|VM_EXEC for write faults! */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto out; } do { int fault; fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; } else if (fault & VM_FAULT_SIGSEGV) { goto out; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; goto out; } BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } } pgd = pgd_offset(mm, address); pud = pud_offset(pgd, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); } while (!pte_present(*pte)); err = 0; /* * The below warning was added in place of * pte_mkyoung(); if (is_write) pte_mkdirty(); * If it's triggered, we'd see normally a hang here (a clean pte is * marked read-only to emulate the dirty bit). * However, the generic code can mark a PTE writable but clean on a * concurrent read fault, triggering this harmlessly. So comment it out. */ #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif flush_tlb_page(vma, address); out: up_read(&mm->mmap_sem); out_nosemaphore: return err; out_of_memory: /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ up_read(&mm->mmap_sem); if (!is_user) goto out_nosemaphore; pagefault_out_of_memory(); return 0; }
unsigned int m4u_user_v2p(unsigned int va)
{
    unsigned int pmdOffset = (va & (PMD_SIZE - 1));
    unsigned int pageOffset = (va & (PAGE_SIZE - 1));
    pgd_t *pgd;
    pmd_t *pmd;
    pte_t *pte;
    unsigned int pa;

    if (NULL == current) {
        M4UMSG("error: m4u_user_v2p, current is NULL!\n");
        return 0;
    }
    if (NULL == current->mm) {
        M4UMSG("error: m4u_user_v2p, current->mm is NULL! tgid=0x%x, name=%s\n",
               current->tgid, current->comm);
        return 0;
    }

    pgd = pgd_offset(current->mm, va); /* what is tsk->mm */
    M4UDBG("m4u_user_v2p(), pgd 0x%x\n", pgd);
    M4UDBG("pgd_none=%d, pgd_bad=%d\n", pgd_none(*pgd), pgd_bad(*pgd));
    if (pgd_none(*pgd) || pgd_bad(*pgd)) {
        M4UMSG("warning: m4u_user_v2p(), va=0x%x, pgd invalid!\n", va);
        return 0;
    }

    pmd = pmd_offset(pgd, va);
    M4UDBG("m4u_user_v2p(), pmd 0x%x\n", pmd);
    M4UDBG("pmd_none=%d, pmd_bad=%d, pmd_val=0x%x\n",
           pmd_none(*pmd), pmd_bad(*pmd), pmd_val(*pmd));

    /* If this is a page table entry, keep on walking to the next level */
    if (((unsigned int)pmd_val(*pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) {
        if (pmd_none(*pmd) || pmd_bad(*pmd)) {
            M4UDBG("warning: m4u_user_v2p(), va=0x%x, pmd invalid!\n", va);
            return 0;
        }

        /* we encounter some pte not present issue, do not know why */
        pte = pte_offset_map(pmd, va);
        if (pte_present(*pte)) {
            pa = (pte_val(*pte) & (PAGE_MASK)) | pageOffset;
            pte_unmap(pte);
            M4UDBG("PA = 0x%8x\n", pa);
            return pa;
        }
        pte_unmap(pte);
    } else { /* Only 1 level page table: a section mapping */
        if (pmd_none(*pmd)) {
            M4UDBG("Error: m4u_user_v2p(), virtual addr 0x%x, pmd invalid!\n", va);
            return 0;
        }
        pa = (pmd_val(*pmd) & (PMD_MASK)) | pmdOffset;
        M4UDBG("PA = 0x%8x\n", pa);
        return pa;
    }

    M4UDBG("warning: m4u_user_v2p(), pte invalid!\n");
    /* m4u_dump_maps(va); */
    return 0;
}
static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; int err = 0; int signal; unsigned long address = 0; #ifdef CONFIG_MMU pmd_t *pmdp; pte_t *ptep; #endif frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; signal = current_thread_info()->exec_domain && current_thread_info()->exec_domain->signal_invmap && sig < 32 ? current_thread_info()->exec_domain->signal_invmap[sig] : sig; if (info) err |= copy_siginfo_to_user(&frame->info, info); /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user((void *)current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->r1), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* minus 8 is offset to cater for "rtsd r15,8" */ if (ka->sa.sa_flags & SA_RESTORER) { regs->r15 = ((unsigned long)ka->sa.sa_restorer)-8; } else { /* addi r12, r0, __NR_sigreturn */ err |= __put_user(0x31800000 | __NR_rt_sigreturn , frame->tramp + 0); /* brki r14, 0x8 */ err |= __put_user(0xb9cc0008, frame->tramp + 1); /* Return from sighandler will jump to the tramp. Negative 8 offset because return is rtsd r15, 8 */ regs->r15 = ((unsigned long)frame->tramp)-8; address = ((unsigned long)frame->tramp); #ifdef CONFIG_MMU pmdp = pmd_offset(pud_offset( pgd_offset(current->mm, address), address), address); preempt_disable(); ptep = pte_offset_map(pmdp, address); if (pte_present(*ptep)) { address = (unsigned long) page_address(pte_page(*ptep)); /* MS: I need add offset in page */ address += ((unsigned long)frame->tramp) & ~PAGE_MASK; /* MS address is virtual */ address = virt_to_phys(address); invalidate_icache_range(address, address + 8); flush_dcache_range(address, address + 8); } pte_unmap(ptep); preempt_enable(); #else invalidate_icache_range(address, address + 8); flush_dcache_range(address, address + 8); #endif } if (err) goto give_sigsegv; /* Set up registers for signal handler */ regs->r1 = (unsigned long) frame - STATE_SAVE_ARG_SPACE; /* Signal handler args: */ regs->r5 = signal; /* arg 0: signum */ regs->r6 = (unsigned long) &frame->info; /* arg 1: siginfo */ regs->r7 = (unsigned long) &frame->uc; /* arg2: ucontext */ /* Offset to handle microblaze rtid r14, 0 */ regs->pc = (unsigned long)ka->sa.sa_handler; set_fs(USER_DS); /* the tracer may want to single-step inside the handler */ if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk(KERN_INFO "SIG deliver (%s:%d): sp=%p pc=%08lx\n", current->comm, current->pid, frame, regs->pc); #endif return; give_sigsegv: if (sig == SIGSEGV) ka->sa.sa_handler = SIG_DFL; force_sig(SIGSEGV, current); }
static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; /* * KSM's break_ksm() relies upon recognizing a ksm page * even while it is being migrated, so for that case we * need migration_entry_wait(). */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); goto retry; } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { /* * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. */ pgmap = get_dev_pagemap(pte_pfn(pte), NULL); if (pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { int ret; ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_SPLIT && PageTransCompound(page)) { int ret; get_page(page); pte_unmap_unlock(ptep, ptl); lock_page(page); ret = split_huge_page(page); unlock_page(page); put_page(page); if (ret) return ERR_PTR(ret); goto retry; } if (flags & FOLL_GET) { get_page(page); /* drop the pgmap reference now that we hold the page */ if (pgmap) { put_dev_pagemap(pgmap); pgmap = NULL; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Do not mlock pte-mapped THP */ if (PageTransCompound(page)) goto out; /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */ if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* * Because we lock page here, and migration is * blocked by the pte's page reference, and we * know the page is still mapped, we don't even * need to check for file-cache page truncation. */ mlock_vma_page(page); unlock_page(page); } } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); }
static void fix_range(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force) { pgd_t *npgd; pmd_t *npmd; pte_t *npte; unsigned long addr; int r, w, x, err; if((current->thread.mode.tt.extern_pid != -1) && (current->thread.mode.tt.extern_pid != os_getpid())) panic("fix_range fixing wrong address space, current = 0x%p", current); if(mm == NULL) return; for(addr=start_addr;addr<end_addr;){ if(addr == TASK_SIZE){ /* Skip over kernel text, kernel data, and physical * memory, which don't have ptes, plus kernel virtual * memory, which is flushed separately, and remap * the process stack. The only way to get here is * if (end_addr == STACK_TOP) > TASK_SIZE, which is * only true in the honeypot case. */ addr = STACK_TOP - ABOVE_KMEM; continue; } npgd = pgd_offset(mm, addr); npmd = pmd_offset(npgd, addr); if(pmd_present(*npmd)){ npte = pte_offset_kernel(npmd, addr); r = pte_read(*npte); w = pte_write(*npte); x = pte_exec(*npte); if(!pte_dirty(*npte)) w = 0; if(!pte_young(*npte)){ r = 0; w = 0; } if(force || pte_newpage(*npte)){ err = os_unmap_memory((void *) addr, PAGE_SIZE); if(err < 0) panic("munmap failed, errno = %d\n", -err); if(pte_present(*npte)) map_memory(addr, pte_val(*npte) & PAGE_MASK, PAGE_SIZE, r, w, x); } else if(pte_newprot(*npte)){ protect_memory(addr, PAGE_SIZE, r, w, x, 1); } *npte = pte_mkuptodate(*npte); addr += PAGE_SIZE; } else { if(force || pmd_newpage(*npmd)){ err = os_unmap_memory((void *) addr, PMD_SIZE); if(err < 0) panic("munmap failed, errno = %d\n", -err); pmd_mkuptodate(*npmd); } addr += PMD_SIZE; } } }
/* Pretty sick eh? */ int prom_callback(long *args) { struct console *cons, *saved_console = NULL; unsigned long flags; char *cmd; extern spinlock_t prom_entry_lock; if (!args) return -1; if (!(cmd = (char *)args[0])) return -1; /* * The callback can be invoked on the cpu that first dropped * into prom_cmdline after taking the serial interrupt, or on * a slave processor that was smp_captured() if the * administrator has done a switch-cpu inside obp. In either * case, the cpu is marked as in-interrupt. Drop IRQ locks. */ irq_exit(smp_processor_id(), 0); save_and_cli(flags); spin_unlock(&prom_entry_lock); cons = console_drivers; while (cons) { unregister_console(cons); cons->flags &= ~(CON_PRINTBUFFER); cons->next = saved_console; saved_console = cons; cons = console_drivers; } register_console(&prom_console); if (!strcmp(cmd, "sync")) { prom_printf("PROM `%s' command...\n", cmd); show_free_areas(); if(current->pid != 0) { sti(); sys_sync(); cli(); } args[2] = 0; args[args[1] + 3] = -1; prom_printf("Returning to PROM\n"); } else if (!strcmp(cmd, "va>tte-data")) { unsigned long ctx, va; unsigned long tte = 0; long res = PROM_FALSE; ctx = args[3]; va = args[4]; if (ctx) { /* * Find process owning ctx, lookup mapping. */ struct task_struct *p; struct mm_struct *mm = NULL; pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; for_each_task(p) { mm = p->mm; if (CTX_HWBITS(mm->context) == ctx) break; } if (!mm || CTX_HWBITS(mm->context) != ctx) goto done; pgdp = pgd_offset(mm, va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; ptep = pte_offset(pmdp, va); if (!pte_present(*ptep)) goto done; tte = pte_val(*ptep); res = PROM_TRUE; goto done; } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { /* Spitfire Errata #32 workaround */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); /* * Locked down tlb entry. */ if (tlb_type == spitfire) tte = spitfire_get_dtlb_data(SPITFIRE_HIGHEST_LOCKED_TLBENT); else if (tlb_type == cheetah) tte = cheetah_get_ldtlb_data(CHEETAH_HIGHEST_LOCKED_TLBENT); res = PROM_TRUE; goto done; }
/* Pretty sick eh? */ int prom_callback(long *args) { struct console *cons, *saved_console = NULL; unsigned long flags; char *cmd; if (!args) return -1; if (!(cmd = (char *)args[0])) return -1; save_and_cli(flags); cons = console_drivers; while (cons) { unregister_console(cons); cons->flags &= ~(CON_PRINTBUFFER); cons->next = saved_console; saved_console = cons; cons = console_drivers; } register_console(&prom_console); if (!strcmp(cmd, "sync")) { prom_printf("PROM `%s' command...\n", cmd); show_free_areas(); if(current->pid != 0) { sti(); sys_sync(); cli(); } args[2] = 0; args[args[1] + 3] = -1; prom_printf("Returning to PROM\n"); } else if (!strcmp(cmd, "va>tte-data")) { unsigned long ctx, va; unsigned long tte = 0; long res = PROM_FALSE; ctx = args[3]; va = args[4]; if (ctx) { /* * Find process owning ctx, lookup mapping. */ struct task_struct *p; pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; for_each_task(p) if (p->tss.ctx == ctx) break; if (p->tss.ctx != ctx) goto done; pgdp = pgd_offset(p->mm, va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; ptep = pte_offset(pmdp, va); if (!pte_present(*ptep)) goto done; tte = pte_val(*ptep); res = PROM_TRUE; goto done; } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { /* Spitfire Errata #32 workaround */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); /* * Locked down tlb entry 63. */ tte = spitfire_get_dtlb_data(63); res = PROM_TRUE; goto done; }
/*pgtable sequential scan and count for __access_bits.*/ static int scan_pgtable(void) { pgd_t *pgd = NULL; pud_t *pud = NULL; pmd_t *pmd = NULL; pte_t *ptep, pte; spinlock_t *ptl; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start = 0; /*the start of address.*/ unsigned long end = 0; /*the end of address.*/ unsigned long address = 0; /* the address of vma.*/ int number_hotpages = 0; /* the number of hot pages */ int number_vpages = 0; int cycle_index = 0; /* the loop counter, which denotes ITERATIONS. */ /* the array that records the number of hot page in every cycle */ int hot_page[ITERATIONS]; int number_current_pg = 0; int pg_count = 0; int j = 0; int times = 0; /* records reuse time*/ /* some variables that describe page "heat" */ int hig = 0; int mid = 0; int low = 0; int llow = 0; int lllow = 0; int llllow = 0; int all_pages = 0;/* the total number of pages */ /*the average number of hot pages in each iteration.*/ long avg_hotpage=0; /*the total number of memory accesses across all pages*/ long num_access=0; /* avg utilization of each page */ int avg_page_utilization = 0; /*get the handle of current running benchmark.*/ struct task_struct *bench_process = get_current_process(); if(bench_process == NULL) { printk("sysmon: get no process handle in scan_pgtable function...exit&trying again...\n"); return 0; } else /* get the process*/ mm = bench_process->mm; if(mm == NULL) { printk("sysmon: error mm is NULL, return back & trying...\n"); return 0; } for(j = 0; j < PAGE_ALL; j++) page_heat[j] = -1; for(j = 0; j < ITERATIONS; j++) { hot_page[j] = 0; reuse_time[j] = 0; dirty_page[j] = 0; } /*yanghao*/ times = 0; for(cycle_index = 0; cycle_index < ITERATIONS; cycle_index++) { number_hotpages = 0; /*scan each vma*/ for(vma = mm->mmap; vma; vma = vma->vm_next) { start = vma->vm_start; end = vma->vm_end; mm = vma->vm_mm; /*in each vma, we check all pages*/ for(address = start; address < end; address += PAGE_SIZE) { /*scan page table for each page in this VMA*/ pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) continue; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) continue; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) continue; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if(pte_present(pte)) { if(pte_young(pte)) /*hot page*/ { /*re-set and clear _access_bits to 0*/ pte = pte_mkold(pte); set_pte_at(mm, address, ptep, pte); /*yanghao:re-set and clear _dirty_bits to 0*/ pte = pte_mkclean(pte); set_pte_at(mm, address, ptep, pte); } } else /*no page pte_none*/ { pte_unmap_unlock(ptep, ptl); continue; } pte_unmap_unlock(ptep, ptl); page_counts++; } } /*count the number of hot pages*/ if(bench_process == NULL) { printk("sysmon: get no process handle in scan_pgtable function...exit&trying again...\n"); return 0; } else /*get the process*/ mm = bench_process->mm; if(mm == NULL) { printk("sysmon: error mm is NULL, return back & trying...\n"); return 0; } number_vpages = 0; sampling_interval = page_counts / 250; /*yanghao:*/ page_counts = 0; for(vma = mm->mmap; vma; vma = vma->vm_next) { start = vma->vm_start; end = vma->vm_end; /*scan each page in this VMA*/ mm = vma->vm_mm; pg_count = 0; for(address = start; address < end; address += PAGE_SIZE) { /*scan page table for each page in this VMA*/ pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) continue; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) continue; pmd 
= pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) continue; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if(pte_present(pte)) { if(pte_young(pte)) /* hot pages*/ { number_current_pg = pg_count + number_vpages; page_heat[number_current_pg]++; hot_page[cycle_index]++; /*yanghao:*/ if (page_counts == random_page) { times++; if (pte_dirty(pte)) dirty_page[cycle_index] = 1; } } else { if (page_counts == random_page) reuse_time[times]++; } } pg_count++; pte_unmap_unlock(ptep, ptl); page_counts++; } number_vpages += (int)(end - start)/PAGE_SIZE; } } /*yanghao: calculate the next random_page number*/ random_page += sampling_interval; if(random_page >= page_counts) random_page=page_counts / 300; /*****************************OUTPUT************************************/ for(j = 0; j < PAGE_ALL; j++) { if(page_heat[j] < VH && page_heat[j] > H) hig++; if(page_heat[j] > M && page_heat[j] <= H) mid++; if(page_heat[j] <= M && page_heat[j] > L) low++; if(page_heat[j] > VL_MAX && page_heat[j] <= L) llow++; if(page_heat[j] > VL_MIN && page_heat[j] <= VL_MAX) lllow++; if(page_heat[j] >= 0 && page_heat[j] <= VL_MIN) llllow++; if(page_heat[j] > -1) all_pages++; } /*the values reflect the access frequency of each physical page.*/ printk("[LOG: after sampling (%d loops) ...] ",ITERATIONS); printk("the values denote the access frequency of each physical page.\n"); printk("-->hig (150,200) is %d. A high value means many pages are re-used.\n",hig); printk("-->mid (100,150] is %d.\n",mid); printk("-->low (64,100] is %d.\n",low); printk("-->llow (10,64] is %d. Locally, not many pages are re-used.\n",llow); printk("-->lllow (5,10] is %d.\n",lllow); printk("-->llllow [1,5] is %d.\n",llllow); for(j = 0;j < ITERATIONS; j++) avg_hotpage += hot_page[j]; avg_hotpage /= (j+1); /* * new step@20140704 * (1)the different phases of memory utilization * (2)the avg. page accessing utilization * (3)memory pages layout and spectrum */ for(j = 0; j < PAGE_ALL; j++) if(page_heat[j] > -1) /*pages that were accessed at least once.*/ num_access += (page_heat[j] + 1); printk("the total number of memory accesses is %ld, the average is %ld\n", num_access, num_access / ITERATIONS); avg_page_utilization = num_access / all_pages; printk("Avg hot pages num is %ld, all used pages num is %d, avg utilization of each page is %d\n", avg_hotpage, all_pages, avg_page_utilization); /*yanghao: print the reuse-distance information*/ if ((times == 0) && (reuse_time[0] ==0)) printk("page No.%d is not available.",random_page); else { if ((times == 0) && (reuse_time[0] == 0)) printk("page No.%d was not used in these 200 loops.",random_page); else { if (times < ITERATIONS) times++; printk("the reuse time of page No.%d is:",random_page); for (j = 0; j < times; j++) printk("%d ",reuse_time[j]); printk("\n"); printk("the number of values above is the number of times page No.%d was accessed in %d loops.\n", random_page,ITERATIONS); printk("each value is the number of loops between that access and the previous one.\n"); } } printk("\n\n"); return 1; }
void do_page_fault(struct pt_regs *regs) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; unsigned int exccause = regs->exccause; unsigned int address = regs->excvaddr; siginfo_t info; int is_write, is_exec; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; info.si_code = SEGV_MAPERR; /* We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. */ if (address >= TASK_SIZE && !user_mode(regs)) goto vmalloc_fault; /* If we're in an interrupt or have no user * context, we must not take the fault.. */ if (faulthandler_disabled() || !mm) { bad_page_fault(regs, address, SIGSEGV); return; } is_write = (exccause == EXCCAUSE_STORE_CACHE_ATTRIBUTE) ? 1 : 0; is_exec = (exccause == EXCCAUSE_ITLB_PRIVILEGE || exccause == EXCCAUSE_ITLB_MISS || exccause == EXCCAUSE_FETCH_CACHE_ATTRIBUTE) ? 1 : 0; #ifdef DEBUG_PAGE_FAULT printk("[%s:%d:%08x:%d:%08x:%s%s]\n", current->comm, current->pid, address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); #endif if (user_mode(regs)) flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags |= FAULT_FLAG_WRITE; } else if (is_exec) { if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else /* Allow read even from write-only pages. */ if (!(vma->vm_flags & (VM_READ | VM_WRITE))) goto bad_area; /* If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ goto retry; } } up_read(&mm->mmap_sem); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (fault & VM_FAULT_MAJOR) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); return; /* Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); if (user_mode(regs)) { current->thread.bad_vaddr = address; current->thread.error_code = is_write; info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *) address; force_sig_info(SIGSEGV, &info, current); return; } bad_page_fault(regs, address, SIGSEGV); return; /* We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. 
*/ out_of_memory: up_read(&mm->mmap_sem); if (!user_mode(regs)) bad_page_fault(regs, address, SIGKILL); else pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* Send a sigbus, regardless of whether we were in kernel * or user mode. */ current->thread.bad_vaddr = address; info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *) address; force_sig_info(SIGBUS, &info, current); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) bad_page_fault(regs, address, SIGBUS); return; vmalloc_fault: { /* Synchronize this task's top level page-table * with the 'reference' page table. */ struct mm_struct *act_mm = current->active_mm; int index = pgd_index(address); pgd_t *pgd, *pgd_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; if (act_mm == NULL) goto bad_page_fault; pgd = act_mm->pgd + index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) goto bad_page_fault; pgd_val(*pgd) = pgd_val(*pgd_k); pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); if (!pmd_present(*pmd) || !pmd_present(*pmd_k)) goto bad_page_fault; pmd_val(*pmd) = pmd_val(*pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto bad_page_fault; return; } bad_page_fault: bad_page_fault(regs, address, SIGKILL); return; }
static size_t copy_in_user_pt(size_t n, void __user *to, const void __user *from) { struct mm_struct *mm = current->mm; unsigned long offset_from, offset_to, offset_max, pfn_from, pfn_to, uaddr, done, size, error_code; unsigned long uaddr_from = (unsigned long) from; unsigned long uaddr_to = (unsigned long) to; pte_t *pte_from, *pte_to; int write_user; if (segment_eq(get_fs(), KERNEL_DS)) { memcpy((void __force *) to, (void __force *) from, n); return 0; } done = 0; retry: spin_lock(&mm->page_table_lock); do { write_user = 0; uaddr = uaddr_from; pte_from = follow_table(mm, uaddr_from); error_code = (unsigned long) pte_from; if (error_code < 0x1000) goto fault; if (!pte_present(*pte_from)) { error_code = 0x11; goto fault; } write_user = 1; uaddr = uaddr_to; pte_to = follow_table(mm, uaddr_to); error_code = (unsigned long) pte_to; if (error_code < 0x1000) goto fault; if (!pte_present(*pte_to)) { error_code = 0x11; goto fault; } else if (!pte_write(*pte_to)) { error_code = 0x04; goto fault; } pfn_from = pte_pfn(*pte_from); pfn_to = pte_pfn(*pte_to); offset_from = uaddr_from & (PAGE_SIZE-1); offset_to = uaddr_to & (PAGE_SIZE-1); offset_max = max(offset_from, offset_to); size = min(n - done, PAGE_SIZE - offset_max); memcpy((void *)(pfn_to << PAGE_SHIFT) + offset_to, (void *)(pfn_from << PAGE_SHIFT) + offset_from, size); done += size; uaddr_from += size; uaddr_to += size; } while (done < n); spin_unlock(&mm->page_table_lock); return n - done; fault: spin_unlock(&mm->page_table_lock); if (__handle_fault(uaddr, error_code, write_user)) return n - done; goto retry; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long mmu_meh) { struct vm_area_struct *vma = NULL; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; int si_code; int fault; unsigned long address = mmu_meh & PAGE_MASK; si_code = SEGV_MAPERR; #ifndef CONFIG_CPU_HAS_TLBI /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (unlikely(address >= VMALLOC_START) && unlikely(address <= VMALLOC_END)) { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "tsk" here. We might be inside * an interrupt in the middle of a task switch.. */ int offset = __pgd_offset(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; unsigned long pgd_base; pgd_base = tlb_get_pgd(); pgd = (pgd_t *)pgd_base + offset; pgd_k = init_mm.pgd + offset; if (!pgd_present(*pgd_k)) goto no_context; set_pgd(pgd, *pgd_k); pud = (pud_t *)pgd; pud_k = (pud_t *)pgd_k; if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } #endif /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_atomic() || !mm) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: si_code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; BUG(); } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { force_sig_fault(SIGSEGV, si_code, (void __user *)address, current); return; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ bust_spinlocks(1); pr_alert("Unable to handle kernel paging request at virtual " "address 0x%08lx, pc: 0x%08lx\n", address, regs->pc); die_if_kernel("Oops", regs, write); out_of_memory: up_read(&mm->mmap_sem); /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); }
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; int err = 0; int signal; unsigned long address = 0; #ifdef CONFIG_MMU pmd_t *pmdp; pte_t *ptep; #endif frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; signal = current_thread_info()->exec_domain && current_thread_info()->exec_domain->signal_invmap && sig < 32 ? current_thread_info()->exec_domain->signal_invmap[sig] : sig; if (info) err |= copy_siginfo_to_user(&frame->info, info); /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(NULL, &frame->uc.uc_link); err |= __save_altstack(&frame->uc.uc_stack, regs->r1); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* minus 8 is offset to cater for "rtsd r15,8" */ /* addi r12, r0, __NR_sigreturn */ err |= __put_user(0x31800000 | __NR_rt_sigreturn , frame->tramp + 0); /* brki r14, 0x8 */ err |= __put_user(0xb9cc0008, frame->tramp + 1); /* Return from sighandler will jump to the tramp. Negative 8 offset because return is rtsd r15, 8 */ regs->r15 = ((unsigned long)frame->tramp)-8; address = ((unsigned long)frame->tramp); #ifdef CONFIG_MMU pmdp = pmd_offset(pud_offset( pgd_offset(current->mm, address), address), address); preempt_disable(); ptep = pte_offset_map(pmdp, address); if (pte_present(*ptep)) { address = (unsigned long) page_address(pte_page(*ptep)); /* MS: I need add offset in page */ address += ((unsigned long)frame->tramp) & ~PAGE_MASK; /* MS address is virtual */ address = virt_to_phys(address); invalidate_icache_range(address, address + 8); flush_dcache_range(address, address + 8); } pte_unmap(ptep); preempt_enable(); #else flush_icache_range(address, address + 8); flush_dcache_range(address, address + 8); #endif if (err) goto give_sigsegv; /* Set up registers for signal handler */ regs->r1 = (unsigned long) frame; /* Signal handler args: */ regs->r5 = signal; /* arg 0: signum */ regs->r6 = (unsigned long) &frame->info; /* arg 1: siginfo */ regs->r7 = (unsigned long) &frame->uc; /* arg2: ucontext */ /* Offset to handle microblaze rtid r14, 0 */ regs->pc = (unsigned long)ka->sa.sa_handler; set_fs(USER_DS); #ifdef DEBUG_SIG pr_info("SIG deliver (%s:%d): sp=%p pc=%08lx\n", current->comm, current->pid, frame, regs->pc); #endif return 0; give_sigsegv: force_sigsegv(sig, current); return -EFAULT; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long address) { struct vm_area_struct * vma = NULL; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; siginfo_t info; int fault; #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), current->comm, current->pid, field, address, write, field, regs->cp0_epc); #endif info.si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) goto vmalloc_fault; #ifdef MODULE_START if (unlikely(address >= MODULE_START && address < MODULE_END)) goto vmalloc_fault; #endif /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_atomic() || !mm) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { tsk->thread.cp0_badvaddr = address; tsk->thread.error_code = write; #if 0 printk("do_page_fault() #2: sending SIGSEGV to %s for " "invalid %s\n%0*lx (epc == %0*lx, ra == %0*lx)\n", tsk->comm, write ? "write access to" : "read access from", field, address, field, (unsigned long) regs->cp0_epc, field, (unsigned long) regs->regs[31]); #endif info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void __user *) address; force_sig_info(SIGSEGV, &info, tsk); return; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { current->thread.cp0_baduaddr = address; return; } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ bust_spinlocks(1); printk(KERN_ALERT "CPU %d Unable to handle kernel paging request at " "virtual address %0*lx, epc == %0*lx, ra == %0*lx\n", raw_smp_processor_id(), field, address, field, regs->cp0_epc, field, regs->regs[31]); die("Oops", regs); out_of_memory: /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ up_read(&mm->mmap_sem); pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; else /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ #if 0 printk("do_page_fault() #3: sending SIGBUS to %s for " "invalid %s\n%0*lx (epc == %0*lx, ra == %0*lx)\n", tsk->comm, write ? "write access to" : "read access from", field, address, field, (unsigned long) regs->cp0_epc, field, (unsigned long) regs->regs[31]); #endif tsk->thread.cp0_badvaddr = address; info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void __user *) address; force_sig_info(SIGBUS, &info, tsk); return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "tsk" here. We might be inside * an interrupt in the middle of a task switch.. */ int offset = __pgd_offset(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = (pgd_t *) pgd_current[raw_smp_processor_id()] + offset; pgd_k = init_mm.pgd + offset; if (!pgd_present(*pgd_k)) goto no_context; set_pgd(pgd, *pgd_k); pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } }
void flush_tlb_kernel_range_skas(unsigned long start, unsigned long end) { struct mm_struct *mm; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; unsigned long addr, last; int updated = 0, err; mm = &init_mm; for(addr = start; addr < end;){ pgd = pgd_offset(mm, addr); pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); if(!pgd_present(*pgd)){ if(pgd_newpage(*pgd)){ updated = 1; last = addr + PGDIR_SIZE; if(last > end) last = end; err = os_unmap_memory((void *) addr, last - addr); if(err < 0) panic("munmap failed, errno = %d\n", -err); } addr += PGDIR_SIZE; continue; } pud = pud_offset(pgd, addr); if(!pud_present(*pud)){ if(pud_newpage(*pud)){ updated = 1; last = addr + PUD_SIZE; if(last > end) last = end; err = os_unmap_memory((void *) addr, last - addr); if(err < 0) panic("munmap failed, errno = %d\n", -err); } addr += PUD_SIZE; continue; } pmd = pmd_offset(pud, addr); if(!pmd_present(*pmd)){ if(pmd_newpage(*pmd)){ updated = 1; last = addr + PMD_SIZE; if(last > end) last = end; err = os_unmap_memory((void *) addr, last - addr); if(err < 0) panic("munmap failed, errno = %d\n", -err); } addr += PMD_SIZE; continue; } pte = pte_offset_kernel(pmd, addr); if(!pte_present(*pte) || pte_newpage(*pte)){ updated = 1; err = os_unmap_memory((void *) addr, PAGE_SIZE); if(err < 0) panic("munmap failed, errno = %d\n", -err); if(pte_present(*pte)) map_memory(addr, pte_val(*pte) & PAGE_MASK, PAGE_SIZE, 1, 1, 1); } else if(pte_newprot(*pte)){ updated = 1; protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1); } addr += PAGE_SIZE; } }
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) { unsigned long i, pa, gpa, gfn, psize; unsigned long slot_fn, hva; __be64 *hpte; struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned hpage_shift; bool is_ci; unsigned long *rmap; pte_t *ptep; unsigned int writing; unsigned long mmu_seq; unsigned long rcbits, irq_flags = 0; if (kvm_is_radix(kvm)) return H_FUNCTION; psize = kvmppc_actual_pgsz(pteh, ptel); if (!psize) return H_PARAMETER; writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); ptel &= ~HPTE_GR_RESERVED; g_ptel = ptel; /* used later to detect if we might have been invalidated */ mmu_seq = kvm->mmu_notifier_seq; smp_rmb(); /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; is_ci = false; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* Emulated MMIO - mark this with key=31 */ pteh |= HPTE_V_ABSENT; ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; goto do_insert; } /* Check if the requested page fits entirely in the memslot. */ if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; rmap = &memslot->arch.rmap[slot_fn]; /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); /* * If we had a page table change after lookup, we would * retry via mmu_notifier_retry. */ if (!realmode) local_irq_save(irq_flags); /* * If called in real mode we have MSR_EE = 0. Otherwise * we disable irq above. */ ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift); if (ptep) { pte_t pte; unsigned int host_pte_size; if (hpage_shift) host_pte_size = 1ul << hpage_shift; else host_pte_size = PAGE_SIZE; /* * We should always find the guest page size * to be <= host page size, if host is using hugepage */ if (host_pte_size < psize) { if (!realmode) local_irq_restore(irq_flags); return H_PARAMETER; } pte = kvmppc_read_update_linux_pte(ptep, writing); if (pte_present(pte) && !pte_protnone(pte)) { if (writing && !__pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); is_ci = pte_ci(pte); pa = pte_pfn(pte) << PAGE_SHIFT; pa |= hva & (host_pte_size - 1); pa |= gpa & ~PAGE_MASK; } } if (!realmode) local_irq_restore(irq_flags); ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1); ptel |= pa; if (pa) pteh |= HPTE_V_VALID; else { pteh |= HPTE_V_ABSENT; ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); } /* If we had a host pte mapping then check WIMG */ if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { if (is_ci) return H_PARAMETER; /* * Allow guest to map emulated device memory as * uncacheable, but actually make it cacheable. */ ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); ptel |= HPTE_R_M; } /* Find and lock the HPTEG slot to use */ do_insert: if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) break; hpte += 2; } if (i == 8) { /* * Since try_lock_hpte doesn't retry (not even stdcx. * failures), it could be that there is a free slot * but we transiently failed to lock it. Try again, * actually locking each slot and checking it. 
*/ hpte -= 16; for (i = 0; i < 8; ++i) { u64 pte; while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = be64_to_cpu(hpte[0]); if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) break; __unlock_hpte(hpte, pte); hpte += 2; } if (i == 8) return H_PTEG_FULL; } pte_index += i; } else { hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) { /* Lock the slot and check again */ u64 pte; while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = be64_to_cpu(hpte[0]); if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { __unlock_hpte(hpte, pte); return H_PTEG_FULL; } } } /* Save away the guest's idea of the second HPTE dword */ rev = &kvm->arch.hpt.rev[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); if (rev) { rev->guest_rpte = g_ptel; note_hpte_modification(kvm, rev); } /* Link HPTE into reverse-map chain */ if (pteh & HPTE_V_VALID) { if (realmode) rmap = real_vmalloc_addr(rmap); lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ if (mmu_notifier_retry(kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); unlock_rmap(rmap); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); /* Only set R/C in real HPTE if already set in *rmap */ rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); } } /* Convert to new format on P9 */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { ptel = hpte_old_to_new_r(pteh, ptel); pteh = hpte_old_to_new_v(pteh); } hpte[1] = cpu_to_be64(ptel); /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); __unlock_hpte(hpte, pteh); asm volatile("ptesync" : : : "memory"); *pte_idx_ret = pte_index; return H_SUCCESS; }