/* * Trying to stop swapping from a file is fraught with races, so * we repeat quite a bit here when we have to pause. swapoff() * isn't exactly timing-critical, so who cares (but this is /really/ * inefficient, ugh). * * We return 1 after having slept, which makes the process start over * from the beginning for this process.. */ static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t *dir, unsigned int type, unsigned long page) { pte_t pte = *dir; if (pte_none(pte)) return 0; if (pte_present(pte)) { unsigned long page_nr = MAP_NR(pte_page(pte)); if (page_nr >= MAP_NR(high_memory)) return 0; if (!in_swap_cache(page_nr)) return 0; if (SWP_TYPE(in_swap_cache(page_nr)) != type) return 0; delete_from_swap_cache(page_nr); set_pte(dir, pte_mkdirty(pte)); return 0; } if (SWP_TYPE(pte_val(pte)) != type) return 0; read_swap_page(pte_val(pte), (char *) page); #if 0 /* Is this really needed here, hasn't it been solved elsewhere? */ flush_page_to_ram(page); #endif if (pte_val(*dir) != pte_val(pte)) { free_page(page); return 1; } set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); flush_tlb_page(vma, address); ++vma->vm_mm->rss; swap_free(pte_val(pte)); return 1; }
/* this routine handles present pages, when users try to write to a shared page. */ void do_wp_page(struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; pmd_t *pmd; pte_t *page_table,pte; unsigned long old_page, new_page; new_page = get_free_page(GFP_KERNEL); pgd = pgd_offset(vma->vm_task, address); if(pgd_none(*pgd)) goto end_wp_page; if(pgd_bad(*pgd)) goto bad_wp_page; pmd = pmd_offset(pgd,address); if(pmd_none(*pmd)) goto end_wp_page; if(pmd_bad(*pmd)) goto bad_wp_page; page_table = pte_offset(pmd,address); pte = *page_table; if(!pte_present(pte)) goto end_wp_page; if(pte_write(pte)) goto end_wp_page; old_page = pte_page(pte); if(old_page >= main_memory_end) goto bad_wp_page; (vma->vm_task->mm->min_flt)++; if(mem_map[MAP_NR(old_page)].flags & PAGE_PRESENT) { if(new_page) { if(mem_map[MAP_NR(old_page)].flags & MAP_PAGE_RESERVED) ++(vma->vm_task->mm->rss); copy_page(old_page, new_page); *page_table = pte_mkwrite(pte_mkdirty(mk_pte((unsigned long)&new_page, vma->vm_page_prot))); free_page(old_page); return; } pte_val(*page_table) &= PAGE_BAD; free_page(old_page); oom(); return; } *page_table = pte_mkdirty(pte_mkwrite(pte)); if(new_page) free_page(new_page); return; bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); goto end_wp_page; end_wp_page: if(new_page) free_page(new_page); return; }
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte; if (pte_cont(*ptep)) { int ncontig, i; size_t pgsize; pte_t *cpte; bool is_dirty = false; cpte = huge_pte_offset(mm, addr); ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); for (i = 1; i < ncontig; ++i) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page * in the set, so check them all. */ ++cpte; if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) is_dirty = true; } if (is_dirty) return pte_mkdirty(pte); else return pte; } else { return ptep_get_and_clear(mm, addr, ptep); } }
/* * Changing some bits of contiguous entries requires us to follow a * Break-Before-Make approach, breaking the whole contiguous set * before we can change any entries. See ARM DDI 0487A.k_iss10775, * "Misprogramming of the Contiguous bit", page D4-1762. * * This helper performs the break step. */ static pte_t get_clear_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { pte_t orig_pte = huge_ptep_get(ptep); bool valid = pte_valid(orig_pte); unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { pte_t pte = ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on * the dirty or accessed bit for any page in the set, * so check them all. */ if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } if (valid) { struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); flush_tlb_range(&vma, saddr, addr); } return orig_pte; }
/* * page not present ... go through shm_pages */ static unsigned long shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share) { pte_t pte; struct shmid_kernel *shp; unsigned int id, idx; id = SWP_OFFSET(shmd->vm_pte) & SHM_ID_MASK; idx = (address - shmd->vm_start + shmd->vm_offset) >> PAGE_SHIFT; #ifdef DEBUG_SHM if (id > max_shmid) { printk ("shm_nopage: id=%d too big. proc mem corrupted\n", id); return 0; } #endif shp = shm_segs[id]; #ifdef DEBUG_SHM if (shp == IPC_UNUSED || shp == IPC_NOID) { printk ("shm_nopage: id=%d invalid. Race.\n", id); return 0; } #endif /* This can occur on a remap */ if (idx >= shp->shm_npages) { return 0; } pte = __pte(shp->shm_pages[idx]); if (!pte_present(pte)) { unsigned long page = get_free_page(GFP_USER); if (!page) return -1; pte = __pte(shp->shm_pages[idx]); if (pte_present(pte)) { free_page (page); /* doesn't sleep */ goto done; } if (!pte_none(pte)) { rw_swap_page_nocache(READ, pte_val(pte), (char *)page); pte = __pte(shp->shm_pages[idx]); if (pte_present(pte)) { free_page (page); /* doesn't sleep */ goto done; } swap_free(pte_val(pte)); shm_swp--; } shm_rss++; pte = pte_mkdirty(mk_pte(page, PAGE_SHARED)); shp->shm_pages[idx] = pte_val(pte); } else --current->maj_flt; /* was incremented in do_no_page */ done: /* pte_val(pte) == shp->shm_pages[idx] */ current->min_flt++; atomic_inc(&mem_map[MAP_NR(pte_page(pte))].count); return pte_page(pte); }
/* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { flush_page_to_ram(new_page); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); }
/* * Changing some bits of contiguous entries requires us to follow a * Break-Before-Make approach, breaking the whole contiguous set * before we can change any entries. See ARM DDI 0487A.k_iss10775, * "Misprogramming of the Contiguous bit", page D4-1762. * * This helper performs the break step. */ static pte_t get_clear_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { struct vm_area_struct vma = { .vm_mm = mm }; pte_t orig_pte = huge_ptep_get(ptep); bool valid = pte_valid(orig_pte); unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { pte_t pte = ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on * the dirty bit for any page in the set, so check * them all. All hugetlb entries are already young. */ if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); } if (valid) flush_tlb_range(&vma, saddr, addr); return orig_pte; }
int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { int ncontig, i, changed = 0; size_t pgsize = 0; unsigned long pfn = pte_pfn(pte), dpfn; pgprot_t hugeprot; pte_t orig_pte; if (!pte_cont(pte)) return ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); if (!pte_same(orig_pte, pte)) changed = 1; /* Make sure we don't lose the dirty state */ if (pte_dirty(orig_pte)) pte = pte_mkdirty(pte); hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); return changed; }
/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * Goto-purists beware: the only reason for goto's here is that it results * in better assembly code.. The "default" path will see no jumps at all. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary * COW. * * We also mark the page dirty at this point even though the page will * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * * We hold the mm semaphore and the page_table_lock on entry and exit * with the page_table_lock released. */ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t pte) { struct page *old_page, *new_page; old_page = pte_page(pte); if (!VALID_PAGE(old_page)) goto bad_wp_page; if (!TryLockPage(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { #ifndef CONFIG_SUPERH /* Not needed for VIPT cache */ flush_cache_page(vma, address); #endif establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); spin_unlock(&mm->page_table_lock); return 1; /* Minor fault */ } } /* * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; copy_cow_page(old_page,new_page,address); /* * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; break_cow(vma, new_page, address, page_table); lru_cache_add(new_page); /* Free the old page.. */ new_page = old_page; } spin_unlock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 1; /* Minor fault */ bad_wp_page: spin_unlock(&mm->page_table_lock); printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); return -1; no_mem: page_cache_release(old_page); return -1; }
/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * Note the "page_table_lock". It is to protect against kswapd removing * pages from under us. Note that kswapd only ever _removes_ pages, never * adds them. As such, once we have noticed that the page is not present, * we can drop the lock early. * * The adding of pages is protected by the MM semaphore (which we hold), * so we don't need to worry about a page being suddenly been added into * our VM. */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { pte_t entry; /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { /* * If it truly wasn't present, we know that kswapd * and the PTE updates will not touch it later. So * drop the lock. */ spin_unlock(&mm->page_table_lock); if (pte_none(entry)) return do_no_page(mm, vma, address, write_access, pte); return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access); } if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); spin_unlock(&mm->page_table_lock); return 1; }
/* * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. */ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) { pgd_t * pgd; pmd_t * pmd; pte_t * pte; if (page >= high_memory) printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); if (mem_map[MAP_NR(page)].count != 1) printk("mem_map disagrees with %08lx at %08lx\n",page,address); pgd = pgd_offset(tsk->mm,address); pmd = pmd_alloc(pgd, address); if (!pmd) { free_page(page); oom(tsk); return 0; } pte = pte_alloc(pmd, address); if (!pte) { free_page(page); oom(tsk); return 0; } if (!pte_none(*pte)) { printk("put_dirty_page: page already exists\n"); free_page(page); return 0; } flush_page_to_ram(page); set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)))); /* no need for invalidate */ return page; }
static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow) { pte_t pte = *old_pte; unsigned long page_nr; if (pte_none(pte)) return; if (!pte_present(pte)) { swap_duplicate(pte_val(pte)); set_pte(new_pte, pte); return; } page_nr = MAP_NR(pte_page(pte)); if (page_nr >= MAP_NR(high_memory) || PageReserved(mem_map+page_nr)) { set_pte(new_pte, pte); return; } if (cow) pte = pte_wrprotect(pte); if (delete_from_swap_cache(page_nr)) pte = pte_mkdirty(pte); set_pte(new_pte, pte_mkold(pte)); set_pte(old_pte, pte); mem_map[page_nr].count++; }
/* Remap IO memory, the same way as remap_pfn_range(), but use * the obio memory space. * * They use a pgprot that sets PAGE_IO and does not check the * mem_map table as this is independent of normal memory. */ static inline void io_remap_pte_range(struct mm_struct *mm, pte_t * pte, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; /* clear hack bit that was used as a write_combine side-effect flag */ offset &= ~0x1UL; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; do { pte_t entry; unsigned long curend = address + PAGE_SIZE; entry = mk_pte_io(offset, prot, space, PAGE_SIZE); if (!(address & 0xffff)) { if (PAGE_SIZE < (4 * 1024 * 1024) && !(address & 0x3fffff) && !(offset & 0x3ffffe) && end >= address + 0x400000) { entry = mk_pte_io(offset, prot, space, 4 * 1024 * 1024); curend = address + 0x400000; offset += 0x400000; } else if (PAGE_SIZE < (512 * 1024) && !(address & 0x7ffff) && !(offset & 0x7fffe) && end >= address + 0x80000) { entry = mk_pte_io(offset, prot, space, 512 * 1024 * 1024); curend = address + 0x80000; offset += 0x80000; } else if (PAGE_SIZE < (64 * 1024) && !(offset & 0xfffe) && end >= address + 0x10000) { entry = mk_pte_io(offset, prot, space, 64 * 1024); curend = address + 0x10000; offset += 0x10000; } else offset += PAGE_SIZE; } else offset += PAGE_SIZE; if (pte_write(entry)) entry = pte_mkdirty(entry); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, address, pte, entry); address += PAGE_SIZE; pte_val(entry) += PAGE_SIZE; pte++; } while (address < curend); } while (address < end); }
pte_t __bad_page(void) { extern char empty_bad_page[PAGE_SIZE]; unsigned long page = (unsigned long)empty_bad_page; clear_page(page); return pte_mkdirty(mk_pte(page, PAGE_SHARED)); }
/* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { flush_page_to_ram(new_page); #ifndef CONFIG_SUPERH /* Not needed for VIPT cache (need better API for caches) */ flush_cache_page(vma, address); #endif establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); }
pte_t __bad_page(void) { extern char empty_bad_page[PAGE_SIZE]; __asm__ __volatile__("cld ; rep ; stosl": :"a" (0), "D" ((long) empty_bad_page), "c" (PAGE_SIZE/4) :"di","cx"); return pte_mkdirty(mk_pte((unsigned long) empty_bad_page, PAGE_SHARED)); }
/* * Free the swap entry and set the new pte for the shm page. */ static void shm_unuse_page(struct shmid_kernel *shp, unsigned long idx, unsigned long page, unsigned long entry) { pte_t pte; pte = pte_mkdirty(mk_pte(page, PAGE_SHARED)); shp->shm_pages[idx] = pte_val(pte); atomic_inc(&mem_map[MAP_NR(page)].count); shm_rss++; swap_free(entry); shm_swp--; }
/* * This routine puts a long into any process space by following the page * tables. NOTE! You should check that the long isn't on a page boundary, * and that it is in the task area before calling this: this routine does * no checking. * * Now keeps R/W state of page so that a text page stays readonly * even if a debugger scribbles breakpoints into it. -M.U- */ static void put_long(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long addr, unsigned long data) { pgd_t *pgdir; pmd_t *pgmiddle; pte_t *pgtable; unsigned long page; repeat: pgdir = pgd_offset(vma->vm_mm, addr); if (!pgd_present(*pgdir)) { do_no_page(tsk, vma, addr, 1); goto repeat; } if (pgd_bad(*pgdir)) { printk("ptrace: bad page directory %08lx\n", pgd_val(*pgdir)); pgd_clear(pgdir); return; } pgmiddle = pmd_offset(pgdir,addr); if (pmd_none(*pgmiddle)) { do_no_page(tsk, vma, addr, 1); goto repeat; } if (pmd_bad(*pgmiddle)) { printk("ptrace: bad page directory %08lx\n", pmd_val(*pgmiddle)); pmd_clear(pgmiddle); return; } pgtable = pte_offset(pgmiddle, addr); if (!pte_present(*pgtable)) { do_no_page(tsk, vma, addr, 1); goto repeat; } page = pte_page(*pgtable); if (!pte_write(*pgtable)) { do_wp_page(tsk, vma, addr, 2); goto repeat; } /* this is a hack for non-kernel-mapped video buffers and similar */ if (page < high_memory) { *(unsigned long *) (page + (addr & ~PAGE_MASK)) = data; flush_page_to_ram (page); } /* we're bypassing pagetables, so we have to set the dirty bit ourselves */ /* this should also re-instate whatever read-only mode there was before */ *pgtable = pte_mkdirty(mk_pte(page, vma->vm_page_prot)); flush_tlb_all(); }
/* * The swap entry has been read in advance, and we return 1 to indicate * that the page has been used or is no longer needed. * * Always set the resulting pte to be nowrite (the same as COW pages * after one process has exited). We don't know just how many PTEs will * share this swap entry, so be cautious and let do_wp_page work out * what to do if a write is requested later. */ static inline void unswap_pte(struct vm_area_struct * vma, unsigned long address, pte_t *dir, unsigned long entry, unsigned long page /*, int isswap */) { pte_t pte = *dir; if (pte_none(pte)) return; if (pte_present(pte)) { /* If this entry is swap-cached, then page must already hold the right address for any copies in physical memory */ if (pte_page(pte) != page) return; if (0 /* isswap */) mem_map[MAP_NR(pte_page(pte))].offset = page; else /* We will be removing the swap cache in a moment, so... */ set_pte(dir, pte_mkdirty(pte)); return; } if (pte_val(pte) != entry) return; if (0 /* isswap */) { DPRINTK( "unswap_pte: replacing entry %08lx by %08lx", entry, page ); set_pte(dir, __pte(page)); } else { DPRINTK( "unswap_pte: replacing entry %08lx by new page %08lx", entry, page ); set_pte(dir, pte_mkdirty(mk_pte(page,vma->vm_page_prot))); atomic_inc(&mem_map[MAP_NR(page)].count); ++vma->vm_mm->rss; } swap_free(entry); }
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { pte_t pte = __pte(flags); /* writeable implies dirty for kernel addresses */ if (pte_write(pte)) pte = pte_mkdirty(pte); /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ pte = pte_exprotect(pte); pte = pte_mkprivileged(pte); return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); }
/* * The above separate functions for the no-page and wp-page * cases will go away (they mostly do the same thing anyway), * and we'll instead use only a general "handle_mm_fault()". * * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). */ static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { if (!pte_present(*pte)) { do_no_page(current, vma, address, write_access); return; } set_pte(pte, pte_mkyoung(*pte)); flush_tlb_page(vma, address); if (!write_access) return; if (pte_write(*pte)) { set_pte(pte, pte_mkdirty(*pte)); flush_tlb_page(vma, address); return; } do_wp_page(current, vma, address, write_access); }
/* * This only needs the MM semaphore */ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { struct page *page = NULL; pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { page = alloc_page(GFP_HIGHUSER); if (!page) return -1; clear_user_highpage(page, addr); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); mm->rss++; flush_page_to_ram(page); } set_pte(page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); return 1; /* Minor fault */ }
/* * the function for the no-page and wp_page */ void handle_pte_fault(struct vm_area_struct *vma, unsigned long address, pte_t *pte,int write_access) { // no page present if(!pte_present(*pte)) { do_no_page(vma,address,write_access); return; } *pte = pte_mkyoung(*pte); if(!write_access) { *pte = pte_mkdirty(*pte); return; } // write the shared page do_wp_page(vma, address, write_access); }
/* * do_no_page() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if * the "write_access" parameter is true in order to avoid the next * page fault. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * This is called with the MM semaphore held. */ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t *page_table) { struct page * new_page; pte_t entry; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, write_access, address); /* * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. It's * essentially an early COW detection. */ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); if (new_page == NULL) /* no page was available -- SIGBUS */ return 0; if (new_page == NOPAGE_OOM) return -1; ++mm->rss; /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid * for other architectures too. * * Note that if write_access is true, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. */ flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) { entry = pte_mkwrite(pte_mkdirty(entry)); } else if (page_count(new_page) > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); set_pte(page_table, entry); /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); return 2; /* Major fault */ }
/* * We are called with the MM semaphore and page_table_lock * spinlock held to protect against concurrent faults in * multithreaded programs. */ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { struct page *page; /* Allocate our own private page. */ spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); if (!page) goto no_mem; clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); if (!pte_none(*page_table)) { page_cache_release(page); spin_unlock(&mm->page_table_lock); return 1; } mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); mark_page_accessed(page); } set_pte(page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); return 1; /* Minor fault */ no_mem: return -1; }
static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, swp_entry_t entry, int write_access) { struct page *page = lookup_swap_cache(entry); pte_t pte; if (!page) { lock_kernel(); swapin_readahead(entry); page = read_swap_cache(entry); unlock_kernel(); if (!page) return -1; flush_page_to_ram(page); flush_icache_page(vma, page); } mm->rss++; pte = mk_pte(page, vma->vm_page_prot); /* * Freeze the "shared"ness of the page, ie page_count + swap_count. * Must lock page before transferring our swap count to already * obtained page count. */ lock_page(page); swap_free(entry); if (write_access && !is_page_shared(page)) pte = pte_mkwrite(pte_mkdirty(pte)); UnlockPage(page); set_pte(page_table, pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); return 1; /* Minor fault */ }
/* * This routine puts a long into any process space by following the page * tables. NOTE! You should check that the long isn't on a page boundary, * and that it is in the task area before calling this: this routine does * no checking. * * Now keeps R/W state of page so that a text page stays readonly * even if a debugger scribbles breakpoints into it. -M.U- */ static void put_long(struct vm_area_struct * vma, unsigned long addr, unsigned long data) { pgd_t *pgdir; pte_t *pgtable; unsigned long page; repeat: pgdir = PAGE_DIR_OFFSET(vma->vm_task, addr); if (!pgd_present(*pgdir)) { do_no_page(vma, addr, 1); goto repeat; } if (pgd_bad(*pgdir)) { printk("ptrace: bad page directory %08lx\n", pgd_val(*pgdir)); pgd_clear(pgdir); return; } pgtable = (pte_t *) (PAGE_PTR(addr) + pgd_page(*pgdir)); if (!pte_present(*pgtable)) { do_no_page(vma, addr, 1); goto repeat; } page = pte_page(*pgtable); if (!pte_write(*pgtable)) { do_wp_page(vma, addr, 1); goto repeat; } /* this is a hack for non-kernel-mapped video buffers and similar */ if (page < high_memory) { page += addr & ~PAGE_MASK; *(unsigned long *) page = data; } /* we're bypassing pagetables, so we have to set the dirty bit ourselves */ /* this should also re-instate whatever read-only mode there was before */ *pgtable = pte_mkdirty(mk_pte(page, vma->vm_page_prot)); invalidate(); }
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { /* No page to get reference */ if (flags & FOLL_GET) return -EFAULT; if (flags & FOLL_TOUCH) { pte_t entry = *pte; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(*pte, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; }
/*H:330 * (i) Looking up a page table entry when the Guest faults. * * We saw this call in run_guest(): when we see a page fault in the Guest, we * come here. That's because we only set up the shadow page tables lazily as * they're needed, so we get page faults all the time and quietly fix them up * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; pgd_t *spgd; unsigned long gpte_ptr; pte_t gpte; pte_t *spte; /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; #endif /* First step: get the top-level Guest page table entry. */ if (unlikely(cpu->linear_pages)) { /* Faking up a linear mapping. */ gpgd = __pgd(CHECK_GPGD_MASK); } else { gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return false; } /* Now look at the matching shadow entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE if (unlikely(cpu->linear_pages)) { /* Faking up a linear mapping. */ gpmd = __pmd(_PAGE_TABLE); } else { gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; } /* Now look at the matching shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); /* * And we copy the flags to the shadow PMD entry. The page * number in the shadow PMD is the page we just allocated. */ set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } /* * OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else /* * OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif if (unlikely(cpu->linear_pages)) { /* Linear? Make up a PTE which points to same page. */ gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); } else { /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); } /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; /* * Check they're not trying to write to a page the Guest wants * read-only (bit 2 of errcode == write). */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; /* User access to a kernel-only page? (bit 3 == user access) */ if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; /* * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); if (errcode & 2) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); /* * If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); /* * If this is a write, we insist that the Guest page is writable (the * final arg to gpte_to_spte()). */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else /* * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); /* * Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ if (likely(!cpu->linear_pages)) lgwrite(cpu, gpte_ptr, pte_t, gpte); /* * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest * has that a page fault occurred at all. */ return true; }
/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * Goto-purists beware: the only reason for goto's here is that it results * in better assembly code.. The "default" path will see no jumps at all. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary * COW. * * We also mark the page dirty at this point even though the page will * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * * We enter with the page table read-lock held, and need to exit without * it. */ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t pte) { struct page *old_page, *new_page; old_page = pte_page(pte); if (!VALID_PAGE(old_page)) goto bad_wp_page; /* * We can avoid the copy if: * - we're the only user (count == 1) * - the only other user is the swap cache, * and the only swap cache user is itself, * in which case we can just continue to * use the same swap cache (it will be * marked dirty). */ switch (page_count(old_page)) { case 2: /* * Lock the page so that no one can look it up from * the swap cache, grab a reference and start using it. * Can not do lock_page, holding page_table_lock. */ if (!PageSwapCache(old_page) || TryLockPage(old_page)) break; if (is_page_shared(old_page)) { UnlockPage(old_page); break; } UnlockPage(old_page); /* FallThrough */ case 1: flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); spin_unlock(&mm->page_table_lock); return 1; /* Minor fault */ } /* * Ok, we need to copy. Oh, well.. */ spin_unlock(&mm->page_table_lock); new_page = page_cache_alloc(); if (!new_page) return -1; spin_lock(&mm->page_table_lock); /* * Re-check the pte - we dropped the lock */ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; break_cow(vma, old_page, new_page, address, page_table); /* Free the old page.. */ new_page = old_page; } spin_unlock(&mm->page_table_lock); page_cache_release(new_page); return 1; /* Minor fault */ bad_wp_page: spin_unlock(&mm->page_table_lock); printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); return -1; }