// swap_copy_entry - copy the content of a swapped-out page frame to a new page,
//                 - set the new page's PG_swap flag and add it to the swap active list
int
swap_copy_entry(swap_entry_t entry, swap_entry_t *store) {
    if (store == NULL) {
        return -E_INVAL;
    }

    int ret = -E_NO_MEM;
    struct Page *page, *newpage;
    swap_duplicate(entry);

    if ((newpage = alloc_page()) == NULL) {
        goto failed;
    }
    if ((ret = swap_in_page(entry, &page)) != 0) {
        goto failed_free_page;
    }

    ret = -E_NO_MEM;
    if (!swap_page_add(newpage, 0)) {
        goto failed_free_page;
    }
    swap_active_list_add(newpage);
    memcpy(page2kva(newpage), page2kva(page), PGSIZE);
    *store = newpage->index;
    ret = 0;
out:
    swap_remove_entry(entry);
    return ret;
failed_free_page:
    free_page(newpage);
failed:
    goto out;
}
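Every listing in this section leans on swap_duplicate() to take an extra reference on a swap slot before a second pte (or the swap cache) starts pointing at it, but the helper itself is not quoted anywhere above. What follows is a minimal sketch of the ucore-style version, assumed rather than quoted from the source tree: it treats mem_map[] as the per-slot reference-count array and swap_offset() as the slot extractor, the same names the check_swap() listings below manipulate directly.

// Minimal sketch (assumption, not the quoted source): bump the reference
// count of the swap slot named by `entry`. Callers such as swap_copy_entry()
// pair this with swap_remove_entry()/try_free_swap_entry(), which drop the
// count again and release the slot once it reaches zero.
void
swap_duplicate(swap_entry_t entry) {
    size_t offset = swap_offset(entry);
    assert(offset > 0 && offset < max_swap_offset);
    mem_map[offset]++;
}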
static inline void
copy_one_pte(pte_t *old_pte, pte_t *new_pte, int cow)
{
    pte_t pte = *old_pte;
    unsigned long page_nr;

    if (pte_none(pte))
        return;
    if (!pte_present(pte)) {
        swap_duplicate(pte_val(pte));
        set_pte(new_pte, pte);
        return;
    }
    page_nr = MAP_NR(pte_page(pte));
    if (page_nr >= MAP_NR(high_memory) || PageReserved(mem_map + page_nr)) {
        set_pte(new_pte, pte);
        return;
    }
    if (cow)
        pte = pte_wrprotect(pte);
    if (delete_from_swap_cache(page_nr))
        pte = pte_mkdirty(pte);
    set_pte(new_pte, pte_mkold(pte));
    set_pte(old_pte, pte);
    mem_map[page_nr].count++;
}
// swap_out_vma - try to unmap ptes and move their pages onto the swap active list.
static int
swap_out_vma(struct mm_struct *mm, struct vma_struct *vma, uintptr_t addr, size_t require) {
    if (require == 0 || !(addr >= vma->vm_start && addr < vma->vm_end)) {
        return 0;
    }
    uintptr_t end;
    size_t free_count = 0;
    addr = ROUNDDOWN(addr, PGSIZE), end = ROUNDUP(vma->vm_end, PGSIZE);
    while (addr < end && require != 0) {
        pte_t *ptep = get_pte(mm->pgdir, addr, 0);
        if (ptep == NULL) {
            if (get_pud(mm->pgdir, addr, 0) == NULL) {
                addr = ROUNDDOWN(addr + PUSIZE, PUSIZE);
            } else if (get_pmd(mm->pgdir, addr, 0) == NULL) {
                addr = ROUNDDOWN(addr + PMSIZE, PMSIZE);
            } else {
                addr = ROUNDDOWN(addr + PTSIZE, PTSIZE);
            }
            continue;
        }
        if (ptep_present(ptep)) {
            struct Page *page = pte2page(*ptep);
            assert(!PageReserved(page));
            if (ptep_accessed(ptep)) {
                ptep_unset_accessed(ptep);
                mp_tlb_invalidate(mm->pgdir, addr);
                goto try_next_entry;
            }
            if (!PageSwap(page)) {
                if (!swap_page_add(page, 0)) {
                    goto try_next_entry;
                }
                swap_active_list_add(page);
            } else if (ptep_dirty(ptep)) {
                SetPageDirty(page);
            }
            swap_entry_t entry = page->index;
            swap_duplicate(entry);
            page_ref_dec(page);
            ptep_copy(ptep, &entry);
            mp_tlb_invalidate(mm->pgdir, addr);
            mm->swap_address = addr + PGSIZE;
            free_count++, require--;
            if ((vma->vm_flags & VM_SHARE) && page_ref(page) == 1) {
                uintptr_t shmem_addr = addr - vma->vm_start + vma->shmem_off;
                pte_t *sh_ptep = shmem_get_entry(vma->shmem, shmem_addr, 0);
                assert(sh_ptep != NULL && !ptep_invalid(sh_ptep));
                if (ptep_present(sh_ptep)) {
                    shmem_insert_entry(vma->shmem, shmem_addr, entry);
                }
            }
        }
try_next_entry:
        addr += PGSIZE;
    }
    return free_count;
}
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
            struct vm_area_struct *vma, unsigned long addr)
{
    struct page *found_page, *new_page = NULL;
    int err;

    do {
        /*
         * First check the swap cache. Since this is normally
         * called after lookup_swap_cache() failed, re-calling
         * that would confuse statistics.
         */
        found_page = find_get_page(&swapper_space, entry.val);
        if (found_page)
            break;

        /*
         * Get a new page to read into from swap.
         */
        if (!new_page) {
            new_page = alloc_page_vma(gfp_mask, vma, addr);
            if (!new_page)
                break;          /* Out of memory */
        }

        /*
         * Swap entry may have been freed since our caller observed it.
         */
        if (!swap_duplicate(entry))
            break;

        /*
         * Associate the page with swap entry in the swap cache.
         * May fail (-EEXIST) if there is already a page associated
         * with this entry in the swap cache: added by a racing
         * read_swap_cache_async, or add_to_swap or shmem_writepage
         * re-using the just freed swap entry for an existing page.
         * May fail (-ENOMEM) if radix-tree node allocation failed.
         */
        __set_page_locked(new_page);
        SetPageSwapBacked(new_page);
        err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
        if (likely(!err)) {
            /*
             * Initiate read into locked page and return.
             */
            lru_cache_add_anon(new_page);
            swap_readpage(NULL, new_page);
            return new_page;
        }
        ClearPageSwapBacked(new_page);
        __clear_page_locked(new_page);
        swap_free(entry);
    } while (err != -ENOMEM);

    if (new_page)
        page_cache_release(new_page);
    return found_page;
}
struct page * read_swap_cache_async(unsigned long entry, int wait)
{
    struct page *found_page = 0, *new_page;
    unsigned long new_page_addr;

#ifdef DEBUG_SWAP
    printk("DebugVM: read_swap_cache_async entry %08lx%s\n",
           entry, wait ? ", wait" : "");
#endif
    /*
     * Make sure the swap entry is still in use.
     */
    if (!swap_duplicate(entry))     /* Account for the swap cache */
        goto out;
    /*
     * Look for the page in the swap cache.
     */
    found_page = lookup_swap_cache(entry);
    if (found_page)
        goto out_free_swap;

    new_page_addr = __get_free_page(GFP_USER);
    if (!new_page_addr)
        goto out_free_swap;     /* Out of memory */
    new_page = mem_map + MAP_NR(new_page_addr);

    /*
     * Check the swap cache again, in case we stalled above.
     */
    found_page = lookup_swap_cache(entry);
    if (found_page)
        goto out_free_page;
    /*
     * Add it to the swap cache and read its contents.
     */
    if (!add_to_swap_cache(new_page, entry))
        goto out_free_page;

    set_bit(PG_locked, &new_page->flags);
    rw_swap_page(READ, entry, (char *) new_page_addr, wait);
#ifdef DEBUG_SWAP
    printk("DebugVM: read_swap_cache_async created "
           "entry %08lx at %p\n",
           entry, (char *) page_address(new_page));
#endif
    return new_page;

out_free_page:
    __free_page(new_page);
out_free_swap:
    swap_free(entry);
out:
    return found_page;
}
/*
 * Strange swizzling function only for use by shmem_writepage
 */
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
    int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
    if (!err) {
        remove_from_page_cache(page);
        page_cache_release(page);       /* pagecache ref */
        if (!swap_duplicate(entry))
            BUG();
        SetPageDirty(page);
        INC_CACHE_INFO(add_total);
    } else if (err == -EEXIST)
        INC_CACHE_INFO(exist_race);
    return err;
}
int
shmem_insert_entry(struct shmem_struct *shmem, uintptr_t addr, pte_t entry) {
    pte_t *ptep = shmem_get_entry(shmem, addr, 1);
    if (ptep == NULL) {
        return -E_NO_MEM;
    }
    if (*ptep != 0) {
        shmem_remove_entry_pte(ptep);
    }
    if (entry & PTE_P) {
        page_ref_inc(pte2page(entry));
    } else if (entry != 0) {
        swap_duplicate(entry);
    }
    *ptep = entry;
    return 0;
}
int
shmem_insert_entry(struct shmem_struct *shmem, uintptr_t addr, pte_t entry) {
    pte_t *ptep = shmem_get_entry(shmem, addr, 1);
    if (ptep == NULL) {
        return -E_NO_MEM;
    }
    if (!ptep_invalid(ptep)) {
        shmem_remove_entry_pte(ptep);
    }
    if (ptep_present(&entry)) {
        page_ref_inc(pte2page(entry));
    } else if (!ptep_invalid(&entry)) {
        swap_duplicate(entry);
    }
    ptep_copy(ptep, &entry);
    return 0;
}
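Both shmem_insert_entry() variants call shmem_remove_entry_pte() to release whatever the slot referenced before the new entry is stored, but that helper is not shown here. The following is a hypothetical sketch of its shape, assumed from the reference-counting conventions visible above (a present page holds one page_ref, a swap entry holds one mem_map reference), and it reuses only accessors that appear in these listings; it is not a quotation of the ucore source.

// Hypothetical sketch (assumption): drop the shmem reference held by *ptep
// and clear the slot. A present page gives up one page reference and is
// freed when the count reaches zero; a swap entry gives up one swap-map
// reference via swap_remove_entry().
static void
shmem_remove_entry_pte(pte_t *ptep) {
    assert(ptep != NULL);
    if (ptep_present(ptep)) {
        struct Page *page = pte2page(*ptep);
        page_ref_dec(page);
        if (page_ref(page) == 0) {
            free_page(page);
        }
        ptep_unmap(ptep);
    } else if (!ptep_invalid(ptep)) {
        swap_remove_entry(*ptep);
        ptep_unmap(ptep);
    }
}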
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
    struct page *found_page = 0, *new_page;
    unsigned long new_page_addr;

    /*
     * Make sure the swap entry is still in use.
     */
    if (!swap_duplicate(entry))     /* Account for the swap cache */
        goto out;
    /*
     * Look for the page in the swap cache.
     */
    found_page = lookup_swap_cache(entry);
    if (found_page)
        goto out_free_swap;

    new_page_addr = __get_free_page(GFP_USER);
    if (!new_page_addr)
        goto out_free_swap;     /* Out of memory */
    new_page = virt_to_page(new_page_addr);

    /*
     * Check the swap cache again, in case we stalled above.
     */
    found_page = lookup_swap_cache(entry);
    if (found_page)
        goto out_free_page;
    /*
     * Add it to the swap cache and read its contents.
     */
    lock_page(new_page);
    add_to_swap_cache(new_page, entry);
    rw_swap_page(READ, new_page, wait);
    return new_page;

out_free_page:
    page_cache_release(new_page);
out_free_swap:
    swap_free(entry);
out:
    return found_page;
}
int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
    if (page->mapping)
        BUG();
    if (!swap_duplicate(entry)) {
        INC_CACHE_INFO(noent_race);
        return -ENOENT;
    }
    if (add_to_page_cache_unique(page, &swapper_space, entry.val,
            page_hash(&swapper_space, entry.val)) != 0) {
        swap_free(entry);
        INC_CACHE_INFO(exist_race);
        return -EEXIST;
    }
    if (!PageLocked(page))
        BUG();
    if (!PageSwapCache(page))
        BUG();
    INC_CACHE_INFO(add_total);
    return 0;
}
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
    int error;

    if (!swap_duplicate(entry)) {
        INC_CACHE_INFO(noent_race);
        return -ENOENT;
    }
    error = __add_to_swap_cache(page, entry, GFP_KERNEL);
    /*
     * Anon pages are already on the LRU, we don't run lru_cache_add here.
     */
    if (error) {
        swap_free(entry);
        if (error == -EEXIST)
            INC_CACHE_INFO(exist_race);
        return error;
    }
    INC_CACHE_INFO(add_total);
    return 0;
}
// page_launder - try to move pages to swap_active_list OR swap_inactive_list,
//              - and call swapfs_write to swap out pages on swap_inactive_list
int
page_launder(void) {
    size_t maxscan = nr_inactive_pages, free_count = 0;
    list_entry_t *list = &(inactive_list.swap_list), *le = list_next(list);
    while (maxscan-- > 0 && le != list) {
        struct Page *page = le2page(le, swap_link);
        le = list_next(le);
        if (!(PageSwap(page) && !PageActive(page))) {
            panic("inactive: wrong swap list.\n");
        }
        swap_list_del(page);
        if (page_ref(page) != 0) {
            swap_active_list_add(page);
            continue;
        }
        swap_entry_t entry = page->index;
        if (!try_free_swap_entry(entry)) {
            if (PageDirty(page)) {
                ClearPageDirty(page);
                swap_duplicate(entry);
                if (swapfs_write(entry, page) != 0) {
                    SetPageDirty(page);
                }
                mem_map[swap_offset(entry)]--;
                if (page_ref(page) != 0) {
                    swap_active_list_add(page);
                    continue;
                }
                if (PageDirty(page)) {
                    swap_inactive_list_add(page);
                    continue;
                }
                try_free_swap_entry(entry);
            }
        }
        free_count++;
        swap_free_page(page);
    }
    return free_count;
}
int
do_pgfault(struct mm_struct *mm, machine_word_t error_code, uintptr_t addr) {
    if (mm == NULL) {
        assert(current != NULL);
        /* Chen Yuheng
         * give handler a chance to deal with it
         */
        kprintf("page fault in kernel thread: pid = %d, name = %s, %d %08x.\n",
                current->pid, current->name, error_code, addr);
        return -E_KILLED;
    }

    bool need_unlock = 1;
    if (!try_lock_mm(mm)) {
        if (current != NULL && mm->locked_by == current->pid) {
            need_unlock = 0;
        } else {
            lock_mm(mm);
        }
    }

    int ret = -E_INVAL;
    struct vma_struct *vma = find_vma(mm, addr);
    if (vma == NULL || vma->vm_start > addr) {
        goto failed;
    }
    if (vma->vm_flags & VM_STACK) {
        if (addr < vma->vm_start + PGSIZE) {
            goto failed;
        }
    }
    //kprintf("@ %x %08x\n", vma->vm_flags, vma->vm_start);
    //assert((vma->vm_flags & VM_IO)==0);
    if (vma->vm_flags & VM_IO) {
        ret = -E_INVAL;
        goto failed;
    }
    switch (error_code & 3) {
    default:            /* default is 3: write, present */
    case 2:             /* write, not present */
        if (!(vma->vm_flags & VM_WRITE)) {
            goto failed;
        }
        break;
    case 1:             /* read, present */
        goto failed;
    case 0:             /* read, not present */
        if (!(vma->vm_flags & (VM_READ | VM_EXEC))) {
            goto failed;
        }
    }

    pte_perm_t perm, nperm;
#ifdef ARCH_ARM
    /* ARM9 software emulated PTE_xxx */
    perm = PTE_P | PTE_U;
    if (vma->vm_flags & VM_WRITE) {
        perm |= PTE_W;
    }
#else
    ptep_unmap(&perm);
    ptep_set_u_read(&perm);
    if (vma->vm_flags & VM_WRITE) {
        ptep_set_u_write(&perm);
    }
#endif
    addr = ROUNDDOWN(addr, PGSIZE);

    ret = -E_NO_MEM;

    pte_t *ptep;
    if ((ptep = get_pte(mm->pgdir, addr, 1)) == NULL) {
        goto failed;
    }
    if (ptep_invalid(ptep)) {
#ifdef UCONFIG_BIONIC_LIBC
        if (vma->mfile.file != NULL) {
            struct file *file = vma->mfile.file;
            off_t old_pos = file->pos, new_pos = vma->mfile.offset + addr - vma->vm_start;
#ifdef SHARE_MAPPED_FILE
            struct mapped_addr *maddr = find_maddr(file, new_pos, NULL);
            if (maddr == NULL) {
#endif // SHARE_MAPPED_FILE
                struct Page *page;
                if ((page = alloc_page()) == NULL) {
                    assert(false);
                    goto failed;
                }
                nperm = perm;
#ifdef ARCH_ARM
                /* ARM9 software emulated PTE_xxx */
                nperm &= ~PTE_W;
#else
                ptep_unset_s_write(&nperm);
#endif
                page_insert_pte(mm->pgdir, page, ptep, addr, nperm);

                if ((ret = filestruct_setpos(file, new_pos)) != 0) {
                    assert(false);
                    goto failed;
                }
                filestruct_read(file, page2kva(page), PGSIZE);
                if ((ret = filestruct_setpos(file, old_pos)) != 0) {
                    assert(false);
                    goto failed;
                }
#ifdef SHARE_MAPPED_FILE
                if ((maddr = (struct mapped_addr *)kmalloc(sizeof(struct mapped_addr))) != NULL) {
                    maddr->page = page;
                    maddr->offset = new_pos;
                    page->maddr = maddr;
                    list_add(&(file->node->mapped_addr_list), &(maddr->list));
                } else {
                    assert(false);
                }
            } else {
                nperm = perm;
#ifdef ARCH_ARM
                /* ARM9 software emulated PTE_xxx */
                nperm &= ~PTE_W;
#else
                ptep_unset_s_write(&nperm);
#endif
                page_insert_pte(mm->pgdir, maddr->page, ptep, addr, nperm);
            }
#endif //SHARE_MAPPED_FILE
        } else
#endif //UCONFIG_BIONIC_LIBC
        if (!(vma->vm_flags & VM_SHARE)) {
            if (pgdir_alloc_page(mm->pgdir, addr, perm) == NULL) {
                goto failed;
            }
#ifdef UCONFIG_BIONIC_LIBC
            if (vma->vm_flags & VM_ANONYMOUS) {
                memset((void *)addr, 0, PGSIZE);
            }
#endif //UCONFIG_BIONIC_LIBC
        } else {        //shared mem
            lock_shmem(vma->shmem);
            uintptr_t shmem_addr = addr - vma->vm_start + vma->shmem_off;
            pte_t *sh_ptep = shmem_get_entry(vma->shmem, shmem_addr, 1);
            if (sh_ptep == NULL || ptep_invalid(sh_ptep)) {
                unlock_shmem(vma->shmem);
                goto failed;
            }
            unlock_shmem(vma->shmem);
            if (ptep_present(sh_ptep)) {
                page_insert(mm->pgdir, pa2page(*sh_ptep), addr, perm);
            } else {
#ifdef UCONFIG_SWAP
                swap_duplicate(*ptep);
                ptep_copy(ptep, sh_ptep);
#else
                panic("NO SWAP\n");
#endif
            }
        }
    } else {            //a present page, handle copy-on-write (cow)
        struct Page *page, *newpage = NULL;
        bool cow = ((vma->vm_flags & (VM_SHARE | VM_WRITE)) == VM_WRITE), may_copy = 1;

#if 1
        if (!(!ptep_present(ptep) ||
              ((error_code & 2) && !ptep_u_write(ptep) && cow))) {
            //assert(PADDR(mm->pgdir) == rcr3());
            kprintf("%p %p %d %d %x\n", *ptep, addr, error_code, cow, vma->vm_flags);
            assert(0);
        }
#endif

        if (cow) {
            newpage = alloc_page();
        }
        if (ptep_present(ptep)) {
            page = pte2page(*ptep);
        } else {
#ifdef UCONFIG_SWAP
            if ((ret = swap_in_page(*ptep, &page)) != 0) {
                if (newpage != NULL) {
                    free_page(newpage);
                }
                goto failed;
            }
#else
            assert(0);
#endif
            if (!(error_code & 2) && cow) {
#ifdef ARCH_ARM
                //#warning ARM9 software emulated PTE_xxx
                perm &= ~PTE_W;
#else
                ptep_unset_s_write(&perm);
#endif
                may_copy = 0;
            }
        }

        if (cow && may_copy) {
#ifdef UCONFIG_SWAP
            if (page_ref(page) + swap_page_count(page) > 1) {
#else
            if (page_ref(page) > 1) {
#endif
                if (newpage == NULL) {
                    goto failed;
                }
                memcpy(page2kva(newpage), page2kva(page), PGSIZE);
                //kprintf("COW!\n");
                page = newpage, newpage = NULL;
            }
        }
#ifdef UCONFIG_BIONIC_LIBC
        else if (vma->mfile.file != NULL) {
#ifdef UCONFIG_SWAP
            assert(page_ref(page) + swap_page_count(page) == 1);
#else
            assert(page_ref(page) == 1);
#endif
#ifdef SHARE_MAPPED_FILE
            off_t offset = vma->mfile.offset + addr - vma->vm_start;
            struct mapped_addr *maddr = find_maddr(vma->mfile.file, offset, page);
            if (maddr != NULL) {
                list_del(&(maddr->list));
                kfree(maddr);
                page->maddr = NULL;
                assert(find_maddr(vma->mfile.file, offset, page) == NULL);
            } else {
            }
#endif //SHARE_MAPPED_FILE
        }
#endif //UCONFIG_BIONIC_LIBC
        else {
        }

        page_insert(mm->pgdir, page, addr, perm);
        if (newpage != NULL) {
            free_page(newpage);
        }
    }
    ret = 0;

failed:
    if (need_unlock) {
        unlock_mm(mm);
    }
    return ret;
}
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
    struct swap_info_struct *si = &swap_info[type];
    struct mm_struct *start_mm;
    unsigned short *swap_map;
    unsigned short swcount;
    struct page *page;
    swp_entry_t entry;
    unsigned int i = 0;
    int retval = 0;
    int reset_overflow = 0;
    int shmem;

    /*
     * When searching mms for an entry, a good strategy is to
     * start at the first mm we freed the previous entry from
     * (though actually we don't notice whether we or coincidence
     * freed the entry).  Initialize this start_mm with a hold.
     *
     * A simpler strategy would be to start at the last mm we
     * freed the previous entry from; but that would take less
     * advantage of mmlist ordering, which clusters forked mms
     * together, child after parent.  If we race with dup_mmap(), we
     * prefer to resolve parent before child, lest we miss entries
     * duplicated after we scanned child: using last mm would invert
     * that.  Though it's only a serious concern when an overflowed
     * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
     */
    start_mm = &init_mm;
    atomic_inc(&init_mm.mm_users);

    /*
     * Keep on scanning until all entries have gone.  Usually,
     * one pass through swap_map is enough, but not necessarily:
     * there are races when an instance of an entry might be missed.
     */
    while ((i = find_next_to_unuse(si, i)) != 0) {
        if (signal_pending(current)) {
            retval = -EINTR;
            break;
        }

        /*
         * Get a page for the entry, using the existing swap
         * cache page if there is one.  Otherwise, get a clean
         * page and read the swap into it.
         */
        swap_map = &si->swap_map[i];
        entry = swp_entry(type, i);
        page = read_swap_cache_async(entry, NULL, 0);
        if (!page) {
            /*
             * Either swap_duplicate() failed because entry
             * has been freed independently, and will not be
             * reused since sys_swapoff() already disabled
             * allocation from here, or alloc_page() failed.
             */
            if (!*swap_map)
                continue;
            retval = -ENOMEM;
            break;
        }

        /*
         * Don't hold on to start_mm if it looks like exiting.
         */
        if (atomic_read(&start_mm->mm_users) == 1) {
            mmput(start_mm);
            start_mm = &init_mm;
            atomic_inc(&init_mm.mm_users);
        }

        /*
         * Wait for and lock page.  When do_swap_page races with
         * try_to_unuse, do_swap_page can handle the fault much
         * faster than try_to_unuse can locate the entry.  This
         * apparently redundant "wait_on_page_locked" lets try_to_unuse
         * defer to do_swap_page in such a case - in some tests,
         * do_swap_page and try_to_unuse repeatedly compete.
         */
        wait_on_page_locked(page);
        wait_on_page_writeback(page);
        lock_page(page);
        wait_on_page_writeback(page);

        /*
         * Remove all references to entry.
         * Whenever we reach init_mm, there's no address space
         * to search, but use it as a reminder to search shmem.
         */
        shmem = 0;
        swcount = *swap_map;
        if (swcount > 1) {
            if (start_mm == &init_mm)
                shmem = shmem_unuse(entry, page);
            else
                retval = unuse_mm(start_mm, entry, page);
        }
        if (*swap_map > 1) {
            int set_start_mm = (*swap_map >= swcount);
            struct list_head *p = &start_mm->mmlist;
            struct mm_struct *new_start_mm = start_mm;
            struct mm_struct *prev_mm = start_mm;
            struct mm_struct *mm;

            atomic_inc(&new_start_mm->mm_users);
            atomic_inc(&prev_mm->mm_users);
            spin_lock(&mmlist_lock);
            while (*swap_map > 1 && !retval &&
                   (p = p->next) != &start_mm->mmlist) {
                mm = list_entry(p, struct mm_struct, mmlist);
                if (!atomic_inc_not_zero(&mm->mm_users))
                    continue;
                spin_unlock(&mmlist_lock);
                mmput(prev_mm);
                prev_mm = mm;

                cond_resched();

                swcount = *swap_map;
                if (swcount <= 1)
                    ;
                else if (mm == &init_mm) {
                    set_start_mm = 1;
                    shmem = shmem_unuse(entry, page);
                } else
                    retval = unuse_mm(mm, entry, page);
                if (set_start_mm && *swap_map < swcount) {
                    mmput(new_start_mm);
                    atomic_inc(&mm->mm_users);
                    new_start_mm = mm;
                    set_start_mm = 0;
                }
                spin_lock(&mmlist_lock);
            }
            spin_unlock(&mmlist_lock);
            mmput(prev_mm);
            mmput(start_mm);
            start_mm = new_start_mm;
        }
        if (retval) {
            unlock_page(page);
            page_cache_release(page);
            break;
        }

        /*
         * How could swap count reach 0x7fff when the maximum
         * pid is 0x7fff, and there's no way to repeat a swap
         * page within an mm (except in shmem, where it's the
         * shared object which takes the reference count)?
         * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
         *
         * If that's wrong, then we should worry more about
         * exit_mmap() and do_munmap() cases described above:
         * we might be resetting SWAP_MAP_MAX too early here.
         * We know "Undead"s can happen, they're okay, so don't
         * report them; but do report if we reset SWAP_MAP_MAX.
         */
        if (*swap_map == SWAP_MAP_MAX) {
            spin_lock(&swap_lock);
            *swap_map = 1;
            spin_unlock(&swap_lock);
            reset_overflow = 1;
        }

        /*
         * If a reference remains (rare), we would like to leave
         * the page in the swap cache; but try_to_unmap could
         * then re-duplicate the entry once we drop page lock,
         * so we might loop indefinitely; also, that page could
         * not be swapped out to other storage meanwhile.  So:
         * delete from cache even if there's another reference,
         * after ensuring that the data has been saved to disk -
         * since if the reference remains (rarer), it will be
         * read from disk into another page.  Splitting into two
         * pages would be incorrect if swap supported "shared
         * private" pages, but they are handled by tmpfs files.
         *
         * Note shmem_unuse already deleted a swappage from
         * the swap cache, unless the move to filepage failed:
         * in which case it left swappage in cache, lowered its
         * swap count to pass quickly through the loops above,
         * and now we must reincrement count to try again later.
         */
        if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
            struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
            };

            swap_writepage(page, &wbc);
            lock_page(page);
            wait_on_page_writeback(page);
        }
        if (PageSwapCache(page)) {
            if (shmem)
                swap_duplicate(entry);
            else
                delete_from_swap_cache(page);
        }

        /*
         * So we could skip searching mms once swap count went
         * to 1, we did not mark any present ptes as dirty: must
         * mark page dirty so shrink_page_list will preserve it.
         */
        SetPageDirty(page);
        unlock_page(page);
        page_cache_release(page);

        /*
         * Make sure that we aren't completely killing
         * interactive performance.
         */
        cond_resched();
    }
/* ucore uses copy-on-write when forking a new process, so copy_range
 * only copies the pdt/pte and sets their permission to READONLY;
 * a write will be handled in pgfault */
int
copy_range(pgd_t *to, pgd_t *from, uintptr_t start, uintptr_t end, bool share) {
    assert(start % PGSIZE == 0 && end % PGSIZE == 0);
    assert(USER_ACCESS(start, end));

    do {
        pte_t *ptep = get_pte(from, start, 0), *nptep;
        if (ptep == NULL) {
            if (get_pud(from, start, 0) == NULL) {
                start = ROUNDDOWN(start + PUSIZE, PUSIZE);
            } else if (get_pmd(from, start, 0) == NULL) {
                start = ROUNDDOWN(start + PMSIZE, PMSIZE);
            } else {
                start = ROUNDDOWN(start + PTSIZE, PTSIZE);
            }
            continue;
        }
        if (*ptep != 0) {
            if ((nptep = get_pte(to, start, 1)) == NULL) {
                return -E_NO_MEM;
            }
            int ret;
            //kprintf("%08x %08x %08x\n", nptep, *nptep, start);
            assert(*ptep != 0 && *nptep == 0);
#ifdef ARCH_ARM
            //TODO: add code to handle swap
            if (ptep_present(ptep)) {
                // nobody should be able to write this page
                // before a W-pgfault
                pte_perm_t perm = PTE_P;
                if (ptep_u_read(ptep))
                    perm |= PTE_U;
                if (!share) {
                    // The original page should be set to read-only,
                    // because copy-on-write may happen after the
                    // current process modifies its page.
                    ptep_set_perm(ptep, perm);
                } else {
                    if (ptep_u_write(ptep)) {
                        perm |= PTE_W;
                    }
                }
                struct Page *page = pte2page(*ptep);
                ret = page_insert(to, page, start, perm);
            }
#else /* ARCH_ARM */
            if (ptep_present(ptep)) {
                pte_perm_t perm = ptep_get_perm(ptep, PTE_USER);
                struct Page *page = pte2page(*ptep);
                if (!share && ptep_s_write(ptep)) {
                    ptep_unset_s_write(&perm);
                    pte_perm_t perm_with_swap_stat = ptep_get_perm(ptep, PTE_SWAP);
                    ptep_set_perm(&perm_with_swap_stat, perm);
                    page_insert(from, page, start, perm_with_swap_stat);
                }
                ret = page_insert(to, page, start, perm);
                assert(ret == 0);
            }
#endif /* ARCH_ARM */
            else {
#ifdef CONFIG_NO_SWAP
                assert(0);
#endif
                swap_entry_t entry;
                ptep_copy(&entry, ptep);
                swap_duplicate(entry);
                ptep_copy(nptep, &entry);
            }
        }
        start += PGSIZE;
    } while (start != 0 && start < end);
#ifdef ARCH_ARM
    /* we have modified the PTE of the original
     * process, so invalidate TLB */
    tlb_invalidate_all();
#endif
    return 0;
}
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out.  Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct mm_struct *mm, struct vm_area_struct *vma,
               unsigned long address, pte_t *page_table, int gfp_mask)
{
    pte_t pte;
    swp_entry_t entry;
    struct page *page;
    int onlist;

    pte = *page_table;
    if (!pte_present(pte))
        goto out_failed;
    page = pte_page(pte);
    if ((!VALID_PAGE(page)) || PageReserved(page))
        goto out_failed;

    if (mm->swap_cnt)
        mm->swap_cnt--;

    onlist = PageActive(page);
    /* Don't look at this pte if it's been accessed recently. */
    if (ptep_test_and_clear_young(page_table)) {
        age_page_up(page);
        goto out_failed;
    }
    if (!onlist)
        /* The page is still mapped, so it can't be freeable... */
        age_page_down_ageonly(page);

    /*
     * If the page is in active use by us, or if the page
     * is in active use by others, don't unmap it or
     * (worse) start unneeded IO.
     */
    if (page->age > 0)
        goto out_failed;

    if (TryLockPage(page))
        goto out_failed;

    /* From this point on, the odds are that we're going to
     * nuke this pte, so read and clear the pte.  This hook
     * is needed on CPUs which update the accessed and dirty
     * bits in hardware.
     */
    pte = ptep_get_and_clear(page_table);

    /*
     * Is the page already in the swap cache? If so, then
     * we can just drop our reference to it without doing
     * any IO - it's already up-to-date on disk.
     *
     * Return 0, as we didn't actually free any real
     * memory, and we should just continue our scan.
     */
    if (PageSwapCache(page)) {
        entry.val = page->index;
        if (pte_dirty(pte))
            set_page_dirty(page);
set_swap_pte:
        swap_duplicate(entry);
        set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
        UnlockPage(page);
        mm->rss--;
        flush_tlb_page(vma, address);
        deactivate_page(page);
        page_cache_release(page);
out_failed:
        return 0;
    }

    /*
     * Is it a clean page? Then it must be recoverable
     * by just paging it in again, and we can just drop
     * it..
     *
     * However, this won't actually free any real
     * memory, as the page will just be in the page cache
     * somewhere, and as such we should just continue
     * our scan.
     *
     * Basically, this just makes it possible for us to do
     * some real work in the future in "refill_inactive()".
     */
    flush_cache_page(vma, address);
    if (!pte_dirty(pte))
        goto drop_pte;

    /*
     * Ok, it's really dirty. That means that
     * we should either create a new swap cache
     * entry for it, or we should write it back
     * to its own backing store.
     */
    if (page->mapping) {
        set_page_dirty(page);
        goto drop_pte;
    }

    /*
     * This is a dirty, swappable page.  First of all,
     * get a suitable swap entry for it, and make sure
     * we have the swap cache set up to associate the
     * page with that swap entry.
     */
    entry = get_swap_page();
    if (!entry.val)
        goto out_unlock_restore;        /* No swap space left */

    /* Add it to the swap cache and mark it dirty */
    add_to_swap_cache(page, entry);
    set_page_dirty(page);
    goto set_swap_pte;

out_unlock_restore:
    set_pte(page_table, pte);
    UnlockPage(page);
    return 0;
}
// do_pgfault - interrupt handler to process the page fault exception
int
do_pgfault(struct mm_struct *mm, uint32_t error_code, uintptr_t addr) {
    if (mm == NULL) {
        assert(current != NULL);
        panic("page fault in kernel thread: pid = %d, %d %08x.\n",
              current->pid, error_code, addr);
    }
    lock_mm(mm);

    int ret = -E_INVAL;
    struct vma_struct *vma = find_vma(mm, addr);
    if (vma == NULL || vma->vm_start > addr) {
        goto failed;
    }
    if (vma->vm_flags & VM_STACK) {
        if (addr < vma->vm_start + PGSIZE) {
            goto failed;
        }
    }
    switch (error_code & 3) {
    default:            /* default is 3: write, present */
    case 2:             /* write, not present */
        if (!(vma->vm_flags & VM_WRITE)) {
            goto failed;
        }
        break;
    case 1:             /* read, present */
        goto failed;
    case 0:             /* read, not present */
        if (!(vma->vm_flags & (VM_READ | VM_EXEC))) {
            goto failed;
        }
    }

    uint32_t perm = PTE_U;
    if (vma->vm_flags & VM_WRITE) {
        perm |= PTE_W;
    }
    addr = ROUNDDOWN(addr, PGSIZE);

    ret = -E_NO_MEM;

    pte_t *ptep;
    if ((ptep = get_pte(mm->pgdir, addr, 1)) == NULL) {
        goto failed;
    }
    if (*ptep == 0) {
        if (!(vma->vm_flags & VM_SHARE)) {
            if (pgdir_alloc_page(mm->pgdir, addr, perm) == NULL) {
                goto failed;
            }
        } else {
            lock_shmem(vma->shmem);
            uintptr_t shmem_addr = addr - vma->vm_start + vma->shmem_off;
            pte_t *sh_ptep = shmem_get_entry(vma->shmem, shmem_addr, 1);
            if (sh_ptep == NULL || *sh_ptep == 0) {
                unlock_shmem(vma->shmem);
                goto failed;
            }
            unlock_shmem(vma->shmem);
            if (*sh_ptep & PTE_P) {
                page_insert(mm->pgdir, pa2page(*sh_ptep), addr, perm);
            } else {
                swap_duplicate(*ptep);
                *ptep = *sh_ptep;
            }
        }
    } else {
        struct Page *page, *newpage = NULL;
        bool cow = ((vma->vm_flags & (VM_SHARE | VM_WRITE)) == VM_WRITE), may_copy = 1;
        assert(!(*ptep & PTE_P) || ((error_code & 2) && !(*ptep & PTE_W) && cow));
        if (cow) {
            newpage = alloc_page();
        }
        if (*ptep & PTE_P) {
            page = pte2page(*ptep);
        } else {
            if ((ret = swap_in_page(*ptep, &page)) != 0) {
                if (newpage != NULL) {
                    free_page(newpage);
                }
                goto failed;
            }
            if (!(error_code & 2) && cow) {
                perm &= ~PTE_W;
                may_copy = 0;
            }
        }
        if (cow && may_copy) {
            if (page_ref(page) + swap_page_count(page) > 1) {
                if (newpage == NULL) {
                    goto failed;
                }
                memcpy(page2kva(newpage), page2kva(page), PGSIZE);
                page = newpage, newpage = NULL;
            }
        }
        page_insert(mm->pgdir, page, addr, perm);
        if (newpage != NULL) {
            free_page(newpage);
        }
    }
    ret = 0;

failed:
    unlock_mm(mm);
    return ret;
}
// check_swap - check the correctness of swap & page replacement algorithm
static void
check_swap(void) {
    size_t nr_used_pages_store = nr_used_pages();
    size_t slab_allocated_store = slab_allocated();

    size_t offset;
    for (offset = 2; offset < max_swap_offset; offset++) {
        mem_map[offset] = 1;
    }

    struct mm_struct *mm = mm_create();
    assert(mm != NULL);

    extern struct mm_struct *check_mm_struct;
    assert(check_mm_struct == NULL);
    check_mm_struct = mm;

    pgd_t *pgdir = mm->pgdir = init_pgdir_get();
    assert(pgdir[PGX(TEST_PAGE)] == 0);

    struct vma_struct *vma = vma_create(TEST_PAGE, TEST_PAGE + PTSIZE, VM_WRITE | VM_READ);
    assert(vma != NULL);

    insert_vma_struct(mm, vma);

    struct Page *rp0 = alloc_page(), *rp1 = alloc_page();
    assert(rp0 != NULL && rp1 != NULL);

    pte_perm_t perm;
    ptep_unmap(&perm);
    ptep_set_u_write(&perm);
    int ret = page_insert(pgdir, rp1, TEST_PAGE, perm);
    assert(ret == 0 && page_ref(rp1) == 1);

    page_ref_inc(rp1);
    ret = page_insert(pgdir, rp0, TEST_PAGE, perm);
    assert(ret == 0 && page_ref(rp1) == 1 && page_ref(rp0) == 1);

    // check try_alloc_swap_entry
    swap_entry_t entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    mem_map[1] = 1;
    assert(try_alloc_swap_entry() == 0);

    // set rp1, Swap, Active, add to hash_list, active_list
    swap_page_add(rp1, entry);
    swap_active_list_add(rp1);
    assert(PageSwap(rp1));

    mem_map[1] = 0;
    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1));

    // check swap_remove_entry
    assert(swap_hash_find(entry) == NULL);
    mem_map[1] = 2;
    swap_remove_entry(entry);
    assert(mem_map[1] == 1);

    swap_page_add(rp1, entry);
    swap_inactive_list_add(rp1);
    swap_remove_entry(entry);
    assert(PageSwap(rp1));
    assert(rp1->index == entry && mem_map[1] == 0);

    // check page_launder, move page from inactive_list to active_list
    assert(page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && nr_inactive_pages == 1);
    assert(list_next(&(inactive_list.swap_list)) == &(rp1->swap_link));

    page_launder();
    assert(nr_active_pages == 1 && nr_inactive_pages == 0);
    assert(PageSwap(rp1) && PageActive(rp1));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1) && nr_active_pages == 0);
    assert(list_empty(&(active_list.swap_list)));

    // set rp1 inactive again
    assert(page_ref(rp1) == 1);
    swap_page_add(rp1, 0);
    assert(PageSwap(rp1) && swap_offset(rp1->index) == 1);
    swap_inactive_list_add(rp1);
    mem_map[1] = 1;
    assert(nr_inactive_pages == 1);
    page_ref_dec(rp1);

    size_t count = nr_used_pages();
    swap_remove_entry(entry);
    assert(nr_inactive_pages == 0 && nr_used_pages() == count - 1);

    // check swap_out_mm
    pte_t *ptep0 = get_pte(pgdir, TEST_PAGE, 0), *ptep1;
    assert(ptep0 != NULL && pte2page(*ptep0) == rp0);

    ret = swap_out_mm(mm, 0);
    assert(ret == 0);

    ret = swap_out_mm(mm, 10);
    assert(ret == 1 && mm->swap_address == TEST_PAGE + PGSIZE);

    ret = swap_out_mm(mm, 10);
    assert(ret == 0 && *ptep0 == entry && mem_map[1] == 1);
    assert(PageDirty(rp0) && PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    // check refill_inactive_scan()
    refill_inactive_scan();
    assert(!PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_inactive_pages == 1 && list_next(&(inactive_list.swap_list)) == &(rp0->swap_link));

    page_ref_inc(rp0);
    page_launder();
    assert(PageActive(rp0) && page_ref(rp0) == 1);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    page_ref_dec(rp0);
    refill_inactive_scan();
    assert(!PageActive(rp0));

    // save data in rp0
    int i;
    for (i = 0; i < PGSIZE; i++) {
        ((char *)page2kva(rp0))[i] = (char)i;
    }

    page_launder();
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));
    assert(mem_map[1] == 1);

    rp1 = alloc_page();
    assert(rp1 != NULL);
    ret = swapfs_read(entry, rp1);
    assert(ret == 0);

    for (i = 0; i < PGSIZE; i++) {
        assert(((char *)page2kva(rp1))[i] == (char)i);
    }

    // page fault now
    *(char *)(TEST_PAGE) = 0xEF;

    rp0 = pte2page(*ptep0);
    assert(page_ref(rp0) == 1);
    assert(PageSwap(rp0) && PageActive(rp0));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(!PageSwap(rp0) && nr_active_pages == 0 && nr_inactive_pages == 0);

    // clear accessed flag
    assert(rp0 == pte2page(*ptep0));
    assert(!PageSwap(rp0));

    ret = swap_out_mm(mm, 10);
    assert(ret == 0);
    assert(!PageSwap(rp0) && ptep_present(ptep0));

    // change page table
    ret = swap_out_mm(mm, 10);
    assert(ret == 1);
    assert(*ptep0 == entry && page_ref(rp0) == 0 && mem_map[1] == 1);

    count = nr_used_pages();
    refill_inactive_scan();
    page_launder();
    assert(count - 1 == nr_used_pages());

    ret = swapfs_read(entry, rp1);
    assert(ret == 0 && *(char *)(page2kva(rp1)) == (char)0xEF);
    free_page(rp1);

    // duplicate *ptep0
    ptep1 = get_pte(pgdir, TEST_PAGE + PGSIZE, 0);
    assert(ptep1 != NULL && ptep_invalid(ptep1));
    swap_duplicate(*ptep0);
    ptep_copy(ptep1, ptep0);
    mp_tlb_invalidate(pgdir, TEST_PAGE + PGSIZE);

    // page fault again
    // update for copy on write
    *(char *)(TEST_PAGE + 1) = 0x88;
    *(char *)(TEST_PAGE + PGSIZE) = 0x8F;
    *(char *)(TEST_PAGE + PGSIZE + 1) = 0xFF;
    assert(pte2page(*ptep0) != pte2page(*ptep1));
    assert(*(char *)(TEST_PAGE) == (char)0xEF);
    assert(*(char *)(TEST_PAGE + 1) == (char)0x88);
    assert(*(char *)(TEST_PAGE + PGSIZE) == (char)0x8F);
    assert(*(char *)(TEST_PAGE + PGSIZE + 1) == (char)0xFF);

    rp0 = pte2page(*ptep0);
    rp1 = pte2page(*ptep1);
    assert(!PageSwap(rp0) && PageSwap(rp1) && PageActive(rp1));

    entry = try_alloc_swap_entry();
    assert(!PageSwap(rp0) && !PageSwap(rp1));
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(list_empty(&(active_list.swap_list)));
    assert(list_empty(&(inactive_list.swap_list)));

    ptep_set_accessed(&perm);
    page_insert(pgdir, rp0, TEST_PAGE + PGSIZE, perm);

    // check swap_out_mm
    *(char *)(TEST_PAGE) = *(char *)(TEST_PAGE + PGSIZE) = 0xEE;
    mm->swap_address = TEST_PAGE + PGSIZE * 2;
    ret = swap_out_mm(mm, 2);
    assert(ret == 0);
    assert(ptep_present(ptep0) && !ptep_accessed(ptep0));
    assert(ptep_present(ptep1) && !ptep_accessed(ptep1));

    ret = swap_out_mm(mm, 2);
    assert(ret == 2);
    assert(mem_map[1] == 2 && page_ref(rp0) == 0);

    refill_inactive_scan();
    page_launder();
    assert(mem_map[1] == 2 && swap_hash_find(entry) == NULL);

    // check copy entry
    swap_remove_entry(entry);
    ptep_unmap(ptep1);
    assert(mem_map[1] == 1);

    swap_entry_t store;
    ret = swap_copy_entry(entry, &store);
    assert(ret == -E_NO_MEM);
    mem_map[2] = SWAP_UNUSED;

    ret = swap_copy_entry(entry, &store);
    assert(ret == 0 && swap_offset(store) == 2 && mem_map[2] == 0);
    mem_map[2] = 1;
    ptep_copy(ptep1, &store);

    assert(*(char *)(TEST_PAGE + PGSIZE) == (char)0xEE && *(char *)(TEST_PAGE + PGSIZE + 1) == (char)0x88);

    *(char *)(TEST_PAGE + PGSIZE) = 1, *(char *)(TEST_PAGE + PGSIZE + 1) = 2;
    assert(*(char *)TEST_PAGE == (char)0xEE && *(char *)(TEST_PAGE + 1) == (char)0x88);

    ret = swap_in_page(entry, &rp0);
    assert(ret == 0);
    ret = swap_in_page(store, &rp1);
    assert(ret == 0);
    assert(rp1 != rp0);

    // free memory
    swap_list_del(rp0), swap_list_del(rp1);
    swap_page_del(rp0), swap_page_del(rp1);
    assert(page_ref(rp0) == 1 && page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && list_empty(&(active_list.swap_list)));
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));

    for (i = 0; i < HASH_LIST_SIZE; i++) {
        assert(list_empty(hash_list + i));
    }

    page_remove(pgdir, TEST_PAGE);
    page_remove(pgdir, (TEST_PAGE + PGSIZE));

#if PMXSHIFT != PUXSHIFT
    free_page(pa2page(PMD_ADDR(*get_pmd(pgdir, TEST_PAGE, 0))));
#endif
#if PUXSHIFT != PGXSHIFT
    free_page(pa2page(PUD_ADDR(*get_pud(pgdir, TEST_PAGE, 0))));
#endif
    free_page(pa2page(PGD_ADDR(*get_pgd(pgdir, TEST_PAGE, 0))));
    pgdir[PGX(TEST_PAGE)] = 0;

    mm->pgdir = NULL;
    mm_destroy(mm);
    check_mm_struct = NULL;

    assert(nr_active_pages == 0 && nr_inactive_pages == 0);
    for (offset = 0; offset < max_swap_offset; offset++) {
        mem_map[offset] = SWAP_UNUSED;
    }

    assert(nr_used_pages_store == nr_used_pages());
    assert(slab_allocated_store == slab_allocated());

    kprintf("check_swap() succeeded.\n");
}
// check_swap - check the correctness of swap & page replacement algorithm
static void
check_swap(void) {
    size_t nr_free_pages_store = nr_free_pages();
    size_t slab_allocated_store = slab_allocated();

    size_t offset;
    for (offset = 2; offset < max_swap_offset; offset++) {
        mem_map[offset] = 1;
    }

    struct mm_struct *mm = mm_create();
    assert(mm != NULL);

    extern struct mm_struct *check_mm_struct;
    assert(check_mm_struct == NULL);
    check_mm_struct = mm;

    pde_t *pgdir = mm->pgdir = boot_pgdir;
    assert(pgdir[0] == 0);

    struct vma_struct *vma = vma_create(0, PTSIZE, VM_WRITE | VM_READ);
    assert(vma != NULL);

    insert_vma_struct(mm, vma);

    struct Page *rp0 = alloc_page(), *rp1 = alloc_page();
    assert(rp0 != NULL && rp1 != NULL);

    uint32_t perm = PTE_U | PTE_W;
    int ret = page_insert(pgdir, rp1, 0, perm);
    assert(ret == 0 && page_ref(rp1) == 1);

    page_ref_inc(rp1);
    ret = page_insert(pgdir, rp0, 0, perm);
    assert(ret == 0 && page_ref(rp1) == 1 && page_ref(rp0) == 1);

    // check try_alloc_swap_entry
    swap_entry_t entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    mem_map[1] = 1;
    assert(try_alloc_swap_entry() == 0);

    // set rp1, Swap, Active, add to hash_list, active_list
    swap_page_add(rp1, entry);
    swap_active_list_add(rp1);
    assert(PageSwap(rp1));

    mem_map[1] = 0;
    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1));

    // check swap_remove_entry
    assert(swap_hash_find(entry) == NULL);
    mem_map[1] = 2;
    swap_remove_entry(entry);
    assert(mem_map[1] == 1);

    swap_page_add(rp1, entry);
    swap_inactive_list_add(rp1);
    swap_remove_entry(entry);
    assert(PageSwap(rp1));
    assert(rp1->index == entry && mem_map[1] == 0);

    // check page_launder, move page from inactive_list to active_list
    assert(page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && nr_inactive_pages == 1);
    assert(list_next(&(inactive_list.swap_list)) == &(rp1->swap_link));

    page_launder();
    assert(nr_active_pages == 1 && nr_inactive_pages == 0);
    assert(PageSwap(rp1) && PageActive(rp1));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1) && nr_active_pages == 0);
    assert(list_empty(&(active_list.swap_list)));

    // set rp1 inactive again
    assert(page_ref(rp1) == 1);
    swap_page_add(rp1, 0);
    assert(PageSwap(rp1) && swap_offset(rp1->index) == 1);
    swap_inactive_list_add(rp1);
    mem_map[1] = 1;
    assert(nr_inactive_pages == 1);
    page_ref_dec(rp1);

    size_t count = nr_free_pages();
    swap_remove_entry(entry);
    assert(nr_inactive_pages == 0 && nr_free_pages() == count + 1);

    // check swap_out_mm
    pte_t *ptep0 = get_pte(pgdir, 0, 0), *ptep1;
    assert(ptep0 != NULL && pte2page(*ptep0) == rp0);

    ret = swap_out_mm(mm, 0);
    assert(ret == 0);

    ret = swap_out_mm(mm, 10);
    assert(ret == 1 && mm->swap_address == PGSIZE);

    ret = swap_out_mm(mm, 10);
    assert(ret == 0 && *ptep0 == entry && mem_map[1] == 1);
    assert(PageDirty(rp0) && PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    // check refill_inactive_scan()
    refill_inactive_scan();
    assert(!PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_inactive_pages == 1 && list_next(&(inactive_list.swap_list)) == &(rp0->swap_link));

    page_ref_inc(rp0);
    page_launder();
    assert(PageActive(rp0) && page_ref(rp0) == 1);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    page_ref_dec(rp0);
    refill_inactive_scan();
    assert(!PageActive(rp0));

    // save data in rp0
    int i;
    for (i = 0; i < PGSIZE; i++) {
        ((char *)page2kva(rp0))[i] = (char)i;
    }

    page_launder();
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));
    assert(mem_map[1] == 1);

    rp1 = alloc_page();
    assert(rp1 != NULL);
    ret = swapfs_read(entry, rp1);
    assert(ret == 0);

    for (i = 0; i < PGSIZE; i++) {
        assert(((char *)page2kva(rp1))[i] == (char)i);
    }

    // page fault now
    *(char *)0 = 0xEF;

    rp0 = pte2page(*ptep0);
    assert(page_ref(rp0) == 1);
    assert(PageSwap(rp0) && PageActive(rp0));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(!PageSwap(rp0) && nr_active_pages == 0 && nr_inactive_pages == 0);

    // clear accessed flag
    assert(rp0 == pte2page(*ptep0));
    assert(!PageSwap(rp0));

    ret = swap_out_mm(mm, 10);
    assert(ret == 0);
    assert(!PageSwap(rp0) && (*ptep0 & PTE_P));

    // change page table
    ret = swap_out_mm(mm, 10);
    assert(ret == 1);
    assert(*ptep0 == entry && page_ref(rp0) == 0 && mem_map[1] == 1);

    count = nr_free_pages();
    refill_inactive_scan();
    page_launder();
    assert(count + 1 == nr_free_pages());

    ret = swapfs_read(entry, rp1);
    assert(ret == 0 && *(char *)(page2kva(rp1)) == (char)0xEF);
    free_page(rp1);

    // duplicate *ptep0
    ptep1 = get_pte(pgdir, PGSIZE, 0);
    assert(ptep1 != NULL && *ptep1 == 0);
    swap_duplicate(*ptep0);
    *ptep1 = *ptep0;

    // page fault again
    *(char *)0 = 0xFF;
    *(char *)(PGSIZE + 1) = 0x88;

    assert(pte2page(*ptep0) == pte2page(*ptep1));
    rp0 = pte2page(*ptep0);
    assert(*(char *)1 == (char)0x88 && *(char *)PGSIZE == (char)0xFF);
    assert(page_ref(rp0) == 2 && rp0->index == entry && mem_map[1] == 0);
    assert(PageSwap(rp0) && PageActive(rp0));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(!PageSwap(rp0));
    assert(list_empty(&(active_list.swap_list)));
    assert(list_empty(&(inactive_list.swap_list)));

    // check swap_out_mm
    *(char *)0 = *(char *)PGSIZE = 0xEE;
    mm->swap_address = PGSIZE * 2;
    ret = swap_out_mm(mm, 2);
    assert(ret == 0);
    assert((*ptep0 & PTE_P) && !(*ptep0 & PTE_A));
    assert((*ptep1 & PTE_P) && !(*ptep1 & PTE_A));

    ret = swap_out_mm(mm, 2);
    assert(ret == 2);
    assert(mem_map[1] == 2 && page_ref(rp0) == 0);

    refill_inactive_scan();
    page_launder();
    assert(mem_map[1] == 2 && swap_hash_find(entry) == NULL);

    // check copy entry
    swap_remove_entry(entry);
    *ptep1 = 0;
    assert(mem_map[1] == 1);

    swap_entry_t store;
    ret = swap_copy_entry(entry, &store);
    assert(ret == -E_NO_MEM);
    mem_map[2] = SWAP_UNUSED;

    ret = swap_copy_entry(entry, &store);
    assert(ret == 0 && swap_offset(store) == 2 && mem_map[2] == 0);
    mem_map[2] = 1;
    *ptep1 = store;

    assert(*(char *)PGSIZE == (char)0xEE && *(char *)(PGSIZE + 1) == (char)0x88);

    *(char *)PGSIZE = 1, *(char *)(PGSIZE + 1) = 2;
    assert(*(char *)0 == (char)0xEE && *(char *)1 == (char)0x88);

    ret = swap_in_page(entry, &rp0);
    assert(ret == 0);
    ret = swap_in_page(store, &rp1);
    assert(ret == 0);
    assert(rp1 != rp0);

    // free memory
    swap_list_del(rp0), swap_list_del(rp1);
    swap_page_del(rp0), swap_page_del(rp1);
    assert(page_ref(rp0) == 1 && page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && list_empty(&(active_list.swap_list)));
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));

    for (i = 0; i < HASH_LIST_SIZE; i++) {
        assert(list_empty(hash_list + i));
    }

    page_remove(pgdir, 0);
    page_remove(pgdir, PGSIZE);

    free_page(pa2page(pgdir[0]));
    pgdir[0] = 0;

    mm->pgdir = NULL;
    mm_destroy(mm);
    check_mm_struct = NULL;

    assert(nr_active_pages == 0 && nr_inactive_pages == 0);
    for (offset = 0; offset < max_swap_offset; offset++) {
        mem_map[offset] = SWAP_UNUSED;
    }

    assert(nr_free_pages_store == nr_free_pages());
    assert(slab_allocated_store == slab_allocated());

    cprintf("check_swap() succeeded.\n");
}
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
            struct vm_area_struct *vma)
{
    pgd_t *src_pgd, *dst_pgd;
    unsigned long address = vma->vm_start;
    unsigned long end = vma->vm_end;
    unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

    src_pgd = pgd_offset(src, address) - 1;
    dst_pgd = pgd_offset(dst, address) - 1;

    for (;;) {
        pmd_t *src_pmd, *dst_pmd;

        src_pgd++; dst_pgd++;

        /* copy_pmd_range */

        if (pgd_none(*src_pgd))
            goto skip_copy_pmd_range;
        if (pgd_bad(*src_pgd)) {
            pgd_ERROR(*src_pgd);
            pgd_clear(src_pgd);
skip_copy_pmd_range:
            address = (address + PGDIR_SIZE) & PGDIR_MASK;
            if (!address || (address >= end))
                goto out;
            continue;
        }
        if (pgd_none(*dst_pgd)) {
            if (!pmd_alloc(dst_pgd, 0))
                goto nomem;
        }

        src_pmd = pmd_offset(src_pgd, address);
        dst_pmd = pmd_offset(dst_pgd, address);

        do {
            pte_t *src_pte, *dst_pte;

            /* copy_pte_range */

            if (pmd_none(*src_pmd))
                goto skip_copy_pte_range;
            if (pmd_bad(*src_pmd)) {
                pmd_ERROR(*src_pmd);
                pmd_clear(src_pmd);
skip_copy_pte_range:
                address = (address + PMD_SIZE) & PMD_MASK;
                if (address >= end)
                    goto out;
                goto cont_copy_pmd_range;
            }
            if (pmd_none(*dst_pmd)) {
                if (!pte_alloc(dst_pmd, 0))
                    goto nomem;
            }

            src_pte = pte_offset(src_pmd, address);
            dst_pte = pte_offset(dst_pmd, address);

            do {
                pte_t pte = *src_pte;
                struct page *ptepage;

                /* copy_one_pte */

                if (pte_none(pte))
                    goto cont_copy_pte_range_noset;
                if (!pte_present(pte)) {
                    swap_duplicate(pte_to_swp_entry(pte));
                    goto cont_copy_pte_range;
                }
                ptepage = pte_page(pte);
                if ((!VALID_PAGE(ptepage)) ||
                    PageReserved(ptepage))
                    goto cont_copy_pte_range;

                /* If it's a COW mapping, write protect it both in the parent and the child */
                if (cow) {
                    ptep_set_wrprotect(src_pte);
                    pte = *src_pte;
                }

                /* If it's a shared mapping, mark it clean in the child */
                if (vma->vm_flags & VM_SHARED)
                    pte = pte_mkclean(pte);
                pte = pte_mkold(pte);
                get_page(ptepage);

cont_copy_pte_range:
                set_pte(dst_pte, pte);
cont_copy_pte_range_noset:
                address += PAGE_SIZE;
                if (address >= end)
                    goto out;
                src_pte++; dst_pte++;
            } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:
            src_pmd++; dst_pmd++;
        } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
    }
out:
    return 0;

nomem:
    return -ENOMEM;
}
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct *mm, struct vm_area_struct *vma,
                  unsigned long address, pte_t *page_table,
                  struct page *page, zone_t *classzone)
{
    pte_t pte;
    swp_entry_t entry;

    /* Don't look at this pte if it's been accessed recently. */
    if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
        mark_page_accessed(page);
        return 0;
    }

    /* Don't bother unmapping pages that are active */
    if (PageActive(page))
        return 0;

    /* Don't bother replenishing zones not under pressure.. */
    if (!memclass(page->zone, classzone))
        return 0;

    if (TryLockPage(page))
        return 0;

    /* From this point on, the odds are that we're going to
     * nuke this pte, so read and clear the pte.  This hook
     * is needed on CPUs which update the accessed and dirty
     * bits in hardware.
     */
    flush_cache_page(vma, address);
    pte = ptep_get_and_clear(page_table);
    flush_tlb_page(vma, address);

    if (pte_dirty(pte))
        set_page_dirty(page);

    /*
     * Is the page already in the swap cache? If so, then
     * we can just drop our reference to it without doing
     * any IO - it's already up-to-date on disk.
     */
    if (PageSwapCache(page)) {
        entry.val = page->index;
        swap_duplicate(entry);
set_swap_pte:
        set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
        mm->rss--;
        UnlockPage(page);
        {
            int freeable = page_count(page) - !!page->buffers <= 2;
            page_cache_release(page);
            return freeable;
        }
    }

    /*
     * Is it a clean page? Then it must be recoverable
     * by just paging it in again, and we can just drop
     * it..  or if it's dirty but has backing store,
     * just mark the page dirty and drop it.
     *
     * However, this won't actually free any real
     * memory, as the page will just be in the page cache
     * somewhere, and as such we should just continue
     * our scan.
     *
     * Basically, this just makes it possible for us to do
     * some real work in the future in "refill_inactive()".
     */
    if (page->mapping)
        goto drop_pte;
    if (!PageDirty(page))
        goto drop_pte;

    /*
     * Anonymous buffercache pages can be left behind by
     * concurrent truncate and pagefault.
     */
    if (page->buffers)
        goto preserve;

    /*
     * This is a dirty, swappable page.  First of all,
     * get a suitable swap entry for it, and make sure
     * we have the swap cache set up to associate the
     * page with that swap entry.
     */
    for (;;) {
        entry = get_swap_page();
        if (!entry.val)
            break;
        /* Add it to the swap cache and mark it dirty
         * (adding to the page cache will clear the dirty
         * and uptodate bits, so we need to do it again)
         */
        if (add_to_swap_cache(page, entry) == 0) {
            SetPageUptodate(page);
            set_page_dirty(page);
            goto set_swap_pte;
        }
        /* Raced with "speculative" read_swap_cache_async */
        swap_free(entry);
    }

    /* No swap space left */
preserve:
    set_pte(page_table, pte);
    UnlockPage(page);
    return 0;
}
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
    struct swap_info_struct *si = &swap_info[type];
    struct mm_struct *start_mm;
    unsigned short *swap_map;
    unsigned short swcount;
    struct page *page;
    swp_entry_t entry;
    int i = 0;
    int retval = 0;
    int reset_overflow = 0;
    int shmem;

    /*
     * When searching mms for an entry, a good strategy is to
     * start at the first mm we freed the previous entry from
     * (though actually we don't notice whether we or coincidence
     * freed the entry).  Initialize this start_mm with a hold.
     *
     * A simpler strategy would be to start at the last mm we
     * freed the previous entry from; but that would take less
     * advantage of mmlist ordering (now preserved by swap_out()),
     * which clusters forked address spaces together, most recent
     * child immediately after parent.  If we race with dup_mmap(),
     * we very much want to resolve parent before child, otherwise
     * we may miss some entries: using last mm would invert that.
     */
    start_mm = &init_mm;
    atomic_inc(&init_mm.mm_users);

    /*
     * Keep on scanning until all entries have gone.  Usually,
     * one pass through swap_map is enough, but not necessarily:
     * mmput() removes mm from mmlist before exit_mmap() and its
     * zap_page_range().  That's not too bad, those entries are
     * on their way out, and handled faster there than here.
     * do_munmap() behaves similarly, taking the range out of mm's
     * vma list before zap_page_range().  But unfortunately, when
     * unmapping a part of a vma, it takes the whole out first,
     * then reinserts what's left after (might even reschedule if
     * open() method called) - so swap entries may be invisible
     * to swapoff for a while, then reappear - but that is rare.
     */
    while ((i = find_next_to_unuse(si, i))) {
        /*
         * Get a page for the entry, using the existing swap
         * cache page if there is one.  Otherwise, get a clean
         * page and read the swap into it.
         */
        swap_map = &si->swap_map[i];
        entry = SWP_ENTRY(type, i);
        page = read_swap_cache_async(entry);
        if (!page) {
            /*
             * Either swap_duplicate() failed because entry
             * has been freed independently, and will not be
             * reused since sys_swapoff() already disabled
             * allocation from here, or alloc_page() failed.
             */
            if (!*swap_map)
                continue;
            retval = -ENOMEM;
            break;
        }

        /*
         * Don't hold on to start_mm if it looks like exiting.
         */
        if (atomic_read(&start_mm->mm_users) == 1) {
            mmput(start_mm);
            start_mm = &init_mm;
            atomic_inc(&init_mm.mm_users);
        }

        /*
         * Wait for and lock page.  When do_swap_page races with
         * try_to_unuse, do_swap_page can handle the fault much
         * faster than try_to_unuse can locate the entry.  This
         * apparently redundant "wait_on_page" lets try_to_unuse
         * defer to do_swap_page in such a case - in some tests,
         * do_swap_page and try_to_unuse repeatedly compete.
         */
        wait_on_page(page);
        lock_page(page);

        /*
         * Remove all references to entry, without blocking.
         * Whenever we reach init_mm, there's no address space
         * to search, but use it as a reminder to search shmem.
         */
        shmem = 0;
        swcount = *swap_map;
        if (swcount > 1) {
            flush_page_to_ram(page);
            if (start_mm == &init_mm)
                shmem = shmem_unuse(entry, page);
            else
                unuse_process(start_mm, entry, page);
        }
        if (*swap_map > 1) {
            int set_start_mm = (*swap_map >= swcount);
            struct list_head *p = &start_mm->mmlist;
            struct mm_struct *new_start_mm = start_mm;
            struct mm_struct *mm;

            spin_lock(&mmlist_lock);
            while (*swap_map > 1 &&
                   (p = p->next) != &start_mm->mmlist) {
                mm = list_entry(p, struct mm_struct, mmlist);
                swcount = *swap_map;
                if (mm == &init_mm) {
                    set_start_mm = 1;
                    spin_unlock(&mmlist_lock);
                    shmem = shmem_unuse(entry, page);
                    spin_lock(&mmlist_lock);
                } else
                    unuse_process(mm, entry, page);
                if (set_start_mm && *swap_map < swcount) {
                    new_start_mm = mm;
                    set_start_mm = 0;
                }
            }
            atomic_inc(&new_start_mm->mm_users);
            spin_unlock(&mmlist_lock);
            mmput(start_mm);
            start_mm = new_start_mm;
        }

        /*
         * How could swap count reach 0x7fff when the maximum
         * pid is 0x7fff, and there's no way to repeat a swap
         * page within an mm (except in shmem, where it's the
         * shared object which takes the reference count)?
         * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
         *
         * If that's wrong, then we should worry more about
         * exit_mmap() and do_munmap() cases described above:
         * we might be resetting SWAP_MAP_MAX too early here.
         * We know "Undead"s can happen, they're okay, so don't
         * report them; but do report if we reset SWAP_MAP_MAX.
         */
        if (*swap_map == SWAP_MAP_MAX) {
            swap_list_lock();
            swap_device_lock(si);
            nr_swap_pages++;
            *swap_map = 1;
            swap_device_unlock(si);
            swap_list_unlock();
            reset_overflow = 1;
        }

        /*
         * If a reference remains (rare), we would like to leave
         * the page in the swap cache; but try_to_swap_out could
         * then re-duplicate the entry once we drop page lock,
         * so we might loop indefinitely; also, that page could
         * not be swapped out to other storage meanwhile.  So:
         * delete from cache even if there's another reference,
         * after ensuring that the data has been saved to disk -
         * since if the reference remains (rarer), it will be
         * read from disk into another page.  Splitting into two
         * pages would be incorrect if swap supported "shared
         * private" pages, but they are handled by tmpfs files.
         *
         * Note shmem_unuse already deleted swappage from cache,
         * unless corresponding filepage found already in cache:
         * in which case it left swappage in cache, lowered its
         * swap count to pass quickly through the loops above,
         * and now we must reincrement count to try again later.
         */
        if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
            rw_swap_page(WRITE, page);
            lock_page(page);
        }
        if (PageSwapCache(page)) {
            if (shmem)
                swap_duplicate(entry);
            else
                delete_from_swap_cache(page);
        }

        /*
         * So we could skip searching mms once swap count went
         * to 1, we did not mark any present ptes as dirty: must
         * mark page dirty so try_to_swap_out will preserve it.
         */
        SetPageDirty(page);
        UnlockPage(page);
        page_cache_release(page);

        /*
         * Make sure that we aren't completely killing
         * interactive performance.  Interruptible check on
         * signal_pending() would be nice, but changes the spec?
         */
        if (current->need_resched)
            schedule();
    }