/* * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall * back to the regular GUP. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; unsigned long flags; pgd_t *pgdp; int nr = 0; start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, (void __user *)start, len))) return 0; /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables and pages from being freed. */ local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = *pgdp; next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); return nr; }
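A hedged usage sketch, not part of the original source: because this variant never falls back to the regular GUP path, a caller in atomic context typically checks how many pages were actually pinned and drops the references itself. The helper name peek_user_pages_atomic() and the small page cap are assumptions made for this example.

/*
 * Illustrative caller for the IRQ-safe fast-GUP variant above.
 * Assumes __get_user_pages_fast() is visible here; names are made up.
 */
static int peek_user_pages_atomic(unsigned long uaddr, int nr, struct page **pages)
{
	int i, got;

	if (nr > 4)
		nr = 4;				/* keep the IRQs-off window short */

	got = __get_user_pages_fast(uaddr, nr, 0, pages);

	/* ... inspect the pinned pages, e.g. via kmap_atomic() ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);		/* release the references taken above */

	return got;				/* may be fewer than 'nr'; no slow-path fallback */
}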
static inline unsigned long uvirt_to_kva(pgd_t *pgd, unsigned long adr) { if (!pgd_none(*pgd) && !pgd_bad(*pgd)) { pmd_t *pmd; //#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) // pmd = pmd_offset(pgd, adr); //#else /* >= 2.6.11 */ pmd = pmd_offset(pud_offset(pgd, adr), adr); //#endif /* < 2.6.11 */ if (!pmd_none(*pmd)) { pte_t *ptep, pte; //#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) // ptep = pte_offset(pmd, adr); //#else /* >= 2.6.0 */ ptep = pte_offset_kernel(pmd, adr); //#endif /* < 2.6.0 */ pte = *ptep; if (pte_present(pte)) { return (((unsigned long)page_address(pte_page(pte))) | (adr & (PAGE_SIZE - 1))); } } } return 0UL; }
struct page *virt2page(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; struct page *page = NULL; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || pgd_bad(*pgd)) return NULL; pud = pud_offset(pgd, addr); if (pud_none(*pud) || pud_bad(*pud)) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd) || pmd_bad(*pmd)) return NULL; if (!(pte = pte_offset_map(pmd, addr))) return NULL; if (pte_none(*pte)) goto out; if (!pte_present(*pte)) goto out; if (!(page = pte_page(*pte))) goto out; out: pte_unmap(pte); return page; }
static struct page * my_follow_page(struct vm_area_struct *vma, unsigned long addr) { pud_t *pud; pmd_t *pmd; pgd_t *pgd; pte_t *pte; spinlock_t *ptl; struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) { goto out; } pud = pud_offset(pgd, addr); if (pud_none(*pud) || unlikely(pud_bad(*pud))) goto out; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) { goto out; } pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) goto out; if (!pte_present(*pte)) goto unlock; page = pfn_to_page(pte_pfn(*pte)); if (!page) goto unlock; get_page(page); unlock: pte_unmap_unlock(pte, ptl); out: return page; }
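A minimal caller sketch, assuming my_follow_page() above is visible at the call site: resolve one address in the current task to its struct page while holding mmap_sem for read. The wrapper name lookup_one_user_page() is made up for this example.

/*
 * Illustrative caller of my_follow_page(); not from the original source.
 */
static struct page *lookup_one_user_page(unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page *page = NULL;

	down_read(&mm->mmap_sem);		/* keep the VMA and page tables stable */
	vma = find_vma(mm, addr);
	if (vma && addr >= vma->vm_start)
		page = my_follow_page(vma, addr);	/* takes a reference on success */
	up_read(&mm->mmap_sem);

	return page;				/* caller must put_page() when done */
}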
static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total) { pmd_t * pmd; unsigned long end; if (pgd_none(*pgd)) return; if (pgd_bad(*pgd)) { printk("statm_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd)); pgd_clear(pgd); return; } pmd = pmd_offset(pgd, address); address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); }
static int pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; pmd_t *pmd; pte_t *pte; pud_t *pud; spinlock_t *ptl; pgd = pgd_offset(current->mm, addr); if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) { return 0; } pud = pud_offset(pgd, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) { return 0; } pmd = pmd_offset(pud, addr); if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd))) return 0; pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); if (unlikely(!pte_present(*pte) || !pte_young(*pte))) { pte_unmap_unlock(pte, ptl); return 0; } *ptep = pte; *ptlp = ptl; return 1; }
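An illustrative caller, not from the original source: read one word from user space with the source PTE pinned, in the style of the arm "uaccess with memcpy" approach. The wrapper name, the single-word copy, and the assumption that the source does not cross a page boundary are all assumptions for this sketch.

/*
 * Hypothetical use of pin_page_for_read(): pin, copy, unpin.
 */
static int read_user_word(const void __user *src, unsigned long *val)
{
	pte_t *pte;
	spinlock_t *ptl;

	if (!pin_page_for_read(src, &pte, &ptl))
		return -EFAULT;			/* caller falls back to a faulting copy */

	/* PTE is present and the page table lock is held; src assumed page-local */
	*val = *(const unsigned long __force *)src;

	pte_unmap_unlock(pte, ptl);
	return 0;
}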
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { if (is_hugetlb_pmd(*pmd)) pte = (pte_t *)pmd; else pte = pte_offset_map(pmd, addr); } } } return pte; }
/* * Dump out the page tables associated with 'addr' in mm 'mm'. */ void show_pte(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; if (!mm) mm = &init_mm; pr_alert("pgd = %p\n", mm->pgd); pgd = pgd_offset(mm, addr); pr_alert("[%08lx] *pgd=%016llx", addr, pgd_val(*pgd)); do { pud_t *pud; pmd_t *pmd; pte_t *pte; if (pgd_none(*pgd) || pgd_bad(*pgd)) break; pud = pud_offset(pgd, addr); printk(", *pud=%016llx", pud_val(*pud)); if (pud_none(*pud) || pud_bad(*pud)) break; pmd = pmd_offset(pud, addr); printk(", *pmd=%016llx", pmd_val(*pmd)); if (pmd_none(*pmd) || pmd_bad(*pmd)) break; pte = pte_offset_map(pmd, addr); printk(", *pte=%016llx", pte_val(*pte)); pte_unmap(pte); } while(0); printk("\n"); }
static void count_pgd_pages(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, signed char *data_buf, int node_map) { pmd_t * pmd; unsigned long pgd_end, next_addr; if (pgd_none(*dir)) return; pmd = pmd_offset(dir, address); pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; if (pgd_end && (end > pgd_end)) end = pgd_end; do { count_pmd_pages(mm, vma, pmd, address, end, data_buf, node_map); next_addr = (address + PMD_SIZE) & PMD_MASK; data_buf += ((next_addr - address) >> PAGE_SHIFT); address = next_addr; pmd++; } while (address && (address < end)); return; }
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; if (vaddr & (PMD_SIZE-1)) { printk(KERN_ERR "set_pmd_pfn: vaddr misaligned\n"); return; } if (pfn & (PTRS_PER_PTE-1)) { printk(KERN_ERR "set_pmd_pfn: pfn misaligned\n"); return; } pgd = swapper_pg_dir + pgd_index(vaddr); if (pgd_none(*pgd)) { printk(KERN_ERR "set_pmd_pfn: pgd_none\n"); return; } pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); local_flush_tlb_one(vaddr); }
void assert_pte_locked(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; if (mm == &init_mm) return; pgd = mm->pgd + pgd_index(addr); BUG_ON(pgd_none(*pgd)); pud = pud_offset(pgd, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); /* * khugepaged, when collapsing normal pages to a hugepage, first sets the * pmd to none to force page fault/gup to take mmap_sem. After the pmd is * set to none, we do a pte_clear which hits this assertion, so if we find * the pmd none, just return. */ if (pmd_none(*pmd)) return; BUG_ON(!pmd_present(*pmd)); assert_spin_locked(pte_lockptr(mm, pmd)); }
static enum gcerror virt2phys(unsigned int logical, pte_t *physical) { pgd_t *pgd; /* Page Global Directory (PGD). */ pmd_t *pmd; /* Page Middle Directory (PMD). */ pte_t *pte; /* Page Table Entry (PTE). */ /* Get the pointer to the entry in PGD for the address. */ pgd = pgd_offset(current->mm, logical); if (pgd_none(*pgd) || pgd_bad(*pgd)) return GCERR_MMU_PAGE_BAD; /* Get the pointer to the entry in PMD for the address. */ pmd = pmd_offset(pgd, logical); if (pmd_none(*pmd) || pmd_bad(*pmd)) return GCERR_MMU_PAGE_BAD; /* Get the pointer to the entry in PTE for the address. */ pte = pte_offset_map(pmd, logical); if ((pte == NULL) || !pte_present(*pte)) return GCERR_MMU_PAGE_BAD; *physical = (*pte & PAGE_MASK) | (logical & ~PAGE_MASK); return GCERR_NONE; }
static void set_pte_phys(unsigned long addr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { pgd_ERROR(*pgd); return; } pud = pud_alloc(NULL, pgd, addr); if (unlikely(!pud)) { pud_ERROR(*pud); return; } pmd = pmd_alloc(NULL, pud, addr); if (unlikely(!pmd)) { pmd_ERROR(*pmd); return; } pte = pte_offset_kernel(pmd, addr); if (!pte_none(*pte)) { pte_ERROR(*pte); return; } set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, prot)); if (cached_to_uncached) flush_tlb_one(get_asid(), addr); }
static unsigned long uva_to_pa(struct mm_struct *mm, unsigned long addr) { unsigned long ret = 0UL; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(mm, addr); if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { pte = pte_offset_map(pmd, addr); if (!pte_none(*pte) && pte_present(*pte)) { #ifdef BMM_HAS_PTE_PAGE /* Use page struct */ struct page *page = pte_page(*pte); if(page) { ret = page_to_phys(page); ret |= (addr & (PAGE_SIZE-1)); } #else /* Use hard PTE */ pte = (pte_t *)((u32)pte - 2048); if(pte) ret = (*pte & 0xfffff000) | (addr & 0xfff); #endif } } } } return ret; }
int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; pgd_t *pgdp; int nr = 0; pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, start, len))) goto slow_irqon; pr_devel(" aligned: %lx .. %lx\n", start, end); /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by * important workloads (eg. DB2), and whether limiting the batch size * will decrease performance. * * It seems like we're in the clear for the moment. Direct-IO is * the main guy that batches up lots of get_user_pages, and even * they are limited to 64-at-a-time which is not so many. */ /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables from being freed on powerpc. * * So long as we atomically load page table pointers versus teardown, * we can follow the address down to the the page and take a ref on it. */ local_irq_disable(); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = *pgdp; pr_devel(" %016lx: normal pgd %p\n", addr, (void *)pgd_val(pgd)); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; if (pgd_huge(pgd)) { if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next, write, pages, &nr)) goto slow; } else if (is_hugepd(pgdp)) { if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, addr, next, write, pages, &nr)) goto slow; } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); return nr; { int ret; slow: local_irq_enable(); slow_irqon: pr_devel(" slow path ! nr = %d\n", nr); /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; down_read(&mm->mmap_sem); ret = get_user_pages(current, mm, start, (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); up_read(&mm->mmap_sem); /* Have to be a bit careful with return values */ if (nr > 0) { if (ret < 0) ret = nr; else ret += nr; } return ret; } }
/* Pretty sick eh? */ int prom_callback(long *args) { struct console *cons, *saved_console = NULL; unsigned long flags; char *cmd; if (!args) return -1; if (!(cmd = (char *)args[0])) return -1; save_and_cli(flags); cons = console_drivers; while (cons) { unregister_console(cons); cons->flags &= ~(CON_PRINTBUFFER); cons->next = saved_console; saved_console = cons; cons = console_drivers; } register_console(&prom_console); if (!strcmp(cmd, "sync")) { prom_printf("PROM `%s' command...\n", cmd); show_free_areas(); if(current->pid != 0) { sti(); sys_sync(); cli(); } args[2] = 0; args[args[1] + 3] = -1; prom_printf("Returning to PROM\n"); } else if (!strcmp(cmd, "va>tte-data")) { unsigned long ctx, va; unsigned long tte = 0; long res = PROM_FALSE; ctx = args[3]; va = args[4]; if (ctx) { /* * Find process owning ctx, lookup mapping. */ struct task_struct *p; struct mm_struct *mm = NULL; pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; for_each_task(p) { mm = p->mm; if (CTX_HWBITS(mm->context) == ctx) break; } if (!mm || CTX_HWBITS(mm->context) != ctx) goto done; pgdp = pgd_offset(mm, va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; ptep = pte_offset(pmdp, va); if (!pte_present(*ptep)) goto done; tte = pte_val(*ptep); res = PROM_TRUE; goto done; } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { /* Spitfire Errata #32 workaround */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); /* * Locked down tlb entry 63. */ tte = spitfire_get_dtlb_data(63); res = PROM_TRUE; goto done; }
/*pgtable sequential scan and count for __access_bits.*/ static int scan_pgtable(void) { pgd_t *pgd = NULL; pud_t *pud = NULL; pmd_t *pmd = NULL; pte_t *ptep, pte; spinlock_t *ptl; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start = 0; /*the start of address.*/ unsigned long end = 0; /*the end of address.*/ unsigned long address = 0; /* the address of vma.*/ int number_hotpages = 0; /* the number of hot pages */ int number_vpages = 0; int cycle_index = 0; /* the loop counter, which denotes ITERATIONS. */ /* the array that records the number of hot page in every cycle */ int hot_page[ITERATIONS]; int number_current_pg = 0; int pg_count = 0; int j = 0; int times = 0; /* records reuse time*/ /* some variables that describe page "heat" */ int hig = 0; int mid = 0; int low = 0; int llow = 0; int lllow = 0; int llllow = 0; int all_pages = 0;/* the total number of pages */ /*the average number of hot pages in each iteration.*/ long avg_hotpage=0; /*the total number of memory accesses across all pages*/ long num_access=0; /* avg utilization of each page */ int avg_page_utilization = 0; /*get the handle of current running benchmark.*/ struct task_struct *bench_process = get_current_process(); if(bench_process == NULL) { printk("sysmon: get no process handle in scan_pgtable function...exit&trying again...\n"); return 0; } else /* get the process*/ mm = bench_process->mm; if(mm == NULL) { printk("sysmon: error mm is NULL, return back & trying...\n"); return 0; } for(j = 0; j < PAGE_ALL; j++) page_heat[j] = -1; for(j = 0; j < ITERATIONS; j++) { hot_page[j] = 0; reuse_time[j] = 0; dirty_page[j] = 0; } /*yanghao*/ times = 0; for(cycle_index = 0; cycle_index < ITERATIONS; cycle_index++) { number_hotpages = 0; /*scan each vma*/ for(vma = mm->mmap; vma; vma = vma->vm_next) { start = vma->vm_start; end = vma->vm_end; mm = vma->vm_mm; /*in each vma, we check all pages*/ for(address = start; address < end; address += PAGE_SIZE) { /*scan page table for each page in this VMA*/ pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) continue; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) continue; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) continue; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if(pte_present(pte)) { if(pte_young(pte)) /*hot page*/ { /*re-set and clear _access_bits to 0*/ pte = pte_mkold(pte); set_pte_at(mm, address, ptep, pte); /*yanghao:re-set and clear _dirty_bits to 0*/ pte = pte_mkclean(pte); set_pte_at(mm, address, ptep, pte); } } else /*no page pte_none*/ { pte_unmap_unlock(ptep, ptl); continue; } pte_unmap_unlock(ptep, ptl); page_counts++; } } /*count the number of hot pages*/ if(bench_process == NULL) { printk("sysmon: get no process handle in scan_pgtable function...exit&trying again...\n"); return 0; } else /*get the process*/ mm = bench_process->mm; if(mm == NULL) { printk("sysmon: error mm is NULL, return back & trying...\n"); return 0; } number_vpages = 0; sampling_interval = page_counts / 250; /*yanghao:*/ page_counts = 0; for(vma = mm->mmap; vma; vma = vma->vm_next) { start = vma->vm_start; end = vma->vm_end; /*scan each page in this VMA*/ mm = vma->vm_mm; pg_count = 0; for(address = start; address < end; address += PAGE_SIZE) { /*scan page table for each page in this VMA*/ pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) continue; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) continue; pmd 
= pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) continue; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if(pte_present(pte)) { if(pte_young(pte)) /* hot pages*/ { number_current_pg = pg_count + number_vpages; page_heat[number_current_pg]++; hot_page[cycle_index]++; /*yanghao:*/ if (page_counts == random_page) { times++; if (pte_dirty(pte)) dirty_page[cycle_index] = 1; } } else { if (page_counts == random_page) reuse_time[times]++; } } pg_count++; pte_unmap_unlock(ptep, ptl); page_counts++; } number_vpages += (int)(end - start)/PAGE_SIZE; } } /*yanghao:cal. the No. of random_page*/ random_page += sampling_interval; if(random_page >= page_counts) random_page=page_counts / 300; /*****************************OUTPUT************************************/ for(j = 0; j < PAGE_ALL; j++) { if(page_heat[j] < VH && page_heat[j] > H) hig++; if(page_heat[j] > M && page_heat[j] <= H) mid++; if(page_heat[j] <= M && page_heat[j] > L) low++; if(page_heat[j] > VL_MAX && page_heat[j] <= L) llow++; if(page_heat[j] > VL_MIN && page_heat[j] <= VL_MAX) lllow++; if(page_heat[j] >= 0 && page_heat[j] <= VL_MIN) llllow++; if(page_heat[j] > -1) all_pages++; } /*the values reflect the accessing frequency of each physical page.*/ printk("[LOG: after sampling (%d loops) ...] ",ITERATIONS); printk("the values denote the physical page accessing frequence.\n"); printk("-->hig (150,200) is %d. Indicating the number of re-used pages is high.\n",hig); printk("-->mid (100,150] is %d.\n",mid); printk("-->low (64,100] is %d.\n",low); printk("-->llow (10,64] is %d. In locality,no too many re-used pages.\n",llow); printk("-->lllow (5,10] is %d.\n",lllow); printk("-->llllow [1,5] is %d.\n",llllow); for(j = 0;j < ITERATIONS; j++) avg_hotpage += hot_page[j]; avg_hotpage /= (j+1); /* * new step@20140704 * (1)the different phases of memory utilization * (2)the avg. page accessing utilization * (3)memory pages layout and spectrum */ for(j = 0; j < PAGE_ALL; j++) if(page_heat[j] > -1) /*the page that is accessed at least once.*/ num_access += (page_heat[j] + 1); printk("the total number of memory accesses is %ld, the average is %ld\n", num_access, num_access / ITERATIONS); avg_page_utilization = num_access / all_pages; printk("Avg hot pages num is %ld, all used pages num is %d, avg utilization of each page is %d\n", avg_hotpage, all_pages, avg_page_utilization); /*yanghao:print the information about reuse-distance*/ if ((times == 0) && (reuse_time[0] ==0)) printk("the page No.%d is not available.",random_page); else { if ((times == 0) && (reuse_time[0] == 0)) printk("the page No.%d was not used in this 200 loops.",random_page); else { if (times < ITERATIONS) times++; printk("the reusetime of page No.%d is:",random_page); for (j = 0; j < times; j++) printk("%d ",reuse_time[j]); printk("\n"); printk("the total number of the digit above denotes the sum that page NO.%d be accessd in %d loops.\n", random_page,ITERATIONS); printk("each digit means the sum loops that between current loop and the last loop.\n"); } } printk("\n\n"); return 1; }
int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; unsigned long flags; pgd_t *pgdp; int nr = 0; pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, start, len))) return 0; pr_devel(" aligned: %lx .. %lx\n", start, end); /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by * important workloads (eg. DB2), and whether limiting the batch size * will decrease performance. * * It seems like we're in the clear for the moment. Direct-IO is * the main guy that batches up lots of get_user_pages, and even * they are limited to 64-at-a-time which is not so many. */ /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables from being freed on powerpc. * * So long as we atomically load page table pointers versus teardown, * we can follow the address down to the the page and take a ref on it. */ local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); pr_devel(" %016lx: normal pgd %p\n", addr, (void *)pgd_val(pgd)); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; if (pgd_huge(pgd)) { if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next, write, pages, &nr)) break; } else if (is_hugepd(pgdp)) { if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, addr, next, write, pages, &nr)) break; } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); return nr; }
/** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @page_mask: on output, *page_mask is set according to the size of the page * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * Returns the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; *page_mask = 0; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); return page; } pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return no_page_table(vma, flags); BUILD_BUG_ON(p4d_huge(*p4d)); if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); pud = pud_offset(p4d, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pud(mm, address, pud, flags); if (page) return page; return no_page_table(vma, flags); } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags); spin_unlock(ptl); if (page) return page; } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); } if (flags & FOLL_SPLIT) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); ret = 0; split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { get_page(page); spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); unlock_page(page); put_page(page); if (pmd_none(*pmd)) return no_page_table(vma, flags); } return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; }
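For reference, a minimal wrapper in the style of the kernel's follow_page() helper from this era, which simply discards the page-mask output; sketched here on the assumption that the caller only needs the page descriptor itself.

/*
 * Sketch of a follow_page()-style wrapper around follow_page_mask().
 */
static inline struct page *follow_page(struct vm_area_struct *vma,
				       unsigned long address, unsigned int foll_flags)
{
	unsigned int unused_page_mask;

	return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
}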
unsigned int m4u_user_v2p(unsigned int va) { unsigned int pmdOffset = (va & (PMD_SIZE - 1)); unsigned int pageOffset = (va & (PAGE_SIZE - 1)); pgd_t *pgd; pmd_t *pmd; pte_t *pte; unsigned int pa; if(NULL==current) { M4UMSG("error: m4u_user_v2p, current is NULL! \n"); return 0; } if(NULL==current->mm) { M4UMSG("error: m4u_user_v2p, current->mm is NULL! tgid=0x%x, name=%s \n", current->tgid, current->comm); return 0; } pgd = pgd_offset(current->mm, va); /* what is tsk->mm */ M4UDBG("m4u_user_v2p(), pgd 0x%x\n", pgd); M4UDBG("pgd_none=%d, pgd_bad=%d\n", pgd_none(*pgd), pgd_bad(*pgd)); if(pgd_none(*pgd)||pgd_bad(*pgd)) { M4UMSG("warning: m4u_user_v2p(), va=0x%x, pgd invalid! \n", va); return 0; } pmd = pmd_offset(pgd, va); M4UDBG("m4u_user_v2p(), pmd 0x%x\n", pmd); M4UDBG("pmd_none=%d, pmd_bad=%d, pmd_val=0x%x\n", pmd_none(*pmd), pmd_bad(*pmd), pmd_val(*pmd)); /* If this is a page table entry, keep on walking to the next level */ if (( (unsigned int)pmd_val(*pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) { if(pmd_none(*pmd)||pmd_bad(*pmd)) { M4UDBG("warning: m4u_user_v2p(), va=0x%x, pmd invalid! \n", va); return 0; } // we encounter some pte not preset issue, do not know why pte = pte_offset_map(pmd, va); if(pte_present(*pte)) { pa=(pte_val(*pte) & (PAGE_MASK)) | pageOffset; M4UDBG("PA = 0x%8x\n", pa); return pa; } } else /* Only 1 level page table */ { if(pmd_none(*pmd)) { M4UDBG("Error: m4u_user_v2p(), virtual addr 0x%x, pmd invalid! \n", va); return 0; } pa=(pte_val(*pmd) & (PMD_MASK)) | pmdOffset; M4UDBG("PA = 0x%8x\n", pa); return pa; } M4UDBG("warning: m4u_user_v2p(), pte invalid! \n"); // m4u_dump_maps(va); return 0; }
int memory_remove_pte(struct local_page *lp) { unsigned long del_user_va = lp->user_va; unsigned long del_slab_va = lp->slab_va; unsigned long del_pfn = page_to_pfn(virt_to_page(del_slab_va)); struct vm_area_struct *del_vma = lp->vma; struct mm_struct *del_mm = del_vma->vm_mm; #if(DEBUG) printk("[%s]\n", __FUNCTION__); printk("del_user_va: 0x%lx\n", del_user_va); printk("del_slab_va: 0x%lx\n", del_slab_va); printk("del_pfn: 0x%lx\n", del_pfn); printk("del_vma: %p\n", del_vma); printk("del_mm: %p\n", del_mm); #endif // TODO: find PTE (need to be changed for x86) pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep; pgd = pgd_offset(del_mm, del_user_va); if (pgd_none(*pgd) || pgd_bad(*pgd)) { printk("<error> invalid pgd\n"); return -1; } pud = pud_offset(pgd, del_user_va); if (pud_none(*pud) || pud_bad(*pud)) { printk("<error> invalid pud\n"); return -1; } pmd = pmd_offset(pud, del_user_va); if (pmd_none(*pmd) || pmd_bad(*pmd)) { printk("<error> invalid pmd\n"); return -1; } ptep = pte_offset_kernel(pmd, del_user_va); if (!ptep) { printk("<error> invalid pte\n"); return -1; } #if(DEBUG) printk("ptep: %p\n", ptep); printk("pte: 0x%lx\n", (unsigned long)pte_val(*ptep)); printk("pfn: 0x%lx\n", pte_pfn(*ptep)); #endif // flush cache flush_cache_page(del_vma, del_user_va, del_pfn); // clear PTE pte_clear(del_mm, del_user_va, ptep); // flush TLB flush_tlb_page(del_vma, del_user_va); return 0; }
/** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @page_mask: on output, *page_mask is set according to the size of the page * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * Returns the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; *page_mask = 0; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); return page; } pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); pud = pud_offset(pgd, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { if (flags & FOLL_GET) return NULL; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); return page; } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); if (flags & FOLL_GET) { /* * Refcount on tail pages are not well-defined and * shouldn't be taken. The caller should handle a NULL * return when trying to follow tail pages. */ if (PageHead(page)) get_page(page); else page = NULL; } return page; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) return no_page_table(vma, flags); if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(vma, address, pmd); return follow_page_pte(vma, address, pmd, flags); } ptl = pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } } else spin_unlock(ptl); } return follow_page_pte(vma, address, pmd, flags); }
/* Pretty sick eh? */ int prom_callback(long *args) { struct console *cons, *saved_console = NULL; unsigned long flags; char *cmd; extern spinlock_t prom_entry_lock; if (!args) return -1; if (!(cmd = (char *)args[0])) return -1; /* * The callback can be invoked on the cpu that first dropped * into prom_cmdline after taking the serial interrupt, or on * a slave processor that was smp_captured() if the * administrator has done a switch-cpu inside obp. In either * case, the cpu is marked as in-interrupt. Drop IRQ locks. */ irq_exit(); /* XXX Revisit the locking here someday. This is a debugging * XXX feature so it isnt all that critical. -DaveM */ local_irq_save(flags); spin_unlock(&prom_entry_lock); cons = console_drivers; while (cons) { unregister_console(cons); cons->flags &= ~(CON_PRINTBUFFER); cons->next = saved_console; saved_console = cons; cons = console_drivers; } register_console(&prom_console); if (!strcmp(cmd, "sync")) { prom_printf("PROM `%s' command...\n", cmd); show_free_areas(); if (current->pid != 0) { local_irq_enable(); sys_sync(); local_irq_disable(); } args[2] = 0; args[args[1] + 3] = -1; prom_printf("Returning to PROM\n"); } else if (!strcmp(cmd, "va>tte-data")) { unsigned long ctx, va; unsigned long tte = 0; long res = PROM_FALSE; ctx = args[3]; va = args[4]; if (ctx) { /* * Find process owning ctx, lookup mapping. */ struct task_struct *p; struct mm_struct *mm = NULL; pgd_t *pgdp; pmd_t *pmdp; pte_t *ptep; for_each_process(p) { mm = p->mm; if (CTX_HWBITS(mm->context) == ctx) break; } if (!mm || CTX_HWBITS(mm->context) != ctx) goto done; pgdp = pgd_offset(mm, va); if (pgd_none(*pgdp)) goto done; pmdp = pmd_offset(pgdp, va); if (pmd_none(*pmdp)) goto done; /* Preemption implicitly disabled by virtue of * being called from inside OBP. */ ptep = pte_offset_map(pmdp, va); if (pte_present(*ptep)) { tte = pte_val(*ptep); res = PROM_TRUE; } pte_unmap(ptep); goto done; } if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) { /* Spitfire Errata #32 workaround */ __asm__ __volatile__("stxa %0, [%1] %2\n\t" "flush %%g6" : /* No outputs */ : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); /* * Locked down tlb entry. */ if (tlb_type == spitfire) tte = spitfire_get_dtlb_data(SPITFIRE_HIGHEST_LOCKED_TLBENT); else if (tlb_type == cheetah || tlb_type == cheetah_plus) tte = cheetah_get_ldtlb_data(CHEETAH_HIGHEST_LOCKED_TLBENT); res = PROM_TRUE; goto done; }
static int pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; pmd_t *pmd; pte_t *pte; pud_t *pud; spinlock_t *ptl; pgd = pgd_offset(current->mm, addr); if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) return 0; pud = pud_offset(pgd, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) return 0; pmd = pmd_offset(pud, addr); if (unlikely(pmd_none(*pmd))) return 0; /* * A pmd can be bad if it refers to a HugeTLB or THP page. * * Both THP and HugeTLB pages have the same pmd layout * and should not be manipulated by the pte functions. * * Lock the page table for the destination and check * to see that it's still huge and whether or not we will * need to fault on write, or if we have a splitting THP. */ if (unlikely(pmd_thp_or_huge(*pmd))) { ptl = &current->mm->page_table_lock; spin_lock(ptl); if (unlikely(!pmd_thp_or_huge(*pmd) || pmd_hugewillfault(*pmd) || pmd_trans_splitting(*pmd))) { spin_unlock(ptl); return 0; } *ptep = NULL; *ptlp = ptl; return 1; } if (unlikely(pmd_bad(*pmd))) return 0; pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); if (unlikely(!pte_present(*pte) || !pte_young(*pte) || !pte_write(*pte) || !pte_dirty(*pte))) { pte_unmap_unlock(pte, ptl); return 0; } *ptep = pte; *ptlp = ptl; return 1; }
/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. * * 08Jan98 Merged into one routine from several inline routines to reduce * variable count and make things faster. -jj */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; for (;;) { pmd_t * src_pmd, * dst_pmd; src_pgd++; dst_pgd++; /* copy_pmd_range */ if (pgd_none(*src_pgd)) goto skip_copy_pmd_range; if (pgd_bad(*src_pgd)) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; if (!address || (address >= end)) goto out; continue; } if (pgd_none(*dst_pgd)) { if (!pmd_alloc(dst_pgd, 0)) goto nomem; } src_pmd = pmd_offset(src_pgd, address); dst_pmd = pmd_offset(dst_pgd, address); do { pte_t * src_pte, * dst_pte; /* copy_pte_range */ if (pmd_none(*src_pmd)) goto skip_copy_pte_range; if (pmd_bad(*src_pmd)) { pmd_ERROR(*src_pmd); pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; if (address >= end) goto out; goto cont_copy_pmd_range; } if (pmd_none(*dst_pmd)) { if (!pte_alloc(dst_pmd, 0)) goto nomem; } src_pte = pte_offset(src_pmd, address); dst_pte = pte_offset(dst_pmd, address); do { pte_t pte = *src_pte; struct page *ptepage; /* copy_one_pte */ if (pte_none(pte)) goto cont_copy_pte_range_noset; if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); goto cont_copy_pte_range; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage)) goto cont_copy_pte_range; /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow) { ptep_set_wrprotect(src_pte); pte = *src_pte; } /* If it's a shared mapping, mark it clean in the child */ if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(ptepage); cont_copy_pte_range: set_pte(dst_pte, pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out; src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } out: return 0; nomem: return -ENOMEM; }
/* * This is useful to dump out the page tables associated with * 'addr' in mm 'mm'. */ void show_pte(struct mm_struct *mm, unsigned long addr) { mm_segment_t fs; if (!mm) mm = &init_mm; printk(KERN_ALERT "mm = %p pgd = %p\n", mm, mm->pgd); fs = get_fs(); set_fs(get_ds()); do { pgd_t pg, *pgd = pgd_offset(mm, addr); pmd_t pm, *pmd; pte_t pt, *pte; printk(KERN_ALERT "*pgd = "); if (__get_user(pgd_val(pg), (unsigned long *)pgd)) { printk("(faulted)"); break; } printk("%08lx", pgd_val(pg)); if (pgd_none(pg)) break; if (pgd_bad(pg)) { printk("(bad)"); break; } pmd = pmd_offset(pgd, addr); printk(", *pmd = "); if (__get_user(pmd_val(pm), (unsigned long *)pmd)) { printk("(faulted)"); break; } printk("%08lx", pmd_val(pm)); if (pmd_none(pm)) break; if (pmd_bad(pm)) { printk("(bad)"); break; } pte = pte_offset(pmd, addr); printk(", *pte = "); if (__get_user(pte_val(pt), (unsigned long *)pte)) { printk("(faulted)"); break; } printk("%08lx", pte_val(pt)); #ifdef CONFIG_CPU_32 printk(", *ppte = %08lx", pte_val(pte[-PTRS_PER_PTE])); #endif } while(0); set_fs(fs); printk("\n"); }
int VirtToPhys(void *vaddr, int *paddrp) { #if defined(__KERNEL__) unsigned long addr = (unsigned long) vaddr; if (addr < P1SEG || ((addr >= VMALLOC_START) && (addr < VMALLOC_END))) { /* * Find the virtual address of either a user page (<P1SEG) or VMALLOC (P3SEG) * * This code is based on vmalloc_to_page() in mm/memory.c */ struct mm_struct *mm; pgd_t *pgd; #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) pud_t *pud; #endif pmd_t *pmd; pte_t *ptep, pte; /* Must use the correct mm based on whether this is a kernel or a userspace address */ if (addr >= VMALLOC_START) mm = &init_mm; else mm = current->mm; /* Safety first! */ if (mm == NULL) return VTOP_INVALID_ARG; spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) pud = pud_offset(pgd, addr); if (pud_none(*pud) || pud_bad(*pud)) goto out; pmd = pmd_offset(pud, addr); #else pmd = pmd_offset(pgd, addr); #endif if (pmd_none(*pmd) || pmd_bad(*pmd)) goto out; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ptep = pte_offset(pmd, addr); #else ptep = pte_offset_map(pmd, addr); #endif if (!ptep) goto out; pte = *ptep; if (pte_present(pte)) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) pte_unmap(ptep); #endif spin_unlock(&mm->page_table_lock); /* pte_page() macro is broken for SH in linux 2.6.20 and later */ *paddrp = page_to_phys(pfn_to_page(pte_pfn(pte))) | (addr & (PAGE_SIZE-1)); /* INSbl28636: P3 segment pages cannot be looked up with pmb_virt_to_phys() * instead we need to examine the _PAGE_CACHABLE bit in the pte */ return ((pte_val(pte) & _PAGE_CACHABLE) ? VTOP_INCOHERENT_MEM : VTOP_SUCCESS); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) pte_unmap(ptep); #endif out: spin_unlock(&mm->page_table_lock); /* Failed to find a pte */ return VTOP_INVALID_ARG; } else #if defined(CONFIG_32BIT) { unsigned long flags; /* Try looking for an ioremap() via the PMB */ if (pmb_virt_to_phys(vaddr, (unsigned long *)paddrp, &flags) == 0) { /* Success: Test the returned PMB flags */ return ((flags & PMB_C) ? VTOP_INCOHERENT_MEM : VTOP_SUCCESS); } /* Failed to find a mapping */ return VTOP_INVALID_ARG; } #else { unsigned long addr = (unsigned long) vaddr; /* Assume 29-bit SH4 Linux */ *(paddrp)= PHYSADDR(addr); /* only the P2SEG is uncached (doubt we will see P4SEG addresses) */ return ((PXSEG(addr) == P2SEG) ? VTOP_SUCCESS: VTOP_INCOHERENT_MEM); } #endif /* CONFIG_32BIT */ #endif /* __KERNEL__ */ /* Not implemented */ return VTOP_INVALID_ARG; }
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; /* * NB: The scheduler will call us with prev == next when switching * from lazy TLB mode to normal mode if active_mm isn't changing. * When this happens, we don't assume that CR3 (and hence * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ /* We don't want flush_tlb_func_* to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); /* * Verify that CR3 is what we think it is. This will catch * hypothetical buggy code that directly switches to swapper_pg_dir * without going through leave_mm() / switch_mm_irqs_off() or that * does something like write_cr3(read_cr3_pa()). * * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() * isn't free. */ #ifdef CONFIG_DEBUG_VM if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. * Try to recover instead by ignoring the error and doing * a global flush to minimize the chance of corruption. * * (This is far from being a fully correct recovery. * Architecturally, the CPU could prefetch something * back into an incorrect ASID slot and leave it there * to cause trouble down the road. It's better than * nothing, though.) */ __flush_tlb_all(); } #endif this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* * We don't currently support having a real mm loaded without * our cpu set in mm_cpumask(). We have all the bookkeeping * in place to figure out whether we would need to flush * if our cpu were cleared in mm_cpumask(), but we don't * currently use it. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); return; } else { u16 new_asid; bool need_flush; if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) set_pgd(pgd, init_mm.pgd[index]); } /* Stop remote flushes for the previous mm */ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && real_prev != &init_mm); cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. */ cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); write_cr3(build_cr3(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ write_cr3(build_cr3_noflush(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } load_mm_cr4(next); switch_ldt(real_prev, next); }
unsigned int m4u_user_v2p(unsigned int va) { unsigned int pageOffset = (va & (PAGE_SIZE - 1)); pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; unsigned int pa; //M4UMSG("Enter m4u_user_v2p()! \n", va); if(NULL==current) { M4UMSG("warning: m4u_user_v2p, current is NULL! \n"); return 0; } if(NULL==current->mm) { M4UMSG("warning: m4u_user_v2p, current->mm is NULL! tgid=0x%x, name=%s \n", current->tgid, current->comm); return 0; } pgd = pgd_offset(current->mm, va); /* what is tsk->mm */ if(pgd_none(*pgd)||pgd_bad(*pgd)) { M4UMSG("m4u_user_v2p(), va=0x%x, pgd invalid! \n", va); return 0; } pud = pud_offset(pgd, va); if(pud_none(*pud)||pud_bad(*pud)) { M4UDBG("m4u_user_v2p(), va=0x%x, pud invalid! \n", va); return 0; } pmd = pmd_offset(pud, va); if(pmd_none(*pmd)||pmd_bad(*pmd)) { M4UDBG("m4u_user_v2p(), va=0x%x, pmd invalid! \n", va); return 0; } pte = pte_offset_map(pmd, va); if(pte_present(*pte)) { /* if((long long)pte_val(pte[PTE_HWTABLE_PTRS]) == (long long)0) { M4UMSG("user_v2p, va=0x%x, *ppte=%08llx", va, (long long)pte_val(pte[PTE_HWTABLE_PTRS])); pte_unmap(pte); return 0; } */ pa=(pte_val(*pte) & (PAGE_MASK)) | pageOffset; pte_unmap(pte); return pa; } pte_unmap(pte); M4UDBG("m4u_user_v2p(), va=0x%x, pte invalid! \n", va); // m4u_dump_maps(va); return 0; }
int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; pgd_t *pgdp; int nr = 0; start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by * important workloads (eg. DB2), and whether limiting the batch size * will decrease performance. * * It seems like we're in the clear for the moment. Direct-IO is * the main guy that batches up lots of get_user_pages, and even * they are limited to 64-at-a-time which is not so many. */ /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables from being freed on sparc. * * So long as we atomically load page table pointers versus teardown, * we can follow the address down to the the page and take a ref on it. */ local_irq_disable(); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = *pgdp; next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); return nr; { int ret; slow: local_irq_enable(); /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; down_read(&mm->mmap_sem); ret = get_user_pages(current, mm, start, (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); up_read(&mm->mmap_sem); /* Have to be a bit careful with return values */ if (nr > 0) { if (ret < 0) ret = nr; else ret += nr; } return ret; } }
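A hedged caller sketch, not part of the original source: the common get_user_pages_fast() pattern of pinning the pages backing a user buffer, using them, and then dropping the references. The helper name, the max_pages cap, and the treatment of a partial pin as failure are assumptions for this example.

/*
 * Illustrative use of get_user_pages_fast() with paired put_page().
 */
static int pin_user_buffer(unsigned long buf, size_t len,
			   struct page **pages, int max_pages)
{
	int nr_pages = ((buf & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int i, got;

	if (nr_pages > max_pages)
		return -EINVAL;

	got = get_user_pages_fast(buf, nr_pages, 1, pages);	/* 1 => pages will be written */
	if (got < 0)
		return got;

	/* ... build an sg list / perform DMA on pages[0..got-1] ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);

	return got == nr_pages ? 0 : -EFAULT;	/* partial pin treated as failure here */
}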