static int ept_set_epte(struct vmx_vcpu *vcpu, int make_write, unsigned long gpa, unsigned long hva) { int ret; epte_t *epte, flags; struct page *page; unsigned huge_shift; int level; ret = get_user_pages_fast(hva, 1, make_write, &page); if (ret != 1) { ret = ept_set_pfnmap_epte(vcpu, make_write, gpa, hva); if (ret) printk(KERN_ERR "ept: failed to get user page %lx\n", hva); return ret; } spin_lock(&vcpu->ept_lock); huge_shift = compound_order(compound_head(page)) + PAGE_SHIFT; level = 0; if (huge_shift == 30) level = 2; else if (huge_shift == 21) level = 1; ret = ept_lookup_gpa(vcpu, (void *) gpa, level, 1, &epte); if (ret) { spin_unlock(&vcpu->ept_lock); put_page(page); printk(KERN_ERR "ept: failed to lookup EPT entry\n"); return ret; } if (epte_present(*epte)) { if (!epte_big(*epte) && level == 2) ept_clear_l2_epte(epte); else if (!epte_big(*epte) && level == 1) ept_clear_l1_epte(epte); else ept_clear_epte(epte); } flags = __EPTE_READ | __EPTE_EXEC | __EPTE_TYPE(EPTE_TYPE_WB) | __EPTE_IPAT; if (make_write) flags |= __EPTE_WRITE; if (vcpu->ept_ad_enabled) { /* premark A/D to avoid extra memory references */ flags |= __EPTE_A; if (make_write) flags |= __EPTE_D; } if (level) { struct page *tmp = page; page = compound_head(page); get_page(page); put_page(tmp); flags |= __EPTE_SZ; } *epte = epte_addr(page_to_phys(page)) | flags; spin_unlock(&vcpu->ept_lock); return 0; }
/*
 * Populate the physmap of a->domain with the extents named in
 * a->extent_list, starting at index a->nr_done.  Each extent covers
 * 2^a->extent_order pages.
 *
 * For every extent the guest frame number is read from the guest, then:
 *  - with MEMF_populate_on_demand set, the range is only marked
 *    populate-on-demand;
 *  - for a direct-mapped domain, the machine frames equal to the guest
 *    frames are validated and used;
 *  - otherwise pages are allocated with alloc_domheap_pages().
 *
 * Nothing is returned.  On leaving the loop a->nr_done is set to the index
 * of the first extent that was not completed, and a->preempted is set when
 * the loop stopped because hypercall_preempt_check() fired.  The two
 * argument checks at the top return without modifying *a.
 */
static void populate_physmap(struct memop_args *a)
{
    struct page_info *page;
    unsigned int i, j;
    xen_pfn_t gpfn, mfn;
    struct domain *d = a->domain;

    /* The part of the extent list still to be processed must pass the handle check. */
    if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done,
                                     a->nr_extents-1) )
        return;

    /* The maximum order depends on whether this is a populate-on-demand request. */
    if ( a->extent_order > (a->memflags & MEMF_populate_on_demand ? MAX_ORDER :
                            max_order(current->domain)) )
        return;

    for ( i = a->nr_done; i < a->nr_extents; i++ )
    {
        /* The first extent of a call is never skipped for preemption. */
        if ( i != a->nr_done && hypercall_preempt_check() )
        {
            a->preempted = 1;
            goto out;
        }

        if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
            goto out;

        if ( a->memflags & MEMF_populate_on_demand )
        {
            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
                                                       a->extent_order) < 0 )
                goto out;
        }
        else
        {
            if ( is_domain_direct_mapped(d) )
            {
                mfn = gpfn;

                /*
                 * Check every frame of the extent: it must be a valid MFN
                 * and get_page() against d must succeed.  The reference is
                 * dropped again immediately; it is only a probe.
                 */
                for ( j = 0; j < (1U << a->extent_order); j++, mfn++ )
                {
                    if ( !mfn_valid(mfn) )
                    {
                        gdprintk(XENLOG_INFO, "Invalid mfn %#"PRI_xen_pfn"\n",
                                 mfn);
                        goto out;
                    }

                    page = mfn_to_page(mfn);
                    if ( !get_page(page, d) )
                    {
                        gdprintk(XENLOG_INFO,
                                 "mfn %#"PRI_xen_pfn" doesn't belong to d%d\n",
                                 mfn, d->domain_id);
                        goto out;
                    }
                    put_page(page);
                }

                /* The loop advanced mfn; rewind to the start of the extent. */
                mfn = gpfn;
                page = mfn_to_page(mfn);
            }
            else
            {
                page = alloc_domheap_pages(d, a->extent_order, a->memflags);
                if ( unlikely(!page) )
                {
                    /* Order-0 failures are not logged when opt_tmem is set. */
                    if ( !opt_tmem || a->extent_order )
                        gdprintk(XENLOG_INFO,
                                 "Could not allocate order=%u extent: id=%d memflags=%#x (%u of %u)\n",
                                 a->extent_order, d->domain_id,
                                 a->memflags, i, a->nr_extents);
                    goto out;
                }

                mfn = page_to_mfn(page);
            }

            /* NOTE(review): the result of guest_physmap_add_page() is not checked here. */
            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);

            if ( !paging_mode_translate(d) )
            {
                for ( j = 0; j < (1U << a->extent_order); j++ )
                    set_gpfn_from_mfn(mfn + j, gpfn + j);

                /* Inform the domain of the new page's machine address. */
                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
                    goto out;
            }
        }
    }

out:
    /* Record how far we got so the caller can resume or report progress. */
    a->nr_done = i;
}
int iterate_phdr(int (*cb) (struct phdr_info *info, struct task_struct *task, void *data), struct task_struct *task, void *data) { struct vm_area_struct *vma; struct mm_struct *mm = task->mm; struct phdr_info pi; char buf[NAME_BUFLEN]; int res = 0, err = 0; struct page *page; // FIXME Is one page enough for all phdrs? Elf64_Ehdr *ehdr; bool first = true; if (!mm) return -EINVAL; for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->vm_pgoff) // Only the first page contains the elf // headers, normally. continue; err = __get_user_pages_unlocked( task, task->mm, vma->vm_start, 1, 0, 0, &page, FOLL_TOUCH); if (err < 0) continue; ehdr = vmap(&page, 1, vma->vm_flags, vma->vm_page_prot); if (!ehdr) goto PUT; // Test magic bytes to check that it is an ehdr err = 0; err |= (ehdr->e_ident[0] != ELFMAG0); err |= (ehdr->e_ident[1] != ELFMAG1); err |= (ehdr->e_ident[2] != ELFMAG2); err |= (ehdr->e_ident[3] != ELFMAG3); if (err) goto UNMAP; // Set addresses pi.addr = first ? 0 : vma->vm_start; pi.phdr = (void *) ehdr + ehdr->e_phoff; pi.phnum = ehdr->e_phnum; // Find path pi.name = vma_file_path(vma, buf, NAME_BUFLEN); // Call the callback res = cb(&pi, task, data); // Free resources UNMAP: vunmap(ehdr); PUT: put_page(page); if (res) break; first = false; } return res; }
static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, 0, &page, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, page); if (unlikely(err)) goto failed_page; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_page; block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, page, NULL); unlock_page(page); put_page(page); (*nr_salvaged_blocks)++; goto next; failed_page: unlock_page(page); put_page(page); failed_inode: nilfs_msg(sb, KERN_WARNING, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); }
/* * create a vfsmount to be automounted */ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) { struct afs_super_info *as; struct vfsmount *mnt; struct afs_vnode *vnode; struct page *page; char *devname, *options; bool rwpath = false; int ret; _enter("{%pd}", mntpt); BUG_ON(!d_inode(mntpt)); ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); if (!devname) goto error_no_devname; options = (char *) get_zeroed_page(GFP_KERNEL); if (!options) goto error_no_options; vnode = AFS_FS_I(d_inode(mntpt)); if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; ret = -ENOENT; if (size < 2 || size > AFS_MAXCELLNAME) goto error_no_page; if (mntpt->d_name.name[0] == '.') { devname[0] = '%'; memcpy(devname + 1, mntpt->d_name.name + 1, size - 1); memcpy(devname + size, afs_root_cell, sizeof(afs_root_cell)); rwpath = true; } else { devname[0] = '#'; memcpy(devname + 1, mntpt->d_name.name, size); memcpy(devname + size + 1, afs_root_cell, sizeof(afs_root_cell)); } } else { /* read the contents of the AFS special symlink */ loff_t size = i_size_read(d_inode(mntpt)); char *buf; ret = -EINVAL; if (size > PAGE_SIZE - 1) goto error_no_page; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto error_no_page; } if (PageError(page)) { ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); goto error; } buf = kmap_atomic(page); memcpy(devname, buf, size); kunmap_atomic(buf); put_page(page); page = NULL; } /* work out what options we want */ as = AFS_FS_S(mntpt->d_sb); if (as->cell) { memcpy(options, "cell=", 5); strcpy(options + 5, as->cell->name); if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath) strcat(options, ",rwpath"); } /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); mnt = vfs_submount(mntpt, 
&afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); free_page((unsigned long) options); _leave(" = %p", mnt); return mnt; error: put_page(page); error_no_page: free_page((unsigned long) options); error_no_options: free_page((unsigned long) devname); error_no_devname: _leave(" = %d", ret); return ERR_PTR(ret); }
static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr, unsigned long size, enum dma_data_direction dma_dir) { struct vb2_dma_sg_conf *conf = alloc_ctx; struct vb2_dma_sg_buf *buf; unsigned long first, last; int num_pages_from_user; struct vm_area_struct *vma; struct sg_table *sgt; DEFINE_DMA_ATTRS(attrs); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) dma_set_attr(DMA_ATTR_SKIP_CPU_SYNC, &attrs); #endif buf = kzalloc(sizeof *buf, GFP_KERNEL); if (!buf) return NULL; buf->vaddr = NULL; buf->dev = conf->dev; buf->dma_dir = dma_dir; buf->offset = vaddr & ~PAGE_MASK; buf->size = size; buf->dma_sgt = &buf->sg_table; first = (vaddr & PAGE_MASK) >> PAGE_SHIFT; last = ((vaddr + size - 1) & PAGE_MASK) >> PAGE_SHIFT; buf->num_pages = last - first + 1; buf->pages = kzalloc(buf->num_pages * sizeof(struct page *), GFP_KERNEL); if (!buf->pages) goto userptr_fail_alloc_pages; vma = find_vma(current->mm, vaddr); if (!vma) { dprintk(1, "no vma for address %lu\n", vaddr); goto userptr_fail_find_vma; } if (vma->vm_end < vaddr + size) { dprintk(1, "vma at %lu is too small for %lu bytes\n", vaddr, size); goto userptr_fail_find_vma; } buf->vma = vb2_get_vma(vma); if (!buf->vma) { dprintk(1, "failed to copy vma\n"); goto userptr_fail_find_vma; } if (vma_is_io(buf->vma)) { for (num_pages_from_user = 0; num_pages_from_user < buf->num_pages; ++num_pages_from_user, vaddr += PAGE_SIZE) { unsigned long pfn; if (follow_pfn(vma, vaddr, &pfn)) { dprintk(1, "no page for address %lu\n", vaddr); break; } buf->pages[num_pages_from_user] = pfn_to_page(pfn); } } else num_pages_from_user = get_user_pages(current, current->mm, vaddr & PAGE_MASK, buf->num_pages, buf->dma_dir == DMA_FROM_DEVICE, 1, /* force */ buf->pages, NULL); if (num_pages_from_user != buf->num_pages) goto userptr_fail_get_user_pages; if (sg_alloc_table_from_pages(buf->dma_sgt, buf->pages, buf->num_pages, buf->offset, size, 0)) goto userptr_fail_alloc_table_from_pages; sgt = &buf->sg_table; /* * No need to sync to the 
device, this will happen later when the * prepare() memop is called. */ sgt->nents = dma_map_sg_attrs(buf->dev, sgt->sgl, sgt->orig_nents, buf->dma_dir, &attrs); if (!sgt->nents) goto userptr_fail_map; return buf; userptr_fail_map: sg_free_table(&buf->sg_table); userptr_fail_alloc_table_from_pages: userptr_fail_get_user_pages: dprintk(1, "get_user_pages requested/got: %d/%d]\n", buf->num_pages, num_pages_from_user); if (!vma_is_io(buf->vma)) while (--num_pages_from_user >= 0) put_page(buf->pages[num_pages_from_user]); vb2_put_vma(buf->vma); userptr_fail_find_vma: kfree(buf->pages); userptr_fail_alloc_pages: kfree(buf); return NULL; }
/*
 * Release a directory page: undo its kmap() and drop the page reference.
 */
static inline void afs_dir_put_page(struct page *page)
{
	kunmap(page);
	put_page(page);
}
static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, struct ivtv_dma_frame *args) { struct ivtv_dma_page_info y_dma; struct ivtv_dma_page_info uv_dma; int i; int y_pages, uv_pages; unsigned long y_buffer_offset, uv_buffer_offset; int y_decode_height, uv_decode_height, y_size; int frame = atomic_read(&itv->yuv_info.next_fill_frame); y_buffer_offset = IVTV_DEC_MEM_START + yuv_offset[frame]; uv_buffer_offset = y_buffer_offset + IVTV_YUV_BUFFER_UV_OFFSET; y_decode_height = uv_decode_height = args->src.height + args->src.top; if (y_decode_height < 512-16) y_buffer_offset += 720 * 16; if (y_decode_height & 15) y_decode_height = (y_decode_height + 16) & ~15; if (uv_decode_height & 31) uv_decode_height = (uv_decode_height + 32) & ~31; y_size = 720 * y_decode_height; /* Still in USE */ if (dma->SG_length || dma->page_count) { IVTV_DEBUG_WARN("prep_user_dma: SG_length %d page_count %d still full?\n", dma->SG_length, dma->page_count); return -EBUSY; } ivtv_udma_get_page_info (&y_dma, (unsigned long)args->y_source, 720 * y_decode_height); ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); /* Get user pages for DMA Xfer */ down_read(¤t->mm->mmap_sem); y_pages = get_user_pages(current, current->mm, y_dma.uaddr, y_dma.page_count, 0, 1, &dma->map[0], NULL); uv_pages = get_user_pages(current, current->mm, uv_dma.uaddr, uv_dma.page_count, 0, 1, &dma->map[y_pages], NULL); up_read(¤t->mm->mmap_sem); dma->page_count = y_dma.page_count + uv_dma.page_count; if (y_pages + uv_pages != dma->page_count) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", y_pages + uv_pages, dma->page_count); for (i = 0; i < dma->page_count; i++) { put_page(dma->map[i]); } dma->page_count = 0; return -EINVAL; } /* Fill & map SG List */ ivtv_udma_fill_sg_list (dma, &uv_dma, ivtv_udma_fill_sg_list (dma, &y_dma, 0)); dma->SG_length = pci_map_sg(itv->dev, dma->SGlist, dma->page_count, PCI_DMA_TODEVICE); /* Fill SG Array with 
new values */ ivtv_udma_fill_sg_array (dma, y_buffer_offset, uv_buffer_offset, y_size); /* If we've offset the y plane, ensure top area is blanked */ if (args->src.height + args->src.top < 512-16) { if (itv->yuv_info.blanking_dmaptr) { dma->SGarray[dma->SG_length].size = cpu_to_le32(720*16); dma->SGarray[dma->SG_length].src = cpu_to_le32(itv->yuv_info.blanking_dmaptr); dma->SGarray[dma->SG_length].dst = cpu_to_le32(IVTV_DEC_MEM_START + yuv_offset[frame]); dma->SG_length++; } } /* Tag SG Array with Interrupt Bit */ dma->SGarray[dma->SG_length - 1].size |= cpu_to_le32(0x80000000); ivtv_udma_sync_for_device(itv); return 0; }
/*
 * Read @nr_pages pages of @mapping, batching physically contiguous blocks
 * into as few bios as possible.
 *
 * @mapping:      address space being read
 * @pages:        if non-NULL, a list from which each page is taken and
 *                added to the page cache; the reference on each such page
 *                is dropped at next_page
 * @page:         the page to read when @pages is NULL
 * @nr_pages:     number of pages to process
 * @is_readahead: when true the bios are tagged REQ_RAHEAD
 *
 * Pages that cannot be handled by the fast path (they already have
 * buffers, their blocks are discontiguous, or a hole is followed by data)
 * go to the "confused" path, which falls back to block_read_full_page().
 *
 * Always returns 0; per-page failures are reported with SetPageError().
 */
int ext4_mpage_readpages(struct address_space *mapping,
			 struct list_head *pages, struct page *page,
			 unsigned nr_pages, bool is_readahead)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	struct block_device *bdev = inode->i_sb->s_bdev;
	int length;
	unsigned relative_block = 0;
	struct ext4_map_blocks map;

	/* Start with an empty mapping so nothing is reused on the first page. */
	map.m_pblk = 0;
	map.m_lblk = 0;
	map.m_len = 0;
	map.m_flags = 0;

	for (; nr_pages; nr_pages--) {
		int fully_mapped = 1;
		/* blocks_per_page means "no hole seen in this page yet". */
		unsigned first_hole = blocks_per_page;

		prefetchw(&page->flags);
		if (pages) {
			page = lru_to_page(pages);
			list_del(&page->lru);
			if (add_to_page_cache_lru(page, mapping, page->index,
				  readahead_gfp_mask(mapping)))
				goto next_page;
		}

		if (page_has_buffers(page))
			goto confused;

		/* Clamp the range to map to the end of the file. */
		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
		last_block = block_in_file + nr_pages * blocks_per_page;
		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;
		page_block = 0;

		/*
		 * Map blocks using the previous result first.
		 */
		if ((map.m_flags & EXT4_MAP_MAPPED) &&
		    block_in_file > map.m_lblk &&
		    block_in_file < (map.m_lblk + map.m_len)) {
			unsigned map_offset = block_in_file - map.m_lblk;
			unsigned last = map.m_len - map_offset;

			for (relative_block = 0; ; relative_block++) {
				if (relative_block == last) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				}
				if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk + map_offset +
					relative_block;
				page_block++;
				block_in_file++;
			}
		}

		/*
		 * Then do more ext4_map_blocks() calls until we are
		 * done with this page.
		 */
		while (page_block < blocks_per_page) {
			if (block_in_file < last_block) {
				map.m_lblk = block_in_file;
				map.m_len = last_block - block_in_file;

				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
				/*
				 * Also the target of the bio setup failures
				 * further down: mark the page as failed,
				 * zero it and move on.
				 */
				set_error_page:
					SetPageError(page);
					zero_user_segment(page, 0,
							  PAGE_SIZE);
					unlock_page(page);
					goto next_page;
				}
			}
			if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
				/* Unmapped block: remember where the first hole starts. */
				fully_mapped = 0;
				if (first_hole == blocks_per_page)
					first_hole = page_block;
				page_block++;
				block_in_file++;
				continue;
			}
			if (first_hole != blocks_per_page)
				goto confused;		/* hole -> non-hole */

			/* Contiguous blocks? */
			if (page_block && blocks[page_block-1] != map.m_pblk-1)
				goto confused;
			for (relative_block = 0; ; relative_block++) {
				if (relative_block == map.m_len) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				} else if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk+relative_block;
				page_block++;
				block_in_file++;
			}
		}
		if (first_hole != blocks_per_page) {
			/* Zero everything from the first hole to the end of the page. */
			zero_user_segment(page, first_hole << blkbits,
					  PAGE_SIZE);
			if (first_hole == 0) {
				/* The whole page is a hole: no I/O needed. */
				SetPageUptodate(page);
				unlock_page(page);
				goto next_page;
			}
		} else if (fully_mapped) {
			SetPageMappedToDisk(page);
		}
		if (fully_mapped && blocks_per_page == 1 &&
		    !PageUptodate(page) && cleancache_get_page(page) == 0) {
			/* cleancache supplied the data; "confused" just unlocks the page. */
			SetPageUptodate(page);
			goto confused;
		}

		/*
		 * This page will go to BIO.  Do we need to send this
		 * BIO off first?
		 */
		if (bio && (last_block_in_bio != blocks[0] - 1)) {
		/* Also reached when bio_add_page() below cannot take the page. */
		submit_and_realloc:
			submit_bio(bio);
			bio = NULL;
		}
		if (bio == NULL) {
			struct fscrypt_ctx *ctx = NULL;

			if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
				ctx = fscrypt_get_ctx(inode, GFP_NOFS);
				if (IS_ERR(ctx))
					goto set_error_page;
			}
			bio = bio_alloc(GFP_KERNEL,
				min_t(int, nr_pages, BIO_MAX_PAGES));
			if (!bio) {
				if (ctx)
					fscrypt_release_ctx(ctx);
				goto set_error_page;
			}
			bio_set_dev(bio, bdev);
			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
			bio->bi_end_io = mpage_end_io;
			bio->bi_private = ctx;
			bio_set_op_attrs(bio, REQ_OP_READ,
						is_readahead ? REQ_RAHEAD : 0);
		}

		/* Only the part of the page before the first hole is read. */
		length = first_hole << blkbits;
		if (bio_add_page(bio, page, length, 0) < length)
			goto submit_and_realloc;

		/*
		 * Submit now if the mapping ended on a boundary or the page
		 * contains a hole; otherwise remember the last block so the
		 * next page can be appended when it is contiguous.
		 */
		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
		     (relative_block == map.m_len)) ||
		    (first_hole != blocks_per_page)) {
			submit_bio(bio);
			bio = NULL;
		} else
			last_block_in_bio = blocks[blocks_per_page - 1];
		goto next_page;
	confused:
		/* Slow path: flush any pending bio and read the page block by block. */
		if (bio) {
			submit_bio(bio);
			bio = NULL;
		}
		if (!PageUptodate(page))
			block_read_full_page(page, ext4_get_block);
		else
			unlock_page(page);
	next_page:
		if (pages)
			put_page(page);
	}
	BUG_ON(pages && !list_empty(pages));
	if (bio)
		submit_bio(bio);
	return 0;
}
/*
 * Build a list of page frame numbers for the user range starting at @vaddr.
 *
 * @vaddr:      user virtual address of the range
 * @size:       length of the range in bytes
 * @pfn_list:   on success, receives a kcalloc()ed array of PFNs
 * @page_count: on success, receives the number of entries in *@pfn_list
 *
 * The range is first pinned with get_user_pages(); if nothing could be
 * pinned, the PFNs are looked up with follow_pfn() instead.  When
 * get_user_pages() pins fewer pages than requested, only that many PFNs
 * are returned.
 *
 * On success the page references taken by get_user_pages() are not
 * dropped here (presumably the caller releases them through the returned
 * PFNs -- confirm against the caller).
 *
 * NOTE(review): the page count is derived from @size alone, so a @vaddr
 * that is not page aligned may need one more page than is requested here.
 *
 * Returns 0 on success, -EINVAL for bad arguments or a failed lookup, and
 * -ENOMEM when an allocation fails.
 */
int psb_get_vaddr_pages(u32 vaddr, u32 size, u32 **pfn_list, int *page_count)
{
	u32 num_pages;
	struct page **pages = NULL;
	struct task_struct *task = current;
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;
	u32 *pfns = NULL;
	int ret;
	int err = -EINVAL;
	int i;

	if (unlikely(!pfn_list || !page_count || !vaddr || !size))
		return -EINVAL;

	num_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
	if (unlikely(!pages)) {
		DRM_ERROR("Failed to allocate page list\n");
		return -ENOMEM;
	}

	down_read(&mm->mmap_sem);
	ret = get_user_pages(task, mm, vaddr, num_pages, 0, 0, pages, NULL);
	up_read(&mm->mmap_sem);

	if (ret <= 0) {
		/* Nothing pinned: fall back to the follow_pfn() path below. */
		DRM_DEBUG("failed to get user pages\n");
		kfree(pages);
		pages = NULL;
	} else {
		DRM_DEBUG("num_pages %d, ret %d\n", num_pages, ret);
		num_pages = ret;
	}

	/*allocate page list*/
	pfns = kcalloc(num_pages, sizeof(u32), GFP_KERNEL);
	if (!pfns) {
		DRM_ERROR("No memory\n");
		err = -ENOMEM;
		goto get_page_err;
	}

	if (!pages) {
		DRM_ERROR("No pages found, trying to follow pfn\n");

		/* The vma lookup and the page-table walk need mmap_sem. */
		down_read(&mm->mmap_sem);
		for (i = 0; i < num_pages; i++) {
			unsigned long addr = vaddr + i * PAGE_SIZE;
			unsigned long pfn;

			/*
			 * find_vma() returns the first vma ending above addr,
			 * which may start above it as well.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr) {
				DRM_ERROR("failed to find vma\n");
				break;
			}

			/*
			 * follow_pfn() stores an unsigned long; go through a
			 * local so it never writes past a u32 slot of pfns[].
			 */
			if (follow_pfn(vma, addr, &pfn)) {
				DRM_ERROR("failed to follow pfn\n");
				break;
			}
			pfns[i] = pfn;
		}
		up_read(&mm->mmap_sem);

		if (i < num_pages)
			goto follow_pfn_err;
	} else {
		DRM_ERROR("Found pages\n");
		for (i = 0; i < num_pages; i++)
			pfns[i] = page_to_pfn(pages[i]);
	}

	*pfn_list = pfns;
	*page_count = num_pages;

	kfree(pages);

	return 0;

follow_pfn_err:
	kfree(pfns);
get_page_err:
	if (pages) {
		for (i = 0; i < num_pages; i++)
			put_page(pages[i]);
		kfree(pages);
	}

	return err;
}
/*
 * Emulate one port-I/O or MMIO access on behalf of the current vcpu.
 *
 * @is_mmio: non-zero for MMIO (IOREQ_TYPE_COPY), zero for port I/O
 * @addr:    port number or guest-physical MMIO address
 * @reps:    in: repetitions requested; out: repetitions handled
 * @size:    access size in bytes
 * @ram_gpa: guest-physical address of the data when @p_data is NULL
 * @dir:     IOREQ_READ or IOREQ_WRITE
 * @df:      direction flag copied into the request
 * @p_data:  buffer holding or receiving the data; NULL means the data
 *           lives in guest memory at @ram_gpa
 *
 * A reference is taken on the page behind @ram_gpa at entry and dropped
 * on every exit path.  Returns an X86EMUL_* code.
 */
static int hvmemul_do_io(
    int is_mmio, paddr_t addr, unsigned long *reps, int size,
    paddr_t ram_gpa, int dir, int df, void *p_data)
{
    struct vcpu *curr = current;
    struct hvm_vcpu_io *vio;
    ioreq_t p = {
        .type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
        .addr = addr,
        .size = size,
        .dir = dir,
        .df = df,
        .data = ram_gpa,
        .data_is_ptr = (p_data == NULL),
    };
    unsigned long ram_gfn = paddr_to_pfn(ram_gpa);
    p2m_type_t p2mt;
    struct page_info *ram_page;
    int rc;

    /* Check for paged out page */
    ram_page = get_page_from_gfn(curr->domain, ram_gfn, &p2mt, P2M_UNSHARE);
    if ( p2m_is_paging(p2mt) )
    {
        if ( ram_page )
            put_page(ram_page);
        p2m_mem_paging_populate(curr->domain, ram_gfn);
        return X86EMUL_RETRY;
    }
    /* Still shared although P2M_UNSHARE was requested: retry later. */
    if ( p2m_is_shared(p2mt) )
    {
        if ( ram_page )
            put_page(ram_page);
        return X86EMUL_RETRY;
    }

    /*
     * Weird-sized accesses have undefined behaviour: we discard writes
     * and read all-ones.
     */
    if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
    {
        gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
        ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
        if ( dir == IOREQ_READ )
            memset(p_data, ~0, size);
        if ( ram_page )
            put_page(ram_page);
        return X86EMUL_UNHANDLEABLE;
    }

    /*
     * For a write with an in-hypervisor buffer the value travels inside
     * the request itself, so p_data is cleared from here on.
     */
    if ( !p.data_is_ptr && (dir == IOREQ_WRITE) )
    {
        memcpy(&p.data, p_data, size);
        p_data = NULL;
    }

    vio = &curr->arch.hvm_vcpu.hvm_io;

    if ( is_mmio && !p.data_is_ptr )
    {
        /* Part of a multi-cycle read or write? */
        if ( dir == IOREQ_WRITE )
        {
            paddr_t pa = vio->mmio_large_write_pa;
            unsigned int bytes = vio->mmio_large_write_bytes;
            /* Range already covered by the recorded large write: done. */
            if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
            {
                if ( ram_page )
                    put_page(ram_page);
                return X86EMUL_OKAY;
            }
        }
        else
        {
            paddr_t pa = vio->mmio_large_read_pa;
            unsigned int bytes = vio->mmio_large_read_bytes;
            /* Range already buffered by the recorded large read: serve it. */
            if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
            {
                memcpy(p_data, &vio->mmio_large_read[addr - pa],
                       size);
                if ( ram_page )
                    put_page(ram_page);
                return X86EMUL_OKAY;
            }
        }
    }

    switch ( vio->io_state )
    {
    case HVMIO_none:
        break;
    case HVMIO_completed:
        /* A previously issued request has finished; pick up its result. */
        vio->io_state = HVMIO_none;
        if ( p_data == NULL )
        {
            if ( ram_page )
                put_page(ram_page);
            return X86EMUL_UNHANDLEABLE;
        }
        goto finish_access;
    case HVMIO_dispatched:
        /* May have to wait for previous cycle of a multi-write to complete. */
        if ( is_mmio && !p.data_is_ptr && (dir == IOREQ_WRITE) &&
             (addr == (vio->mmio_large_write_pa +
                       vio->mmio_large_write_bytes)) )
        {
            if ( ram_page )
                put_page(ram_page);
            return X86EMUL_RETRY;
        }
        /* No break here: any other dispatched request falls into default. */
    default:
        if ( ram_page )
            put_page(ram_page);
        return X86EMUL_UNHANDLEABLE;
    }

    if ( hvm_io_pending(curr) )
    {
        gdprintk(XENLOG_WARNING, "WARNING: io already pending?\n");
        if ( ram_page )
            put_page(ram_page);
        return X86EMUL_UNHANDLEABLE;
    }

    vio->io_state =
        (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
    vio->io_size = size;

    /*
     * When retrying a repeated string instruction, force exit to guest after
     * completion of the retried iteration to allow handling of interrupts.
     */
    if ( vio->mmio_retrying )
        *reps = 1;

    p.count = *reps;

    if ( dir == IOREQ_WRITE )
        hvmtrace_io_assist(is_mmio, &p);

    /* Offer the request to the internal intercept handlers first. */
    if ( is_mmio )
    {
        rc = hvm_mmio_intercept(&p);
        if ( rc == X86EMUL_UNHANDLEABLE )
            rc = hvm_buffered_io_intercept(&p);
    }
    else
    {
        rc = hvm_portio_intercept(&p);
    }

    switch ( rc )
    {
    case X86EMUL_OKAY:
    case X86EMUL_RETRY:
        *reps = p.count;
        p.state = STATE_IORESP_READY;
        if ( !vio->mmio_retry )
        {
            hvm_io_assist(&p);
            vio->io_state = HVMIO_none;
        }
        else
            /* Defer hvm_io_assist() invocation to hvm_do_resume(). */
            vio->io_state = HVMIO_handle_mmio_awaiting_completion;
        break;
    case X86EMUL_UNHANDLEABLE:
        /* If there is no backing DM, just ignore accesses */
        if ( !hvm_has_dm(curr->domain) )
        {
            rc = X86EMUL_OKAY;
            vio->io_state = HVMIO_none;
        }
        else
        {
            rc = X86EMUL_RETRY;
            if ( !hvm_send_assist_req(&p) )
                vio->io_state = HVMIO_none;
            else if ( p_data == NULL )
                rc = X86EMUL_OKAY;
        }
        break;
    default:
        BUG();
    }

    if ( rc != X86EMUL_OKAY )
    {
        if ( ram_page )
            put_page(ram_page);
        return rc;
    }

 finish_access:
    if ( dir == IOREQ_READ )
        hvmtrace_io_assist(is_mmio, &p);

    if ( p_data != NULL )
        memcpy(p_data, &vio->io_data, size);

    if ( is_mmio && !p.data_is_ptr )
    {
        /* Part of a multi-cycle read or write? */
        if ( dir == IOREQ_WRITE )
        {
            paddr_t pa = vio->mmio_large_write_pa;
            unsigned int bytes = vio->mmio_large_write_bytes;
            if ( bytes == 0 )
                pa = vio->mmio_large_write_pa = addr;
            /* Extend the record only when this chunk continues it. */
            if ( addr == (pa + bytes) )
                vio->mmio_large_write_bytes += size;
        }
        else
        {
            paddr_t pa = vio->mmio_large_read_pa;
            unsigned int bytes = vio->mmio_large_read_bytes;
            if ( bytes == 0 )
                pa = vio->mmio_large_read_pa = addr;
            /* Buffer the chunk if it continues the record and still fits. */
            if ( (addr == (pa + bytes)) &&
                 ((bytes + size) <= sizeof(vio->mmio_large_read)) )
            {
                memcpy(&vio->mmio_large_read[bytes], p_data, size);
                vio->mmio_large_read_bytes += size;
            }
        }
    }

    if ( ram_page )
        put_page(ram_page);
    return X86EMUL_OKAY;
}

/* Port-I/O entry point: hvmemul_do_io() with is_mmio == 0. */
int hvmemul_do_pio(
    unsigned long port, unsigned long *reps, int size,
    paddr_t ram_gpa, int dir, int df, void *p_data)
{
    return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
}

/* MMIO entry point: hvmemul_do_io() with is_mmio == 1. */
static int hvmemul_do_mmio(
    paddr_t gpa, unsigned long *reps, int size,
    paddr_t ram_gpa, int dir, int df, void *p_data)
{
    return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
}

/*
 * Convert addr from linear to physical form, valid over the range
 * [addr, addr + *reps * bytes_per_rep]. *reps is adjusted according to
 * the valid computed range. It is always >0 when X86EMUL_OKAY is returned.
 * @pfec indicates the access checks to be performed during page-table walks.
*/ static int hvmemul_linear_to_phys( unsigned long addr, paddr_t *paddr, unsigned int bytes_per_rep, unsigned long *reps, uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; unsigned long pfn, npfn, done, todo, i, offset = addr & ~PAGE_MASK; int reverse; /* * Clip repetitions to a sensible maximum. This avoids extensive looping in * this function while still amortising the cost of I/O trap-and-emulate. */ *reps = min_t(unsigned long, *reps, 4096); /* With no paging it's easy: linear == physical. */ if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG) ) { *paddr = addr; return X86EMUL_OKAY; } /* Reverse mode if this is a backwards multi-iteration string operation. */ reverse = (hvmemul_ctxt->ctxt.regs->eflags & X86_EFLAGS_DF) && (*reps > 1); if ( reverse && ((PAGE_SIZE - offset) < bytes_per_rep) ) { /* Do page-straddling first iteration forwards via recursion. */ paddr_t _paddr; unsigned long one_rep = 1; int rc = hvmemul_linear_to_phys( addr, &_paddr, bytes_per_rep, &one_rep, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; pfn = _paddr >> PAGE_SHIFT; } else if ( (pfn = paging_gva_to_gfn(curr, addr, &pfec)) == INVALID_GFN )
/* Hand @page to put_page() unless it is zero. */
void test(int page)
{
	if (!page)
		return;

	put_page(page);
}
/*
 * Copy @len bytes from @source to @dest.
 *
 * While at least LARGE_COPY_CUTOFF bytes remain, and the page tables show a
 * suitable source and destination mapping, data is moved one page-bounded
 * chunk at a time with memcpy_multicache().  Whatever is left -- including
 * everything, if the first chunk does not qualify -- is handed to @func,
 * whose return value is returned.
 */
static unsigned long fast_copy(void *dest, const void *source, int len,
			       memcpy_t func)
{
	while (len >= LARGE_COPY_CUTOFF) {
		struct page *from_page, *to_page;
		pte_t *from_ptep, *to_ptep;
		pte_t from_pte, to_pte;
		int chunk, room;

		/* Look up the source mapping and take a reference on its page. */
retry_source:
		from_ptep = virt_to_pte(current->mm, (unsigned long)source);
		if (from_ptep == NULL)
			break;
		from_pte = *from_ptep;

		/* The source must be present, readable and in the L3 tile-cache mode. */
		if (!hv_pte_get_present(from_pte))
			break;
		if (!hv_pte_get_readable(from_pte))
			break;
		if (hv_pte_get_mode(from_pte) != HV_PTE_MODE_CACHE_TILE_L3)
			break;

		/* Stop if the source's remote cache cpu is this cpu. */
		if (get_remote_cache_cpu(from_pte) == smp_processor_id())
			break;

		from_page = pfn_to_page(hv_pte_get_pfn(from_pte));
		get_page(from_page);

		/* The PTE changed while we took the reference: start over. */
		if (pte_val(from_pte) != pte_val(*from_ptep)) {
			put_page(from_page);
			goto retry_source;
		}

		if (pte_huge(from_pte)) {
			/* Turn the huge PTE into a small one for the page in use. */
			int pfn = hv_pte_get_pfn(from_pte);

			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			from_pte = pfn_pte(pfn, from_pte);
			from_pte = pte_mksmall(from_pte);
		}

		/* Same again for the destination, which must be writable. */
retry_dest:
		to_ptep = virt_to_pte(current->mm, (unsigned long)dest);
		if (to_ptep == NULL) {
			put_page(from_page);
			break;
		}
		to_pte = *to_ptep;

		if (!hv_pte_get_present(to_pte) ||
		    !hv_pte_get_writable(to_pte)) {
			put_page(from_page);
			break;
		}

		to_page = pfn_to_page(hv_pte_get_pfn(to_pte));
		if (to_page == from_page) {
			/* Source and destination share a page: not handled here. */
			put_page(from_page);
			break;
		}
		get_page(to_page);

		if (pte_val(to_pte) != pte_val(*to_ptep)) {
			put_page(to_page);
			goto retry_dest;
		}

		if (pte_huge(to_pte)) {
			/* Turn the huge PTE into a small one for the page in use. */
			int pfn = hv_pte_get_pfn(to_pte);

			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			to_pte = pfn_pte(pfn, to_pte);
			to_pte = pte_mksmall(to_pte);
		}

		/* Limit the chunk so it crosses neither page boundary. */
		chunk = len;

		room = PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
		if (chunk > room)
			chunk = room;

		room = PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
		if (chunk > room)
			chunk = room;

		memcpy_multicache(dest, source, to_pte, from_pte, chunk);

		/* Drop both page references. */
		put_page(to_page);
		put_page(from_page);

		/* Advance past the chunk just copied. */
		dest += chunk;
		source += chunk;
		len -= chunk;
	}

	return func(dest, source, len);
}
/*
 * Copy an information snapshot into a userspace buffer.
 *
 * @optval is the userspace buffer that receives the snapshot.
 *
 * @optlen holds the size of that buffer on entry and is rewritten with the
 * size in bytes of the requested snapshot.
 *
 * Returns -errno on failure; in particular -ENOSPC means the buffer was too
 * small for the snapshot.  On success the return value is the positive size
 * in bytes of one array element of the snapshot.
 */
int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
			int __user *optlen)
{
	struct rds_info_iterator iter;
	struct rds_info_lengths lens;
	struct page **pages = NULL;
	unsigned long nr_pages = 0;
	unsigned long start;
	unsigned long idx;
	rds_info_func func;
	int len;
	int total;
	int ret;

	if (get_user(len, optlen))
		return -EFAULT;

	/* check for all kinds of wrapping and the like */
	start = (unsigned long)optval;
	if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start)
		return -EINVAL;

	/* a 0 len call is just trying to probe its length: nothing to pin */
	if (len != 0) {
		nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
			   >> PAGE_SHIFT;

		pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
		if (pages == NULL)
			return -ENOMEM;

		ret = get_user_pages_fast(start, nr_pages, 1, pages);
		if (ret != nr_pages) {
			/* Remember how many pages were pinned so they get released. */
			nr_pages = ret > 0 ? ret : 0;
			ret = -EAGAIN; /* XXX ? */
			goto out;
		}

		rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
	}

	func = rds_info_funcs[optname - RDS_INFO_FIRST];
	if (func == NULL) {
		ret = -ENOPROTOOPT;
		goto out;
	}

	iter.pages = pages;
	iter.addr = NULL;
	iter.offset = start & (PAGE_SIZE - 1);

	func(sock, len, &iter, &lens);
	BUG_ON(lens.each == 0);

	total = lens.nr * lens.each;

	rds_info_iter_unmap(&iter);

	/* The full snapshot size is reported whether or not it fitted. */
	ret = (total > len) ? -ENOSPC : lens.each;
	len = total;

	if (put_user(len, optlen))
		ret = -EFAULT;

out:
	if (pages != NULL) {
		for (idx = 0; idx < nr_pages; idx++)
			put_page(pages[idx]);
	}
	kfree(pages);

	return ret;
}
/*
 * Look @entry up in the swap cache or, failing that, allocate a new page
 * and insert it into the swap cache so that a read can be started into it.
 *
 * Returns the page found in the swap cache, or the newly allocated page.
 * A new page is returned locked, already added to the anon LRU, and with
 * *@new_page_allocated set to true.  Returns NULL when neither could be
 * obtained (page allocation failure, radix tree preload failure,
 * swapcache_prepare() failing with something other than -EEXIST, or
 * __add_to_swap_cache() failing with -ENOMEM).
 *
 * *@new_page_allocated is false in every case except the freshly inserted
 * page.
 */
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.  The page is kept
		 * across loop iterations, so it is allocated at most once.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet, while
			 * the other end is scheduled away waiting on discard
			 * I/O completion at scan_swap_map().
			 *
			 * In order to avoid turning this transitory state
			 * into a permanent loop around this -EEXIST case
			 * if !CONFIG_PREEMPT and the I/O completion happens
			 * to be waiting on the CPU waitqueue where we are now
			 * busy looping, we just conditionally invoke the
			 * scheduler here, if there are some more important
			 * tasks to run.
			 */
			cond_resched();
			/* err is -EEXIST here, so the loop condition holds */
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry);
	} while (err != -ENOMEM);

	/* drop the speculative allocation if it was not inserted */
	if (new_page)
		put_page(new_page);
	return found_page;
}
/*
 * Prepare the page cache page covering @pos for a write of @len bytes.
 *
 * On success the locked page is handed back through @pagep and the
 * ll_cl_context found for @file through @fsdata.  On failure the page (if
 * any) is unlocked and released, the cl_page reference is dropped and the
 * error is recorded in the cl_io, when one is available.
 *
 * Returns @result: negative errno on failure, otherwise non-negative.
 */
static int ll_write_begin(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned flags,
			  struct page **pagep, void **fsdata)
{
	struct cl_object *clob = ll_i2info(mapping->host)->lli_clob;
	struct ll_cl_context *lcc;
	const struct lu_env *env = NULL;
	struct cl_io *io = NULL;
	struct cl_page *page = NULL;
	struct page *vmpage = NULL;
	pgoff_t index = pos >> PAGE_SHIFT;
	unsigned start = pos & (PAGE_SIZE - 1);
	unsigned end = start + len;
	int result = 0;
	ENTRY;

	CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, start, len);

	lcc = ll_cl_find(file);
	if (lcc == NULL)
		GOTO(out, result = -EIO);

	env = lcc->lcc_env;
	io = lcc->lcc_io;

	/* Try to take the page lock without blocking, to avoid deadlock. */
	vmpage = grab_cache_page_nowait(mapping, index);
	if (unlikely(vmpage == NULL ||
		     PageDirty(vmpage) || PageWriteback(vmpage))) {
		struct cl_page_list *queue =
			&vvp_env_io(env)->u.write.vui_queue;

		/*
		 * A page that is already in the dirty cache forces the queued
		 * pages to be committed right away: holding the lock of a
		 * dirty page while asking for more grants may deadlock.  The
		 * dirty page may stay locked only when it would be the first
		 * page of the commit list.
		 */
		if (vmpage != NULL && queue->pl_nr > 0) {
			unlock_page(vmpage);
			put_page(vmpage);
			vmpage = NULL;
		}

		/* commit what is queued, then block for the page lock */
		result = vvp_io_write_commit(env, io);
		if (result < 0)
			GOTO(out, result);

		if (vmpage == NULL) {
			vmpage = grab_cache_page_write_begin(mapping, index,
							     flags);
			if (vmpage == NULL)
				GOTO(out, result = -ENOMEM);
		}
	}

	page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
	if (IS_ERR(page))
		GOTO(out, result = PTR_ERR(page));

	lcc->lcc_page = page;
	lu_ref_add(&page->cp_reference, "cl_io", io);

	cl_page_assume(env, io, page);
	if (!PageUptodate(vmpage)) {
		if (start != 0 || end != PAGE_SIZE) {
			/* TODO: can be optimized at OSC layer to check if it
			 * is a lockless IO. In that case, it's not necessary
			 * to read the data. */
			result = ll_prepare_partial_page(env, io, page);
			if (result == 0)
				SetPageUptodate(vmpage);
		} else {
			/*
			 * The whole page is being overwritten, so it is
			 * deliberately _not_ marked up to date until
			 * commit_write.
			 */
			CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n");
			POISON_PAGE(vmpage, 0x11);
		}
	}
	if (result < 0)
		cl_page_unassume(env, io, page);
	EXIT;
out:
	if (result >= 0) {
		*pagep = vmpage;
		*fsdata = lcc;
	} else {
		if (vmpage != NULL) {
			unlock_page(vmpage);
			put_page(vmpage);
		}
		if (!IS_ERR_OR_NULL(page)) {
			lu_ref_del(&page->cp_reference, "cl_io", io);
			cl_page_put(env, page);
		}
		if (io)
			io->ci_result = result;
	}
	RETURN(result);
}
/*
 * Must not be called with IRQs off.  This should only be used on the
 * slow path.
 *
 * Copy a foreign granted page to local memory.
 *
 * @ref:   grant reference passed to the unmap-and-replace operation.
 * @pagep: in: the granted page to copy; out (on success): the newly
 *         allocated local page that replaced it.
 *
 * Returns 0 on success, -ENOENT if a reference on the original page could
 * not be taken, -ENOMEM if the replacement page could not be allocated, or
 * -EBUSY if the original page is mapped (see the page_mapped() check).
 * Hypercall failures are treated as fatal (BUG_ON).
 */
int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
{
	struct gnttab_unmap_and_replace unmap;
	mmu_update_t mmu;
	struct page *page;
	struct page *new_page;
	void *new_addr;
	void *addr;
	paddr_t pfn;
	maddr_t mfn;
	maddr_t new_mfn;
	int err;

	page = *pagep;
	/* hold a reference for the duration; dropped at out: */
	if (!get_page_unless_zero(page))
		return -ENOENT;

	err = -ENOMEM;
	new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!new_page)
		goto out;

	new_addr = page_address(new_page);
	addr = page_address(page);
	copy_page(new_addr, addr);

	pfn = page_to_pfn(page);
	/* NOTE(review): mfn is assigned but not read again in this function */
	mfn = pfn_to_mfn(pfn);
	new_mfn = virt_to_mfn(new_addr);

	write_seqlock_bh(&gnttab_dma_lock);

	/* Make seq visible before checking page_mapped. */
	smp_mb();

	/* Has the page been DMA-mapped? */
	if (unlikely(page_mapped(page))) {
		write_sequnlock_bh(&gnttab_dma_lock);
		put_page(new_page);
		err = -EBUSY;
		goto out;
	}

	/* point the original pfn at the machine frame of the copy */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		set_phys_to_machine(pfn, new_mfn);

	gnttab_set_replace_op(&unmap, (unsigned long)addr,
			      (unsigned long)new_addr, ref);

	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
					&unmap, 1);
	BUG_ON(err);
	BUG_ON(unmap.status != GNTST_okay);

	write_sequnlock_bh(&gnttab_dma_lock);

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/* the new page's own pfn no longer has a machine frame */
		set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);

		mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
		mmu.val = pfn;
		err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
		BUG_ON(err);
	}

	/* carry the page-cache identity over to the replacement page */
	new_page->mapping = page->mapping;
	new_page->index = page->index;
	set_bit(PG_foreign, &new_page->flags);
	if (PageReserved(page))
		SetPageReserved(new_page);
	*pagep = new_page;

	SetPageForeign(page, gnttab_page_free);
	page->mapping = NULL;

out:
	/* drops the reference taken by get_page_unless_zero() above */
	put_page(page);
	return err;
}
/*
 * Finish a buffered write to @vmpage that was set up with the
 * ll_cl_context passed back in @fsdata.
 *
 * When @copied is non-zero the cl_page is appended to the write queue of
 * the current IO; otherwise the page is disowned and released.  The queue
 * is committed immediately when the page is already dirty, when the queue
 * has reached PTLRPC_MAX_BRW_PAGES, when nothing was copied, or when the
 * file or inode is synchronous.
 *
 * Returns @copied when the (optional) commit did not fail, otherwise the
 * negative error from the commit, which is also stored in io->ci_result.
 */
static int ll_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *vmpage, void *fsdata)
{
	struct ll_cl_context *lcc = fsdata;
	const struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio;
	struct cl_page *page;
	unsigned from = pos & (PAGE_SIZE - 1);
	bool commit_now;
	int result = 0;
	ENTRY;

	put_page(vmpage);

	LASSERT(lcc != NULL);
	env = lcc->lcc_env;
	page = lcc->lcc_page;
	io = lcc->lcc_io;
	vio = vvp_env_io(env);

	LASSERT(cl_page_is_owned(page, io));
	if (copied == 0) {
		cl_page_disown(env, io, page);

		lcc->lcc_page = NULL;
		lu_ref_del(&page->cp_reference, "cl_io", io);
		cl_page_put(env, page);

		/* page list is not contiguous now, commit it now */
		commit_now = true;
	} else {
		struct cl_page_list *queue = &vio->u.write.vui_queue;

		lcc->lcc_page = NULL; /* page will be queued */

		/* Add it into write queue */
		cl_page_list_add(queue, page);
		if (queue->pl_nr != 1) {
			LASSERT(from == 0);
		} else {
			/* first page */
			vio->u.write.vui_from = from;
		}
		vio->u.write.vui_to = from + copied;

		/*
		 * Commit soon in two cases: the page is already dirty (to
		 * address the deadlock in balance_dirty_pages() where this
		 * dirty page may be written back in the same thread), or
		 * the queue may now hold one full RPC.
		 */
		commit_now = PageDirty(vmpage) ||
			     queue->pl_nr >= PTLRPC_MAX_BRW_PAGES;

		CL_PAGE_DEBUG(D_VFSTRACE, env, page, "queued page: %d.\n",
			      queue->pl_nr);
	}

	if (commit_now || (file->f_flags & O_SYNC) ||
	    IS_SYNC(file_inode(file)))
		result = vvp_io_write_commit(env, io);

	if (result < 0)
		io->ci_result = result;
	RETURN(result >= 0 ? copied : result);
}
/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function.
 * @flags: IB_ODP_DMA_MAP_FOR_PREEFTCH is used to indicate that the function
 *         was called from the prefetch verb. IB_ODP_DMA_MAP_FOR_PAGEFAULT is
 *         used to indicate that the function was called from a pagefault
 *         handler.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq,
			      enum ib_odp_dma_map_flags flags)
{
	struct task_struct *task;
	struct mm_struct *mm;
	struct page **page_vec;
	u64 page_off;
	int i, dma_idx, first_idx, ret = 0, npages = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	/* one page worth of struct page pointers is the batch buffer */
	page_vec = (struct page **)__get_free_page(GFP_KERNEL);
	if (!page_vec)
		return -ENOMEM;

	page_off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	bcnt += page_off; /* Charge for the first page offset as well. */

	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (task == NULL) {
		ret = -EINVAL;
		goto free_vec;
	}

	mm = get_task_mm(task);
	if (mm == NULL) {
		ret = -EINVAL;
		goto put_task;
	}

	first_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	dma_idx = first_idx;

	while (bcnt > 0) {
		/* never ask for more pages than the batch buffer can hold */
		const size_t batch = min_t(size_t,
					   (bcnt - 1 + PAGE_SIZE) / PAGE_SIZE,
					   PAGE_SIZE / sizeof(struct page *));

		down_read(&mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list to be 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages(task, mm, user_virt, batch,
					access_mask & ODP_WRITE_ALLOWED_BIT, 0,
					page_vec, NULL);
		up_read(&mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;

		i = 0;
		while (i < npages) {
			ret = ib_umem_odp_map_dma_single_page(umem, dma_idx,
							      page_vec[i],
							      access_mask,
							      current_seq,
							      flags);
			if (ret < 0)
				break;
			dma_idx++;
			++i;
		}

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++i; i < npages; ++i)
				put_page(page_vec[i]);
			break;
		}
	}

	/* a pinning error only wins when nothing at all was mapped */
	if (ret >= 0)
		ret = (npages < 0 && dma_idx == first_idx) ?
			npages : dma_idx - first_idx;

	mmput(mm);
put_task:
	put_task_struct(task);
free_vec:
	free_page((unsigned long) page_vec);
	return ret;
}
static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); vto = kmap_atomic(to); copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); dax_unmap_atomic(bdev, &dax); return 0; } #define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) static int dax_radix_entry(struct address_space *mapping, pgoff_t index, sector_t sector, bool pmd_entry, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; pgoff_t pmd_index = DAX_PMD_INDEX(index); int type, error = 0; void *entry; WARN_ON_ONCE(pmd_entry && !dirty); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); entry = radix_tree_lookup(page_tree, pmd_index); if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { index = pmd_index; goto dirty; } entry = radix_tree_lookup(page_tree, index); if (entry) { type = RADIX_DAX_TYPE(entry); if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { error = -EIO; goto unlock; } if (!pmd_entry || type == RADIX_DAX_PMD) goto dirty; /* * We only insert dirty PMD entries into the radix tree. This * means we don't need to worry about removing a dirty PTE * entry and inserting a clean PMD entry, thus reducing the * range we would flush with a follow-up fsync/msync call. */ radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; } if (sector == NO_SECTOR) { /* * This can happen during correct operation if our pfn_mkwrite * fault raced against a hole punch operation. If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by * the time we are called, but the call will still happen. 
We * will return all the way up to wp_pfn_shared(), where the * pte_same() check will fail, eventually causing page fault * to be retried by the CPU. */ goto unlock; } error = radix_tree_insert(page_tree, index, RADIX_DAX_ENTRY(sector, pmd_entry)); if (error) goto unlock; mapping->nrexceptional++; dirty: if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); return error; } static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; int ret = 0; spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; /* another fsync thread may have already written back this entry */ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { ret = -EIO; goto unlock; } dax.sector = RADIX_DAX_SECTOR(entry); dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). 
*/ ret = dax_map_atomic(bdev, &dax); if (ret < 0) return ret; if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } wb_cache_pmem(dax.addr, dax.size); spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); return ret; unlock: spin_unlock_irq(&mapping->tree_lock); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); entry = radix_tree_lookup(&mapping->page_tree, pmd_index); rcu_read_unlock(); /* see if the start of our range is covered by a PMD entry */ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, pvec.pages, indices); if (pvec.nr == 0) break; for (i = 0; i < pvec.nr; i++) { if (indices[i] > end_index) { done = true; break; } ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) return ret; } } wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct 
vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; pgoff_t size; int error; i_mmap_lock_read(mapping); /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the * file. We can't tell the filesystem to free it because we can't * take i_mutex here. In the worst case, the file still has blocks * allocated past the end of the file. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { error = -EIO; goto out; } if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, vmf->flags & FAULT_FLAG_WRITE); if (error) goto out; error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); return error; } /** * __dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * @complete_unwritten: The filesystem method used to convert unwritten blocks * to written so the data written to them is exposed. This is required for * required by write faults for filesystems that will return unwritten * extent mappings from @get_block, but it is optional for reads as * dax_insert_mapping() will always zero unwritten blocks. If the fs does * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. 
__dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; int error; int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { put_page(page); return VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { /* * We have a struct page covering a hole in the file * from a read fault and we've raced with a truncate */ error = -EIO; goto unlock_page; } } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? 
*/ if (error) goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) goto unlock_page; } else { return dax_load_hole(mapping, page, vmf); } } if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) goto unlock_page; vmf->page = page; if (!page) { i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); error = -EIO; goto out; } } return VM_FAULT_LOCKED; } /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, PAGE_SIZE, 0); delete_from_page_cache(page); unlock_page(page); put_page(page); page = NULL; } /* * If we successfully insert the new mapping over an unwritten extent, * we need to ensure we convert the unwritten extent. If there is an * error inserting the mapping, the filesystem needs to leave it as * unwritten to prevent exposure of the stale underlying data to * userspace, but we still need to call the completion function so * the private resources on the mapping buffer can be released. We * indicate what the callback should do via the uptodate variable, same * as for normal BH based IO completions. 
*/ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) complete_unwritten(&bh, !error); else WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; unlock_page: if (page) { unlock_page(page); put_page(page); } goto out; }
static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct nilfs_dir_entry *dir_de = NULL; struct page *old_page; struct nilfs_dir_entry *old_de; struct nilfs_transaction_info ti; int err; if (flags & ~RENAME_NOREPLACE) return -EINVAL; err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; err = -ENOENT; old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; dir_de = nilfs_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { struct page *new_page; struct nilfs_dir_entry *new_de; err = -ENOTEMPTY; if (dir_de && !nilfs_empty_dir(new_inode)) goto out_dir; err = -ENOENT; new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); drop_nlink(new_inode); nilfs_mark_inode_dirty(new_inode); } else { err = nilfs_add_link(new_dentry, old_inode); if (err) goto out_dir; if (dir_de) { inc_nlink(new_dir); nilfs_mark_inode_dirty(new_dir); } } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ old_inode->i_ctime = current_time(old_inode); nilfs_delete_entry(old_de, old_page); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); drop_nlink(old_dir); } nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); err = nilfs_transaction_commit(old_dir->i_sb); return err; out_dir: if (dir_de) { kunmap(dir_page); put_page(dir_page); } out_old: kunmap(old_page); put_page(old_page); out: nilfs_transaction_abort(old_dir->i_sb); return err; }
/* * Actual dumper * * This is a two-pass process; first we find the offsets of the bits, * and then they are actually written out. If we run out of core limit * we just truncate. */ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file) { int has_dumped = 0; mm_segment_t fs; int segs; size_t size = 0; int i; struct vm_area_struct *vma; struct elfhdr elf; off_t offset = 0, dataoff; unsigned long limit = current->rlim[RLIMIT_CORE].rlim_cur; int numnote = 4; struct memelfnote notes[4]; struct elf_prstatus prstatus; /* NT_PRSTATUS */ elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ /* first copy the parameters from user space */ memset(&psinfo, 0, sizeof(psinfo)); { int i, len; len = current->mm->arg_end - current->mm->arg_start; if (len >= ELF_PRARGSZ) len = ELF_PRARGSZ-1; copy_from_user(&psinfo.pr_psargs, (const char *)current->mm->arg_start, len); for(i = 0; i < len; i++) if (psinfo.pr_psargs[i] == 0) psinfo.pr_psargs[i] = ' '; psinfo.pr_psargs[len] = 0; } memset(&prstatus, 0, sizeof(prstatus)); /* * This transfers the registers from regs into the standard * coredump arrangement, whatever that is. 
*/ #ifdef ELF_CORE_COPY_REGS ELF_CORE_COPY_REGS(prstatus.pr_reg, regs) #else if (sizeof(elf_gregset_t) != sizeof(struct pt_regs)) { printk("sizeof(elf_gregset_t) (%ld) != sizeof(struct pt_regs) (%ld)\n", (long)sizeof(elf_gregset_t), (long)sizeof(struct pt_regs)); } else *(struct pt_regs *)&prstatus.pr_reg = *regs; #endif /* now stop all vm operations */ down_write(¤t->mm->mmap_sem); segs = current->mm->map_count; #ifdef DEBUG printk("elf_core_dump: %d segs %lu limit\n", segs, limit); #endif /* Set up header */ memcpy(elf.e_ident, ELFMAG, SELFMAG); elf.e_ident[EI_CLASS] = ELF_CLASS; elf.e_ident[EI_DATA] = ELF_DATA; elf.e_ident[EI_VERSION] = EV_CURRENT; memset(elf.e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); elf.e_type = ET_CORE; elf.e_machine = ELF_ARCH; elf.e_version = EV_CURRENT; elf.e_entry = 0; elf.e_phoff = sizeof(elf); elf.e_shoff = 0; elf.e_flags = 0; elf.e_ehsize = sizeof(elf); elf.e_phentsize = sizeof(struct elf_phdr); elf.e_phnum = segs+1; /* Include notes */ elf.e_shentsize = 0; elf.e_shnum = 0; elf.e_shstrndx = 0; fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; current->flags |= PF_DUMPCORE; DUMP_WRITE(&elf, sizeof(elf)); offset += sizeof(elf); /* Elf header */ offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ notes[0].name = "CORE"; notes[0].type = NT_PRSTATUS; notes[0].datasz = sizeof(prstatus); notes[0].data = &prstatus; prstatus.pr_info.si_signo = prstatus.pr_cursig = signr; prstatus.pr_sigpend = current->pending.signal.sig[0]; prstatus.pr_sighold = current->blocked.sig[0]; psinfo.pr_pid = prstatus.pr_pid = current->pid; psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid; psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp; psinfo.pr_sid = prstatus.pr_sid = current->session; prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime); prstatus.pr_utime.tv_usec = CT_TO_USECS(current->times.tms_utime); prstatus.pr_stime.tv_sec = CT_TO_SECS(current->times.tms_stime); prstatus.pr_stime.tv_usec = CT_TO_USECS(current->times.tms_stime); prstatus.pr_cutime.tv_sec = CT_TO_SECS(current->times.tms_cutime); prstatus.pr_cutime.tv_usec = CT_TO_USECS(current->times.tms_cutime); prstatus.pr_cstime.tv_sec = CT_TO_SECS(current->times.tms_cstime); prstatus.pr_cstime.tv_usec = CT_TO_USECS(current->times.tms_cstime); #ifdef DEBUG dump_regs("Passed in regs", (elf_greg_t *)regs); dump_regs("prstatus regs", (elf_greg_t *)&prstatus.pr_reg); #endif notes[1].name = "CORE"; notes[1].type = NT_PRPSINFO; notes[1].datasz = sizeof(psinfo); notes[1].data = &psinfo; i = current->state ? ffz(~current->state) + 1 : 0; psinfo.pr_state = i; psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i]; psinfo.pr_zomb = psinfo.pr_sname == 'Z'; psinfo.pr_nice = current->nice; psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname)); notes[2].name = "CORE"; notes[2].type = NT_TASKSTRUCT; notes[2].datasz = sizeof(*current); notes[2].data = current; /* Try to dump the FPU. 
*/ prstatus.pr_fpvalid = dump_fpu (regs, &fpu); if (!prstatus.pr_fpvalid) { numnote--; } else { notes[3].name = "CORE"; notes[3].type = NT_PRFPREG; notes[3].datasz = sizeof(fpu); notes[3].data = &fpu; } /* Write notes phdr entry */ { struct elf_phdr phdr; int sz = 0; for(i = 0; i < numnote; i++) sz += notesize(¬es[i]); phdr.p_type = PT_NOTE; phdr.p_offset = offset; phdr.p_vaddr = 0; phdr.p_paddr = 0; phdr.p_filesz = sz; phdr.p_memsz = 0; phdr.p_flags = 0; phdr.p_align = 0; offset += phdr.p_filesz; DUMP_WRITE(&phdr, sizeof(phdr)); } /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); /* Write program headers for segments dump */ for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; phdr.p_filesz = maydump(vma) ? sz : 0; phdr.p_memsz = sz; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; if (vma->vm_flags & VM_WRITE) phdr.p_flags |= PF_W; if (vma->vm_flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; DUMP_WRITE(&phdr, sizeof(phdr)); } for(i = 0; i < numnote; i++) if (!writenote(¬es[i], file)) goto end_coredump; DUMP_SEEK(dataoff); for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { unsigned long addr; if (!maydump(vma)) continue; #ifdef DEBUG printk("elf_core_dump: writing %08lx-%08lx\n", vma->vm_start, vma->vm_end); #endif for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { struct page* page; struct vm_area_struct *vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, &page, &vma) <= 0) { DUMP_SEEK (file->f_pos + PAGE_SIZE); } else { if (page == ZERO_PAGE(addr)) { DUMP_SEEK (file->f_pos + PAGE_SIZE); } else { void *kaddr; flush_cache_page(vma, addr); kaddr = kmap(page); DUMP_WRITE(kaddr, PAGE_SIZE); flush_page_to_ram(page); kunmap(page); } put_page(page); } } } if ((off_t) file->f_pos != offset) { /* Sanity check */ printk("elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", (off_t) file->f_pos, offset); } end_coredump: set_fs(fs); up_write(¤t->mm->mmap_sem); return has_dumped; }
/*
 * Identify large copies from remotely-cached memory, and copy them
 * via memcpy_multicache() if they look good, otherwise fall back
 * to the particular kind of copying passed as the memcpy_t function.
 *
 * The fast path handles at most one page per iteration.  Whatever is left
 * once the fast path no longer applies (including the whole request when
 * it never applied) is handed to @func, whose return value is returned.
 */
static unsigned long fast_copy(void *dest, const void *source, int len,
			       memcpy_t func)
{
	/*
	 * Check if it's big enough to bother with.  We may end up doing a
	 * small copy via TLB manipulation if we're near a page boundary,
	 * but presumably we'll make it up when we hit the second page.
	 */
	while (len >= LARGE_COPY_CUTOFF) {
		int copy_size, bytes_left_on_page;
		pte_t *src_ptep, *dst_ptep;
		pte_t src_pte, dst_pte;
		struct page *src_page, *dst_page;

		/* Is the source page oloc'ed to a remote cpu? */
retry_source:
		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
		if (src_ptep == NULL)
			break;
		src_pte = *src_ptep;
		if (!hv_pte_get_present(src_pte) ||
		    !hv_pte_get_readable(src_pte) ||
		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
			break;
		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
			break;
		src_page = pfn_to_page(pte_pfn(src_pte));
		get_page(src_page);
		/* the PTE changed while we took the reference: start over */
		if (pte_val(src_pte) != pte_val(*src_ptep)) {
			put_page(src_page);
			goto retry_source;
		}
		if (pte_huge(src_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = pte_pfn(src_pte);
			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			src_pte = pfn_pte(pfn, src_pte);
			src_pte = pte_mksmall(src_pte);
		}

		/* Is the destination page writable? */
retry_dest:
		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
		if (dst_ptep == NULL) {
			put_page(src_page);
			break;
		}
		dst_pte = *dst_ptep;
		if (!hv_pte_get_present(dst_pte) ||
		    !hv_pte_get_writable(dst_pte)) {
			put_page(src_page);
			break;
		}
		dst_page = pfn_to_page(pte_pfn(dst_pte));
		if (dst_page == src_page) {
			/*
			 * Source and dest are on the same page; this
			 * potentially exposes us to incoherence if any
			 * part of src and dest overlap on a cache line.
			 * Just give up rather than trying to be precise.
			 */
			put_page(src_page);
			break;
		}
		get_page(dst_page);
		/* same re-validation as for the source PTE */
		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
			put_page(dst_page);
			goto retry_dest;
		}
		if (pte_huge(dst_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = pte_pfn(dst_pte);
			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			dst_pte = pfn_pte(pfn, dst_pte);
			dst_pte = pte_mksmall(dst_pte);
		}

		/* All looks good: create a cachable PTE and copy from it */
		/* the chunk may not cross a page boundary on either side */
		copy_size = len;
		bytes_left_on_page =
			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		bytes_left_on_page =
			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);

		/* Release the pages */
		put_page(dst_page);
		put_page(src_page);

		/* Continue on the next page */
		dest += copy_size;
		source += copy_size;
		len -= copy_size;
	}

	/* everything the fast path did not handle goes to the fallback */
	return func(dest, source, len);
}
/*
 * Build a valid p2m table entry that points at machine frame @mfn.
 *
 * @mfn:   machine frame number the entry refers to.
 * @mattr: value stored in the entry's memory attribute field.
 * @t:     p2m type recorded in the entry; it also selects the xn and
 *         write bits (see the switch below).
 *
 * Returns the fully populated entry by value.  The access flag, read,
 * table and valid bits are always set.
 */
static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
                               p2m_type_t t)
{
    paddr_t pa = ((paddr_t) mfn) << PAGE_SHIFT;
    /* xn and write bit will be defined in the switch */
    lpae_t e = (lpae_t) {
        .p2m.af = 1,
        .p2m.sh = LPAE_SH_OUTER,
        .p2m.read = 1,
        .p2m.mattr = mattr,
        .p2m.table = 1,
        .p2m.valid = 1,
        .p2m.type = t,
    };

    BUILD_BUG_ON(p2m_max_real_type > (1 << 4));

    switch (t)
    {
    case p2m_ram_rw:
        e.p2m.xn = 0;
        e.p2m.write = 1;
        break;

    case p2m_ram_ro:
        e.p2m.xn = 0;
        e.p2m.write = 0;
        break;

    case p2m_map_foreign:
    case p2m_grant_map_rw:
    case p2m_mmio_direct:
        e.p2m.xn = 1;
        e.p2m.write = 1;
        break;

    case p2m_grant_map_ro:
    case p2m_invalid:
        e.p2m.xn = 1;
        e.p2m.write = 0;
        break;

    case p2m_max_real_type:
        BUG();
        break;
    }

    /* The address must be page aligned and fit within PADDR_MASK. */
    ASSERT(!(pa & ~PAGE_MASK));
    ASSERT(!(pa & ~PADDR_MASK));

    e.bits |= pa;

    return e;
}

/*
 * Allocate a new page table page and hook it in via the given entry.
 *
 * @d:     domain whose p2m the new table belongs to; the page is added to
 *         that p2m's page list.
 * @entry: currently invalid entry that will be pointed at the new,
 *         zero-filled table page.
 *
 * Returns 0 on success or -ENOMEM if no page could be allocated.
 */
static int p2m_create_table(struct domain *d,
                            lpae_t *entry)
{
    struct p2m_domain *p2m = &d->arch.p2m;
    struct page_info *page;
    void *p;
    lpae_t pte;

    BUG_ON(entry->p2m.valid);

    page = alloc_domheap_page(NULL, 0);
    if ( page == NULL )
        return -ENOMEM;

    page_list_add(page, &p2m->pages);

    /* Start from an empty table: every entry in it is invalid. */
    p = __map_domain_page(page);
    clear_page(p);
    unmap_domain_page(p);

    pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);

    write_pte(entry, pte);

    return 0;
}

/* Operations understood by apply_p2m_changes(). */
enum p2m_operation {
    INSERT,     /* map caller-supplied machine addresses */
    ALLOCATE,   /* allocate a fresh domheap page for every guest page */
    REMOVE,     /* clear existing entries */
    RELINQUISH, /* as REMOVE, but preemptible */
    CACHEFLUSH, /* flush_page_to_ram() every valid RAM entry */
};

/*
 * Walk the guest physical range [start_gpaddr, end_gpaddr) of domain @d one
 * page at a time and apply @op to each third-level p2m entry.
 *
 * @d:            domain whose p2m is modified.
 * @op:           what to do with each entry (see enum p2m_operation).
 * @start_gpaddr: first guest physical address of the range.
 * @end_gpaddr:   end of the range (exclusive).
 * @maddr:        INSERT only: machine address backing @start_gpaddr; it
 *                advances by PAGE_SIZE for each page mapped.
 * @mattr:        memory attribute for entries created by INSERT/ALLOCATE.
 * @t:            p2m type for entries created by INSERT/ALLOCATE.
 *
 * INSERT and ALLOCATE create missing intermediate tables; the other
 * operations skip address ranges whose intermediate tables are absent.
 *
 * Returns 0 on success, -EINVAL if the first-level table cannot be mapped,
 * -ENOMEM (or the p2m_create_table() error) on allocation failure, and
 * -EAGAIN when a RELINQUISH was preempted, in which case
 * p2m->lowest_mapped_gfn records where the walk stopped.
 *
 * NOTE(review): the TLB flush below is skipped on every "goto out" path,
 * including the -EAGAIN preemption after entries have already been cleared.
 * Confirm callers do not rely on a flush having happened in that case.
 */
static int apply_p2m_changes(struct domain *d,
                     enum p2m_operation op,
                     paddr_t start_gpaddr,
                     paddr_t end_gpaddr,
                     paddr_t maddr,
                     int mattr,
                     p2m_type_t t)
{
    int rc;
    struct p2m_domain *p2m = &d->arch.p2m;
    lpae_t *first = NULL, *second = NULL, *third = NULL;
    paddr_t addr;
    /*
     * Which table is currently mapped at each level; ~0 means "none", so
     * the first use of a level always maps it.
     */
    unsigned long cur_first_page = ~0,
                  cur_first_offset = ~0,
                  cur_second_offset = ~0;
    unsigned long count = 0;
    unsigned int flush = 0;
    bool_t populate = (op == INSERT || op == ALLOCATE);
    lpae_t pte;

    spin_lock(&p2m->lock);

    if ( d != current->domain )
        p2m_load_VTTBR(d);

    addr = start_gpaddr;
    while ( addr < end_gpaddr )
    {
        if ( cur_first_page != p2m_first_level_index(addr) )
        {
            if ( first ) unmap_domain_page(first);
            first = p2m_map_first(p2m, addr);
            if ( !first )
            {
                rc = -EINVAL;
                goto out;
            }
            cur_first_page = p2m_first_level_index(addr);
            /*
             * Any mapping further down belongs to the previous first-level
             * page.  Forget the cached offsets so that second and third
             * are remapped even if the new offsets happen to be equal to
             * the old ones (possible after skipping unmapped ranges).
             */
            cur_first_offset = cur_second_offset = ~0;
        }

        if ( !first[first_table_offset(addr)].p2m.valid )
        {
            if ( !populate )
            {
                /* Nothing mapped here: jump to the next first-level entry */
                addr = (addr + FIRST_SIZE) & FIRST_MASK;
                continue;
            }

            rc = p2m_create_table(d, &first[first_table_offset(addr)]);
            if ( rc < 0 )
            {
                printk("p2m_populate_ram: L1 failed\n");
                goto out;
            }
        }

        BUG_ON(!first[first_table_offset(addr)].p2m.valid);

        if ( cur_first_offset != first_table_offset(addr) )
        {
            if (second) unmap_domain_page(second);
            second = map_domain_page(first[first_table_offset(addr)].p2m.base);
            cur_first_offset = first_table_offset(addr);
            /* The third-level mapping hung off the old second table. */
            cur_second_offset = ~0;
        }
        /* else: second already valid */

        if ( !second[second_table_offset(addr)].p2m.valid )
        {
            if ( !populate )
            {
                /* Nothing mapped here: jump to the next second-level entry */
                addr = (addr + SECOND_SIZE) & SECOND_MASK;
                continue;
            }

            rc = p2m_create_table(d, &second[second_table_offset(addr)]);
            if ( rc < 0 ) {
                printk("p2m_populate_ram: L2 failed\n");
                goto out;
            }
        }

        BUG_ON(!second[second_table_offset(addr)].p2m.valid);

        if ( cur_second_offset != second_table_offset(addr) )
        {
            /* map third level */
            if (third) unmap_domain_page(third);
            third = map_domain_page(second[second_table_offset(addr)].p2m.base);
            cur_second_offset = second_table_offset(addr);
        }

        pte = third[third_table_offset(addr)];

        flush |= pte.p2m.valid;

        /* TODO: Handle other p2m type
         *
         * It's safe to do the put_page here because page_alloc will
         * flush the TLBs if the page is reallocated before the end of
         * this loop.
         *
         * CACHEFLUSH leaves the entry in place, so the reference held on
         * behalf of a foreign mapping must be kept in that case.
         */
        if ( op != CACHEFLUSH &&
             pte.p2m.valid && p2m_is_foreign(pte.p2m.type) )
        {
            unsigned long mfn = pte.p2m.base;

            ASSERT(mfn_valid(mfn));
            put_page(mfn_to_page(mfn));
        }

        /* Allocate a new RAM page and attach */
        switch (op) {
            case ALLOCATE:
                {
                    struct page_info *page;

                    ASSERT(!pte.p2m.valid);
                    rc = -ENOMEM;
                    page = alloc_domheap_page(d, 0);
                    if ( page == NULL ) {
                        printk("p2m_populate_ram: failed to allocate page\n");
                        goto out;
                    }

                    pte = mfn_to_p2m_entry(page_to_mfn(page), mattr, t);

                    write_pte(&third[third_table_offset(addr)], pte);
                }
                break;
            case INSERT:
                {
                    pte = mfn_to_p2m_entry(maddr >> PAGE_SHIFT, mattr, t);
                    write_pte(&third[third_table_offset(addr)], pte);
                    maddr += PAGE_SIZE;
                }
                break;
            case RELINQUISH:
            case REMOVE:
                {
                    if ( !pte.p2m.valid )
                    {
                        count++;
                        break;
                    }

                    /* Mapped pages weigh more towards the preempt check */
                    count += 0x10;

                    memset(&pte, 0x00, sizeof(pte));
                    write_pte(&third[third_table_offset(addr)], pte);
                    count++;
                }
                break;

            case CACHEFLUSH:
                {
                    if ( !pte.p2m.valid || !p2m_is_ram(pte.p2m.type) )
                        break;

                    flush_page_to_ram(pte.p2m.base);
                }
                break;
        }

        /* Preempt every 2MiB (mapped) or 32 MiB (unmapped) - arbitrary */
        if ( op == RELINQUISH && count >= 0x2000 )
        {
            if ( hypercall_preempt_check() )
            {
                p2m->lowest_mapped_gfn = addr >> PAGE_SHIFT;
                rc = -EAGAIN;
                goto out;
            }
            count = 0;
        }

        /* Got the next page */
        addr += PAGE_SIZE;
    }

    if ( flush )
    {
        /* At the beginning of the function, Xen is updating VTTBR
         * with the domain where the mappings are created. In this
         * case it's only necessary to flush TLBs on every CPUs with
         * the current VMID (our domain).
         */
        flush_tlb();
    }

    if ( op == ALLOCATE || op == INSERT )
    {
        unsigned long sgfn = paddr_to_pfn(start_gpaddr);
        unsigned long egfn = paddr_to_pfn(end_gpaddr);

        p2m->max_mapped_gfn = MAX(p2m->max_mapped_gfn, egfn);
        p2m->lowest_mapped_gfn = MIN(p2m->lowest_mapped_gfn, sgfn);
    }

    rc = 0;

out:
    if (third) unmap_domain_page(third);
    if (second) unmap_domain_page(second);
    if (first) unmap_domain_page(first);

    /* Restore the VTTBR of the domain we were running on behalf of. */
    if ( d != current->domain )
        p2m_load_VTTBR(current->domain);

    spin_unlock(&p2m->lock);

    return rc;
}
int j4fs_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; loff_t offset = (loff_t) page->index << PAGE_CACHE_SHIFT; struct inode *inode; unsigned long end_index; char *buffer; int nWritten = 0; unsigned nBytes; j4fs_ctrl ctl; int nErr; if(j4fs_panic==1) { J4FS_T(J4FS_TRACE_ALWAYS,("%s %d: j4fs panic\n",__FUNCTION__,__LINE__)); return -ENOSPC; } J4FS_T(J4FS_TRACE_FS,("%s %d\n",__FUNCTION__,__LINE__)); if (!mapping) BUG(); inode = mapping->host; if (!inode) BUG(); if (offset > inode->i_size) { J4FS_T(J4FS_TRACE_FS, ("j4fs_writepage at %08x, inode size = %08x!!!\n", (unsigned)(page->index << PAGE_CACHE_SHIFT), (unsigned)inode->i_size)); J4FS_T(J4FS_TRACE_FS, (" -> don't care!!\n")); unlock_page(page); return 0; } end_index = inode->i_size >> PAGE_CACHE_SHIFT; /* easy case */ if (page->index < end_index) nBytes = PAGE_CACHE_SIZE; else nBytes = inode->i_size & (PAGE_CACHE_SIZE - 1); get_page(page); buffer = kmap(page); j4fs_GrossLock(); J4FS_T(J4FS_TRACE_FS, ("j4fs_writepage: index=%08x,nBytes=%08x,inode.i_size=%05x\n", (unsigned)(page->index << PAGE_CACHE_SHIFT), nBytes,(int)inode->i_size)); // write file ctl.buffer=buffer; ctl.count=nBytes; ctl.id=inode->i_ino; ctl.index=offset; nErr=fsd_write(&ctl); if(nErr==J4FS_RETRY_WRITE) nErr=fsd_write(&ctl); J4FS_T(J4FS_TRACE_FS, ("j4fs_writepage: index=%08x,nBytes=%08x,inode.i_size=%05x\n", (unsigned)(page->index << PAGE_CACHE_SHIFT), nBytes,(int)inode->i_size)); j4fs_GrossUnlock(); kunmap(page); SetPageUptodate(page); unlock_page(page); put_page(page); return (nWritten == nBytes) ? 0 : -ENOSPC; }
/* filemap_write_and_wait(inode->i_mapping); */
    /* Start writeback of any cached pages and, unless that reported -EIO,
     * wait for it to finish. */
    if (   inode->i_mapping->nrpages
        && filemap_fdatawrite(inode->i_mapping) != -EIO)
        filemap_fdatawait(inode->i_mapping);
#endif
    rc = vboxCallClose(&client_handle, &sf_g->map, sf_r->handle);
    /* A failed close is only logged; the local state is torn down anyway. */
    if (RT_FAILURE(rc))
        LogFunc(("vboxCallClose failed rc=%Rrc\n", rc));

    kfree(sf_r);
    sf_i->file = NULL;
    sf_i->handle = SHFL_HANDLE_NIL;
    file->private_data = NULL;
    return 0;
}

/*
 * Page fault handler for memory-mapped regular files.
 *
 * Allocates a fresh page, fills it with file data through
 * sf_reg_read_aux() and zero-fills whatever the read did not cover.
 *
 * The entry point has three shapes depending on the kernel version:
 *  - newer than 2.6.25: a fault handler that stores the page in vmf->page
 *    and returns 0 or a VM_FAULT_* code;
 *  - 2.6.0 up to 2.6.25: a nopage handler that returns the page (or a
 *    NOPAGE_* value) and reports the fault type through *type;
 *  - older than 2.6.0: a nopage handler without the type out-parameter,
 *    so SET_TYPE() expands to nothing.
 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
static int sf_reg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
static struct page *sf_reg_nopage(struct vm_area_struct *vma, unsigned long vaddr, int *type)
# define SET_TYPE(t) *type = (t)
#else /* LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0) */
static struct page *sf_reg_nopage(struct vm_area_struct *vma, unsigned long vaddr, int unused)
# define SET_TYPE(t)
#endif
{
    struct page *page;
    char *buf;
    loff_t off;
    /* In: number of bytes requested; out: updated by sf_reg_read_aux(). */
    uint32_t nread = PAGE_SIZE;
    int err;
    struct file *file = vma->vm_file;
    struct inode *inode = GET_F_DENTRY(file)->d_inode;
    struct sf_glob_info *sf_g = GET_GLOB_INFO(inode->i_sb);
    struct sf_reg_info *sf_r = file->private_data;

    TRACE();
    /* NOTE(review): the first branch compares a page offset (vmf->pgoff)
     * with an address (vma->vm_end) -- confirm this is the intended bound. */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
    if (vmf->pgoff > vma->vm_end)
        return VM_FAULT_SIGBUS;
#else
    if (vaddr > vma->vm_end)
    {
        SET_TYPE(VM_FAULT_SIGBUS);
        return NOPAGE_SIGBUS;
    }
#endif

    /* Don't use GFP_HIGHUSER as long as sf_reg_read_aux() calls vboxCallRead()
     * which works on virtual addresses. On Linux we cannot reliably determine
     * the physical address for high memory, see rtR0MemObjNativeLockKernel(). */
    page = alloc_page(GFP_USER);
    if (!page) {
        LogRelFunc(("failed to allocate page\n"));
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
        return VM_FAULT_OOM;
#else
        SET_TYPE(VM_FAULT_OOM);
        return NOPAGE_OOM;
#endif
    }

    buf = kmap(page);
    /* Byte offset in the file of the page being faulted in.
     * NOTE(review): the shift is done in the width of pgoff / vm_pgoff
     * before the result is widened to loff_t -- verify this cannot
     * truncate for large files on 32-bit builds. */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
    off = (vmf->pgoff << PAGE_SHIFT);
#else
    off = (vaddr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
#endif
    err = sf_reg_read_aux(__func__, sf_g, sf_r, buf, &nread, off);
    if (err)
    {
        /* Read failed: undo the mapping and drop the page. */
        kunmap(page);
        put_page(page);
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
        return VM_FAULT_SIGBUS;
#else
        SET_TYPE(VM_FAULT_SIGBUS);
        return NOPAGE_SIGBUS;
#endif
    }

    BUG_ON (nread > PAGE_SIZE);
    if (!nread)
    {
        /* Nothing was read: hand out a fully cleared page. */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
        clear_user_page(page_address(page), vmf->pgoff, page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
        clear_user_page(page_address(page), vaddr, page);
#else
        clear_user_page(page_address(page), vaddr);
#endif
    }
    else
        /* Short read: zero the remainder of the page. */
        memset(buf + nread, 0, PAGE_SIZE - nread);

    flush_dcache_page(page);
    kunmap(page);
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 25)
    vmf->page = page;
    return 0;
#else
    SET_TYPE(VM_FAULT_MAJOR);
    return page;
#endif
}
/*
 * Remove the page at guest frame @gmfn from domain @d's physmap and drop
 * the references that kept it allocated to the domain.
 *
 * Returns 1 when the frame was removed (or, on x86, was a paging or
 * direct-MMIO entry that has been cleared), and 0 when the frame is
 * invalid, cannot be unshared, or does not belong to @d.
 *
 * Every return path releases the gfn with put_gfn().
 */
int guest_remove_page(struct domain *d, unsigned long gmfn)
{
    struct page_info *page;
#ifdef CONFIG_X86
    p2m_type_t p2mt;
#endif
    unsigned long mfn;

#ifdef CONFIG_X86
    mfn = mfn_x(get_gfn_query(d, gmfn, &p2mt));
    if ( unlikely(p2m_is_paging(p2mt)) )
    {
        guest_physmap_remove_page(d, gmfn, mfn, 0);
        put_gfn(d, gmfn);
        /* If the page hasn't yet been paged out, there is an
         * actual page that needs to be released. */
        if ( p2mt == p2m_ram_paging_out )
        {
            ASSERT(mfn_valid(mfn));
            page = mfn_to_page(mfn);
            if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
                put_page(page);
        }
        p2m_mem_paging_drop_page(d, gmfn, p2mt);
        return 1;
    }
    /* Direct MMIO entries have no page to release; just clear the entry. */
    if ( p2mt == p2m_mmio_direct )
    {
        clear_mmio_p2m_entry(d, gmfn, _mfn(mfn));
        put_gfn(d, gmfn);
        return 1;
    }
#else
    mfn = gmfn_to_mfn(d, gmfn);
#endif
    if ( unlikely(!mfn_valid(mfn)) )
    {
        put_gfn(d, gmfn);
        gdprintk(XENLOG_INFO, "Domain %u page number %lx invalid\n",
                d->domain_id, gmfn);
        return 0;
    }

#ifdef CONFIG_X86
    if ( p2m_is_shared(p2mt) )
    {
        /* Unshare the page, bail out on error. We unshare because
         * we might be the only one using this shared page, and we
         * need to trigger proper cleanup. Once done, this is
         * like any other page. */
        if ( mem_sharing_unshare_page(d, gmfn, 0) )
        {
            put_gfn(d, gmfn);
            (void)mem_sharing_notify_enomem(d, gmfn, 0);
            return 0;
        }
        /* Maybe the mfn changed */
        mfn = mfn_x(get_gfn_query_unlocked(d, gmfn, &p2mt));
        ASSERT(!p2m_is_shared(p2mt));
    }
#endif /* CONFIG_X86 */

    /* Take our own reference; this fails if the page is not owned by d. */
    page = mfn_to_page(mfn);
    if ( unlikely(!get_page(page, d)) )
    {
        put_gfn(d, gmfn);
        gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
        return 0;
    }

    /* A pinned page holds an extra page-and-type reference; drop it. */
    if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
        put_page_and_type(page);

    /*
     * With the lack of an IOMMU on some platforms, domains with DMA-capable
     * device must retrieve the same pfn when the hypercall populate_physmap
     * is called.
     *
     * For this purpose (and to match populate_physmap() behavior), the page
     * is kept allocated.
     */
    if ( !is_domain_direct_mapped(d) &&
         test_and_clear_bit(_PGC_allocated, &page->count_info) )
        put_page(page);

    guest_physmap_remove_page(d, gmfn, mfn, 0);

    /* Drop the reference taken by get_page() above. */
    put_page(page);
    put_gfn(d, gmfn);

    return 1;
}
/*
 * Drop the caller's reference on @page with put_page(), first calling
 * free_swap_cache() so that any swap cache associated with the page is
 * also freed if this is the last user of the page.
 *
 * The order matters: free_swap_cache() runs while the caller's reference
 * is still held.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	put_page(page);
}
/*
 * Translate guest address @ga to a guest frame number by walking the guest
 * page tables rooted at @cr3, using @p2m to reach the table pages.
 *
 * @v:          vcpu passed through to guest_walk_tables().
 * @p2m:        p2m used to look up the top-level table and the final frame.
 * @cr3:        guest CR3 value locating the top-level table.
 * @ga:         guest address to translate.
 * @pfec:       page fault error code; pfec[0] is the input to the walk and
 *              is updated to describe why a translation failed.
 * @page_order: if non-NULL, receives guest_walk_to_page_order() of the
 *              walk on success.
 *
 * Returns the gfn on success, or INVALID_GFN with pfec[0] updated.
 */
unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
    paddr_t ga, uint32_t *pfec, unsigned int *page_order)
{
    uint32_t missing;
    mfn_t top_mfn;
    void *top_map;
    p2m_type_t p2mt;
    walk_t gw;
    unsigned long top_gfn;
    struct page_info *top_page;

    /* Get the top-level table's MFN */
    top_gfn = cr3 >> PAGE_SHIFT;
    top_page = get_page_from_gfn_p2m(p2m->domain, p2m, top_gfn,
                                     &p2mt, NULL, P2M_ALLOC | P2M_UNSHARE);
    /* The top-level table is paged out: request it and report that. */
    if ( p2m_is_paging(p2mt) )
    {
        ASSERT(p2m_is_hostp2m(p2m));
        pfec[0] = PFEC_page_paged;
        if ( top_page )
            put_page(top_page);
        p2m_mem_paging_populate(p2m->domain, cr3 >> PAGE_SHIFT);
        return INVALID_GFN;
    }
    /* Still shared despite P2M_UNSHARE: report it to the caller. */
    if ( p2m_is_shared(p2mt) )
    {
        pfec[0] = PFEC_page_shared;
        if ( top_page )
            put_page(top_page);
        return INVALID_GFN;
    }
    if ( !top_page )
    {
        pfec[0] &= ~PFEC_page_present;
        return INVALID_GFN;
    }
    top_mfn = _mfn(page_to_mfn(top_page));

    /* Map the top-level table and call the tree-walker */
    ASSERT(mfn_valid(mfn_x(top_mfn)));
    top_map = map_domain_page(mfn_x(top_mfn));
#if GUEST_PAGING_LEVELS == 3
    /* Apply the in-page offset taken from cr3 (bits below PAGE_MASK,
     * excluding the low five bits). */
    top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
    missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
    unmap_domain_page(top_map);
    put_page(top_page);

    /* Interpret the answer */
    if ( missing == 0 )
    {
        gfn_t gfn = guest_l1e_get_gfn(gw.l1e);
        struct page_info *page;
        /* Look the final frame up only to learn its p2m type; the page
         * reference itself is dropped straight away. */
        page = get_page_from_gfn_p2m(p2m->domain, p2m, gfn_x(gfn), &p2mt,
                                     NULL, P2M_ALLOC | P2M_UNSHARE);
        if ( page )
            put_page(page);
        if ( p2m_is_paging(p2mt) )
        {
            ASSERT(p2m_is_hostp2m(p2m));
            pfec[0] = PFEC_page_paged;
            p2m_mem_paging_populate(p2m->domain, gfn_x(gfn));
            return INVALID_GFN;
        }
        if ( p2m_is_shared(p2mt) )
        {
            pfec[0] = PFEC_page_shared;
            return INVALID_GFN;
        }

        if ( page_order )
            *page_order = guest_walk_to_page_order(&gw);

        return gfn_x(gfn);
    }

    /* The walk failed: translate the missing flags into pfec bits.  The
     * paged and shared cases overwrite pfec[0] entirely. */
    if ( missing & _PAGE_PRESENT )
        pfec[0] &= ~PFEC_page_present;

    if ( missing & _PAGE_INVALID_BITS )
        pfec[0] |= PFEC_reserved_bit;

    if ( missing & _PAGE_PAGED )
        pfec[0] = PFEC_page_paged;

    if ( missing & _PAGE_SHARED )
        pfec[0] = PFEC_page_shared;

    return INVALID_GFN;
}
static int bgmac_dma_rx_read(struct bgmac *bgmac, struct bgmac_dma_ring *ring, int weight) { u32 end_slot; int handled = 0; end_slot = bgmac_read(bgmac, ring->mmio_base + BGMAC_DMA_RX_STATUS); end_slot &= BGMAC_DMA_RX_STATDPTR; end_slot -= ring->index_base; end_slot &= BGMAC_DMA_RX_STATDPTR; end_slot /= sizeof(struct bgmac_dma_desc); while (ring->start != end_slot) { struct device *dma_dev = bgmac->core->dma_dev; struct bgmac_slot_info *slot = &ring->slots[ring->start]; struct bgmac_rx_header *rx = slot->buf + BGMAC_RX_BUF_OFFSET; struct sk_buff *skb; void *buf = slot->buf; dma_addr_t dma_addr = slot->dma_addr; u16 len, flags; do { /* Prepare new skb as replacement */ if (bgmac_dma_rx_skb_for_slot(bgmac, slot)) { bgmac_dma_rx_poison_buf(dma_dev, slot); break; } /* Unmap buffer to make it accessible to the CPU */ dma_unmap_single(dma_dev, dma_addr, BGMAC_RX_BUF_SIZE, DMA_FROM_DEVICE); /* Get info from the header */ len = le16_to_cpu(rx->len); flags = le16_to_cpu(rx->flags); /* Check for poison and drop or pass the packet */ if (len == 0xdead && flags == 0xbeef) { bgmac_err(bgmac, "Found poisoned packet at slot %d, DMA issue!\n", ring->start); put_page(virt_to_head_page(buf)); break; } if (len > BGMAC_RX_ALLOC_SIZE) { bgmac_err(bgmac, "Found oversized packet at slot %d, DMA issue!\n", ring->start); put_page(virt_to_head_page(buf)); break; } /* Omit CRC. 
*/ len -= ETH_FCS_LEN; skb = build_skb(buf, BGMAC_RX_ALLOC_SIZE); if (unlikely(!skb)) { bgmac_err(bgmac, "build_skb failed\n"); put_page(virt_to_head_page(buf)); break; } skb_put(skb, BGMAC_RX_FRAME_OFFSET + BGMAC_RX_BUF_OFFSET + len); skb_pull(skb, BGMAC_RX_FRAME_OFFSET + BGMAC_RX_BUF_OFFSET); skb_checksum_none_assert(skb); skb->protocol = eth_type_trans(skb, bgmac->net_dev); napi_gro_receive(&bgmac->napi, skb); handled++; } while (0); bgmac_dma_rx_setup_desc(bgmac, ring, ring->start); if (++ring->start >= BGMAC_RX_RING_SLOTS) ring->start = 0; if (handled >= weight) /* Should never be greater */ break; } bgmac_dma_rx_update_index(bgmac, ring); return handled; }