static int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * Do not try to use a buffer we cannot immediately lock,
         * but sleep anyway to prevent a livelock.  The code will
         * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
                atomic_subtract_int(&bp->b_refs, 1);
        } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
                   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
                   bp->b_vp != info->vp ||
                   nvtruncbuf_bp_trunc_cmp(bp, data)) {
                BUF_UNLOCK(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
                brelse(bp);
        }
        lwkt_yield();
        return(1);
}
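/*
 * Illustrative sketch, not part of the original nvtruncbuf_bp_trunc() code:
 * the callback above uses a non-blocking lock attempt and, on contention,
 * a blocking lock/unlock purely to sleep until the holder releases the
 * buffer, leaving the retry to the enclosing scan loop.  The userland
 * analogy below uses plain pthreads with made-up names (scan_item,
 * try_process_item) to show that "don't block the scan, but do sleep to
 * avoid a livelock" shape under those assumptions.
 */
#include <pthread.h>
#include <stdbool.h>

struct scan_item {
        pthread_mutex_t lock;
        bool            done;
};

/*
 * Returns true if the item was processed, false if the caller should
 * rescan.  On contention we block until the holder drops the lock, release
 * it immediately, and report failure, mirroring the BUF_LOCK(LK_NOWAIT)
 * followed by LK_SLEEPFAIL sequence above.
 */
static bool
try_process_item(struct scan_item *item)
{
        if (pthread_mutex_trylock(&item->lock) != 0) {
                pthread_mutex_lock(&item->lock);        /* sleep, don't spin */
                pthread_mutex_unlock(&item->lock);
                return (false);                         /* caller rescans */
        }
        item->done = true;                              /* do the real work */
        pthread_mutex_unlock(&item->lock);
        return (true);
}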
static int
ttm_bo_vm_fault(vm_object_t vm_obj, vm_ooffset_t offset,
                int prot, vm_page_t *mres)
{
        struct ttm_buffer_object *bo = vm_obj->handle;
        struct ttm_bo_device *bdev = bo->bdev;
        struct ttm_tt *ttm = NULL;
        vm_page_t m, m1, oldm;
        int ret;
        int retval = VM_PAGER_OK;
        struct ttm_mem_type_manager *man =
                &bdev->man[bo->mem.mem_type];

        vm_object_pip_add(vm_obj, 1);
        oldm = *mres;
        if (oldm != NULL) {
                vm_page_remove(oldm);
                *mres = NULL;
        } else
                oldm = NULL;
retry:
        VM_OBJECT_WUNLOCK(vm_obj);
        m = NULL;

reserve:
        ret = ttm_bo_reserve(bo, false, false, false, 0);
        if (unlikely(ret != 0)) {
                if (ret == -EBUSY) {
                        lwkt_yield();
                        goto reserve;
                }
        }

        if (bdev->driver->fault_reserve_notify) {
                ret = bdev->driver->fault_reserve_notify(bo);
                switch (ret) {
                case 0:
                        break;
                case -EBUSY:
                case -ERESTARTSYS:
                case -EINTR:
                        lwkt_yield();
                        goto reserve;
                default:
                        retval = VM_PAGER_ERROR;
                        goto out_unlock;
                }
        }

        /*
         * Wait for buffer data in transit, due to a pipelined
         * move.
         */
        lockmgr(&bdev->fence_lock, LK_EXCLUSIVE);
        if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
                /*
                 * Here, the behavior differs between Linux and FreeBSD.
                 *
                 * On Linux, the wait is interruptible (3rd argument to
                 * ttm_bo_wait).  There must be some mechanism to resume
                 * page fault handling, once the signal is processed.
                 *
                 * On FreeBSD, the wait is uninterruptible.  This is not a
                 * problem as we can't end up with an unkillable process
                 * here, because the wait will eventually time out.
                 *
                 * An example of this situation is the Xorg process
                 * which uses SIGALRM internally.  The signal could
                 * interrupt the wait, causing the page fault to fail
                 * and the process to receive SIGSEGV.
                 */
                ret = ttm_bo_wait(bo, false, false, false);
                lockmgr(&bdev->fence_lock, LK_RELEASE);
                if (unlikely(ret != 0)) {
                        retval = VM_PAGER_ERROR;
                        goto out_unlock;
                }
        } else
                lockmgr(&bdev->fence_lock, LK_RELEASE);

        ret = ttm_mem_io_lock(man, true);
        if (unlikely(ret != 0)) {
                retval = VM_PAGER_ERROR;
                goto out_unlock;
        }
        ret = ttm_mem_io_reserve_vm(bo);
        if (unlikely(ret != 0)) {
                retval = VM_PAGER_ERROR;
                goto out_io_unlock;
        }

        /*
         * Strictly, we're not allowed to modify vma->vm_page_prot here,
         * since the mmap_sem is only held in read mode.  However, we
         * modify only the caching bits of vma->vm_page_prot and
         * consider those bits protected by the bo->mutex, as we should
         * be the only writers.  There shouldn't really be any readers
         * of these bits except within vm_insert_mixed()?  fork?
         *
         * TODO: Add a list of vmas to the bo, and change the
         * vma->vm_page_prot when the object changes caching policy, with
         * the correct locks held.
         */
        if (!bo->mem.bus.is_iomem) {
                /* Allocate all pages at once, most common usage */
                ttm = bo->ttm;
                if (ttm->bdev->driver->ttm_tt_populate(ttm)) {
                        retval = VM_PAGER_ERROR;
                        goto out_io_unlock;
                }
        }

        if (bo->mem.bus.is_iomem) {
                m = vm_phys_fictitious_to_vm_page(bo->mem.bus.base +
                    bo->mem.bus.offset + offset);
                pmap_page_set_memattr(m, ttm_io_prot(bo->mem.placement));
        } else {
                ttm = bo->ttm;
                m = ttm->pages[OFF_TO_IDX(offset)];
                if (unlikely(!m)) {
                        retval = VM_PAGER_ERROR;
                        goto out_io_unlock;
                }
                pmap_page_set_memattr(m,
                    (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
                    VM_MEMATTR_WRITE_BACK : ttm_io_prot(bo->mem.placement));
        }

        VM_OBJECT_WLOCK(vm_obj);
        if ((m->flags & PG_BUSY) != 0) {
#if 0
                vm_page_sleep(m, "ttmpbs");
#endif
                ttm_mem_io_unlock(man);
                ttm_bo_unreserve(bo);
                goto retry;
        }
        m->valid = VM_PAGE_BITS_ALL;
        *mres = m;
        m1 = vm_page_lookup(vm_obj, OFF_TO_IDX(offset));
        if (m1 == NULL) {
                vm_page_insert(m, vm_obj, OFF_TO_IDX(offset));
        } else {
                KASSERT(m == m1,
                    ("inconsistent insert bo %p m %p m1 %p offset %jx",
                    bo, m, m1, (uintmax_t)offset));
        }
        vm_page_busy_try(m, FALSE);

        if (oldm != NULL) {
                vm_page_free(oldm);
        }

out_io_unlock1:
        ttm_mem_io_unlock(man);
out_unlock1:
        ttm_bo_unreserve(bo);
        vm_object_pip_wakeup(vm_obj);
        return (retval);

out_io_unlock:
        VM_OBJECT_WLOCK(vm_obj);
        goto out_io_unlock1;

out_unlock:
        VM_OBJECT_WLOCK(vm_obj);
        goto out_unlock1;
}
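/*
 * Illustrative sketch, not part of the original ttm_bo_vm_fault() code:
 * the fault handler above retries ttm_bo_reserve() and yields via
 * lwkt_yield() whenever it sees -EBUSY rather than sleeping.  The
 * hypothetical helper below shows that retry-and-yield shape in userland
 * terms, assuming a resource_reserve() routine with the same -EBUSY
 * convention; both names are invented for the example.
 */
#include <errno.h>
#include <sched.h>

extern int resource_reserve(void *res);        /* assumed: returns 0 or -EBUSY */

static int
reserve_with_yield(void *res)
{
        int ret;

        for (;;) {
                ret = resource_reserve(res);
                if (ret != -EBUSY)
                        return (ret);   /* success or a real error */
                sched_yield();          /* give the current holder a chance */
        }
}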
/*
 * vm_contig_pg_alloc:
 *
 * Allocate contiguous pages from the VM.  This function does not
 * map the allocated pages into the kernel map, otherwise it is
 * impossible to make large allocations (i.e. >2G).
 *
 * Malloc()'s data structures have been used for collection of
 * statistics and for allocations of less than a page.
 */
static int
vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
                   unsigned long alignment, unsigned long boundary,
                   int mflags)
{
        int i, q, start, pass;
        vm_offset_t phys;
        vm_page_t pga = vm_page_array;
        vm_page_t m;
        int pqtype;

        size = round_page(size);
        if (size == 0)
                panic("vm_contig_pg_alloc: size must not be 0");
        if ((alignment & (alignment - 1)) != 0)
                panic("vm_contig_pg_alloc: alignment must be a power of 2");
        if ((boundary & (boundary - 1)) != 0)
                panic("vm_contig_pg_alloc: boundary must be a power of 2");

        /*
         * See if we can get the pages from the contiguous page reserve
         * alist.  The returned pages will be allocated and wired but not
         * busied.
         */
        m = vm_page_alloc_contig(low, high, alignment, boundary, size);
        if (m)
                return (m - &pga[0]);

        /*
         * Three passes (0, 1, 2).  Each pass scans the VM page list for
         * free or cached pages.  After each pass, if the entire scan failed,
         * we attempt to flush inactive pages and reset the start index back
         * to 0.  For passes 1 and 2 we also attempt to flush active pages.
         */
        start = 0;
        for (pass = 0; pass < 3; pass++) {
                /*
                 * Find first page in array that is free, within range,
                 * aligned, and such that the boundary won't be crossed.
                 */
again:
                for (i = start; i < vmstats.v_page_count; i++) {
                        m = &pga[i];
                        phys = VM_PAGE_TO_PHYS(m);
                        pqtype = m->queue - m->pc;
                        if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
                            (phys >= low) && (phys < high) &&
                            ((phys & (alignment - 1)) == 0) &&
                            (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) &&
                            m->busy == 0 && m->wire_count == 0 &&
                            m->hold_count == 0 &&
                            (m->flags & (PG_BUSY | PG_NEED_COMMIT)) == 0) {
                                break;
                        }
                }

                /*
                 * If we cannot find a page in the given range, or we have
                 * crossed the boundary, call vm_contig_pg_clean() to flush
                 * the queues and return them to a normal state.
                 */
                if ((i == vmstats.v_page_count) ||
                    ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
                        /*
                         * Best effort flush of all inactive pages.
                         * This is quite quick, for now stall all
                         * callers, even if they've specified M_NOWAIT.
                         */
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                vm_contig_pg_clean(PQ_INACTIVE + q,
                                                   vmstats.v_inactive_count);
                                lwkt_yield();
                        }

                        /*
                         * Best effort flush of active pages.
                         *
                         * This is very, very slow.
                         * Only do this if the caller has agreed to M_WAITOK.
                         *
                         * If enough pages are flushed, we may succeed on
                         * the next (final) pass; if not, the caller,
                         * contigmalloc(), will fail in the index < 0 case.
                         */
                        if (pass > 0 && (mflags & M_WAITOK)) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        vm_contig_pg_clean(PQ_ACTIVE + q,
                                                   vmstats.v_active_count);
                                }
                                lwkt_yield();
                        }

                        /*
                         * We're already too high in the address space
                         * to succeed, reset to 0 for the next iteration.
                         */
                        start = 0;
                        continue;       /* next pass */
                }
                start = i;

                /*
                 * Check that successive pages are contiguous and free.
                 *
                 * (still in critical section)
                 */
                for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
                        m = &pga[i];
                        pqtype = m->queue - m->pc;
                        if ((VM_PAGE_TO_PHYS(&m[0]) !=
                             (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) ||
                            ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) ||
                            m->busy || m->wire_count ||
                            m->hold_count ||
                            (m->flags & (PG_BUSY | PG_NEED_COMMIT))) {
                                start++;
                                goto again;
                        }
                }

                /*
                 * Try to allocate the pages, wiring them as we go.
                 *
                 * (still in critical section)
                 */
                for (i = start; i < (start + size / PAGE_SIZE); i++) {
                        m = &pga[i];

                        if (vm_page_busy_try(m, TRUE)) {
                                vm_contig_pg_free(start,
                                                  (i - start) * PAGE_SIZE);
                                start++;
                                goto again;
                        }
                        pqtype = m->queue - m->pc;
                        if (pqtype == PQ_CACHE &&
                            m->hold_count == 0 &&
                            m->wire_count == 0 &&
                            (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0) {
                                vm_page_protect(m, VM_PROT_NONE);
                                KKASSERT((m->flags & PG_MAPPED) == 0);
                                KKASSERT(m->dirty == 0);
                                vm_page_free(m);
                                --i;
                                continue;       /* retry the page */
                        }
                        if (pqtype != PQ_FREE || m->hold_count) {
                                vm_page_wakeup(m);
                                vm_contig_pg_free(start,
                                                  (i - start) * PAGE_SIZE);
                                start++;
                                goto again;
                        }
                        KKASSERT((m->valid & m->dirty) == 0);
                        KKASSERT(m->wire_count == 0);
                        KKASSERT(m->object == NULL);
                        vm_page_unqueue_nowakeup(m);
                        m->valid = VM_PAGE_BITS_ALL;
                        if (m->flags & PG_ZERO)
                                vm_page_zero_count--;
                        KASSERT(m->dirty == 0,
                                ("vm_contig_pg_alloc: page %p was dirty", m));
                        KKASSERT(m->wire_count == 0);
                        KKASSERT(m->busy == 0);

                        /*
                         * Clear all flags except PG_BUSY, PG_ZERO, and
                         * PG_WANTED, then unbusy the now allocated page.
                         */
                        vm_page_flag_clear(m, ~(PG_BUSY | PG_SBUSY |
                                                PG_ZERO | PG_WANTED));
                        vm_page_wire(m);
                        vm_page_wakeup(m);
                }

                /*
                 * Our job is done, return the index into vm_page_array.
                 */
                return (start); /* aka &pga[start] */
        }

        /*
         * Failed.
         */
        return (-1);
}
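/*
 * Illustrative sketch, not part of the original vm_contig_pg_alloc() code:
 * the scan above accepts a candidate run only if its physical address is
 * aligned, via (phys & (alignment - 1)) == 0, and the run does not cross
 * the requested boundary, via ((phys ^ (phys + size - 1)) & ~(boundary - 1))
 * == 0.  The standalone program below simply exercises those two predicates
 * on a couple of made-up addresses; all values are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static int
run_is_acceptable(uint64_t phys, uint64_t size, uint64_t alignment,
                  uint64_t boundary)
{
        if (phys & (alignment - 1))
                return (0);     /* start is not aligned */
        if ((phys ^ (phys + size - 1)) & ~(boundary - 1))
                return (0);     /* first and last byte straddle a boundary */
        return (1);
}

int
main(void)
{
        /* 64KB run at 0x10000 with a 64KB boundary: fits exactly, accepted */
        printf("%d\n", run_is_acceptable(0x10000, 0x10000, 0x1000, 0x10000));
        /* Same run shifted up one page now crosses 0x20000, rejected */
        printf("%d\n", run_is_acceptable(0x11000, 0x10000, 0x1000, 0x10000));
        return (0);
}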
/*
 * No requirements.
 */
static int
do_vmtotal(SYSCTL_HANDLER_ARGS)
{
        struct vmtotal total;
        struct vmtotal *totalp;
        struct vm_object marker;
        vm_object_t object;
        long collisions;
        int burst;

        bzero(&total, sizeof(total));
        totalp = &total;
        bzero(&marker, sizeof(marker));
        marker.type = OBJT_MARKER;
        collisions = vmobj_token.t_collisions;

#if 0
        /*
         * Mark all objects as inactive.
         */
        lwkt_gettoken(&vmobj_token);
        for (object = TAILQ_FIRST(&vm_object_list);
             object != NULL;
             object = TAILQ_NEXT(object, object_list)) {
                if (object->type == OBJT_MARKER)
                        continue;
                vm_object_clear_flag(object, OBJ_ACTIVE);
        }
        lwkt_reltoken(&vmobj_token);
#endif

        /*
         * Calculate process statistics.
         */
        allproc_scan(do_vmtotal_callback, totalp);

        /*
         * Calculate object memory usage statistics.
         */
        lwkt_gettoken(&vmobj_token);
        TAILQ_INSERT_HEAD(&vm_object_list, &marker, object_list);
        burst = 0;

        for (object = TAILQ_FIRST(&vm_object_list);
             object != NULL;
             object = TAILQ_NEXT(object, object_list)) {
                /*
                 * Devices, like /dev/mem, will badly skew our totals.
                 * Markers aren't real objects.
                 */
                if (object->type == OBJT_MARKER)
                        continue;
                if (object->type == OBJT_DEVICE)
                        continue;
                if (object->size >= 0x7FFFFFFF) {
                        /*
                         * Probably unbounded anonymous memory (really
                         * bounded by related vm_map_entry structures which
                         * we do not have access to in this loop).
                         */
                        totalp->t_vm += object->resident_page_count;
                } else {
                        /*
                         * It's questionable how useful this is but...
                         */
                        totalp->t_vm += object->size;
                }
                totalp->t_rm += object->resident_page_count;
                if (object->flags & OBJ_ACTIVE) {
                        totalp->t_avm += object->size;
                        totalp->t_arm += object->resident_page_count;
                }
                if (object->shadow_count > 1) {
                        /* shared object */
                        totalp->t_vmshr += object->size;
                        totalp->t_rmshr += object->resident_page_count;
                        if (object->flags & OBJ_ACTIVE) {
                                totalp->t_avmshr += object->size;
                                totalp->t_armshr += object->resident_page_count;
                        }
                }

                /*
                 * Don't waste time unnecessarily.
                 */
                if (++burst < 25)
                        continue;
                burst = 0;

                /*
                 * Don't hog the vmobj_token if someone else wants it.
                 */
                TAILQ_REMOVE(&vm_object_list, &marker, object_list);
                TAILQ_INSERT_AFTER(&vm_object_list, object,
                                   &marker, object_list);
                object = &marker;
                if (collisions != vmobj_token.t_collisions) {
                        tsleep(&vm_object_list, 0, "breath", 1);
                        collisions = vmobj_token.t_collisions;
                } else {
                        lwkt_yield();
                }
        }
        TAILQ_REMOVE(&vm_object_list, &marker, object_list);
        lwkt_reltoken(&vmobj_token);

        totalp->t_free = vmstats.v_free_count + vmstats.v_cache_count;

        return (sysctl_handle_opaque(oidp, totalp, sizeof total, req));
}
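/*
 * Illustrative sketch, not part of the original do_vmtotal() code: the loop
 * above parks a marker object in vm_object_list every 25 objects so the
 * scan can yield or sleep (possibly losing vmobj_token) and then resume
 * from the marker.  The fragment below shows the same marker trick on a
 * plain <sys/queue.h> TAILQ; the struct and field names (thing, is_marker,
 * link, thing_list) are invented for the example.
 */
#include <sys/queue.h>
#include <stdbool.h>

struct thing {
        bool                    is_marker;
        TAILQ_ENTRY(thing)      link;
};
TAILQ_HEAD(thing_list, thing);

static void
scan_with_marker(struct thing_list *list)
{
        struct thing marker = { .is_marker = true };
        struct thing *t;
        int burst = 0;

        TAILQ_INSERT_HEAD(list, &marker, link);
        for (t = TAILQ_FIRST(list); t != NULL; t = TAILQ_NEXT(t, link)) {
                if (t->is_marker)
                        continue;
                /* ... examine *t while the list is stable ... */
                if (++burst < 25)
                        continue;
                burst = 0;
                /* Move the marker after the current element, resume there */
                TAILQ_REMOVE(list, &marker, link);
                TAILQ_INSERT_AFTER(list, t, &marker, link);
                t = &marker;
                /* a real caller would drop/reacquire its lock or yield here */
        }
        TAILQ_REMOVE(list, &marker, link);
}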
/*
 * MPSAFE thread
 */
static void
vm_pagezero(void *arg)
{
        vm_page_t m = NULL;
        struct lwbuf *lwb = NULL;
        struct lwbuf lwb_cache;
        enum zeroidle_state state = STATE_IDLE;
        char *pg = NULL;
        int npages = 0;
        int sleep_time;
        int i = 0;
        int cpu = (int)(intptr_t)arg;
        int zero_state = 0;

        /*
         * Adjust thread parameters before entering our loop.  The thread
         * is started with the MP lock held and with normal kernel thread
         * priority.
         *
         * Also put us on the last cpu for now.
         *
         * For now leave the MP lock held, the VM routines cannot be called
         * with it released until tokenization is finished.
         */
        lwkt_setpri_self(TDPRI_IDLE_WORK);
        lwkt_setcpu_self(globaldata_find(cpu));
        sleep_time = DEFAULT_SLEEP_TIME;

        /*
         * Loop forever
         */
        for (;;) {
                int zero_count;

                switch (state) {
                case STATE_IDLE:
                        /*
                         * Wait for work.
                         */
                        tsleep(&zero_state, 0, "pgzero", sleep_time);
                        if (vm_page_zero_check(&zero_count, &zero_state))
                                npages = idlezero_rate / 10;
                        sleep_time = vm_page_zero_time(zero_count);
                        if (npages)
                                state = STATE_GET_PAGE; /* Fallthrough */
                        break;
                case STATE_GET_PAGE:
                        /*
                         * Acquire page to zero
                         */
                        if (--npages == 0) {
                                state = STATE_IDLE;
                        } else {
                                m = vm_page_free_fromq_fast();
                                if (m == NULL) {
                                        state = STATE_IDLE;
                                } else {
                                        state = STATE_ZERO_PAGE;
                                        lwb = lwbuf_alloc(m, &lwb_cache);
                                        pg = (char *)lwbuf_kva(lwb);
                                        i = 0;
                                }
                        }
                        break;
                case STATE_ZERO_PAGE:
                        /*
                         * Zero-out the page
                         */
                        while (i < PAGE_SIZE) {
                                if (idlezero_nocache == 1)
                                        bzeront(&pg[i], IDLEZERO_RUN);
                                else
                                        bzero(&pg[i], IDLEZERO_RUN);
                                i += IDLEZERO_RUN;
                                lwkt_yield();
                        }
                        state = STATE_RELEASE_PAGE;
                        break;
                case STATE_RELEASE_PAGE:
                        lwbuf_free(lwb);
                        vm_page_flag_set(m, PG_ZERO);
                        vm_page_free_toq(m);
                        state = STATE_GET_PAGE;
                        ++idlezero_count;       /* non-locked, SMP race ok */
                        break;
                }
                lwkt_yield();
        }
}
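/*
 * Illustrative sketch, not part of the original vm_pagezero() code: the
 * STATE_ZERO_PAGE case above clears a page in IDLEZERO_RUN-sized chunks and
 * calls lwkt_yield() between chunks so the idle-priority thread never hogs
 * the CPU.  A userland rendering of that pattern, with a made-up CHUNK size
 * and sched_yield() standing in for lwkt_yield(), might look like this.
 */
#include <sched.h>
#include <stddef.h>
#include <string.h>

#define CHUNK   256     /* assumed run length, analogous to IDLEZERO_RUN */

static void
zero_in_chunks(char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; i += CHUNK) {
                size_t n = (len - i < CHUNK) ? len - i : CHUNK;

                memset(&buf[i], 0, n);  /* one small burst of work */
                sched_yield();          /* let anything runnable preempt us */
        }
}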