/*
 * Read handler for the NVRAM device.
 *
 * Fetches at most NVRAM_SIZE bytes, starting at uio->uio_offset, one byte
 * at a time with nvram_get_byte() into a stack buffer and hands them to
 * the caller with uiomove().
 *
 * Returns ENXIO when nvram_initialized is not set, EINVAL for a negative
 * offset, 0 for an empty request, and otherwise the result of uiomove().
 */
int
nvramread(dev_t dev, struct uio *uio, int flags)
{
	u_char buf[NVRAM_SIZE];
	off_t pos = uio->uio_offset;
	u_char *tmp;
	size_t count = ulmin(sizeof(buf), uio->uio_resid);
	int ret;

	if (!nvram_initialized)
		return (ENXIO);

	if (uio->uio_offset < 0)
		return (EINVAL);

	if (uio->uio_resid == 0)
		return (0);

#ifdef NVRAM_DEBUG
	printf("attempting to read %zu bytes at offset %lld\n", count,
	    (long long)pos);
#endif

	/* Stop at whichever comes first: request satisfied or end of NVRAM. */
	for (tmp = buf; count-- > 0 && pos < NVRAM_SIZE; ++pos, ++tmp)
		*tmp = nvram_get_byte(pos);

#ifdef NVRAM_DEBUG
	/*
	 * Report the byte count only.  tmp points one past the last byte
	 * stored and buf is never NUL-terminated, so neither may be handed
	 * to a %s conversion.
	 */
	printf("nvramread read %td bytes\n", (tmp - buf));
#endif

	ret = uiomove(buf, (tmp - buf), uio);

	/*
	 * NOTE(review): this adds whatever uiomove() left in uio_resid to
	 * the offset.  Kept as in the original; confirm that callers rely
	 * on it before changing.
	 */
	uio->uio_offset += uio->uio_resid;

	return (ret);
}
static int esp_pci_dma_setup(struct ncr53c9x_softc *sc, void **addr, size_t *len, int datain, size_t *dmasize) { struct esp_pci_softc *esc = (struct esp_pci_softc *)sc; int error; WRITE_DMAREG(esc, DMA_CMD, DMACMD_IDLE | (datain != 0 ? DMACMD_DIR : 0)); *dmasize = esc->sc_dmasize = ulmin(*dmasize, MDL_SEG_SIZE); esc->sc_dmaaddr = addr; esc->sc_dmalen = len; esc->sc_datain = datain; /* * There's no need to set up DMA for a "Transfer Pad" operation. */ if (*dmasize == 0) return (0); /* Set the transfer length. */ WRITE_DMAREG(esc, DMA_STC, *dmasize); /* * Load the transfer buffer and program the DMA address. * Note that the NCR53C9x core can't handle EINPROGRESS so we set * BUS_DMA_NOWAIT. */ error = bus_dmamap_load(esc->sc_xferdmat, esc->sc_xferdmam, *esc->sc_dmaaddr, *dmasize, esp_pci_xfermap, sc, BUS_DMA_NOWAIT); return (error); }
/**
   Feed a run of octets into the SHA-1 state.

   Whole 64-octet blocks are compressed straight from the caller's buffer
   whenever nothing is pending in state->buf; otherwise the input is staged
   in state->buf until a full block has accumulated.

   @param state   The hash state
   @param in      The data to hash
   @param inlen   The length of the data (octets)
   @return CRYPT_OK if successful, CRYPT_INVALID_ARG if state->curlen exceeds
           the staging buffer, CRYPT_HASH_OVERFLOW if the length counter
           would wrap, or the error returned by sha1_compress()
*/
int
sha1_process(sha1_state *state, const unsigned char *in, unsigned long inlen)
{
	unsigned long take;
	int rc;

	if (state->curlen > sizeof(state->buf)) {
		return CRYPT_INVALID_ARG;
	}

	/*
	 * NOTE(review): this wrap check adds inlen in octets, while length
	 * is advanced by 8 * 64 per 64-octet block below -- confirm that the
	 * check is strict enough for the counter's unit.
	 */
	if ((state->length + inlen) < state->length) {
		return CRYPT_HASH_OVERFLOW;
	}

	for (; inlen > 0; in += take, inlen -= take) {
		if (state->curlen == 0 && inlen >= 64) {
			/* Nothing staged: compress directly from the input. */
			rc = sha1_compress(state, in);
			if (rc != CRYPT_OK) {
				return rc;
			}
			state->length += 64 * 8;
			take = 64;
			continue;
		}

		/* Top up the staging buffer with as much as fits. */
		take = ulmin(inlen, (64 - state->curlen));
		memcpy(state->buf + state->curlen, in, (size_t)take);
		state->curlen += take;

		if (state->curlen == 64) {
			rc = sha1_compress(state, state->buf);
			if (rc != CRYPT_OK) {
				return rc;
			}
			state->length += 8 * 64;
			state->curlen = 0;
		}
	}

	return CRYPT_OK;
}
static void ofw_real_bounce_alloc(void *junk) { /* * Check that ofw_real is actually in use before allocating wads * of memory. Do this by checking if our mutex has been set up. */ if (!mtx_initialized(&of_bounce_mtx)) return; /* * Allocate a page of contiguous, wired physical memory that can * fit into a 32-bit address space and accessed from real mode. */ mtx_lock(&of_bounce_mtx); of_bounce_virt = contigmalloc(4 * PAGE_SIZE, M_OFWREAL, 0, 0, ulmin(platform_real_maxaddr(), BUS_SPACE_MAXADDR_32BIT), PAGE_SIZE, 4 * PAGE_SIZE); of_bounce_phys = vtophys(of_bounce_virt); of_bounce_size = 4 * PAGE_SIZE; /* * For virtual-mode OF, direct map this physical address so that * we have a 32-bit virtual address to give OF. */ if (!ofw_real_mode && !hw_direct_map) pmap_kenter(of_bounce_phys, of_bounce_phys); mtx_unlock(&of_bounce_mtx); }
/*
 * Allocate a resource on behalf of a child, restricted to the ranges
 * recorded in hr->hr_rl.
 *
 * If the list holds no entry of the requested type, the request is passed
 * to the parent unchanged.  Otherwise each matching range is tried in list
 * order with [start, end] clipped to that range.  A prefetchable request
 * that finds nothing is retried once as a non-prefetchable one.
 *
 * Returns the allocated resource, or NULL if no range could satisfy it.
 */
struct resource *
pcib_host_res_alloc(struct pcib_host_resources *hr, device_t dev, int type,
    int *rid, u_long start, u_long end, u_long count, u_int flags)
{
	struct resource_list_entry *rle;
	struct resource *res;
	u_long lo, hi;
	int want_prefetch, range_prefetch;

	KASSERT((flags & RF_PREFETCHABLE) == 0 || type == SYS_RES_MEMORY,
	    ("only memory is prefetchable"));

	rle = resource_list_find(&hr->hr_rl, type, 0);
	if (rle == NULL) {
		/* No ranges for this type: let the parent decide. */
		return (bus_generic_alloc_resource(hr->hr_pcib, dev, type,
		    rid, start, end, count, flags));
	}

	for (;;) {
		want_prefetch = (flags & RF_PREFETCHABLE) != 0;

		/* Walk the ranges from the current entry onwards. */
		for (; rle != NULL; rle = STAILQ_NEXT(rle, link)) {
			if (rle->type != type)
				continue;
			range_prefetch = (rle->flags & RLE_PREFETCH) != 0;
			if (want_prefetch != range_prefetch)
				continue;

			/* Intersect the request with this range. */
			lo = ulmax(start, rle->start);
			hi = ulmin(end, rle->end);
			if (lo > hi)
				continue;
			if (lo + count - 1 > hi)
				continue;
			if (lo + count < lo)	/* wrapped */
				continue;

			res = bus_generic_alloc_resource(hr->hr_pcib, dev,
			    type, rid, lo, hi, count, flags);
			if (res == NULL)
				continue;

			if (bootverbose)
				device_printf(hr->hr_pcib,
			    "allocated type %d (%#lx-%#lx) for rid %x of %s\n",
				    type, rman_get_start(res),
				    rman_get_end(res), *rid,
				    pcib_child_name(dev));
			return (res);
		}

		/*
		 * Nothing found.  A prefetchable request gets a second pass
		 * over the list without the prefetch requirement.
		 */
		if ((flags & RF_PREFETCHABLE) == 0)
			return (NULL);
		flags &= ~RF_PREFETCHABLE;
		rle = resource_list_find(&hr->hr_rl, type, 0);
	}
}
/*
 * uiomove(9) variant whose kernel side is a set of physical pages.
 * Pages reachable through the direct mapping are accessed there; all
 * others are mapped temporarily with an sf_buf.
 *
 * ma     - page array; the page used is ma[offset >> PAGE_SHIFT]
 * offset - byte offset into the area described by ma[]
 * n      - upper bound on the number of bytes to move
 * uio    - describes the other side of the copy and its direction
 *
 * Returns 0, or the error reported by copyin()/copyout().
 */
int
uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct sf_buf *sf;
	struct iovec *iov;
	vm_offset_t page_offset;
	vm_paddr_t pa;
	vm_page_t m;
	size_t cnt;
	void *cp;
	int error = 0;
	int save;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove_fromphys: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove_fromphys proc"));

	/* Remember whether the flag was already set so it can be restored. */
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Exhausted iovec: step to the next one. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/* Limit the chunk to n and to the end of the current page. */
		if (cnt > n)
			cnt = n;
		page_offset = offset & PAGE_MASK;
		cnt = ulmin(cnt, PAGE_SIZE - page_offset);

		m = ma[offset >> PAGE_SHIFT];
		pa = VM_PAGE_TO_PHYS(m);
		if (!MIPS_DIRECT_MAPPABLE(pa)) {
			sf = sf_buf_alloc(m, 0);
			cp = (char *)sf_buf_kva(sf) + page_offset;
		} else {
			sf = NULL;
			cp = (char *)MIPS_PHYS_TO_DIRECT(pa) + page_offset;
			/*
			 * flush all mappings to this page, KSEG0 address first
			 * in order to get it overwritten by correct data
			 */
			mips_dcache_wbinv_range((vm_offset_t)cp, cnt);
			pmap_flush_pvcache(m);
		}

		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			maybe_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}

		/* Only the user-space copy can fail; stop without advancing. */
		if (error) {
			if (sf != NULL)
				sf_buf_free(sf);
			break;
		}

		if (sf != NULL)
			sf_buf_free(sf);
		else
			mips_dcache_wbinv_range((vm_offset_t)cp, cnt);

		/* Account for the bytes just moved. */
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		offset += cnt;
		n -= cnt;
	}

	/* Clear the flag only if this function was the one that set it. */
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
/* getvolattrlist takes a user-controlled bufferSize argument via the fgetattrlist syscall. Where it allocates the kernel buffer into which the attr list is serialized, there's the following comment: /* * Allocate a target buffer for attribute results. * Note that since we won't ever copy out more than the caller requested, * we never need to allocate more than they offer. */ ab.allocated = ulmin(bufferSize, fixedsize + varsize); if (ab.allocated > ATTR_MAX_BUFFER) { error = ENOMEM; VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER); goto out; } MALLOC(ab.base, char *, ab.allocated, M_TEMP, M_ZERO | M_WAITOK); The problem is that the code doesn't then correctly handle the case when the user-supplied buffer size is smaller than the requested header size. If we pass ATTR_CMN_RETURNED_ATTRS we'll hit the following code: /* Return attribute set output if requested. */ if (return_valid) { ab.actual.commonattr |= ATTR_CMN_RETURNED_ATTRS; if (pack_invalid) { /* Only report the attributes that are valid */ ab.actual.commonattr &= ab.valid.commonattr; ab.actual.volattr &= ab.valid.volattr; } bcopy(&ab.actual, ab.base + sizeof(uint32_t), sizeof (ab.actual)); }
/*
 * Allocate a resource for a device on (or below) the ISA bus.
 *
 * bus, child - the ISA bus and the requesting device
 * type, rid  - resource type and resource id
 * start, end - requested range; start == 0 and end == ~0UL marks a
 *              "default" request, which is taken from the child's
 *              resource list instead
 * count      - size of the request
 * flags      - passed on to the parent bus
 *
 * Memory and I/O port ranges are offset by the ISA base for their type
 * and clamped to its limit; non-default interrupt requests are routed
 * through isa_route_intr_res().
 *
 * Returns the resource, or NULL on failure.  Panics on an unsupported
 * resource type, on a default IRQ request from a pass-through child, and
 * when the resource list entry is already in use.
 */
struct resource *
isa_alloc_resource(device_t bus, device_t child, int type, int *rid,
    u_long start, u_long end, u_long count, u_int flags)
{
	/*
	 * Consider adding a resource definition. We allow rid 0-1 for
	 * irq and drq, 0-3 for memory and 0-7 for ports which is
	 * sufficient for isapnp.
	 */
	int passthrough = (device_get_parent(child) != bus);
	int isdefault = (start == 0UL && end == ~0UL);
	struct isa_device *idev = DEVTOISA(child);
	struct resource_list *rl = &idev->id_resources;
	struct resource_list_entry *rle;
	u_long base, limit;

	if (!passthrough && !isdefault) {
		rle = resource_list_find(rl, type, *rid);
		if (!rle) {
			if (*rid < 0)
				return (NULL);
			switch (type) {
			case SYS_RES_IRQ:
				if (*rid >= ISA_NIRQ)
					return (NULL);
				break;
			case SYS_RES_DRQ:
				if (*rid >= ISA_NDRQ)
					return (NULL);
				break;
			case SYS_RES_MEMORY:
				if (*rid >= ISA_NMEM)
					return (NULL);
				break;
			case SYS_RES_IOPORT:
				if (*rid >= ISA_NPORT)
					return (NULL);
				break;
			default:
				return (NULL);
			}
			resource_list_add(rl, type, *rid, start, end, count);
		}
	}

	/*
	 * Add the base, change default allocations to be between base and
	 * limit, and reject allocations if a resource type is not enabled.
	 */
	base = limit = 0;
	switch (type) {
	case SYS_RES_MEMORY:
		if (isa_mem_bt == NULL)
			return (NULL);
		base = isa_mem_base;
		limit = base + isa_mem_limit;
		break;
	case SYS_RES_IOPORT:
		if (isa_io_bt == NULL)
			return (NULL);
		base = isa_io_base;
		limit = base + isa_io_limit;
		break;
	case SYS_RES_IRQ:
		if (isdefault && passthrough)
			panic("isa_alloc_resource: cannot pass through default "
			    "irq allocation");
		if (!isdefault) {
			start = end = isa_route_intr_res(bus, start, end);
			if (start == 255)
				return (NULL);
		}
		break;
	default:
		panic("isa_alloc_resource: unsupported resource type %d",
		    type);
	}
	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
		/*
		 * Clamp start + base and end + base to limit.  The test is
		 * made against the room left below the limit (limit - base)
		 * before adding, so that a default request (end == ~0UL)
		 * cannot wrap around to base - 1 when base is non-zero.
		 */
		start = (start > limit - base) ? limit : start + base;
		end = (end > limit - base) ? limit : end + base;
	}

	/*
	 * This inlines a modified resource_list_alloc(); this is needed
	 * because the resources need to have offsets added to them, which
	 * cannot be done beforehand without patching the resource list entries
	 * (which is ugly).
	 */
	if (passthrough) {
		return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type,
		    rid, start, end, count, flags));
	}

	rle = resource_list_find(rl, type, *rid);
	if (rle == NULL)
		return (NULL);		/* no resource of that type/rid */

	if (rle->res != NULL)
		panic("isa_alloc_resource: resource entry is busy");

	if (isdefault) {
		/* Take the range from the list entry instead of the caller. */
		start = rle->start;
		count = ulmax(count, rle->count);
		end = ulmax(rle->end, start + count - 1);
		switch (type) {
		case SYS_RES_MEMORY:
		case SYS_RES_IOPORT:
			start += base;
			end += base;
			if (!INRANGE(start, base, limit) ||
			    !INRANGE(end, base, limit))
				return (NULL);
			break;
		case SYS_RES_IRQ:
			start = end = isa_route_intr_res(bus, start, end);
			if (start == 255)
				return (NULL);
			break;
		}
	}

	rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
	    type, rid, start, end, count, flags);

	/*
	 * Record the new range, with the base taken off again so the list
	 * entry stays relative to the ISA base.
	 */
	if (rle->res != NULL) {
		rle->start = rman_get_start(rle->res) - base;
		rle->end = rman_get_end(rle->res) - base;
		rle->count = count;
	}

	return (rle->res);
}
/* * Vnode op for write */ int spec_write(void *v) { struct vop_write_args *ap = v; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct buf *bp; daddr_t bn, bscale; int bsize; struct partinfo dpart; size_t n; int on, majordev; int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *); int error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("spec_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("spec_write proc"); #endif switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); bsize = BLKDEV_IOSIZE; if ((majordev = major(vp->v_rdev)) < nblkdev && (ioctl = bdevsw[majordev].d_ioctl) != NULL && (*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) { u_int32_t frag = DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock); u_int32_t fsize = DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock); if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 && fsize != 0) bsize = frag * fsize; } bscale = btodb(bsize); do { bn = btodb(uio->uio_offset) & ~(bscale - 1); on = uio->uio_offset % bsize; n = ulmin((bsize - on), uio->uio_resid); error = bread(vp, bn, bsize, &bp); n = ulmin(n, bsize - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove((char *)bp->b_data + on, n, uio); if (n + on == bsize) bawrite(bp); else bdwrite(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); default: panic("spec_write type"); } /* NOTREACHED */ }
/* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { struct iovec *iov; void *p; ssize_t orig_resid; u_long v, vd; u_int c; int error; error = 0; orig_resid = uio->uio_resid; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } v = uio->uio_offset; c = ulmin(iov->iov_len, PAGE_SIZE - (u_int)(v & PAGE_MASK)); switch (dev2unit(dev)) { case CDEV_MINOR_KMEM: /* * Since c is clamped to be less or equal than * PAGE_SIZE, the uiomove() call does not * access past the end of the direct map. */ if (v >= DMAP_MIN_ADDRESS && v < DMAP_MIN_ADDRESS + dmaplimit) { error = uiomove((void *)v, c, uio); break; } if (!kernacc((void *)v, c, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) { error = EFAULT; break; } /* * If the extracted address is not accessible * through the direct map, then we make a * private (uncached) mapping because we can't * depend on the existing kernel mapping * remaining valid until the completion of * uiomove(). * * XXX We cannot provide access to the * physical page 0 mapped into KVA. */ v = pmap_extract(kernel_pmap, v); if (v == 0) { error = EFAULT; break; } /* FALLTHROUGH */ case CDEV_MINOR_MEM: if (v < dmaplimit) { vd = PHYS_TO_DMAP(v); error = uiomove((void *)vd, c, uio); break; } if (v > cpu_getmaxphyaddr()) { error = EFAULT; break; } p = pmap_mapdev(v, PAGE_SIZE); error = uiomove(p, c, uio); pmap_unmapdev((vm_offset_t)p, PAGE_SIZE); break; } } /* * Don't return error if any byte was written. Read and write * can return error only if no i/o was performed. */ if (uio->uio_resid != orig_resid) error = 0; return (error); }
/* * Allocate a device specific dma_tag. */ int bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment, bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr, bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize, int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc, void *lockfuncarg, bus_dma_tag_t *dmat) { bus_dma_tag_t newtag; /* Return a NULL tag on failure */ *dmat = NULL; /* Enforce the usage of BUS_GET_DMA_TAG(). */ if (parent == NULL) panic("%s: parent DMA tag NULL", __func__); newtag = (bus_dma_tag_t)malloc(sizeof(*newtag), M_DEVBUF, M_NOWAIT); if (newtag == NULL) return (ENOMEM); /* * The method table pointer and the cookie need to be taken over from * the parent. */ newtag->dt_cookie = parent->dt_cookie; newtag->dt_mt = parent->dt_mt; newtag->dt_parent = parent; newtag->dt_alignment = alignment; newtag->dt_boundary = boundary; newtag->dt_lowaddr = trunc_page((vm_offset_t)lowaddr) + (PAGE_SIZE - 1); newtag->dt_highaddr = trunc_page((vm_offset_t)highaddr) + (PAGE_SIZE - 1); newtag->dt_filter = filter; newtag->dt_filterarg = filterarg; newtag->dt_maxsize = maxsize; newtag->dt_nsegments = nsegments; newtag->dt_maxsegsz = maxsegsz; newtag->dt_flags = flags; newtag->dt_ref_count = 1; /* Count ourselves */ newtag->dt_map_count = 0; if (lockfunc != NULL) { newtag->dt_lockfunc = lockfunc; newtag->dt_lockfuncarg = lockfuncarg; } else { newtag->dt_lockfunc = dflt_lock; newtag->dt_lockfuncarg = NULL; } newtag->dt_segments = NULL; /* Take into account any restrictions imposed by our parent tag. 
*/ newtag->dt_lowaddr = ulmin(parent->dt_lowaddr, newtag->dt_lowaddr); newtag->dt_highaddr = ulmax(parent->dt_highaddr, newtag->dt_highaddr); if (newtag->dt_boundary == 0) newtag->dt_boundary = parent->dt_boundary; else if (parent->dt_boundary != 0) newtag->dt_boundary = ulmin(parent->dt_boundary, newtag->dt_boundary); atomic_add_int(&parent->dt_ref_count, 1); if (newtag->dt_boundary > 0) newtag->dt_maxsegsz = ulmin(newtag->dt_maxsegsz, newtag->dt_boundary); *dmat = newtag; return (0); }
/*
 * Read/write handler for the memory devices; the minor number selects
 * the behaviour:
 *
 *   0  physical memory, one page at a time through the shared vmmap page
 *   1  kernel virtual memory
 *   2  /dev/null
 *  12  /dev/zero
 *
 * Returns 0 or an errno value (EFAULT for an inaccessible kernel address,
 * ENXIO for any other minor, or whatever rw_enter()/uiomove() reported).
 */
int
mmrw(dev_t dev, struct uio *uio, int flags)
{
	const int unit = minor(dev);
	vaddr_t o, v;
	size_t c;
	struct iovec *iov;
	int error = 0;

	if (unit == 0) {
		/* lock against other uses of shared vmmap */
		error = rw_enter(&physlock, RW_WRITE | RW_INTR);
		if (error)
			return (error);
	}

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			/* Exhausted iovec: step to the next one. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}

		switch (unit) {
		case 0:
			/* Physical memory: map the page at vmmap, copy, unmap. */
			v = uio->uio_offset;
			pmap_enter(pmap_kernel(), (vaddr_t)vmmap,
			    trunc_page(v), uio->uio_rw == UIO_READ ?
			    PROT_READ : PROT_WRITE, PMAP_WIRED);
			pmap_update(pmap_kernel());
			o = uio->uio_offset & PGOFSET;
			c = ulmin(uio->uio_resid, NBPG - o);
			error = uiomove((caddr_t)vmmap + o, c, uio);
			pmap_remove(pmap_kernel(), (vaddr_t)vmmap,
			    (vaddr_t)vmmap + NBPG);
			pmap_update(pmap_kernel());
			break;

		case 1:
			/* Kernel memory: check accessibility, then copy. */
			v = uio->uio_offset;
			c = ulmin(iov->iov_len, MAXPHYS);
			if (!uvm_kernacc((caddr_t)v, c,
			    uio->uio_rw == UIO_READ ? B_READ : B_WRITE))
				return (EFAULT);
			error = uiomove((caddr_t)v, c, uio);
			break;

		case 2:
			/* /dev/null: writes are swallowed, reads move nothing. */
			if (uio->uio_rw == UIO_WRITE)
				uio->uio_resid = 0;
			return (0);

		case 12:
			/* /dev/zero */
			if (uio->uio_rw == UIO_WRITE) {
				/* Discard the whole iovec without copying. */
				c = iov->iov_len;
				iov->iov_base = (char *)iov->iov_base + c;
				iov->iov_len -= c;
				uio->uio_offset += c;
				uio->uio_resid -= c;
				break;
			}
			/* Reads are served from a zeroed page allocated on first use. */
			if (zeropage == NULL) {
				zeropage = malloc(PAGE_SIZE, M_TEMP,
				    M_WAITOK|M_ZERO);
			}
			c = ulmin(iov->iov_len, PAGE_SIZE);
			error = uiomove(zeropage, c, uio);
			break;

		default:
			return (ENXIO);
		}
	}

	if (unit == 0) {
		rw_exit(&physlock);
	}
	return (error);
}
/* * We have an scb which has been processed by the * adaptor, now we look to see how the operation * went. */ void ahd_done(struct ahd_softc *ahd, struct scb *scb) { struct scsi_xfer *xs = scb->xs; int s; /* XXX in ahc there is some bus_dmamap_sync(PREREAD|PREWRITE); */ LIST_REMOVE(scb, pending_links); timeout_del(&xs->stimeout); if (xs->datalen) { int op; if ((xs->flags & SCSI_DATA_IN) != 0) op = BUS_DMASYNC_POSTREAD; else op = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(ahd->parent_dmat, scb->dmamap, 0, scb->dmamap->dm_mapsize, op); bus_dmamap_unload(ahd->parent_dmat, scb->dmamap); } /* Translate the CAM status code to a SCSI error code. */ switch (xs->error) { case CAM_SCSI_STATUS_ERROR: case CAM_REQ_INPROG: case CAM_REQ_CMP: switch (xs->status) { case SCSI_TASKSET_FULL: case SCSI_BUSY: xs->error = XS_BUSY; break; case SCSI_CHECK: case SCSI_TERMINATED: if ((scb->flags & SCB_SENSE) == 0) { /* CHECK on CHECK? */ xs->error = XS_DRIVER_STUFFUP; } else xs->error = XS_NOERROR; break; default: xs->error = XS_NOERROR; break; } break; case CAM_BUSY: case CAM_REQUEUE_REQ: xs->error = XS_BUSY; break; case CAM_CMD_TIMEOUT: xs->error = XS_TIMEOUT; break; case CAM_BDR_SENT: case CAM_SCSI_BUS_RESET: xs->error = XS_RESET; break; case CAM_SEL_TIMEOUT: xs->error = XS_SELTIMEOUT; break; default: xs->error = XS_DRIVER_STUFFUP; break; } if (xs->error != XS_NOERROR) { /* Don't clobber any existing error state */ } else if ((scb->flags & SCB_SENSE) != 0) { /* * We performed autosense retrieval. * * Zero any sense not transferred by the * device. The SCSI spec mandates that any * untransferred data should be assumed to be * zero. Complete the 'bounce' of sense information * through buffers accessible via bus-space by * copying it into the clients csio. 
*/ memset(&xs->sense, 0, sizeof(struct scsi_sense_data)); memcpy(&xs->sense, ahd_get_sense_buf(ahd, scb), sizeof(struct scsi_sense_data)); xs->error = XS_SENSE; } else if ((scb->flags & SCB_PKT_SENSE) != 0) { struct scsi_status_iu_header *siu; u_int32_t len; siu = (struct scsi_status_iu_header *)scb->sense_data; len = SIU_SENSE_LENGTH(siu); memset(&xs->sense, 0, sizeof(xs->sense)); memcpy(&xs->sense, SIU_SENSE_DATA(siu), ulmin(len, sizeof(xs->sense))); xs->error = XS_SENSE; } ahd_lock(ahd, &s); ahd_free_scb(ahd, scb); scsi_done(xs); ahd_unlock(ahd, &s); }
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; int i, sack_rxmit; int sack_bytes_rxmt; struct sackhole *p; #if 0 int maxburst = TCP_MAXBURST; #endif struct rmxp_tao tao; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; bzero(&tao, sizeof(tao)); isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif #ifdef TCP_ECN int needect; #endif INP_LOCK_ASSERT(tp->t_inpcb); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. * * Set the slow-start flight size depending on whether * this is a local network or not. */ int ss = ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = ss_fltsz_local; } else #endif if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. 
*/ if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if (tp->sack_enable && IN_FASTRECOVERY(tp) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; tcpstat.tcps_sack_rexmits++; tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. 
*/ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! 
* We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (tcp_do_rfc1644) tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao); if (len > 0 && tp->t_state == TCPS_SYN_SENT && tao.tao_ccsent == 0) goto just_return; } /* * Be careful not to send data and/or FIN on SYN segments * in cases when no CC option will be sent. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { callout_stop(tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!callout_active(tp->tt_persist)) tcp_setpersist(tp); } } /* * len will be >= 0 after this point. Truncate to the maximum * segment length and ensure that FIN is removed if the length * no longer contains the last data byte. 
*/ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_force) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. * Skip this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. 
*/ long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * callout_active(tp->tt_persist) * is true when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * callout_active(tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. 
*/ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if ((tp->t_flags & TF_NOOPT) == 0) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; /* * If this is the first SYN of connection (not a SYN * ACK), include SACK_PERMIT_HDR option. If this is a * SYN ACK, include SACK_PERMIT_HDR option if peer has * already done so. This is only for active connect, * since the syncache takes care of the passive connect. */ if (tp->sack_enable && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_SACK_PERMIT))) { *((u_int32_t *) (opt + optlen)) = htonl(TCPOPT_SACK_PERMIT_HDR); optlen += 4; } if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. 
*/ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* * Send SACKs if necessary. This should be the last option processed. * Only as many SACKs are sent as are permitted by the maximum options * size. No more than three SACKs are sent. */ if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && tp->rcv_numsacks) { u_int32_t *lp = (u_int32_t *)(opt + optlen); u_int32_t *olp = lp++; int count = 0; /* actual number of SACKs inserted */ int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; tcpstat.tcps_sack_send_blocks++; maxsack = min(maxsack, TCP_MAX_SACK); for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { struct sackblk sack = tp->sackblks[i]; if (sack.start == 0 && sack.end == 0) continue; *lp++ = htonl(sack.start); *lp++ = htonl(sack.end); count++; } *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. */ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (flags & TH_RST) == 0) { switch (flags & (TH_SYN|TH_ACK)) { /* * This is a normal ACK, send CC if we received CC before * from our peer. */ case TH_ACK: if (!(tp->t_flags & TF_RCVD_CC)) break; /*FALLTHROUGH*/ /* * We can only get here in T/TCP's SYN_SENT* state, when * we're a sending a non-SYN segment without waiting for * the ACK of our SYN. A check above assures that we only * do this if our peer understands T/TCP. 
*/ case 0: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is our initial SYN, check whether we have to use * CC or CC.new. */ case TH_SYN: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? TCPOPT_CCNEW : TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is a SYN,ACK; send CC and CC.echo if we received * CC from our peer. */ case (TH_SYN|TH_ACK): if (tp->t_flags & TF_RCVD_CC) { opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CCECHO; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_recv); optlen += 4; } break; } } #ifdef TCP_SIGNATURE #ifdef INET6 if (!isipv6) #endif if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ hdrlen += optlen; #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else
static void rtas_setup(void *junk) { ihandle_t rtasi; cell_t rtas_size = 0, rtas_ptr; char path[31]; int result; rtas = OF_finddevice("/rtas"); if (rtas == -1) { rtas = 0; return; } OF_package_to_path(rtas, path, sizeof(path)); rtasi = OF_open(path); if (rtasi == 0) { rtas = 0; printf("Error initializing RTAS: could not open node\n"); return; } mtx_init(&rtas_mtx, "RTAS", MTX_DEF, 0); /* RTAS must be called with everything turned off in MSR */ rtasmsr = mfmsr(); rtasmsr &= ~(PSL_IR | PSL_DR | PSL_EE | PSL_SE); #ifdef __powerpc64__ rtasmsr &= ~PSL_SF; #endif /* * Allocate rtas_size + one page of contiguous, wired physical memory * that can fit into a 32-bit address space and accessed from real mode. * This is used both to bounce arguments and for RTAS private data. * * It must be 4KB-aligned and not cross a 256 MB boundary. */ OF_getprop(rtas, "rtas-size", &rtas_size, sizeof(rtas_size)); rtas_size = round_page(rtas_size); rtas_bounce_virt = contigmalloc(rtas_size + PAGE_SIZE, M_RTAS, 0, 0, ulmin(platform_real_maxaddr(), BUS_SPACE_MAXADDR_32BIT), 4096, 256*1024*1024); rtas_private_data = vtophys(rtas_bounce_virt); rtas_bounce_virt += rtas_size; /* Actual bounce area */ rtas_bounce_phys = vtophys(rtas_bounce_virt); rtas_bounce_size = PAGE_SIZE; /* * Instantiate RTAS. We always use the 32-bit version. */ result = OF_call_method("instantiate-rtas", rtasi, 1, 1, (cell_t)rtas_private_data, &rtas_ptr); OF_close(rtasi); if (result != 0) { rtas = 0; rtas_ptr = 0; printf("Error initializing RTAS (%d)\n", result); return; } rtas_entry = (uintptr_t)(rtas_ptr); }
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error = 0; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6; #ifdef INET6 const boolean_t isipv6 = INP_ISIPV6(inp); #else const boolean_t isipv6 = FALSE; #endif boolean_t can_tso = FALSE, use_tso; boolean_t report_sack, idle_cwv = FALSE; u_int segsz, tso_hlen, tso_lenmax = 0; int segcnt = 0; boolean_t need_sched = FALSE; KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the * link; need to validate congestion window. However, we should * not perform congestion window validation here, since we could * be asked to send pure ACK. */ if (tp->snd_max == tp->snd_una && (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart) idle_cwv = TRUE; /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); /* * Find out whether TSO could be used or not * * For TSO capable devices, the following assumptions apply to * the processing of TCP flags: * - If FIN is set on the large TCP segment, the device must set * FIN on the last segment that it creates from the large TCP * segment. * - If PUSH is set on the large TCP segment, the device must set * PUSH on the last segment that it creates from the large TCP * segment. */ #if !defined(IPSEC) && !defined(FAST_IPSEC) if (tcp_do_tso #ifdef TCP_SIGNATURE && (tp->t_flags & TF_SIGNATURE) == 0 #endif ) { if (!isipv6) { struct rtentry *rt = inp->inp_route.ro_rt; if (rt != NULL && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_hwassist & CSUM_TSO)) { can_tso = TRUE; tso_lenmax = rt->rt_ifp->if_tsolen; } } } #endif /* !IPSEC && !FAST_IPSEC */ again: m = NULL; ip = NULL; th = NULL; ip6 = NULL; if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!TAILQ_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) report_sack = TRUE; else report_sack = FALSE; /* Make use of SACK information when slow-starting after a RTO. */ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. 
*/ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * A negative length can also occur when we are in the * TCPS_SYN_RECEIVED state due to a simultanious connect where * our SYN has not been acked yet. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). 
*/ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) { tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW); return 0; } } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * A negative len can occur if our FIN has been sent but not * acked, or if we are in a simultanious connect in the * TCPS_SYN_RECEIVED state with our SYN sent but not yet * acked. * * If our window has contracted to 0 in the FIN case * (which can only occur if we have NOT been called to * retransmit as per code a few paragraphs up) then we * want to shift the retransmit timer over to the * persist timer. * * However, if we are in the TCPS_SYN_RECEIVED state * (the SYN case) we will be in a simultanious connect and * the window may be zero degeneratively. In this case we * do not want to shift to the persist timer after the SYN * or the SYN+ACK transmission. */ len = 0; if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. 
hiwat has not significantly exceeded bwnd (inflight) * (bwnd is a maximal value if inflight is disabled). * 3. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 4. hiwat has not hit maximal automatic size; * 5. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * The criteria for shrinking the buffer is based solely on * the inflight code (snd_bwnd). If inflight is disabled, * the buffer will not be shrinked. Note that snd_bwnd already * has a fudge factor. Our test adds a little hysteresis. 
*/ if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) { const int asbinc = tcp_autosndbuf_inc; const int hiwat = so->so_snd.ssb_hiwat; const int lowat = so->so_snd.ssb_lowat; u_long newsize; if ((tp->snd_wnd / 4 * 5) >= hiwat && so->so_snd.ssb_cc >= (hiwat / 8 * 7) && hiwat < tp->snd_bwnd + hiwat / 10 && hiwat + asbinc < tcp_autosndbuf_max && hiwat < (TCP_MAXWIN << tp->snd_scale) && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #if 0 if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #endif } else if ((long)tp->snd_bwnd < (long)(hiwat * 3 / 4 - lowat - asbinc) && hiwat > tp->t_maxseg * 2 + asbinc && hiwat + asbinc >= tcp_autosndbuf_min && tcp_do_autosndbuf == 1) { newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2); ssb_reserve(&so->so_snd, newsize, so, NULL); } } /* * Don't use TSO, if: * - Congestion window needs validation * - There are SACK blocks to report * - RST or SYN flags is set * - URG will be set * * XXX * Checking for SYN|RST looks overkill, just to be safe than sorry */ use_tso = can_tso; if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN))) use_tso = FALSE; if (use_tso) { tcp_seq ugr_nxt = tp->snd_nxt; if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && tp->snd_nxt == tp->snd_max) --ugr_nxt; if (SEQ_GT(tp->snd_up, ugr_nxt)) use_tso = FALSE; } if (use_tso) { /* * Find out segment size and header length for TSO */ error = tcp_tso_getsize(tp, &segsz, &tso_hlen); if (error) use_tso = FALSE; } if (!use_tso) { segsz = tp->t_maxseg; tso_hlen = 0; /* not used */ } /* * Truncate to the maximum segment length if not TSO, and ensure that * FIN is removed if the length no longer contains the last data byte. 
*/ if (len > segsz) { if (!use_tso) { len = segsz; ++segcnt; } else { int nsegs; if (__predict_false(tso_lenmax < segsz)) tso_lenmax = segsz << 1; /* * Truncate TSO transfers to (IP_MAXPACKET - iphlen - * thoff), and make sure that we send equal size * transfers down the stack (rather than big-small- * big-small-...). */ len = min(len, tso_lenmax); nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz; KKASSERT(nsegs > 0); len = nsegs * segsz; if (len <= segsz) { use_tso = FALSE; ++segcnt; } else { segcnt += nsegs; } } sendalot = TRUE; } else { use_tso = FALSE; if (len > 0) ++segcnt; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len >= segsz) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (tp->t_flags & TF_XMITNOW) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). 
If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. * The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * segsz)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if ((flags & TH_FIN) && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. 
The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, otherwise force out * a byte. * * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED * with data pending. This situation can occur during a * simultanious connect. */ if (so->so_snd.ssb_cc > 0 && tp->t_state != TCPS_SYN_RECEIVED && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ tp->t_flags &= ~TF_XMITNOW; return (0); send: if (need_sched && len > 0) { tcp_output_sched(tp); return 0; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. 
*/ if (report_sack) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
/*
 * Read/write entry point for the memory pseudo-devices.  The minor number
 * of `dev' selects the behaviour:
 *
 *	 0	physical memory, accessed through an XKPHYS mapping
 *	 1	kernel virtual memory, after validating the address range
 *	 2	EOF on read, rathole on write
 *	12	source of zeroes on read, rathole on write
 *
 * Any other minor yields ENODEV.  Returns 0 or an errno value; the
 * `flags' argument is not used.
 */
/*ARGSUSED*/
int
mmrw(dev_t dev, struct uio *uio, int flags)
{
	struct iovec *iov;
	boolean_t ok;
	vaddr_t addr;
	size_t len;
	int error = 0;

	while (error == 0 && uio->uio_resid > 0) {
		iov = uio->uio_iov;

		/* The current iovec is used up: step to the next one. */
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}

		switch (minor(dev)) {
		case 0:
			/* minor device 0 is physical memory */
			addr = uio->uio_offset;
			len = iov->iov_len;
			/* Refuse ranges that wrap or end beyond physmem. */
			if (addr + len < addr ||
			    addr + len > ptoa((psize_t)physmem))
				return (EFAULT);
			addr = (vaddr_t)PHYS_TO_XKPHYS(addr, CCA_NONCOHERENT);
			error = uiomove((caddr_t)addr, len, uio);
			break;

		case 1:
			/* minor device 1 is kernel memory */
			addr = uio->uio_offset;
			len = ulmin(iov->iov_len, MAXPHYS);

			if (IS_XKPHYS(addr)) {
				/* Allow access to RAM through XKPHYS... */
				ok = is_memory_range(XKPHYS_TO_PHYS(addr),
				    (psize_t)len, 0);
			} else if (addr >= CKSEG0_BASE &&
			    addr < CKSEG0_BASE + CKSEG_SIZE) {
				/* ...or through CKSEG0... */
				ok = is_memory_range(CKSEG0_TO_PHYS(addr),
				    (psize_t)len, CKSEG_SIZE);
			} else if (addr >= CKSEG1_BASE &&
			    addr < CKSEG1_BASE + CKSEG_SIZE) {
				/* ...or through CKSEG1... */
				ok = is_memory_range(CKSEG1_TO_PHYS(addr),
				    (psize_t)len, CKSEG_SIZE);
			} else {
				/* ...otherwise, check it's within kernel kvm limits. */
				ok = uvm_kernacc((caddr_t)addr, len,
				    uio->uio_rw == UIO_READ ? B_READ : B_WRITE);
			}

			if (!ok)
				return (EFAULT);
			error = uiomove((caddr_t)addr, len, uio);
			break;

		case 2:
			/* minor device 2 is EOF/RATHOLE */
			if (uio->uio_rw == UIO_WRITE)
				uio->uio_resid = 0;
			return (0);

		case 12:
			/*
			 * minor device 12 (/dev/zero) is source of nulls on
			 * read, rathole on write
			 */
			if (uio->uio_rw == UIO_WRITE) {
				/*
				 * Nothing is copied on write; consume the
				 * whole iovec by adjusting the uio by hand.
				 */
				len = iov->iov_len;
				iov->iov_base += len;
				iov->iov_len -= len;
				uio->uio_offset += len;
				uio->uio_resid -= len;
				break;
			}
			/* The shared page of zeroes is allocated on first read. */
			if (zeropage == NULL)
				zeropage = malloc(PAGE_SIZE, M_TEMP,
				    M_WAITOK | M_ZERO);
			len = ulmin(iov->iov_len, PAGE_SIZE);
			error = uiomove(zeropage, len, uio);
			break;

		default:
			return (ENODEV);
		}
	}

	return error;
}
/* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { struct iovec *iov; vm_offset_t eva; vm_offset_t off; vm_offset_t ova; vm_offset_t va; vm_prot_t prot; vm_paddr_t pa; vm_size_t cnt; vm_page_t m; int error; int i; uint32_t colors; cnt = 0; colors = 1; error = 0; ova = 0; GIANT_REQUIRED; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } if (dev2unit(dev) == CDEV_MINOR_MEM) { pa = uio->uio_offset & ~PAGE_MASK; if (!is_physical_memory(pa)) { error = EFAULT; break; } off = uio->uio_offset & PAGE_MASK; cnt = PAGE_SIZE - ((vm_offset_t)iov->iov_base & PAGE_MASK); cnt = ulmin(cnt, PAGE_SIZE - off); cnt = ulmin(cnt, iov->iov_len); m = NULL; for (i = 0; phys_avail[i] != 0; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) { m = PHYS_TO_VM_PAGE(pa); break; } } if (m != NULL) { if (ova == 0) { if (dcache_color_ignore == 0) colors = DCACHE_COLORS; ova = kmem_alloc_wait(kernel_map, PAGE_SIZE * colors); } if (colors != 1 && m->md.color != -1) va = ova + m->md.color * PAGE_SIZE; else va = ova; pmap_qenter(va, &m, 1); error = uiomove((void *)(va + off), cnt, uio); pmap_qremove(va, 1); } else { va = TLB_PHYS_TO_DIRECT(pa); error = uiomove((void *)(va + off), cnt, uio); } break; } else if (dev2unit(dev) == CDEV_MINOR_KMEM) { va = trunc_page(uio->uio_offset); eva = round_page(uio->uio_offset + iov->iov_len); /* * Make sure that all of the pages are currently * resident so we don't create any zero fill pages. */ for (; va < eva; va += PAGE_SIZE) if (pmap_kextract(va) == 0) return (EFAULT); prot = (uio->uio_rw == UIO_READ) ? VM_PROT_READ : VM_PROT_WRITE; va = uio->uio_offset; if (va < VM_MIN_DIRECT_ADDRESS && kernacc((void *)va, iov->iov_len, prot) == FALSE) return (EFAULT); error = uiomove((void *)va, iov->iov_len, uio); break; } /* else panic! 
*/ } if (ova != 0) kmem_free_wakeup(kernel_map, ova, PAGE_SIZE * colors); return (error); }
/*
 * uiomove(9) variant whose kernel-side buffer is described by an array of
 * physical pages.  Each page is reached either through the direct mapping
 * or, when its recorded cache color differs from DCACHE_COLOR() of its
 * physical address, through a temporary sf_buf mapping, which keeps the
 * number of ephemeral mappings created and destroyed low.
 *
 * ma      array of pages backing the buffer
 * offset  byte offset into that page array at which to start
 * n       maximum number of bytes to move
 * uio     description of the other side of the transfer
 *
 * Returns 0, or the error from copyin()/copyout().
 */
int
uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct sf_buf *sf;
	struct iovec *iov;
	vm_offset_t page_offset;
	vm_paddr_t pa;
	vm_page_t m;
	size_t cnt;
	void *cp;
	int error = 0;
	int save;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove_fromphys: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove_fromphys proc"));

	/* Remember whether the flag was already set so it can be restored. */
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Nothing left in this iovec; move on. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/* Clamp to the caller's limit and to the end of the page. */
		if (cnt > n)
			cnt = n;
		page_offset = offset & PAGE_MASK;
		cnt = ulmin(cnt, PAGE_SIZE - page_offset);

		m = ma[offset >> PAGE_SHIFT];
		pa = VM_PAGE_TO_PHYS(m);
		if (m->md.color == DCACHE_COLOR(pa)) {
			/* Colors agree: the direct mapping can be used. */
			sf = NULL;
			cp = (char *)TLB_PHYS_TO_DIRECT(pa) + page_offset;
		} else {
			sf = sf_buf_alloc(m, 0);
			cp = (char *)sf_buf_kva(sf) + page_offset;
		}

		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}

		/* The temporary mapping is released on success and failure. */
		if (sf != NULL)
			sf_buf_free(sf);
		if (error)
			break;

		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		offset += cnt;
		n -= cnt;
	}

	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
/*
 * Common function for DMA map synchronization.  May be called
 * by chipset-specific DMA map synchronization functions.
 *
 * This version works with the virtually-indexed, write-back cache
 * found in the MIPS-3/MIPS-4 CPUs available for the Algorithmics.
 *
 * t       DMA tag (not referenced by this implementation)
 * map     DMA map whose segments are to be synchronized
 * offset  byte offset into the map at which to start
 * len     number of bytes to synchronize
 * ops     BUS_DMASYNC_* operation flags; PRE and POST operations must
 *         not be mixed in one call
 */
void
_bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t offset,
    bus_size_t len, int ops)
{
	bus_size_t minlen;

#ifdef DIAGNOSTIC
	/*
	 * Mixing PRE and POST operations is not allowed.
	 */
	if ((ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) != 0 &&
	    (ops & (BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE)) != 0)
		panic("_bus_dmamap_sync: mix PRE and POST");

	if (offset >= map->dm_mapsize)
		panic("_bus_dmamap_sync: bad offset %"PRIxPADDR
		    " (map size is %"PRIxPSIZE")",
		    offset, map->dm_mapsize);
	if (len == 0 || (offset + len) > map->dm_mapsize)
		panic("_bus_dmamap_sync: bad length");
#endif

	/*
	 * Since we're dealing with a virtually-indexed, write-back
	 * cache, we need to do the following things:
	 *
	 *	PREREAD -- Invalidate D-cache.  Note we might have
	 *	to also write-back here if we have to use an Index
	 *	op, or if the buffer start/end is not cache-line aligned.
	 *
	 *	PREWRITE -- Write-back the D-cache.  If we have to use
	 *	an Index op, we also have to invalidate.  Note that if
	 *	we are doing PREREAD|PREWRITE, we can collapse everything
	 *	into a single op.
	 *
	 *	POSTREAD -- Nothing.
	 *
	 *	POSTWRITE -- Nothing.
	 */
#ifdef _MIPS_NEED_BUS_DMA_BOUNCE
	struct mips_bus_dma_cookie * const cookie = map->_dm_cookie;
	if (cookie != NULL && (cookie->id_flags & _BUS_DMA_IS_BOUNCING) &&
	    (ops & BUS_DMASYNC_PREWRITE)) {
		STAT_INCR(write_bounces);
		/*
		 * Copy the caller's buffer to the bounce buffer.
		 */
		switch (cookie->id_buftype) {
		case _BUS_DMA_BUFTYPE_LINEAR:
			memcpy((char *)cookie->id_bouncebuf + offset,
			    cookie->id_origlinearbuf + offset, len);
			break;
		case _BUS_DMA_BUFTYPE_MBUF:
			m_copydata(cookie->id_origmbuf, offset, len,
			    (char *)cookie->id_bouncebuf + offset);
			break;
		case _BUS_DMA_BUFTYPE_UIO:
			_bus_dma_uiomove((char *)cookie->id_bouncebuf + offset,
			    cookie->id_origuio, len, UIO_WRITE);
			break;
#ifdef DIAGNOSTIC
		case _BUS_DMA_BUFTYPE_RAW:
			panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_RAW");
			break;
		case _BUS_DMA_BUFTYPE_INVALID:
			panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_INVALID");
			break;
		default:
			panic("_bus_dmamap_sync: unknown buffer type %d\n",
			    cookie->id_buftype);
			break;
#endif /* DIAGNOSTIC */
		}
	}
#endif /* _MIPS_NEED_BUS_DMA_BOUNCE */

	/*
	 * Flush the write buffer.
	 * XXX Is this always necessary?
	 */
	wbflush();

	/*
	 * If the mapping is of COHERENT DMA-safe memory or this isn't a
	 * PREREAD or PREWRITE, no cache flush is necessary.  Check to see
	 * if we need to bounce it.
	 */
	if ((map->_dm_flags & _BUS_DMAMAP_COHERENT) ||
	    (ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) == 0)
		goto bounce_it;

	/*
	 * If the mapping belongs to the kernel, or it belongs
	 * to the currently-running process (XXX actually, vmspace),
	 * then we can use Hit ops.  Otherwise, Index ops.
	 *
	 * This should be true the vast majority of the time.
	 */
	const bool useindex = (!VMSPACE_IS_KERNEL_P(map->_dm_vmspace) &&
	    map->_dm_vmspace != curproc->p_vmspace);

	bus_dma_segment_t *seg = map->dm_segs;
	bus_dma_segment_t * const lastseg = seg + map->dm_nsegs;

	/*
	 * Skip segments until the offset falls within a segment.  The walk
	 * is bounded by lastseg so that an offset at or beyond the end of
	 * the map (only caught by the DIAGNOSTIC checks above) cannot run
	 * past the segment array; in that case the loop below does nothing.
	 */
	for (; seg < lastseg && offset >= seg->ds_len; seg++) {
		offset -= seg->ds_len;
	}

	for (; seg < lastseg && len != 0; seg++, offset = 0, len -= minlen) {
		/*
		 * Now at the first segment to sync; nail each segment until we
		 * have exhausted the length.
		 */
		vaddr_t vaddr = seg->_ds_vaddr + offset;
		minlen = ulmin(len, seg->ds_len - offset);

#ifdef BUS_DMA_DEBUG
		printf("bus_dmamap_sync: flushing segment %p "
		    "(0x%"PRIxBUSADDR"+%"PRIxBUSADDR
		    ", 0x%"PRIxBUSADDR"+0x%"PRIxBUSADDR
		    ") (olen = %"PRIxBUSADDR")...", seg,
		    vaddr - offset, offset,
		    vaddr - offset, offset + minlen - 1, len);
#endif

		/*
		 * If we are forced to use Index ops, it's always a
		 * Write-back,Invalidate, so just do one test.
		 */
		if (__predict_false(useindex)) {
			mips_dcache_wbinv_range_index(vaddr, minlen);
#ifdef BUS_DMA_DEBUG
			printf("\n");
#endif
			continue;
		}

		switch (ops) {
		case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
			mips_dcache_wbinv_range(vaddr, minlen);
			break;

		case BUS_DMASYNC_PREREAD:
#if 1
			mips_dcache_wbinv_range(vaddr, minlen);
#else
			mips_dcache_inv_range(vaddr, minlen);
#endif
			break;

		case BUS_DMASYNC_PREWRITE:
			mips_dcache_wb_range(vaddr, minlen);
			break;
		}
#ifdef BUS_DMA_DEBUG
		printf("\n");
#endif
	}

  bounce_it:
#ifdef _MIPS_NEED_BUS_DMA_BOUNCE
	if ((ops & BUS_DMASYNC_POSTREAD) == 0 ||
	    cookie == NULL ||
	    (cookie->id_flags & _BUS_DMA_IS_BOUNCING) == 0)
		return;

	STAT_INCR(read_bounces);
	/*
	 * Copy the bounce buffer to the caller's buffer.
	 */
	switch (cookie->id_buftype) {
	case _BUS_DMA_BUFTYPE_LINEAR:
		memcpy(cookie->id_origlinearbuf + offset,
		    (char *)cookie->id_bouncebuf + offset, len);
		break;
	case _BUS_DMA_BUFTYPE_MBUF:
		m_copyback(cookie->id_origmbuf, offset, len,
		    (char *)cookie->id_bouncebuf + offset);
		break;
	case _BUS_DMA_BUFTYPE_UIO:
		_bus_dma_uiomove((char *)cookie->id_bouncebuf + offset,
		    cookie->id_origuio, len, UIO_READ);
		break;
#ifdef DIAGNOSTIC
	case _BUS_DMA_BUFTYPE_RAW:
		panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_RAW");
		break;
	case _BUS_DMA_BUFTYPE_INVALID:
		panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_INVALID");
		break;
	default:
		panic("_bus_dmamap_sync: unknown buffer type %d\n",
		    cookie->id_buftype);
		break;
#endif
	}
#endif /* _MIPS_NEED_BUS_DMA_BOUNCE */
	;
}
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6 = NULL; #ifdef INET6 const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #else const boolean_t isipv6 = FALSE; #endif KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the link. * So unless we are expecting more acks to come in, slow-start from * scratch to re-determine the send congestion window. */ if (tp->snd_max == tp->snd_una && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { if (tcp_do_rfc3390) { int initial_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd); } else { tp->snd_cwnd = tp->t_maxseg; } tp->snd_wacked = 0; } /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); again: /* Make use of SACK information when slow-starting after a RTO. 
*/ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. 
*/ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) return 0; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. 
send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat && so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) && so->so_snd.ssb_cc < tcp_autosndbuf_max && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { u_long newsize; newsize = ulmin(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); } } /* * Truncate to the maximum segment length and ensure that FIN is * removed if the length no longer contains the last data byte. */ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = TRUE; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. 
We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. 
* The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * tp->t_maxseg)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. 
*/ if (so->so_snd.ssb_cc > 0 && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. 
*/ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. */ if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!LIST_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
int memrw(struct cdev *dev, struct uio *uio, int flags) { struct iovec *iov; struct vm_page m; vm_page_t marr; vm_offset_t off, v; u_int cnt; int error; error = 0; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } v = uio->uio_offset; off = v & PAGE_MASK; cnt = ulmin(iov->iov_len, PAGE_SIZE - (u_int)off); if (cnt == 0) continue; switch(dev2unit(dev)) { case CDEV_MINOR_KMEM: /* If the address is in the DMAP just copy it */ if (VIRT_IN_DMAP(v)) { error = uiomove((void *)v, cnt, uio); break; } if (!kernacc((void *)v, cnt, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) { error = EFAULT; break; } /* Get the physical address to read */ v = pmap_extract(kernel_pmap, v); if (v == 0) { error = EFAULT; break; } /* FALLTHROUGH */ case CDEV_MINOR_MEM: /* If within the DMAP use this to copy from */ if (PHYS_IN_DMAP(v)) { v = PHYS_TO_DMAP(v); error = uiomove((void *)v, cnt, uio); break; } /* Have uiomove_fromphys handle the data */ m.phys_addr = trunc_page(v); marr = &m; uiomove_fromphys(&marr, off, cnt, uio); break; } } return (error); }