Example #1
int
nvramread(dev_t dev, struct uio *uio, int flags)
{
	u_char buf[NVRAM_SIZE];
	off_t pos = uio->uio_offset;
	u_char *tmp;
	size_t count = ulmin(sizeof(buf), uio->uio_resid);
	int ret;

	if (!nvram_initialized)
		return (ENXIO);

	if (uio->uio_offset < 0)
		return (EINVAL);

	if (uio->uio_resid == 0)
		return (0);

#ifdef NVRAM_DEBUG
	printf("attempting to read %zu bytes at offset %lld\n", count, pos);
#endif

	for (tmp = buf; count-- > 0 && pos < NVRAM_SIZE; ++pos, ++tmp)
		*tmp = nvram_get_byte(pos);

#ifdef NVRAM_DEBUG
	printf("nvramread read %td bytes (%s)\n", (tmp - buf), tmp);
#endif

	ret = uiomove(buf, (tmp - buf), uio);

	uio->uio_offset += uio->uio_resid;

	return (ret);
}
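
For reference, since every example on this page revolves around them: ulmin() and ulmax() are the kernel's unsigned long minimum/maximum helpers. In the BSD libkern headers they are essentially the following one-line inlines:

static __inline u_long
ulmin(u_long a, u_long b)
{
	return (a < b ? a : b);
}

static __inline u_long
ulmax(u_long a, u_long b)
{
	return (a > b ? a : b);
}
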
Example #2
static int
esp_pci_dma_setup(struct ncr53c9x_softc *sc, void **addr, size_t *len,
    int datain, size_t *dmasize)
{
	struct esp_pci_softc *esc = (struct esp_pci_softc *)sc;
	int error;

	WRITE_DMAREG(esc, DMA_CMD, DMACMD_IDLE | (datain != 0 ? DMACMD_DIR :
	    0));

	*dmasize = esc->sc_dmasize = ulmin(*dmasize, MDL_SEG_SIZE);
	esc->sc_dmaaddr = addr;
	esc->sc_dmalen = len;
	esc->sc_datain = datain;

	/*
	 * There's no need to set up DMA for a "Transfer Pad" operation.
	 */
	if (*dmasize == 0)
		return (0);

	/* Set the transfer length. */
	WRITE_DMAREG(esc, DMA_STC, *dmasize);

	/*
	 * Load the transfer buffer and program the DMA address.
	 * Note that the NCR53C9x core can't handle EINPROGRESS so we set
	 * BUS_DMA_NOWAIT.
	 */
	error = bus_dmamap_load(esc->sc_xferdmat, esc->sc_xferdmam,
	    *esc->sc_dmaaddr, *dmasize, esp_pci_xfermap, sc, BUS_DMA_NOWAIT);

	return (error);
}
Example #3
/**
   Process a block of memory through the hash
   @param state   The hash state
   @param in     The data to hash
   @param inlen  The length of the data (octets)
   @return CRYPT_OK if successful
*/
int sha1_process(sha1_state* state, const unsigned char* in, unsigned long inlen) {
    unsigned long n;
    int err;
    if (state->curlen > sizeof(state->buf)) {
        return CRYPT_INVALID_ARG;
    }
    if ((state->length + inlen) < state->length) {
        return CRYPT_HASH_OVERFLOW;
    }
    while (inlen > 0) {
        if (state->curlen == 0 && inlen >= 64) {
            if ((err = sha1_compress(state, in)) != CRYPT_OK) {
                return err;
            }
            state->length += 64 * 8;
            in += 64;
            inlen -= 64;
        } else {
            n = ulmin(inlen, (64 - state->curlen));
            memcpy(state->buf + state->curlen, in, (size_t)n);
            state->curlen += n;
            in += n;
            inlen -= n;
            if (state->curlen == 64) {
                if ((err = sha1_compress(state, state->buf)) != CRYPT_OK) {
                    return err;
                }
                state->length += 8 * 64;
                state->curlen = 0;
            }
        }
    }
    return CRYPT_OK;
}
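
A minimal streaming-usage sketch for the interface above. It assumes the usual libtomcrypt-style companions sha1_init() and sha1_done() exist alongside sha1_process() (they are not shown in the example); chunks of any size are fine because the ulmin() buffering stitches partial 64-byte blocks together:

#include <tomcrypt.h>   /* assumed libtomcrypt header */

int
main(void)
{
	sha1_state st;
	unsigned char digest[20];

	sha1_init(&st);            /* assumed companion, not shown above */
	sha1_process(&st, (const unsigned char *)"abc", 3);
	sha1_process(&st, (const unsigned char *)"defgh", 5);
	sha1_done(&st, digest);    /* assumed companion, not shown above */
	return 0;
}
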
Example #4
static void
ofw_real_bounce_alloc(void *junk)
{
	/*
	 * Check that ofw_real is actually in use before allocating wads 
	 * of memory. Do this by checking if our mutex has been set up.
	 */
	if (!mtx_initialized(&of_bounce_mtx))
		return;

	/*
	 * Allocate four pages of contiguous, wired physical memory that
	 * can fit into a 32-bit address space and be accessed from real
	 * mode.
	 */

	mtx_lock(&of_bounce_mtx);

	of_bounce_virt = contigmalloc(4 * PAGE_SIZE, M_OFWREAL, 0, 0,
	    ulmin(platform_real_maxaddr(), BUS_SPACE_MAXADDR_32BIT), PAGE_SIZE,
	    4 * PAGE_SIZE);

	of_bounce_phys = vtophys(of_bounce_virt);
	of_bounce_size = 4 * PAGE_SIZE;

	/*
	 * For virtual-mode OF, direct map this physical address so that
	 * we have a 32-bit virtual address to give OF.
	 */

	if (!ofw_real_mode && !hw_direct_map) 
		pmap_kenter(of_bounce_phys, of_bounce_phys);

	mtx_unlock(&of_bounce_mtx);
}
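
As an aside, here is how the contigmalloc(9) arguments in the call above line up, assuming FreeBSD's signature contigmalloc(size, type, flags, low, high, alignment, boundary); annotation only, not code from the driver:

/*
 *   size      = 4 * PAGE_SIZE -- four pages of bounce space
 *   type      = M_OFWREAL     -- malloc(9) type for accounting
 *   flags     = 0             -- default malloc(9) flags
 *   low       = 0             -- no lower physical-address bound
 *   high      = ulmin(platform_real_maxaddr(), BUS_SPACE_MAXADDR_32BIT)
 *                             -- the stricter ceiling wins: the
 *                                real-mode limit vs. the 32-bit limit
 *   alignment = PAGE_SIZE     -- page-aligned allocation
 *   boundary  = 4 * PAGE_SIZE -- must not cross a 4 * PAGE_SIZE boundary
 */
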
Example #5
struct resource *
pcib_host_res_alloc(struct pcib_host_resources *hr, device_t dev, int type,
    int *rid, u_long start, u_long end, u_long count, u_int flags)
{
	struct resource_list_entry *rle;
	struct resource *r;
	u_long new_start, new_end;

	if (flags & RF_PREFETCHABLE)
		KASSERT(type == SYS_RES_MEMORY,
		    ("only memory is prefetchable"));

	rle = resource_list_find(&hr->hr_rl, type, 0);
	if (rle == NULL) {
		/*
		 * No decoding ranges for this resource type, just pass
		 * the request up to the parent.
		 */
		return (bus_generic_alloc_resource(hr->hr_pcib, dev, type, rid,
		    start, end, count, flags));
	}

restart:
	/* Try to allocate from each decoded range. */
	for (; rle != NULL; rle = STAILQ_NEXT(rle, link)) {
		if (rle->type != type)
			continue;
		if (((flags & RF_PREFETCHABLE) != 0) !=
		    ((rle->flags & RLE_PREFETCH) != 0))
			continue;
		new_start = ulmax(start, rle->start);
		new_end = ulmin(end, rle->end);
		if (new_start > new_end ||
		    new_start + count - 1 > new_end ||
		    new_start + count < new_start)
			continue;
		r = bus_generic_alloc_resource(hr->hr_pcib, dev, type, rid,
		    new_start, new_end, count, flags);
		if (r != NULL) {
			if (bootverbose)
				device_printf(hr->hr_pcib,
			    "allocated type %d (%#lx-%#lx) for rid %x of %s\n",
				    type, rman_get_start(r), rman_get_end(r),
				    *rid, pcib_child_name(dev));
			return (r);
		}
	}

	/*
	 * If we failed to find a prefetch range for a memory
	 * resource, try again without prefetch.
	 */
	if (flags & RF_PREFETCHABLE) {
		flags &= ~RF_PREFETCHABLE;
		rle = resource_list_find(&hr->hr_rl, type, 0);
		goto restart;
	}
	return (NULL);
}
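
The ulmax()/ulmin() pair above is the classic interval-intersection idiom: clamp the requested window into the decoded range, then verify the count still fits without wrapping. A standalone restatement of that test (range_fits is a hypothetical name, not part of the driver):

#include <stdbool.h>

static bool
range_fits(unsigned long start, unsigned long end, unsigned long count,
    unsigned long rstart, unsigned long rend)
{
	unsigned long new_start = start > rstart ? start : rstart; /* ulmax */
	unsigned long new_end = end < rend ? end : rend;           /* ulmin */

	if (new_start > new_end)                /* ranges do not overlap */
		return (false);
	if (new_start + count - 1 > new_end)    /* overlap too small */
		return (false);
	if (new_start + count < new_start)      /* unsigned wraparound */
		return (false);
	return (true);
}
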
Example #6
/*
 * Implement uiomove(9) from physical memory using a combination
 * of the direct mapping and sf_bufs to reduce the creation and
 * destruction of ephemeral mappings.  
 */
int
uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
{
	struct sf_buf *sf;
	struct thread *td = curthread;
	struct iovec *iov;
	void *cp;
	vm_offset_t page_offset;
	vm_paddr_t pa;
	vm_page_t m;
	size_t cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove_fromphys: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove_fromphys proc"));
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;
	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;
		page_offset = offset & PAGE_MASK;
		cnt = ulmin(cnt, PAGE_SIZE - page_offset);
		m = ma[offset >> PAGE_SHIFT];
		pa = VM_PAGE_TO_PHYS(m);
		if (MIPS_DIRECT_MAPPABLE(pa)) {
			sf = NULL;
			cp = (char *)MIPS_PHYS_TO_DIRECT(pa) + page_offset;
			/*
			 * flush all mappings to this page, KSEG0 address first
			 * in order to get it overwritten by correct data
			 */
			mips_dcache_wbinv_range((vm_offset_t)cp, cnt);
			pmap_flush_pvcache(m);
		} else {
			sf = sf_buf_alloc(m, 0);
			cp = (char *)sf_buf_kva(sf) + page_offset;
		}
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			maybe_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error) {
				if (sf != NULL)
					sf_buf_free(sf);
				goto out;
			}
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		if (sf != NULL)
			sf_buf_free(sf);
		else
			mips_dcache_wbinv_range((vm_offset_t)cp, cnt);
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		offset += cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
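
The cnt = ulmin(cnt, PAGE_SIZE - page_offset) clamp is the standard way to keep one copy from crossing a page boundary; Examples #10 and #12 below use the same idiom. A small userland illustration, assuming the usual 4 KiB page constants:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	unsigned long offset = 0x1ff0;  /* 16 bytes short of a boundary */
	unsigned long want = 100;       /* caller asked for 100 bytes */
	unsigned long page_offset = offset & PAGE_MASK;
	unsigned long room = PAGE_SIZE - page_offset;
	unsigned long cnt = want < room ? want : room;  /* the ulmin */

	/* Prints: copy 16 bytes this pass (room left in page: 16) */
	printf("copy %lu bytes this pass (room left in page: %lu)\n",
	    cnt, room);
	return 0;
}
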
Example #7
/*
getvolattrlist takes a user-controlled bufferSize argument via the fgetattrlist syscall.

When allocating a kernel buffer to serialize the attr list into, there's the following comment:

  /*
   * Allocate a target buffer for attribute results.
   * Note that since we won't ever copy out more than the caller requested,
   * we never need to allocate more than they offer.
   */
  ab.allocated = ulmin(bufferSize, fixedsize + varsize);
  if (ab.allocated > ATTR_MAX_BUFFER) {
    error = ENOMEM;
    VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER);
    goto out;
  }
  MALLOC(ab.base, char *, ab.allocated, M_TEMP, M_ZERO | M_WAITOK);

The problem is that the code doesn't then correctly handle the case where the user-supplied buffer size
is smaller than the requested header size. If we pass ATTR_CMN_RETURNED_ATTRS we'll hit the following code:

  /* Return attribute set output if requested. */
  if (return_valid) {
    ab.actual.commonattr |= ATTR_CMN_RETURNED_ATTRS;
    if (pack_invalid) {
      /* Only report the attributes that are valid */
      ab.actual.commonattr &= ab.valid.commonattr;
      ab.actual.volattr &= ab.valid.volattr;
    }
    bcopy(&ab.actual, ab.base + sizeof(uint32_t), sizeof (ab.actual));
  }
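
To make the overflow concrete: ulmin() lets ab.allocated track the attacker-chosen bufferSize, while the later bcopy() unconditionally writes sizeof(ab.actual) bytes at offset sizeof(uint32_t). A userland sketch of the arithmetic (the attribute_set layout and sizes here are illustrative stand-ins, not the exact kernel values):

#include <stdio.h>
#include <stdint.h>

#define ulmin(a, b) ((a) < (b) ? (a) : (b))

struct attribute_set {          /* stand-in for ab.actual */
	uint32_t commonattr, volattr, dirattr, fileattr, forkattr;
};

int
main(void)
{
	size_t bufferSize = 8;               /* attacker-controlled */
	size_t fixedsize = 72, varsize = 32; /* what the kernel packs */
	size_t allocated = ulmin(bufferSize, fixedsize + varsize); /* 8 */
	size_t write_end = sizeof(uint32_t) + sizeof(struct attribute_set);

	if (write_end > allocated)
		printf("bcopy() writes %zu bytes past the %zu-byte buffer\n",
		    write_end - allocated, allocated);
	return 0;
}
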
Example #8
struct resource *
isa_alloc_resource(device_t bus, device_t child, int type, int *rid,
		   u_long start, u_long end, u_long count, u_int flags)
{
	/*
	 * Consider adding a resource definition. We allow rid 0-1 for
	 * irq and drq, 0-3 for memory and 0-7 for ports which is
	 * sufficient for isapnp.
	 */
	int passthrough = (device_get_parent(child) != bus);
	int isdefault = (start == 0UL && end == ~0UL);
	struct isa_device* idev = DEVTOISA(child);
	struct resource_list *rl = &idev->id_resources;
	struct resource_list_entry *rle;
	u_long base, limit;

	if (!passthrough && !isdefault) {
		rle = resource_list_find(rl, type, *rid);
		if (!rle) {
			if (*rid < 0)
				return 0;
			switch (type) {
			case SYS_RES_IRQ:
				if (*rid >= ISA_NIRQ)
					return 0;
				break;
			case SYS_RES_DRQ:
				if (*rid >= ISA_NDRQ)
					return 0;
				break;
			case SYS_RES_MEMORY:
				if (*rid >= ISA_NMEM)
					return 0;
				break;
			case SYS_RES_IOPORT:
				if (*rid >= ISA_NPORT)
					return 0;
				break;
			default:
				return 0;
			}
			resource_list_add(rl, type, *rid, start, end, count);
		}
	}

	/*
	 * Add the base, change default allocations to be between base and
	 * limit, and reject allocations if a resource type is not enabled.
	 */
	base = limit = 0;
	switch(type) {
	case SYS_RES_MEMORY:
		if (isa_mem_bt == NULL)
			return (NULL);
		base = isa_mem_base;
		limit = base + isa_mem_limit;
		break;
	case SYS_RES_IOPORT:
		if (isa_io_bt == NULL)
			return (NULL);
		base = isa_io_base;
		limit = base + isa_io_limit;
		break;
	case SYS_RES_IRQ:
		if (isdefault && passthrough)
			panic("isa_alloc_resource: cannot pass through default "
			    "irq allocation");
		if (!isdefault) {
			start = end = isa_route_intr_res(bus, start, end);
			if (start == 255)
				return (NULL);
		}
		break;
	default:
		panic("isa_alloc_resource: unsupported resource type %d", type);
	}
	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
		start = ulmin(start + base, limit);
		end = ulmin(end + base, limit);
	}
			
	/*
	 * This inlines a modified resource_list_alloc(); this is needed
	 * because the resources need to have offsets added to them, which
	 * cannot be done beforehand without patching the resource list entries
	 * (which is ugly).
	 */
	if (passthrough) {
		return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
		    type, rid, start, end, count, flags));
	}

	rle = resource_list_find(rl, type, *rid);
	if (rle == NULL)
		return (NULL);		/* no resource of that type/rid */

	if (rle->res != NULL)
		panic("isa_alloc_resource: resource entry is busy");

	if (isdefault) {
		start = rle->start;
		count = ulmax(count, rle->count);
		end = ulmax(rle->end, start + count - 1);
		switch (type) {
		case SYS_RES_MEMORY:
		case SYS_RES_IOPORT:
			start += base;
			end += base;
			if (!INRANGE(start, base, limit) ||
			    !INRANGE(end, base, limit))
				return (NULL);
			break;
		case SYS_RES_IRQ:
			start = end = isa_route_intr_res(bus, start, end);
			if (start == 255)
				return (NULL);
			break;
		}
	}

	rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
	    type, rid, start, end, count, flags);

	/*
	 * Record the new range.
	 */
	if (rle->res != NULL) {
		rle->start = rman_get_start(rle->res) - base;
		rle->end = rman_get_end(rle->res) - base;
		rle->count = count;
	}

	return (rle->res);
}
Example #9
/*
 * Vnode op for write
 */
int
spec_write(void *v)
{
	struct vop_write_args *ap = v;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct buf *bp;
	daddr_t bn, bscale;
	int bsize;
	struct partinfo dpart;
	size_t n;
	int on, majordev;
	int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *);
	int error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		VOP_UNLOCK(vp, 0, p);
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if (uio->uio_resid == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);
		bsize = BLKDEV_IOSIZE;
		if ((majordev = major(vp->v_rdev)) < nblkdev &&
		    (ioctl = bdevsw[majordev].d_ioctl) != NULL &&
		    (*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) {
			u_int32_t frag =
			    DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock);
			u_int32_t fsize =
			    DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock);
			if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 &&
			    fsize != 0)
				bsize = frag * fsize;
		}
		bscale = btodb(bsize);
		do {
			bn = btodb(uio->uio_offset) & ~(bscale - 1);
			on = uio->uio_offset % bsize;
			n = ulmin((bsize - on), uio->uio_resid);
			error = bread(vp, bn, bsize, &bp);
			n = ulmin(n, bsize - bp->b_resid);
			if (error) {
				brelse(bp);
				return (error);
			}
			error = uiomove((char *)bp->b_data + on, n, uio);
			if (n + on == bsize)
				bawrite(bp);
			else
				bdwrite(bp);
		} while (error == 0 && uio->uio_resid > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */
}
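
The VBLK path above clamps with ulmin() twice: first to the room left in the current block, then, after bread(), to the bytes the read actually produced (b_resid counts what did not transfer). Restated as a standalone helper (vblk_copy_len is a hypothetical name):

#include <stddef.h>

static size_t
vblk_copy_len(size_t bsize, size_t on, size_t resid, size_t b_resid)
{
	size_t n = bsize - on;        /* room from 'on' to end of block */

	if (n > resid)
		n = resid;            /* ulmin: caller's remaining data */
	if (n > bsize - b_resid)
		n = bsize - b_resid;  /* ulmin: bytes bread() delivered */
	return (n);
}
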
Example #10
/* ARGSUSED */
int
memrw(struct cdev *dev, struct uio *uio, int flags)
{
	struct iovec *iov;
	void *p;
	ssize_t orig_resid;
	u_long v, vd;
	u_int c;
	int error;

	error = 0;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("memrw");
			continue;
		}
		v = uio->uio_offset;
		c = ulmin(iov->iov_len, PAGE_SIZE - (u_int)(v & PAGE_MASK));

		switch (dev2unit(dev)) {
		case CDEV_MINOR_KMEM:
			/*
			 * Since c is clamped to be less or equal than
			 * PAGE_SIZE, the uiomove() call does not
			 * access past the end of the direct map.
			 */
			if (v >= DMAP_MIN_ADDRESS &&
			    v < DMAP_MIN_ADDRESS + dmaplimit) {
				error = uiomove((void *)v, c, uio);
				break;
			}

			if (!kernacc((void *)v, c, uio->uio_rw == UIO_READ ?
			    VM_PROT_READ : VM_PROT_WRITE)) {
				error = EFAULT;
				break;
			}

			/*
			 * If the extracted address is not accessible
			 * through the direct map, then we make a
			 * private (uncached) mapping because we can't
			 * depend on the existing kernel mapping
			 * remaining valid until the completion of
			 * uiomove().
			 *
			 * XXX We cannot provide access to the
			 * physical page 0 mapped into KVA.
			 */
			v = pmap_extract(kernel_pmap, v);
			if (v == 0) {
				error = EFAULT;
				break;
			}
			/* FALLTHROUGH */
		case CDEV_MINOR_MEM:
			if (v < dmaplimit) {
				vd = PHYS_TO_DMAP(v);
				error = uiomove((void *)vd, c, uio);
				break;
			}
			if (v > cpu_getmaxphyaddr()) {
				error = EFAULT;
				break;
			}
			p = pmap_mapdev(v, PAGE_SIZE);
			error = uiomove(p, c, uio);
			pmap_unmapdev((vm_offset_t)p, PAGE_SIZE);
			break;
		}
	}
	/*
	 * Don't return error if any byte was written.  Read and write
	 * can return error only if no i/o was performed.
	 */
	if (uio->uio_resid != orig_resid)
		error = 0;
	return (error);
}
Example #11
/*
 * Allocate a device specific dma_tag.
 */
int
bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
    bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
    bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
    int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
    void *lockfuncarg, bus_dma_tag_t *dmat)
{
	bus_dma_tag_t newtag;

	/* Return a NULL tag on failure */
	*dmat = NULL;

	/* Enforce the usage of BUS_GET_DMA_TAG(). */
	if (parent == NULL)
		panic("%s: parent DMA tag NULL", __func__);

	newtag = (bus_dma_tag_t)malloc(sizeof(*newtag), M_DEVBUF, M_NOWAIT);
	if (newtag == NULL)
		return (ENOMEM);

	/*
	 * The method table pointer and the cookie need to be taken over from
	 * the parent.
	 */
	newtag->dt_cookie = parent->dt_cookie;
	newtag->dt_mt = parent->dt_mt;

	newtag->dt_parent = parent;
	newtag->dt_alignment = alignment;
	newtag->dt_boundary = boundary;
	newtag->dt_lowaddr = trunc_page((vm_offset_t)lowaddr) + (PAGE_SIZE - 1);
	newtag->dt_highaddr = trunc_page((vm_offset_t)highaddr) +
	    (PAGE_SIZE - 1);
	newtag->dt_filter = filter;
	newtag->dt_filterarg = filterarg;
	newtag->dt_maxsize = maxsize;
	newtag->dt_nsegments = nsegments;
	newtag->dt_maxsegsz = maxsegsz;
	newtag->dt_flags = flags;
	newtag->dt_ref_count = 1; /* Count ourselves */
	newtag->dt_map_count = 0;

	if (lockfunc != NULL) {
		newtag->dt_lockfunc = lockfunc;
		newtag->dt_lockfuncarg = lockfuncarg;
	} else {
		newtag->dt_lockfunc = dflt_lock;
		newtag->dt_lockfuncarg = NULL;
	}

	newtag->dt_segments = NULL;

	/* Take into account any restrictions imposed by our parent tag. */
	newtag->dt_lowaddr = ulmin(parent->dt_lowaddr, newtag->dt_lowaddr);
	newtag->dt_highaddr = ulmax(parent->dt_highaddr, newtag->dt_highaddr);
	if (newtag->dt_boundary == 0)
		newtag->dt_boundary = parent->dt_boundary;
	else if (parent->dt_boundary != 0)
		newtag->dt_boundary = ulmin(parent->dt_boundary,
		    newtag->dt_boundary);
	atomic_add_int(&parent->dt_ref_count, 1);

	if (newtag->dt_boundary > 0)
		newtag->dt_maxsegsz = ulmin(newtag->dt_maxsegsz,
		    newtag->dt_boundary);

	*dmat = newtag;
	return (0);
}
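
The parent-tag merging above follows a single rule: a child tag may only tighten its parent's restrictions, never loosen them. lowaddr/highaddr move via ulmin()/ulmax() so the exclusion window can only grow, maxsegsz is capped by the boundary, and a boundary of zero means "inherit". The boundary rule sketched as a standalone helper (merge_boundary is a hypothetical name):

static unsigned long
merge_boundary(unsigned long parent, unsigned long child)
{
	if (child == 0)
		return (parent);    /* child imposes nothing: inherit */
	if (parent == 0)
		return (child);
	/* Both restrict: the tighter (smaller) boundary wins -- ulmin. */
	return (parent < child ? parent : child);
}
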
Example #12
File: mem.c Project: bluhm/sys
int
mmrw(dev_t dev, struct uio *uio, int flags)
{
	vaddr_t o, v;
	size_t c;
	struct iovec *iov;
	int error = 0;

	if (minor(dev) == 0) {
		/* lock against other uses of shared vmmap */
		error = rw_enter(&physlock, RW_WRITE | RW_INTR);
		if (error)
			return (error);
	}
	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor(dev)) {

		/* minor device 0 is physical memory */
		case 0:
			v = uio->uio_offset;
			pmap_enter(pmap_kernel(), (vaddr_t)vmmap,
			    trunc_page(v), uio->uio_rw == UIO_READ ?
			    PROT_READ : PROT_WRITE, PMAP_WIRED);
			pmap_update(pmap_kernel());
			o = uio->uio_offset & PGOFSET;
			c = ulmin(uio->uio_resid, NBPG - o);
			error = uiomove((caddr_t)vmmap + o, c, uio);
			pmap_remove(pmap_kernel(), (vaddr_t)vmmap,
			    (vaddr_t)vmmap + NBPG);
			pmap_update(pmap_kernel());
			continue;

		/* minor device 1 is kernel memory */
		case 1:
			v = uio->uio_offset;
			c = ulmin(iov->iov_len, MAXPHYS);
			if (!uvm_kernacc((caddr_t)v, c,
			    uio->uio_rw == UIO_READ ? B_READ : B_WRITE))
				return (EFAULT);
			error = uiomove((caddr_t)v, c, uio);
			continue;

		/* minor device 2 is /dev/null */
		case 2:
			if (uio->uio_rw == UIO_WRITE)
				uio->uio_resid = 0;
			return (0);

		/* minor device 12 is /dev/zero */
		case 12:
			if (uio->uio_rw == UIO_WRITE) {
				c = iov->iov_len;
				break;
			}
			if (zeropage == NULL) {
				zeropage = malloc(PAGE_SIZE, M_TEMP,
				    M_WAITOK|M_ZERO);
			}
			c = ulmin(iov->iov_len, PAGE_SIZE);
			error = uiomove(zeropage, c, uio);
			continue;

		default:
			return (ENXIO);
		}
		iov->iov_base = (char *)iov->iov_base + c;
		iov->iov_len -= c;
		uio->uio_offset += c;
		uio->uio_resid -= c;
	}
	if (minor(dev) == 0) {
		rw_exit(&physlock);
	}
	return (error);
}
Example #13
/*
 * We have an scb which has been processed by the
 * adaptor, now we look to see how the operation
 * went.
 */
void
ahd_done(struct ahd_softc *ahd, struct scb *scb)
{
    struct scsi_xfer *xs = scb->xs;
    int s;

    /* XXX in ahc there is some bus_dmamap_sync(PREREAD|PREWRITE); */

    LIST_REMOVE(scb, pending_links);

    timeout_del(&xs->stimeout);

    if (xs->datalen) {
        int op;

        if ((xs->flags & SCSI_DATA_IN) != 0)
            op = BUS_DMASYNC_POSTREAD;
        else
            op = BUS_DMASYNC_POSTWRITE;
        bus_dmamap_sync(ahd->parent_dmat, scb->dmamap, 0,
                        scb->dmamap->dm_mapsize, op);
        bus_dmamap_unload(ahd->parent_dmat, scb->dmamap);
    }

    /* Translate the CAM status code to a SCSI error code. */
    switch (xs->error) {
    case CAM_SCSI_STATUS_ERROR:
    case CAM_REQ_INPROG:
    case CAM_REQ_CMP:
        switch (xs->status) {
        case SCSI_TASKSET_FULL:
        case SCSI_BUSY:
            xs->error = XS_BUSY;
            break;
        case SCSI_CHECK:
        case SCSI_TERMINATED:
            if ((scb->flags & SCB_SENSE) == 0) {
                /* CHECK on CHECK? */
                xs->error = XS_DRIVER_STUFFUP;
            } else
                xs->error = XS_NOERROR;
            break;
        default:
            xs->error = XS_NOERROR;
            break;
        }
        break;
    case CAM_BUSY:
    case CAM_REQUEUE_REQ:
        xs->error = XS_BUSY;
        break;
    case CAM_CMD_TIMEOUT:
        xs->error = XS_TIMEOUT;
        break;
    case CAM_BDR_SENT:
    case CAM_SCSI_BUS_RESET:
        xs->error = XS_RESET;
        break;
    case CAM_SEL_TIMEOUT:
        xs->error = XS_SELTIMEOUT;
        break;
    default:
        xs->error = XS_DRIVER_STUFFUP;
        break;
    }

    if (xs->error != XS_NOERROR) {
        /* Don't clobber any existing error state */
    } else if ((scb->flags & SCB_SENSE) != 0) {
        /*
         * We performed autosense retrieval.
         *
         * Zero any sense not transferred by the
         * device.  The SCSI spec mandates that any
         * untransferred data should be assumed to be
         * zero.  Complete the 'bounce' of sense information
         * through buffers accessible via bus-space by
         * copying it into the clients csio.
         */
        memset(&xs->sense, 0, sizeof(struct scsi_sense_data));
        memcpy(&xs->sense, ahd_get_sense_buf(ahd, scb),
               sizeof(struct scsi_sense_data));
        xs->error = XS_SENSE;
    } else if ((scb->flags & SCB_PKT_SENSE) != 0) {
        struct scsi_status_iu_header *siu;
        u_int32_t len;

        siu = (struct scsi_status_iu_header *)scb->sense_data;
        len = SIU_SENSE_LENGTH(siu);
        memset(&xs->sense, 0, sizeof(xs->sense));
        memcpy(&xs->sense, SIU_SENSE_DATA(siu),
               ulmin(len, sizeof(xs->sense)));
        xs->error = XS_SENSE;
    }

    ahd_lock(ahd, &s);
    ahd_free_scb(ahd, scb);
    scsi_done(xs);
    ahd_unlock(ahd, &s);
}
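
The ulmin(len, sizeof(xs->sense)) clamp in the SCB_PKT_SENSE branch is the defensive-copy idiom: never trust a device-reported length over the destination's size. In miniature (copy_sense is a hypothetical name):

#include <string.h>

static void
copy_sense(void *dst, size_t dstsz, const void *src, size_t len)
{
	/* SCSI spec: any untransferred sense bytes are assumed zero. */
	memset(dst, 0, dstsz);
	/* Never copy more than the destination holds -- the ulmin. */
	memcpy(dst, src, len < dstsz ? len : dstsz);
}
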
Example #14
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	long len, recwin, sendwin;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot;
	int i, sack_rxmit;
	int sack_bytes_rxmt;
	struct sackhole *p;
#if 0
	int maxburst = TCP_MAXBURST;
#endif
	struct rmxp_tao tao;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;

	bzero(&tao, sizeof(tao));
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
#ifdef TCP_ECN
	int needect;
#endif

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		/*
		 * We have been idle for "a while" and no acks are
		 * expected to clock out any data we send --
		 * slow start to get ack "clock" running again.
		 *
		 * Set the slow-start flight size depending on whether
		 * this is a local network or not.
		 */
		int ss = ss_fltsz;
#ifdef INET6
		if (isipv6) {
			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
				ss = ss_fltsz_local;
		} else
#endif
		if (in_localaddr(tp->t_inpcb->inp_faddr))
			ss = ss_fltsz_local;
		tp->snd_cwnd = tp->t_maxseg * ss;
	}
	tp->t_flags &= ~TF_LASTIDLE;
	if (idle) {
		if (tp->t_flags & TF_MORETOCOME) {
			tp->t_flags |= TF_LASTIDLE;
			idle = 0;
		}
	}
again:
	/*
	 * If we've recently taken a timeout, snd_max will be greater than
	 * snd_nxt.  There may be SACK information that allows us to avoid
	 * resending already delivered data.  Adjust snd_nxt accordingly.
	 */
	if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
		tcp_sack_adjust(tp);
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Send any SACK-generated retransmissions.  If we're explicitly trying
	 * to send out new data (when sendalot is 1), bypass this function.
	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
	 * we're replacing a (future) new transmission with a retransmission
	 * now, and we previously incremented snd_cwnd in tcp_input().
	 */
	/*
	 * Still in sack recovery, reset rxmit flag to zero.
	 */
	sack_rxmit = 0;
	sack_bytes_rxmt = 0;
	len = 0;
	p = NULL;
	if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
		long cwin;
		
		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
		if (cwin < 0)
			cwin = 0;
		/* Do not retransmit SACK segments beyond snd_recover */
		if (SEQ_GT(p->end, tp->snd_recover)) {
			/*
			 * (At least) part of sack hole extends beyond
			 * snd_recover. Check to see if we can rexmit data
			 * for this hole.
			 */
			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
				/*
				 * Can't rexmit any more data for this hole.
				 * That data will be rexmitted in the next
				 * sack recovery episode, when snd_recover
				 * moves past p->rxmit.
				 */
				p = NULL;
				goto after_sack_rexmit;
			} else
				/* Can rexmit part of the current hole */
				len = ((long)ulmin(cwin,
						   tp->snd_recover - p->rxmit));
		} else
			len = ((long)ulmin(cwin, p->end - p->rxmit));
		off = p->rxmit - tp->snd_una;
		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
		    __func__, off));
		if (len > 0) {
			sack_rxmit = 1;
			sendalot = 1;
			tcpstat.tcps_sack_rexmits++;
			tcpstat.tcps_sack_rexmit_bytes +=
			    min(len, tp->t_maxseg);
		}
	}
after_sack_rexmit:
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	SOCKBUF_LOCK(&so->so_snd);
	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_force) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			callout_stop(tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 *
	 * If sack_rxmit is true we are retransmitting from the scoreboard
	 * in which case len is already set.
	 */
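	/*
	 * Note on the ulmin() calls below: they compare unsigned longs,
	 * so their result is never negative by itself; it is the cast to
	 * (long) and the subtraction of 'off' that can drive len negative,
	 * which the (len < 0) branch further down then repairs.
	 */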
	if (sack_rxmit == 0) {
		if (sack_bytes_rxmt == 0)
			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
		else {
			long cwin;

                        /*
			 * We are inside of a SACK recovery episode and are
			 * sending new data, having retransmitted all the
			 * data possible in the scoreboard.
			 */
			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
			       - off);
			/*
			 * Don't remove this (len > 0) check!
			 * We explicitly check for len > 0 here (although it 
			 * isn't really necessary), to work around a gcc 
			 * optimization issue - to force gcc to compute
			 * len above. Without this check, the computation
			 * of len is bungled by the optimizer.
			 */
			if (len > 0) {
				cwin = tp->snd_cwnd - 
					(tp->snd_nxt - tp->sack_newdata) -
					sack_bytes_rxmt;
				if (cwin < 0)
					cwin = 0;
				len = lmin(len, cwin);
			}
		}
	}

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data and if we don't
	 * know that foreign host supports TAO, suppress sending segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (tcp_do_rfc1644)
			tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
		     tao.tao_ccsent == 0)
			goto just_return;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments
	 * in cases when no CC option will be sent.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if ((flags & TH_SYN) &&
	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			callout_stop(tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!callout_active(tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	/*
	 * len will be >= 0 after this point.  Truncate to the maximum
	 * segment length and ensure that FIN is removed if the length
	 * no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = 1;
	}
	if (sack_rxmit) {
		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	} else {
		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	}

	recwin = sbspace(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.sb_cc &&
		    (tp->t_flags & TF_NOPUSH) == 0) {
			goto send;
		}
		if (tp->t_force)			/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (sack_rxmit)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 * Skip this if the connection is in T/TCP half-open state.
	 */
	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);

		if (adv >= (long) (2 * tp->t_maxseg))
			goto send;
		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
		goto send;
	/*
	 * In SACK, it is possible for tcp_output to fail to send a segment
	 * after the retransmission timer has been turned off.  Make sure
	 * that the retransmission timer is set.
	 */
	if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
	    !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
			      tcp_timer_rexmt, tp);
		goto just_return;
	} 
	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to insure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * callout_active(tp->tt_persist)
	 *	is true when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * callout_active(tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
just_return:
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);

send:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
#ifdef INET6
	if (isipv6)
		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	else
#endif
	hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

			/*
			 * If this is the first SYN of connection (not a SYN
			 * ACK), include SACK_PERMIT_HDR option.  If this is a
			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
			 * already done so. This is only for active connect,
			 * since the syncache takes care of the passive connect.
			 */
			if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_SACK_PERMIT))) {
				*((u_int32_t *) (opt + optlen)) =
					htonl(TCPOPT_SACK_PERMIT_HDR);
				optlen += 4;
			}
			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	     (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send SACKs if necessary.  This should be the last option processed.
	 * Only as many SACKs are sent as are permitted by the maximum options
	 * size.  No more than three SACKs are sent.
	 */
	if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
	    tp->rcv_numsacks) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);
		u_int32_t *olp = lp++;
		int count = 0;  /* actual number of SACKs inserted */
		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;

		tcpstat.tcps_sack_send_blocks++;
		maxsack = min(maxsack, TCP_MAX_SACK);
		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
			struct sackblk sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			*lp++ = htonl(sack.start);
			*lp++ = htonl(sack.end);
			count++;
		}
		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
	}
	/*
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
	 * options are allowed (!TF_NOOPT) and it's not a RST.
	 */
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	     (flags & TH_RST) == 0) {
		switch (flags & (TH_SYN|TH_ACK)) {
		/*
		 * This is a normal ACK, send CC if we received CC before
		 * from our peer.
		 */
		case TH_ACK:
			if (!(tp->t_flags & TF_RCVD_CC))
				break;
			/*FALLTHROUGH*/

		/*
		 * We can only get here in T/TCP's SYN_SENT* state, when
	 * we're sending a non-SYN segment without waiting for
		 * the ACK of our SYN.  A check above assures that we only
		 * do this if our peer understands T/TCP.
		 */
		case 0:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);

			optlen += 4;
			break;

		/*
		 * This is our initial SYN, check whether we have to use
		 * CC or CC.new.
		 */
		case TH_SYN:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
						TCPOPT_CCNEW : TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
			optlen += 4;
			break;

		/*
		 * This is a SYN,ACK; send CC and CC.echo if we received
		 * CC from our peer.
		 */
		case (TH_SYN|TH_ACK):
			if (tp->t_flags & TF_RCVD_CC) {
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CC;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_send);
				optlen += 4;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CCECHO;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_recv);
				optlen += 4;
			}
			break;
		}
	}

#ifdef TCP_SIGNATURE
#ifdef INET6
	if (!isipv6)
#endif
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;

		/* Initialize TCP-MD5 option (RFC2385) */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;

		/* Terminate options list and maintain 32-bit alignment. */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */

	hdrlen += optlen;

#ifdef INET6
	if (isipv6)
		ipoptlen = ip6_optlen(tp->t_inpcb);
	else
#endif
	if (tp->t_inpcb->inp_options)
		ipoptlen = tp->t_inpcb->inp_options->m_len -
				offsetof(struct ipoption, ipopt_list);
	else
Example #15
static void
rtas_setup(void *junk)
{
	ihandle_t rtasi;
	cell_t rtas_size = 0, rtas_ptr;
	char path[31];
	int result;

	rtas = OF_finddevice("/rtas");
	if (rtas == -1) {
		rtas = 0;
		return;
	}
	OF_package_to_path(rtas, path, sizeof(path));
	rtasi = OF_open(path);
	if (rtasi == 0) {
		rtas = 0;
		printf("Error initializing RTAS: could not open node\n");
		return;
	}

	mtx_init(&rtas_mtx, "RTAS", MTX_DEF, 0);

	/* RTAS must be called with everything turned off in MSR */
	rtasmsr = mfmsr();
	rtasmsr &= ~(PSL_IR | PSL_DR | PSL_EE | PSL_SE);
	#ifdef __powerpc64__
	rtasmsr &= ~PSL_SF;
	#endif

	/*
	 * Allocate rtas_size + one page of contiguous, wired physical memory
	 * that can fit into a 32-bit address space and be accessed from real mode.
	 * This is used both to bounce arguments and for RTAS private data.
	 *
	 * It must be 4KB-aligned and not cross a 256 MB boundary.
	 */

	OF_getprop(rtas, "rtas-size", &rtas_size, sizeof(rtas_size));
	rtas_size = round_page(rtas_size);
	rtas_bounce_virt = contigmalloc(rtas_size + PAGE_SIZE, M_RTAS, 0, 0,
	    ulmin(platform_real_maxaddr(), BUS_SPACE_MAXADDR_32BIT),
	    4096, 256*1024*1024);

	rtas_private_data = vtophys(rtas_bounce_virt);
	rtas_bounce_virt += rtas_size;	/* Actual bounce area */
	rtas_bounce_phys = vtophys(rtas_bounce_virt);
	rtas_bounce_size = PAGE_SIZE;

	/*
	 * Instantiate RTAS. We always use the 32-bit version.
	 */

	result = OF_call_method("instantiate-rtas", rtasi, 1, 1,
	    (cell_t)rtas_private_data, &rtas_ptr);
	OF_close(rtasi);

	if (result != 0) {
		rtas = 0;
		rtas_ptr = 0;
		printf("Error initializing RTAS (%d)\n", result);
		return;
	}

	rtas_entry = (uintptr_t)(rtas_ptr);
}
Example #16
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error = 0;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6;
#ifdef INET6
	const boolean_t isipv6 = INP_ISIPV6(inp);
#else
	const boolean_t isipv6 = FALSE;
#endif
	boolean_t can_tso = FALSE, use_tso;
	boolean_t report_sack, idle_cwv = FALSE;
	u_int segsz, tso_hlen, tso_lenmax = 0;
	int segcnt = 0;
	boolean_t need_sched = FALSE;

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the
	 * link; need to validate congestion window.  However, we should
	 * not perform congestion window validation here, since we could
	 * be asked to send pure ACK.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart)
		idle_cwv = TRUE;

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

	/*
	 * Find out whether TSO could be used or not
	 *
	 * For TSO capable devices, the following assumptions apply to
	 * the processing of TCP flags:
	 * - If FIN is set on the large TCP segment, the device must set
	 *   FIN on the last segment that it creates from the large TCP
	 *   segment.
	 * - If PUSH is set on the large TCP segment, the device must set
	 *   PUSH on the last segment that it creates from the large TCP
	 *   segment.
	 */
#if !defined(IPSEC) && !defined(FAST_IPSEC)
	if (tcp_do_tso
#ifdef TCP_SIGNATURE
	    && (tp->t_flags & TF_SIGNATURE) == 0
#endif
	) {
		if (!isipv6) {
			struct rtentry *rt = inp->inp_route.ro_rt;

			if (rt != NULL && (rt->rt_flags & RTF_UP) &&
			    (rt->rt_ifp->if_hwassist & CSUM_TSO)) {
				can_tso = TRUE;
				tso_lenmax = rt->rt_ifp->if_tsolen;
			}
		}
	}
#endif	/* !IPSEC && !FAST_IPSEC */

again:
	m = NULL;
	ip = NULL;
	th = NULL;
	ip6 = NULL;

	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!TAILQ_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		report_sack = TRUE;
	else
		report_sack = FALSE;

	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * A negative length can also occur when we are in the
	 * TCPS_SYN_RECEIVED state due to a simultaneous connect where
	 * our SYN has not been acked yet.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
			tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
			return 0;
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * A negative len can occur if our FIN has been sent but not
	 * acked, or if we are in a simultaneous connect in the
		 * TCPS_SYN_RECEIVED state with our SYN sent but not yet
		 * acked.
		 *
		 * If our window has contracted to 0 in the FIN case
		 * (which can only occur if we have NOT been called to
		 * retransmit as per code a few paragraphs up) then we
		 * want to shift the retransmit timer over to the
		 * persist timer.
		 *
		 * However, if we are in the TCPS_SYN_RECEIVED state
	 * (the SYN case) we will be in a simultaneous connect and
		 * the window may be zero degeneratively.  In this case we
		 * do not want to shift to the persist timer after the SYN
		 * or the SYN+ACK transmission.
		 */
		len = 0;
		if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. hiwat has not significantly exceeded bwnd (inflight)
	 *     (bwnd is a maximal value if inflight is disabled).
	 *  3. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  4. hiwat has not hit maximal automatic size;
	 *  5. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However, testing has shown this not
	 * to be much of a problem.  At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * The criteria for shrinking the buffer is based solely on
	 * the inflight code (snd_bwnd).  If inflight is disabled,
	 * the buffer will not be shrunk.  Note that snd_bwnd already
	 * has a fudge factor.  Our test adds a little hysteresis.
	 */
	if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) {
		const int asbinc = tcp_autosndbuf_inc;
		const int hiwat = so->so_snd.ssb_hiwat;
		const int lowat = so->so_snd.ssb_lowat;
		u_long newsize;

		if ((tp->snd_wnd / 4 * 5) >= hiwat &&
		    so->so_snd.ssb_cc >= (hiwat / 8 * 7) &&
		    hiwat < tp->snd_bwnd + hiwat / 10 &&
		    hiwat + asbinc < tcp_autosndbuf_max &&
		    hiwat < (TCP_MAXWIN << tp->snd_scale) &&
		    sendwin >= (so->so_snd.ssb_cc -
				(tp->snd_nxt - tp->snd_una))) {
			newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#if 0
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#endif
		} else if ((long)tp->snd_bwnd <
			   (long)(hiwat * 3 / 4 - lowat - asbinc) &&
			   hiwat > tp->t_maxseg * 2 + asbinc &&
			   hiwat + asbinc >= tcp_autosndbuf_min &&
			   tcp_do_autosndbuf == 1) {
			newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2);
			ssb_reserve(&so->so_snd, newsize, so, NULL);
		}
	}

	/*
	 * Don't use TSO, if:
	 * - Congestion window needs validation
	 * - There are SACK blocks to report
	 * - RST or SYN flags is set
	 * - URG will be set
	 *
	 * XXX
	 * Checking for SYN|RST looks overkill, just to be safe than sorry
	 */
	use_tso = can_tso;
	if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN)))
		use_tso = FALSE;
	if (use_tso) {
		tcp_seq ugr_nxt = tp->snd_nxt;

		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
		    tp->snd_nxt == tp->snd_max)
			--ugr_nxt;

		if (SEQ_GT(tp->snd_up, ugr_nxt))
			use_tso = FALSE;
	}

	if (use_tso) {
		/*
		 * Find out segment size and header length for TSO
		 */
		error = tcp_tso_getsize(tp, &segsz, &tso_hlen);
		if (error)
			use_tso = FALSE;
	}
	if (!use_tso) {
		segsz = tp->t_maxseg;
		tso_hlen = 0; /* not used */
	}

	/*
	 * Truncate to the maximum segment length if not TSO, and ensure that
	 * FIN is removed if the length no longer contains the last data byte.
	 */
	if (len > segsz) {
		if (!use_tso) {
			len = segsz;
			++segcnt;
		} else {
			int nsegs;

			if (__predict_false(tso_lenmax < segsz))
				tso_lenmax = segsz << 1;

			/*
			 * Truncate TSO transfers to (IP_MAXPACKET - iphlen -
			 * thoff), and make sure that we send equal size
			 * transfers down the stack (rather than big-small-
			 * big-small-...).
			 */
			len = min(len, tso_lenmax);
			nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz;
			KKASSERT(nsegs > 0);

			len = nsegs * segsz;

			if (len <= segsz) {
				use_tso = FALSE;
				++segcnt;
			} else {
				segcnt += nsegs;
			}
		}
		sendalot = TRUE;
	} else {
		use_tso = FALSE;
		if (len > 0)
			++segcnt;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len >= segsz)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (tp->t_flags & TF_XMITNOW)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
	 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * segsz)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if ((flags & TH_FIN) &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 *
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can, otherwise force out
	 * a byte.
	 *
	 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED
	 * with data pending.  This situation can occur during a
	 * simultaneous connect.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    tp->t_state != TCPS_SYN_RECEIVED &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	tp->t_flags &= ~TF_XMITNOW;
	return (0);

send:
	if (need_sched && len > 0) {
		tcp_output_sched(tp);
		return 0;
	}

	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if (report_sack)
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
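
The TSO branch earlier in this example caps a burst at tso_lenmax and then rounds it down to a whole number of MSS-size segments, so the stack emits equal-size transfers instead of a big-small-big-small pattern. Below is a minimal userspace sketch of just that arithmetic; SEGSZ and TSO_LENMAX are illustrative stand-ins, not the kernel's live values.

#include <stdio.h>

#define IP_MAXPACKET	65535
#define SEGSZ		1448		/* assumed MSS */
#define TSO_LENMAX	(SEGSZ * 22)	/* assumed per-burst cap */

/* Trim a send length to equal-size TSO segments, as the example does. */
static long
tso_trim(long len, int tso_hlen)
{
	long cap, nsegs;

	if (len > TSO_LENMAX)
		len = TSO_LENMAX;
	cap = IP_MAXPACKET - tso_hlen;
	nsegs = (len < cap ? len : cap) / SEGSZ;
	return (nsegs * SEGSZ);
}

int
main(void)
{
	/* 100000 bytes queued, 52 bytes of headers. */
	printf("%ld\n", tso_trim(100000, 52));	/* prints 31856 (22 * 1448) */
	return (0);
}
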
Example #17
/*ARGSUSED*/
int
mmrw(dev_t dev, struct uio *uio, int flags)
{
	struct iovec *iov;
	boolean_t allowed;
	int error = 0;
	size_t c;
	vaddr_t v;

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor(dev)) {

/* minor device 0 is physical memory */
		case 0:
			v = uio->uio_offset;
			c = iov->iov_len;
			if (v + c < v || v + c > ptoa((psize_t)physmem))
				return (EFAULT);
			v = (vaddr_t)PHYS_TO_XKPHYS(v, CCA_NONCOHERENT);
			error = uiomove((caddr_t)v, c, uio);
			continue;

/* minor device 1 is kernel memory */
		case 1:
			v = uio->uio_offset;
			c = ulmin(iov->iov_len, MAXPHYS);

			/* Allow access to RAM through XKPHYS... */
			if (IS_XKPHYS(v))
				allowed = is_memory_range(XKPHYS_TO_PHYS(v),
				    (psize_t)c, 0);
			/* ...or through CKSEG0... */
			else if (v >= CKSEG0_BASE &&
			    v < CKSEG0_BASE + CKSEG_SIZE)
				allowed = is_memory_range(CKSEG0_TO_PHYS(v),
				    (psize_t)c, CKSEG_SIZE);
			/* ...or through CKSEG1... */
			else if (v >= CKSEG1_BASE &&
			    v < CKSEG1_BASE + CKSEG_SIZE)
				allowed = is_memory_range(CKSEG1_TO_PHYS(v),
				    (psize_t)c, CKSEG_SIZE);
			/* ...otherwise, check it's within kernel kvm limits. */
			else
				allowed = uvm_kernacc((caddr_t)v, c,
				    uio->uio_rw == UIO_READ ? B_READ : B_WRITE);

			if (allowed) {
				error = uiomove((caddr_t)v, c, uio);
				continue;
			} else {
				return (EFAULT);
			}

/* minor device 2 is EOF/RATHOLE */
		case 2:
			if (uio->uio_rw == UIO_WRITE)
				uio->uio_resid = 0;
			return (0);

/* minor device 12 (/dev/zero) is source of nulls on read, rathole on write */
		case 12:
			if (uio->uio_rw == UIO_WRITE) {
				c = iov->iov_len;
				break;
			}
			if (zeropage == NULL)
				zeropage = malloc(PAGE_SIZE, M_TEMP,
				    M_WAITOK | M_ZERO);
			c = ulmin(iov->iov_len, PAGE_SIZE);
			error = uiomove(zeropage, c, uio);
			continue;

		default:
			return (ENODEV);
		}
		if (error)
			break;
		iov->iov_base += c;
		iov->iov_len -= c;
		uio->uio_offset += c;
		uio->uio_resid -= c;
	}
	return error;
}
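
Minor device 0 above guards physical-memory access with an overflow-safe range check: the v + c < v test catches address wraparound before the upper bound is compared. The same guard in a standalone form; LIMIT is an illustrative stand-in for ptoa(physmem).

#include <stdio.h>
#include <stddef.h>

#define LIMIT	((size_t)1 << 30)	/* assumed top of physical memory */

/* Return 1 iff [v, v + c) fits inside [0, LIMIT) without wrapping. */
static int
range_ok(size_t v, size_t c)
{
	if (v + c < v || v + c > LIMIT)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d\n", range_ok(4096, 4096));		/* 1: in range */
	printf("%d\n", range_ok(LIMIT - 1, 2));		/* 0: past limit */
	printf("%d\n", range_ok((size_t)-1, 2));	/* 0: wraps around */
	return (0);
}
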
Example #18
/* ARGSUSED */
int
memrw(struct cdev *dev, struct uio *uio, int flags)
{
	struct iovec *iov;
	vm_offset_t eva;
	vm_offset_t off;
	vm_offset_t ova;
	vm_offset_t va;
	vm_prot_t prot;
	vm_paddr_t pa;
	vm_size_t cnt;
	vm_page_t m;
	int error;
	int i;
	uint32_t colors;

	cnt = 0;
	colors = 1;
	error = 0;
	ova = 0;

	GIANT_REQUIRED;

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("memrw");
			continue;
		}
		if (dev2unit(dev) == CDEV_MINOR_MEM) {
			pa = uio->uio_offset & ~PAGE_MASK;
			if (!is_physical_memory(pa)) {
				error = EFAULT;
				break;
			}

			off = uio->uio_offset & PAGE_MASK;
			cnt = PAGE_SIZE - ((vm_offset_t)iov->iov_base &
			    PAGE_MASK);
			cnt = ulmin(cnt, PAGE_SIZE - off);
			cnt = ulmin(cnt, iov->iov_len);

			m = NULL;
			for (i = 0; phys_avail[i] != 0; i += 2) {
				if (pa >= phys_avail[i] &&
				    pa < phys_avail[i + 1]) {
					m = PHYS_TO_VM_PAGE(pa);
					break;
				}
			}

			if (m != NULL) {
				if (ova == 0) {
					if (dcache_color_ignore == 0)
						colors = DCACHE_COLORS;
					ova = kmem_alloc_wait(kernel_map,
					    PAGE_SIZE * colors);
				}
				if (colors != 1 && m->md.color != -1)
					va = ova + m->md.color * PAGE_SIZE;
				else
					va = ova;
				pmap_qenter(va, &m, 1);
				error = uiomove((void *)(va + off), cnt,
				    uio);
				pmap_qremove(va, 1);
			} else {
				va = TLB_PHYS_TO_DIRECT(pa);
				error = uiomove((void *)(va + off), cnt,
				    uio);
			}
			break;
		} else if (dev2unit(dev) == CDEV_MINOR_KMEM) {
			va = trunc_page(uio->uio_offset);
			eva = round_page(uio->uio_offset + iov->iov_len);

			/*
			 * Make sure that all of the pages are currently
			 * resident so we don't create any zero fill pages.
			 */
			for (; va < eva; va += PAGE_SIZE)
				if (pmap_kextract(va) == 0)
					return (EFAULT);

			prot = (uio->uio_rw == UIO_READ) ? VM_PROT_READ :
			    VM_PROT_WRITE;
			va = uio->uio_offset;
			if (va < VM_MIN_DIRECT_ADDRESS &&
			    kernacc((void *)va, iov->iov_len, prot) == FALSE)
				return (EFAULT);

			error = uiomove((void *)va, iov->iov_len, uio);
			break;
		}
		/* else panic! */
	}
	if (ova != 0)
		kmem_free_wakeup(kernel_map, ova, PAGE_SIZE * colors);
	return (error);
}
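
The CDEV_MINOR_MEM path above clamps each pass with a chain of ulmin() calls: to what remains of the destination page, to what remains of the source page, and to the iovec length. A userspace sketch of that clamp chain; ulmin() is re-implemented here because it lives in the kernel, not in libc.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(PAGE_SIZE - 1)

/* Kernel-style unsigned long minimum, ulmin(9), re-declared for userland. */
static unsigned long
ulmin(unsigned long a, unsigned long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	unsigned long iov_base = 0x10064;	/* destination address */
	unsigned long off = 4000;		/* offset into source page */
	unsigned long iov_len = 10000;
	unsigned long cnt;

	cnt = PAGE_SIZE - (iov_base & PAGE_MASK);	/* rest of dst page */
	cnt = ulmin(cnt, PAGE_SIZE - off);		/* rest of src page */
	cnt = ulmin(cnt, iov_len);			/* caller's buffer */
	printf("copy %lu bytes this pass\n", cnt);	/* 96 */
	return (0);
}
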
Example #19
/*
 * Implement uiomove(9) from physical memory using a combination
 * of the direct mapping and sf_bufs to reduce the creation and
 * destruction of ephemeral mappings.  
 */
int
uiomove_fromphys(vm_page_t ma[], vm_offset_t offset, int n, struct uio *uio)
{
	struct sf_buf *sf;
	struct thread *td = curthread;
	struct iovec *iov;
	void *cp;
	vm_offset_t page_offset;
	vm_paddr_t pa;
	vm_page_t m;
	size_t cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove_fromphys: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove_fromphys proc"));
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;
	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;
		page_offset = offset & PAGE_MASK;
		cnt = ulmin(cnt, PAGE_SIZE - page_offset);
		m = ma[offset >> PAGE_SHIFT];
		pa = VM_PAGE_TO_PHYS(m);
		if (m->md.color != DCACHE_COLOR(pa)) {
			sf = sf_buf_alloc(m, 0);
			cp = (char *)sf_buf_kva(sf) + page_offset;
		} else {
			sf = NULL;
			cp = (char *)TLB_PHYS_TO_DIRECT(pa) + page_offset;
		}
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error) {
				if (sf != NULL)
					sf_buf_free(sf);
				goto out;
			}
			break;
		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		if (sf != NULL)
			sf_buf_free(sf);
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		offset += cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
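
uiomove_fromphys() walks the page array one chunk at a time, clamping each chunk to the bytes left in the current page before advancing. The loop skeleton below reproduces just that bookkeeping in plain C; the page geometry is the usual 4 KB assumption.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096
#define PAGE_MASK	(PAGE_SIZE - 1)
#define PAGE_SHIFT	12

/* Report how n bytes starting at 'offset' split across page boundaries. */
static void
walk_pages(size_t offset, size_t n)
{
	while (n > 0) {
		size_t page_offset = offset & PAGE_MASK;
		size_t cnt = PAGE_SIZE - page_offset;

		if (cnt > n)
			cnt = n;
		printf("page %zu: %zu bytes at in-page offset %zu\n",
		    offset >> PAGE_SHIFT, cnt, page_offset);
		offset += cnt;
		n -= cnt;
	}
}

int
main(void)
{
	walk_pages(4000, 9000);		/* spans pages 0, 1, 2 and 3 */
	return (0);
}
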
Example #20
/*
 * Common function for DMA map synchronization.  May be called
 * by chipset-specific DMA map synchronization functions.
 *
 * This version works with the virtually-indexed, write-back cache
 * found in the MIPS-3/MIPS-4 CPUs available for the Algorithmics.
 */
void
_bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t offset,
    bus_size_t len, int ops)
{
	bus_size_t minlen;

#ifdef DIAGNOSTIC
	/*
	 * Mixing PRE and POST operations is not allowed.
	 */
	if ((ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) != 0 &&
	    (ops & (BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE)) != 0)
		panic("_bus_dmamap_sync: mix PRE and POST");

	if (offset >= map->dm_mapsize)
		panic("_bus_dmamap_sync: bad offset %"PRIxPADDR 
			" (map size is %"PRIxPSIZE")",
				offset, map->dm_mapsize);
	if (len == 0 || (offset + len) > map->dm_mapsize)
		panic("_bus_dmamap_sync: bad length");
#endif

	/*
	 * Since we're dealing with a virtually-indexed, write-back
	 * cache, we need to do the following things:
	 *
	 *	PREREAD -- Invalidate D-cache.  Note we might have
	 *	to also write-back here if we have to use an Index
	 *	op, or if the buffer start/end is not cache-line aligned.
	 *
	 *	PREWRITE -- Write-back the D-cache.  If we have to use
	 *	an Index op, we also have to invalidate.  Note that if
	 *	we are doing PREREAD|PREWRITE, we can collapse everything
	 *	into a single op.
	 *
	 *	POSTREAD -- Nothing.
	 *
	 *	POSTWRITE -- Nothing.
	 */
#ifdef _MIPS_NEED_BUS_DMA_BOUNCE
	struct mips_bus_dma_cookie * const cookie = map->_dm_cookie;
	if (cookie != NULL && (cookie->id_flags & _BUS_DMA_IS_BOUNCING)
	    && (ops & BUS_DMASYNC_PREWRITE)) {
		STAT_INCR(write_bounces);
		/*
		 * Copy the caller's buffer to the bounce buffer.
		 */
		switch (cookie->id_buftype) {
		case _BUS_DMA_BUFTYPE_LINEAR:
			memcpy((char *)cookie->id_bouncebuf + offset,
			    cookie->id_origlinearbuf + offset, len);
			break;
		case _BUS_DMA_BUFTYPE_MBUF:
			m_copydata(cookie->id_origmbuf, offset, len,
			    (char *)cookie->id_bouncebuf + offset);
			break;
		case _BUS_DMA_BUFTYPE_UIO:
			_bus_dma_uiomove((char *)cookie->id_bouncebuf + offset,
			    cookie->id_origuio, len, UIO_WRITE);
			break;
#ifdef DIAGNOSTIC
		case _BUS_DMA_BUFTYPE_RAW:
			panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_RAW");
			break;

		case _BUS_DMA_BUFTYPE_INVALID:
			panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_INVALID");
			break;

		default:
			panic("_bus_dmamap_sync: unknown buffer type %d\n",
			    cookie->id_buftype);
			break;
#endif /* DIAGNOSTIC */
		}
	}
#endif /* _MIPS_NEED_BUS_DMA_BOUNCE */

	/*
	 * Flush the write buffer.
	 * XXX Is this always necessary?
	 */
	wbflush();

	/*
	 * If the mapping is of COHERENT DMA-safe memory or this isn't a
	 * PREREAD or PREWRITE, no cache flush is necessary.  Check to see
	 * if we need to bounce it.
	 */
	if ((map->_dm_flags & _BUS_DMAMAP_COHERENT)
	    || (ops & (BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE)) == 0)
		goto bounce_it;

	/*
	 * If the mapping belongs to the kernel, or it belongs
	 * to the currently-running process (XXX actually, vmspace),
	 * then we can use Hit ops.  Otherwise, Index ops.
	 *
	 * This should be true the vast majority of the time.
	 */
	const bool useindex = (!VMSPACE_IS_KERNEL_P(map->_dm_vmspace)
	    && map->_dm_vmspace != curproc->p_vmspace);

	bus_dma_segment_t *seg = map->dm_segs;
	bus_dma_segment_t * const lastseg = seg + map->dm_nsegs;
	/*
	 * Skip segments until the offset is within a segment.
	 */
	for (; offset >= seg->ds_len; seg++) {
		offset -= seg->ds_len;
	}
		
	for (; seg < lastseg && len != 0; seg++, offset = 0, len -= minlen) {
		/*
		 * Now at the first segment to sync; nail each segment until we
		 * have exhausted the length.
		 */
		vaddr_t vaddr = seg->_ds_vaddr + offset;
		minlen = ulmin(len, seg->ds_len - offset);

#ifdef BUS_DMA_DEBUG
		printf("bus_dmamap_sync: flushing segment %p "
		    "(0x%"PRIxBUSADDR"+%"PRIxBUSADDR
		    ", 0x%"PRIxBUSADDR"+0x%"PRIxBUSADDR
		    ") (olen = %"PRIxBUSADDR")...", seg,
		    vaddr - offset, offset,
		    vaddr - offset, offset + minlen - 1, len);
#endif

		/*
		 * If we are forced to use Index ops, it's always a
		 * Write-back,Invalidate, so just do one test.
		 */
		if (__predict_false(useindex)) {
			mips_dcache_wbinv_range_index(vaddr, minlen);
#ifdef BUS_DMA_DEBUG
			printf("\n");
#endif
			continue;
		}

		switch (ops) {
		case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
			mips_dcache_wbinv_range(vaddr, minlen);
			break;

		case BUS_DMASYNC_PREREAD:
#if 1
			mips_dcache_wbinv_range(vaddr, minlen);
#else
			mips_dcache_inv_range(vaddr, minlen);
#endif
			break;

		case BUS_DMASYNC_PREWRITE:
			mips_dcache_wb_range(vaddr, minlen);
			break;
		}
#ifdef BUS_DMA_DEBUG
		printf("\n");
#endif
	}

  bounce_it:
#ifdef _MIPS_NEED_BUS_DMA_BOUNCE
	if ((ops & BUS_DMASYNC_POSTREAD) == 0
	    || cookie == NULL
	    || (cookie->id_flags & _BUS_DMA_IS_BOUNCING) == 0)
		return;

	STAT_INCR(read_bounces);
	/*
	 * Copy the bounce buffer to the caller's buffer.
	 */
	switch (cookie->id_buftype) {
	case _BUS_DMA_BUFTYPE_LINEAR:
		memcpy(cookie->id_origlinearbuf + offset,
		    (char *)cookie->id_bouncebuf + offset, len);
		break;

	case _BUS_DMA_BUFTYPE_MBUF:
		m_copyback(cookie->id_origmbuf, offset, len, 
		    (char *)cookie->id_bouncebuf + offset);
		break;

	case _BUS_DMA_BUFTYPE_UIO:
		_bus_dma_uiomove((char *)cookie->id_bouncebuf + offset,
		    cookie->id_origuio, len, UIO_READ);
		break;
#ifdef DIAGNOSTIC
	case _BUS_DMA_BUFTYPE_RAW:
		panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_RAW");
		break;

	case _BUS_DMA_BUFTYPE_INVALID:
		panic("_bus_dmamap_sync: _BUS_DMA_BUFTYPE_INVALID");
		break;

	default:
		panic("_bus_dmamap_sync: unknown buffer type %d\n",
		    cookie->id_buftype);
		break;
#endif
	}
#endif /* _MIPS_NEED_BUS_DMA_BOUNCE */
	;
}
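
The segment walk in _bus_dmamap_sync() first discards whole segments until the offset falls inside one, then clamps each flush to the remainder of that segment. A standalone sketch of that traversal over an illustrative segment list:

#include <stdio.h>

struct seg {
	unsigned long ds_len;
};

int
main(void)
{
	struct seg segs[] = { { 512 }, { 4096 }, { 2048 } };
	struct seg *seg = segs, *lastseg = segs + 3;
	unsigned long offset = 600, len = 4500, minlen;

	/* Skip segments until the offset lands inside one. */
	for (; offset >= seg->ds_len; seg++)
		offset -= seg->ds_len;

	/* Flush each segment's share until the length is exhausted. */
	for (; seg < lastseg && len != 0; seg++, offset = 0, len -= minlen) {
		minlen = seg->ds_len - offset;
		if (minlen > len)
			minlen = len;
		printf("sync %lu bytes at segment offset %lu\n",
		    minlen, offset);
	}
	return (0);
}
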
Example #21
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6 = NULL;
#ifdef INET6
	const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#else
	const boolean_t isipv6 = FALSE;
#endif

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the link.
	 * So unless we are expecting more acks to come in, slow-start from
	 * scratch to re-determine the send congestion window.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		if (tcp_do_rfc3390) {
			int initial_cwnd =
			    min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));

			tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd);
		} else {
			tp->snd_cwnd = tp->t_maxseg;
		}
		tp->snd_wacked = 0;
	}

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

again:
	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT)
			return 0;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  3. send buffer fill has not hit maximal automatic size;
	 *  4. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However, testing has shown this not
	 * to be much of a problem.  At worst we are trading the waste
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window.  Requires another timer.  Has to
	 * wait for upcoming tcp timer rewrite.
	 */
	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
		    so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
		    so->so_snd.ssb_cc < tcp_autosndbuf_max &&
		    sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
			u_long newsize;

			newsize = ulmin(so->so_snd.ssb_hiwat +
					 tcp_autosndbuf_inc,
					tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
		}
	}

	/*
	 * Truncate to the maximum segment length and ensure that FIN is
	 * removed if the length no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = TRUE;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
	 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * tp->t_maxseg)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	return (0);

send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!LIST_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
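
The pure-window-update heuristic above fires once the advertisable window can grow by two full segments or by half the effective high-water mark. Below is a simplified standalone sketch of that decision; it ignores the avoid_pure_win_update gate, and SEGSZ is an assumed t_maxseg rather than a live value.

#include <stdio.h>

#define TCP_MAXWIN	65535L
#define SEGSZ		1448L		/* assumed t_maxseg */

/* Decide whether drained buffer space justifies a pure window update. */
static int
want_win_update(long recvwin, int rcv_scale, long ssb_hiwat, long outstanding)
{
	long maxwin = TCP_MAXWIN << rcv_scale;
	long adv = (recvwin < maxwin ? recvwin : maxwin) - outstanding;
	long hiwat = maxwin < ssb_hiwat ? maxwin : ssb_hiwat;

	return (adv >= 2 * SEGSZ || adv >= hiwat / 2);
}

int
main(void)
{
	/* 16 KB drained, nothing already advertised beyond rcv_nxt. */
	printf("%d\n", want_win_update(16384, 0, 65536, 0));	/* 1 */
	/* Only 1 KB drained: not worth a pure update. */
	printf("%d\n", want_win_update(1024, 0, 65536, 0));	/* 0 */
	return (0);
}
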
Example #22
int
memrw(struct cdev *dev, struct uio *uio, int flags)
{
	struct iovec *iov;
	struct vm_page m;
	vm_page_t marr;
	vm_offset_t off, v;
	u_int cnt;
	int error;

	error = 0;

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("memrw");
			continue;
		}

		v = uio->uio_offset;
		off = v & PAGE_MASK;
		cnt = ulmin(iov->iov_len, PAGE_SIZE - (u_int)off);
		if (cnt == 0)
			continue;

		switch(dev2unit(dev)) {
		case CDEV_MINOR_KMEM:
			/* If the address is in the DMAP just copy it */
			if (VIRT_IN_DMAP(v)) {
				error = uiomove((void *)v, cnt, uio);
				break;
			}

			if (!kernacc((void *)v, cnt, uio->uio_rw == UIO_READ ?
			    VM_PROT_READ : VM_PROT_WRITE)) {
				error = EFAULT;
				break;
			}

			/* Get the physical address to read */
			v = pmap_extract(kernel_pmap, v);
			if (v == 0) {
				error = EFAULT;
				break;
			}

			/* FALLTHROUGH */
		case CDEV_MINOR_MEM:
			/* If within the DMAP use this to copy from */
			if (PHYS_IN_DMAP(v)) {
				v = PHYS_TO_DMAP(v);
				error = uiomove((void *)v, cnt, uio);
				break;
			}

			/* Have uiomove_fromphys handle the data */
			m.phys_addr = trunc_page(v);
			marr = &m;
			uiomove_fromphys(&marr, off, cnt, uio);
			break;
		}
	}

	return (error);
}
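
The switch above leans on a deliberate fallthrough: the KMEM case translates its kernel virtual address to a physical one, then drops into the MEM case to perform the copy. A tiny sketch of that control shape; translate() here is a made-up stand-in for pmap_extract().

#include <stdio.h>

/* Hypothetical stand-in for pmap_extract(): map a VA to its PA. */
static unsigned long
translate(unsigned long va)
{
	return (va - 0x1000UL);		/* pretend linear offset mapping */
}

static void
mem_access(int minor_dev, unsigned long v)
{
	switch (minor_dev) {
	case 1:				/* kmem: resolve the VA first... */
		v = translate(v);
		/* FALLTHROUGH */
	case 0:				/* ...then treat it as physical */
		printf("touch physical address %#lx\n", v);
		break;
	default:
		printf("no such device\n");
		break;
	}
}

int
main(void)
{
	mem_access(1, 0x2000UL);	/* kmem path, translated then used */
	mem_access(0, 0x1000UL);	/* mem path, used directly */
	return (0);
}
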