Example #1
static int
efab_tcp_drop_from_acceptq(ci_private_t *priv, void *arg)
{
  struct oo_op_tcp_drop_from_acceptq *carg = arg;
  tcp_helper_resource_t *thr;
  tcp_helper_endpoint_t *ep;
  citp_waitable *w;
  ci_tcp_state *ts;
  int rc = -EINVAL;

  /* find stack */
  rc = efab_thr_table_lookup(NULL, carg->stack_id,
                                 EFAB_THR_TABLE_LOOKUP_CHECK_USER |
                                 EFAB_THR_TABLE_LOOKUP_NO_UL,
                                 &thr);

  if( rc < 0 )
    return rc;
  ci_assert( thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND );

  /* find endpoint and drop OS socket */
  ep = ci_trs_get_valid_ep(thr, carg->sock_id);
  if( ep == NULL )
    goto fail1;

  w = SP_TO_WAITABLE(&thr->netif, carg->sock_id);
  if( !(w->state & CI_TCP_STATE_TCP) || w->state == CI_TCP_LISTEN )
    goto fail2;
  ts = SP_TO_TCP(&thr->netif, carg->sock_id);
  ci_assert(ep->os_port_keeper);
  ci_assert_equal(ep->os_socket, NULL);

  LOG_TV(ci_log("%s: send reset to non-accepted connection", __FUNCTION__));

  /* copy from ci_tcp_listen_shutdown_queues() */
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_TCP_IN_ACCEPTQ);
  rc = ci_netif_lock(&thr->netif);
  if( rc != 0 ) {
    ci_assert_equal(rc, -EINTR);
    rc = -ERESTARTSYS;
    goto fail2;
  }
  ci_bit_clear(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);
  /* We have no way to close this connection from the other side:
   * there was no RST from the peer. */
  ci_assert_nequal(ts->s.b.state, CI_TCP_CLOSED);
  ci_assert_nequal(ts->s.b.state, CI_TCP_TIME_WAIT);
  ci_tcp_send_rst(&thr->netif, ts);
  ci_tcp_drop(&thr->netif, ts, ECONNRESET);
  ci_assert_equal(ep->os_port_keeper, NULL);
  ci_netif_unlock(&thr->netif);
  efab_tcp_helper_k_ref_count_dec(thr, 1);
  return 0;

fail1:
  efab_thr_release(thr);
fail2:
  ci_log("%s: inconsistent ep %d:%d", __func__, carg->stack_id, carg->sock_id);
  return rc;
}
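
The function above takes a reference on the stack via efab_thr_table_lookup() and drops it again on the way out, funnelling errors through goto labels. A minimal, self-contained sketch of that lookup-with-reference / goto-cleanup idiom follows; all names in it are hypothetical stand-ins, not part of the Onload API.

#include <errno.h>

struct resource;
extern int lookup_resource(int id, struct resource **out); /* takes a reference on success */
extern void release_resource(struct resource *r);          /* drops that reference */
extern int do_work(struct resource *r);

static int op_with_cleanup(int id)
{
  struct resource *r;
  int rc = lookup_resource(id, &r);
  if( rc < 0 )
    return rc;                 /* nothing acquired yet, plain return is fine */

  rc = do_work(r);
  if( rc < 0 )
    goto fail;                 /* errors funnel through one cleanup point */

  release_resource(r);
  return 0;

 fail:
  release_resource(r);
  return rc;
}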
Example #2
static ci_tcp_state_synrecv*
ci_tcp_listenq_bucket_lookup(ci_netif* ni, ci_tcp_listen_bucket* bucket,
                             ciip_tcp_rx_pkt* rxp,
                             int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(rxp->hash, level);
  ci_tcp_state_synrecv* tsr;
  unsigned saddr, daddr, sport;
#ifdef __KERNEL__
  int i = 0;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                            __FUNCTION__);
    return 0;
  }
#endif

  LOG_TV(ci_log("%s([%d] level=%d hash:%x l:%s r:%s:%d)", __func__,
                NI_ID(ni), level, rxp->hash,
                ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_daddr_be32),
                ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_saddr_be32),
                CI_BSWAP_BE16(rxp->tcp->tcp_source_be16)));
  if( OO_P_IS_NULL(bucket->bucket[idx]) )
    return NULL;

  level++;
  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);
  if( aux->type == CI_TCP_AUX_TYPE_BUCKET )
    return ci_tcp_listenq_bucket_lookup(ni, &aux->u.bucket, rxp, level);

  saddr = oo_ip_hdr(rxp->pkt)->ip_saddr_be32;
  daddr = oo_ip_hdr(rxp->pkt)->ip_daddr_be32;
  sport = rxp->tcp->tcp_source_be16;

  tsr = &aux->u.synrecv;
  do {
    if( ! ((saddr - tsr->r_addr) | (daddr - tsr->l_addr) |
           (sport - tsr->r_port)) )
      return tsr;
    if( OO_P_IS_NULL(tsr->bucket_link) )
      return NULL;
    aux = ci_ni_aux_p2aux(ni, tsr->bucket_link);
    tsr = &aux->u.synrecv;
#ifdef __KERNEL__
    if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
      ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                              __FUNCTION__);
      return NULL;
    }
#endif
  } while(1);

  /* unreachable */
  return NULL;
}
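
The match test inside the do/while loop ORs the differences of the three address/port fields so that a single branch decides whether all of them are equal at once. The same trick in isolation, as a stand-alone sketch with hypothetical names (not Onload code):

#include <stdio.h>

struct tuple { unsigned saddr, daddr, sport; };

/* Returns non-zero iff all three fields match, using a single branch. */
static int tuple_match(const struct tuple *a, const struct tuple *b)
{
  return !((a->saddr - b->saddr) | (a->daddr - b->daddr) | (a->sport - b->sport));
}

int main(void)
{
  struct tuple x = { 0x0a000001, 0x0a000002, 80 };
  struct tuple y = x;
  printf("match=%d\n", tuple_match(&x, &y));   /* prints match=1 */
  y.sport = 8080;
  printf("match=%d\n", tuple_match(&x, &y));   /* prints match=0 */
  return 0;
}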
Example #3
static void
ci_tcp_listenq_bucket_insert(ci_netif* ni, ci_tcp_socket_listen* tls,
                             ci_tcp_listen_bucket* bucket,
                             ci_tcp_state_synrecv* tsr, int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(tsr->hash, level);
  oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr);
#ifdef __KERNEL__
  int i = 0;
#endif

  LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__,
                NI_ID(ni), level, TSR_ARGS(tsr)));

  if( OO_P_IS_NULL(bucket->bucket[idx]) ) {
    bucket->bucket[idx] = tsr_p;
    return;
  }

  level++;
  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);
  if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) {
    ci_tcp_listenq_bucket_insert(ni, tls, &aux->u.bucket, tsr, level);
    return;
  }

  /* So this bucket holds a list of other synrecv states.  We add our
   * tsr to this list and try to improve things by allocating a
   * next-level bucket. */
  tsr->bucket_link = bucket->bucket[idx];
  bucket->bucket[idx] = tsr_p;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) )
    return;

  bucket->bucket[idx] = ci_ni_aux_alloc_bucket(ni);
  if( OO_P_IS_NULL(bucket->bucket[idx]) )
    return;
  bucket = ci_ni_aux_p2bucket(ni, bucket->bucket[idx]);
  tls->n_buckets++;

  while( OO_P_NOT_NULL(tsr_p) ) {
    tsr = &ci_ni_aux_p2aux(ni, tsr_p)->u.synrecv;
#ifdef __KERNEL__
    if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
      ci_tcp_listenq_bucket_insert(ni, tls, bucket, tsr, level);
      ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                              __FUNCTION__);
      return;
    }
#endif
    tsr_p = tsr->bucket_link;
    tsr->bucket_link = OO_P_NULL;
    ci_tcp_listenq_bucket_insert(ni, tls, bucket, tsr, level);
  }
}
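
ci_tcp_listenq_hash2idx() is not shown in these examples. A common way to derive a per-level bucket index from a single hash value, which is the kind of scheme the recursion above relies on, is to consume a fixed number of hash bits per level. A sketch under that assumption (4 bits per level giving 16-way buckets; illustrative only, not the actual Onload definition):

/* Hypothetical index derivation: take 4 hash bits per tree level. */
#define BUCKET_BITS 4
#define BUCKET_SIZE (1 << BUCKET_BITS)   /* 16-way buckets */

static inline int hash2idx(unsigned hash, int level)
{
  return (hash >> (level * BUCKET_BITS)) & (BUCKET_SIZE - 1);
}

With such a scheme, entries that collide at one level have a good chance of separating at the next, which is why the insert path above redistributes a full list into a freshly allocated next-level bucket.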
Example #4
citp_waitable_obj* citp_waitable_obj_alloc(ci_netif* netif)
{
  citp_waitable_obj* wo;

  ci_assert(netif);
  ci_assert(ci_netif_is_locked(netif));

  if( netif->state->deferred_free_eps_head != CI_ILL_END ) {
    ci_uint32 link;
    do
      link = netif->state->deferred_free_eps_head;
    while( ci_cas32_fail(&netif->state->deferred_free_eps_head,
                         link, CI_ILL_END));
    while( link != CI_ILL_END ) {
      citp_waitable* w = ID_TO_WAITABLE(netif, link);
      link = w->next_id;
      CI_DEBUG(w->next_id = CI_ILL_END);
      ci_assert_equal(w->state, CI_TCP_STATE_FREE);
      ci_assert(OO_SP_IS_NULL(w->wt_next));
      w->wt_next = netif->state->free_eps_head;
      netif->state->free_eps_head = W_SP(w);
    }
  }

  if( OO_SP_IS_NULL(netif->state->free_eps_head) ) {
    ci_tcp_helper_more_socks(netif);

    if( OO_SP_IS_NULL(netif->state->free_eps_head) )
      ci_netif_timeout_reap(netif);
  }

  if( OO_SP_IS_NULL(netif->state->free_eps_head) )
    return NULL;

  LOG_TV(ci_log("%s: allocating %d", __FUNCTION__,
                OO_SP_FMT(netif->state->free_eps_head)));

  ci_assert(IS_VALID_SOCK_P(netif, netif->state->free_eps_head));
#if !defined(__KERNEL__) && !defined (CI_HAVE_OS_NOPAGE)
  ci_netif_mmap_shmbuf(netif,
                       (netif->state->free_eps_head >> EP_BUF_BLOCKSHIFT) + 1);
#endif
  wo = SP_TO_WAITABLE_OBJ(netif, netif->state->free_eps_head);

  ci_assert(OO_SP_EQ(W_SP(&wo->waitable), netif->state->free_eps_head));
  ci_assert_equal(wo->waitable.state, CI_TCP_STATE_FREE);
  ci_assert_equal(wo->waitable.sb_aflags, (CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_NOT_READY));
  ci_assert_equal(wo->waitable.lock.wl_val, 0);

  netif->state->free_eps_head = wo->waitable.wt_next;
  CI_DEBUG(wo->waitable.wt_next = OO_SP_NULL);
  ci_assert_equal(wo->waitable.state, CI_TCP_STATE_FREE);

  return wo;
}
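
The CAS loop at the top of citp_waitable_obj_alloc() detaches the entire deferred-free list in one atomic step before walking it under the stack lock, so concurrent producers pushing onto deferred_free_eps_head are never blocked. The Onload code works on 32-bit socket IDs with ci_cas32_fail(); the same "steal the whole list" pattern expressed with C11 atomics and plain pointers looks roughly like this (a sketch, not the Onload primitives):

#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

/* Atomically detach every node currently on the lock-free stack; concurrent
 * pushers observe an empty list afterwards and carry on unimpeded. */
static struct node *steal_all(_Atomic(struct node *) *head)
{
  struct node *old = atomic_load(head);
  while( !atomic_compare_exchange_weak(head, &old, NULL) )
    ;   /* on failure 'old' is refreshed with the current head, so just retry */
  return old;
}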
Example #5
/*
** See if there is a synrecv object that matches this syn request already.
*/
ci_tcp_state_synrecv*
ci_tcp_listenq_lookup(ci_netif* netif, ci_tcp_socket_listen* tls,
                      ciip_tcp_rx_pkt* rxp)
{
  ci_tcp_state_synrecv* tsr;

  tsr = ci_tcp_listenq_bucket_lookup(
                        netif, ci_ni_aux_p2bucket(netif, tls->bucket),
                        rxp, 0);
  if( tsr == NULL ) {
    LOG_TV(log(LPF "no match for %s:%d->%s:%d",
               ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_saddr_be32),
               (int) CI_BSWAP_BE16(rxp->tcp->tcp_source_be16),
               ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_daddr_be32),
               (int) CI_BSWAP_BE16(rxp->tcp->tcp_dest_be16)));
  }

  return tsr;
}
Example #6
/* NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will
 * be in user-space and may need to be fetched into kernel memory. */
static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request,
                           void* arg)
{
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = NULL;
  int rc = 0;
  int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED;

  if( s->b.state != CI_TCP_LISTEN )
    ts = SOCK_TO_TCP(s);

  /* Keep the os socket in sync.  If this is a "get" request then the
   * return will be based on our support, not the os's (except for EFAULT
   * handling which we get for free).
   * Exceptions:
   *  - FIONBIO is applied just in time on handover if needed (listening
   *    sockets always have a non-blocking OS socket)
   *  - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on the OS
   *    socket, so we avoid the syscall.
   */
  if( os_socket_exists && request != FIONREAD && request != SIOCATMARK &&
      request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD &&
      request != (int) FIONBIO ) {
    rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL);
    if( rc < 0 )
      return rc;
  }

  /* ioctl defines are listed in `man ioctl_list`; the CI equivalents are
   * defined in include/ci/net/ioctls.h  */
  LOG_TV( ci_log("%s:  request = %d, arg = %ld", __FUNCTION__, request, 
		 (long)arg));

  switch( request ) {
  case FIONBIO:
    if( CI_IOCTL_ARG_OK(int, arg) ) {
      CI_CMN_IOCTL_FIONBIO(ep->s, arg);
      rc = 0;
      break;
    }
    goto fail_fault;
  case FIONREAD: /* synonym of SIOCINQ */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state == CI_TCP_SYN_SENT ) {
      CI_IOCTL_SETARG((int*)arg, 0);
    } else {
      /* In inline mode, return the total number of bytes in the receive queue.
         If SO_OOBINLINE isn't set then return the number of bytes up to the
         mark but without counting the mark */
      int bytes_in_rxq = tcp_rcv_usr(ts);
      if (bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE)) {

        if (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) {
          /*! \TODO: what if FIN has been received? */
          unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq;
          if (SEQ_LT(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq = tcp_rcv_up(ts) - readnxt;
          } else if (SEQ_EQ(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq--;
          }
        }

      }
      CI_IOCTL_SETARG((int*)arg, bytes_in_rxq);
    }
    break;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
  case SIOCOUTQNSD:
    {
    CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ);
    int outq_bytes = 0;

    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state != CI_TCP_SYN_SENT ) {

      /* TIOCOUTQ counts all unacknowledged data, so includes retrans queue. */
      if( request == TIOCOUTQ )
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts));
      else
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts));
    }
    CI_IOCTL_SETARG((int*)arg, outq_bytes);
    }
    break;

  case SIOCATMARK:
    {
     if( !CI_IOCTL_ARG_OK(int, arg) )
       goto fail_fault;

      /* return true if we are at the out-of-band byte */
      CI_IOCTL_SETARG((int*)arg, 0);
      if( s->b.state != CI_TCP_LISTEN ) {
	int readnxt;

        readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts));
        if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA )
          readnxt = SEQ_SUB(readnxt, 1);
        if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID )
          CI_IOCTL_SETARG((int*)arg, readnxt == tcp_rcv_up(ts));
        LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d  readnxt=%u rcv_up=%u%s",
                    NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts), 
		    readnxt,  tcp_rcv_up(SOCK_TO_TCP(ep->s)),
                    (tcp_urg_data(ts)&CI_TCP_URG_PTR_VALID)?"":" (invalid)"));
      }
      break;
    }

#ifndef __KERNEL__
  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked.
     */
    rc = ci_sys_ioctl(fd, request, arg);
    break;

  case SIOCSPGRP:
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl().
     */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc == 0 ) {
      rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
    }
    else {
      CI_SET_ERROR(rc, -rc);
    }
    break;
#endif

  default:
    return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
  }
Example #7
static int ci_tcp_setsockopt_lk(citp_socket* ep, ci_fd_t fd, int level,
				int optname, const void* optval,
				socklen_t optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  ci_tcp_socket_cmn* c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  int zeroval = 0;
  int rc;

  /* ?? what to do about optval and optlen checking
  ** Kernel can raise EFAULT, here we are a little in the dark.
  ** Note: If the OS sock is sync'd then we get this checking for free.
  */

  if (optlen == 0) {
    /* Match kernel behaviour: if the length is 0, treat the value as 0;
     * some applications rely on this.
     */
    optval = &zeroval;
    optlen = sizeof(zeroval);
  }

  /* If you're adding to this please remember to look in common_sockopts.c
   * and decide if the option is common to all protocols. */

  if(level == SOL_SOCKET) {
    switch(optname) {
    case SO_KEEPALIVE:
      /* Over-ride the default common handler.
       * Enable sending of keep-alive messages */
      if( (rc = opt_not_ok(optval, optlen, unsigned)) )
      	goto fail_inval;

      if( *(unsigned*) optval ) {
	unsigned prev_flags = s->s_flags;
	s->s_flags |= CI_SOCK_FLAG_KALIVE;
	/* Set KEEPALIVE timer only if we are not in
	** CLOSE or LISTENING state. */
	if( s->b.state != CI_TCP_CLOSED && s->b.state != CI_TCP_LISTEN &&
	    !(prev_flags & CI_SOCK_FLAG_KALIVE) ) {
	  ci_tcp_state* ts = SOCK_TO_TCP(s);
	  LOG_TV(log("%s: "NSS_FMT" run KEEPALIVE timer from setsockopt()",
		     __FUNCTION__, NSS_PRI_ARGS(netif, s)));
	  ci_assert(ts->ka_probes == 0);
	  ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
	}
      }
      else {
      	s->s_flags &=~ CI_SOCK_FLAG_KALIVE;
	if( s->b.state != CI_TCP_LISTEN ) {
	  ci_tcp_state* ts = SOCK_TO_TCP(s);
	  ci_tcp_kalive_check_and_clear(netif, ts);
	  ts->ka_probes = 0;
	}
      }
      break;

    default:
      {
        /* Common socket level options */
        return ci_set_sol_socket(netif, s, optname, optval, optlen);
      }
    }
  }
  else if( level == IPPROTO_IP ) {
Example #8
/*
** promote a synrecv structure to an established socket
**
** Assumes that the caller will handle a failure if we can't allocate a new
** tcp_state structure due to memory pressure or the like.
*/
int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls,
                               ci_tcp_state_synrecv* tsr,
                               ci_ip_cached_hdrs* ipcache,
                               ci_tcp_state** ts_out)
{
  int rc = 0;
  
  ci_assert(netif);
  ci_assert(tls);
  ci_assert(tls->s.b.state == CI_TCP_LISTEN);
  ci_assert(tsr);

  if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) {
    ci_tcp_state* ts;

    /* grab a tcp_state structure that will go onto the accept queue.  We take
     * from the cache of EPs if any are available
     */
    ts = get_ts_from_cache (netif, tsr, tls); 
    if( !ts ) {
      /* None on cache; try allocating a new ts */
      ts = ci_tcp_get_state_buf(netif);
#if CI_CFG_FD_CACHING
      if( ts == NULL ) {
        /* We've reaped.  Did this result in any being cached? */
        ts = get_ts_from_cache(netif, tsr, tls);
        if (ts == NULL ) {
          /* No -- try again to allocate. */
          ts = ci_tcp_get_state_buf(netif);
        }
        else {
          CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap);
        }
      }
#endif
      if( ts == NULL ) {
        LOG_TV(ci_log("%s: [%d] out of socket buffers",
                      __FUNCTION__, NI_ID(netif)));
        CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock);
        CI_SET_SO_ERROR(&tls->s, ENOMEM);
        citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);
        return -ENOMEM;
      }


      ci_assert(ci_tcp_is_cached(ts) ||
                (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN));
    }

#ifdef ONLOAD_OFE
    ts->s.ofe_code_start = tls->ofe_promote;
#endif

    if( ! ci_tcp_is_cached(ts) ) {
      /* Need to initialise address information for use when setting filters */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      /* "borrow" filter from listening socket.  For loopback socket, we
       * do not need filters, but we have to take a reference of the OS
       * socket. */
      rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice,
                                 S_SP(tls));
      if( rc < 0 ) {
        LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc));
        /* Either put this back on the list (at the head) or free it */
        ci_tcp_state_free(netif, ts);
        return rc;
      }
    }
#if CI_CFG_FD_CACHING
    else {
      /* Now set the s/w filter.  We leave the hw filter in place for cached
       * EPs.  This will probably not have the correct raddr and rport, but as
       * it's sharing the listening socket's filter that's not a problem.  It
       * will be updated if this is still around when the listener is closed.
       */
      rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr,
                                  sock_lport_be16(&tls->s), tsr->r_addr,
                                  tsr->r_port, tcp_protocol(ts));

      if (rc < 0) {
        /* Bung it back on the cache list */
        LOG_EP(ci_log("Unable to create s/w filter!"));
        ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link);
        return rc;
      }

      /* Need to initialise address information.  We do this after trying to
       * insert the sw filter, so we can push the tcp state back onto the
       * cache queue with as few changes as possible if we fail to add the
       * sw filter.
       */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd));
      ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link);
    }
#endif

    ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts)));
    ci_assert(ts->s.b.state == CI_TCP_CLOSED);
    ts->s.domain = tls->s.domain;

    cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache);
    ci_pmtu_state_init(netif, &ts->s, &ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
    ci_pmtu_set(netif, &ts->pmtus,
                CI_MIN(ts->s.pkt.mtu,
                       tsr->tcpopts.smss + sizeof(ci_tcp_hdr)
                         + sizeof(ci_ip4_hdr)));

    /* If we've got SYN via local route, we can handle it */
    ci_assert_equiv(ts->s.pkt.status == retrrc_localroute,
                    OO_SP_NOT_NULL(tsr->local_peer));
    if( ts->s.pkt.status == retrrc_localroute )
      ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE;

    ts->amss = tsr->amss;

    /* options and flags */
    ts->tcpflags = 0;
    ts->tcpflags |= tsr->tcpopts.flags;
    ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED;
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr);
    if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) {
      ts->snd_wscl = tsr->tcpopts.wscl_shft;
      ts->rcv_wscl = tsr->rcv_wscl;
    } else {
      ts->snd_wscl = ts->rcv_wscl = 0u;
    }
    CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl);
    CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl);

    /* Send and receive sequence numbers */
    tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
      tsr->snd_isn + 1;
    ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0);
    ci_tcp_rx_set_isn(ts, tsr->rcv_nxt);
    tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1);

    if( ts->tcpflags & CI_TCPT_FLAG_TSO ) {
      ts->incoming_tcp_hdr_len += 12;
      ts->outgoing_hdrs_len += 12;
      ts->tspaws = ci_tcp_time_now(netif);
      ts->tsrecent = tsr->tspeer;
      ts->tslastack = tsr->rcv_nxt;
    }
    else {
      /* Must be after initialising snd_una. */
      ci_tcp_clear_rtt_timing(ts);
      ts->timed_ts = tsr->timest;
    }
    /* SACK has nothing to be done. */

    /* ?? ECN */
    ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr)));

    ts->smss = tsr->tcpopts.smss;
    ts->c.user_mss = tls->c.user_mss;
    if (ts->c.user_mss && ts->c.user_mss < ts->smss)
      ts->smss = ts->c.user_mss;
#if CI_CFG_LIMIT_SMSS
    ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__);
#endif
    ci_assert(ts->smss>0);
    ci_tcp_set_eff_mss(netif, ts);
    ci_tcp_set_initialcwnd(netif, ts);

    /* Copy socket options & related fields that should be inherited. 
     * Note: Windows does not inherit rcvbuf until the call to accept 
     * completes. The assumption here is that all options can be
     * inherited at the same time (most won't have an effect until there
     * is a socket available for use by the app.).
     */
    ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)");

    /* NB. Must have already set peer (which we have). */
    ci_tcp_set_established_state(netif, ts);
    CITP_STATS_NETIF(++netif->state->stats.synrecv2established);
  
    ci_assert(ts->ka_probes == 0);
    ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
    ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);

    /* Remove the synrecv structure from the listen queue, and free the
    ** buffer. */
    if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE )
      ci_free(tsr);
    else {
      ci_tcp_listenq_remove(netif, tls, tsr);
      ci_tcp_synrecv_free(netif, tsr);
    }

    ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);
    ci_tcp_acceptq_put(netif, tls, &ts->s.b);

    LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x",
               LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags);
           log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x",
               LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts),
               tcp_snd_una(ts),
               tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts)));

    citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);
    *ts_out = ts;
    return 0;
  }
Example #9
/* Return 1 if the bucket is empty now */
static int
ci_tcp_listenq_bucket_remove(ci_netif* ni, ci_tcp_socket_listen* tls,
                             ci_tcp_listen_bucket* bucket,
                             ci_tcp_state_synrecv* tsr, int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(tsr->hash, level);
  oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr);

  /* Fixme: we remove empty buckets only.  In theory it may be useful to
   * remove a bucket with a single non-empty list, but that would make the
   * code more complicated. */
  int empty = 0;
#ifdef __KERNEL__
  int i = 0;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__);
    return 0;
  }
#endif

  LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__,
                NI_ID(ni), level, TSR_ARGS(tsr)));
  ci_assert( OO_P_NOT_NULL(bucket->bucket[idx]) );
#ifdef __KERNEL__
  if( OO_P_IS_NULL(bucket->bucket[idx]) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                            __FUNCTION__);
    return 0;
  }
#endif

  level++;
  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);
  if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) {
    empty = ci_tcp_listenq_bucket_remove(ni, tls, &aux->u.bucket, tsr, level);
    if( empty ) {
      bucket->bucket[idx] = OO_P_NULL;
      ci_ni_aux_free(ni, aux);
      tls->n_buckets--;
    }
  }
  else {
    if( bucket->bucket[idx] == tsr_p ) {
      bucket->bucket[idx] = tsr->bucket_link;
      empty = OO_P_IS_NULL(bucket->bucket[idx]);
    }
    else {
      ci_tcp_state_synrecv* prev = &aux->u.synrecv;
      while( prev->bucket_link != tsr_p ) {
        aux = ci_ni_aux_p2aux(ni, prev->bucket_link);
        prev = &aux->u.synrecv;
#ifdef __KERNEL__
        if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
          ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                                  __FUNCTION__);
          return 0;
        }
#endif
      }
      prev->bucket_link = tsr->bucket_link;
    }
  }

  if( empty ) {
    int i;
    for( i = 0; i < CI_TCP_LISTEN_BUCKET_SIZE; i++ )
      if( OO_P_NOT_NULL(bucket->bucket[i]) )
        return 0;
    return 1;
  }
  return 0;
}
Example #10
/* c_ni is assumed to be locked on entrance and is always unlocked on
 * exit. */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create shadow listening socket tls (copy l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;

  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));

  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do proper
   * shutdown.
   */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: Check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 != TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }

  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);

  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid);
  /* We cannot guarantee the c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  Instead we return an error; UL will hand the
   * socket over and close the ts endpoint. */
  return -EBUSY;
}
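
The port-keeper block above takes an extra reference on the listener's OS socket and swaps it into the accepted endpoint, dropping whichever reference the exchange displaced. The shape of that handover, reduced to hypothetical helpers standing in for oo_file_ref_add()/oo_file_ref_xchg()/oo_file_ref_drop() (a sketch, not the Onload API):

struct file_ref;
extern struct file_ref *ref_get(struct file_ref *r);        /* +1 reference */
extern struct file_ref *ref_xchg(struct file_ref **slot,
                                 struct file_ref *newref);  /* atomic swap  */
extern void ref_put(struct file_ref *r);                    /* -1 reference */

static void adopt_port_keeper(struct file_ref **keeper_slot,
                              struct file_ref *src)
{
  struct file_ref *old;

  if( src == NULL )
    return;                       /* nothing to keep alive */
  old = ref_xchg(keeper_slot, ref_get(src));
  if( old != NULL )
    ref_put(old);                 /* drop the reference we displaced */
}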