Example #1
void citp_waitable_obj_free(ci_netif* ni, citp_waitable* w)
{
  ci_assert(ci_netif_is_locked(ni));

#ifdef __KERNEL__
  {
    /* Avoid racing with tcp_helper_do_non_atomic(). */
    tcp_helper_endpoint_t* ep = ci_netif_get_valid_ep(ni, w->bufid);
    unsigned ep_aflags;
  again:
    if( (ep_aflags = ep->ep_aflags) & OO_THR_EP_AFLAG_NON_ATOMIC ) {
      ci_assert(!(ep_aflags & OO_THR_EP_AFLAG_NEED_FREE));
      if( ci_cas32_fail(&ep->ep_aflags, ep_aflags,
                        ep_aflags | OO_THR_EP_AFLAG_NEED_FREE) )
        goto again;
      return;
    }
    ci_rmb();
  }
#endif

  __citp_waitable_obj_free(ni, w);
  w->wt_next = ni->state->free_eps_head;
  ni->state->free_eps_head = W_SP(w);
  /* Must be last, as may result in stack going away. */
  ci_drop_orphan(ni);
}
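
The kernel-only branch above is a classic read / compare-and-swap / retry loop: re-read the flags and retry the CAS until either the non-atomic worker has finished or the NEED_FREE request has been recorded atomically. Below is a minimal stand-alone sketch of that pattern, assuming C11 atomics; FLAG_BUSY, FLAG_NEED_FREE and try_defer_free() are illustrative names, not Onload's.

#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_BUSY       0x1u   /* stands in for OO_THR_EP_AFLAG_NON_ATOMIC */
#define FLAG_NEED_FREE  0x2u   /* stands in for OO_THR_EP_AFLAG_NEED_FREE */

/* Returns true if the free was deferred to the thread that owns FLAG_BUSY,
 * false if the caller may free the object itself. */
static bool try_defer_free(_Atomic unsigned* flags)
{
  unsigned old = atomic_load(flags);
  while( old & FLAG_BUSY ) {
    /* The worker is still running: ask it to do the free when it finishes.
     * On failure 'old' is refreshed and we retry, like "goto again" above. */
    if( atomic_compare_exchange_weak(flags, &old, old | FLAG_NEED_FREE) )
      return true;
  }
  return false;
}

The weak CAS may fail spuriously; that is harmless here because the loop simply re-runs with the refreshed flag value, exactly as the goto-based retry does in the function above.
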
Example #2
int __oo_cp_route_resolve(struct oo_cplane_handle* cp,
                          cicp_verinfo_t* verinfo,
                          struct cp_fwd_key* key,
                          int/*bool*/ ask_server,
                          struct cp_fwd_data* data)
{
  struct cp_mibs* mib = &cp->mib[0];
  cp_version_t ver, old_ver;
  cicp_mac_rowid_t id;
  struct cp_fwd_row* fwd;

 find_again:
  id = cp_fwd_find_match(mib, key);
  if( id == CICP_MAC_ROWID_BAD ||
      ~(fwd = cp_get_fwd_by_id(mib, id))->flags & CICP_FWD_FLAG_DATA_VALID ||
      ! cp_fwd_find_row_found_perfect_match(mib, id, key) ) {
    if( ! ask_server )
      return -ENOENT;
    oo_op_route_resolve(cp, key);
    ask_server = CI_FALSE;
    goto find_again;
  }

  ver = OO_ACCESS_ONCE(*cp_fwd_version(fwd));
  do {
    if( ~ fwd->flags & CICP_FWD_FLAG_DATA_VALID ||
        ! cp_fwd_key_match(fwd, key) )
        goto find_again;
    ci_rmb();
    *data = *cp_get_fwd_data_current(fwd);

    /* We may accidentally bump the TTL of a wrong row - we do not care. */
    if( fwd->flags & CICP_FWD_FLAG_STALE )
      mib->fwd_rw[id].frc_used = ci_frc64_get();
    old_ver = ver;
    ci_rmb();
  } while( old_ver != (ver = OO_ACCESS_ONCE(*cp_fwd_version(fwd))) );

  verinfo->id = id;
  verinfo->version = ver;

  /* The cplane server will refresh ARP when it reads fwd_rw[id], but that
   * may happen only after some delay.  Ask for the ARP immediately. */
  if( ask_server && ! data->arp_valid )
    oo_cp_arp_resolve(cp, verinfo);
  return 0;
}
Example #3
void ci_synchronise_clock(ci_netif *ni, struct oo_timesync* oo_ts_local)
{
  ci_uint32 gc;
  struct oo_timesync *oo_ts;

  oo_ts = oo_timesync_state(CICP_HANDLE(ni));

  /* Check if our current datapoint for clock_gettime is up to date,
   * and take another if not
   */
  if( oo_ts_local->generation_count != oo_ts->generation_count ) {
    do {
      gc = oo_ts->generation_count;
      ci_rmb();
      oo_ts_local->smoothed_ticks = oo_ts->smoothed_ticks;
      oo_ts_local->smoothed_ns = oo_ts->smoothed_ns;
      oo_ts_local->clock.tv_sec = oo_ts->clock.tv_sec;
      oo_ts_local->clock.tv_nsec = oo_ts->clock.tv_nsec;
      oo_ts_local->clock_made = oo_ts->clock_made;
      ci_rmb();
    } while (gc & 1 || gc != oo_ts->generation_count);
    oo_ts_local->generation_count = gc;
  }
}
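
Examples #2 and #3 are both lock-free readers of data that another thread updates in place: sample a version/generation counter, copy the fields out with read barriers around the copy, then retry if the counter was odd (writer in progress) or has changed. This is the seqlock idiom. A minimal sketch of both sides, assuming C11 atomics and fences in place of Onload's ci_rmb()/ci_wmb() macros (the struct and field names are illustrative):

#include <stdatomic.h>

struct snapshot {
  _Atomic unsigned gen;      /* odd while the writer is mid-update */
  _Atomic long a, b;         /* the published data (relaxed accesses) */
};

static void writer_publish(struct snapshot* s, long a, long b)
{
  unsigned g = atomic_load_explicit(&s->gen, memory_order_relaxed);
  atomic_store_explicit(&s->gen, g + 1, memory_order_relaxed);  /* odd */
  atomic_thread_fence(memory_order_release);                    /* ~ci_wmb() */
  atomic_store_explicit(&s->a, a, memory_order_relaxed);
  atomic_store_explicit(&s->b, b, memory_order_relaxed);
  atomic_thread_fence(memory_order_release);                    /* ~ci_wmb() */
  atomic_store_explicit(&s->gen, g + 2, memory_order_relaxed);  /* even */
}

static void reader_snapshot(struct snapshot* s, long* a, long* b)
{
  unsigned g;
  do {
    g = atomic_load_explicit(&s->gen, memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);                  /* ~ci_rmb() */
    *a = atomic_load_explicit(&s->a, memory_order_relaxed);
    *b = atomic_load_explicit(&s->b, memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);                  /* ~ci_rmb() */
  } while( (g & 1) ||
           g != atomic_load_explicit(&s->gen, memory_order_relaxed) );
}

Example #3 additionally rejects odd counter values, which implies its writer bumps the count to odd before updating and to even afterwards; Example #2 uses the same re-check-and-retry structure against *cp_fwd_version(fwd) without the odd/even convention.
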
Example #4
void ci_tcp_linger(ci_netif* ni, ci_tcp_state* ts)
{
  /* This is called at user-level when a socket is closed if linger is
  ** enabled and has a timeout, and there is TX data outstanding.
  **
  ** Our job is to block until all data is successfully sent and acked, or
  ** until timeout.
  */
  ci_uint64 sleep_seq;
  int rc = 0;
  ci_uint32 timeout = ts->s.so.linger * 1000;

  LOG_TC(log("%s: "NTS_FMT, __FUNCTION__, NTS_PRI_ARGS(ni, ts)));

  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER);
  ci_assert(ts->s.s_flags & CI_SOCK_FLAG_LINGER);
  ci_assert(ts->s.b.state != CI_TCP_LISTEN);

  while( 1 ) {
    sleep_seq = ts->s.b.sleep_seq.all;
    ci_rmb();
    if( SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) )
      return;
    rc = ci_sock_sleep(ni, &ts->s.b, CI_SB_FLAG_WAKE_TX, 0, sleep_seq,
                       &timeout);
    if( rc )
      break;
  }

  if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) ) {
    ci_netif_lock(ni);
    /* Check that we are still working with the same socket, and that it was
     * not closed and dropped under our feet. */
    if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) &&
        (ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER) )
      ci_tcp_drop(ni, ts, 0);
    ci_netif_unlock(ni);
  }
}
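
ci_tcp_linger() avoids the lost-wakeup race by sampling ts->s.b.sleep_seq.all before checking the send queue and handing that sample to ci_sock_sleep(), so the sleep is abandoned if a TX wakeup bumped the sequence between the check and the thread actually going to sleep. The same idiom in its simplest widely known form is the Linux futex protocol; the sketch below is illustrative only (Linux, C11 atomics) and is not Onload's ci_sock_sleep() implementation.

#include <stdatomic.h>
#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic unsigned wake_seq;     /* bumped by the waker on every wakeup */

static void wait_until(int (*condition_met)(void))
{
  for( ; ; ) {
    /* Sample the sequence *before* checking the condition (cf. sleep_seq). */
    unsigned seq = atomic_load(&wake_seq);
    atomic_thread_fence(memory_order_acquire);           /* ~ci_rmb() */
    if( condition_met() )
      return;
    /* FUTEX_WAIT returns immediately if wake_seq no longer equals 'seq',
     * i.e. if a wakeup raced with the condition check above. */
    syscall(SYS_futex, &wake_seq, FUTEX_WAIT, seq, NULL, NULL, 0);
  }
}

static void wake_waiters(void)
{
  /* Callers must make the condition true before bumping the sequence. */
  atomic_fetch_add(&wake_seq, 1);
  syscall(SYS_futex, &wake_seq, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}
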
Example #5
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )  citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: Concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);
    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }
    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);
    {
      int i = 0;
      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth != pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }
    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  { /* We're duping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
Example #6
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a, 
                                              ci_msghdr* msg,
                                              ci_iovec_ptr *piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);
  /* In the kernel, recv() with flags is not called, only read(); so flags
   * may only contain MSG_DONTWAIT. */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();
      
      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
               sizeof(ci_ip4_hdr) - sizeof(ci_udp_hdr);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);
      memset(ts, 0, sizeof(ts));

      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);
      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
  errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif
  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV  
  if( ! udp_lport_be16(us)) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do. */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }
  return 0;
}
Example #7
static int ci_udp_recvmsg_get(ci_netif* ni, ci_udp_state* us,
                              ci_iovec_ptr* piov, 
                              ci_msghdr* msg, int flags)
{
  ci_ip_pkt_fmt* pkt;
  int rc;

  /* NB. [msg] can be NULL for async recv. */

  if( ci_udp_recv_q_not_readable(ni, us) )
    goto recv_q_is_empty;

  ci_rmb();

  pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  if( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED ) {
    /* We know that the receive queue is not empty and, if a filter is
     * involved, that some packets have passed the filter; so if this
     * pkt has already been consumed, the next one must be OK to
     * receive (and must already have been filtered).
     */
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
    ci_assert( !(pkt->pf.udp.rx_flags & 
                 CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) );
#if CI_CFG_ZC_RECV_FILTER
    if( us->recv_q_filter )
      /* Filter should have run on this packet and marked it */
      ci_assert( (pkt->pf.udp.rx_flags & 
                  (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED | 
                   CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) ); 
    else
      /* Bump this along as we don't have a filter installed, but want
       * to keep the filter pointer in a sane place
       */
      us->recv_q.filter = us->recv_q.extract;
#endif
  }

#if CI_CFG_ZC_RECV_FILTER
  /* Skip any that the filter has dropped.  This must terminate before
   * hitting the tail because we know the queue is readable.
   */
  while( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED ) {
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  }
#endif

#if defined(__linux__) && !defined(__KERNEL__)
  if( msg != NULL && msg->msg_controllen != 0 ) {
    if( CI_UNLIKELY(us->s.cmsg_flags != 0 ) )
      ci_ip_cmsg_recv(ni, us, pkt, msg, 0);
    else
      msg->msg_controllen = 0;
  }
#endif
  us->stamp = pkt->pf.udp.rx_stamp;

  rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);

  if(CI_LIKELY( rc >= 0 )) {
#if HAVE_MSG_FLAGS
    if(CI_UNLIKELY( rc < pkt->pf.udp.pay_len && msg != NULL ))
      msg->msg_flags |= LOCAL_MSG_TRUNC;
#endif
    ci_udp_recvmsg_fill_msghdr(ni, msg, pkt, &us->s);
    if( ! (flags & MSG_PEEK) ) {
      us->recv_q.bytes_delivered += pkt->pf.udp.pay_len;
      us->recv_q.pkts_delivered  += 1;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED;
#if CI_CFG_ZC_RECV_FILTER
      if( !us->recv_q_filter ) {
        /* Pretend this packet passed the filter, to keep state consistent */
        ++us->recv_q.pkts_filter_passed;
        us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
        pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      }
#endif
    }
    us->udpflags |= CI_UDPF_LAST_RECV_ON;
  }

  return rc;

 recv_q_is_empty:
  return -EAGAIN;
}
Example #8
int ci_udp_filter_recved_pkts(ci_netif* ni, ci_udp_state* us)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  ci_ip_pkt_fmt* pkt;
  unsigned cb_flags;
  int dropped_bytes;

  ci_assert(ci_sock_is_locked(ni, &us->s.b));

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr.msg_controllen = 0;
  zc_msg.msghdr.msg_flags = 0;

  while( us->recv_q.pkts_added != 
         us->recv_q.pkts_filter_passed + us->recv_q.pkts_filter_dropped ) {
    ci_rmb();
    pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
    if( pkt->pf.udp.rx_flags & 
        (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
         CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) ) {
      /* We know this can't go past tail because of the while loop condition */
      us->recv_q.filter = pkt->next;
      pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
      ci_assert( !(pkt->pf.udp.rx_flags & 
                   (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                    CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    }

    ci_udp_pkt_to_zc_msg(ni, pkt, &zc_msg);

    cb_flags = CI_IP_IS_MULTICAST(oo_ip_hdr(pkt)->ip_daddr_be32) ? 
      ONLOAD_ZC_MSG_SHARED : 0;
    rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
      (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

    ci_assert(!(rc & ONLOAD_ZC_KEEP));

    if( rc & ONLOAD_ZC_TERMINATE ) {
      us->recv_q.bytes_filter_dropped += pkt->pf.udp.pay_len;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED;
      ++us->recv_q.pkts_filter_dropped;
    }
    else {
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      ++us->recv_q.pkts_filter_passed;
      if( rc & ONLOAD_ZC_MODIFIED ) {
        ci_assert(!(cb_flags & ONLOAD_ZC_MSG_SHARED));
        dropped_bytes = ci_zc_msg_to_udp_pkt(ni, &zc_msg, pkt);
        ci_assert_gt(dropped_bytes, 0);
        ci_assert_lt(dropped_bytes, pkt->pf.udp.pay_len);
        pkt->pf.udp.pay_len -= dropped_bytes;
        us->recv_q.bytes_filter_dropped += dropped_bytes;
      }
      us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
      return 1;
    }
  }

  return us->recv_q.pkts_filter_passed != us->recv_q.pkts_delivered;
}
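
Examples #6, #7 and #8 all consume from the same kind of receive queue: the producer fills in a packet and only then advances the queue's "added" accounting behind a write barrier, while the consumer first checks that the queue is readable and only then, after ci_rmb(), dereferences the packet and advances its own extract pointer. The essential ordering, stripped down to a single-producer/single-consumer ring using C11 acquire/release in place of the explicit barriers (the ring layout and names are illustrative, not Onload's packet queue):

#include <stdatomic.h>
#include <stdbool.h>

#define RING_SIZE 64u                  /* illustrative fixed size */

struct spsc_ring {
  int payload[RING_SIZE];
  _Atomic unsigned added;              /* written by the producer only */
  unsigned extracted;                  /* used by the consumer only */
};

static void produce(struct spsc_ring* r, int value)
{
  unsigned i = atomic_load_explicit(&r->added, memory_order_relaxed);
  /* (A full-queue check against a consumer-published count is omitted.) */
  r->payload[i % RING_SIZE] = value;               /* fill the slot first... */
  atomic_store_explicit(&r->added, i + 1,
                        memory_order_release);     /* ...then publish it */
}

static bool consume(struct spsc_ring* r, int* value_out)
{
  /* The acquire load of 'added' plays the role of ci_rmb() above: payload
   * written before the matching release store is now visible. */
  unsigned added = atomic_load_explicit(&r->added, memory_order_acquire);
  if( added == r->extracted )
    return false;                                  /* queue is empty */
  *value_out = r->payload[r->extracted % RING_SIZE];
  r->extracted++;
  return true;
}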