static struct sk_buff *xlgmac_create_skb(struct xlgmac_pdata *pdata,
                                         struct napi_struct *napi,
                                         struct xlgmac_desc_data *desc_data,
                                         unsigned int len)
{
    unsigned int copy_len;
    struct sk_buff *skb;
    u8 *packet;

    skb = napi_alloc_skb(napi, desc_data->rx.hdr.dma_len);
    if (!skb)
        return NULL;

    /* Start with the header buffer which may contain just the header
     * or the header plus data
     */
    dma_sync_single_range_for_cpu(pdata->dev, desc_data->rx.hdr.dma_base,
                                  desc_data->rx.hdr.dma_off,
                                  desc_data->rx.hdr.dma_len,
                                  DMA_FROM_DEVICE);

    packet = page_address(desc_data->rx.hdr.pa.pages) +
             desc_data->rx.hdr.pa.pages_offset;
    copy_len = (desc_data->rx.hdr_len) ? desc_data->rx.hdr_len : len;
    copy_len = min(desc_data->rx.hdr.dma_len, copy_len);
    skb_copy_to_linear_data(skb, packet, copy_len);
    skb_put(skb, copy_len);

    len -= copy_len;
    if (len) {
        /* Add the remaining data as a frag */
        dma_sync_single_range_for_cpu(pdata->dev,
                                      desc_data->rx.buf.dma_base,
                                      desc_data->rx.buf.dma_off,
                                      desc_data->rx.buf.dma_len,
                                      DMA_FROM_DEVICE);

        skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
                        desc_data->rx.buf.pa.pages,
                        desc_data->rx.buf.pa.pages_offset,
                        len, desc_data->rx.buf.dma_len);
        desc_data->rx.buf.pa.pages = NULL;
    }

    return skb;
}
struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
                               struct mlx4_en_rx_desc *rx_desc,
                               struct skb_frag_struct *skb_frags,
                               struct mlx4_en_rx_alloc *page_alloc,
                               unsigned int length)
{
    struct mlx4_en_dev *mdev = priv->mdev;
    struct sk_buff *skb;
    void *va;
    int used_frags;
    dma_addr_t dma;

    skb = dev_alloc_skb(SMALL_PACKET_SIZE + NET_IP_ALIGN);
    if (!skb) {
        mlx4_dbg(RX_ERR, priv, "Failed allocating skb\n");
        return NULL;
    }
    skb->dev = priv->dev;
    skb_reserve(skb, NET_IP_ALIGN);
    skb->len = length;
    skb->truesize = length + sizeof(struct sk_buff);

    /* Get pointer to first fragment so we could copy the headers into the
     * (linear part of the) skb
     */
    va = page_address(skb_frags[0].page) + skb_frags[0].page_offset;

    if (length <= SMALL_PACKET_SIZE) {
        /* We are copying all relevant data to the skb - temporarily
         * synch buffers for the copy
         */
        dma = be64_to_cpu(rx_desc->data[0].addr);
        dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0,
                                      length, DMA_FROM_DEVICE);
        skb_copy_to_linear_data(skb, va, length);
        dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0,
                                         length, DMA_FROM_DEVICE);
        skb->tail += length;
    } else {
        /* Move relevant fragments to skb */
        used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags,
                                              skb_shinfo(skb)->frags,
                                              page_alloc, length);
        skb_shinfo(skb)->nr_frags = used_frags;

        /* Copy headers into the skb linear buffer */
        memcpy(skb->data, va, HEADER_COPY_SIZE);
        skb->tail += HEADER_COPY_SIZE;

        /* Skip headers in first fragment */
        skb_shinfo(skb)->frags[0].page_offset += HEADER_COPY_SIZE;

        /* Adjust size of first fragment */
        skb_shinfo(skb)->frags[0].size -= HEADER_COPY_SIZE;
        skb->data_len = length - HEADER_COPY_SIZE;
    }
    return skb;
}
static struct ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
                                                      unsigned int size)
{
    struct ixgbe_rx_buffer *bi;

    bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];

    /* we are reusing so sync this buffer for CPU use */
    dma_sync_single_range_for_cpu(rx_ring->dev, bi->dma, 0, size,
                                  DMA_BIDIRECTIONAL);

    return bi;
}
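/* The three receive helpers above all rely on the same streaming-DMA
 * ownership rule: a buffer that stays mapped is synced to the CPU before the
 * driver reads it, and synced back to the device before the hardware may
 * reuse it (as mlx4_en_rx_skb does around its header copy). The minimal
 * sketch below illustrates that pairing only; 'dev', 'dma', 'buf', 'buf_len'
 * and inspect_headers() are illustrative placeholders, not identifiers from
 * the drivers above.
 */
#include <linux/device.h>
#include <linux/dma-mapping.h>

/* Illustrative stand-in for whatever per-packet work a driver does. */
static void inspect_headers(const void *buf, unsigned int len)
{
}

static void rx_peek_and_recycle(struct device *dev, dma_addr_t dma,
                                void *buf, unsigned int buf_len)
{
    /* Give the CPU a coherent view of what the NIC wrote. */
    dma_sync_single_range_for_cpu(dev, dma, 0, buf_len, DMA_FROM_DEVICE);

    inspect_headers(buf, buf_len);

    /* Hand ownership back to the device before the descriptor is reposted. */
    dma_sync_single_range_for_device(dev, dma, 0, buf_len, DMA_FROM_DEVICE);
}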
static int xlgmac_rx_poll(struct xlgmac_channel *channel, int budget)
{
    struct xlgmac_pdata *pdata = channel->pdata;
    struct xlgmac_ring *ring = channel->rx_ring;
    struct net_device *netdev = pdata->netdev;
    unsigned int len, dma_desc_len, max_len;
    unsigned int context_next, context;
    struct xlgmac_desc_data *desc_data;
    struct xlgmac_pkt_info *pkt_info;
    unsigned int incomplete, error;
    struct xlgmac_hw_ops *hw_ops;
    unsigned int received = 0;
    struct napi_struct *napi;
    struct sk_buff *skb;
    int packet_count = 0;

    hw_ops = &pdata->hw_ops;

    /* Nothing to do if there isn't a Rx ring for this channel */
    if (!ring)
        return 0;

    incomplete = 0;
    context_next = 0;

    napi = (pdata->per_channel_irq) ? &channel->napi : &pdata->napi;

    desc_data = XLGMAC_GET_DESC_DATA(ring, ring->cur);
    pkt_info = &ring->pkt_info;

    while (packet_count < budget) {
        /* First time in loop see if we need to restore state */
        if (!received && desc_data->state_saved) {
            skb = desc_data->state.skb;
            error = desc_data->state.error;
            len = desc_data->state.len;
        } else {
            memset(pkt_info, 0, sizeof(*pkt_info));
            skb = NULL;
            error = 0;
            len = 0;
        }

read_again:
        desc_data = XLGMAC_GET_DESC_DATA(ring, ring->cur);

        if (xlgmac_rx_dirty_desc(ring) > XLGMAC_RX_DESC_MAX_DIRTY)
            xlgmac_rx_refresh(channel);

        if (hw_ops->dev_read(channel))
            break;

        received++;
        ring->cur++;

        incomplete = XLGMAC_GET_REG_BITS(
                        pkt_info->attributes,
                        RX_PACKET_ATTRIBUTES_INCOMPLETE_POS,
                        RX_PACKET_ATTRIBUTES_INCOMPLETE_LEN);
        context_next = XLGMAC_GET_REG_BITS(
                        pkt_info->attributes,
                        RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_POS,
                        RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_LEN);
        context = XLGMAC_GET_REG_BITS(
                        pkt_info->attributes,
                        RX_PACKET_ATTRIBUTES_CONTEXT_POS,
                        RX_PACKET_ATTRIBUTES_CONTEXT_LEN);

        /* Earlier error, just drain the remaining data */
        if ((incomplete || context_next) && error)
            goto read_again;

        if (error || pkt_info->errors) {
            if (pkt_info->errors)
                netif_err(pdata, rx_err, netdev,
                          "error in received packet\n");
            dev_kfree_skb(skb);
            goto next_packet;
        }

        if (!context) {
            /* Length is cumulative, get this descriptor's length */
            dma_desc_len = desc_data->rx.len - len;
            len += dma_desc_len;

            if (dma_desc_len && !skb) {
                skb = xlgmac_create_skb(pdata, napi, desc_data,
                                        dma_desc_len);
                if (!skb)
                    error = 1;
            } else if (dma_desc_len) {
                dma_sync_single_range_for_cpu(
                        pdata->dev,
                        desc_data->rx.buf.dma_base,
                        desc_data->rx.buf.dma_off,
                        desc_data->rx.buf.dma_len,
                        DMA_FROM_DEVICE);

                skb_add_rx_frag(
                        skb, skb_shinfo(skb)->nr_frags,
                        desc_data->rx.buf.pa.pages,
                        desc_data->rx.buf.pa.pages_offset,
                        dma_desc_len,
                        desc_data->rx.buf.dma_len);
                desc_data->rx.buf.pa.pages = NULL;
            }
        }

        if (incomplete || context_next)
            goto read_again;

        if (!skb)
            goto next_packet;

        /* Be sure we don't exceed the configured MTU */
        max_len = netdev->mtu + ETH_HLEN;
        if (!(netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
            (skb->protocol == htons(ETH_P_8021Q)))
            max_len += VLAN_HLEN;

        if (skb->len > max_len) {
            netif_err(pdata, rx_err, netdev,
                      "packet length exceeds configured MTU\n");
            dev_kfree_skb(skb);
            goto next_packet;
        }

        if (netif_msg_pktdata(pdata))
            xlgmac_print_pkt(netdev, skb, false);

        skb_checksum_none_assert(skb);
        if (XLGMAC_GET_REG_BITS(pkt_info->attributes,
                                RX_PACKET_ATTRIBUTES_CSUM_DONE_POS,
                                RX_PACKET_ATTRIBUTES_CSUM_DONE_LEN))
            skb->ip_summed = CHECKSUM_UNNECESSARY;

        if (XLGMAC_GET_REG_BITS(pkt_info->attributes,
                                RX_PACKET_ATTRIBUTES_VLAN_CTAG_POS,
                                RX_PACKET_ATTRIBUTES_VLAN_CTAG_LEN)) {
            __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
                                   pkt_info->vlan_ctag);
            pdata->stats.rx_vlan_packets++;
        }

        if (XLGMAC_GET_REG_BITS(pkt_info->attributes,
                                RX_PACKET_ATTRIBUTES_RSS_HASH_POS,
                                RX_PACKET_ATTRIBUTES_RSS_HASH_LEN))
            skb_set_hash(skb, pkt_info->rss_hash,
                         pkt_info->rss_hash_type);

        skb->dev = netdev;
        skb->protocol = eth_type_trans(skb, netdev);
        skb_record_rx_queue(skb, channel->queue_index);

        napi_gro_receive(napi, skb);

next_packet:
        packet_count++;
    }

    /* Check if we need to save state before leaving */
    if (received && (incomplete || context_next)) {
        desc_data = XLGMAC_GET_DESC_DATA(ring, ring->cur);
        desc_data->state_saved = 1;
        desc_data->state.skb = skb;
        desc_data->state.len = len;
        desc_data->state.error = error;
    }

    XLGMAC_PR("packet_count = %d\n", packet_count);

    return packet_count;
}
static long device_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
{
    unsigned long n_cpy = 0;
    int ret = 0;
    int err = 0;
    //unsigned ind=0;
    int count = 0;
    kernel_args_hdr temp_k_hdr;
    gpu_buf_t *tmp_gpu_bufs = NULL;
    nvidia_p2p_page_table_t **tmp_pg_table = NULL;
    unsigned i = 0, j = 0;
    const char *rx_or_tx;
    const char *rx = "RX";
    const char *tx = "TX";
    int num_gpu_pages = 0;
    int num_gpu_buf = 0;
    int num_gpu_buf_per_page = 0;
    // GNOM New
    int num_gpu_bins = 0;
    struct page *tmp_page = NULL;
    void *tmp_page_va = NULL;
    dma_addr_t tmp_dma = 0;

#ifdef DO_GNOM_TX
    // [0] Index of TX buffer to send
    // [1] Number of requests to send
    // [2] Sanity check to make sure batch_ind is correct
    int tx_batch_info[3];
    int gnom_km_batch_ind = -1;
#endif

    unsigned long long average_time = 0;
    uint64_t last_dma = 0;
    uint64_t dma_diff = 64*1024;
    uint64_t cuda_addr;
    uint64_t cuda_page_size = 64*1024;
    void *h_ptr_from_cuda_phys = NULL;

    switch(cmd){
    case GPU_REG_SINGLE_BUFFER_CMD:
        // NOTE: This signal is not in use.
        // This signal was used to test a single huge buffer allocation to use GPUDirect
        // pinning instead of many single pages. This resulted in being able to allocate
        // significantly more pinned memory than when pinning multiple smaller pages.
        printk("[GNoM_km]\n\nNOTE: The GPU_REG_SINGLE_BUFFER_CMD signal is a test signal not used for GNoM/MemcachedGPU\n\n");

        // Copy arg header from user
        n_cpy = copy_from_user((void *)&temp_k_hdr, (void __user *)arg, sizeof(kernel_args_hdr));
        if(n_cpy > 0)
            goto hdr_cpy_err;

        printk("[GNoM_km] Copy header success...\n");

        m_kernel_args.num_pages = temp_k_hdr.num_pages;
        m_kernel_args.num_buffers = temp_k_hdr.num_buffers;
        m_kernel_args.buffer_type = temp_k_hdr.buffer_type; // RX or TX

        m_kernel_args.buf_meta_data.gpu_args = (kernel_args *)kmalloc(sizeof(kernel_args), GFP_KERNEL);
        if(!m_kernel_args.buf_meta_data.gpu_args)
            goto malloc_err;

        // Copy over all of the information about GPU buffers from user
        n_cpy = copy_from_user((void *)m_kernel_args.buf_meta_data.gpu_args,
                               (void __user *)temp_k_hdr.buf_meta_data.gpu_args,
                               sizeof(kernel_args));
        if(n_cpy > 0)
            goto buffer_cpy_err;

        printk("[GNoM_km] Copy all CUDA buffer metadata success...\n");

        // Pin and map each buffer
        tmp_pg_table = (nvidia_p2p_page_table_t **)kmalloc(sizeof(nvidia_p2p_page_table_t *), GFP_KERNEL);

        printk("[GNoM_km]: Pinning GPU buffer (%llu MB, %llu B): %p, p2pT: %llu, vaT: %u, page_table: %p\n",
               m_kernel_args.buf_meta_data.gpu_args->m_size/(1024*1024),
               m_kernel_args.buf_meta_data.gpu_args->m_size,
               (void *)m_kernel_args.buf_meta_data.gpu_args->m_addr,
               m_kernel_args.buf_meta_data.gpu_args->m_tokens.p2pToken,
               m_kernel_args.buf_meta_data.gpu_args->m_tokens.vaSpaceToken,
               tmp_pg_table);

        ret = nvidia_p2p_get_pages(m_kernel_args.buf_meta_data.gpu_args->m_tokens.p2pToken,
                                   m_kernel_args.buf_meta_data.gpu_args->m_tokens.vaSpaceToken,
                                   m_kernel_args.buf_meta_data.gpu_args->m_addr,
                                   m_kernel_args.buf_meta_data.gpu_args->m_size,
                                   tmp_pg_table,
                                   free_callback,
                                   tmp_pg_table);

        if(ret || (tmp_pg_table[0]->entries <= 0)){
            printk("[GNoM_km]: ERROR pinning pages :(\n");
        }else{
            printk("[GNoM_km]: SUCCESSFULLY PINNED PAGES!! CHECK NVIDIA-SMI BAR USAGE!: # of entries: %u \n",
                   tmp_pg_table[0]->entries);
        }

        last_dma = tmp_pg_table[0]->pages[0]->physical_address;
        for(i=1; i<tmp_pg_table[0]->entries; ++i){
            if((tmp_pg_table[0]->pages[i]->physical_address - last_dma) != dma_diff){
                printk("[GNoM_km]: ERROR DMA ADDRESS NOT CONTIGUOUS :( :( \n");
                break;
            }
            last_dma = tmp_pg_table[0]->pages[i]->physical_address;
        }

        if(m_kernel_args.buf_meta_data.gpu_args)
            kfree(m_kernel_args.buf_meta_data.gpu_args);

        break;

    case GPU_REG_MULT_BUFFER_CMD:
        // User CUDA application registers multiple GPU buffers to NIC driver using GPUDirect.
        // This signal is called with a single large buffer allocation, which is split into multiple smaller
        // buffers internally. This enables the maximum amount of pinned GPUDirect memory. Previously tried to
        // pin individual 2KB buffers with GPUDirect, but that resulted in significantly less pinnable memory.

        // Copy arg header from user
        n_cpy = copy_from_user((void *)&temp_k_hdr, (void __user *)arg, sizeof(kernel_args_hdr));
        if(n_cpy > 0)
            goto hdr_cpy_err;

        printk("[GNoM_km] Copy header success...\n");

        m_kernel_args.num_pages = temp_k_hdr.num_pages;
        m_kernel_args.num_buffers = temp_k_hdr.num_buffers;
        m_kernel_args.buffer_type = temp_k_hdr.buffer_type; // RX or TX

        m_kernel_args.buf_meta_data.gpu_args = (kernel_args *)kmalloc(sizeof(kernel_args), GFP_KERNEL);
        if(!m_kernel_args.buf_meta_data.gpu_args)
            goto malloc_err;

        // Copy over all of the information about GPU buffers from user
        n_cpy = copy_from_user((void *)m_kernel_args.buf_meta_data.gpu_args,
                               (void __user *)temp_k_hdr.buf_meta_data.gpu_args,
                               sizeof(kernel_args));
        if(n_cpy > 0)
            goto buffer_cpy_err;

        printk("[GNoM_km] Copy all CUDA buffer metadata success...\n");

        /* Calculate the number of pages, buffers, and buffers per page */
        num_gpu_pages = m_kernel_args.num_pages;
        num_gpu_buf = m_kernel_args.num_buffers;
        num_gpu_buf_per_page = num_gpu_buf / num_gpu_pages;

        if(num_gpu_buf_per_page*num_gpu_pages != num_gpu_buf)
            goto buffer_cpy_err;

        tmp_gpu_bufs = vmalloc(num_gpu_buf*sizeof(gpu_buf_t)); // Use vmalloc for large # of buffers
        if(!tmp_gpu_bufs)
            goto malloc_err;

        printk("[GNoM_km] Pinning and registering %d pages...\n", num_gpu_pages);

        tmp_pg_table = (nvidia_p2p_page_table_t **)kmalloc(sizeof(nvidia_p2p_page_table_t *), GFP_KERNEL);
        if(!tmp_pg_table)
            goto malloc_err;

        printk("[GNoM_km]: Pinning GPU buffer (%llu MB, %llu B): %p, p2pT: %llu, vaT: %u, page_table_ptr: %p\n",
               m_kernel_args.buf_meta_data.gpu_args->m_size/(1024*1024),
               m_kernel_args.buf_meta_data.gpu_args->m_size,
               (void *)m_kernel_args.buf_meta_data.gpu_args->m_addr,
               m_kernel_args.buf_meta_data.gpu_args->m_tokens.p2pToken,
               m_kernel_args.buf_meta_data.gpu_args->m_tokens.vaSpaceToken,
               tmp_pg_table);

        // Only do one large buffer map. nvidia_pg_table contains all pages in a single call
        ret = nvidia_p2p_get_pages(m_kernel_args.buf_meta_data.gpu_args->m_tokens.p2pToken,
                                   m_kernel_args.buf_meta_data.gpu_args->m_tokens.vaSpaceToken,
                                   m_kernel_args.buf_meta_data.gpu_args->m_addr,
                                   m_kernel_args.buf_meta_data.gpu_args->m_size,
                                   tmp_pg_table,
                                   free_callback,
                                   tmp_pg_table);

        if(ret || (tmp_pg_table[0]->entries <= 0) || (num_gpu_pages != tmp_pg_table[0]->entries)){
            printk("[GNoM_km]: ERROR pinning pages :( (%d, %u)\n",
                   num_gpu_pages, tmp_pg_table[0]->entries);
            goto p2p_get_pages_err;
        }else{
            printk("[GNoM_km]: SUCCESSFULLY PINNED %u GPU PAGES!\n", tmp_pg_table[0]->entries);
        }

        last_dma = tmp_pg_table[0]->pages[0]->physical_address;
        for(i=1; i<tmp_pg_table[0]->entries; ++i){
            if((tmp_pg_table[0]->pages[i]->physical_address - last_dma) != dma_diff){
                printk("[GNoM_km]: ERROR DMA ADDRESS NOT CONTIGUOUS :( :( \n");
                break;
            }
            last_dma = tmp_pg_table[0]->pages[i]->physical_address;
        }

        // Now setup all internal GNoM buffer structures
        count = 0;
        cuda_addr = m_kernel_args.buf_meta_data.gpu_args->m_addr;
        for(i=0; i<num_gpu_pages; ++i){
            count++;
            h_ptr_from_cuda_phys = (void *)ioremap(tmp_pg_table[0]->pages[i]->physical_address, cuda_page_size);
            for(j=0; j<num_gpu_buf_per_page; ++j){
                tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].cuda_addr = cuda_addr; // VA of each CUDA page (calculated by offsetting original VA)
                tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].host_addr = h_ptr_from_cuda_phys; // Mapped virtual address
                tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].user_pg = NULL; // No CPU page for GPU buffer
                tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].dma = tmp_pg_table[0]->pages[i]->physical_address; // DMA address, physical bus address
                tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].page_offset = j*RX_BUFFER_SZ; // Offset of this buffer within the physical page
            }
            cuda_addr += cuda_page_size;
        }

        if(m_kernel_args.buf_meta_data.gpu_args)
            kfree(m_kernel_args.buf_meta_data.gpu_args);

        // Ensure all buffers are set before enabling flag to NIC
        rmb();

        if (m_kernel_args.buffer_type == GNOM_RX) { // RX buffer mapping
            m_gpu_rx_bufs = tmp_gpu_bufs;
            page_table_rx = tmp_pg_table;
            num_gpu_rx_pages = num_gpu_pages;
            num_gpu_rx_buf = num_gpu_buf;
            num_gpu_rx_buf_per_page = num_gpu_buf_per_page;
            rx_or_tx = rx;
            gpu_rx_ready = 1;
        }else{ // TX buffer mapping
            m_gpu_tx_bufs = tmp_gpu_bufs;
            page_table_tx = tmp_pg_table;
            num_gpu_tx_pages = num_gpu_pages;
            num_gpu_tx_buf = num_gpu_buf;
            num_gpu_tx_buf_per_page = num_gpu_buf_per_page;
            rx_or_tx = tx;
            gpu_tx_ready = 1;
        }

        printk("[GNoM_km]: %s CUDA buffers successfully registered with GPU_km...\n", rx_or_tx);
        printk("[GNoM_km]: \t Number of pages:\t\t %d\n", num_gpu_pages);
        printk("[GNoM_km]: \t Number of buffers:\t\t %d\n", num_gpu_buf);
        printk("[GNoM_km]: \t Number of bins:\t\t %d\n", num_gpu_bins);
        printk("[GNoM_km]: \t Number of buffers per page:\t %d\n", num_gpu_buf_per_page);

        if(!ixgbe_callback_register)
            goto p2p_get_pages_err;

        // Free memory and return
        printk("[GNoM_km]: Registration successful, call IOCTL with SIGNAL_NIC cmd to initialize the NIC\n");
        break;

    case SIGNAL_NIC:
        printk("[GNoM_km]: Calling to restart NIC\n");
        ixgbe_callback_register(0); // Signal/reset NIC now that GPU buffers are registered
        break;

    case STOP_SYSTEM:
        printk("[GNoM_km]: STOP_SYSTEM signal received\n");
        IS_DONE_FLAG = 1;
        gpu_rx_ready = 0;
        gpu_tx_ready = 0;

        // Print GNoM_ixgbe stats
        ixgbe_callback_register(4);

        // Print GNoM_km timing stats
        if(total_count > 0){
            average_time = total_time / total_count;
            printk("[GNOM_km]: Complete receive to send total time for %lld buffers: %lld - average = %lld ns (%lld us)\n",
                   total_count, total_time, average_time, average_time/1000);
        }

        pending_batch_tail++;
        rmb();
        wake_up(&m_req_wq); // Wake up any potentially waiting threads

        ixgbe_callback_register(0);
        break;

    case SHUTDOWN_NIC:
        if(!IS_DONE_FLAG){
            printk("[GNoM_km]: Calling to shutdown NIC\n");
            gpu_rx_ready = 0; // Return NIC to original CPU-only mode
            gpu_tx_ready = 0;
            ixgbe_callback_register(0); // Signal/reset NIC now that GPU buffers are registered
        }
        break;

    case GNOM_TX_SEND:
        // Copy arg header from user
#ifdef DO_GNOM_TX
        n_cpy = copy_from_user((void *)tx_batch_info, (void __user *)arg, 3*sizeof(int));
        if(unlikely((n_cpy > 0) || !gnom_tx ||
                    m_lw_pending_batches[tx_batch_info[0]].batch_id != tx_batch_info[2]))
            goto hdr_cpy_err;

        gnom_km_batch_ind = m_lw_pending_batches[tx_batch_info[0]].tx_batch_ind;

        // Now call gnom_tx for every request in the batch
        for(i=0; i<tx_batch_info[1]; ++i){
            gnom_tx(gnom_km_batch_ind, i, 2048 /* FIXME */);
        }
#else
        printk("[GNoM_km]: Error - GNoM is not configured to run TX through GPUDirect. Please define DO_GNOM_TX\n");
#endif
        break;

    case GNOM_REG_MULT_CPU_BUFFERS:
        // This signal is used for GNoM TX. DO_GNOM_TX should be set.
        // User CUDA application registers multiple GPU-accessible CPU buffers to NIC driver
        gnom_print("Registering GPU-accessible CPU buffers\n");

#ifndef DO_GNOM_TX
        printk("[GNoM_km]: Error - GNoM is not configured to run TX. Please define DO_GNOM_TX\n");
#endif

        if(gnom_dev == NULL)
            goto dev_not_set_err;

        // Copy arg header from user
        n_cpy = copy_from_user((void *)&temp_k_hdr, (void __user *)arg, sizeof(kernel_args_hdr));
        if(n_cpy > 0)
            goto hdr_cpy_err;

        m_kernel_args.num_pages = temp_k_hdr.num_pages;
        m_kernel_args.num_buffers = temp_k_hdr.num_buffers;
        m_kernel_args.buffer_type = temp_k_hdr.buffer_type; // RX or TX

        m_kernel_args.buf_meta_data.cpu_args = (cpu_kernel_args *)kmalloc(m_kernel_args.num_pages*sizeof(cpu_kernel_args), GFP_KERNEL);
        if(!m_kernel_args.buf_meta_data.cpu_args)
            goto malloc_err;

        // Copy over all of the information about GPU buffers from user
        n_cpy = copy_from_user((void *)m_kernel_args.buf_meta_data.cpu_args,
                               (void __user *)temp_k_hdr.buf_meta_data.cpu_args,
                               sizeof(cpu_kernel_args)*m_kernel_args.num_pages);
        if(n_cpy > 0)
            goto buffer_cpy_err;

        /* Calculate the number of pages, buffers, and buffers per page */
        num_gpu_pages = m_kernel_args.num_pages;
        num_gpu_buf = m_kernel_args.num_buffers;
        num_gpu_buf_per_page = num_gpu_buf / num_gpu_pages;

        if(num_gpu_buf_per_page*num_gpu_pages != num_gpu_buf)
            goto buffer_cpy_err;

        tmp_gpu_bufs = kmalloc(num_gpu_buf*sizeof(gpu_buf_t), GFP_KERNEL);
        if(!tmp_gpu_bufs)
            goto malloc_err;

        printk("[GNoM_km] Pinning and registering %d GPU-accessible CPU pages, %d buffers, in %d bins\n",
               num_gpu_pages, num_gpu_buf, num_gpu_bins);

        down_read(&current->mm->mmap_sem);
        for(i=0; i<num_gpu_pages; ++i){
            count++;
            // (1) Get the user page
            // (2) Map the user page
            // (3) Setup DMA mapping
            // (4) Setup GNoM data structures

            // (1)
            err = get_user_pages(current, current->mm,
                                 (unsigned long)m_kernel_args.buf_meta_data.cpu_args[i].user_page_va,
                                 1, 1, 1, &tmp_page, NULL);
            if(err == 1){
                // (2)
                tmp_page_va = kmap(tmp_page);

                // (3)
                tmp_dma = dma_map_page(gnom_dev, tmp_page, 0, CPU_PAGE_SIZE, DMA_TO_DEVICE);
                if (dma_mapping_error(gnom_dev, tmp_dma)){
                    up_read(&current->mm->mmap_sem);
                    goto dma_err;
                }

                if(!tmp_dma || !tmp_page){
                    up_read(&current->mm->mmap_sem);
                    goto dma_err;
                }

                // Make sure everything is synced for the CPU/GPU to access
                dma_sync_single_range_for_cpu(gnom_dev, tmp_dma, 0, CPU_PAGE_SIZE, DMA_TO_DEVICE);

                // (4)
                for(j=0; j<num_gpu_buf_per_page; ++j){
                    tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].cuda_addr = m_kernel_args.buf_meta_data.cpu_args[i].cuda_page_va;
                    tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].host_addr = tmp_page_va;
                    tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].user_pg = tmp_page;
                    tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].dma = tmp_dma; // DMA address, physical bus address
                    tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)].page_offset = j*TX_BUFFER_SZ; // Offset of this buffer within the physical page
                    // print_gpu_buf_t(&tmp_gpu_bufs[j + (i*num_gpu_buf_per_page)], j + (i*num_gpu_buf_per_page));
                }
            }else{
                printk("[GNoM_km]: Error with get_user_pages on a mult page mapping (%d)\n", i);
            }
        }
        up_read(&current->mm->mmap_sem);

        // Free up the cpu_args structure
        kfree(m_kernel_args.buf_meta_data.cpu_args);

        // Ensure all buffers are set before enabling flag to NIC
        rmb();

        if (m_kernel_args.buffer_type == GNOM_RX) { // RX buffer mapping
            m_gpu_rx_bufs = tmp_gpu_bufs;
            page_table_rx = NULL; // No CUDA page table for CPU-resident buffers
            num_gpu_rx_pages = num_gpu_pages;
            num_gpu_rx_buf = num_gpu_buf;
            num_gpu_rx_buf_per_page = num_gpu_buf_per_page;
            num_gpu_rx_bins = num_gpu_bins;
            rx_or_tx = rx;
            gpu_rx_ready = 1;
        }else{ // TX buffer mapping
            m_gpu_tx_bufs = tmp_gpu_bufs;
            page_table_tx = NULL; // No CUDA page table for CPU-resident buffers
            num_gpu_tx_pages = num_gpu_pages;
            num_gpu_tx_buf = num_gpu_buf;
            num_gpu_tx_buf_per_page = num_gpu_buf_per_page;
            num_gpu_tx_bins = num_gpu_bins;
            rx_or_tx = tx;
            gpu_tx_ready = 1;
        }

        printk("[GNoM_km]: %s GPU-accessible CPU CUDA buffers successfully registered with GPU_km\n", rx_or_tx);
        printk("[GNoM_km]: \t Number of pages:\t\t %d\n", num_gpu_pages);
        printk("[GNoM_km]: \t Number of buffers:\t\t %d\n", num_gpu_buf);
        printk("[GNoM_km]: \t Number of bins:\t\t %d\n", num_gpu_bins);
        printk("[GNoM_km]: \t Number of buffers per page:\t %d\n", num_gpu_buf_per_page);
        break;

    case GNOM_UNREG_MULT_CPU_BUFFERS:
        if(gnom_dev == NULL)
            goto dev_not_set_err;

        // Copy arg header from user
        n_cpy = copy_from_user((void *)&temp_k_hdr, (void __user *)arg, sizeof(kernel_args_hdr));
        if(n_cpy > 0)
            goto hdr_cpy_err;

        if(temp_k_hdr.buffer_type == GNOM_RX){
            gnom_print("Unregistering GRXBs\n");
            tmp_gpu_bufs = m_gpu_rx_bufs;
            num_gpu_buf = num_gpu_rx_buf;
            num_gpu_pages = num_gpu_rx_pages;
            num_gpu_buf_per_page = num_gpu_rx_buf_per_page;
            num_gpu_bins = num_gpu_rx_bins;
        }else{
            gnom_print("Unregistering GTXBs\n");
            tmp_gpu_bufs = m_gpu_tx_bufs;
            num_gpu_buf = num_gpu_tx_buf;
            num_gpu_pages = num_gpu_tx_pages;
            num_gpu_buf_per_page = num_gpu_tx_buf_per_page;
            num_gpu_bins = num_gpu_tx_bins;
        }

        printk("[GNoM_km]: \t Number of pages:\t\t %d\n", num_gpu_pages);
        printk("[GNoM_km]: \t Number of buffers:\t\t %d\n", num_gpu_buf);
        printk("[GNoM_km]: \t Number of bins:\t\t %d\n", num_gpu_bins);
        printk("[GNoM_km]: \t Number of buffers per page:\t %d\n", num_gpu_buf_per_page);

        count = 0;
        down_read(&current->mm->mmap_sem);
        for(i=0; i<num_gpu_pages; i++){
            if(tmp_gpu_bufs[i*num_gpu_buf_per_page].user_pg != NULL){
                // First buffer in the page will clean things up, then null out everything else
                count++;

                // Tear down DMA mapping
                dma_unmap_page(gnom_dev, tmp_gpu_bufs[i*num_gpu_buf_per_page].dma,
                               CPU_PAGE_SIZE, DMA_TO_DEVICE);

                // Unmap page
                kunmap(tmp_gpu_bufs[i*num_gpu_buf_per_page].user_pg);

                // Set dirty bit on page and release the page cache
                if(!PageReserved(tmp_gpu_bufs[i*num_gpu_buf_per_page].user_pg))
                    SetPageDirty(tmp_gpu_bufs[i*num_gpu_buf_per_page].user_pg);
                page_cache_release(tmp_gpu_bufs[i*num_gpu_buf_per_page].user_pg);

                for(j=0; j<num_gpu_buf_per_page; ++j){
                    // Clear the buffer info
                    tmp_gpu_bufs[j + i*num_gpu_buf_per_page].user_pg = NULL;
                    tmp_gpu_bufs[j + i*num_gpu_buf_per_page].host_addr = NULL;
                    tmp_gpu_bufs[j + i*num_gpu_buf_per_page].cuda_addr = 0;
                    tmp_gpu_bufs[j + i*num_gpu_buf_per_page].dma = 0;
                }
            }
        }
        up_read(&current->mm->mmap_sem);

        printk("[GNoM_km]: Successfully unregistered %d GPU accessible CPU pages\n", count);
        break;

    case TEST_SEND_SINGLE_PACKET:
        printk("[GNoM_km]: Sending packet\n");
        ixgbe_callback_register(3);
        printk("[GNoM_km]: Sending packet complete...\n");
        break;

    case TEST_CHECK_SEND_COMPLETE:
        break;

    default:
        goto ioctl_err;
    }

    return 0;

    // TODO: Make sure all error paths below correctly clean up any partially allocated structures.
dma_err:
    printk("Err: [GNoM_km] DMA mapping failed\n");
    return -EINVAL;

dev_not_set_err:
    printk("Err: [GNoM_km] Device not set yet\n");
    return -EINVAL;

hdr_cpy_err:
    printk("Err: [GNoM_km] Asked to copy %lu bytes, only copied %lu bytes\n",
           sizeof(kernel_args), sizeof(kernel_args)-n_cpy);
    return -EINVAL;

buffer_cpy_err:
    printk("Err: [GNoM_km] Asked to copy %lu bytes, only copied %lu bytes\n",
           sizeof(kernel_args)*m_kernel_args.num_buffers,
           sizeof(kernel_args)*m_kernel_args.num_buffers-n_cpy);
    return -EINVAL;

malloc_err:
    printk("[GNoM_km] Failed to allocate kernel args buffer...\n");
    return -EINVAL;

p2p_get_pages_err:
    if(tmp_gpu_bufs)
        vfree(tmp_gpu_bufs);
    if(m_kernel_args.buf_meta_data.gpu_args)
        kfree(m_kernel_args.buf_meta_data.gpu_args);
    printk("\t[GNoM_km] Failed to pin the GPU buffer (%d) :( (%d successfully registered)\n", ret, count);
    return -EINVAL;

ioctl_err:
    pr_err("Invalid cmd: %u\n", cmd);
    return -EINVAL;
}
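/* For context, a user-space caller would drive device_ioctl() through the
 * GNoM_km character device with the command codes handled above. The sketch
 * below is only illustrative: the device node path and the exact layouts of
 * kernel_args_hdr / kernel_args are assumptions inferred from the fields the
 * handler reads (num_pages, num_buffers, buffer_type, buf_meta_data.gpu_args,
 * m_addr, m_size, m_tokens), not definitions taken from the GNoM headers.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "gnom_ioctl.h" /* assumed shared header defining GPU_REG_MULT_BUFFER_CMD, SIGNAL_NIC, GNOM_RX */

struct p2p_tokens {            /* assumed layout */
    uint64_t p2pToken;
    uint32_t vaSpaceToken;
};

struct kernel_args {           /* assumed layout */
    uint64_t m_addr;           /* CUDA device VA of the pinned region */
    uint64_t m_size;           /* region size in bytes */
    struct p2p_tokens m_tokens;
};

struct kernel_args_hdr {       /* assumed layout, mirroring the handler's accesses */
    int num_pages;             /* 64 KB GPU pages in the region */
    int num_buffers;           /* buffers carved out of the region */
    int buffer_type;           /* GNOM_RX or GNOM_TX */
    union {
        struct kernel_args *gpu_args;
        void *cpu_args;
    } buf_meta_data;
};

/* Register one large GPUDirect buffer and then bring the NIC up on the GNoM path. */
int register_gpu_buffers(struct kernel_args_hdr *hdr)
{
    int fd = open("/dev/gnom_km", O_RDWR); /* device node name assumed */
    if (fd < 0) {
        perror("open");
        return -1;
    }

    if (ioctl(fd, GPU_REG_MULT_BUFFER_CMD, hdr) < 0 || /* pin region, carve per-page buffers */
        ioctl(fd, SIGNAL_NIC, 0) < 0) {                /* restart NIC now that buffers exist */
        perror("ioctl");
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}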