/* Created thread; all these calls are in the thread context */
void *hammer_cpu_worker_loop(void *context)
{
	hammer_cpu_worker_context_t *my_context = (hammer_cpu_worker_context_t *)context;
	hammer_sched_t *sched = my_context->sched;
	hammer_batch_t *batch = my_context->batch;
	int core_id = my_context->core_id;
	hammer_epoll_handlers_t *handler;
	unsigned long mask = 0;

	/* Set affinity of this CPU worker */
	mask = 1 << core_id;
	if (sched_setaffinity(0, sizeof(unsigned long), (cpu_set_t *)&mask) < 0) {
		hammer_err("Err set affinity in CPU worker\n");
		exit(0);
	}

	if (config->gpu) {
		handler = hammer_epoll_set_handlers(
				(void *) hammer_handler_read,
				(void *) hammer_batch_handler_read,
				(void *) hammer_handler_write,
				(void *) hammer_handler_write, /* write directly, the message has already been encrypted */
				(void *) hammer_handler_error,
				(void *) hammer_handler_close,
				(void *) hammer_handler_close);
	} else {
		/* This is just used for forwarding */
		handler = hammer_epoll_set_handlers(
				(void *) hammer_handler_read,
				(void *) hammer_handler_read,
				(void *) hammer_handler_write,
				(void *) hammer_handler_write,
				(void *) hammer_handler_error,
				(void *) hammer_handler_close,
				(void *) hammer_handler_close);
	}

	/* Export the scheduler node of this thread as thread-specific data */
	pthread_setspecific(worker_sched_struct, (void *)sched);
	__builtin_prefetch(sched);
	__builtin_prefetch(&worker_sched_struct);

	pthread_setspecific(worker_batch_struct, (void *)batch);
	__builtin_prefetch(batch);
	__builtin_prefetch(&worker_batch_struct);

	if (config->gpu) {
		/* Allocate the batch buffers. Each CPU worker has a set of buffers:
		 * two as input buffers, and two as output buffers. */
		hammer_batch_init();
	}

	/* Notify the dispatcher and the GPU worker that this thread has been created */
	pthread_mutex_lock(&mutex_worker_init);
	sched->initialized = 1;
	pthread_mutex_unlock(&mutex_worker_init);

	/* Enter the epoll_wait() loop */
	hammer_epoll_start(sched->epoll_fd, handler, sched->epoll_max_events);

	return 0;
}
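/* Aside: casting an unsigned long to cpu_set_t only works while core_id fits
 * in one machine word. A minimal, more portable sketch using the standard
 * glibc CPU_* macros (assumes _GNU_SOURCE and <sched.h>; the helper name is
 * hypothetical): */
static int hammer_set_affinity_sketch(int core_id)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(core_id, &set);

	/* pid 0 == the calling thread; returns 0 on success, -1 on error */
	return sched_setaffinity(0, sizeof(cpu_set_t), &set);
}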
int hammer_dispatcher_loop(int server_fd)
{
	int worker_id = 0;
	hammer_sched_t *sched;
	hammer_connection_t *c;

	/* Activate TCP_DEFER_ACCEPT */
	if (hammer_socket_set_tcp_defer_accept(server_fd) != 0) {
		hammer_warn("TCP_DEFER_ACCEPT failed\n");
	}

	/* Accept new connections */
	while (1) {
		/* Accept first */
		c = hammer_handler_accept(server_fd);

		/* Next worker target */
		worker_id = hammer_dispatcher_next_worker_id();
		if (hammer_unlikely(worker_id == -1)) {
			hammer_err("no worker available\n");
			exit(0);
		}
		sched = &(sched_set[worker_id]);

		/* Assign the connection to the worker thread */
		hammer_sched_add_connection(c, sched, NULL);
	}

	return 0;
}
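/* For illustration: hammer_dispatcher_next_worker_id() is not shown in this
 * section. A minimal round-robin sketch over the CPU workers (hypothetical,
 * assuming config->cpu_worker_num is the worker count) might look like: */
static int hammer_dispatcher_next_worker_id_sketch(void)
{
	static int next = 0;
	int id;

	if (config->cpu_worker_num <= 0)
		return -1; /* no worker available */

	id = next;
	next = (next + 1) % config->cpu_worker_num;
	return id;
}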
void hammer_sched_add_connection(hammer_connection_t *c, hammer_sched_t *sched, hammer_connection_t *rc)
{
	int ret;

	ret = hammer_epoll_add(sched->epoll_fd, c->socket, HAMMER_EPOLL_READ,
			HAMMER_EPOLL_LEVEL_TRIGGERED, (void *)c);

	if (hammer_likely(ret == 0)) {
		if (rc != NULL) {
			/* rc != NULL: this connection was added by connect(), to the server.
			 * Link the two connections so each side can find its peer. */
			c->r_conn = rc;
			rc->r_conn = c;
			sched->connected_connections ++;
		} else {
			/* rc == NULL: this connection was added by accept(), from a client */
			sched->accepted_connections ++;
		}
	} else {
		/* On failure, close the connection */
		hammer_close_connection(c);
		hammer_err("epoll add fails\n");
		exit(0);
	}

	return;
}
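/* Usage sketch (hypothetical helper names): when a client connection triggers
 * a connect() to the backend, the new server-side connection is registered
 * with the client connection as its peer, so hammer_handler_write() can follow
 * c->r_conn back to the job list of the side that read the data: */
void hammer_handler_connect_sketch(hammer_connection_t *client_c, hammer_sched_t *sched)
{
	/* hammer_connection_create_sketch() is assumed to connect() to the
	 * backend and wrap the socket in a hammer_connection_t */
	hammer_connection_t *server_c = hammer_connection_create_sketch();

	/* Pairs server_c with client_c and adds server_c to this worker's epoll */
	hammer_sched_add_connection(server_c, sched, client_c);
}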
/* Get the buffer of each CPU worker at each time interval I */
void hammer_gpu_get_batch(hammer_gpu_worker_t *g, hammer_batch_t *batch_set)
{
	int i, id;
	hammer_batch_t *batch;

	/* Get the next buffer set: sets A and B alternate between intervals */
	if (g->buf_set_id == 0) {
		g->cur_buf_set = g->buf_set_B;
		g->buf_set_id = 1;
	} else if (g->buf_set_id == 1) {
		g->cur_buf_set = g->buf_set_A;
		g->buf_set_id = 0;
	}

	/* Tell each CPU worker that we are taking its batch */
	for (i = 0; i < config->cpu_worker_num; i ++) {
		batch = &(batch_set[i]);

		if (batch->buf_has_been_taken == -1) {
			pthread_mutex_lock(&(batch->mutex_batch_launch));
			id = batch->buf_has_been_taken = batch->cur_buf_id;
			pthread_mutex_unlock(&(batch->mutex_batch_launch));

			assert(id == g->buf_set_id);
		} else {
			hammer_err("error in hammer_gpu_get_batch\n");
			exit(0);
		}

		/* For statistics */
		g->total_bytes += g->cur_buf_set[i]->buf_length;
	}

	return;
}
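/* Counterpart sketch (hypothetical): on the CPU-worker side, the batch code
 * would notice that buf_has_been_taken matches its current buffer and flip to
 * the other buffer of the pair before filling in new jobs. Assuming the same
 * hammer_batch_t fields as above: */
void hammer_batch_switch_buffer_sketch(hammer_batch_t *batch)
{
	pthread_mutex_lock(&(batch->mutex_batch_launch));
	if (batch->buf_has_been_taken == batch->cur_buf_id) {
		/* The GPU worker owns the current buffer now; flip to the other one */
		batch->cur_buf_id = 1 - batch->cur_buf_id;
		batch->buf_has_been_taken = -1; /* re-arm for the next interval */
	}
	pthread_mutex_unlock(&(batch->mutex_batch_launch));
}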
/* Write to the server. This is also used for writing to clients when we
 * accelerate encryption and HMAC with the GPU; in that case we just send
 * the whole packet with this handler. */
int hammer_handler_write(hammer_connection_t *c)
{
	int send;
	hammer_connection_t *rc;
	hammer_job_t *this_job;
	struct hammer_list *job_list, *job_head;

	if (c->ssl) {
		hammer_err("What's up, this should not be an ssl connection\n");
		exit(0);
	}

	/* c is the socket to write to; rc is the socket that has read the data */
	rc = c->r_conn;

	job_list = rc->job_list;
	hammer_list_foreach(job_head, job_list) {
		this_job = hammer_list_entry(job_head, hammer_job_t, _head);

		send = hammer_socket_write(c->socket, this_job->job_body_ptr,
				this_job->job_body_length);
		if (send != this_job->job_body_length) {
			printf("Not all data was sent\n");
			return -1;
		}

		hammer_conn_job_del(this_job);
	}

	return 0;
}
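/* Note: a short write on a non-blocking socket is not necessarily an error.
 * A minimal sketch that retries until the job body is fully written or a real
 * error occurs (hypothetical helper; assumes hammer_socket_write() returns -1
 * with errno set on error, like write(2)): */
int hammer_write_full_sketch(int fd, char *buf, int len)
{
	int n, off = 0;

	while (off < len) {
		n = hammer_socket_write(fd, buf + off, len - off);
		if (n < 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				continue; /* or re-arm EPOLLOUT and resume later */
			return -1;
		}
		off += n;
	}
	return off;
}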
int hammer_epoll_create(int max_events)
{
	int efd;

	efd = epoll_create(max_events);
	if (efd == -1) {
		perror("epoll_create");
		hammer_err("epoll_create() failed");
	}

	return efd;
}
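/* Aside: since Linux 2.6.8 the size argument of epoll_create() is ignored
 * (it must merely be positive). A sketch of the same wrapper on top of the
 * newer epoll_create1(), which also supports EPOLL_CLOEXEC: */
int hammer_epoll_create1_sketch(void)
{
	int efd = epoll_create1(EPOLL_CLOEXEC);

	if (efd == -1) {
		perror("epoll_create1");
		hammer_err("epoll_create1() failed");
	}
	return efd;
}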
int hammer_close_connection(hammer_connection_t *c)
{
	hammer_job_t *this_job;
	struct hammer_list *job_list, *job_head;

	if (c == NULL) {
		hammer_err("c is null\n");
		return 0;
	}

	hammer_socket_close(c->socket);

	/* Release any jobs still queued on this connection */
	job_list = c->job_list;
	hammer_list_foreach(job_head, job_list) {
		this_job = hammer_list_entry(job_head, hammer_job_t, _head);
		hammer_conn_job_del(this_job);
	}

	return 0;
}
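/* Caution: if hammer_conn_job_del() unlinks the node that job_head points at,
 * advancing the cursor inside hammer_list_foreach() reads freed memory. If
 * hammer_list is a circular doubly-linked list in the usual style, a "safe"
 * variant that saves the next pointer first avoids this (sketch, assuming a
 * ->next field on struct hammer_list): */
void hammer_conn_job_del_all_sketch(struct hammer_list *job_list)
{
	struct hammer_list *job_head = job_list->next;
	struct hammer_list *next;
	hammer_job_t *this_job;

	while (job_head != job_list) {
		next = job_head->next; /* save before the node is unlinked/freed */
		this_job = hammer_list_entry(job_head, hammer_job_t, _head);
		hammer_conn_job_del(this_job);
		job_head = next;
	}
}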
void hammer_sched_add_connection(hammer_connection_t *c, hammer_sched_t *sched)
{
	int ret;

	ret = hammer_epoll_add(sched->epoll_fd, c->socket, HAMMER_EPOLL_READ,
			HAMMER_EPOLL_LEVEL_TRIGGERED, (void *)c);

	if (hammer_likely(ret == 0)) {
		if (c->type == HAMMER_CONN_CLIENT) {
			sched->client_connections ++;
		} else { /* HAMMER_CONN_SERVER */
			sched->server_connections ++;
		}
	} else {
		/* On failure, close the connection */
		hammer_close_connection(c);
		hammer_err("epoll add fails\n");
		exit(0);
	}

	return;
}
/* Tell the CPU workers that this batch has been completed */
void hammer_gpu_give_result(hammer_gpu_worker_t *g, hammer_batch_t *batch_set)
{
	int i;
	hammer_batch_t *batch;

	for (i = 0; i < config->cpu_worker_num; i ++) {
		batch = &(batch_set[i]);

		if (batch->processed_buf_id == -1) {
			/* Just mark that a buffer has been processed */
			pthread_mutex_lock(&(batch->mutex_batch_complete));
			batch->processed_buf_id = g->buf_set_id;
			pthread_mutex_unlock(&(batch->mutex_batch_complete));
		} else {
			hammer_err("error in hammer_gpu_give_result\n");
			exit(0);
		}
	}

	return;
}
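/* Counterpart sketch (hypothetical): hammer_batch_if_gpu_processed_new(),
 * used in the epoll loop below, would poll processed_buf_id and re-arm it.
 * Shown here taking the batch explicitly; the real call presumably fetches it
 * from the worker_batch_struct thread-specific data: */
int hammer_batch_if_gpu_processed_new_sketch(hammer_batch_t *batch)
{
	int buf_id = -1;

	pthread_mutex_lock(&(batch->mutex_batch_complete));
	if (batch->processed_buf_id != -1) {
		buf_id = batch->processed_buf_id;
		batch->processed_buf_id = -1; /* consume the notification */
	}
	pthread_mutex_unlock(&(batch->mutex_batch_complete));

	return buf_id; /* -1 if nothing new, otherwise the processed buffer id */
}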
/* Created thread; all these calls are in the thread context */
void *hammer_gpu_worker_loop(void *c)
{
	hammer_timer_t t, counter, loopcounter;
	hammer_log_t log;
	hammer_gpu_worker_context_t *context = c;
	hammer_batch_t *batch_set = context->cpu_batch_set;
	hammer_sched_t *sched_set = context->sched_set;
	int i, first, ready, core_id = context->core_id;
	unsigned long mask = 0;
	double elapsed_time;
	int cuda_stream_id;
	hammer_batch_buf_t *buf;

	/* Set affinity of this GPU worker */
	mask = 1 << core_id;
	if (sched_setaffinity(0, sizeof(unsigned long), (cpu_set_t *)&mask) < 0) {
		hammer_err("Err set affinity in GPU worker\n");
		exit(0);
	}

	/* Init timers */
	hammer_timer_init(&t);
	hammer_timer_init(&counter);
	hammer_timer_init(&loopcounter);
	hammer_log_init(&log);

	/* Synchronization: wait until all CPU workers are initialized */
	while (1) {
		ready = 0;

		pthread_mutex_lock(&mutex_worker_init);
		for (i = 0; i < config->cpu_worker_num; i++) {
			if (sched_set[i].initialized)
				ready++;
		}
		pthread_mutex_unlock(&mutex_worker_init);

		if (ready == config->cpu_worker_num)
			break;
		usleep(5000);
	}

	/* Initialize the GPU worker. We wait until all CPU workers have been
	 * initialized, so we can init the GPU worker with the batches of the
	 * CPU workers. */
	hammer_gpu_worker_t g;
	hammer_gpu_worker_init(&g, batch_set, sched_set);

	/* Timers for each kernel launch */
	hammer_timer_restart(&loopcounter);

	for (i = 0; i < config->iterations; i ++) {
		hammer_log_loop_marker(&log);

		/* Counter for the whole loop, started at the third iteration to skip warm-up */
		if (i == 2)
			hammer_timer_restart(&counter);

		/* Wait for time interval 'I' -- the synchronization point.
		 * This is a CPU/GPU synchronization point, as all commands in the
		 * in-order queue before the preceding cl*Unmap() are now finished.
		 * We can accurately sample the per-loop timer here. */
		first = 1;
		do {
			elapsed_time = hammer_timer_get_elapsed_time(&loopcounter);
			if (first) {
				hammer_log_msg(&log, "\n%s %f\n", "<<<<<<<<Elapsed Time : ", elapsed_time);
				first = 0;
			}
			if (elapsed_time - config->I > 1) {
				/* Surpassed the time point by more than 1 ms */
				hammer_log_msg(&log, "\n%s %f\n", ">>>>>>>>Time point lost!!!! : ", elapsed_time);
				break;
			}
		} while (fabs(elapsed_time - config->I) > 1); /* fabs() from <math.h>; abs() would truncate the double */

		hammer_log_msg(&log, "%s %f\n", ">>>>>>>>Time point arrived : ", elapsed_time);
		hammer_timer_restart(&loopcounter);

		/* Get the input buffers from the CPU workers */
		hammer_timer_restart(&t);
		hammer_gpu_get_batch(&g, batch_set);
		hammer_timer_stop(&t);

		hammer_log_msg(&log, "\n%s\n", "---------------------------");
		hammer_log_timer(&log, "%s %f ms\n", "Get Input Time",
				hammer_timer_get_total_time(&t), 10, 1);

		/* Enqueue a kernel run call: we launch each CPU worker batch as a stream */
		hammer_timer_restart(&t);
		for (cuda_stream_id = 0; cuda_stream_id < config->cpu_worker_num; cuda_stream_id ++) {
			buf = g.cur_buf_set[cuda_stream_id];
			// FIXME:
			crypto_context_aes_sha1_encrypt(
					&(g.cry_ctx),
					buf->input_buf,
					buf->output_buf,
					0, // in_pos
					buf->aes_key_pos,
					buf->ivs_pos,
					buf->hmac_key_pos,
					buf->pkt_offset_pos,
					buf->length_pos,
					buf->buf_size, // input buffer size
					buf->buf_length, // output buffer size FIXME ???
					buf->job_num,
					cuda_stream_id,
					128);

			/* Wait for transfer completion */
			crypto_context_sync(&(g.cry_ctx), cuda_stream_id, buf->output_buf, 1, 1);
		}
		hammer_timer_stop(&t);
		hammer_log_timer(&log, "%s %f ms\n", "Execution Time",
				hammer_timer_get_total_time(&t), 10, 1);

		/* Tell the CPU workers that this batch has been processed */
		hammer_gpu_give_result(&g, batch_set);

		hammer_log_msg(&log, "%s %dth iteration\n", "This is", i);
		//if (i > 1) timeLog->Msg( "%s %f ms\n", "Time after is", counter.GetElapsedTime());
	}

	hammer_timer_stop(&counter);
	printf("End of execution, now the program costs : %f ms\n",
			hammer_timer_get_total_time(&counter));
	//FIXME: printf("Processing speed is %.2f Mbps\n", (bytes * 8) / (1e3 * hammer_timer_get_total_time(&counter)));

	return 0;
}
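/* Design note: syncing each stream right after its launch serializes the
 * streams on the host side. To actually overlap the per-worker batches on the
 * GPU, one could first enqueue all launches and then sync each stream; a
 * sketch using the same call signatures as above: */
for (cuda_stream_id = 0; cuda_stream_id < config->cpu_worker_num; cuda_stream_id ++) {
	buf = g.cur_buf_set[cuda_stream_id];
	crypto_context_aes_sha1_encrypt(&(g.cry_ctx), buf->input_buf, buf->output_buf,
			0, buf->aes_key_pos, buf->ivs_pos, buf->hmac_key_pos,
			buf->pkt_offset_pos, buf->length_pos, buf->buf_size,
			buf->buf_length, buf->job_num, cuda_stream_id, 128);
}
for (cuda_stream_id = 0; cuda_stream_id < config->cpu_worker_num; cuda_stream_id ++) {
	buf = g.cur_buf_set[cuda_stream_id];
	crypto_context_sync(&(g.cry_ctx), cuda_stream_id, buf->output_buf, 1, 1);
}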
void *hammer_epoll_start(int efd, hammer_epoll_handlers_t *handler, int max_events)
{
	int i, ret = -1;
	int num_events;
	struct epoll_event *events;
	hammer_connection_t *c;
	// int fds_timeout;

	//fds_timeout = log_current_utime + config->timeout;
	events = hammer_mem_malloc(max_events * sizeof(struct epoll_event));

	while (1) {
		if (config->gpu) {
			/* Each time, we first check whether the GPU has given any indication of
			 * 1) which buffer has been taken, and 2) which buffer has been processed */
			if (hammer_batch_if_gpu_processed_new()) {
				hammer_batch_forwarding();
			}
		}

		/* events is already a pointer to the event array, so it is passed directly */
		num_events = hammer_epoll_wait(efd, events, max_events);

		for (i = 0; i < num_events; i ++) {
			c = (hammer_connection_t *) events[i].data.ptr;

			if (events[i].events & EPOLLIN) {
				if (c->type == HAMMER_CONN_CLIENT) {
					ret = (*handler->client_read) (c);
				} else {
					if (c->type != HAMMER_CONN_SERVER) {
						hammer_err("this connection is not a server conn?\n");
						exit(0);
					}
					ret = (*handler->server_read) (c);
				}
			} else if (events[i].events & EPOLLOUT) {
				if (c->type == HAMMER_CONN_CLIENT) {
					ret = (*handler->client_write) (c);
				} else {
					if (c->type != HAMMER_CONN_SERVER) {
						hammer_err("this connection is not a server conn?\n");
						exit(0);
					}
					ret = (*handler->server_write) (c);
				}
			} else if (events[i].events & (EPOLLHUP | EPOLLERR | EPOLLRDHUP)) {
				ret = (*handler->error) (c);
			} else {
				hammer_err("unexpected epoll event\n");
				exit(0);
			}

			if (ret < 0) {
				HAMMER_TRACE("[FD %i] Epoll Event FORCE CLOSE | ret = %i", c->socket, ret);
				(*handler->close) (c);
			}
		}

		// FIXME: enable timeout
		/* Check timeouts and update next one
		if (log_current_utime >= fds_timeout) {
			hammer_sched_check_timeouts(sched);
			fds_timeout = log_current_utime + config->timeout;
		}*/
	}

	return NULL;
}
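/* For reference: if hammer_epoll_wait() is a thin wrapper over epoll_wait(2),
 * its events parameter is already a pointer to the caller's array, which is
 * why the loop above passes events rather than &events (as the original FIXME
 * suspected). A sketch of such a wrapper: */
int hammer_epoll_wait_sketch(int efd, struct epoll_event *events, int max_events)
{
	int n;

	/* timeout -1: block until at least one event is ready */
	n = epoll_wait(efd, events, max_events, -1);
	if (n == -1 && errno != EINTR)
		perror("epoll_wait");

	return n;
}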