/*
 * Benchmark driver for matmul_optimize().
 *
 * Allocates four 64-byte-aligned WIDTH*HEIGHT double matrices, loads the two
 * operands and the reference product from TEST_FILENAME through
 * read_matrix(), times a single matmul_optimize() call with timestamp_us()
 * and prints the elapsed value, followed by " incorrect" when
 * compare_matrix() returns non-zero.
 *
 * Exit status: 0 on a completed run and also when read_matrix() reports an
 * error (unchanged from the previous behaviour); 1 only when a buffer cannot
 * be allocated.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    int exit_code = 0;
    double *matA = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
    double *matB = _mm_malloc((WIDTH*HEIGHT)*sizeof(double), 64);
    double *prod = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
    double *prod_ref = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);

    if (matA == NULL || matB == NULL || prod == NULL || prod_ref == NULL) {
        fprintf(stderr, "Cannot allocate matrices\n");
        exit_code = 1;
    } else {
        int read_flag = read_matrix(TEST_FILENAME, prod_ref, matA, matB);

        if (read_flag == 1) {
            printf("Cannot open test file\n");
        } else if (read_flag == 2) {
            printf("Error while reading data from test file");
        } else if (read_flag == 3) {
            printf("Error while closing the test file");
        }

        if (!read_flag) {
            uint64_t start = timestamp_us();
            matmul_optimize(prod, matA, matB); /* run the optimization functions. */
            uint64_t elapsed = timestamp_us() - start;

            /* uint64_t is not `unsigned long` on every ABI; print it portably. */
            if (compare_matrix(prod, prod_ref)) {
                printf("%llu incorrect\n", (unsigned long long)elapsed);
            } else {
                printf("%llu\n", (unsigned long long)elapsed);
            }
        }
    }

    /* Single release point so the buffers are freed on every path. */
    _mm_free(prod_ref);
    _mm_free(prod);
    _mm_free(matB);
    _mm_free(matA);
    return exit_code;
}
/*
 * Softmax over the activations of each volume in[start..end] (inclusive),
 * written to the matching out[] volume.
 *
 * For every sample the first l->out_depth weights are shifted by their
 * maximum before exponentiation (so exp() never sees a positive argument),
 * then divided by the sum of the exponentials.
 *
 * All three passes now use l->out_depth. Previously the max/exp passes were
 * hard-coded to 10 entries while the normalisation ran to l->out_depth, and
 * the exponentials went through a fixed-size stack buffer; the result is the
 * same for out_depth == 10. Exponentials are stored straight into the output
 * volume, which is safe even when in[j] and out[j] are the same volume
 * because each index is read before it is written.
 *
 * The elapsed timestamp_us() delta is added to l->myTime.
 */
void softmax_forward(softmax_layer_t* l, vol_t** in, vol_t** out, int start, int end)
{
    uint64_t tempTime = timestamp_us();
    const int outd = l->out_depth;

    for (int j = start; j <= end; j++) {
        vol_t* V = in[j];
        vol_t* A = out[j];

        /* compute max activation */
        double amax = V->w[0];
        for (int i = 1; i < outd; i++) {
            if (V->w[i] > amax) {
                amax = V->w[i];
            }
        }

        /* compute exponentials (carefully to not blow up) */
        double esum = 0.0;
        for (int i = 0; i < outd; i++) {
            double e = exp(V->w[i] - amax);
            esum += e;
            A->w[i] = e;
        }

        /* normalize so the outputs sum to one */
        for (int i = 0; i < outd; i++) {
            A->w[i] /= esum;
        }
    }

    l->myTime += timestamp_us() - tempTime;
}
/*
 * Load the filters and biases of a convolution layer from the text file fn.
 *
 * File layout as read here: four integers "sx sy depth filters", then for
 * every filter d one value per (x, y, z) with z varying fastest, then one
 * bias value per filter. The header is asserted to match l->sx, l->sy,
 * l->in_depth and l->out_depth.
 *
 * I/O failures are reported on stderr and the function returns early: if the
 * file cannot be opened the layer is untouched, if the data is malformed or
 * truncated the layer keeps whatever was stored up to that point.
 * On success the elapsed timestamp_us() delta is added to l->myTime.
 */
void conv_load(conv_layer_t* l, const char* fn)
{
    uint64_t tempTime2 = timestamp_us();
    const int depth0 = l->out_depth;
    int sx, sy, depth, filters;

    FILE* fin = fopen(fn, "r");
    if (fin == NULL) {
        fprintf(stderr, "conv_load: cannot open %s\n", fn);
        return;
    }

    /* Without all four header fields the checks below would read garbage. */
    if (fscanf(fin, "%d %d %d %d", &sx, &sy, &depth, &filters) != 4) {
        goto bad_data;
    }

    assert(sx == l->sx);
    assert(sy == l->sy);
    assert(depth == l->in_depth);
    assert(filters == l->out_depth);

    for (int d = 0; d < depth0; d++) {
        for (int x = 0; x < sx; x++) {
            for (int y = 0; y < sy; y++) {
                for (int z = 0; z < depth; z++) {
                    double val;
                    if (fscanf(fin, "%lf", &val) != 1) {
                        goto bad_data;
                    }
                    set_vol(l->filters[d], x, y, z, val);
                }
            }
        }
    }

    for (int d = 0; d < depth0; d++) {
        double val;
        if (fscanf(fin, "%lf", &val) != 1) {
            goto bad_data;
        }
        set_vol(l->biases, 0, 0, d, val);
    }

    l->myTime += timestamp_us() - tempTime2;
    fclose(fin);
    return;

bad_data:
    fprintf(stderr, "conv_load: malformed or truncated weight file %s\n", fn);
    fclose(fin);
}
/*
 * Load the weights and biases of a fully connected layer from the text
 * file fn.
 *
 * File layout as read here: two integers "num_inputs out_depth", then
 * FC_INPUTS weights for each of the FC_OUTPUTS filters, then FC_OUTPUTS
 * biases. The header is asserted to match l->num_inputs and l->out_depth;
 * the loop bounds themselves stay at the fixed 10 x 320 shape this function
 * has always used.
 *
 * I/O failures are reported on stderr and the function returns early: if the
 * file cannot be opened the layer is untouched, if the data is malformed or
 * truncated the layer keeps whatever was stored up to that point.
 * On success the elapsed timestamp_us() delta is added to l->myTime.
 */
void fc_load(fc_layer_t* l, const char* fn)
{
    enum { FC_OUTPUTS = 10, FC_INPUTS = 320 };

    uint64_t tempTime = timestamp_us();
    int num_inputs;
    int out_depth;

    FILE* fin = fopen(fn, "r");
    if (fin == NULL) {
        fprintf(stderr, "fc_load: cannot open %s\n", fn);
        return;
    }

    /* Both header fields must be present before they can be validated. */
    if (fscanf(fin, "%d %d", &num_inputs, &out_depth) != 2) {
        goto bad_data;
    }

    assert(out_depth == l->out_depth);
    assert(num_inputs == l->num_inputs);

    for (int i = 0; i < FC_OUTPUTS; i++) {
        for (int d = 0; d < FC_INPUTS; d++) {
            double val;
            if (fscanf(fin, "%lf", &val) != 1) {
                goto bad_data;
            }
            l->filters[i]->w[d] = val;
        }
    }

    for (int i = 0; i < FC_OUTPUTS; i++) {
        double val;
        if (fscanf(fin, "%lf", &val) != 1) {
            goto bad_data;
        }
        l->biases->w[i] = val;
    }

    fclose(fin);
    l->myTime += timestamp_us() - tempTime;
    return;

bad_data:
    fprintf(stderr, "fc_load: malformed or truncated weight file %s\n", fn);
    fclose(fin);
}
/*
 * Rectified linear unit: for every sample index in [start, end] copy the
 * first in_sx * in_sy * in_depth weights of in[] to out[], replacing values
 * that compare less than zero with 0.0.
 *
 * The elapsed timestamp_us() delta is added to l->myTime.
 */
void relu_forward(relu_layer_t* l, vol_t** in, vol_t** out, int start, int end)
{
    const uint64_t t0 = timestamp_us();
    const int count = l->in_sx * l->in_sy * l->in_depth;

    for (int s = start; s <= end; s++) {
        vol_t *src = in[s];
        vol_t *dst = out[s];

        for (int k = 0; k < count; k++) {
            const double v = src->w[k];

            if (v < 0.0) {
                dst->w[k] = 0.0;
            } else {
                dst->w[k] = v;
            }
        }
    }

    l->myTime += timestamp_us() - t0;
}
//for 20 depth void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) { uint64_t tempTime = timestamp_us(); for (int i = start; i <= end; i++) { vol_t* V = in[i]; vol_t* A = out[i]; for(int d = 0; d < 20; d++) { vol_t* f = l->filters[d]; int x = -2; int y = -2; for(int ay = 0; ay < 8; y += 1, ay++) { x = -2; for(int ax=0; ax < 8; x += 1, ax++) { double a = 0.0; __m256d sum = _mm256_setzero_pd(); for(int fy = 0; fy < 5; fy++) { int oy = y + fy; for(int fx = 0; fx < 5; fx++) { int ox = x + fx; if(oy >= 0 && oy < 8 && ox >=0 && ox < 8) { __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20])); __m256d vector2 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20])); __m256d vectorMult = _mm256_mul_pd(vector, vector2); sum =_mm256_add_pd (vectorMult, sum); __m256d vector0 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+4])); __m256d vector9 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+ 4])); __m256d vectorMult0 = _mm256_mul_pd(vector0, vector9); sum =_mm256_add_pd (vectorMult0, sum); __m256d vector3 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+8])); __m256d vector4 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+8])); __m256d vectorMult2 = _mm256_mul_pd(vector3, vector4); sum =_mm256_add_pd (vectorMult2, sum); __m256d vector5 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+12])); __m256d vector6 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+12])); __m256d vectorMult3 = _mm256_mul_pd(vector5, vector6); sum =_mm256_add_pd (vectorMult3, sum); __m256d vector7 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+16])); __m256d vector8 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+16])); __m256d vectorMult4 = _mm256_mul_pd(vector7, vector8); sum =_mm256_add_pd (vectorMult4, sum); } } } for(int i = 0; i < 4; i++) { a+= sum[i]; } a += l->biases->w[d]; set_vol(A, ax, ay, d, a); } } } } l->myTime += timestamp_us() - tempTime; }
/*
 * Fully connected forward pass for sample indices [start, end].
 *
 * Each of the 10 outputs is the dot product of the first 320 input weights
 * with the matching filter's weights (accumulated in index order), plus the
 * bias for that output.
 *
 * The elapsed timestamp_us() delta is added to l->myTime.
 */
void fc_forward(fc_layer_t* l, vol_t** in, vol_t** out, int start, int end)
{
    const uint64_t t0 = timestamp_us();

    for (int s = start; s <= end; s++) {
        vol_t *src = in[s];
        vol_t *dst = out[s];

        for (int o = 0; o < 10; o++) {
            vol_t *weights = l->filters[o];
            double acc = 0.0;
            int k = 0;

            while (k < 320) {
                acc += src->w[k] * weights->w[k];
                k++;
            }

            dst->w[o] = acc + l->biases->w[o];
        }
    }

    l->myTime += timestamp_us() - t0;
}
// Perform the classification (this calls into the functions from cnn.c double run_classification(int* samples, int n, double** keep_output) { fprintf(stderr, "Making network...\n"); network_t* net = load_cnn_snapshot(); fprintf(stderr, "Loading batches...\n"); for (int i = 0; i < n; i++) { int batch = samples[i]/10000; if (batches[batch] == NULL) { batches[batch] = load_batch(batch); } } vol_t** input = (vol_t**)malloc(sizeof(vol_t*)*n); double* output = (double*)malloc(sizeof(double)*n); for (int i = 0; i < n; i++) { input[i] = batches[samples[i]/10000][samples[i]%10000]; } fprintf(stderr, "Running classification...\n"); uint64_t start_time = timestamp_us(); net_classify_cats(net, input, output, n); uint64_t end_time = timestamp_us(); for (int i = 0; i < n; i++) { samples[i] = (output[i] > 0.5) ? 0 : -1; } double dt = (double)(end_time-start_time) / 1000.0; fprintf(stderr, "TIME: %lf ms\n", dt); free_network(net); free(input); if (keep_output == NULL) free(output); else *keep_output = output; return dt; }
int main(int argc, char *argv[]) { uint64_t* newimg = _mm_malloc(WIMAGE*HIMAGE*sizeof(uint64_t), 64); uint64_t* newimg_ref = _mm_malloc(WIMAGE*HIMAGE*sizeof(uint64_t), 64); uint16_t* filter = _mm_malloc(WFILTER*HFILTER*sizeof(uint16_t), 64); uint16_t* image = _mm_malloc((WIMAGE*HIMAGE+2*PAD_ZERO)*sizeof(uint16_t), 64); for (int i = 0; i < PAD_ZERO; i++) { /* PAD matrix2 with zero to ease programming the optimization functions. */ image[i] = 0; } image += PAD_ZERO; int read_flag = read_matrix(TEST_FILENAME, newimg_ref, filter, image); if (read_flag == 1) printf("Cannot open test file\n"); else if (read_flag == 2) printf("Error while reading data from test file"); else if (read_flag == 3) printf("Error while closing the test file"); if (read_flag) return 0; uint64_t start = timestamp_us(); matconv_optimize(newimg, filter, image); /* run the optimization functions. */ uint64_t time = timestamp_us() - start; if (compare_matrix(newimg, newimg_ref)) { printf("%lu incorrect\n", time); } else { printf("%lu\n", time); } _mm_free(filter); _mm_free(image-PAD_ZERO); _mm_free(newimg); _mm_free(newimg_ref); return 0; }
/*
 * Max pooling for sample indices [start, end].
 *
 * For every depth slice and every output cell (ax, ay) the window starts at
 * input position (2*ax, 2*ay), spans l->sx cells in x and 2 cells in y, and
 * only cells with both coordinates in [0, 32) are considered. The largest
 * value read through get_vol() (starting from -99999) is written with
 * set_vol(). The window is walked x-major, then y.
 *
 * The elapsed timestamp_us() delta is added to l->myTime.
 */
void pool_forward(pool_layer_t* l, vol_t** in, vol_t** out, int start, int end)
{
    const uint64_t t0 = timestamp_us();

    for (int s = start; s <= end; s++) {
        vol_t *src = in[s];
        vol_t *dst = out[s];

        for (int d = 0; d < l->out_depth; d++) {
            const int out_w = l->out_sx;
            const int out_h = l->out_sy;
            const int win_w = l->sx;

            for (int ax = 0; ax < out_w; ax++) {
                const int x0 = 2 * ax;

                for (int ay = 0; ay < out_h; ay++) {
                    const int y0 = 2 * ay;
                    double best = -99999;

                    for (int fx = 0; fx < win_w; fx++) {
                        const int ox = x0 + fx;
                        if (ox < 0 || ox >= 32) {
                            continue;
                        }

                        for (int fy = 0; fy < 2; fy++) {
                            const int oy = y0 + fy;
                            if (oy < 0 || oy >= 32) {
                                continue;
                            }

                            double v = get_vol(src, ox, oy, d);
                            if (v > best) {
                                best = v;
                            }
                        }
                    }

                    set_vol(dst, ax, ay, d, best);
                }
            }
        }
    }

    l->myTime += timestamp_us() - t0;
}
/*
 * Convolution specialised for depth 3: 16 filters, each 5x5 taps of 3
 * values, applied to a 32x32 grid with the window offset by -2 in x and y
 * and advanced one cell per output position. Taps that fall outside the
 * 32x32 grid are skipped.
 *
 * For every sample index in [start, end] and every output cell, the three
 * products of each in-range tap are accumulated in lanes 0..2 of an AVX
 * register; those lanes are then summed in order, the filter's bias is added
 * and the result is stored with set_vol().
 *
 * The three values are fetched with a masked load. A plain 4-wide load at
 * the last tap (index 24*3) or the last pixel (index (32*31+31)*3) touches
 * one double beyond a 5*5*3 / 32*32*3 element array; lane 3 was never part
 * of the result, so masking it off changes nothing but the memory access.
 *
 * The elapsed timestamp_us() delta is added to l->myTime.
 */
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end)
{
    uint64_t tempTime = timestamp_us();

    /* Arguments run from lane 3 down to lane 0: lanes 0..2 on, lane 3 off. */
    const __m256i lane_mask = _mm256_set_epi64x(0, -1, -1, -1);

    for (int i = start; i <= end; i++) {
        vol_t* V = in[i];
        vol_t* A = out[i];

        for (int d = 0; d < 16; d++) {
            vol_t* f = l->filters[d];
            int y = -2;

            for (int ay = 0; ay < 32; y += 1, ay++) {
                int x = -2;

                for (int ax = 0; ax < 32; x += 1, ax++) {
                    double a = 0.0;
                    __m256d sum = _mm256_setzero_pd();

                    for (int fy = 0; fy < 5; fy++) {
                        int oy = y + fy;

                        for (int fx = 0; fx < 5; fx++) {
                            int ox = x + fx;

                            if (oy >= 0 && oy < 32 && ox >= 0 && ox < 32) {
                                __m256d fv = _mm256_maskload_pd(&(f->w[((5 * fy) + fx) * 3]), lane_mask);
                                __m256d vv = _mm256_maskload_pd(&(V->w[((32 * oy) + ox) * 3]), lane_mask);
                                sum = _mm256_add_pd(_mm256_mul_pd(fv, vv), sum);
                            }
                        }
                    }

                    /* Only the three data lanes contribute to the output. */
                    for (int lane = 0; lane < 3; lane++) {
                        a += sum[lane];
                    }
                    a += l->biases->w[d];

                    set_vol(A, ax, ay, d, a);
                }
            }
        }
    }

    l->myTime += timestamp_us() - tempTime;
}
int main(int argc, char** argv) { int M, N, K; if (argc < 4) { fprintf(stderr, "M, N, K not given, use the default values\n"); M = M_default; N = N_default; K = K_default; } else{ M = atoi(argv[1]); N = atoi(argv[2]); K = atoi(argv[3]); } int incRowA = K * spacingFactor; int incRowB = N * spacingFactor; int incRowC = N * spacingFactor; Dtype* A = (Dtype*)malloc(sizeof(Dtype)*M*incRowA); Dtype* B = (Dtype*)malloc(sizeof(Dtype)*K*incRowB); Dtype* C = (Dtype*)malloc(sizeof(Dtype)*M*incRowC); for(int i = 0; i < M; i++){ for(int j = 0; j < K; j++){ A[i*incRowA+j] = 1; } } for(int i = 0; i < K; i++){ for(int j = 0; j < N; j++){ B[i*incRowB+j] = 1; } } for(int i = 0; i < M; i++){ for(int j = 0; j < N; j++){ C[i*incRowC+j] = 0; } } uint64_t start_time = timestamp_us(); // SimpleMatrixMultiplication( // M, N, K, // A, incRowA, // B, incRowB, // C, incRowC); // cblas_gemm( // M, N, K, // A, incRowA, // B, incRowB, // C, incRowC); // cache_oblivious_matrix_multiplication( // M, N, K, // A, incRowA, // B, incRowB, // C, incRowC); strassen_matrix_multiplication( M, N, K, A, incRowA, B, incRowB, C, incRowC); uint64_t end_time = timestamp_us(); double m_second_taken = (double)(end_time - start_time) / 1000.0; int error = 0; for(int i = 0; i < M; i++){ // // fprintf(stderr, "%d \n", fix16_to_int(M3[i])); for(int j = 0; j < N; j++){ if(C[i*incRowC+j] != K){ error++; fprintf(stderr, "%d %d %d \n", i, j, C[i*incRowC+j]); } // fprintf(stderr, "%d ", (int)C[i*incRowC+j]); } // fprintf(stderr, "\n"); } // print_matrix(C, M, N, N); // printf("M, N, K, error, Time taken \n"); printf("%d, %d, %d, %d, %f \n", M, N, K, error, m_second_taken); }
/*
 * Open a TCP connection to host:port, optionally bounded by a timeout.
 *
 * host   node name or address string handed to getaddrinfo().
 * port   port number; formatted as a numeric service string.
 * to_us  overall budget measured against timestamp_us(); 0 means wait
 *        without a deadline. The deadline is fixed once on entry and is
 *        shared by all addresses tried.
 *
 * Every address returned by getaddrinfo() is tried in order: a socket is
 * created, switched to non-blocking mode, connect() is started and select()
 * waits for the socket to become writable. The outcome is then read from
 * SO_ERROR. On success the file status flags are reset to 0 (which clears
 * O_NONBLOCK) and that socket is returned; on any failure the socket is
 * closed and the next address is tried.
 *
 * Returns the connected descriptor, or -1 if name resolution failed or no
 * address could be connected.
 *
 * Robustness changes in this version: a select() call interrupted by a
 * signal (EINTR) is retried against the same deadline instead of abandoning
 * the address, and descriptors that do not fit in an fd_set are rejected
 * before FD_SET() is used on them.
 *
 * NOTE(review): W/WE/WC/D look like the project's logging macros and tconv()
 * appears to turn the remaining time into a struct timeval - confirm against
 * their definitions.
 */
int connect_socket(const char *host, unsigned short port, unsigned long to_us)
{
    int64_t tsend = to_us ? timestamp_us() + to_us : 0;

    struct addrinfo *ai_list = NULL;
    struct addrinfo hints;
    memset(&hints, 0, sizeof hints);
    hints.ai_family = PF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = 0;
    hints.ai_flags = AI_NUMERICSERV;

    /* "65535" plus the terminator fits exactly. */
    char portstr[6];
    snprintf(portstr, sizeof portstr, "%hu", port);

    int gai = getaddrinfo(host, portstr, &hints, &ai_list);
    if (gai != 0) {
        W("%s", gai_strerror(gai));
        return -1;
    }

    if (!ai_list) {
        W("result address list empty");
        return -1;
    }

    int sck = -1;
    for (struct addrinfo *ai = ai_list; ai; ai = ai->ai_next) {
        errno = 0;
        sck = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
        if (sck < 0) {
            WE("cannot create socket");
            continue;
        }

        /* FD_SET() on a descriptor >= FD_SETSIZE is undefined behaviour. */
        if (sck >= FD_SETSIZE) {
            W("socket descriptor too large for select()");
            close(sck);
            sck = -1;
            continue;
        }

        errno = 0;
        if (fcntl(sck, F_SETFL, O_NONBLOCK) == -1) {
            WE("failed to enable nonblocking mode");
            close(sck);
            sck = -1;
            continue;
        }

        D("set to nonblocking mode, calling connect() now");

        errno = 0;
        int rc = connect(sck, ai->ai_addr, ai->ai_addrlen);
        if (rc == -1 && (errno != EINPROGRESS)) {
            WE("connect() failed");
            close(sck);
            sck = -1;
            continue;
        }

        int opt = 1;
        socklen_t optlen = sizeof opt;

        struct timeval tout;
        tout.tv_sec = 0;
        tout.tv_usec = 0;
        int64_t trem = 0;

        /* Wait until the socket is writable, i.e. the connect attempt ended. */
        for (;;) {
            if (tsend) {
                trem = tsend - timestamp_us();
                if (trem <= 0) {
                    W("timeout reached while in 3WHS");
                    close(sck);
                    sck = -1;
                    goto outer_bot;
                }
                tconv(&tout, &trem, false);
            }

            fd_set fds;
            FD_ZERO(&fds);
            FD_SET(sck, &fds);

            errno = 0;
            rc = select(sck + 1, NULL, &fds, NULL, tsend ? &tout : NULL);
            if (rc < 0) {
                if (errno == EINTR) {
                    /* Interrupted by a signal: recompute the budget, retry. */
                    continue;
                }
                WE("select() failed");
                close(sck);
                sck = -1;
                goto outer_bot;
            }

            if (rc == 1) {
                D("select finished successfully");
                break;
            }
            /* rc == 0: select timed out; the deadline check above decides. */
        }

        if (getsockopt(sck, SOL_SOCKET, SO_ERROR, &opt, &optlen) != 0) {
            W("getsockopt failed");
            close(sck);
            sck = -1;
            continue;
        }

        if (opt == 0) {
            D("socket connected, setting to blocking mode");
            errno = 0;
            if (fcntl(sck, F_SETFL, 0) == -1) {
                WE("failed to clear nonblocking mode");
                close(sck);
                sck = -1;
                continue;
            }
            break;
        } else {
            WC(opt, "could not connect socket (%d)", opt);
            close(sck);
            sck = -1;
            continue;
        }

outer_bot:;
    }

    freeaddrinfo(ai_list);
    return sck;
}