/*
 * Test driver for the matrix-multiplication kernel.
 *
 * Loads both operands and the reference product with read_matrix(), times a
 * single call to matmul_optimize() using timestamp_us(), and prints the
 * elapsed value. " incorrect" is appended when compare_matrix() returns
 * non-zero.
 *
 * Returns 0 after a benchmark run or after a reported read error (as before),
 * and 1 if one of the aligned buffers could not be allocated.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;
	int status = 0;

	double* matA = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
	double* matB = _mm_malloc((WIDTH*HEIGHT)*sizeof(double), 64);
	double* prod = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
	double* prod_ref = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);

	if (matA == NULL || matB == NULL || prod == NULL || prod_ref == NULL) {
		printf("Cannot allocate matrix buffers\n");
		status = 1;
	} else {
		int read_flag = read_matrix(TEST_FILENAME, prod_ref, matA, matB);
		if (read_flag == 1)
			printf("Cannot open test file\n");
		else if (read_flag == 2)
			printf("Error while reading data from test file\n");
		else if (read_flag == 3)
			printf("Error while closing the test file\n");

		if (!read_flag) {
			uint64_t start = timestamp_us();
			matmul_optimize(prod, matA, matB); /* run the optimization functions. */
			uint64_t elapsed = timestamp_us() - start;

			/* uint64_t is not necessarily unsigned long; cast so the
			 * format specifier matches on every platform. */
			if (compare_matrix(prod, prod_ref)) {
				printf("%llu incorrect\n", (unsigned long long)elapsed);
			} else {
				printf("%llu\n", (unsigned long long)elapsed);
			}
		}
	}

	/* Single release point so that error paths do not leak the buffers. */
	if (prod_ref != NULL)
		_mm_free(prod_ref);
	if (prod != NULL)
		_mm_free(prod);
	if (matB != NULL)
		_mm_free(matB);
	if (matA != NULL)
		_mm_free(matA);
	return status;
}
Example #2
0
/*
 * Softmax forward pass for the samples in[start] .. in[end] (inclusive).
 *
 * For each sample the first l->out_depth activations of in[j]->w are turned
 * into values that sum to one and stored in out[j]->w. The maximum
 * activation is subtracted before exponentiation so exp() cannot overflow.
 *
 * All three passes iterate over l->out_depth entries. Previously the max and
 * exp passes were hard-coded to 10 while the normalisation used out_depth,
 * which only agreed when out_depth == 10.
 *
 * The elapsed time measured with timestamp_us() is added to l->myTime.
 * NOTE(review): es[] has MAX_ES slots; this assumes out_depth <= MAX_ES.
 */
void softmax_forward(softmax_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  double es[MAX_ES];

  int outd = l->out_depth;

  for (int j = start; j <= end; j++) {
    vol_t* V = in[j];
    vol_t* A = out[j];

    // compute max activation
    double amax = V->w[0];
    for (int i = 1; i < outd; i++) {
      if (V->w[i] > amax) amax = V->w[i];
    }

    // compute exponentials (carefully to not blow up)
    double esum = 0.0;
    for (int i = 0; i < outd; i++) {
      double e = exp(V->w[i] - amax);
      esum += e;
      es[i] = e;
    }

    // normalize and output to sum to one
    for (int i = 0; i < outd; i++) {
      es[i] /= esum;
      A->w[i] = es[i];
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
Example #3
0
/*
 * Load the weights of a convolution layer from the text file `fn`.
 *
 * File layout as read here: four integers (sx, sy, depth, filters) that must
 * match the layer (checked with assert), then one double per filter weight
 * in d / x / y / z order, then one bias per filter.
 *
 * A file that cannot be opened, or that ends or fails to parse early, is
 * reported on stderr and loading stops. Previously this dereferenced a NULL
 * FILE* or stored uninitialised values. The elapsed time is added to
 * l->myTime on every path.
 */
void conv_load(conv_layer_t* l, const char* fn) {
  uint64_t tempTime2 = timestamp_us();
  int sx, sy, depth, filters;
  const int depth0 = l->out_depth;
  int complete = 0;

  FILE* fin = fopen(fn, "r");
  if (fin == NULL) {
    fprintf(stderr, "conv_load: cannot open %s\n", fn);
    l->myTime += timestamp_us() - tempTime2;
    return;
  }

  if (fscanf(fin, "%d %d %d %d", &sx, &sy, &depth, &filters) != 4)
    goto done;
  assert(sx == l->sx);
  assert(sy == l->sy);
  assert(depth == l->in_depth);
  assert(filters == l->out_depth);

  for (int d = 0; d < depth0; d++)
    for (int x = 0; x < sx; x++)
      for (int y = 0; y < sy; y++)
        for (int z = 0; z < depth; z++) {
          double val;
          if (fscanf(fin, "%lf", &val) != 1)
            goto done;
          set_vol(l->filters[d], x, y, z, val);
        }

  for (int d = 0; d < depth0; d++) {
    double val;
    if (fscanf(fin, "%lf", &val) != 1)
      goto done;
    set_vol(l->biases, 0, 0, d, val);
  }
  complete = 1;

done:
  if (!complete)
    fprintf(stderr, "conv_load: %s is truncated or malformed\n", fn);
  l->myTime += timestamp_us() - tempTime2;
  fclose(fin);
}
Example #4
0
/*
 * Load the weights of a fully-connected layer from the text file `fn`.
 *
 * File layout as read here: two integers (num_inputs, out_depth) that must
 * match the layer (checked with assert), then num_inputs weights for each of
 * the out_depth filters, then one bias per filter.
 *
 * The loops use the layer's own dimensions instead of the former hard-coded
 * 10 x 320, so they stay in step with the asserted header. A file that cannot
 * be opened, or that ends or fails to parse early, is reported on stderr
 * instead of dereferencing NULL or storing uninitialised values. The elapsed
 * time is added to l->myTime on every path.
 */
void fc_load(fc_layer_t* l, const char* fn) {
  uint64_t tempTime = timestamp_us();
  const int outputs = l->out_depth;
  const int inputs = l->num_inputs;
  int complete = 0;

  FILE* fin = fopen(fn, "r");
  if (fin == NULL) {
    fprintf(stderr, "fc_load: cannot open %s\n", fn);
    l->myTime += timestamp_us() - tempTime;
    return;
  }

  int num_inputs;
  int out_depth;
  if (fscanf(fin, "%d %d", &num_inputs, &out_depth) != 2)
    goto done;
  assert(out_depth == l->out_depth);
  assert(num_inputs == l->num_inputs);

  for (int i = 0; i < outputs; i++) {
    for (int d = 0; d < inputs; d++) {
      double val;
      if (fscanf(fin, "%lf", &val) != 1)
        goto done;
      l->filters[i]->w[d] = val;
    }
  }

  for (int i = 0; i < outputs; i++) {
    double val;
    if (fscanf(fin, "%lf", &val) != 1)
      goto done;
    l->biases->w[i] = val;
  }
  complete = 1;

done:
  if (!complete)
    fprintf(stderr, "fc_load: %s is truncated or malformed\n", fn);
  fclose(fin);
  l->myTime += timestamp_us() - tempTime;
}
Example #5
0
/*
 * ReLU forward pass for the samples in[start] .. in[end] (inclusive).
 *
 * Copies in_sx * in_sy * in_depth values from in[j]->w to out[j]->w,
 * replacing every value that is strictly below zero with 0.0. The elapsed
 * time measured with timestamp_us() is added to l->myTime.
 */
void relu_forward(relu_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  const int count = l->in_sx * l->in_sy * l->in_depth;

  for (int sample = start; sample <= end; sample++) {
    for (int k = 0; k < count; k++) {
      double v = in[sample]->w[k];
      if (v < 0.0) {
        v = 0.0;
      }
      out[sample]->w[k] = v;
    }
  }

  l->myTime += timestamp_us() - tempTime;
}
Example #6
0
// Convolution specialised for inputs of depth 20.
//
// For each sample in[start] .. in[end] (inclusive) and each of the 20
// filters, a 5x5 window is moved over the 8x8 input with an offset of -2 in
// both directions. Window positions outside the input are skipped. The 20
// values per position are multiplied four at a time with AVX and accumulated
// in one 4-lane sum, which is reduced, biased and stored with set_vol().
// The elapsed time is added to l->myTime.
void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();

  for (int sample = start; sample <= end; sample++) {
    vol_t* V = in[sample];
    vol_t* A = out[sample];

    for (int d = 0; d < 20; d++) {
      vol_t* f = l->filters[d];

      for (int ay = 0; ay < 8; ay++) {
        const int y = ay - 2;

        for (int ax = 0; ax < 8; ax++) {
          const int x = ax - 2;
          __m256d sum = _mm256_setzero_pd();

          for (int fy = 0; fy < 5; fy++) {
            const int oy = y + fy;
            if (oy < 0 || oy >= 8) {
              continue;
            }
            for (int fx = 0; fx < 5; fx++) {
              const int ox = x + fx;
              if (ox < 0 || ox >= 8) {
                continue;
              }
              const int fbase = ((5 * fy) + fx) * 20;
              const int vbase = ((8 * oy) + ox) * 20;
              // Chunks are visited in ascending order (0, 4, 8, 12, 16) so
              // the accumulation order matches the unrolled form.
              for (int c = 0; c < 20; c += 4) {
                __m256d fvec = _mm256_loadu_pd(&(f->w[fbase + c]));
                __m256d vvec = _mm256_loadu_pd(&(V->w[vbase + c]));
                __m256d prod = _mm256_mul_pd(fvec, vvec);
                sum = _mm256_add_pd(prod, sum);
              }
            }
          }

          // Horizontal reduction, lane 0 first, then the bias.
          double a = 0.0;
          for (int lane = 0; lane < 4; lane++) {
            a += sum[lane];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }

  l->myTime += timestamp_us() - tempTime;
}
Example #7
0
/*
 * Fully-connected forward pass for the samples in[start] .. in[end]
 * (inclusive).
 *
 * Each of the 10 outputs is the dot product of the first 320 input values
 * with the matching filter, plus that output's bias. The elapsed time
 * measured with timestamp_us() is added to l->myTime.
 */
void fc_forward(fc_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();

  for (int sample = start; sample <= end; sample++) {
    vol_t* src = in[sample];
    vol_t* dst = out[sample];

    for (int o = 0; o < 10; o++) {
      vol_t* weights = l->filters[o];
      double acc = 0.0;
      int k = 0;
      while (k < 320) {
        acc += src->w[k] * weights->w[k];
        k++;
      }
      dst->w[o] = acc + l->biases->w[o];
    }
  }

  l->myTime += timestamp_us() - tempTime;
}
Example #8
0
// Perform the classification (this calls into the functions from cnn.c).
//
// samples: n sample indices. samples[i]/10000 selects the batch and
//          samples[i]%10000 the image inside it. On return each entry is
//          overwritten with 0 when the network output is > 0.5, else -1.
// keep_output: when non-NULL, receives the malloc'ed array of n raw outputs
//          (the caller frees it); when NULL the array is freed here.
//
// Returns the time spent in net_classify_cats() in milliseconds, or -1.0 if
// the working buffers could not be allocated (in which case *keep_output,
// if requested, is set to NULL and samples is left untouched).
double run_classification(int* samples, int n, double** keep_output) {
  fprintf(stderr, "Making network...\n");
  network_t* net = load_cnn_snapshot();

  fprintf(stderr, "Loading batches...\n");

  // Batches are loaded lazily: only those referenced by a sample.
  for (int i = 0; i < n; i++) {
    int batch = samples[i]/10000;
    if (batches[batch] == NULL) {
      batches[batch] = load_batch(batch);
    }
  }

  vol_t** input = (vol_t**)malloc(sizeof(vol_t*)*n);
  double* output = (double*)malloc(sizeof(double)*n);

  // malloc(0) may legitimately return NULL, so only treat NULL as a failure
  // when there is something to allocate.
  if (n > 0 && (input == NULL || output == NULL)) {
    fprintf(stderr, "Out of memory while preparing %d samples\n", n);
    free(input);
    free(output);
    free_network(net);
    if (keep_output != NULL)
      *keep_output = NULL;
    return -1.0;
  }

  for (int i = 0; i < n; i++) {
    input[i] = batches[samples[i]/10000][samples[i]%10000];
  }

  fprintf(stderr, "Running classification...\n");
  uint64_t start_time = timestamp_us();
  net_classify_cats(net, input, output, n);
  uint64_t end_time = timestamp_us();

  for (int i = 0; i < n; i++) {
    samples[i] = (output[i] > 0.5) ? 0 : -1;
  }

  double dt = (double)(end_time-start_time) / 1000.0;
  fprintf(stderr, "TIME: %lf ms\n", dt);

  free_network(net);
  free(input);

  if (keep_output == NULL)
    free(output);
  else
    *keep_output = output;

  return dt;
}
/*
 * Test driver for the image-convolution kernel.
 *
 * The image buffer is allocated with PAD_ZERO extra elements on each side.
 * Both pads are zeroed here and the kernel receives a pointer just past the
 * leading pad. Previously only the leading pad was initialised, leaving the
 * trailing pad with indeterminate contents.
 *
 * Loads the filter, image and reference result with read_matrix(), times a
 * single call to matconv_optimize() and prints the elapsed value, followed
 * by " incorrect" when compare_matrix() returns non-zero.
 *
 * Returns 0 after a benchmark run or after a reported read error (as before),
 * and 1 if one of the aligned buffers could not be allocated.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;
	int status = 0;

	uint64_t* newimg = _mm_malloc(WIMAGE*HIMAGE*sizeof(uint64_t), 64);
	uint64_t* newimg_ref = _mm_malloc(WIMAGE*HIMAGE*sizeof(uint64_t), 64);
	uint16_t* filter = _mm_malloc(WFILTER*HFILTER*sizeof(uint16_t), 64);
	/* Keep the base pointer: it is what must be handed back to _mm_free(). */
	uint16_t* image_base = _mm_malloc((WIMAGE*HIMAGE+2*PAD_ZERO)*sizeof(uint16_t), 64);

	if (newimg == NULL || newimg_ref == NULL || filter == NULL || image_base == NULL) {
		printf("Cannot allocate image buffers\n");
		status = 1;
	} else {
		/* Zero the pad on both sides to ease programming the optimization functions. */
		for (int i = 0; i < PAD_ZERO; i++) {
			image_base[i] = 0;
			image_base[PAD_ZERO + WIMAGE*HIMAGE + i] = 0;
		}
		uint16_t* image = image_base + PAD_ZERO;

		int read_flag = read_matrix(TEST_FILENAME, newimg_ref, filter, image);
		if (read_flag == 1)
			printf("Cannot open test file\n");
		else if (read_flag == 2)
			printf("Error while reading data from test file\n");
		else if (read_flag == 3)
			printf("Error while closing the test file\n");

		if (!read_flag) {
			uint64_t start = timestamp_us();
			matconv_optimize(newimg, filter, image); /* run the optimization functions. */
			uint64_t elapsed = timestamp_us() - start;

			/* uint64_t is not necessarily unsigned long; cast so the
			 * format specifier matches on every platform. */
			if (compare_matrix(newimg, newimg_ref)) {
				printf("%llu incorrect\n", (unsigned long long)elapsed);
			} else {
				printf("%llu\n", (unsigned long long)elapsed);
			}
		}
	}

	/* Single release point so that error paths do not leak the buffers. */
	if (filter != NULL)
		_mm_free(filter);
	if (image_base != NULL)
		_mm_free(image_base);
	if (newimg != NULL)
		_mm_free(newimg);
	if (newimg_ref != NULL)
		_mm_free(newimg_ref);
	return status;
}
Example #10
0
/*
 * Max-pooling forward pass for the samples in[start] .. in[end] (inclusive).
 *
 * For every depth slice and every output cell (ax, ay), the window starts at
 * input position (2*ax, 2*ay), spans l->sx columns and 2 rows, and only
 * positions inside [0, 32) in both directions are considered. The largest
 * value found (starting from -99999) is stored with set_vol(). The elapsed
 * time measured with timestamp_us() is added to l->myTime.
 */
void pool_forward(pool_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();

  for (int sample = start; sample <= end; sample++) {
    vol_t* V = in[sample];
    vol_t* A = out[sample];

    for (int d = 0; d < l->out_depth; d++) {
      const int out_w = l->out_sx;
      const int out_h = l->out_sy;
      const int win_w = l->sx;

      for (int ax = 0; ax < out_w; ax++) {
        const int x0 = 2 * ax;

        for (int ay = 0; ay < out_h; ay++) {
          const int y0 = 2 * ay;
          double best = -99999;

          for (int fx = 0; fx < win_w; fx++) {
            const int ox = x0 + fx;
            for (int fy = 0; fy < 2; fy++) {
              const int oy = y0 + fy;
              if (oy < 0 || oy >= 32 || ox < 0 || ox >= 32) {
                continue;
              }
              double v = get_vol(V, ox, oy, d);
              if (v > best) {
                best = v;
              }
            }
          }

          set_vol(A, ax, ay, d, best);
        }
      }
    }
  }

  l->myTime += timestamp_us() - tempTime;
}
Example #11
0
/*
 * Convolution specialised for inputs of depth 3.
 *
 * For each sample in[start] .. in[end] (inclusive) and each of the 16
 * filters, a 5x5 window is moved over the 32x32 input with an offset of -2
 * in both directions. Window positions outside the input are skipped. The
 * three values per position are multiplied in one AVX operation; lanes 0..2
 * of the running sum are then reduced, biased and stored with set_vol().
 * The elapsed time is added to l->myTime.
 *
 * Loads are masked to three lanes. A plain 4-wide load at the last filter
 * position (index 72 of 5*5*3) or the last pixel (index 3069 of 32*32*3)
 * touches one double beyond those index ranges, which is an out-of-bounds
 * read if the buffers are sized exactly. The masked-off lane reads as 0.0
 * and was never part of the result, so the output is unchanged.
 */
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  // _mm256_set_epi64x takes the highest lane first: lane 3 off, lanes 0..2 on.
  const __m256i three_lanes = _mm256_set_epi64x(0, -1, -1, -1);

  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 16; d++) {
      vol_t* f = l->filters[d];
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 32; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 32; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 32 && ox >=0 && ox < 32) {
                __m256d vector = _mm256_maskload_pd(&(f->w[((5 * fy)+fx)*3]), three_lanes);
                __m256d vector2 = _mm256_maskload_pd(&(V->w[((32 * oy)+ox)*3]), three_lanes);
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
              }
            }
          }
          // Only the three loaded lanes carry data.
          for(int lane = 0; lane < 3; lane++) {
            a+= sum[lane];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
Example #12
0
/*
 * GEMM benchmark driver.
 *
 * Usage: prog [M N K]. With fewer than three arguments the compiled-in
 * defaults are used. A (M x K) and B (K x N) are filled with ones and C
 * (M x N) with zeros, so every entry of the product must equal K. Each
 * mismatch is printed to stderr and counted.
 *
 * Prints "M, N, K, error, time-in-ms" on stdout. Returns 0 after a run, 1 on
 * non-positive dimensions or allocation failure.
 */
int main(int argc, char** argv) {
  int M, N, K;
  if (argc < 4) {
    fprintf(stderr, "M, N, K not given, use the default values\n");
    M = M_default;
    N = N_default;
    K = K_default;
  }
  else{
    M = atoi(argv[1]);
    N = atoi(argv[2]);
    K = atoi(argv[3]);
  }

  /* atoi() yields 0 for garbage and accepts negatives; either would turn
   * into a zero-sized or huge allocation below. */
  if (M <= 0 || N <= 0 || K <= 0) {
    fprintf(stderr, "M, N, K must be positive integers\n");
    return 1;
  }

  /* Row strides; spacingFactor leaves a gap between consecutive rows. */
  int incRowA = K * spacingFactor;
  int incRowB = N * spacingFactor;
  int incRowC = N * spacingFactor;

  Dtype* A = (Dtype*)malloc(sizeof(Dtype)*M*incRowA);
  Dtype* B = (Dtype*)malloc(sizeof(Dtype)*K*incRowB);
  Dtype* C = (Dtype*)malloc(sizeof(Dtype)*M*incRowC);
  if (A == NULL || B == NULL || C == NULL) {
    fprintf(stderr, "Out of memory\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  for(int i = 0; i < M; i++){
    for(int j = 0; j < K; j++){
      A[i*incRowA+j] = 1;
    }
  }

  for(int i = 0; i < K; i++){
    for(int j = 0; j < N; j++){
      B[i*incRowB+j] = 1;
    }
  }

  for(int i = 0; i < M; i++){
    for(int j = 0; j < N; j++){
      C[i*incRowC+j] = 0;
    }
  }

  /* Kernel under test. SimpleMatrixMultiplication, cblas_gemm and
   * cache_oblivious_matrix_multiplication were previously swapped in here
   * with the same argument list. */
  uint64_t start_time = timestamp_us();
  strassen_matrix_multiplication(
          M, N, K,
          A, incRowA,
          B, incRowB,
          C, incRowC);
  uint64_t end_time = timestamp_us();
  double m_second_taken = (double)(end_time - start_time) / 1000.0;

  int error = 0;
  for(int i = 0; i < M; i++){
    for(int j = 0; j < N; j++){
      if(C[i*incRowC+j] != K){
        error++;
        /* Cast so the argument matches %d whatever Dtype is. */
        fprintf(stderr, "%d %d %d \n", i, j, (int)C[i*incRowC+j]);
      }
    }
  }

  printf("%d, %d, %d, %d, %f \n", M, N, K, error, m_second_taken);

  free(A);
  free(B);
  free(C);
  return 0;
}
Example #13
0
/*
 * Open a TCP connection to host:port, trying every address returned by
 * getaddrinfo() in order until one connects.
 *
 * host:  node name or address passed to getaddrinfo().
 * port:  numeric port (AI_NUMERICSERV is set).
 * to_us: timeout in microseconds, 0 for none. The deadline is computed once
 *        on entry and is shared by all address attempts.
 *
 * The connect is performed in non-blocking mode and completion is awaited
 * with select(); on success the file status flags are reset with
 * fcntl(F_SETFL, 0) before returning.
 *
 * Returns the connected socket descriptor, or -1 if resolution failed or no
 * address could be connected.
 */
int
connect_socket(const char *host, unsigned short port, unsigned long to_us)
{
	int64_t tsend = to_us ? timestamp_us() + to_us : 0;
	struct addrinfo *ai_list = NULL;
	struct addrinfo hints;
	memset(&hints, 0, sizeof hints);
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;
	hints.ai_flags = AI_NUMERICSERV;
	char portstr[6]; /* "65535" plus the terminating NUL */
	snprintf(portstr, sizeof portstr, "%hu", port);

	int r = getaddrinfo(host, portstr, &hints, &ai_list);

	if (r != 0) {
		W("%s", gai_strerror(r));
		return -1;
	}

	if (!ai_list) {
		W("result address list empty");
		return -1;
	}

	int sck = -1;
	for (struct addrinfo *ai = ai_list; ai; ai = ai->ai_next)
	{
		errno = 0;
		sck = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
		if (sck < 0) {
			WE("cannot create socket");
			continue;
		}

		/* FD_SET() on a descriptor >= FD_SETSIZE is undefined behaviour. */
		if (sck >= FD_SETSIZE) {
			W("socket descriptor %d too large for select()", sck);
			close(sck);
			sck = -1;
			continue;
		}

		errno = 0;
		if (fcntl(sck, F_SETFL, O_NONBLOCK) == -1) {
			WE("failed to enable nonblocking mode");
			close(sck);
			sck = -1;
			continue;
		}

		D("set to nonblocking mode, calling connect() now");
		errno = 0;
		int rc = connect(sck, ai->ai_addr, ai->ai_addrlen);

		/* EINPROGRESS is the expected result of a non-blocking connect. */
		if (rc == -1 && (errno != EINPROGRESS)) {
			WE("connect() failed");
			close(sck);
			sck = -1;
			continue;
		}

		int opt = 1;
		socklen_t optlen = sizeof opt;
		struct timeval tout;
		tout.tv_sec = 0;
		tout.tv_usec = 0;
		int64_t trem = 0;

		/* Wait until the socket becomes writable, i.e. the connection
		 * attempt has finished one way or the other. */
		for(;;) {
			if (tsend)
			{
				trem = tsend - timestamp_us();
				if (trem <= 0) {
					W("timeout reached while in 3WHS");
					close(sck);
					sck = -1;
					goto outer_bot;
				}

				/* presumably fills tout from the remaining
				 * microseconds; tconv() is defined elsewhere */
				tconv(&tout, &trem, false);
			}

			fd_set fds;
			FD_ZERO(&fds);
			FD_SET(sck, &fds);

			errno = 0;
			rc = select(sck+1, NULL, &fds, NULL, tsend ? &tout : NULL);
			if (rc < 0)
			{
				/* A signal interrupted the wait; retry with a
				 * freshly computed remaining timeout. */
				if (errno == EINTR)
					continue;

				WE("select() failed");
				close(sck);
				sck = -1;
				goto outer_bot;
			}
			if (rc == 1) {
				D("select finished successfully");
				break;
			}
		}

		/* Writability alone does not mean success; SO_ERROR holds the
		 * outcome of the connection attempt. */
		if (getsockopt(sck, SOL_SOCKET, SO_ERROR, &opt, &optlen) != 0) {
			W("getsockopt failed");
			close(sck);
			sck = -1;
			continue;
		}

		if (opt == 0) {
			D("socket connected, setting to blocking mode");
			errno = 0;
			if (fcntl(sck, F_SETFL, 0) == -1) {
				WE("failed to clear nonblocking mode");
				close(sck);
				sck = -1;
				continue;
			}

			break;
		} else {
			WC(opt, "could not connect socket (%d)", opt);
			close(sck);
			sck = -1;
			continue;
		}
outer_bot:;
	}

	freeaddrinfo(ai_list);

	return sck;
}