int main_demo_kmeans(std::vector<std::string> &args, unsigned int i) { int image_size = 600; int num_centers = 2; if (args.size() <= i + 1) { std::cout << "Usage: " << my::collection::args_getBinaryName(args) << std::endl; std::cout << " -kmeans -demo" << std::endl; std::cout << " -numCenters [num] Default=" << num_centers << std::endl; std::cout << " -imageSize [pixels] Default=" << image_size << std::endl; return EXIT_FAILURE; } while (i < args.size()) { if (my::collection::is_next_arg_equal("-numCenters", args, i)) num_centers = my::collection::next_arg_int(args, i); else if (my::collection::is_next_arg_equal("-image_size", args, i)) image_size = my::collection::next_arg_int(args, i); else throw std::runtime_error( "unknown parameter " + my::collection::next_arg(args, i)); } my::assert::greaterInt("numCenters", num_centers, 0); my::assert::greaterInt("imageSize", image_size, 0); MknnDataset *dataset1 = generate_dataset(image_size); MknnDistance *distance = mknn_distance_newPredefined( mknn_predefDistance_L2(), true); MknnDataset *dataset; if (false) { dataset = mknn_datasetLoader_reorderNearestNeighbor(dataset1, distance, -1, true); } else { dataset = dataset1; } MknnKmeansAlgorithm *kmeans = mknn_kmeans_new(); mknn_kmeans_setDataset(kmeans, dataset); mknn_kmeans_setDistance(kmeans, distance); mknn_kmeans_setNumCentroids(kmeans, num_centers); State state; state.dataset = dataset; state.image_size = image_size; mknn_kmeans_setIterationCallBackFunction(kmeans, callback_method, &state); mknn_kmeans_perform(kmeans); printDistanceMatrix(kmeans); return EXIT_SUCCESS; }
/** call: ./main <matrix_dimension> <number_of_tests> <use_gpu>*/ int main(int argc, char* argv[]) { cuda_identify(); if (argc != 4) { printf("program must be called with arguments: matrix_dimension tests_number use_gpu(0/1)\n"); exit(1); } const int M = atoi(argv[1]); printf("Using matrix dimension: %d\n", M); const int tests = atoi(argv[2]); const bool cpu = !atoi(argv[3]); // always use the same seed to get the same matrices during tests srand(0); #ifdef DOUBLE const fp_t min_diff = 0.00000001; //for double, fails with 8192 and floats on both cpu and gpu #else const fp_t min_diff = 0.000001; #endif const fp_t alpha = 0.9; const int max_iter = 50; fp_t* exec_times = malloc(tests * sizeof(fp_t)); fp_t* all_rmse = malloc(tests * sizeof(fp_t)); for (int k = 0; k < tests; k++) { const DataSet dataset = generate_dataset(M); Matrix* last_x = aligned_vector(M, true); Matrix* x = aligned_vector(M, true); for (int i = 0; i < M; i++) { } int iterations = 0; // solve Ax = b const fp_t start_time = omp_get_wtime(); fp_t sum = 0; int j = 0; int i = 0; const Matrix* A = dataset.A; const Matrix* b = dataset.b; assert(x != last_x); if (cpu) { //#pragma omp parallel shared(last_x, x, iterations) private(i, j, sum) while ((matrix_diff(x, last_x) > min_diff) && (max_iter < 0 || iterations < max_iter)) { //fp_t st_time0 = omp_get_wtime(); //#pragma omp single { swap(last_x, x); } // A, M, alpha and b are constant, so they cannot be declared as shared //#pragma omp for schedule(dynamic) for (i = 0; i < M; i++) { sum = 0; //#pragma omp simd aligned(A, last_x: 16) reduction(+:sum) linear(j) for (j = 0; j < M; j++) { sum += A->elements[i * M + j] * last_x->elements[j]; } sum -= A->elements[i * M + i] * last_x->elements[i]; // opt: outside the loop for sse optimizer x->elements[i] = (1 - alpha) * last_x->elements[i] + alpha * (b->elements[i] - sum) / A->elements[i * M + i]; } //#pragma omp single nowait { iterations++; } //printf("%dus spent\n", (int)((omp_get_wtime() - st_time0) * 1000000)); } } else { Matrix* d_A = device_matrix_from(A); #ifndef DOUBLE #ifdef TEXTURE texbind(d_A->elements, d_A->size * sizeof(fp_t)); #endif #endif cudaMemcpy(d_A->elements, A->elements, A->size * sizeof(fp_t), cudaMemcpyHostToDevice); Matrix* d_b = device_matrix_from(b); cudaMemcpy(d_b->elements, b->elements, b->size * sizeof(fp_t), cudaMemcpyHostToDevice); Matrix* d_last_x = device_matrix_from(last_x); Matrix* d_c = device_matrix_from(b); Matrix* d_x = device_matrix_from(x); cudaMemcpy(d_x->elements, x->elements, x->size * sizeof(fp_t), cudaMemcpyHostToDevice); cudaMemcpy(d_last_x->elements, last_x->elements, last_x->size * sizeof(fp_t), cudaMemcpyHostToDevice); fp_t x_diff = 2 * min_diff; fp_t* d_x_diff; cudaMalloc((void**)&d_x_diff, sizeof(fp_t)); //fp_t stime; while ((x_diff > min_diff) && (max_iter < 0 || iterations < max_iter)) { //stime = omp_get_wtime(); cuda_multiply(*d_A, *d_last_x, *d_c); //print_cuda_elapsed(stime); //stime = omp_get_wtime(); cuda_reduce(*d_A, *d_b, *d_c, d_x, d_last_x, alpha); //performs swap //print_cuda_elapsed(stime); //stime = omp_get_wtime(); cuda_diff(*d_x, *d_last_x, d_x_diff); //print_cuda_elapsed(stime); iterations++; //cudaMemcpyFromSymbol(&x_diff, "d_x_diff", sizeof(x_diff), 0, cudaMemcpyDeviceToHost); //stime = omp_get_wtime(); cudaMemcpy(&x_diff, d_x_diff, sizeof(fp_t), cudaMemcpyDeviceToHost); //print_cuda_elapsed(stime); } // copy last_x instead, as it was swapped cudaMemcpy(x->elements, d_last_x->elements, x->size * sizeof(fp_t), cudaMemcpyDeviceToHost); #ifndef DOUBLE #ifdef TEXTURE texunbind(); #endif #endif cudaFree(d_A->elements); cudaFree(d_b->elements); cudaFree(d_last_x->elements); cudaFree(d_c->elements); cudaFree(d_x->elements); cudaFree(d_x_diff); free(d_A); free(d_b); free(d_c); free(d_last_x); free(d_x); } const fp_t end_time = omp_get_wtime(); const fp_t seconds_spent = end_time - start_time; exec_times[k] = seconds_spent; if (verbose) { printf("x: "); print_matrix(x); printf("expected_x: "); print_matrix(dataset.x); //print_matrix(dataset.A); //print_matrix(dataset.b); } Matrix* bx = aligned_vector(M, false); for (int i = 0; i < M; i++) { for (int j = 0; j < M; j++) { bx->elements[i] += A->elements[i * M + j] * x->elements[j]; } } if (verbose) { printf("resulting b: "); print_matrix(bx); } all_rmse[k] = rmse(bx, b); printf("RMSE: %0.10f\n", all_rmse[k]); printf("iterations: %d\nseconds: %0.10f\n", iterations, seconds_spent); assert(x != last_x); free(bx->elements); free(x->elements); free(last_x->elements); free(dataset.x->elements); free(dataset.A->elements); free(dataset.b->elements); free(bx); free(x); free(last_x); free(dataset.x); free(dataset.A); free(dataset.b); } printf("Time: mean %0.10f std %0.10f\n", array_mean(exec_times, tests), array_std(exec_times, tests)); printf("RMSE: mean %0.10f std %0.10f\n", array_mean(all_rmse, tests), array_std(all_rmse, tests)); free(all_rmse); free(exec_times); return 0; }