// CPU threads----------------------------------------------------------------- void run_cpu_threads(XYZ *in, XYZ *outp, int n_tasks, float alpha, int n_threads, int n_gpu_threads, int in_size_i, int in_size_j, int out_size_i, int out_size_j #ifdef CUDA_8_0 , std::atomic_int *worklist #endif ) { std::vector<std::thread> cpu_threads; for(int k = 0; k < n_threads; k++) { cpu_threads.push_back(std::thread([=]() { #ifdef CUDA_8_0 Partitioner p = partitioner_create(n_tasks, alpha, k, n_threads, worklist); #else Partitioner p = partitioner_create(n_tasks, alpha, k, n_threads); #endif const int wg_in_J = divceil(out_size_j, n_gpu_threads); const int wg_in_I = divceil(out_size_i, n_gpu_threads); for(int t = cpu_first(&p); cpu_more(&p); t = cpu_next(&p)) { const int my_s1 = t / wg_in_J; const int my_s0 = t % wg_in_J; int Row = my_s1 * n_gpu_threads; int Col = my_s0 * n_gpu_threads; T bi; T bj; T mui, muj; for(int i = Row; i < Row + n_gpu_threads; i++) { mui = i / (T)(out_size_i - 1); for(int j = Col; j < Col + n_gpu_threads; j++) { muj = j / (T)(out_size_j - 1); if(i < out_size_i && j < out_size_j) { XYZ out = {0, 0, 0}; #pragma unroll for(int ki = 0; ki <= in_size_i; ki++) { bi = BezierBlend(ki, mui, in_size_i); #pragma unroll for(int kj = 0; kj <= in_size_j; kj++) { bj = BezierBlend(kj, muj, in_size_j); out.x += (in[ki * (in_size_j + 1) + kj].x * bi * bj); out.y += (in[ki * (in_size_j + 1) + kj].y * bi * bj); out.z += (in[ki * (in_size_j + 1) + kj].z * bi * bj); } } outp[i * out_size_j + j] = out; } } } } })); } std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); }); }
void MATRIX_init() { __HAL_RCC_GPIOA_CLK_ENABLE(); __HAL_RCC_GPIOB_CLK_ENABLE(); __HAL_RCC_GPIOC_CLK_ENABLE(); for (uint8_t i = 0; i < nrows; ++i) { IO_config(rows[i], OUTPUT); } for (uint8_t i = 0; i < ncols; ++i) { IO_config(cols[i], INPUT); } states = malloc(divceil(nrows*ncols, 8)); memset(states, 0, divceil(nrows*ncols, 8)); last_scancode = malloc(nrows*ncols); }