static __device__ __forceinline__ int syncthreadsOr(int pred) { #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200) // just campilation stab return 0; #else return __syncthreads_or(pred); #endif }
__global__ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex) { // Basic coordinates const int base_x = (blockIdx.x * blockDim.x * n_per_thread) + threadIdx.x; const int base_y = (blockIdx.y * blockDim.y * n_per_thread) + threadIdx.y; const int width = equiv_map.dims[0]; const int height = equiv_map.dims[1]; bool tid_changed = false; // Per element write flags and label, initially 0 char write[n_per_thread * n_per_thread]; T best_label[n_per_thread * n_per_thread]; #pragma unroll for (int i = 0; i < n_per_thread * n_per_thread; ++i) { write[i] = (char)0; best_label[i] = (T)0; } // Cached tile of the equivalency map __shared__ T s_tile[n_per_thread*block_dim][(n_per_thread*block_dim)]; #pragma unroll for (int xb = 0; xb < n_per_thread; ++xb) { #pragma unroll for (int yb = 0; yb < n_per_thread; ++yb) { // Indexing variables const int x = base_x + (xb * blockDim.x); const int y = base_y + (yb * blockDim.y); const int tx = threadIdx.x + (xb * blockDim.x); const int ty = threadIdx.y + (yb * blockDim.y); const int tid_i = xb * n_per_thread + yb; const int n = y * width + x; // Get the label for this pixel if we're in bounds const T orig_label = (x < width && y < height) ? fetch<T>(n, equiv_map, tex) : (T)0; s_tile[ty][tx] = orig_label; // Find the lowest label of the nearest valid pixel // So far, all we know is that this pixel is valid. best_label[tid_i] = orig_label; if (orig_label != (T)0) { const int south_y = min(y, height-2) + 1; const int north_y = max(y, 1) - 1; const int east_x = min(x, width-2) + 1; const int west_x = max(x, 1) - 1; // Check bottom best_label[tid_i] = relabel(best_label[tid_i], fetch((south_y) * width + x, equiv_map, tex)); // Check right neighbor best_label[tid_i] = relabel(best_label[tid_i], fetch(y * width + east_x, equiv_map, tex)); // Check left neighbor best_label[tid_i] = relabel(best_label[tid_i], fetch(y * width + west_x, equiv_map, tex)); // Check top neighbor best_label[tid_i] = relabel(best_label[tid_i], fetch((north_y) * width + x, equiv_map, tex)); if (full_conn) { // Check NW corner best_label[tid_i] = relabel(best_label[tid_i], fetch((north_y) * width + west_x, equiv_map, tex)); // Check NE corner best_label[tid_i] = relabel(best_label[tid_i], fetch((north_y) * width + east_x, equiv_map, tex)); // Check SW corner best_label[tid_i] = relabel(best_label[tid_i], fetch((south_y) * width + west_x, equiv_map, tex)); // Check SE corner best_label[tid_i] = relabel(best_label[tid_i], fetch((south_y) * width + east_x, equiv_map, tex)); } // if connectivity == 8 } // if orig_label != 0 // Process the equivalency list. T last_label = orig_label; T new_label = best_label[tid_i]; while (best_label[tid_i] != (T)0 && new_label < last_label) { last_label = new_label; new_label = fetch(new_label - (T)1, equiv_map, tex); } if (orig_label != new_label) { tid_changed = true; s_tile[ty][tx] = new_label; write[tid_i] = (char)1; } best_label[tid_i] = new_label; } } bool continue_iter = __syncthreads_or((int)tid_changed); // Iterate until no pixel in the tile changes while (continue_iter) { // Reset whether or not this thread's pixels have changed. tid_changed = false; #pragma unroll for (int xb = 0; xb < n_per_thread; ++xb) { #pragma unroll for (int yb = 0; yb < n_per_thread; ++yb) { // Indexing const int tx = threadIdx.x + (xb * blockDim.x); const int ty = threadIdx.y + (yb * blockDim.y); const int tid_i = xb * n_per_thread + yb; T last_label = best_label[tid_i]; if (best_label[tid_i] != 0) { const int north_y = max(ty, 1) -1; const int south_y = min(ty, n_per_thread*block_dim - 2) +1; const int east_x = min(tx, n_per_thread*block_dim - 2) +1; const int west_x = max(tx, 1) -1; // Check bottom best_label[tid_i] = relabel(best_label[tid_i], s_tile[south_y][tx]); // Check right neighbor best_label[tid_i] = relabel(best_label[tid_i], s_tile[ty][east_x]); // Check left neighbor best_label[tid_i] = relabel(best_label[tid_i], s_tile[ty][west_x]); // Check top neighbor best_label[tid_i] = relabel(best_label[tid_i], s_tile[north_y][tx]); if (full_conn) { // Check NW corner best_label[tid_i] = relabel(best_label[tid_i], s_tile[north_y][west_x]); // Check NE corner best_label[tid_i] = relabel(best_label[tid_i], s_tile[north_y][east_x]); // Check SW corner best_label[tid_i] = relabel(best_label[tid_i], s_tile[south_y][west_x]); // Check SE corner best_label[tid_i] = relabel(best_label[tid_i], s_tile[south_y][east_x]); } // if connectivity == 8 // This thread's value changed during this iteration if the // best label is not the same as the last label. const bool changed = best_label[tid_i] != last_label; write[tid_i] = write[tid_i] || changed; tid_changed = tid_changed || changed; } } } // Done looking at neighbors for this iteration continue_iter = __syncthreads_or((int)tid_changed); // If we have to continue iterating, update the tile of the // equiv map in shared memory if (continue_iter) { #pragma unroll for (int xb = 0; xb < n_per_thread; ++xb) { #pragma unroll for (int yb = 0; yb < n_per_thread; ++yb) { const int tx = threadIdx.x + (xb * blockDim.x); const int ty = threadIdx.y + (yb * blockDim.y); const int tid_i = xb * n_per_thread + yb; // Update tile in shared memory s_tile[ty][tx] = best_label[tid_i]; } } __syncthreads(); } } // while (continue_iter) // Write out equiv_map #pragma unroll for (int xb = 0; xb < n_per_thread; ++xb) { #pragma unroll for (int yb = 0; yb < n_per_thread; ++yb) { const int x = base_x + (xb * blockDim.x); const int y = base_y + (yb * blockDim.y); const int n = y * width + x; const int tid_i = xb * n_per_thread + yb; if (x < width && y < height && write[tid_i]) { equiv_map.ptr[n] = best_label[tid_i]; continue_flag = 1; } } } }