// Logs current heap usage via the platform allocator's statistics API, then
// triggers the project's peak-usage / consistency / stats reports.
// Compiles to a no-op unless both DEBUG_EASDK and EASDK_DEBUG_LEVEL_DEBUG are
// defined at build time.
void print_used_memory(void) {
#if defined DEBUG_EASDK && defined EASDK_DEBUG_LEVEL_DEBUG
#if defined __APPLE__
  struct mstats stat;
  stat = mstats();
  // bytes_used is size_t on macOS; the previous %d specifier read only the
  // low 32 bits of a 64-bit vararg on LP64 (undefined behavior / truncated
  // output). %zu is the correct size_t conversion.
  debug_log(DEBUG_MEM, ("Memstat: %zu\n", stat.bytes_used));
#elif defined __linux__
  struct mallinfo mi;
  mi = mallinfo();
  // struct mallinfo fields are plain int, so %d is correct here; note the
  // counters wrap for heaps larger than 2 GiB (glibc >= 2.33 provides
  // mallinfo2 with size_t fields if that ever matters).
  debug_log(DEBUG_MEM, ("Memstat: %d(/%d)\n", mi.uordblks, mi.fordblks));
#endif
  print_peak_memory_usage();
  check_memory();
  print_memory_stats();
#endif
}
// Host entry point: runs the complete K-means pipeline (argument validation,
// device selection, buffer allocation, centroid initialization, Yinyang
// Lloyd iterations) on one CUDA device and copies the results back.
//
// Arguments:
//   kmpp          - selects the centroid init method; cast below to
//                   KMCUDAInitMethod, so false/true map to the first two
//                   enum values.
//   tolerance     - convergence threshold forwarded to kmeans_cuda_yy().
//   yinyang_t     - Yinyang group fraction; groups = yinyang_t * clusters_size
//                   (truncated), and 0 groups disables the Yinyang path.
//   samples_size  - number of samples (rows).
//   features_size - number of features per sample (columns).
//   clusters_size - number of clusters (K).
//   seed          - RNG seed for centroid initialization.
//   device        - CUDA device ordinal passed to cudaSetDevice().
//   verbosity     - logging level; > 1 additionally prints memory stats.
//   samples       - host input, samples_size * features_size floats.
//   centroids     - host output, clusters_size * features_size floats.
//   assignments   - host output, samples_size uint32 cluster labels.
//
// Returns kmcudaSuccess, or the first error code encountered.
//
// NOTE(review): CUMALLOC / CUMEMCPY / RETERR appear to be early-return error
// macros and unique_devptr an RAII owner that frees its pointer at scope
// exit — confirm against their definitions elsewhere in the project. The
// sentinel declarations below rely on those semantics.
int kmeans_cuda(bool kmpp, float tolerance, float yinyang_t,
                uint32_t samples_size, uint16_t features_size,
                uint32_t clusters_size, uint32_t seed, uint32_t device,
                int32_t verbosity, const float *samples, float *centroids,
                uint32_t *assignments) {
  DEBUG("arguments: %d %.3f %.2f %" PRIu32 " %" PRIu16 " %" PRIu32
        " %" PRIu32 " %" PRIu32 " %" PRIi32 " %p %p %p\n",
        kmpp, tolerance, yinyang_t, samples_size, features_size,
        clusters_size, seed, device, verbosity, samples, centroids,
        assignments);
  // Validate sizes and pointers before touching the device.
  auto check_result = check_args(
      tolerance, yinyang_t, samples_size, features_size, clusters_size,
      samples, centroids, assignments);
  if (check_result != kmcudaSuccess) {
    return check_result;
  }
  if (cudaSetDevice(device) != cudaSuccess) {
    return kmcudaNoSuchDevice;
  }
  void *device_samples;
  // Widen to size_t *before* multiplying so the byte count cannot overflow
  // 32-bit arithmetic for large sample matrices.
  size_t device_samples_size = samples_size;
  device_samples_size *= features_size * sizeof(float);
  CUMALLOC(device_samples, device_samples_size, "samples");
  CUMEMCPY(device_samples, samples, device_samples_size,
           cudaMemcpyHostToDevice);
  unique_devptr device_samples_sentinel(device_samples);
  void *device_centroids;
  size_t centroids_size = clusters_size * features_size * sizeof(float);
  CUMALLOC(device_centroids, centroids_size, "centroids");
  unique_devptr device_centroids_sentinel(device_centroids);
  void *device_assignments;
  size_t assignments_size = samples_size * sizeof(uint32_t);
  CUMALLOC(device_assignments, assignments_size, "assignments");
  unique_devptr device_assignments_sentinel(device_assignments);
  // Previous-iteration assignments (same size as the current ones).
  void *device_assignments_prev;
  CUMALLOC(device_assignments_prev, assignments_size, "assignments_prev");
  unique_devptr device_assignments_prev_sentinel(device_assignments_prev);
  // Per-cluster sample counts.
  void *device_ccounts;
  CUMALLOC(device_ccounts, clusters_size * sizeof(uint32_t), "ccounts");
  unique_devptr device_ccounts_sentinel(device_ccounts);
  // Float-to-int truncation toward zero: any yinyang_t small enough that the
  // product is < 1 yields 0 groups and skips the whole Yinyang setup below.
  uint32_t yinyang_groups = yinyang_t * clusters_size;
  DEBUG("yinyang groups: %" PRIu32 "\n", yinyang_groups);
  void *device_assignments_yy = NULL, *device_bounds_yy = NULL,
       *device_drifts_yy = NULL, *device_passed_yy = NULL,
       *device_centroids_yy = NULL;
  if (yinyang_groups >= 1) {
    // One group label per cluster.
    CUMALLOC(device_assignments_yy, clusters_size * sizeof(uint32_t),
             "yinyang assignments");
    // (yinyang_groups + 1) floats per sample; widen to size_t first, as with
    // device_samples_size above.
    size_t yyb_size = samples_size;
    yyb_size *= (yinyang_groups + 1) * sizeof(float);
    CUMALLOC(device_bounds_yy, yyb_size, "yinyang bounds");
    CUMALLOC(device_drifts_yy, centroids_size + clusters_size * sizeof(float),
             "yinyang drifts");
    CUMALLOC(device_passed_yy, assignments_size, "yinyang passed");
    size_t yyc_size = yinyang_groups * features_size * sizeof(float);
    // If the group centroids (plus per-cluster and per-group uint32 scratch)
    // fit inside the "passed" buffer, alias it instead of allocating another
    // block; otherwise allocate a dedicated buffer.
    if (yyc_size + (clusters_size + yinyang_groups) * sizeof(uint32_t)
        <= assignments_size) {
      device_centroids_yy = device_passed_yy;
    } else {
      CUMALLOC(device_centroids_yy, yyc_size, "yinyang group centroids");
    }
  }
  // Own device_centroids_yy only when it was allocated separately; when it
  // aliases device_passed_yy, pass NULL so the buffer is not freed twice.
  unique_devptr device_centroids_yinyang_sentinel(
      (device_centroids_yy != device_passed_yy)? device_centroids_yy : NULL);
  unique_devptr device_assignments_yinyang_sentinel(device_assignments_yy);
  unique_devptr device_bounds_yinyang_sentinel(device_bounds_yy);
  unique_devptr device_drifts_yinyang_sentinel(device_drifts_yy);
  unique_devptr device_passed_yinyang_sentinel(device_passed_yy);
  if (verbosity > 1) {
    RETERR(print_memory_stats());
  }
  RETERR(kmeans_cuda_setup(samples_size, features_size, clusters_size,
                           yinyang_groups, device, verbosity),
         DEBUG("kmeans_cuda_setup failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_init_centroids(
             static_cast<KMCUDAInitMethod>(kmpp), samples_size, features_size,
             clusters_size, seed, verbosity,
             reinterpret_cast<float*>(device_samples), device_assignments,
             reinterpret_cast<float*>(device_centroids)),
         DEBUG("kmeans_init_centroids failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  // Main Yinyang Lloyd iteration loop (also handles the 0-group plain case,
  // given the NULL yy buffers it receives then).
  RETERR(kmeans_cuda_yy(
             tolerance, yinyang_groups, samples_size, clusters_size,
             features_size, verbosity,
             reinterpret_cast<float*>(device_samples),
             reinterpret_cast<float*>(device_centroids),
             reinterpret_cast<uint32_t*>(device_ccounts),
             reinterpret_cast<uint32_t*>(device_assignments_prev),
             reinterpret_cast<uint32_t*>(device_assignments),
             reinterpret_cast<uint32_t*>(device_assignments_yy),
             reinterpret_cast<float*>(device_centroids_yy),
             reinterpret_cast<float*>(device_bounds_yy),
             reinterpret_cast<float*>(device_drifts_yy),
             reinterpret_cast<uint32_t*>(device_passed_yy)),
         DEBUG("kmeans_cuda_internal failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  // Copy results back into the caller's host buffers.
  CUMEMCPY(centroids, device_centroids, centroids_size,
           cudaMemcpyDeviceToHost);
  CUMEMCPY(assignments, device_assignments, assignments_size,
           cudaMemcpyDeviceToHost);
  DEBUG("return kmcudaSuccess\n");
  return kmcudaSuccess;
}