Example #1
0
/*
 * Logs the allocator's current usage figures and then runs the project's
 * memory diagnostics (print_peak_memory_usage, check_memory,
 * print_memory_stats).
 *
 * The whole body is compiled only when both DEBUG_EASDK and
 * EASDK_DEBUG_LEVEL_DEBUG are defined; otherwise the function is a no-op.
 * The allocator query is platform specific: mstats() on Apple platforms,
 * mallinfo() on Linux. On any other platform only the three diagnostic
 * helpers are called.
 *
 * NOTE(review): assumes debug_log hands the parenthesised argument list to a
 * printf-style function -- confirm against the macro definition.
 */
void print_used_memory(void)
{
#if defined DEBUG_EASDK && defined EASDK_DEBUG_LEVEL_DEBUG
#if defined __APPLE__
    struct mstats stat;

    stat = mstats();
    /*
     * bytes_used is a size_t, so passing it for "%d" mismatches the
     * conversion on LP64 targets. Cast explicitly and print as unsigned long.
     */
    debug_log(DEBUG_MEM, ("Memstat: %lu\n", (unsigned long)stat.bytes_used));
#elif defined __linux__
    struct mallinfo mi;

    mi = mallinfo();
    /* uordblks / fordblks are plain int in struct mallinfo, so %d is right. */
    debug_log(DEBUG_MEM, ("Memstat: %d(/%d)\n", mi.uordblks, mi.fordblks));
#endif
    print_peak_memory_usage();
    check_memory();
    print_memory_stats();
#endif
}
Example #2
0
// Host entry point: allocates device buffers, uploads the samples, runs
// centroid initialization followed by kmeans_cuda_yy, and copies the resulting
// centroids and assignments back to the caller's host buffers.
//
// Parameters:
//   kmpp          - cast to KMCUDAInitMethod and passed to
//                   kmeans_init_centroids.
//   tolerance     - forwarded to check_args and kmeans_cuda_yy.
//   yinyang_t     - the number of yinyang groups is yinyang_t * clusters_size
//                   converted to uint32_t; the yinyang buffers are allocated
//                   only when that is >= 1.
//   samples_size  - number of samples.
//   features_size - number of float features per sample.
//   clusters_size - number of clusters.
//   seed          - forwarded to kmeans_init_centroids.
//   device        - passed to cudaSetDevice.
//   verbosity     - values > 1 additionally call print_memory_stats().
//   samples       - host input; samples_size * features_size floats are
//                   copied to the device.
//   centroids     - host output; receives clusters_size * features_size
//                   floats.
//   assignments   - host output; receives samples_size uint32_t values.
//
// Returns kmcudaSuccess on success, the check_args result if it is not
// kmcudaSuccess, or kmcudaNoSuchDevice if cudaSetDevice fails.
//
// NOTE(review): CUMALLOC, CUMEMCPY and RETERR are macros defined outside this
// block; they presumably return an error code from this function on failure.
// unique_devptr presumably releases the wrapped device pointer when it goes
// out of scope. Each owning sentinel is therefore created immediately after
// its allocation so that any later early exit cannot leak the buffer.
int kmeans_cuda(bool kmpp, float tolerance, float yinyang_t, uint32_t samples_size,
                uint16_t features_size, uint32_t clusters_size, uint32_t seed,
                uint32_t device, int32_t verbosity, const float *samples,
                float *centroids, uint32_t *assignments) {
  DEBUG("arguments: %d %.3f %.2f %" PRIu32 " %" PRIu16 " %" PRIu32 " %" PRIu32
        " %" PRIu32 " %" PRIi32 " %p %p %p\n",
        kmpp, tolerance, yinyang_t, samples_size, features_size, clusters_size,
        seed, device, verbosity, samples, centroids, assignments);
  auto check_result = check_args(
      tolerance, yinyang_t, samples_size, features_size, clusters_size,
      samples, centroids, assignments);
  if (check_result != kmcudaSuccess) {
    return check_result;
  }
  if (cudaSetDevice(device) != cudaSuccess) {
    return kmcudaNoSuchDevice;
  }

  // Sizes are accumulated in size_t from the first factor on, so the product
  // is never computed in 32-bit arithmetic.
  void *device_samples;
  size_t device_samples_size = samples_size;
  device_samples_size *= features_size * sizeof(float);
  CUMALLOC(device_samples, device_samples_size, "samples");
  // Take ownership before the copy so a failed upload does not leak.
  unique_devptr device_samples_sentinel(device_samples);
  CUMEMCPY(device_samples, samples, device_samples_size, cudaMemcpyHostToDevice);

  void *device_centroids;
  // clusters_size * features_size alone would be evaluated as uint32_t and
  // could wrap before being widened; start from size_t instead.
  size_t centroids_size = clusters_size;
  centroids_size *= features_size * sizeof(float);
  CUMALLOC(device_centroids, centroids_size, "centroids");
  unique_devptr device_centroids_sentinel(device_centroids);

  void *device_assignments;
  size_t assignments_size = samples_size * sizeof(uint32_t);
  CUMALLOC(device_assignments, assignments_size, "assignments");
  unique_devptr device_assignments_sentinel(device_assignments);

  void *device_assignments_prev;
  CUMALLOC(device_assignments_prev, assignments_size, "assignments_prev");
  unique_devptr device_assignments_prev_sentinel(device_assignments_prev);

  void *device_ccounts;
  CUMALLOC(device_ccounts, clusters_size * sizeof(uint32_t), "ccounts");
  unique_devptr device_ccounts_sentinel(device_ccounts);

  uint32_t yinyang_groups = yinyang_t * clusters_size;
  DEBUG("yinyang groups: %" PRIu32 "\n", yinyang_groups);
  const bool use_yinyang = yinyang_groups >= 1;

  // Each yinyang buffer stays NULL when yinyang is disabled. Allocation and
  // sentinel are interleaved so that a failure in a later allocation still
  // releases the earlier ones.
  void *device_assignments_yy = NULL;
  if (use_yinyang) {
    CUMALLOC(device_assignments_yy, clusters_size * sizeof(uint32_t),
             "yinyang assignments");
  }
  unique_devptr device_assignments_yinyang_sentinel(device_assignments_yy);

  void *device_bounds_yy = NULL;
  if (use_yinyang) {
    size_t yyb_size = samples_size;
    yyb_size *= (yinyang_groups + 1) * sizeof(float);
    CUMALLOC(device_bounds_yy, yyb_size, "yinyang bounds");
  }
  unique_devptr device_bounds_yinyang_sentinel(device_bounds_yy);

  void *device_drifts_yy = NULL;
  if (use_yinyang) {
    CUMALLOC(device_drifts_yy, centroids_size + clusters_size * sizeof(float),
             "yinyang drifts");
  }
  unique_devptr device_drifts_yinyang_sentinel(device_drifts_yy);

  void *device_passed_yy = NULL;
  if (use_yinyang) {
    CUMALLOC(device_passed_yy, assignments_size, "yinyang passed");
  }
  unique_devptr device_passed_yinyang_sentinel(device_passed_yy);

  void *device_centroids_yy = NULL;
  if (use_yinyang) {
    size_t yyc_size = yinyang_groups;
    yyc_size *= features_size * sizeof(float);
    // Reuse the "passed" buffer for the group centroids when it is large
    // enough for yyc_size plus (clusters_size + yinyang_groups) uint32_t
    // values; otherwise allocate a dedicated buffer.
    if (yyc_size
        + (static_cast<size_t>(clusters_size) + yinyang_groups) * sizeof(uint32_t)
        <= assignments_size) {
      device_centroids_yy = device_passed_yy;
    } else {
      CUMALLOC(device_centroids_yy, yyc_size, "yinyang group centroids");
    }
  }
  // When the group centroids alias device_passed_yy that buffer is already
  // owned by device_passed_yinyang_sentinel, so this sentinel must hold NULL.
  unique_devptr device_centroids_yinyang_sentinel(
      (device_centroids_yy != device_passed_yy)? device_centroids_yy : NULL);

  if (verbosity > 1) {
    RETERR(print_memory_stats());
  }
  RETERR(kmeans_cuda_setup(samples_size, features_size, clusters_size,
                           yinyang_groups, device, verbosity),
         DEBUG("kmeans_cuda_setup failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_init_centroids(
      static_cast<KMCUDAInitMethod>(kmpp), samples_size, features_size,
      clusters_size, seed, verbosity, reinterpret_cast<float*>(device_samples),
      device_assignments, reinterpret_cast<float*>(device_centroids)),
         DEBUG("kmeans_init_centroids failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_cuda_yy(
      tolerance, yinyang_groups, samples_size, clusters_size, features_size, verbosity,
      reinterpret_cast<float*>(device_samples),
      reinterpret_cast<float*>(device_centroids),
      reinterpret_cast<uint32_t*>(device_ccounts),
      reinterpret_cast<uint32_t*>(device_assignments_prev),
      reinterpret_cast<uint32_t*>(device_assignments),
      reinterpret_cast<uint32_t*>(device_assignments_yy),
      reinterpret_cast<float*>(device_centroids_yy),
      reinterpret_cast<float*>(device_bounds_yy),
      reinterpret_cast<float*>(device_drifts_yy),
      reinterpret_cast<uint32_t*>(device_passed_yy)),
         DEBUG("kmeans_cuda_yy failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  CUMEMCPY(centroids, device_centroids, centroids_size, cudaMemcpyDeviceToHost);
  CUMEMCPY(assignments, device_assignments, assignments_size, cudaMemcpyDeviceToHost);
  DEBUG("return kmcudaSuccess\n");
  return kmcudaSuccess;
}