#ifdef __cplusplus
extern "C" {
#endif

#include <stdio.h>
#include <stdlib.h>
#include <jni.h>
#include <cuda_runtime.h>

#include "weightedCostApproximator_JNI.h"
#include "customCudaUtils.h"
#include "NengoUtilsGPU.h"

JNIEXPORT jboolean JNICALL Java_ca_nengo_math_impl_WeightedCostApproximator_hasGPU
(JNIEnv *env, jclass cls)
{
    jboolean hasGPU = (jboolean) (getGPUDeviceCount() > 0);

    return hasGPU;
}
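
/*
  A minimal sketch (an assumption, not the original implementation, which lives
  in the included utility headers): getGPUDeviceCount() can be written as a
  thin wrapper around the CUDA runtime, reporting zero devices on any error.
*/
static int getGPUDeviceCount_sketch(void)
{
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);

    return (err == cudaSuccess) ? count : 0;
}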

/*
  Must be called from Java code.
  Takes a matrix as input and returns its pseudoInverse. Used to pseudo invert gamma.
*/
JNIEXPORT jobjectArray JNICALL Java_ca_nengo_math_impl_WeightedCostApproximator_nativePseudoInverse
(JNIEnv* env, jclass cls, jobjectArray java_matrix, jfloat minSV, jint numSV)
{
    int i = 0;

    jsize M = (*env)->GetArrayLength(env, java_matrix);
    jfloatArray temp_array = (jfloatArray) (*env)->GetObjectArrayElement(env, java_matrix, 0);
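
/*
  The function above is truncated in this listing. A minimal sketch (an
  assumption; the helper name flattenJavaMatrix and its error handling are not
  part of the original code) of how such a JNI routine typically copies the
  jobjectArray rows into one contiguous, row-major float buffer before any GPU
  work is done:
*/
static float* flattenJavaMatrix(JNIEnv* env, jobjectArray java_matrix,
                                jsize* M_out, jsize* N_out)
{
    jsize M = (*env)->GetArrayLength(env, java_matrix);
    jfloatArray first_row = (jfloatArray) (*env)->GetObjectArrayElement(env, java_matrix, 0);
    jsize N = (*env)->GetArrayLength(env, first_row);
    (*env)->DeleteLocalRef(env, first_row);

    float* flat = (float*) malloc(M * N * sizeof(float));
    if(!flat)
        return NULL;

    jsize i;
    for(i = 0; i < M; i++)
    {
        jfloatArray row = (jfloatArray) (*env)->GetObjectArrayElement(env, java_matrix, i);
        (*env)->GetFloatArrayRegion(env, row, 0, N, flat + i * N);
        (*env)->DeleteLocalRef(env, row);
    }

    *M_out = M;
    *N_out = N;
    return flat;
}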
Example #2
// Main entry point, distinct from the External entry point. Intended to be
// called from Python with ctypes (but can also, of course, be called from C).
void setup(int num_devices_requested, int* devices_to_use, float dt, int num_items,
           int dimension, int** index_vectors, int** stored_vectors, float tau,
           float* decoders, int neurons_per_item, float* gain, float* bias,
           float tau_ref, float tau_rc, float radius, int identical_ensembles,
           int print_data, int* probe_indices, int num_probes, int do_spikes,
           int num_steps)
{

  int i, j, k;

  int num_available_devices = getGPUDeviceCount();

  do_print = print_data;

  printf("NeuralAssocGPU: SETUP\n");

  num_devices = num_devices_requested > num_available_devices ? num_available_devices : num_devices_requested;

  if(do_print)
    printf("Using %d devices. %d available\n", num_devices, num_available_devices);

  nengo_data_array = (NengoGPUData**) malloc(sizeof(NengoGPUData*) * num_devices);

  NengoGPUData* current_data;

  // Create the NengoGPUData structs, one per device.
  for(i = 0; i < num_devices; i++)
  {
    nengo_data_array[i] = getNewNengoGPUData();
  }

  if(do_print)
    printf("About to create the NengoGPUData structures\n");

  int items_per_device = num_items / num_devices;
  int leftover = num_items % num_devices;
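  // E.g. with num_items = 10 and num_devices = 3: items_per_device = 3 and
  // leftover = 1, so the devices receive 4, 3 and 3 items respectively.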
  int item_index = 0;
  int items_for_current_device = 0;
  int probe_count = 0;

  // Now we start to load the data into the NengoGPUData struct for each device
  // (though the data doesn't get put on the actual device just yet).
  // Because of the CUDA architecture, we have to do some unusual things to get a
  // good speedup. The arrays that store the transforms and decoders are set up in
  // a non-intuitive way so that memory accesses can be parallelized in CUDA
  // kernels. For more information, see the NengoGPU user manual.
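  // (Illustration only, not the exact layout: in CUDA, accesses are coalesced
  // when consecutive threads touch consecutive addresses, so per-neuron or
  // per-dimension values for many items may be interleaved rather than stored
  // item-by-item. The NengoGPU manual documents the actual layout used here.)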
  for(i = 0; i < num_devices; i++)
  {
    // set values
    current_data = nengo_data_array[i];

    current_data->device = devices_to_use[i];

    current_data->do_print = do_print;
    current_data->neurons_per_item = neurons_per_item;
    current_data->dimension = dimension;

    current_data->tau = tau;
    current_data->tau_ref = tau_ref;
    current_data->tau_rc = tau_rc;
    current_data->radius = radius;
    current_data->dt = dt;
    current_data->num_steps = num_steps;

    printf("Num STEPS: %d\n", num_steps);

    items_for_current_device = items_per_device + (leftover > 0 ? 1 : 0);
    leftover--;

    current_data->num_items = items_for_current_device;
    current_data->identical_ensembles = identical_ensembles;

    // find the number of probes on current device
    probe_count = 0;
    for(j = 0; j < num_probes; j++)
    {
        if(probe_indices[j] >= item_index &&
           probe_indices[j] < item_index + items_for_current_device)
        {
            probe_count++;
        }
    }

    current_data->num_probes = probe_count;

    if(do_spikes)
    {
        current_data->num_spikes = probe_count * neurons_per_item;
    }

    // create the arrays
    initializeNengoGPUData(current_data);

    // populate the arrays
    for(j = 0; j < items_for_current_device; j++)
    {
      memcpy(current_data->index_vectors->array + j * dimension,
             index_vectors[item_index + j], dimension * sizeof(float));
      memcpy(current_data->stored_vectors->array + j * dimension,
             stored_vectors[item_index + j], dimension * sizeof(float));
    }

    memcpy(current_data->decoders->array, decoders, neurons_per_item * sizeof(float));
    memcpy(current_data->gain->array, gain, neurons_per_item * sizeof(float));
    memcpy(current_data->bias->array, bias, neurons_per_item * sizeof(float));

    // populate the probe map
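    // probe_map[p] holds the device-local index of the p-th probed item on this
    // device; when spikes are recorded, spike_map additionally gives each of that
    // item's neurons_per_item neurons its own consecutive output slot.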
    probe_count = 0;
    for(j = 0; j < num_probes; j++)
    {
        if(probe_indices[j] >= item_index &&
           probe_indices[j] < item_index + items_for_current_device)
        {
            current_data->probe_map->array[probe_count] = probe_indices[j] - item_index;

            if(do_spikes)
            {
                for(k = 0; k < neurons_per_item; k++)
                {
                    current_data->spike_map->array[probe_count * neurons_per_item + k] =
                        (probe_indices[j] - item_index) * neurons_per_item + k;
                }
            }

            probe_count++;
        }
    }

    item_index += items_for_current_device;

    //printf("printing nengo gpu data\n");
    //printNengoGPUData(current_data, 1);
  }

  // We have all the data we need; now start the worker threads, which control
  // the GPUs directly.
  run_start();
}
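
/*
  The comment above notes that setup() can also be called directly from C. This
  is a minimal, hedged driver sketch under toy assumptions (2 items, dimension 4,
  3 neurons per item, a single device); the values are illustrative only, and
  stepping and tearing down the simulation use parts of the API not shown in
  this listing.
*/
int main(void)
{
  int devices_to_use[1] = {0};

  // Toy data: two 4-dimensional index/stored vector pairs (assumed values).
  float index_rows[2][4]  = {{1.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 1.0f, 0.0f, 0.0f}};
  float stored_rows[2][4] = {{0.0f, 0.0f, 1.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}};
  float* index_vectors[2]  = {index_rows[0], index_rows[1]};
  float* stored_vectors[2] = {stored_rows[0], stored_rows[1]};

  // One ensemble template shared by all items (identical_ensembles = 1).
  float decoders[3] = {0.1f, 0.2f, 0.3f};
  float gain[3]     = {1.0f, 1.0f, 1.0f};
  float bias[3]     = {0.0f, 0.0f, 0.0f};

  // Probe the decoded value of item 0 only; no spike recording.
  int probe_indices[1] = {0};

  // The vector rows are declared as int** in the signature above but are
  // copied as floats inside setup(), hence the cast.
  setup(1, devices_to_use, 0.001f, 2, 4,
        (int**) index_vectors, (int**) stored_vectors, 0.02f,
        decoders, 3, gain, bias,
        0.002f, 0.02f, 1.0f, 1,
        1, probe_indices, 1, 0,
        100);

  return 0;
}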