{
// NOTE(review): stray brace + #endif above — almost certainly the tail of an
// `#ifdef __cplusplus extern "C" {` guard whose opening lines fall outside
// this chunk; left untouched.
#endif

#include <stdio.h>
#include <stdlib.h>
#include <jni.h>
#include <cuda_runtime.h>
#include "weightedCostApproximator_JNI.h"
#include "customCudaUtils.h"
#include "NengoUtilsGPU.h"

// JNI hook for WeightedCostApproximator.hasGPU(): returns JNI_TRUE iff at
// least one CUDA-capable device is reported by getGPUDeviceCount().
JNIEXPORT jboolean JNICALL Java_ca_nengo_math_impl_WeightedCostApproximator_hasGPU
  (JNIEnv *env, jclass class)
{
  jboolean hasGPU = (jboolean) (getGPUDeviceCount() > 0);

  return hasGPU;
}

/* Must be called from Java code. Takes a matrix as input and returns its
pseudoInverse. Used to pseudo invert gamma. */
// NOTE(review): this definition continues past the end of the visible chunk —
// only the opening of the body appears here, so it is documented but not
// otherwise modified.
JNIEXPORT jobjectArray JNICALL Java_ca_nengo_math_impl_WeightedCostApproximator_nativePseudoInverse
  (JNIEnv* env, jclass class, jobjectArray java_matrix, jfloat minSV, jint numSV)
{
  int i = 0;
  // Number of rows in the incoming Java float[][].
  jsize M = (*env)->GetArrayLength(env, java_matrix);
  // First row, used below (outside this view) — presumably to read the column
  // count; TODO confirm against the rest of the function.
  jfloatArray temp_array = (jfloatArray) (*env)->GetObjectArrayElement(env, java_matrix, 0);
// Main entry point, distinct from the External entry point. Intended to by // called from python with ctypes (but can also, of course, be called from c) void setup(int num_devices_requested, int* devices_to_use, float dt, int num_items, int dimension, int** index_vectors, int** stored_vectors, float tau, float* decoders, int neurons_per_item, float* gain, float* bias, float tau_ref, float tau_rc, float radius, int identical_ensembles, int print_data, int* probe_indices, int num_probes, int do_spikes, int num_steps) { int i, j, k; int num_available_devices = getGPUDeviceCount(); do_print = print_data; printf("NeuralAssocGPU: SETUP\n"); num_devices = num_devices_requested > num_available_devices ? num_available_devices : num_devices_requested; if(do_print) printf("Using %d devices. %d available\n", num_devices, num_available_devices); nengo_data_array = (NengoGPUData**) malloc(sizeof(NengoGPUData*) * num_devices); NengoGPUData* current_data; // Create the NengoGPUData structs, one per device. for(i = 0; i < num_devices; i++) { nengo_data_array[i] = getNewNengoGPUData(); } if(do_print) printf("About to create the NengoGPUData structures\n"); int items_per_device = num_items / num_devices; int leftover = num_items % num_devices; int item_index = 0; int items_for_current_device = 0; int probe_count = 0; // Now we start to load the data into the NengoGPUData struct for each device. // (though the data doesn't get put on the actual device just yet). // Because of the CUDA architecture, we have to do some weird things to get a good speedup // These arrays that store the transforms, decoders, are setup in a non-intuitive way so // that memory accesses can be parallelized in CUDA kernels. For more information, see // the NengoGPU user manual. 
for(i = 0; i < num_devices; i++) { // set values current_data = nengo_data_array[i]; current_data->device = devices_to_use[i]; current_data->do_print = do_print; current_data->neurons_per_item = neurons_per_item; current_data->dimension = dimension; current_data->tau = tau; current_data->tau_ref = tau_ref; current_data->tau_rc = tau_rc; current_data->radius = radius; current_data->dt = dt; current_data->num_steps = num_steps; printf("Num STEPS: %d\n", num_steps); items_for_current_device = items_per_device + (leftover > 0 ? 1 : 0); leftover--; current_data->num_items = items_for_current_device; current_data->identical_ensembles = identical_ensembles; // find the number of probes on current device probe_count = 0; for(j = 0; j < num_probes; j++) { if(probe_indices[j] >= item_index && probe_indices[j] < item_index + items_for_current_device) { probe_count++; } } current_data->num_probes = probe_count; if(do_spikes) { current_data->num_spikes = probe_count * neurons_per_item; } // create the arrays initializeNengoGPUData(current_data); // populate the arrays for(j = 0; j < items_for_current_device; j++) { memcpy(current_data->index_vectors->array + j * dimension, index_vectors[item_index + j], dimension * sizeof(float)); memcpy(current_data->stored_vectors->array + j * dimension, stored_vectors[item_index + j], dimension * sizeof(float)); } memcpy(current_data->decoders->array, decoders, neurons_per_item * sizeof(float)); memcpy(current_data->gain->array, gain, neurons_per_item * sizeof(float)); memcpy(current_data->bias->array, bias, neurons_per_item * sizeof(float)); // populate the probe map probe_count = 0; for(j = 0; j < num_probes; j++) { if(probe_indices[j] >= item_index && probe_indices[j] < item_index + items_for_current_device) { current_data->probe_map->array[probe_count] = probe_indices[j] - item_index; if(do_spikes) { for(k = 0; k < neurons_per_item; k++) { current_data->spike_map->array[probe_count * neurons_per_item + k] = (probe_indices[j] - 
item_index) * neurons_per_item + k; } } probe_count++; } } item_index += items_for_current_device; //printf("printing nengo gpu data\n"); //printNengoGPUData(current_data, 1); } // We have all the data we need, now start the worker threads which control // the GPU's directly. run_start(); }