nsresult dpoCContext::InitContext(cl_platform_id platform) { cl_int err_code; cl_device_id *devices; size_t cb; cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, NULL}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, ReportCLError, this, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } err_code = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } devices = (cl_device_id *)nsMemory::Alloc(sizeof(cl_device_id)*cb); if (devices == NULL) { DEBUG_LOG_STATUS("InitContext", "Cannot allocate device list"); return NS_ERROR_OUT_OF_MEMORY; } err_code = clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); nsMemory::Free(devices); return NS_ERROR_NOT_AVAILABLE; } cmdQueue = clCreateCommandQueue(context, devices[0], #ifdef CLPROFILE CL_QUEUE_PROFILING_ENABLE | #endif /* CLPROFILE */ #ifdef OUTOFORDERQUEUE CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | #endif /* OUTOFORDERQUEUE */ 0, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); nsMemory::Free(devices); return NS_ERROR_NOT_AVAILABLE; } DEBUG_LOG_STATUS("InitContext", "queue is " << cmdQueue); nsMemory::Free(devices); kernelFailureMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } return NS_OK; }
/* [implicit_jscontext] dpoIData allocateData (in jsval templ, [optional] in uint32_t length); */ NS_IMETHODIMP dpoCContext::AllocateData(const jsval & templ, uint32_t length, JSContext *cx, dpoIData **_retval) { cl_int err_code; nsresult result; JSObject *tArray; size_t bytePerElements; nsCOMPtr<dpoCData> data; result = ExtractArray( templ, &tArray, cx); if (NS_FAILED(result)) { return result; } data = new dpoCData( this); if (data == NULL) { DEBUG_LOG_STATUS("AllocateData", "Cannot create new dpoCData object"); return NS_ERROR_OUT_OF_MEMORY; } if (length == 0) { DEBUG_LOG_STATUS("AllocateData", "size not provided, assuming template's size"); length = JS_GetTypedArrayLength(tArray); } bytePerElements = JS_GetTypedArrayByteLength(tArray) / JS_GetTypedArrayLength(tArray); DEBUG_LOG_STATUS("AllocateData", "length " << length << " bytePerElements " << bytePerElements); #ifdef PREALLOCATE_IN_JS_HEAP JSObject *jsArray; if (NS_FAILED(CreateAlignedTA(JS_GetTypedArrayType(tArray, cx), length, &jsArray, cx))) { return NS_ERROR_NOT_AVAILABLE; } if (!jsArray) { DEBUG_LOG_STATUS("AllocateData", "Cannot create typed array"); return NS_ERROR_OUT_OF_MEMORY; } cl_mem memObj = CreateBuffer( CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, JS_GetTypedArrayByteLength(jsArray, cx), GetPointerFromTA(jsArray, cx), &err_code); #else /* PREALLOCATE_IN_JS_HEAP */ JSObject *jsArray = nullptr; cl_mem memObj = CreateBuffer(cx, CL_MEM_READ_WRITE, length * bytePerElements, NULL, &err_code); #endif /* PREALLOCATE_IN_JS_HEAP */ if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("AllocateData", err_code); return NS_ERROR_NOT_AVAILABLE; } result = data->InitCData(cx, cmdQueue, memObj, JS_GetArrayBufferViewType(tArray), length, length * bytePerElements, jsArray); if (NS_SUCCEEDED(result)) { data.forget((dpoCData **) _retval); } return result; }
/* [implicit_jscontext] dpoIData allocateData2 (in dpoIData templ, [optional] in uint32_t length); */ NS_IMETHODIMP dpoCContext::AllocateData2(dpoIData *templ, uint32_t length, JSContext *cx, dpoIData **_retval) { // this cast is only safe as long as no other implementations of the dpoIData interface exist dpoCData *cData = (dpoCData *) templ; cl_int err_code; nsresult result; size_t bytePerElements; nsCOMPtr<dpoCData> data; #ifdef PREALLOCATE_IN_JS_HEAP jsval jsBuffer; #endif /* PREALLOCATE_IN_JS_HEAP */ data = new dpoCData( this); if (data == NULL) { DEBUG_LOG_STATUS("AllocateData2", "Cannot create new dpoCData object"); return NS_ERROR_OUT_OF_MEMORY; } if (length == 0) { DEBUG_LOG_STATUS("AllocateData2", "length not provided, assuming template's size"); length = cData->GetLength(); } bytePerElements = cData->GetSize() / cData->GetLength(); DEBUG_LOG_STATUS("AllocateData2", "length " << length << " bytePerElements " << bytePerElements); #ifdef PREALLOCATE_IN_JS_HEAP JSObject *jsArray; if (NS_FAILED(CreateAlignedTA(cData->GetType(), length, &jsArray, cx))) { return NS_ERROR_NOT_AVAILABLE; } if (!jsArray) { DEBUG_LOG_STATUS("AllocateData2", "Cannot create typed array"); return NS_ERROR_OUT_OF_MEMORY; } cl_mem memObj = CreateBuffer(CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, JS_GetTypedArrayByteLength(jsArray, cx), JS_GetArrayBufferViewData(jsArray, cx), &err_code); #else /* PREALLOCATE_IN_JS_HEAP */ JSObject *jsArray = NULL; cl_mem memObj = CreateBuffer(cx, CL_MEM_READ_WRITE, length * bytePerElements, NULL, &err_code); #endif /* PREALLOCATE_IN_JS_HEAP */ if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("AllocateData2", err_code); return NS_ERROR_NOT_AVAILABLE; } result = data->InitCData(cx, cmdQueue, memObj, cData->GetType(), length, length * bytePerElements, jsArray); if (NS_SUCCEEDED(result)) { data.forget((dpoCData **) _retval); } return result; }
void dpoCContext::RecordEndOfRoundTrip(dpoIContext *parent) { dpoCContext *self = (dpoCContext *) parent; if (self->wrt_exec_start.QuadPart == -1) { DEBUG_LOG_STATUS("RecordEndOfRoundTrip", "no previous start data"); return; } if (!QueryPerformanceCounter(&(self->wrt_exec_end))) { DEBUG_LOG_STATUS("RecordEndOfRoundTrip", "querying performance counter failed"); self->wrt_exec_start.QuadPart = -1; self->wrt_exec_end.QuadPart = -1; } }
template<> Uint8ClampedArray* CData::getValue<Uint8ClampedArray>() { cl_int err_code; #ifdef PREALLOCATE_IN_JS_HEAP void* mem; #endif // PREALLOCATE_IN_JS_HEAP if (m_theUint8ClampedArray.get()) { #ifdef PREALLOCATE_IN_JS_HEAP if (false && !m_isMapped) { DEBUG_LOG_STATUS("getValue", "memory is " << m_theUint8ClampedArray.get()); void* mem = clEnqueueMapBuffer(m_queue, m_memObj, CL_TRUE, CL_MAP_READ, 0, m_size, 0, 0, 0, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("getValue", err_code); return 0; } #ifndef DEBUG_OFF if (mem != m_theUint8ClampedArray->data()) DEBUG_LOG_STATUS("getValue", "EnqueueMap returned wrong pointer"); #endif // DEBUG_OFF m_isMapped = true; } #endif // PREALLOCATE_IN_JS_HEAP return m_theUint8ClampedArray.get(); } else { #ifdef INCREMENTAL_MEM_RELEASE checkFree(); #endif // INCREMENTAL_MEM_RELEASE if (m_parent->createAlignedTA<Uint8ClampedArray, unsigned char>(m_type, m_length, m_theUint8ClampedArray) != RT_OK) return 0; if (!m_theUint8ClampedArray) { DEBUG_LOG_STATUS("getValue", "Cannot create typed array"); return 0; } err_code = enqueueReadBuffer(m_size, m_theUint8ClampedArray->data()); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("getValue", err_code); m_theUint8ClampedArray.clear(); return 0; } DEBUG_LOG_STATUS("getValue", "materialized typed array"); return m_theUint8ClampedArray.get(); } }
void dpoCContext::RecordBeginOfRoundTrip(dpoIContext *parent) { dpoCContext *self = (dpoCContext *) parent; if (!QueryPerformanceCounter(&(self->wrt_exec_start))) { DEBUG_LOG_STATUS("RecordBeginOfRoundTrip", "querying performance counter failed"); self->wrt_exec_start.QuadPart = -1; } }
template<> unsigned CData::initCData<Uint8ClampedArray>(cl_command_queue aQueue, cl_mem aMemObj, ArrayBufferView::ViewType aType, unsigned aLength, unsigned aSize, PassRefPtr<Uint8ClampedArray> anArray) { cl_int err_code; m_type = aType; m_length = aLength; m_size = aSize; m_memObj = aMemObj; if (anArray.get()) m_theUint8ClampedArray = anArray; else m_theUint8ClampedArray.clear(); DEBUG_LOG_STATUS("initCData", "queue is " << aQueue << " buffer is " << aMemObj); err_code = clRetainCommandQueue(m_queue); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("initCData", err_code); // We should really fail here but a bug in the whatif OpenCL // makes the above retain operation always fail. m_isRetained = false; } else m_isRetained = true; m_queue = aQueue; return RT_OK; }
void CL_CALLBACK dpoCContext::CollectTimings( cl_event event, cl_int status, void *data) { cl_int result; dpoCContext *instance = (dpoCContext *) data; DEBUG_LOG_STATUS("CollectTimings", "enquiring for runtimes..."); result = clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &(instance->clp_exec_start), NULL); if (result != CL_SUCCESS) { DEBUG_LOG_ERROR("CollectTimings", result); instance->clp_exec_start = 0; } result = clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &(instance->clp_exec_end), NULL); if (result != CL_SUCCESS) { DEBUG_LOG_ERROR("CollectTimings", result); instance->clp_exec_end = 0; } DEBUG_LOG_STATUS("CollectTimings", "Collected start " << instance->clp_exec_start << " and end " << instance->clp_exec_end); }
CData::~CData() { DEBUG_LOG_DESTROY("CData", this); if (m_memObj) { #ifdef INCREMENTAL_MEM_RELEASE deferFree(m_memObj); #else // INCREMENTAL_MEM_RELEASE clReleaseMemObject(m_memObj); #endif // INCREMENTAL_MEM_RELEASE } if (m_queue && m_isRetained) { DEBUG_LOG_STATUS("~CData", "releasing queue object"); clReleaseCommandQueue(m_queue); } m_parent = 0; }
/* void setArgument (in PRUint32 number, in dpoIData argument); */ NS_IMETHODIMP dpoCKernel::SetArgument(PRUint32 number, dpoIData *argument) { cl_int err_code; cl_mem buffer; /* skip internal arguments */ number = number + DPO_NUMBER_OF_ARTIFICIAL_ARGS; buffer = ((dpoCData *) argument)->GetContainedBuffer(); DEBUG_LOG_STATUS("SetArgument", "buffer is " << buffer); err_code = clSetKernelArg(kernel, number, sizeof(cl_mem), &buffer); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("SetArgument", err_code); return NS_ERROR_INVALID_ARG; } return NS_OK; }
/* readonly attribute uint64_t lastRoundTripTime; */ NS_IMETHODIMP dpoCContext::GetLastRoundTripTime(uint64_t *_retval) { #ifdef WINDOWS_ROUNDTRIP if ((wrt_exec_start.QuadPart == -1) || (wrt_exec_end.QuadPart == -1)) { *_retval = 0; return NS_ERROR_NOT_AVAILABLE; } else { LARGE_INTEGER freq; if (!QueryPerformanceFrequency(&freq)) { DEBUG_LOG_STATUS("GetLastRoundTrupTime", "cannot read performance counter frequency."); return NS_ERROR_NOT_AVAILABLE; } double diff = (double) (wrt_exec_end.QuadPart - wrt_exec_start.QuadPart); double time = diff / (double) freq.QuadPart * 1000000000; *_retval = (uint64_t) time; return NS_OK; } #else /* WINDOWS_ROUNDTRIP */ return NS_ERROR_NOT_IMPLEMENTED; #endif /* WINDOWS_ROUNDTRIP */ }
/* void setScalarArgument (in PRUint32 number, in jsval argument); */ NS_IMETHODIMP dpoCKernel::SetScalarArgument(PRUint32 number, const jsval & argument, const jsval & isInteger, const jsval & highPrecision) { cl_int err_code; bool isIntegerB; bool isHighPrecisionB; /* skip internal arguments */ number = number + DPO_NUMBER_OF_ARTIFICIAL_ARGS; if (!JSVAL_IS_BOOLEAN(isInteger)) { DEBUG_LOG_STATUS("SetScalarArgument", "illegal isInteger argument."); return NS_ERROR_INVALID_ARG; } isIntegerB = JSVAL_TO_BOOLEAN(isInteger); if (!JSVAL_IS_BOOLEAN(highPrecision)) { DEBUG_LOG_STATUS("SetScalarArgument", "illegal highPrecision argument."); return NS_ERROR_INVALID_ARG; } isHighPrecisionB = JSVAL_TO_BOOLEAN(highPrecision); if (!JSVAL_IS_NUMBER(argument)) { DEBUG_LOG_STATUS("SetScalarArgument", "illegal number argument."); return NS_ERROR_INVALID_ARG; } if (JSVAL_IS_INT(argument)) { int value = JSVAL_TO_INT(argument); DEBUG_LOG_STATUS("SetScalarArgument", "(JSVAL_IS_INT(argument)) isIntegerB: " << isIntegerB << " isHighPrecisionB " << isHighPrecisionB); if (isIntegerB) { DEBUG_LOG_STATUS("SetScalarArgument", "(JSVAL_IS_INT(argument)) setting integer argument " << number << " to integer value " << value); cl_int intVal = (cl_int) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_int), &intVal); } else if (isHighPrecisionB) { DEBUG_LOG_STATUS("SetScalarArgument", "setting double argument " << number << " to integer value " << value); cl_double doubleVal = (cl_double) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_double), &doubleVal); } else { DEBUG_LOG_STATUS("SetScalarArgument", "setting float argument " << number << " to integer value " << value); cl_float floatVal = (cl_float) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_float), &floatVal); } if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("SetScalarArgument", err_code); return NS_ERROR_NOT_AVAILABLE; } } else if (JSVAL_IS_DOUBLE(argument)) { double value = JSVAL_TO_DOUBLE(argument); DEBUG_LOG_STATUS("SetScalarArgument", "(JSVAL_IS_DOUBLE(argument)) isIntegerB: " << isIntegerB << " isHighPrecisionB " << isHighPrecisionB); if (isIntegerB) { DEBUG_LOG_STATUS("SetScalarArgument", "setting int formal argument " << number << " using double value " << value); cl_int intVal = (cl_int) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_int), &intVal); } else if (isHighPrecisionB) { DEBUG_LOG_STATUS("SetScalarArgument", "setting double formal argument " << number << " using double value " << value); cl_double doubleVal = (cl_double) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_double), &doubleVal); } else { DEBUG_LOG_STATUS("SetScalarArgument", "setting float formal argument " << number << " using double value " << value); cl_float floatVal = (cl_float) value; err_code = clSetKernelArg(kernel, number, sizeof(cl_float), &floatVal); } if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("SetScalarArgument", err_code); return NS_ERROR_NOT_AVAILABLE; } } else { DEBUG_LOG_STATUS("SetScalarArgument", "illegal number argument."); return NS_ERROR_INVALID_ARG; } return NS_OK; }
void OCLUtil::Init() { cl_int error = 0; // Used to handle error codes cl_uint numberOfPlatforms; cl_uint nplatforms; cl_device_id device; size_t length; createContextSuccess = false; createCommandQueueSuccess = false; openclModule = 0; // Load OpenCL library openclModule = LoadLibrary(TEXT("OpenCL.dll")); if (!openclModule) { DEBUG_LOG_ERROR("Init", "Load OpenCL.dll failed."); return; } // Initialize function entries #define INITIALIZE_FUNCTION_ENTRY(name) checkFunction(__##name = (name##Function) GetProcAddress(openclModule, #name)); OPENCL_FUNCTION_LIST(INITIALIZE_FUNCTION_ENTRY) #undef INITIALIZE_FUNCTION_ENTRY openclFlag = true; // Platform error = clGetPlatformIDs( 0, 0, &nplatforms); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform number error: " << error); return; } cl_platform_id* m_platforms = new cl_platform_id[nplatforms]; error = clGetPlatformIDs(nplatforms, m_platforms, &numberOfPlatforms); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform id error: " << error); delete [] m_platforms; return; } const cl_uint maxNameLength = 256; char name[maxNameLength]; for (cl_uint i = 0; i < numberOfPlatforms; i++) { error = clGetPlatformInfo(m_platforms[i], CL_PLATFORM_NAME, maxNameLength * sizeof(char), name, 0); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform name error: " << error); } else { if (!strcmp(name, "Intel(R) OpenCL") || !strcmp(name, "AMD Accelerated Parallel Processing")) { platform_ = m_platforms[i]; break; } } } delete [] m_platforms; if (!platform_) { DEBUG_LOG_ERROR("Init", "Find Intel or AMD platform failed."); return; } // Version char* temp; error = getPlatformPropertyHelper(CL_PLATFORM_VERSION, temp); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform version error: " << error); } else { version_ = std::string(temp); delete [] temp; } // Name error = getPlatformPropertyHelper(CL_PLATFORM_NAME, temp); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform name error: " << error); } else { name_ = std::string(temp); delete [] temp; } // Vendor error = getPlatformPropertyHelper(CL_PLATFORM_VENDOR, temp); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform vendor error: " << error); } else { vendor_ = std::string(temp); delete [] temp; } // Profile error = getPlatformPropertyHelper(CL_PLATFORM_PROFILE, temp); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform profile error: " << error); } else { profile_ = std::string(temp); delete [] temp; } // Platform Extensions error = getPlatformPropertyHelper(CL_PLATFORM_EXTENSIONS, temp); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get platform extension error: " << error); } else { platformExtensions_ = std::string(temp); delete [] temp; } // Number of Device cl_uint number; error = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, 0, &number); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get device number error: " << error); } else { numberOfDevices_ = number; } // Context cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_, 0}; context_ = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, &reportCLError, this, &error); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Create context error: " << error); return; } createContextSuccess = true; // Device size_t cb; error = clGetContextInfo(context_, CL_CONTEXT_DEVICES, 0, 0, &cb); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get context device number error: " << error); return; } cl_device_id* devices = (cl_device_id*)malloc(sizeof(cl_device_id) * cb); if (!devices) { DEBUG_LOG_STATUS("Init", "Cannot allocate device list"); return; } error = clGetContextInfo(context_, CL_CONTEXT_DEVICES, cb, devices, 0); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get context device info error: " << error); free(devices); return; } // Command Queue queue_ = clCreateCommandQueue(context_, devices[0], #ifdef CLPROFILE CL_QUEUE_PROFILING_ENABLE | #endif // CLPROFILE #ifdef OUTOFORDERQUEUE CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | #endif // OUTOFORDERQUEUE 0, &error); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Create command queue error: " << error); free(devices); return; } DEBUG_LOG_STATUS("Init", "queue is " << queue_); createCommandQueueSuccess = true; error = clGetDeviceInfo(devices[0], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(alignmentSize_), &alignmentSize_, 0); if (error != CL_SUCCESS) { // We can tolerate this, simply do not align. alignmentSize_ = 8; } // We use byte, not bits. if (alignmentSize_ % 8) { // They align on sub-byte borders? Odd architecture this must be. Give up. alignmentSize_ = 1; } else { alignmentSize_ = alignmentSize_ / 8; } // Device Extensions error = clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, 0); if (error != CL_SUCCESS) { DEBUG_LOG_ERROR("Init", "Get command queue device error: " << error); } else { error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, 0, &length); if (error == CL_SUCCESS) { temp = new char[length+1]; error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, length, temp, 0); deviceExtensions_ = std::string(temp); delete [] temp; } else { DEBUG_LOG_ERROR("Init", "Get device extension error: " << error); } } temp = 0; free(devices); }
/* [implicit_jscontext] dpoIData mapData (in jsval source); */ NS_IMETHODIMP dpoCContext::MapData(const jsval & source, JSContext *cx, dpoIData **_retval) { cl_int err_code; nsresult result; JSObject *tArray; nsCOMPtr<dpoCData> data; result = ExtractArray( source, &tArray, cx); if (NS_SUCCEEDED(result)) { // we have a typed array data = new dpoCData( this); if (data == NULL) { DEBUG_LOG_STATUS("MapData", "Cannot create new dpoCData object"); return NS_ERROR_OUT_OF_MEMORY; } // USE_HOST_PTR is save as the CData object will keep the associated typed array alive as long as the // memory buffer lives cl_mem_flags flags = CL_MEM_READ_ONLY; void *tArrayBuffer = NULL; size_t arrayByteLength = JS_GetTypedArrayByteLength(tArray, cx); if(arrayByteLength == 0) { arrayByteLength = 1; } else { tArrayBuffer = GetPointerFromTA(tArray, cx); flags |= CL_MEM_USE_HOST_PTR; } cl_mem memObj = CreateBuffer(flags, arrayByteLength, tArrayBuffer , &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("MapData", err_code); return NS_ERROR_NOT_AVAILABLE; } result = data->InitCData(cx, cmdQueue, memObj, JS_GetTypedArrayType(tArray, cx), JS_GetTypedArrayLength(tArray, cx), JS_GetTypedArrayByteLength(tArray, cx), tArray); #ifdef SUPPORT_MAPPING_ARRAYS } else if (JSVAL_IS_OBJECT(source)) { // maybe it is a regular array. // // WARNING: We map a pointer to the actual array here. All this works on CPU only // and only of the OpenCL compiler knows what to do! For the current Intel OpenCL SDK // this works but your milage may vary. const jsval *elems = UnsafeDenseArrayElements(cx, JSVAL_TO_OBJECT(source)); if (elems != NULL) { data = new dpoCData( this); if (data == NULL) { DEBUG_LOG_STATUS("MapData", "Cannot create new dpoCData object"); return NS_ERROR_OUT_OF_MEMORY; } cl_mem memObj = CreateBuffer(CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, sizeof(double *), &elems, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("MapData", err_code); return NS_ERROR_NOT_AVAILABLE; } result = data->InitCData(cx, cmdQueue, memObj, 0 /* bogus type */, 1, sizeof(double *), JSVAL_TO_OBJECT(source)); #ifndef DEBUG_OFF } else { DEBUG_LOG_STATUS("MapData", "No elements returned!"); #endif /* DEBUG_OFF */ } #endif /* SUPPORT_MAPPING_ARRAYS */ } if (NS_SUCCEEDED(result)) { data.forget((dpoCData **)_retval); } return result; }
/* dpoIKernel compileKernel (in AString source, in AString kernelName, [optional] in AString options); */ NS_IMETHODIMP dpoCContext::CompileKernel(const nsAString & source, const nsAString & kernelName, const nsAString & options, dpoIKernel **_retval) { cl_program program; cl_kernel kernel; cl_int err_code, err_code2; cl_uint numDevices; cl_device_id *devices = NULL; size_t actual; char *sourceStr, *optionsStr, *kernelNameStr; nsCOMPtr<dpoCKernel> ret; nsresult result; sourceStr = ToNewUTF8String(source); DEBUG_LOG_STATUS("CompileKernel", "Source: " << sourceStr); program = clCreateProgramWithSource(context, 1, (const char**)&sourceStr, NULL, &err_code); nsMemory::Free(sourceStr); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code); return NS_ERROR_ILLEGAL_VALUE; } optionsStr = ToNewUTF8String(options); err_code = clBuildProgram(program, 0, NULL, optionsStr, NULL, NULL); nsMemory::Free(optionsStr); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code); } err_code2 = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numDevices, NULL); if (err_code2 != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code2); goto FAIL; } devices = (cl_device_id *) nsMemory::Alloc(numDevices * sizeof(cl_device_id)); err_code2 = clGetProgramInfo(program, CL_PROGRAM_DEVICES, numDevices * sizeof(cl_device_id), devices, NULL); if (err_code2 != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code); goto FAIL; } err_code2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &actual); if (actual > buildLogSize) { if (buildLog != NULL) { nsMemory::Free(buildLog); } buildLog = (char *) nsMemory::Alloc(actual * sizeof(char)); if (buildLog == NULL) { DEBUG_LOG_STATUS("CompileKernel", "Cannot allocate buildLog"); buildLogSize = 0; goto DONE; } buildLogSize = actual; err_code2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &actual); } if (err_code2 != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code); goto FAIL; } DEBUG_LOG_STATUS("CompileKernel", "buildLog: " << buildLog); goto DONE; FAIL: if (buildLog != NULL) { nsMemory::Free(buildLog); buildLog = NULL; buildLogSize = 0; } DONE: if (devices != NULL) { nsMemory::Free(devices); } kernelNameStr = ToNewUTF8String(kernelName); kernel = clCreateKernel(program, kernelNameStr, &err_code); nsMemory::Free( kernelNameStr); clReleaseProgram(program); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("CompileKernel", err_code); return NS_ERROR_NOT_AVAILABLE; } ret = new dpoCKernel(this); if (ret == NULL) { clReleaseKernel(kernel); DEBUG_LOG_STATUS("CompileKernel", "Cannot create new dpoCKernel object"); return NS_ERROR_OUT_OF_MEMORY; } /* all kernels share the single buffer for the failure code */ result = ret->InitKernel(cmdQueue, kernel, kernelFailureMem); if (NS_FAILED(result)) { clReleaseKernel(kernel); return result; } ret.forget((dpoCKernel **)_retval); return NS_OK; }
nsresult dpoCContext::InitContext(cl_platform_id platform) { cl_int err_code; cl_device_id *devices; size_t cb; #ifdef INCREMENTAL_MEM_RELEASE defer_list = (cl_mem *)nsMemory::Alloc(DEFER_LIST_LENGTH * sizeof(cl_mem)); defer_pos = 0; defer_max = DEFER_LIST_LENGTH; #endif /* INCREMENTAL_MEM_RELEASE */ cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, NULL}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, ReportCLError, this, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } err_code = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } devices = (cl_device_id *)nsMemory::Alloc(sizeof(cl_device_id)*cb); if (devices == NULL) { DEBUG_LOG_STATUS("InitContext", "Cannot allocate device list"); return NS_ERROR_OUT_OF_MEMORY; } err_code = clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); nsMemory::Free(devices); return NS_ERROR_NOT_AVAILABLE; } cmdQueue = clCreateCommandQueue(context, devices[0], #ifdef CLPROFILE CL_QUEUE_PROFILING_ENABLE | #endif /* CLPROFILE */ #ifdef OUTOFORDERQUEUE CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | #endif /* OUTOFORDERQUEUE */ 0, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); nsMemory::Free(devices); return NS_ERROR_NOT_AVAILABLE; } DEBUG_LOG_STATUS("InitContext", "queue is " << cmdQueue); err_code = clGetDeviceInfo(devices[0], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(alignment_size), &alignment_size, NULL); if (err_code != CL_SUCCESS) { /* we can tolerate this, simply do not align */ alignment_size = 8; } /* we use byte, not bits */ if (alignment_size % 8) { /* they align on sub-byte borders? Odd architecture this must be. Give up */ alignment_size = 1; } else { alignment_size = alignment_size / 8; } nsMemory::Free(devices); kernelFailureMem = CreateBuffer(CL_MEM_READ_WRITE, sizeof(int), NULL, &err_code); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("InitContext", err_code); return NS_ERROR_NOT_AVAILABLE; } return NS_OK; }
/* PRUint32 run (in PRUint32 rank, [array, size_is (rank)] in PRUint32 shape, [array, size_is (rank), optional] in PRUint32 tile); */ NS_IMETHODIMP dpoCKernel::Run(PRUint32 rank, PRUint32 *shape, PRUint32 *tile, PRUint32 *_retval) { cl_int err_code; cl_event runEvent, readEvent, writeEvent; size_t *global_work_size; size_t *local_work_size; const int zero = 0; DEBUG_LOG_STATUS("Run", "preparing execution of kernel"); if (sizeof(size_t) == sizeof(PRUint32)) { global_work_size = (size_t *) shape; } else { global_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (global_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of global_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (int cnt = 0; cnt < rank; cnt++) { global_work_size[cnt] = shape[cnt]; } } #ifdef USE_LOCAL_WORKSIZE if (tile == NULL) { local_work_size = NULL; } else { if ((sizeof(size_t) == sizeof(PRUint32))) { local_work_size = (size_t *) tile; } else { local_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (local_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of local_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (int cnt = 0; cnt < rank; cnt++) { local_work_size[cnt] = (size_t) tile[cnt]; } } } #else /* USE_LOCAL_WORKSIZE */ local_work_size = NULL; #endif /* USE_LOCAL_WORKSIZE */ DEBUG_LOG_STATUS("Run", "setting failure code to 0"); err_code = clEnqueueWriteBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), &zero, 0, NULL, &writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "enqueing execution of kernel"); #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordBeginOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ err_code = clEnqueueNDRangeKernel(cmdQueue, kernel, rank, NULL, global_work_size, NULL, 1, &writeEvent, &runEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "reading failure code"); err_code = clEnqueueReadBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), _retval, 1, &runEvent, &readEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "waiting for execution to finish"); // For now we always wait for the run to complete. // In the long run, we may want to interleave this with JS execution and only sync on result read. err_code = clWaitForEvents( 1, &readEvent); DEBUG_LOG_STATUS("Run", "first event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordEndOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ #ifdef CLPROFILE #ifdef CLPROFILE_ASYNC err_code = clSetEventCallback( readEvent, CL_COMPLETE, &dpoCContext::CollectTimings, parent); DEBUG_LOG_STATUS("Run", "second event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #else /* CLPROFILE_ASYNC */ dpoCContext::CollectTimings(readEvent,CL_COMPLETE,parent); #endif /* CLPROFILE_ASYNC */ #endif /* CLPROFILE */ DEBUG_LOG_STATUS("Run", "execution completed successfully, start cleanup"); if (global_work_size != (size_t *) shape) { nsMemory::Free(global_work_size); } #ifdef USE_LOCAL_WORKSIZE if (local_work_size != (size_t *) tile) { nsMemory::Free(local_work_size); } #endif /* USE_LOCAL_WORKSIZE */ err_code = clReleaseEvent(readEvent); err_code = clReleaseEvent(runEvent); err_code = clReleaseEvent(writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "cleanup complete"); return NS_OK; }