static int perf_cladsyn(CSOUND *csound, CLADSYN *p){ uint32_t offset = p->h.insdshead->ksmps_offset; uint32_t early = p->h.insdshead->ksmps_no_end; uint32_t n, nsmps = CS_KSMPS; float *out_ = (float *) p->out_.auxp; MYFLT *asig = p->asig; int count = p->count, vsamps = p->vsamps; p->fp = (float *) (p->fsig->frame.auxp); if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT)); if (UNLIKELY(early)) { nsmps -= early; memset(&asig[nsmps], '\0', early*sizeof(MYFLT)); } for(n=offset; n < nsmps; n++){ if(count == 0) { int err; float freq = *p->kfreq; clSetKernelArg(p->kernel1, 4, sizeof(cl_float), &freq); clSetKernelArg(p->kernel2, 4, sizeof(cl_float), &freq); clEnqueueWriteBuffer(p->commands,p->frame, CL_TRUE, 0, sizeof(cl_float)*p->bins*2, p->fp, 0, NULL, NULL); err = clEnqueueNDRangeKernel(p->commands,p->kernel1, 1, NULL, &p->threads, &p->wgs1, 0, NULL, NULL); if(err) csound->Message(csound,"Error: Failed to compute sample kernel! %s\n", cl_error_string(err)); clFinish(p->commands); clEnqueueReadBuffer(p->commands,p->out, CL_TRUE, 0,vsamps*sizeof(cl_float),out_, 0, NULL, NULL); err = clEnqueueNDRangeKernel(p->commands,p->kernel2, 1, NULL, &p->mthreads, &p->wgs2, 0, NULL, NULL); if(err) csound->Message(csound,"Error: Failed to compute update kernel!%s\n", cl_error_string(err)); count = vsamps; } asig[n] = (MYFLT) out_[vsamps - count]; count--; } p->count = count; return OK; }
static const char *estr(CLBlastStatusCode err) { if (err > -1024) return cl_error_string((cl_int)err); switch (err) { case CLBlastNotImplemented: return "Unimplemented feature"; case CLBlastInvalidMatrixA: return "matrix A is not a valid memory object"; case CLBlastInvalidMatrixB: return "matrix B is not a valid memory object"; case CLBlastInvalidMatrixC: return "matrix C is not a valid memory object"; case CLBlastInvalidVectorX: return "vector X is not a valid memory object"; case CLBlastInvalidVectorY: return "vector Y is not a valid memory object"; case CLBlastInvalidDimension: return "An input dimension (M, N, K) is invalid"; case CLBlastInvalidLeadDimA: return "leading dimension for A must not be less than the size of the first dimension"; case CLBlastInvalidLeadDimB: return "leading dimension for B must not be less than the size of the second dimension"; case CLBlastInvalidLeadDimC: return "leading dimension for C must not be less than the size of the third dimension"; case CLBlastInvalidIncrementX: return "increment for X must not be 0"; case CLBlastInvalidIncrementY: return "increment for Y must not be 0"; case CLBlastInsufficientMemoryA: return "memory object for matrix A is too small"; case CLBlastInsufficientMemoryB: return "memory object for matrix B is too small"; case CLBlastInsufficientMemoryC: return "memory object for matrix C is too small"; case CLBlastInsufficientMemoryX: return "memory object for vector X is too small"; case CLBlastInsufficientMemoryY: return "memory object for vector Y is too small"; case CLBlastInvalidLocalMemUsage: return "not enough local memory on the device"; case CLBlastNoHalfPrecision: return "float16 is not supported on this device"; case CLBlastNoDoublePrecision: return "float64 is not supported on this device"; case CLBlastInvalidVectorScalar: return "unit-sized vector is not a valid memory object"; case CLBlastInsufficientMemoryScalar: return "memory object for unit-sized vector is too small"; case CLBlastDatabaseError: return "device entry not in database"; case CLBlastUnknownError: return "Unspecified error"; case CLBlastUnexpectedError: return "Unexpected error"; default: return "Unknow error"; } }
static const char *estr(clblasStatus err) { if (err > -1024) return cl_error_string((cl_int)err); switch (err) { case clblasNotImplemented: return "Unimplemented feature"; case clblasNotInitialized: return "Library not initialized"; case clblasInvalidMatA: return "matrix A is not a valid memory object"; case clblasInvalidMatB: return "matrix B is not a valid memory object"; case clblasInvalidMatC: return "matrix C is not a valid memory object"; case clblasInvalidVecX: return "vector X is not a valid memory object"; case clblasInvalidVecY: return "vector Y is not a valid memory object"; case clblasInvalidDim: return "An input dimension (M, N, K) is invalid"; case clblasInvalidLeadDimA: return "leading dimension for A must not be less than the size of the first dimension"; case clblasInvalidLeadDimB: return "leading dimension for B must not be less than the size of the second dimension"; case clblasInvalidLeadDimC: return "leading dimension for C must not be less than the size of the third dimension"; case clblasInvalidIncX: return "increment for X must not be 0"; case clblasInvalidIncY: return "increment for Y must not be 0"; case clblasInsufficientMemMatA: return "memory object for matrix A is too small"; case clblasInsufficientMemMatB: return "memory object for matrix B is too small"; case clblasInsufficientMemMatC: return "memory object for matrix C is too small"; case clblasInsufficientMemVecX: return "memory object for vector X is too small"; case clblasInsufficientMemVecY: return "memory object for vector Y is too small"; default: return "Unknow error"; } }
static int init_cladsyn(CSOUND *csound, CLADSYN *p){ int asize, ipsize, fpsize, err; cl_device_id device_ids[32], device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel1, kernel2; cl_uint num = 0, nump = 0; cl_platform_id platforms[16]; uint i; if(p->fsig->overlap > 1024) return csound->InitError(csound, "overlap is too large\n"); err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num); if (err != CL_SUCCESS){ clGetPlatformIDs(16, platforms, &nump); int devs = 0; for(i=0; i < nump && devs < 32; i++){ char name[128]; clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL); csound->Message(csound, "available platform[%d] %s\n",i, name); err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num); if (err != CL_SUCCESS) csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err)); } devs += num; } for(i=0; i < num; i++){ char name[128]; cl_device_type type; clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL); clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); if(type & CL_DEVICE_TYPE_CPU) csound->Message(csound, "available CPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_GPU) csound->Message(csound, "available GPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_ACCELERATOR) csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name); else csound->Message(csound, "available generic [device %d] %s\n",i, name);; } // SELECT THE GPU HERE if(*p->idev < num) device_id = device_ids[(int)*p->idev]; else device_id = device_ids[num-1]; context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) return csound->InitError(csound, "Failed to create a compute context! %s\n", cl_error_string(err)); // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) return csound->InitError(csound, "Failed to create a command commands! %s\n", cl_error_string(err)); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err); if (!program) return csound->InitError(csound, "Failed to create compute program! %s\n", cl_error_string(err)); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; csound->Message(csound, "Failed to build program executable! %s\n", cl_error_string(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); return csound->InitError(csound, "%s\n", buffer); } kernel1 = clCreateKernel(program, "sample", &err); if (!kernel1 || err != CL_SUCCESS) return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", cl_error_string(err)); kernel2 = clCreateKernel(program, "update", &err); if (!kernel2 || err != CL_SUCCESS) return csound->InitError(csound,"Failed to create update compute kernel! %s\n", cl_error_string(err)); char name[128]; clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL); csound->Message(csound, "using device: %s\n",name); p->bins = (p->fsig->N)/2; if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum; p->vsamps = p->fsig->overlap; p->threads = p->bins*p->vsamps; p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps); asize = p->vsamps*sizeof(cl_float); ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long); fpsize = p->fsig->N*sizeof(cl_float); p->out = clCreateBuffer(context,0, asize, NULL, NULL); p->frame = clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL); p->ph = clCreateBuffer(context,0, ipsize, NULL, NULL); p->amps = clCreateBuffer(context,0,(p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL); // memset needed? asize = p->vsamps*sizeof(float); if(p->out_.auxp == NULL || p->out_.size < (unsigned long) asize) csound->AuxAlloc(csound, asize , &p->out_); csound->RegisterDeinitCallback(csound, p, destroy_cladsyn); p->count = 0; p->context = context; p->program = program; p->commands = commands; p->kernel1 = kernel1; p->kernel2 = kernel2; clGetKernelWorkGroupInfo(p->kernel1, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL); clGetKernelWorkGroupInfo(p->kernel2, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL); p->sr = csound->GetSr(csound); clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr); clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel2, 7, sizeof(cl_float), &p->sr); return OK; }