/* Point `self` at `storage` (reference-counted) with the given offset, then
 * reshape it via THCudaTensor_rawResize. A NULL `storage` installs a fresh
 * empty storage instead. Errors out on a negative storage offset. */
static void THCudaTensor_rawSet(THCState *state, THCudaTensor *self, THCudaStorage *storage, long storageOffset, int nDimension, long *size, long *stride)
{
  THAssert(self->storage != NULL);

  /* Swap storages only when the target differs from what we already hold;
   * retain the incoming one before dropping our reference to the old one. */
  if(self->storage != storage)
  {
    THCudaStorage *previous = self->storage;

    if(storage)
    {
      THCudaStorage_retain(state, storage);
      self->storage = storage;
    }
    else
      self->storage = THCudaStorage_new(state);

    if(previous)
      THCudaStorage_free(state, previous);
  }

  if(storageOffset < 0)
    THError("Tensor: invalid storage offset");
  self->storageOffset = storageOffset;

  /* Apply the requested geometry on top of the (possibly new) storage. */
  THCudaTensor_rawResize(state, self, nDimension, size, stride);
}
/* Initialize a freshly allocated tensor: a single reference, a brand-new
 * backing storage, and a dimensionless (scalar-less, empty) geometry. */
static void THCudaTensor_rawInit(THCState *state, THCudaTensor *self)
{
  /* Reference counting state first. */
  self->refcount = 1;
  self->flag = TH_TENSOR_REFCOUNTED;

  /* Every tensor owns a storage from birth, even before it has a shape. */
  self->storage = THCudaStorage_new(state);
  self->storageOffset = 0;

  /* No dimensions yet: size/stride arrays are allocated lazily on resize. */
  self->nDimension = 0;
  self->size = NULL;
  self->stride = NULL;
}
THCudaStorage* THCudaStorage_newWithSize(long size) { THArgCheck(size >= 0, 2, "invalid size"); if(size > 0) { THCudaStorage *storage = (THCudaStorage*)THAlloc(sizeof(THCudaStorage)); THCudaCheck(cudaMalloc((void**)&(storage->data), size * sizeof(float))); storage->size = size; storage->refcount = 1; storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; return storage; } else { return THCudaStorage_new(); } }
/* In-place resize of `self` to (up to) nDimension dims with the given sizes
 * and optional strides. A size[d] <= 0 truncates the requested shape at d;
 * a stride[d] < 0 (or stride == NULL) means "compute a contiguous stride".
 * If the tensor already has exactly the requested geometry, this is a no-op.
 * The backing storage is grown (never shrunk) to fit the new view. */
static void THCudaTensor_rawResize(THCState *state, THCudaTensor *self, int nDimension, long *size, long *stride)
{
  int d;
  int nDimension_;
  long totalSize;
  int hascorrectsize = 1;

  /* Count the leading run of positive sizes and, in the same pass, check
   * whether the existing size/stride already match the request. */
  nDimension_ = 0;
  for(d = 0; d < nDimension; d++)
  {
    if(size[d] > 0)
    {
      nDimension_++;
      if((self->nDimension > d) && (size[d] != self->size[d]))
        hascorrectsize = 0;
      /* Only explicitly-requested (non-negative) strides participate in the
       * match test; computed strides are free to differ. */
      if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d]))
        hascorrectsize = 0;
    }
    else
      break;
  }
  nDimension = nDimension_;

  if(nDimension != self->nDimension)
    hascorrectsize = 0;

  /* Geometry already matches: nothing to allocate or overwrite. */
  if(hascorrectsize)
    return;

  if(nDimension > 0)
  {
    if(nDimension != self->nDimension)
    {
      self->size = (long*)THRealloc(self->size, sizeof(long)*nDimension);
      self->stride = (long*)THRealloc(self->stride, sizeof(long)*nDimension);
      self->nDimension = nDimension;
    }

    /* Fill size/stride back-to-front so computed strides can use the
     * already-written inner dimension. totalSize accumulates
     * 1 + sum((size[d]-1)*stride[d]) — one past the largest reachable
     * element index, i.e. the storage length this view requires. */
    totalSize = 1;
    for(d = self->nDimension-1; d >= 0; d--)
    {
      self->size[d] = size[d];
      if(stride && (stride[d] >= 0) )
        self->stride[d] = stride[d];
      else
      {
        if(d == self->nDimension-1)
          self->stride[d] = 1;
        else
          self->stride[d] = self->size[d+1]*self->stride[d+1];
      }
      totalSize += (self->size[d]-1)*self->stride[d];
    }

    /* Grow the storage if the view (plus its offset) no longer fits. */
    if(totalSize+self->storageOffset > 0)
    {
      if(!self->storage)
        self->storage = THCudaStorage_new(state);
      if(totalSize+self->storageOffset > self->storage->size)
        THCudaStorage_resize(state, self->storage, totalSize+self->storageOffset);
    }
  }
  else
    self->nDimension = 0;
}
/* Preprocess a batch of 8-bit interleaved RGB (or BGR) images into a float
 * tensor, run the network forward pass, and expose the raw result.
 *
 * images    : batchsize pointers to pixel rows (row pitch = `stride` bytes)
 * results   : receives a pointer into the output tensor's storage — owned by
 *             `network` (network->out on CUDA/OpenCL) or by the net's own
 *             output tensor on CPU; the caller must NOT free it
 * outwidth/outheight : spatial size of the output (1x1 for vector outputs)
 * bgr       : nonzero if the input channel order is BGR
 * Returns the total number of elements in the output tensor. */
int THProcessImages(THNETWORK *network, unsigned char **images, int batchsize, int width, int height, int stride, float **results, int *outwidth, int *outheight, int bgr)
{
	int i;
	THFloatTensor *out, *t = 0;
	THFloatStorage *st;
#ifdef CUDNN
	if(network->net->cuda)
	{
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
		{
			/* fp16 path: convert and normalize directly into device memory.
			 * NOTE(review): a THCudaStorage is assigned to a THFloatStorage*
			 * below — presumably the structs are layout-compatible here;
			 * confirm against the storage definitions. */
			st = THCudaStorage_new(batchsize * (width * height * 3));
			for(i = 0; i < batchsize; i++)
				cuda_rgb2half((unsigned short *)st->data + i * (width * height * 3), images[i], width, height, stride, network->mean, network->std, bgr);
		} else
#endif
		{
			st = THCudaStorage_new(batchsize * width * height * 3);
			for(i = 0; i < batchsize; i++)
				cuda_rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std, bgr);
		}
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
		/* NOTE(review): only images[0] is loaded — the OpenCL path appears
		 * to ignore batchsize; confirm batching is unsupported there. */
		t = OpenCL_LoadImage(images[0], width, height, stride, network->mean, network->std, bgr);
	else
#endif
	{
		/* CPU path: mean/std-normalize into a host float storage, one image
		 * per OpenMP task when batchsize > 1. */
		st = THFloatStorage_new(batchsize * width * height * 3);
		if(bgr)
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				bgr2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
		else
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
	}
	if(!t)
	{
		/* Wrap the storage in a CHW tensor (single image) or NCHW tensor
		 * (batch); the OpenCL path already produced `t` above. */
		t = THFloatTensor_new();
		t->storage = st;
		if(batchsize == 1)
		{
			t->nDimension = 3;
			t->size[0] = 3;
			t->size[1] = height;
			t->size[2] = width;
			t->stride[0] = width * height;
			t->stride[1] = width;
			t->stride[2] = 1;
		} else {
			t->nDimension = 4;
			t->size[0] = batchsize;
			t->size[1] = 3;
			t->size[2] = height;
			t->size[3] = width;
			t->stride[0] = 3 * width * height;
			t->stride[1] = width * height;
			t->stride[2] = width;
			t->stride[3] = 1;
		}
	}
#ifdef CUDNN
	if(network->net->cuda)
	{
		/* Run on GPU, then replace the cached host-side copy of the output
		 * (network->out) with a fresh conversion of the device result. */
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
			network->out = THFloatTensor_newFromHalfCudaTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromCudaTensor(out);
		out = network->out;
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
	{
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(cl_datasize == 2)
			network->out = THFloatTensor_newFromHalfOpenCLImageTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromOpenCLImageTensor(out);
		out = network->out;
	} else
#endif
		/* CPU path: forward() presumably returns a tensor owned by the net,
		 * so it is not freed here — verify against forward()'s contract. */
		out = forward(network->net, t);
	THFloatTensor_free(t);
	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		/* Spatial dims are the two innermost axes of the output. */
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else
		*outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}