TH_API void THLab_(syev)(THTensor *a_, THTensor *w_, const char *jobz, const char *uplo)
{
  int n, lda, lwork, info;
  THTensor *A;
  THTensor *work;
  real wkopt;

  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");

  A = THTensor_(newContiguous)(a_);
  n = A->size[1];
  lda = n;
  THTensor_(resize1d)(w_, n);

  // get optimal workspace size
  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(A), lda,
                  THTensor_(data)(w_), &wkopt, -1, &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(A), lda,
                  THTensor_(data)(w_), THTensor_(data)(work), lwork, &info);

  if (info > 0)
  {
    THError("Lapack syev : Failed to converge. %d off-diagonal elements of an intermediate tridiagonal form didn't converge to zero", info);
  }
  else if (info < 0)
  {
    THError("Lapack syev : Argument %d : illegal value", -info);
  }

  THTensor_(free)(A);
  THTensor_(free)(work);
}
void THFloatTensor_addr(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *vec1, THFloatTensor *vec2) { if( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) THError("vector and vector expected, got %dD, %dD tensors", vec1->nDimension, vec2->nDimension); if(t->nDimension != 2) THError("expected matrix, got %dD tensor for t", t->nDimension); if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) THError("size mismatch, t: %ld, vec1: %ld, t: %ld, vec2: %ld", t->size[0], vec1->size[0], t->size[1], vec2->size[0]); if(r_ != t) THError("r_ != t not implemented"); if(beta != 1) THFloatTensor_mul(r_, r_, beta); if(r_->stride[0] == 1) { THBlas_ger(vec1->size[0], vec2->size[0], alpha, THFloatTensor_data(vec1), vec1->stride[0], THFloatTensor_data(vec2), vec2->stride[0], THFloatTensor_data(r_), r_->stride[1]); } else if(r_->stride[1] == 1) { THBlas_ger(vec2->size[0], vec1->size[0], alpha, THFloatTensor_data(vec2), vec2->stride[0], THFloatTensor_data(vec1), vec1->stride[0], THFloatTensor_data(r_), r_->stride[0]); } else THError("addr for non-contiguous not implemented"); }
void* THRealloc(void *ptr, ptrdiff_t size)
{
  if(!ptr)
    return(THAlloc(size));

  if(size == 0)
  {
    THFree(ptr);
    return NULL;
  }

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  ptrdiff_t oldSize = -getAllocSize(ptr);
  void *newptr = realloc(ptr, size);

  if(!newptr && torchGCFunction) {
    torchGCFunction(torchGCData);
    newptr = realloc(ptr, size);
  }

  if(!newptr)
    THError("$ Torch: not enough memory: you tried to reallocate %tdGB. Buy new RAM!", size/1073741824);

  // update heapSize only after successfully reallocated
  THHeapUpdate(oldSize + getAllocSize(newptr));
  return newptr;
}
int checkAndCountListOfStreams(lua_State *L, THCState *state, int arg, int device)
{
  if (!lua_istable(L, arg)) {
    THError("expecting table of device streams");
  }

  /* Push table to top */
  lua_pushvalue(L, arg);

  /* Check that all values in the table are numeric and in bounds */
  int streams = 0;
  lua_pushnil(L);
  while (lua_next(L, -2)) {
    if (!lua_isnumber(L, -1)) {
      THError("streamWaitFor: list of streams must be numeric");
    }
    int streamId = (int) lua_tonumber(L, -1);

    /* This will error out if the stream is not in bounds */
    THCState_getDeviceStream(state, device, streamId);
    ++streams;
    lua_pop(L, 1);
  }

  /* Pop table from top */
  lua_pop(L, 1);

  return streams;
}
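/* A minimal, hypothetical caller for checkAndCountListOfStreams above, written as a
 * Lua C binding. `cutorch_getstate` and the argument position (1) are assumptions
 * about the surrounding binding code, not part of the function's contract. */
static int example_countStreams(lua_State *L)
{
  THCState *state = cutorch_getstate(L);  /* assumed helper that recovers the THCState */
  int device;
  THCudaCheck(cudaGetDevice(&device));    /* validate the streams against the current device */
  int n = checkAndCountListOfStreams(L, state, 1, device);
  lua_pushinteger(L, n);
  return 1;
}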
void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, THIndexTensor *target, THTensor *output, bool sizeAverage, THTensor *weights, THTensor *total_weight) { int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); if (THIndexTensor_(nDimension)(target) > 1) { THError("multi-target not supported"); } if (THTensor_(nDimension)(input) > 2) { THError("input tensor should be 1D or 2D"); } input = THTensor_(newContiguous)(input); target = THIndexTensor_(newContiguous)(target); weights = weights ? THTensor_(newContiguous)(weights) : NULL; real *input_data = THTensor_(data)(input); THIndex_t *target_data = THIndexTensor_(data)(target); real *weights_data = weights ? THTensor_(data)(weights) : NULL; real *output_data = THTensor_(data)(output); real *total_weight_data = THTensor_(data)(total_weight); output_data[0] = total_weight_data[0] = 0.0; if (THTensor_(nDimension)(input) == 1) { int cur_target = target_data[0] - 1; THAssert(cur_target >= 0 && cur_target < n_classes); total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; output_data[0] = -input_data[cur_target] * total_weight_data[0]; } else if (THTensor_(nDimension)(input) == 2) { int batch_size = THTensor_(size)(input, 0); THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); int i; for (i = 0; i < batch_size; i++) { int cur_target = target_data[i] - 1; THAssert(cur_target >= 0 && cur_target < n_classes); real cur_weight = weights ? weights_data[cur_target] : 1.0f; total_weight_data[0] += cur_weight; output_data[0] -= input_data[i * n_target + cur_target] * cur_weight; } } if (sizeAverage && total_weight_data[0]) { output_data[0] /= total_weight_data[0]; } if (weights) { THTensor_(free)(weights); } THTensor_(free)(input); THIndexTensor_(free)(target); }
static void THMapAllocator_free(void* ctx_, void* data)
{
  THMapAllocatorContext *ctx = ctx_;

#ifdef _WIN32
  if(UnmapViewOfFile(data) == 0)
    THError("could not unmap the shared memory file");
#else /* _WIN32 */
  if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
    if (close(ctx->fd) == -1)
      THError("could not close file descriptor %d", ctx->fd);
  }

  if (munmap(data, ctx->size))
    THError("could not unmap the shared memory file");

  if (!(ctx->flags & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK))) {
    if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM) {
#ifdef HAVE_SHM_UNLINK
      if (shm_unlink(ctx->filename) == -1)
        THError("could not unlink the shared memory file %s", ctx->filename);
#else
      THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
#endif
    }
  }
#endif /* _WIN32 */

  THMapAllocatorContext_free(ctx);
}
void* THRealloc(void *ptr, long size)
{
  if(!ptr)
    return(THAlloc(size));

  if(size == 0)
  {
    THFree(ptr);
    return NULL;
  }

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  THHeapUpdate(-getAllocSize(ptr));
  void *newptr = realloc(ptr, size);

  if(!newptr && torchGCFunction) {
    torchGCFunction(torchGCData);
    newptr = realloc(ptr, size);
  }
  THHeapUpdate(getAllocSize(newptr ? newptr : ptr));

  if(!newptr)
    THError("$ Torch: not enough memory: you tried to reallocate %ldGB. Buy new RAM!", size/1073741824);

  return newptr;
}
void THNN_(LookupTable_renorm)( THNNState *state, THIndexTensor *idx, THTensor *weight, real maxNorm, real normType) { if (!THTensor_(isContiguous)(weight)) THError("weight must be contiguous"); if (!THIndexTensor_(isContiguous)(idx)) THError("input must be contiguous"); if (THIndexTensor_(nDimension)(idx) != 1) THError("idx must be a vector"); if (normType <= 0) THError("non-positive-norm not supported"); long i; THIndex_t *row_idx = THIndexTensor_(data)(idx); long numel = THIndexTensor_(nElement)(idx); long numw = THTensor_(size)(weight, 0); long stride = THTensor_(stride)(weight, 0); real *gw = THTensor_(data)(weight); for (i=0; i<numel; i++) if (row_idx[i] < 1 || row_idx[i] > numw) THError("input out of range"); // get unique indices qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); long ptr = 0; for (i=0; i<numel; i++) if (i == 0 || row_idx[i] != row_idx[i-1]) row_idx[ptr++] = row_idx[i]; numel = ptr; #ifdef _OPENMP if (numel > 1000) { // The strategy is to parallelize over the rows that appear in // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads]. // This distributes the work evenly to each thread. #pragma omp parallel for private(i) for (i=0; i<numel; i++) { long k = row_idx[i] - 1; THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType); } return; } #endif for (i=0; i<numel; i++) { long k = row_idx[i] - 1; THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType); } }
TH_API void THLab_(gesvd)(THTensor *a_, THTensor *s_, THTensor *u_, THTensor *vt_, char jobu)
{
  int k, m, n, lda, ldu, ldvt, lwork, info;
  THTensor *A, *work;
  real wkopt;
  char jobvt = jobu;

  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
  THArgCheck(jobu == 'A' || jobu == 'S', 4, "jobu can be 'A' or 'S'");

  A = THTensor_(newContiguous)(a_);
  m = A->size[1];
  n = A->size[0];
  k = (m < n ? m : n);
  lda = m;
  ldu = m;
  ldvt = n;

  THTensor_(resize1d)(s_, k);
  THTensor_(resize2d)(vt_, n, ldvt);
  if (jobu == 'A')
  {
    THTensor_(resize2d)(u_, m, ldu);
  }
  else
  {
    THTensor_(resize2d)(u_, k, ldu);
  }

  THLapack_(gesvd)(jobu, jobvt, m, n, THTensor_(data)(A), lda,
                   THTensor_(data)(s_), THTensor_(data)(u_), ldu,
                   THTensor_(data)(vt_), ldvt, &wkopt, -1, &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(gesvd)(jobu, jobvt, m, n, THTensor_(data)(A), lda,
                   THTensor_(data)(s_), THTensor_(data)(u_), ldu,
                   THTensor_(data)(vt_), ldvt, THTensor_(data)(work), lwork, &info);

  if (info > 0)
  {
    THError("Lapack gesvd : %d superdiagonals failed to converge.", info);
  }
  else if (info < 0)
  {
    THError("Lapack gesvd : Argument %d : illegal value", -info);
  }

  THTensor_(free)(A);
  THTensor_(free)(work);
}
static ptrdiff_t applyHeapDelta()
{
  ptrdiff_t oldHeapSize = THAtomicAddPtrdiff(&heapSize, heapDelta);
#ifdef DEBUG
  if (heapDelta > 0 && oldHeapSize > PTRDIFF_MAX - heapDelta)
    THError("applyHeapDelta: heapSize(%td) + increased(%td) > PTRDIFF_MAX, heapSize overflow!",
            oldHeapSize, heapDelta);
  if (heapDelta < 0 && oldHeapSize < PTRDIFF_MIN - heapDelta)
    THError("applyHeapDelta: heapSize(%td) + decreased(%td) < PTRDIFF_MIN, heapSize underflow!",
            oldHeapSize, heapDelta);
#endif
  ptrdiff_t newHeapSize = oldHeapSize + heapDelta;
  heapDelta = 0;
  return newHeapSize;
}
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices)
  {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices)
  {
    THError("%d is not a device", devToAccess);
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  if (dev < 0 || dev >= numDevices)
  {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= numDevices)
  {
    THError("%d is not a device", devToAccess);
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
{
  /* This is called at the point of kernel execution.
     For some debugging code or improperly instrumented kernels,
     `state` is null */
  if (state) {
    if (state->currentBlasHandle <= 0) {
      THError("%d is not a valid handle, valid range is: (1, %d)",
              state->currentBlasHandle, state->numUserBlasHandles);
    }
    return state->currentBlasHandle;
  }
  THError("THCState and blasHandles must be set as there is no default blasHandle");
  return NULL;
}
TH_API void THLab_(gesv)(THTensor *a_, THTensor *b_)
{
  int n, nrhs, lda, ldb, info;
  THIntTensor *ipiv;
  THTensor *A, *B;

  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
  THArgCheck(a_->size[0] == a_->size[1], 2, "A should be square");

  n = (int)a_->size[1];
  lda = n;
  ldb = n;
  if (b_->nDimension == 1)
  {
    nrhs = 1;
    THArgCheck(n == b_->size[0], 1, "size incompatible A,b");
  }
  else
  {
    nrhs = b_->size[0];
    THArgCheck(n == b_->size[1], 1, "size incompatible A,b");
  }

  A = THTensor_(newContiguous)(a_);
  B = THTensor_(newContiguous)(b_);
  ipiv = THIntTensor_newWithSize1d((long)n);

  THLapack_(gesv)(n, nrhs, THTensor_(data)(A), lda,
                  THIntTensor_data(ipiv), THTensor_(data)(B), ldb, &info);

  if(!THTensor_(isContiguous)(b_)) {
    THTensor_(copy)(b_, B);
  }

  if (info < 0)
  {
    THError("Lapack gesv : Argument %d : illegal value", -info);
  }
  else if (info > 0)
  {
    THError("Lapack gesv : U(%d,%d) is zero, singular U.", info, info);
  }

  THIntTensor_free(ipiv);
  THTensor_(free)(A);
  THTensor_(free)(B);
}
static void THCudaTensor_rawSet(THCState *state, THCudaTensor *self, THCudaStorage *storage, long storageOffset, int nDimension, long *size, long *stride) { THAssert(self->storage != NULL); /* storage */ if(self->storage != storage) { if(self->storage) THCudaStorage_free(state, self->storage); if(storage) { self->storage = storage; THCudaStorage_retain(state, self->storage); } else self->storage = THCudaStorage_new(state); } /* storageOffset */ if(storageOffset < 0) THError("Tensor: invalid storage offset"); self->storageOffset = storageOffset; /* size and stride */ THCudaTensor_rawResize(state, self, nDimension, size, stride); }
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(scalar_t *gradInput_p, scalar_t *gradOutput_p, THIndex_t *ind_p, int nslices, int iwidth, int iheight, int owidth, int oheight) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { for (auto k = start; k < end; k++) { scalar_t *gradInput_p_k = gradInput_p + k*iwidth*iheight; scalar_t *gradOutput_p_k = gradOutput_p + k*owidth*oheight; THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; int i, j; THIndex_t maxp; for(i = 0; i < iheight; i++) { for(j = 0; j < iwidth; j++) { maxp = ind_p_k[i*iwidth + j]; /* retrieve position of max */ if(maxp < 0 || maxp >= owidth * oheight) { THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight); } gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ } } } }); }
THFloatTensor *nn_SpatialConvolutionMM_updateOutput(struct module *module, THFloatTensor *input) { int kW = module->SpatialConvolution.kW; int kH = module->SpatialConvolution.kH; int dW = module->SpatialConvolution.dW; int dH = module->SpatialConvolution.dH; int padW = module->SpatialConvolution.padW; int padH = module->SpatialConvolution.padH; THFloatTensor *finput = module->SpatialConvolution.finput; THFloatTensor *weight = module->SpatialConvolution.weight; THFloatTensor *bias = module->SpatialConvolution.bias; THFloatTensor *output = module->output; int batch = 1; if (input->nDimension == 3) { batch = 0; THFloatTensor_resize4d(input, 1, input->size[0], input->size[1], input->size[2]); } long batchSize = input->size[0]; long nInputPlane = module->SpatialConvolution.nInputPlane; long nOutputPlane = module->SpatialConvolution.nOutputPlane; long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; if (outputWidth < 1 || outputHeight < 1) THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); THFloatTensor_resize3d(finput, batchSize, kW*kH*nInputPlane, outputHeight*outputWidth); THFloatTensor_resize4d(output, batchSize, nOutputPlane, outputHeight, outputWidth); long t; #pragma omp parallel for if(batchSize >= 4) private(t) for (t = 0; t < batchSize; t++) { THFloatTensor *input_t = THFloatTensor_newSelect(input, 0, t); THFloatTensor *output_t = THFloatTensor_newSelect(output, 0, t); THFloatTensor *finput_t = THFloatTensor_newSelect(finput, 0, t); nn_SpatialConvolutionMM_updateOutput_frame(input_t, output_t, weight, bias, finput_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); THFloatTensor_free(input_t); THFloatTensor_free(output_t); THFloatTensor_free(finput_t); } if (batch == 0) { THFloatTensor_resize3d(output, nOutputPlane, outputHeight, outputWidth); THFloatTensor_resize3d(input, nInputPlane, inputHeight, inputWidth); } return output; }
static inline void THNN_(VolumetricFullConvolution_shapeCheck)( THTensor *input, THTensor *gradOutput, THTensor *weight, THTensor *bias, int dT, int dW, int dH, int pT, int pW, int pH, int aT, int aW, int aH) { THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, "4D or 5D (batch mode) tensor expected for input, but got: %s"); // number of input & output planes and kernel size is indirectly defined by the weight tensor THNN_ARGCHECK(weight->nDimension == 5, 4, weight, "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); THArgCheck(aT < dT && aW < dW && aH < dH, 15, "output adjustment must be smaller than stride, but got " "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d", aT, aH, aW, dT, dH, dW); int ndim = input->nDimension; const int nInputPlane = (int)weight->size[0]; const int nOutputPlane = (int)weight->size[1]; const int kT = (int)weight->size[2]; const int kH = (int)weight->size[3]; const int kW = (int)weight->size[4]; if (bias != NULL) { THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); } int dimf = 0; int dimd = 1; int dimh = 2; int dimw = 3; if (ndim == 5) { dimf++; dimd++; dimh++; dimw++; } const long inputWidth = input->size[dimw]; const long inputHeight = input->size[dimh]; const long inputDepth = input->size[dimd]; const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); if (gradOutput != NULL) { THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); } }
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
                                                              THIndex_t *ind_p,
                                                              long nslices,
                                                              long iwidth, long iheight,
                                                              long owidth, long oheight)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
    THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;

    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE;  /* retrieve position of max */
        if(maxp < 0 || maxp >= owidth*oheight){
          THError("invalid max index %ld, owidth= %ld, oheight= %ld", maxp, owidth, oheight);
        }
        gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp];  /* update gradient */
      }
    }
  }
}
void THCudaGetGridSize(int *nBlockPerColumn_, int *nBlockPerRow_, int *nThreadPerBlock_, long size)
{
  const int nThreadPerBlock = 256;
  long nBlockPerGrid = size / nThreadPerBlock;
  long nBlockPerColumn = 0L;
  long nBlockPerRow = 0L;

  if(size % nThreadPerBlock)
    nBlockPerGrid++;

  if(nBlockPerGrid <= 65535)
  {
    nBlockPerRow = nBlockPerGrid;
    nBlockPerColumn = 1;
  }
  else if(nBlockPerGrid <= (65535L * 65535L))
  {
    unsigned int uiSqrt = (unsigned int)(sqrt((float)nBlockPerGrid));
    nBlockPerRow = uiSqrt;
    nBlockPerColumn = uiSqrt;
    while((nBlockPerRow * nBlockPerColumn) < nBlockPerGrid)
      nBlockPerRow++;
  }
  else
    THError("too large vector for Cuda, sorry");

  *nBlockPerColumn_ = (int)nBlockPerColumn;
  *nBlockPerRow_ = (int)nBlockPerRow;
  *nThreadPerBlock_ = (int)nThreadPerBlock;
}
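/* Sketch of how a caller might consume THCudaGetGridSize above when configuring a
 * kernel launch. The kernel itself is omitted; the function name and element count
 * are illustrative, and this would live in a .cu translation unit with the CUDA
 * runtime header available. */
void exampleLaunchConfig(long nElement)
{
  int nBlockPerColumn, nBlockPerRow, nThreadPerBlock;
  THCudaGetGridSize(&nBlockPerColumn, &nBlockPerRow, &nThreadPerBlock, nElement);

  dim3 grid(nBlockPerRow, nBlockPerColumn);
  dim3 block(nThreadPerBlock);
  /* myKernel<<<grid, block>>>(...);
     Because the grid is rounded up, each thread must recompute its flat index from
     blockIdx/threadIdx and bail out when that index is >= nElement. */
}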
cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }

  /* Stream 0 is the default stream, 1 ... `numUserStreams` are Torch streams */
  if (stream > state->numUserStreams || stream < 0)
  {
    THError("%d is not a stream", stream);
  }

  return state->streamsPerDevice[device][stream];
}
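/* Hedged usage sketch for THCState_getDeviceStream above: fetch the default stream
 * (index 0) for a device and issue an asynchronous copy on it. The buffer pointers,
 * byte count, device index, and function name are placeholders supplied by the caller. */
void exampleAsyncCopy(THCState *state, int device, void *dst, const void *src, size_t bytes)
{
  cudaStream_t s = THCState_getDeviceStream(state, device, 0);
  THCudaCheck(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, s));
  THCudaCheck(cudaStreamSynchronize(s));
}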
THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags)
{
  THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext));

  if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
    flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE;
  if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0)
    THError("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file "
            "in shared mode");

  if (filename) {
    ctx->filename = THAlloc(strlen(filename)+1);
    strcpy(ctx->filename, filename);
  } else {
    ctx->filename = unknown_filename;
  }
  ctx->flags = flags;
  ctx->size = 0;
#ifdef _WIN32
  ctx->handle = INVALID_HANDLE_VALUE;
#else
  ctx->fd = -1;
#endif

  return ctx;
}
static int loadnpz_l(lua_State *L)
{
  try {
    const char *filename = lua_tostring(L, 1);
    std::string fpath = std::string(filename);
    cnpy::npz_t npzData = cnpy::npz_load(filename);

    // create a new table
    lua_newtable(L);
    int tbl = lua_gettop(L);

    for (cnpy::npz_t::iterator i = npzData.begin(); i != npzData.end(); ++i) {
      std::string name = i->first;
      cnpy::NpyArray arr = i->second;
      lua_pushstring(L, name.c_str());
      load_array_to_lua(L, arr);
      lua_rawset(L, tbl);
    }
  } catch (std::exception& e) {
    THError(e.what());
  }
  return 1;
}
static void nn_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
                                                         real *ind_p,
                                                         long nslices,
                                                         long iwidth, long iheight,
                                                         long owidth, long oheight)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *output_p_k = output_p + k*owidth*oheight;
    real *input_p_k = input_p + k*iwidth*iheight;
    real *ind_p_k = ind_p + k*iwidth*iheight;

    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - 1;  /* retrieve position of max */
        if(maxp < 0 || maxp >= owidth*oheight){
          THError("invalid max index %ld, owidth= %ld, oheight= %ld", maxp, owidth, oheight);
        }
        output_p_k[maxp] = input_p_k[i*iwidth + j];  /* update output */
      }
    }
  }
}
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess, int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
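/* Small sketch that ties together the two peer-to-peer entry points shown above:
 * consult the cached flag first and only flip it when needed. The function name
 * and call site are hypothetical. */
void exampleEnsureP2P(THCState *state, int dev, int peer)
{
  if (!THCState_getPeerToPeerAccess(state, dev, peer)) {
    /* THErrors out if the hardware cannot map the peer's memory */
    THCState_setPeerToPeerAccess(state, dev, peer, 1);
  }
}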
int THProcessYUYV(THNETWORK *network, unsigned char *image, int width, int height, float **results, int *outwidth, int *outheight) { THFloatTensor *out; THFloatStorage *st; #ifdef CUDNN if(network->net->cuda) THError("This function is not supported with CUDNN"); #endif st = THFloatStorage_new(width * height * 3); yuyv2fRGB(image, st->data, width*height, width, width, height, network->mean, network->std); THFloatTensor *t = THFloatTensor_new(); t->storage = st; t->nDimension = 3; t->size[0] = 3; t->size[1] = height; t->size[2] = width; t->stride[0] = width * height; t->stride[1] = width; t->stride[2] = 1; out = forward(network->net, t); THFloatTensor_free(t); *results = out->storage->data; if(out->nDimension >= 3) { *outwidth = out->size[out->nDimension - 1]; *outheight = out->size[out->nDimension - 2]; } else *outwidth = *outheight = 1; return THFloatTensor_nElement(out); }
void* THAlloc(long size)
{
  void *ptr;

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  if(size == 0)
    return NULL;

  ptr = malloc(size);
  if(!ptr)
    THError("$ Torch: not enough memory: you tried to allocate %ldGB. Buy new RAM!", size/1073741824);

  return ptr;
}
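/* Minimal usage sketch for the allocator trio (THAlloc / THRealloc / THFree) shown in
 * this section. It assumes the declarations come from TH's general header; the buffer
 * length and the growth factor below are arbitrary illustrations. */
#include <string.h>

static float *exampleGrowBuffer(void)
{
  long n = 1024;
  float *buf = THAlloc(n * sizeof(float));      /* THErrors on OOM rather than returning NULL */
  memset(buf, 0, n * sizeof(float));
  buf = THRealloc(buf, 2 * n * sizeof(float));  /* may invoke the GC hook before giving up */
  return buf;                                   /* caller eventually releases with THFree(buf) */
}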
static void THGPUTensor_rawSet(THGPUTensor *self, THGPUStorage *storage, long storageOffset, int nDimension, long *size, long *stride) { /* storage */ if (self->storage != storage) { if (self->storage) THGPUStorage_free(self->storage); if (storage) { self->storage = storage; THGPUStorage_retain(self->storage); } else self->storage = NULL; } /* storageOffset */ if (storageOffset < 0) THError("Tensor: invalid storage offset"); self->storageOffset = storageOffset; /* size and stride */ THGPUTensor_rawResize(self, nDimension, size, stride); }
static inline void THNN_(Col2Im_shapeCheck)( THNNState *state, THTensor *input, THTensor *gradOutput, int64_t outputHeight, int64_t outputWidth, int64_t kH, int64_t kW, int64_t dilationH, int64_t dilationW, int64_t padH, int64_t padW, int64_t dH, int64_t dW) { THArgCheck(kW > 0 && kH > 0, 6, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); THArgCheck(dW > 0 && dH > 0, 12, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); THArgCheck(dilationW > 0 && dilationH > 0, 8, "dilation should be greater than zero, but got dilationH: %d dilationW: %d", dilationH, dilationW); int64_t ndim = THTensor_(nDimensionLegacyNoScalars)(input); THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input, "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); int64_t batch_dim = (ndim == 3) ? 0 : -1; int64_t nInputPlane = input->size(batch_dim + 1); if (nInputPlane % (kW * kH) != 0) { THError("Expected size of input's dimension 1 to be divisible by the " "product of kernel_size, but got input.size(1)=%lld and " "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); } int64_t inputLength = input->size(batch_dim + 2); int64_t nBlocksH = div_rtn<int64_t>(outputHeight + 2 * padH - dilationH * (kH - 1) - 1, dH) + 1; int64_t nBlocksW = div_rtn<int64_t>(outputWidth + 2 * padW - dilationW * (kW - 1) - 1, dW) + 1; if (inputLength != (nBlocksH * nBlocksW)) { THError("Given output_size=(%d, %d), kernel_size=(%d, %d), " "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected " "size of input's dimension 2 to match the calculated number of " "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.", outputHeight, outputWidth, kH, kW, dilationH, dilationW, padH, padW, dH, dW, (long long) nBlocksH, (long long) nBlocksW, (long long) (nBlocksH * nBlocksW), (long long) inputLength); } if (outputWidth < 1 || outputHeight < 1) { THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).", outputHeight, outputWidth); } }
static void jhu_THLogSum1d_init(lua_State *L)
{
  int ret = luaT_pushmetatable(L, "torch.DoubleTensor");
  if (ret == 0) {
    THError("problem pushing metatable");
  }
  luaT_registeratname(L, jhu_THLogSum1d__, "jhu");
  lua_pop(L, 1);
}