__global__ void AssignKernel(Param<T> out, CParam<T> in, const AssignKernelParam_t p, const int nBBS0, const int nBBS1) { // retrieve index pointers // these can be 0 where af_array index is not used const uint* ptr0 = p.ptr[0]; const uint* ptr1 = p.ptr[1]; const uint* ptr2 = p.ptr[2]; const uint* ptr3 = p.ptr[3]; // retrive booleans that tell us which index to use const bool s0 = p.isSeq[0]; const bool s1 = p.isSeq[1]; const bool s2 = p.isSeq[2]; const bool s3 = p.isSeq[3]; const int gz = blockIdx.x/nBBS0; const int gw = blockIdx.y/nBBS1; const int gx = blockDim.x * (blockIdx.x - gz*nBBS0) + threadIdx.x; const int gy = blockDim.y * (blockIdx.y - gw*nBBS1) + threadIdx.y; if (gx<in.dims[0] && gy<in.dims[1] && gz<in.dims[2] && gw<in.dims[3]) { // calculate pointer offsets for input int i = p.strds[0] * trimIndex(s0 ? gx+p.offs[0] : ptr0[gx], out.dims[0]); int j = p.strds[1] * trimIndex(s1 ? gy+p.offs[1] : ptr1[gy], out.dims[1]); int k = p.strds[2] * trimIndex(s2 ? gz+p.offs[2] : ptr2[gz], out.dims[2]); int l = p.strds[3] * trimIndex(s3 ? gw+p.offs[3] : ptr3[gw], out.dims[3]); // offset input and output pointers const T *src = (const T*)in.ptr + (gx*in.strides[0]+gy*in.strides[1]+ gz*in.strides[2]+gw*in.strides[3]); T *dst = (T*)out.ptr +(i+j+k+l); // set the output dst[0] = src[0]; } }
__global__ void lookupND(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nBBS0, int nBBS1) { int lx = threadIdx.x; int ly = threadIdx.y; int gz = blockIdx.x/nBBS0; int gw = blockIdx.y/nBBS1; int gx = blockDim.x * (blockIdx.x - gz*nBBS0) + lx; int gy = blockDim.y * (blockIdx.y - gw*nBBS1) + ly; const idx_t *idxPtr = (const idx_t*)indices.ptr; int i = in.strides[0]*(dim==0 ? trimIndex((int)idxPtr[gx], in.dims[0]): gx); int j = in.strides[1]*(dim==1 ? trimIndex((int)idxPtr[gy], in.dims[1]): gy); int k = in.strides[2]*(dim==2 ? trimIndex((int)idxPtr[gz], in.dims[2]): gz); int l = in.strides[3]*(dim==3 ? trimIndex((int)idxPtr[gw], in.dims[3]): gw); const in_t *inPtr = (const in_t*)in.ptr + (i+j+k+l); in_t *outPtr = (in_t*)out.ptr +(gx*out.strides[0]+gy*out.strides[1]+ gz*out.strides[2]+gw*out.strides[3]); if (gx<out.dims[0] && gy<out.dims[1] && gz<out.dims[2] && gw<out.dims[3]) { outPtr[0] = inPtr[0]; } }
void assign(Param<T> out, af::dim4 dDims, CParam<T> rhs, std::vector<bool> const isSeq, std::vector<af_seq> const seqs, std::vector< CParam<uint> > idxArrs) { af::dim4 pDims = out.dims(); // retrieve dimensions & strides for array to which rhs is being copied to af::dim4 dst_offsets = toOffset(seqs, dDims); af::dim4 dst_strides = toStride(seqs, dDims); // retrieve rhs array dimenesions & strides af::dim4 src_dims = rhs.dims(); af::dim4 src_strides = rhs.strides(); // declare pointers to af_array index data uint const * const ptr0 = idxArrs[0].get(); uint const * const ptr1 = idxArrs[1].get(); uint const * const ptr2 = idxArrs[2].get(); uint const * const ptr3 = idxArrs[3].get(); const T * src= rhs.get(); T * dst = out.get(); for(dim_t l=0; l<src_dims[3]; ++l) { dim_t src_loff = l*src_strides[3]; dim_t dst_lIdx = trimIndex(isSeq[3] ? l+dst_offsets[3] : ptr3[l], pDims[3]); dim_t dst_loff = dst_lIdx * dst_strides[3]; for(dim_t k=0; k<src_dims[2]; ++k) { dim_t src_koff = k*src_strides[2]; dim_t dst_kIdx = trimIndex(isSeq[2] ? k+dst_offsets[2] : ptr2[k], pDims[2]); dim_t dst_koff = dst_kIdx * dst_strides[2]; for(dim_t j=0; j<src_dims[1]; ++j) { dim_t src_joff = j*src_strides[1]; dim_t dst_jIdx = trimIndex(isSeq[1] ? j+dst_offsets[1] : ptr1[j], pDims[1]); dim_t dst_joff = dst_jIdx * dst_strides[1]; for(dim_t i=0; i<src_dims[0]; ++i) { dim_t src_ioff = i*src_strides[0]; dim_t src_idx = src_ioff + src_joff + src_koff + src_loff; dim_t dst_iIdx = trimIndex(isSeq[0] ? i+dst_offsets[0] : ptr0[i], pDims[0]); dim_t dst_ioff = dst_iIdx * dst_strides[0]; dim_t dst_idx = dst_ioff + dst_joff + dst_koff + dst_loff; dst[dst_idx] = src[src_idx]; } } } } }
void index(Array<T> out, Array<T> const in, std::vector<bool> const isSeq, std::vector<af_seq> const seqs, std::vector< Array<uint> > const idxArrs) { const af::dim4 iDims = in.dims(); const af::dim4 dDims = in.getDataDims(); const af::dim4 iOffs = toOffset(seqs, dDims); const af::dim4 iStrds = toStride(seqs, dDims); const af::dim4 oDims = out.dims(); const af::dim4 oStrides = out.strides(); const T *src = in.get(); T *dst = out.get(); const uint* ptr0 = idxArrs[0].get(); const uint* ptr1 = idxArrs[1].get(); const uint* ptr2 = idxArrs[2].get(); const uint* ptr3 = idxArrs[3].get(); for (dim_t l=0; l<oDims[3]; ++l) { dim_t lOff = l*oStrides[3]; dim_t inIdx3 = trimIndex(isSeq[3] ? l+iOffs[3] : ptr3[l], iDims[3]); dim_t inOff3 = inIdx3*iStrds[3]; for (dim_t k=0; k<oDims[2]; ++k) { dim_t kOff = k*oStrides[2]; dim_t inIdx2 = trimIndex(isSeq[2] ? k+iOffs[2] : ptr2[k], iDims[2]); dim_t inOff2 = inIdx2*iStrds[2]; for (dim_t j=0; j<oDims[1]; ++j) { dim_t jOff = j*oStrides[1]; dim_t inIdx1 = trimIndex(isSeq[1] ? j+iOffs[1] : ptr1[j], iDims[1]); dim_t inOff1 = inIdx1*iStrds[1]; for (dim_t i=0; i<oDims[0]; ++i) { dim_t iOff = i*oStrides[0]; dim_t inIdx0 = trimIndex(isSeq[0] ? i+iOffs[0] : ptr0[i], iDims[0]); dim_t inOff0 = inIdx0*iStrds[0]; dst[lOff+kOff+jOff+iOff] = src[inOff3+inOff2+inOff1+inOff0]; } } } } }
Array<in_t>* arrayIndex(const Array<in_t> &input, const Array<idx_t> &indices, const unsigned dim) { const dim4 iDims = input.dims(); const dim4 iStrides = input.strides(); const in_t *inPtr = input.get(); const idx_t *idxPtr = indices.get(); dim4 oDims(1); for (dim_type d=0; d<4; ++d) oDims[d] = (d==int(dim) ? indices.elements() : iDims[d]); Array<in_t>* out = createEmptyArray<in_t>(oDims); dim4 oStrides = out->strides(); in_t *outPtr = out->get(); for (dim_type l=0; l<oDims[3]; ++l) { dim_type iLOff = iStrides[3]*(dim==3 ? trimIndex((dim_type)idxPtr[l], iDims[3]): l); dim_type oLOff = l*oStrides[3]; for (dim_type k=0; k<oDims[2]; ++k) { dim_type iKOff = iStrides[2]*(dim==2 ? trimIndex((dim_type)idxPtr[k], iDims[2]): k); dim_type oKOff = k*oStrides[2]; for (dim_type j=0; j<oDims[1]; ++j) { dim_type iJOff = iStrides[1]*(dim==1 ? trimIndex((dim_type)idxPtr[j], iDims[1]): j); dim_type oJOff = j*oStrides[1]; for (dim_type i=0; i<oDims[0]; ++i) { dim_type iIOff = iStrides[0]*(dim==0 ? trimIndex((dim_type)idxPtr[i], iDims[0]): i); dim_type oIOff = i*oStrides[0]; outPtr[oLOff+oKOff+oJOff+oIOff] = inPtr[iLOff+iKOff+iJOff+iIOff]; } } } } return out; }
void lookup(Param<InT> out, CParam<InT> input, CParam<IndexT> indices, unsigned const dim) { const af::dim4 iDims = input.dims(); const af::dim4 oDims = out.dims(); const af::dim4 iStrides = input.strides(); const af::dim4 oStrides = out.strides(); const InT *inPtr = input.get(); const IndexT *idxPtr = indices.get(); InT *outPtr = out.get(); for (dim_t l = 0; l < oDims[3]; ++l) { dim_t iLOff = iStrides[3] * (dim == 3 ? trimIndex((dim_t)idxPtr[l], iDims[3]) : l); dim_t oLOff = l * oStrides[3]; for (dim_t k = 0; k < oDims[2]; ++k) { dim_t iKOff = iStrides[2] * (dim == 2 ? trimIndex((dim_t)idxPtr[k], iDims[2]) : k); dim_t oKOff = k * oStrides[2]; for (dim_t j = 0; j < oDims[1]; ++j) { dim_t iJOff = iStrides[1] * (dim == 1 ? trimIndex((dim_t)idxPtr[j], iDims[1]) : j); dim_t oJOff = j * oStrides[1]; for (dim_t i = 0; i < oDims[0]; ++i) { dim_t iIOff = iStrides[0] * (dim == 0 ? trimIndex((dim_t)idxPtr[i], iDims[0]) : i); dim_t oIOff = i * oStrides[0]; outPtr[oLOff + oKOff + oJOff + oIOff] = inPtr[iLOff + iKOff + iJOff + iIOff]; } } } } }
__global__ void lookup1D(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int vDim) { int idx = threadIdx.x + blockIdx.x * THREADS * THRD_LOAD; const in_t* inPtr = (const in_t*)in.ptr; const idx_t* idxPtr = (const idx_t*)indices.ptr; in_t* outPtr = (in_t*)out.ptr; int en = min(out.dims[vDim], idx + THRD_LOAD * THREADS); for (int oIdx = idx; oIdx < en; oIdx += THREADS) { int iIdx = trimIndex(idxPtr[oIdx], in.dims[vDim]); outPtr[oIdx] = inPtr[iIdx]; } }