Example #1
/*
 * Based on the implementation of THTensor_(indexCopy) in torch7
 */
static void THCudaTensor_indexCopy(THCudaTensor *tensor, int dim, THLongTensor *index, THCudaTensor *src)
{
  long i, numel;
  THCudaTensor *tSlice, *sSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < src->nDimension,4,"Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i=0; i<numel; i++)
  {
    if (tensor->nDimension > 1 )
    {
      tSlice = THCudaTensor_new();
      sSlice = THCudaTensor_new();
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_select(sSlice, src, dim, i);
      THCudaTensor_copy(tSlice, sSlice);
      THCudaTensor_free(tSlice);
      THCudaTensor_free(sSlice);
    }
    else
    {
      // Copying a single float directly between two device addresses is faster
      // than transferring it to host memory and copying it back to the device.
      THCudaCheck(cudaMemcpy(tensor->storage->data + tensor->storageOffset + index_data[i]-1,
        src->storage->data + src->storageOffset + i, sizeof(float), cudaMemcpyDeviceToDevice));
    }
  }
  THLongTensor_free(index);
}
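The loop above defines the semantics of indexCopy: the i-th slice of src along dim is copied into the index[i]-th slice of tensor, with 1-based indices as usual in torch. Below is a minimal standalone C sketch of the same semantics on plain row-major host arrays; the names (index_copy_rows and the toy data) are hypothetical and none of the TH/CUDA APIs are used.

#include <stdio.h>

/* Plain-C sketch of the indexCopy semantics on row-major 2-D arrays:
 * dst[index[i]-1][:] = src[i][:] for each i (indices are 1-based, as in torch). */
static void index_copy_rows(float *dst, const float *src, const long *index,
                            long nindex, long ncols)
{
  for (long i = 0; i < nindex; i++) {
    const long row = index[i] - 1;            /* convert 1-based index to 0-based row */
    for (long c = 0; c < ncols; c++)
      dst[row * ncols + c] = src[i * ncols + c];
  }
}

int main(void)
{
  float dst[4][3] = {{0}};                    /* 4 x 3 destination, initially zero */
  float src[2][3] = {{1, 2, 3}, {4, 5, 6}};   /* two rows to scatter */
  long index[2] = {3, 1};                     /* 1-based: src row 0 -> dst row 3, src row 1 -> dst row 1 */

  index_copy_rows(&dst[0][0], &src[0][0], index, 2, 3);
  for (int r = 0; r < 4; r++)
    printf("%g %g %g\n", dst[r][0], dst[r][1], dst[r][2]);
  return 0;
}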
Example #2
/*
 * Based on the implementation of THTensor_(indexCopy) in torch7
 */
static void THCudaTensor_indexFill(THCudaTensor *tensor, int dim, THLongTensor *index, float val)
{
  long i, numel;
  THCudaTensor *tSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < tensor->nDimension,4,"Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);
  
  for (i=0; i<numel; i++)
  {
    if (tensor->nDimension > 1 )
    {
      // create a temporary tensor header
      tSlice = THCudaTensor_new();
      // make tSlice a view of the selected slice of tensor (it shares tensor's storage)
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_fill(tSlice, val);
      THCudaTensor_free(tSlice);
    }
    else
    {
      THCudaTensor_set1d(tensor,index_data[i]-1,val);
    }
  }
  THLongTensor_free(index);
}
Example #3
// Stitch takes args.
// pano - a torch tensor in RGB with dims (3 x height x width)
//
// offset_map - a torch tensor with the same h and w as pano, storing offsets
// and image indices.  The two feature dimensions are:
//  -- image number (starting at 1) and
//  -- flat offset into the image tensor
// nimages - the number of images used to make the panorama
// image1, ..., imagen - the images, each as a torch tensor
static int Lstitch_(stitch)(lua_State *L) {
  int nargs = lua_gettop(L);
  THTensor *pano =
    (THTensor *)luaT_checkudata(L, 1, torch_(Tensor_id));
  THLongTensor *offset_map =
    (THLongTensor *)luaT_checkudata(L, 2, luaT_checktypename2id(L, "torch.LongTensor"));
  THTensor *images[MAXIMAGES];
  int i = 0;
  long npixels = offset_map->size[1]*offset_map->size[2];
  real *pano_pt   = THTensor_(data)(pano); 
  long *offset_pt = THLongTensor_data(offset_map);
  real *images_pt[MAXIMAGES];
  long images_npixels[MAXIMAGES];
  long images_Goff[MAXIMAGES];
  long images_Boff[MAXIMAGES];
  
  real * panoR = pano_pt;
  real * panoG = pano_pt +    pano->stride[0];
  real * panoB = pano_pt + (2*pano->stride[0]);
  real * curImg_pt  = NULL;
  long unsigned int XYoffset = 0;
  long * offImg      = offset_pt;
  long * offIndexXY  = offset_pt + offset_map->stride[0];
  int nimages = 0;
  long cImgOff = 0;
  /* finish processing the input image tensors: */
  /* either a table of images is passed, */
  /* or a count followed by a variable number of image args */
  if (nargs == 3){
    if (lua_istable(L,3)){
      nimages = lua_objlen (L, 3);
      /* table is in the stack at index 3 */
      lua_pushnil(L);  /* first key */
      i = 0;
      while (lua_next(L, 3) != 0) {
        /* 'key' (at index -2) and 'value' (at index -1) */
        images[i]    =
          (THTensor *)luaT_checkudata(L, -1, torch_(Tensor_id));
        images_npixels[i] = images[i]->size[1]*images[i]->size[2];
        images_Goff[i] = images[i]->stride[0];
        images_Boff[i] = 2*images[i]->stride[0];
        images_pt[i] = THTensor_(data)(images[i]);
        /* removes 'value'; keeps 'key' for next iteration */
        lua_pop(L, 1);
        i = i+1;
      }
    } else {
      lua_pushstring(L, "with 3 args the last argument must be a table");
      lua_error(L);
    }
  } else {
    nimages = lua_tonumber(L,3);
    for(i=0;i<nimages;i++){
      images[i]    =
        (THTensor *)luaT_checkudata(L, i+4, torch_(Tensor_id));
      images_npixels[i] = images[i]->size[1]*images[i]->size[2];
      images_Goff[i]    = images[i]->stride[0];
      images_Boff[i]    = 2*images[i]->stride[0];
      images_pt[i]      = THTensor_(data)(images[i]);
    }
  }
  for(i=0;i<npixels;i++){
    cImgOff   = (long unsigned int)*offImg - 1;
    curImg_pt = images_pt[cImgOff];
    if ((*offIndexXY > 0) &&
        (*offIndexXY < images_npixels[cImgOff])){
      XYoffset  =  (long unsigned int)*offIndexXY;
      *panoR   = curImg_pt[XYoffset];
      *panoG   = curImg_pt[XYoffset + images_Goff[cImgOff]];
      *panoB   = curImg_pt[XYoffset + images_Boff[cImgOff]];
    }
    panoR++;
    panoG++;
    panoB++;
    offImg++;
    offIndexXY++;
  }
  return 0;
}
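For every panorama pixel the loop above reads an image number and a flat offset from offset_map and gathers R, G and B from the selected planar image buffer (the green and blue planes sit at fixed strides from the red plane). Here is a small self-contained C sketch of that per-pixel gather; the data, sizes and names are made up for illustration and nothing below uses the Lua/TH APIs.

#include <stdio.h>

/* Sketch of the per-pixel gather performed by stitch: each output pixel stores
 * an image number (1-based) and a flat offset into that image's planar RGB
 * buffer; R, G and B live at offset, offset + Goff and offset + Boff. */
int main(void)
{
  /* one 2x2 planar RGB image: 4 red values, then 4 green, then 4 blue */
  float image0[12] = {1, 2, 3, 4,  10, 20, 30, 40,  100, 200, 300, 400};
  float *images[1] = {image0};
  long  goff = 4, boff = 8, npix = 4;

  /* a 1x2 "panorama": both pixels come from image 1, at flat offsets 2 and 3 */
  long  img_id[2]  = {1, 1};
  long  img_off[2] = {2, 3};
  float panoR[2] = {0}, panoG[2] = {0}, panoB[2] = {0};

  for (long i = 0; i < 2; i++) {
    const float *img = images[img_id[i] - 1];   /* image number is 1-based */
    long off = img_off[i];
    if (off > 0 && off < npix) {                /* same bounds check as the loop above */
      panoR[i] = img[off];
      panoG[i] = img[off + goff];
      panoB[i] = img[off + boff];
    }
  }
  printf("pixel 0: R=%g G=%g B=%g\n", panoR[0], panoG[0], panoB[0]);
  printf("pixel 1: R=%g G=%g B=%g\n", panoR[1], panoG[1], panoB[1]);
  return 0;
}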
Example #4
static void load_array_to_lua(lua_State *L, cnpy::NpyArray& arr){
	int ndims = arr.shape.size();

	//based on code from mattorch with stride fix
	int k;
	THLongStorage *size = THLongStorage_newWithSize(ndims);
	THLongStorage *stride = THLongStorage_newWithSize(ndims);
	for (k=0; k<ndims; k++) {
		THLongStorage_set(size, k, arr.shape[k]);
		if (k > 0)
			THLongStorage_set(stride, ndims-k-1, arr.shape[ndims-k]*THLongStorage_get(stride,ndims-k));
		else
			THLongStorage_set(stride, ndims-k-1, 1);
	}

	void * tensorDataPtr = NULL;
	size_t numBytes = 0;

	if ( arr.arrayType == 'f' ){ // float32/64
		if ( arr.word_size == 4 ){ // float32
			THFloatTensor *tensor = THFloatTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THFloatTensor_data(tensor));
			numBytes = THFloatTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.FloatTensor"));
		}else if ( arr.word_size == 8 ){ // float64
			THDoubleTensor *tensor = THDoubleTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THDoubleTensor_data(tensor));
			numBytes = THDoubleTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.DoubleTensor"));
		}
	}else if ( arr.arrayType == 'i' || arr.arrayType == 'u' ){ // signed/unsigned ints (torch's only unsigned type is byte)
		if ( arr.word_size == 1 ){ // int8
			THByteTensor *tensor = THByteTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THByteTensor_data(tensor));
			numBytes = THByteTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ByteTensor"));
		}else if ( arr.word_size == 2 ){ // int16
			THShortTensor *tensor = THShortTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THShortTensor_data(tensor));
			numBytes = THShortTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ShortTensor"));
		}else if ( arr.word_size == 4 ){ // int32
			THIntTensor *tensor = THIntTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THIntTensor_data(tensor));
			numBytes = THIntTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.IntTensor"));
		}else if ( arr.word_size == 8 ){ // int64
			THLongTensor *tensor = THLongTensor_newWithSize(size, stride);
			tensorDataPtr = (void *)(THLongTensor_data(tensor));
			numBytes = THLongTensor_nElement(tensor) * arr.word_size;
			luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.LongTensor"));
		}
	}else{
		printf("array type unsupported\n");
		throw std::runtime_error("unsupported data type");
	}

	// now copy the data
	assert(tensorDataPtr);
	memcpy(tensorDataPtr, (void *)(arr.data<void>()), numBytes);

	// newWithSize copies the size/stride values, so release the temporary storages
	THLongStorage_free(size);
	THLongStorage_free(stride);
}
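The loop near the top of load_array_to_lua derives row-major strides from the numpy shape: the innermost stride is 1, and each outer stride is the next dimension's size times the next stride. A standalone sketch of just that computation, using a hypothetical 2 x 3 x 4 shape and no TH dependency:

#include <stdio.h>

/* Sketch of the row-major stride computation used above:
 * stride[ndims-1] = 1 and stride[k] = shape[k+1] * stride[k+1] for k < ndims-1. */
int main(void)
{
  long shape[3] = {2, 3, 4};   /* e.g. a 2 x 3 x 4 array */
  long stride[3];
  int ndims = 3;

  for (int k = 0; k < ndims; k++) {
    if (k > 0)
      stride[ndims - k - 1] = shape[ndims - k] * stride[ndims - k];
    else
      stride[ndims - k - 1] = 1;
  }
  /* prints: strides = 12 4 1 */
  printf("strides = %ld %ld %ld\n", stride[0], stride[1], stride[2]);
  return 0;
}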
Example #5
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize)
      {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++)
        {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      }
      else
      {
        i = 0;
        for(;i < end-4; i += 4)
        {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++)
        {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  }
  else {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize)
        {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(;k < outDim-4; k += 4)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
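The batch loop above locates each sample's keys and values through the cumulative sizes: sample j starts at cumSizesData[j-1] (0 for the first sample) and spans sizesData[j] entries. A self-contained sketch of that bookkeeping on plain arrays, with toy sizes and no TH calls:

#include <stdio.h>

/* Sketch of the offset bookkeeping used in the batch loops above: given
 * per-sample sizes and their cumulative sum, sample j starts at cum[j-1]
 * (0 for the first sample) and covers sizes[j] flattened entries. */
int main(void)
{
  long sizes[4] = {3, 1, 4, 2};   /* number of (key, value) pairs per sample */
  long cum[4];
  long total = 0;

  for (int j = 0; j < 4; j++) {   /* equivalent of THLongTensor_cumsum */
    total += sizes[j];
    cum[j] = total;
  }
  for (int j = 0; j < 4; j++) {
    long offset = (j == 0) ? 0 : cum[j - 1];
    printf("sample %d: entries [%ld, %ld)\n", j, offset, offset + sizes[j]);
  }
  return 0;
}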
Example #6
void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  int64_t* keysData = THLongTensor_data(keys);
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
            weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
            offset++;
          }
        }

        offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-2] = 0;
            offset++;
          }
        }
    }
    else
    {
      if (weightDecay)
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      }
      else
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  }
  else {
    int64_t offset = 0;
    for (j = 0; j < batchSize; j++)
    {
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        scalar_t wd = weightDecay;

        // Max normalize case
        if (maxNormalize)
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++)
          {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        }
        else
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense. */
        if (weightDecay)
        {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          }
          else
          {
            for (k=0; k < outDim; k++)
            {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
        {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        }
        else
        {
          for (k=0; k < outDim; k++)
          {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }

    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, the commented code
     * in IndexLinear:zeroGradParameters() should be uncommented as well */

    /*
    if (maxNormalize)
    {
      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++)
        {
          scalar_t val = valuesData[offset] * scale;
          scalar_t wd = weightDecay;

          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}
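The comment "We do sparse weight decay" above refers to decaying only the weight rows addressed by the batch's keys, rather than the whole matrix. A minimal standalone sketch of that idea on a small row-major weight matrix; the shapes, key values and decay factor are made up for illustration:

#include <stdio.h>

/* Minimal sketch of sparse weight decay: only the rows selected by the batch's
 * keys are shrunk by (1 - wd); untouched rows are left as-is. */
int main(void)
{
  float weight[4][2] = {{1, 1}, {2, 2}, {3, 3}, {4, 4}};  /* 4 keys x outDim = 2 */
  long  keys[2] = {0, 2};                                 /* rows touched by this batch */
  float wd = 0.1f;

  for (int i = 0; i < 2; i++) {
    float *row = weight[keys[i]];
    for (int k = 0; k < 2; k++)
      row[k] -= wd * row[k];          /* same update as lweightData[k] -= wd * lweightData[k] */
  }
  for (int r = 0; r < 4; r++)
    printf("row %d: %g %g\n", r, weight[r][0], weight[r][1]);
  return 0;
}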
Example #7
void THNN_(IndexLinear_updateOutput)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *normalizedValues,
          int  train)
{
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t* sizesData = THLongTensor_data(sizes);
  int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes);

  /* Define/resize the normalized values tensor if maxNormalize is > 0 */
  scalar_t* normalizedValuesData = NULL;
  if (maxNormalize)
  {
    THTensor_(resize1d)(normalizedValues, keysSize);
    normalizedValuesData = normalizedValues->data<scalar_t>();
  }

  /* Resize the output */
  THTensor_(resize2d)(output, batchSize, outDim);

  /* Access the storage data/strides */
  scalar_t* outputData = output->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations. */
  if (outDim == 1)
  {
    THVector_(fill)(outputData, *biasData, batchSize);
    if (maxNormalize)
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end) {
        for (auto j = start; j < end; j++)
        {
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;
          scalar_t absVal = 0;
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];

          for (auto i = 0; i < sizesData[j]; i++)
          {
            int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
            absVal = fabs(valuesData[offset]);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * This is used at update time.
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }
            normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
            val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
            offset++;
          }
          *loutputData += val;
        }
      };
      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
    else
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end) {
        for (auto j = start; j < end; j++)
        {
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;

          for (auto i = 0; i < sizesData[j]; i++)
          {
            val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
            offset++;
          }
          *loutputData += val;
        }
      };
      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
  }
  else {
    auto loop = [&](int64_t start, int64_t end) {
      for (auto j = start; j < end; j++)
      {
        int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];
        scalar_t val;
        scalar_t* loutputData = outputData + j*outDim;
        scalar_t* lweightData = weightData;
        memcpy(loutputData, biasData, outDim*sizeof(scalar_t));
        for (auto i = 0; i < sizesData[j]; i++)
        {
          int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
          if (maxNormalize)
          {
            val = valuesData[offset];
            scalar_t absVal = fabs(val);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * The commented section thereafter is just an example of what can be done:
               *
               *```
               * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
               * scalar_t alpha = 1;
               * scalar_t beta = 0.01;
               * scalar_t gamma = 1 - 0.000001;
               * scalar_t l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
               * l = gamma*l;
               * weightData[woffset+2] = (alpha-beta)*l + beta;
               * ```
               *
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }

            /* Normalize + Clamp */
            val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
            normalizedValuesData[offset] = val;

            lweightData = weightData + woffset + maxNormalize;
          }
          else
          {
            val = valuesData[offset];
            lweightData = weightData + woffset;
          }
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
          }
          else
          {
            for (auto k = 0; k < outDim; k++)
            {
              loutputData[k] += lweightData[k] * val;
            }
          }
          offset++;
        }
      }
    };
    if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
      at::parallel_for(0, batchSize, 1, loop);
    } else {
      loop(0, batchSize);
    }

  }
  return;
}
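In the maxNormalize path above, each key keeps a running maximum absolute value (weightData[woffset]) and its reciprocal (weightData[woffset+1]); incoming values are scaled by the reciprocal and saturated to +/-1 when they exceed the stored maximum, and the real code additionally adds a per-key bias term (weightData[woffset+3]). A standalone sketch of just the normalize-and-clamp step, with hypothetical names and without the bias term:

#include <stdio.h>
#include <math.h>

#define SIGN(x) ((x) >= 0 ? 1.0f : -1.0f)

/* Sketch of the normalize + clamp step from the maxNormalize path: keep a
 * running max of |value| per key, scale values by 1/max and saturate at +/-1
 * when |value| exceeds the stored max. */
static float normalize_clamp(float val, float *running_max, float *inv_max, int train)
{
  float abs_val = fabsf(val);
  if (train && abs_val > *running_max) {   /* update the running statistics at train time */
    *running_max = abs_val;
    *inv_max = 1.0f / abs_val;
  }
  return (abs_val > *running_max) ? SIGN(val) : val * (*inv_max);
}

int main(void)
{
  float running_max = 0.0f, inv_max = 0.0f;
  float inputs[3] = {2.0f, -1.0f, 5.0f};
  for (int i = 0; i < 3; i++)   /* prints 1, -0.5, 1 */
    printf("%g -> %g\n", inputs[i], normalize_clamp(inputs[i], &running_max, &inv_max, 1));
  return 0;
}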
Example #8
void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          int64_t keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  /* Retrieve all the dimensions of the problem */
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* gradBiasData = gradBias->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j, k;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  }
  else
  {
    for (j = 0; j < keysSize; j++)
    {
      scalar_t lr = learningRate;
      scalar_t wd = weightDecay;
      scalar_t* lweightData;
      int64_t woffset = weightStride0*(keysData[j] + keysOffset);
      scalar_t* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize)
      {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++)
        {
            lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      }
      else
      {
        lweightData = weightData + woffset;
      }

      /* We do sparse weight decay.
       * We think it makes more sense. */
      if (weightDecay)
      {
        for (k=0; k < outDim; k++)
        {
            lweightData[k] -= lweightData[k]*wd;
        }
      }

      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
      {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      }
      else
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}
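Throughout these kernels the dense inner update w[k] -= lr * g[k] is dispatched to THBlas_(axpy) when outDim exceeds THNN_SPARSE_OUTDIM_THRESHOLD and otherwise written as a 4-way unrolled loop with a scalar tail. A standalone sketch of that unrolling pattern on plain float arrays, with a hypothetical function name and no BLAS dependency:

#include <stdio.h>

/* Sketch of the 4-way unrolled update used above: w[k] -= lr * g[k], four
 * elements per iteration plus a scalar tail. Behaves like axpy(n, -lr, g, 1, w, 1). */
static void axpy_neg(float *w, const float *g, float lr, long n)
{
  long k = 0;
  for (; k < n - 4; k += 4) {   /* unrolled body, same bound as the loops above */
    w[k]     -= lr * g[k];
    w[k + 1] -= lr * g[k + 1];
    w[k + 2] -= lr * g[k + 2];
    w[k + 3] -= lr * g[k + 3];
  }
  for (; k < n; k++)            /* scalar tail handles the remaining elements */
    w[k] -= lr * g[k];
}

int main(void)
{
  float w[6] = {1, 1, 1, 1, 1, 1};
  float g[6] = {1, 2, 3, 4, 5, 6};
  axpy_neg(w, g, 0.1f, 6);
  for (int k = 0; k < 6; k++) printf("%g ", w[k]);   /* 0.9 0.8 0.7 0.6 0.5 0.4 */
  printf("\n");
  return 0;
}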