void gatherU(double* localRows, double* U)
{
	MPI_Status status;
	int j;
	for(j = 0; j < N; j++)
	{
		double* row = getLocalRow(j, localRows);
		if(row != NULL && PID != 0) 
		{
			MPI_Send(row, N + 1, MPI_DOUBLE, 0, j, MPI_COMM_WORLD);
		}
	}
	
	// Gather data on root node.
	if(PID == 0)
	{
		for(j = 0; j < N; j++)
		{
			if(getRowPID(j) == 0)
			{
				memcpy(&U[j * (N + 1)], &localRows[getLocalRowID(j) * (N + 1)], (N + 1) * sizeof(double));
			}
			else
			{
				MPI_Recv(&U[j * (N + 1)], N + 1, MPI_DOUBLE, getRowPID(j), j, MPI_COMM_WORLD, &status);
			}
		}
	}
}
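The helpers getRowPID, getLocalRowID and getLocalRow are not shown in this example. Below is a minimal sketch of what they might look like, assuming a cyclic (round-robin) distribution of the N rows over P processes; N, P, PID and RPP are taken to be globals set during initialization, and the actual decomposition used by the original program may differ.

/* Hypothetical sketch of the row-distribution helpers used above, assuming a
   cyclic (round-robin) distribution of the N rows over P processes. */
int getRowPID(int row)
{
	// Rank that owns global row `row` under the assumed cyclic distribution.
	return row % P;
}

int getLocalRowID(int row)
{
	// Position of global row `row` inside the owner's localRows buffer.
	return row / P;
}

double* getLocalRow(int row, double* localRows)
{
	// Pointer to the local copy of global row `row`, or NULL if another
	// process owns it. Each row stores N + 1 doubles (coefficients + RHS).
	if(getRowPID(row) != PID) return NULL;
	return &localRows[getLocalRowID(row) * (N + 1)];
}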
void forwardEliminationStep(double* localRows, float* pvtime)
{
	// Fill this array with the global ids of the local rows. It is used in the FE computation
	// to quickly check whether a local row's index is greater than the pivot row's.
	int localRowIds[MAX_N];
	int i = 0;
	int j;
	for(j = 0; j < N; j++)
	{
		if(getRowPID(j) == PID) 
		{
			localRowIds[i] = j;	
			i++;
		}
	}
	if(i != RPP) printf("local row number %d != RPP %d!!\n", i, RPP); // This should never happen.
	
	for(j = 0; j < N; j++)
	{
		double rowj[MAX_N + 1];	// The pivot row holds N + 1 entries (coefficients + RHS).
		double* rowPtr = getLocalRow(j, localRows);
		
		int rowSource = getRowPID(j);
		
		// Pivot
		pivot(localRows, rowPtr, j, rowSource, localRowIds, pvtime);
		
		if(rowPtr != NULL)
		{
			memcpy(rowj, rowPtr, (N + 1) * sizeof(double));
		}
		
		// Broadcast the row.
		MPI_Bcast(rowj, N + 1, MPI_DOUBLE, rowSource, MPI_COMM_WORLD);
		
		// Compute new coefficients for rows after j
		int i;
		for(i = 0; i < RPP; i++)
		{
			double* rowi = &localRows[i * (N + 1)];
			//printf("NODE %d LOCAL ROW %d FIRST VAL %11.4e\n", PID, i, rowi[0]);
			
			// Only rows below the pivot row are updated.
			if(localRowIds[i] > j)
			{
				// Multiplier that zeroes out column j of this row.
				double mult = -(rowi[j]/rowj[j]);
				int k;
				for(k = 0; k < N + 1; k++)
				{
					if(k == j) rowi[k] = 0;
					else rowi[k] = mult * rowj[k] + rowi[k];
					//double v = mult * rowj[k] + rowi[k];
					
					// A bit of a hack, but useful for visually debugging the output matrix.
					//if(rowi[k] < 0.00000000000001) rowi[k] = 0;
				}
			}
		}
	}
}
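In matrix terms, the inner loops above apply the standard forward-elimination update. For every local row $i$ whose global index is greater than the pivot index $j$ (column $N$ holds the right-hand side):

$$m_{ij} = -\frac{a_{ij}}{a_{jj}}, \qquad a_{ik} \leftarrow a_{ik} + m_{ij}\,a_{jk} \quad (k = 0,\dots,N,\ k \neq j), \qquad a_{ij} \leftarrow 0.$$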
Example no. 3
void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
                               size_t tid,
                               size_t numThreads) {
  CHECK(!dest.useGpu_);
  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);

  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  for (size_t i = 0; i < localIndices.size(); ++i) {
    uint32_t id = localIndices[i];
    if (id % numThreads == tid) {
      dest.checkIndex(id);
      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
    }
  }
}
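The id % numThreads == tid test gives each worker thread an interleaved slice of the sparse rows, so no two threads ever write to the same destination row. A stand-alone illustration of this partitioning scheme (not PaddlePaddle API; the ids below are made up):

#include <stdio.h>

int main(void)
{
	/* Hypothetical row ids, standing in for localIndices. */
	unsigned int ids[] = {3, 7, 8, 12, 13, 21};
	unsigned int numThreads = 4;
	unsigned int tid, i;
	for(tid = 0; tid < numThreads; tid++)
	{
		printf("thread %u handles ids:", tid);
		for(i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
			if(ids[i] % numThreads == tid) printf(" %u", ids[i]);
		printf("\n");
	}
	return 0;
}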
Example no. 4
void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value,
                                   IVector& t0,
                                   real learningRate,
                                   int currentTime,
                                   real decayRate,
                                   bool useL1,
                                   bool fini) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;

  // t0 and value are vectors
  CHECK_EQ(t0.getSize(), this->height_);
  CHECK_EQ(value.width_, this->height_ * this->width_);

  if (decayRate == 0.0f) {
    if (fini) {
      return;
    }

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] -= learningRate * g[j];
      }
    }
    return;
  }  // else

  if (useL1) {  // L1 decay
    if (fini) {
      for (size_t i = 0; i < this->height_; ++i) {
        real* v = value.rowBuf(i);
        int* t = t0.getData() + i;
        if (t[0] < currentTime) {
          // W(t0) -> W(t+1)
          int tDiff = currentTime - t[0];
          real delta = tDiff * learningRate * decayRate;
          simd::decayL1(v, v, delta, this->width_);
        }
      }
      return;
    }  // else

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      int* t = t0.getData() + localIndices[i];
      if (t[0] < currentTime) {
        // W(t0) -> W(t)
        int tDiff = currentTime - t[0];
        real delta = tDiff * learningRate * decayRate;
        simd::decayL1(v, v, delta, this->width_);
      }

      // W(t) -> W(t+1)
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] -= learningRate * g[j];
      }
      simd::decayL1(v, v, learningRate * decayRate, this->width_);

      // state update to t+1
      t[0] = currentTime + 1;
    }

  } else {  // L2 decay
    if (fini) {
      for (size_t i = 0; i < this->height_; ++i) {
        real* v = value.rowBuf(i);
        int* t = t0.getData() + i;
        if (t[0] < currentTime) {
          // W(t0) -> W(t+1)
          int tDiff = currentTime - t[0];
          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
          for (size_t j = 0; j < this->width_; ++j) {
            v[j] *= recip;
          }
        }
      }
      return;
    }  // else

    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);

    for (size_t i = 0; i < localIndices.size(); ++i) {
      real* g = getLocalRow(i);
      real* v = value.rowBuf(localIndices[i]);
      int* t = t0.getData() + localIndices[i];
      if (t[0] < currentTime) {
        // W(t0) -> W(t)
        int tDiff = currentTime - t[0];
        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
        for (size_t j = 0; j < this->width_; ++j) {
          v[j] *= recip;
        }
      }

      // W(t) -> W(t+1)
      for (size_t j = 0; j < this->width_; ++j) {
        v[j] = recipDecay * (v[j] - learningRate * g[j]);
      }

      // state update to t+1
      t[0] = currentTime + 1;
    }
  }
}
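As a summary of the branches above (a sketch, assuming simd::decayL1 performs the usual soft-threshold shrinkage by its delta argument): with learning rate $\eta$, decay rate $\lambda$, per-row gradient $g$, and $\Delta t$ the number of steps since the row was last touched (currentTime - t0),

\begin{align*}
\text{no decay:}\quad & w \leftarrow w - \eta g,\\
\text{L1 decay:}\quad & w \leftarrow \operatorname{sgn}(w)\,\max(|w| - \Delta t\,\eta\lambda,\ 0), \text{ then } w \leftarrow w - \eta g, \text{ then shrink again with threshold } \eta\lambda,\\
\text{L2 decay:}\quad & w \leftarrow \frac{w}{1 + \Delta t\,\eta\lambda}, \text{ then } w \leftarrow \frac{w - \eta g}{1 + \eta\lambda}.
\end{align*}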
void backSubstitutionStep(double* localRows, double* localSolution)
{
	MPI_Status status;
	int j;
	
	// This array is used to check whether we have already sent a solution to a specific node,
	// to avoid redundant communication.
	char sendMask[MAX_N][MAX_P];
	char recvMask[MAX_N];
	
	memset(sendMask, 0, MAX_N * MAX_P);
	memset(recvMask, 0, MAX_N);
	
	// Stores values of x needed to compute local solutions.
	double x[MAX_N];
	memset(x, 0, MAX_N * sizeof(double));
	
	for(j = N - 1; j >= 0; j--)
	{
		// Is row j local?
		double* row = getLocalRow(j, localRows);
		if(row)
		{
			// Step 1: Receive all the values of x needed to solve this row
			int i;
			double sum = 0;
			for(i = N - 1; i > j; i--)
			{
				// If the row that computed x[i] is not local, do an MPI receive.
				// If the row is local, x[i] is already stored in the x array.
				int rpid = getRowPID(i);
				if(rpid != PID && !recvMask[i])	
				{
					//printf("N%d RECV X[%d] from N%d\n", PID, i, rpid);
					int err = MPI_Recv(&x[i], 1, MPI_DOUBLE, rpid, i, MPI_COMM_WORLD, &status);
					if(err != MPI_SUCCESS) printf("MPI_Recv ERROR on NODE %d\n", PID);
					
					// Mark x[i] as received
					recvMask[i] = 1;
					//else printf("N%d RECV X[%d] = %f from N%d\n", PID, i, x[i], rpid);
				}
				sum += x[i] * row[i];
			}
			// Step 2: compute solution for x[j];
			x[j] = 1.0 / row[j] * (row[N] - sum);
			//printf("X[%d] = %f\n", j + 1, x[j]);
			
			// Step 3: propagate x[j] to all rows before me
			for(i = j - 1; i >= 0; i--)
			{
				// If row i is local, we do not need to do an MPI send. The value of x will be 
				// read from the local solution array.
				int rpid = getRowPID(i);
				if(rpid != PID && !sendMask[j][rpid]) 
				{
					//printf("N%d SEND X[%d] to N%d\n", PID, j, rpid);
					MPI_Send(&x[j], 1, MPI_DOUBLE, rpid, j, MPI_COMM_WORLD);
					
					// Mark x[j] as already sent to node rpid
					sendMask[j][rpid] = 1;
				}
			}			
		}
	}
	
	// Copy data to local solution array.
	memcpy(localSolution, x, MAX_N * sizeof(double));
}
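The value computed for each local row is the usual back-substitution formula, with column N of the row holding the right-hand side $b_j$:

$$x_j = \frac{1}{u_{jj}}\left(b_j - \sum_{i=j+1}^{N-1} u_{ji}\,x_i\right), \qquad j = N-1,\dots,0.$$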