void gatherU(double* localRows, double* U) { MPI_Status status; int j; for(j = 0; j < N; j++) { double* row = getLocalRow(j, localRows); if(row != NULL && PID != 0) { MPI_Send(row, N + 1, MPI_DOUBLE, 0, j, MPI_COMM_WORLD); } } // Gather data on root node. if(PID == 0) { for(j = 0; j < N; j++) { if(getRowPID(j) == 0) { memcpy(&U[j * (N + 1)], &localRows[getLocalRowID(j) * (N + 1)], (N + 1) * sizeof(double)); } else { MPI_Recv(&U[j * (N + 1)], N + 1, MPI_DOUBLE, getRowPID(j), j, MPI_COMM_WORLD, &status); } } } }
void forwardEliminationStep(double* localRows, float* pvtime) { // Fill up this array with the ids of local rows. Will be used in the FE computation to quickly // check if indices of local rows are bigger than pivot row. int localRowIds[MAX_N]; int i = 0; int j; for(j = 0; j < N; j++) { if(getRowPID(j) == PID) { localRowIds[i] = j; i++; } } if(i != RPP) printf("local row number %d != RPP %d!!\n", i, RPP); // This should never happen. for(j = 0; j < N; j++) { double rowj[MAX_N]; double* rowPtr = getLocalRow(j, localRows); int rowSource = getRowPID(j); // Pivot pivot(localRows, rowPtr, j, rowSource, localRowIds, pvtime); if(rowPtr != NULL) { memcpy(rowj, rowPtr, (N + 1) * sizeof(double)); } // Broadcast the row. MPI_Bcast(rowj, N + 1, MPI_DOUBLE, rowSource, MPI_COMM_WORLD); // Compute new coefficients for rows after j int i; for(i = 0; i < RPP; i++) { double* rowi = &localRows[i * (N + 1)]; double mult = -(rowi[j]/rowj[j]); //printf("NODE %d LOCAL ROW %d FIRST VAL %11.4e\n", PID, i, rowi[0]); if(localRowIds[i] > j) { int k; for(k = 0; k < N + 1; k++) { if(k == j) rowi[k] = 0; else rowi[k] = mult * rowj[k] + rowi[k]; //double v = mult * rowj[k] + rowi[k]; // Kind of hack but useful for visually debugging output matrix. //if(rowi[k] < 0.00000000000001) rowi[k] = 0; } } } } }
// Adds the locally stored rows of this matrix into the corresponding rows of
// dest.
//
// Rows are partitioned by id: this call only processes rows whose id satisfies
// (id % numThreads == tid), so calls with distinct tid values touch disjoint
// rows of dest.
//
// dest must be a CPU matrix holding the same number of elements
// (height * width) as this one; both conditions are CHECKed.
void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
                               size_t tid,
                               size_t numThreads) {
  CHECK(!dest.useGpu_);
  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);

  std::vector<unsigned int>& rowIds = indexDictHandle_->localIndices;
  for (size_t slot = 0; slot < rowIds.size(); ++slot) {
    uint32_t rowId = rowIds[slot];

    // Skip rows that belong to another partition.
    if (rowId % numThreads != tid) {
      continue;
    }

    dest.checkIndex(rowId);
    // `slot` addresses the local storage, `rowId` addresses dest.
    simd::addTo(dest.getRow(rowId), getLocalRow(slot), this->width_);
  }
}
void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, real learningRate, int currentTime, real decayRate, bool useL1, bool fini) { std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices; // t0 and value are vectors CHECK_EQ(t0.getSize(), this->height_); CHECK_EQ(value.width_, this->height_ * this->width_); if (decayRate == 0.0f) { if (fini) { return; } for (size_t i = 0; i < localIndices.size(); ++i) { real* g = getLocalRow(i); real* v = value.rowBuf(localIndices[i]); for (size_t j = 0; j < this->width_; ++j) { v[j] -= learningRate * g[j]; } } return; } // else if (useL1) { // L1 decay if (fini) { for (size_t i = 0; i < this->height_; ++i) { real* v = value.rowBuf(i); int* t = t0.getData() + i; if (t[0] < currentTime) { // W(t0) -> W(t+1) int tDiff = currentTime - t[0]; real delta = tDiff * learningRate * decayRate; simd::decayL1(v, v, delta, this->width_); } } return; } // else for (size_t i = 0; i < localIndices.size(); ++i) { real* g = getLocalRow(i); real* v = value.rowBuf(localIndices[i]); int* t = t0.getData() + localIndices[i]; if (t[0] < currentTime) { // W(t0) -> W(t) int tDiff = currentTime - t[0]; real delta = tDiff * learningRate * decayRate; simd::decayL1(v, v, delta, this->width_); } // W(t) -> W(t+1) for (size_t j = 0; j < this->width_; ++j) { v[j] -= learningRate * g[j]; } simd::decayL1(v, v, learningRate * decayRate, this->width_); // state update to t+1 t[0] = currentTime + 1; } } else { // L2 decay if (fini) { for (size_t i = 0; i < this->height_; ++i) { real* v = value.rowBuf(i); int* t = t0.getData() + i; if (t[0] < currentTime) { // W(t0) -> W(t+1) int tDiff = currentTime - t[0]; real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate); for (size_t j = 0; j < this->width_; ++j) { v[j] *= recip; } } } return; } // else real recipDecay = 1.0f / (1.0f + learningRate * decayRate); for (size_t i = 0; i < localIndices.size(); ++i) { real* g = getLocalRow(i); real* v = value.rowBuf(localIndices[i]); int* t = 
t0.getData() + localIndices[i]; if (t[0] < currentTime) { // W(t0) -> W(t) int tDiff = currentTime - t[0]; real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate); for (size_t j = 0; j < this->width_; ++j) { v[j] *= recip; } } // W(t) -> W(t+1) for (size_t j = 0; j < this->width_; ++j) { v[j] = recipDecay * (v[j] - learningRate * g[j]); } // state update to t+1 t[0] = currentTime + 1; } } }
backSubstitutionStep(double* localRows, double* localSolution) { MPI_Status status; int j; // This array is used to check wether we already sent a solution to a specific node, // to avoid redundant communication. char sendMask[MAX_N][MAX_P]; char recvMask[MAX_N]; memset(sendMask, 0, MAX_N * MAX_P); memset(recvMask, 0, MAX_N); // Stores values of x needed to compute local solutions. double x[MAX_N]; memset(x, 0, MAX_N * sizeof(double)); for(j = N - 1; j >= 0; j--) { // Is row j local? double* row = getLocalRow(j, localRows); if(row) { // Step 1: Receive all the values of x needed to solve this row int i; double sum = 0; for(i = N - 1; i > j; i--) { // If the row that computed x[i] is not local, do an MPI receive. // If the row is local, x[j] is already stored in the x array. int rpid = getRowPID(i); if(rpid != PID && !recvMask[i]) { //printf("N%d RECV X[%d] from N%d\n", PID, i, rpid); int err = MPI_Recv(&x[i], 1, MPI_DOUBLE, rpid, i, MPI_COMM_WORLD, &status); if(err != MPI_SUCCESS) printf("MPI_Recv ERROR on NODE %d\n", PID); // Mark x[i] as received recvMask[i] = 1; //else printf("N%d RECV X[%d] = %f from N%d\n", PID, i, x[i], rpid); } sum += x[i] * row[i]; } // Step 2: compute solution for x[j]; x[j] = 1.0 / row[j] * (row[N] - sum); //printf("X[%d] = %f\n", j + 1, x[j]); // Step 3: propagate x[j] to all rows before me for(i = j - 1; i >= 0; i--) { // If row i is local, we do not need to do an MPI send. The value of x will be // read from the local solution array. int rpid = getRowPID(i); if(rpid != PID && !sendMask[j][rpid]) { //printf("N%d SEND X[%d] to N%d\n", PID, j, rpid); MPI_Send(&x[j], 1, MPI_DOUBLE, rpid, j, MPI_COMM_WORLD); // Mark x[j] as already sent to node rpid sendMask[j][rpid] = 1; } } } } // Copy data to local solution array. memcpy(localSolution, x, MAX_N * sizeof(double)); }