// Note: the original declared defaulted parameters (vx=1, tlx=0, trx=n-1)
// before non-defaulted ones, which is ill-formed C++; the defaults are
// removed, so call as sum_x(1, 0, n-1, lx, rx, ly, ry).
int sum_x(int vx, int tlx, int trx, int lx, int rx, int ly, int ry) {
    if (lx > rx)
        return 0;
    if (lx == tlx && trx == rx)
        return sum_y(vx, 1, 0, m-1, ly, ry);
    int tmx = (tlx + trx) / 2;
    return sum_x(vx*2, tlx, tmx, lx, min(rx,tmx), ly, ry)
         + sum_x(vx*2+1, tmx+1, trx, max(lx,tmx+1), rx, ly, ry);
}
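// --- Hedged sketch, not from the source: sum_x() above recurses into a
// --- companion sum_y() that queries the inner (y-dimension) trees of a 2D
// --- segment tree. Assuming a global tree array t[][], one plausible shape:
int sum_y(int vx, int vy, int tly, int try_, int ly, int ry) {
    if (ly > ry)
        return 0;
    if (ly == tly && try_ == ry)
        return t[vx][vy]; // assumed global tree storage
    int tmy = (tly + try_) / 2;
    return sum_y(vx, vy*2, tly, tmy, ly, min(ry, tmy))
         + sum_y(vx, vy*2+1, tmy+1, try_, max(ly, tmy+1), ry);
}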
__device__ inline
void contributeResidualJacobian( const unsigned ielem ) const
{
  extern __shared__ WorkSpace work_data[] ;

  sum_x_clear(); // Make sure summation scratch is zero

  // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
  // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

  const unsigned iInt = threadIdx.x ;

  if ( iInt < IntegrationCount ) {

    const double value_at_integ = work_data->value_at_integ[ iInt ] ;
    const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
    const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
    const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;

    const float detJweight         = work_data->detJweight[ iInt ] ;
    const float coeff_K_detJweight = coeff_K * detJweight ;

    for ( unsigned iRow = threadIdx.y ; iRow < FunctionCount ; iRow += blockDim.y ) {

      const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
      const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
      const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
      const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;

      const double res_del = dpsidx_row * gradx_at_integ +
                             dpsidy_row * grady_at_integ +
                             dpsidz_row * gradz_at_integ ;

      const double res_val     = value_at_integ * value_at_integ * value_row ;
      const double jac_val_row = 2 * value_at_integ * value_row ;

      sum_x( element_vectors( ielem , iRow ) , res_del + res_val );

      for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {

        const float jac_del =
          dpsidx_row * work_data->dpsidx[iCol][iInt] +
          dpsidy_row * work_data->dpsidy[iCol][iInt] +
          dpsidz_row * work_data->dpsidz[iCol][iInt] ;

        const double jac_val =
          jac_val_row * elem_data.values[ iInt ][ iCol ] ;

        sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
      }
    }
  }

  __syncthreads(); // All warps finish before refilling shared data
}
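// --- Hedged sketch, not from the source: the sum_x()/sum_x_clear() helpers
// --- used above are defined elsewhere in this file's class. A minimal
// --- stand-in with compatible call signatures is plain atomic accumulation
// --- (the real code likely performs a shared-memory warp reduction instead):
__device__ inline void sum_x( float & dest , float value )
{
  atomicAdd( &dest , value ); // accumulate one thread's contribution
}

__device__ inline void sum_x( double & dest , double value )
{
  atomicAdd( &dest , value ); // double atomicAdd requires compute capability 6.0+
}

__device__ inline void sum_x_clear()
{
  // With atomicAdd there is no per-warp scratch to zero; the real helper
  // presumably clears summation scratch in shared memory.
}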
void fill_dp_matrix(const std::vector<double> & x,
                    std::vector< std::vector< double > > & S,
                    std::vector< std::vector< size_t > > & J)
  /*
   x: One dimension vector to be clustered, must be sorted.
   S: K x N matrix. S[k][i] is the minimum sum of squared distances from
      each point to its cluster mean when the points x[0..i] are partitioned
      into k+1 clusters and x[i] is the last point in cluster k.
   J: K x N backtrack matrix

   NOTE: All vector indices in this program start at position 0
   */
{
  const int K = S.size();
  const int N = S[0].size();

  std::vector<double> sum_x(N), sum_x_sq(N);

  double shift = x[N/2]; // median. used to shift the values of x to
                         // improve numerical stability

  for(int i = 0; i < N; ++i) {
    if(i == 0) {
      sum_x[0]    = x[0] - shift;
      sum_x_sq[0] = (x[0] - shift) * (x[0] - shift);
    } else {
      sum_x[i]    = sum_x[i-1] + x[i] - shift;
      sum_x_sq[i] = sum_x_sq[i-1] + (x[i] - shift) * (x[i] - shift);
    }

    // Initialize for k = 0
    S[0][i] = ssq(0, i, sum_x, sum_x_sq);
    J[0][i] = 0;
  }

  for(int k = 1; k < K; ++k) {
    int imin;
    if(k < K - 1) {
      imin = std::max((size_t)1, (size_t)k);
    } else {
      // No need to compute S[K-1][0] ... S[K-1][N-2]
      imin = N-1;
    }

#ifdef DEBUG
    std::cout << std::endl << "k=" << k << ":";
#endif
    fill_row_k(imin, N-1, k, S, J, sum_x, sum_x_sq);
  }
}
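// --- Hedged sketch, not from the source: one O(1) implementation of the
// --- ssq() helper called above, consistent with the prefix sums it is
// --- passed (within-cluster sum of squares of x[j..i] around its mean):
static double ssq( int j, int i,
                   const std::vector<double> & sum_x,
                   const std::vector<double> & sum_x_sq )
{
  double sji; // sum of squared deviations of x[j..i]
  if( j > 0 ) {
    const double muji = ( sum_x[i] - sum_x[j-1] ) / ( i - j + 1 );
    sji = sum_x_sq[i] - sum_x_sq[j-1] - ( i - j + 1 ) * muji * muji;
  } else {
    sji = sum_x_sq[i] - sum_x[i] * sum_x[i] / ( i + 1 );
  }
  return sji < 0 ? 0 : sji; // clamp tiny negative round-off to zero
}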
__device__ inline
void evaluateFunctions( const unsigned ielem ) const
{
  extern __shared__ WorkSpace work_data[] ;

  // Each warp (threadIdx.y) computes an integration point
  // Each thread is responsible for a node / function.

  const unsigned iFunc   = threadIdx.x ;
  const bool     hasFunc = iFunc < FunctionCount ;

  //------------------------------------
  // Each warp gathers a different variable into 'elem_mat' shared memory.

  if ( hasFunc ) {
    const unsigned node = elem_node_ids( ielem , iFunc );

    for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
      switch( iy ) {
      case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
      case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
      case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
      case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
      default: break ;
      }
    }
  }

  __syncthreads(); // Wait for all warps to finish gathering

  // now get local 'const' copies in register space:
  const double x       = work_data->sum[0][ iFunc ];
  const double y       = work_data->sum[1][ iFunc ];
  const double z       = work_data->sum[2][ iFunc ];
  const double dof_val = work_data->sum[3][ iFunc ];

  __syncthreads(); // Wait for all warps to finish extracting

  sum_x_clear(); // Make sure summation scratch is zero

  //------------------------------------
  // Each warp is now on its own computing an integration point
  // so no further explicit synchronizations are required.

  if ( hasFunc ) {

    float * const J    = work_data->spaceJac[ threadIdx.y ];
    float * const invJ = work_data->spaceInvJac[ threadIdx.y ];

    for ( unsigned iInt = threadIdx.y ;
          iInt < IntegrationCount ; iInt += blockDim.y ) {

      const float val = elem_data.values[iInt][iFunc] ;
      const float gx  = elem_data.gradients[iInt][0][iFunc] ;
      const float gy  = elem_data.gradients[iInt][1][iFunc] ;
      const float gz  = elem_data.gradients[iInt][2][iFunc] ;

      sum_x( J[j11], gx * x );
      sum_x( J[j12], gx * y );
      sum_x( J[j13], gx * z );

      sum_x( J[j21], gy * x );
      sum_x( J[j22], gy * y );
      sum_x( J[j23], gy * z );

      sum_x( J[j31], gz * x );
      sum_x( J[j32], gz * y );
      sum_x( J[j33], gz * z );

      // Inverse jacobian, only enough parallel work for 9 threads in the warp

      if ( iFunc < TensorDim ) {

        invJ[ iFunc ] =
          J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
          J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;

        // Let all threads in the warp compute determinant into a register

        const float detJ = J[j11] * invJ[j11] +
                           J[j21] * invJ[j12] +
                           J[j31] * invJ[j13] ;

        invJ[ iFunc ] /= detJ ;

        if ( 0 == iFunc ) {
          work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
        }
      }

      // Transform bases gradients and compute value and gradient

      const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
      const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
      const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];

      work_data->dpsidx[iFunc][iInt] = dx ;
      work_data->dpsidy[iFunc][iInt] = dy ;
      work_data->dpsidz[iFunc][iInt] = dz ;

      sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
      sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
      sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
      sum_x( work_data->value_at_integ[iInt] , dof_val * val );
    }
  }

  __syncthreads(); // All shared data must be populated at return.
}
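// --- Hedged sketch, not from the source: the index constants j11..j33 and
// --- the cofactor table invJacIndex used above are defined elsewhere. A
// --- layout consistent with the kernel's determinant expansion, where
// --- invJ[i] = J[a]*J[b] - J[c]*J[d] yields the adjugate (inverse * det):
enum { j11 = 0 , j12 = 1 , j13 = 2 ,
       j21 = 3 , j22 = 4 , j23 = 5 ,
       j31 = 6 , j32 = 7 , j33 = 8 };

__device__ const unsigned invJacIndex[9][4] =
  { { j22 , j33 , j23 , j32 } ,   // inv11 = J22*J33 - J23*J32
    { j13 , j32 , j12 , j33 } ,   // inv12 = J13*J32 - J12*J33
    { j12 , j23 , j13 , j22 } ,   // inv13 = J12*J23 - J13*J22
    { j23 , j31 , j21 , j33 } ,   // inv21 = J23*J31 - J21*J33
    { j11 , j33 , j13 , j31 } ,   // inv22 = J11*J33 - J13*J31
    { j13 , j21 , j11 , j23 } ,   // inv23 = J13*J21 - J11*J23
    { j21 , j32 , j22 , j31 } ,   // inv31 = J21*J32 - J22*J31
    { j12 , j31 , j11 , j32 } ,   // inv32 = J12*J31 - J11*J32
    { j11 , j22 , j12 , j21 } };  // inv33 = J11*J22 - J12*J21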
/*****************************************************************************************
 * vector< int > K_MeansPredict::Train( const vector< vector< float > >& Data,
 *   const float stopDist, const int stopIter, const int fast )
 *
 * Purpose: Train predictor
 * input:
 *   Data:     vector of data
 *   stopDist: Distance stopping criterion
 *   stopIter: Max Iteration stopping criterion
 *
 * return:
 *   vector of cluster membership
 *
 * 01.07.2006  djh  added stopping criterion parameters
 *                  stopDist minimum euclidean distance
 *                  stopIter maximum iterations
 *                  extra error output
 * 03.06.2006  djh  replaced _totalUpper/_totalLowerConfBound with _totalBoundStub
 ******************************************************************************************/
vector< int > K_MeansPredict::Train( const vector< vector< float > >& Data, const float stopDist, const int stopIter, const int fast ){
  // create vector of example coordinates
  vector< Coord< float > > coordData( Data.size() );
  // create vector of example key values
  vector< float > dataKey( Data.size() );
  //
  for( int i=0; i<Data.size(); i++) {
    vector< float > tempCoords( Data[i].size()-1 );
    dataKey[i]=Data[i][0];
    for( int j=1; j<Data[i].size(); j++ ) {
      tempCoords[j-1] = Data[i][j];
    }
    coordData[i] = Coord< float >( tempCoords );
  }
  // calculate clusters
  float dist;
  int numIter;
  vector<int> clusterMap = CreateClusters( coordData, stopDist, stopIter, dist, numIter );
  if( fast == 1 ){
    return( clusterMap );
  }
  cout << "# Training:\n";
  cout << "# Training required " << numIter << " rounds, the max Euclid. Dist. is: " << dist << endl;
  // calculate cluster stats
  vector< float > sum_x( _k, 0. );
  vector< float > sum_x2( _k, 0. );
  _key_supports = vector< int >( _k, 0);
  // find n and sums
  for( int i=0; i<Data.size(); i++ ){
    _key_supports[ clusterMap[i] ]++;
    sum_x[ clusterMap[i] ] += dataKey[i];            // bug fix: index key by example i, not by cluster id
    sum_x2[ clusterMap[i] ] += pow( dataKey[i], 2 );
  }
  // compute mean and variance
  _key_means = vector< float >(_k,0.);
  _key_variances = vector< float >(_k,0.);
  for( int i=0; i<_k; i++ ){
    _key_means[i]=sum_x[i]/_key_supports[i];
    // bug fix: unbiased variance is (sum x^2 - (sum x)^2 / n) / (n - 1)
    _key_variances[i] = ( sum_x2[i] - (sum_x[i]*sum_x[i]/float(_key_supports[i])) )/float( _key_supports[i]-1 );
  }
  //
  // Calc error means and variances
  sum_x = vector<float>( _k, 0.);
  sum_x2 = vector<float>( _k, 0.);
  float tot_sum_x = 0.0;
  float tot_sum_x2 = 0.0;
  for( int i=0; i<coordData.size(); i++ ){
    int clusterIdx = FindClusterIdx( coordData[i] );
    float err = _key_means[ clusterIdx ] - dataKey[i];
    sum_x[ clusterIdx ] += err;
    sum_x2[ clusterIdx ] += pow( err, 2 );
    tot_sum_x += err;
    tot_sum_x2 += pow( err, 2 );
  }
  _errMean = vector< float >( _k );
  _lowerConfBound = vector< float >( _k );
  _upperConfBound = vector< float >( _k );
  for( int i=0; i< _k; i++ ){
    _errMean[i] = sum_x[i]/( float( _key_supports[i] ) );
    // bug fix: same variance identity as above
    float errVar = ( sum_x2[i] - (sum_x[i]*sum_x[i]/float(_key_supports[i])) )/float( _key_supports[i]-1 );
    float t_val = TDist( _key_supports[i] );
    _lowerConfBound[i] = _errMean[i] - t_val*sqrt( errVar * (1.0+( 1.0/float(_key_supports[i]) )) );
    _upperConfBound[i] = _errMean[i] + t_val*sqrt( errVar * (1.0+( 1.0/float(_key_supports[i]) )) );
  }
  //
  _totalErrMean = tot_sum_x / coordData.size();
  float totalErrVar = ( tot_sum_x2 - (tot_sum_x*tot_sum_x/float(coordData.size())) )/float( coordData.size()-1 );
  _totalBoundStub = sqrt( totalErrVar * (1.0+( 1.0/float(coordData.size()) )) );
  //_totalLowerConfBound = _totalErrMean - TDist( coordData.size() )*sqrt( totalErrVar * (1.0+( 1.0/float(coordData.size()) )) );
  //_totalUpperConfBound = _totalErrMean + TDist( coordData.size() )*sqrt( totalErrVar * (1.0+( 1.0/float(coordData.size()) )) );
  // return labels
  cout << "# Error:\n";
  cout << "# Mean Squared Error (MSE) is: " << tot_sum_x2/float(coordData.size() ) << endl;
  cout << "# Error Mean is : " << _totalErrMean << endl;
  cout << "# Error Variance is : " << totalErrVar << endl;
  //
  return( clusterMap );
}
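// --- Hedged sketch, not from the source: the variance identity the
// --- corrected expressions above rely on, factored out for clarity.
// --- Unbiased sample variance from running sums:
static float variance_from_sums( float sum, float sum_sq, int n )
{
  // var = ( sum(x^2) - (sum(x))^2 / n ) / ( n - 1 )
  return ( sum_sq - sum * sum / float(n) ) / float( n - 1 );
}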
extern double test_points()
{
  point a, b;
  a.x = 40;
  b.x = 1;
  return sum_x(&a, &b);
}
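// --- Hedged sketch, not from the source: the point type and the sum_x()
// --- overload exercised by test_points() are not shown. A minimal
// --- definition consistent with the call site:
typedef struct { double x; } point;

double sum_x( const point * a , const point * b )
{
  return a->x + b->x; // under this assumed definition, test_points() returns 41
}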
void CalcCorr( const int nvar, const int tgtIdx, const vector< int >& delay, const vector< int >& nlags,
               vector< vector< float > >& Examples, vector< vector< float > >& CorrelationVect )
{
  for(int k=0; k<nvar; k++) {
    if(k != tgtIdx && nlags[k] != (nlags[tgtIdx]+1) ) {
      cout << "Assert: Error!\n";
      exit( -1 );
    }
  }
  int nl = nlags[tgtIdx]+1;
  vector< vector< float > > tempData( Examples.size() );
  vector< float > target( Examples.size() );
  for(int i=0; i<tempData.size(); i++) {
    target[i] = Examples[i][0];
    tempData[i]=vector< float >( Examples[i].size() );
    int aCtr = 0;
    for(int j=1; j<Examples[i].size(); j++) {
      if( aCtr == tgtIdx*nl) {
        tempData[i][aCtr] = target[i];
        aCtr++;
      }
      tempData[i][aCtr] = Examples[i][j];
      aCtr++;
    }
  }
  //for(int i=0; i<tempData.size(); i++)
  //{
  //  for(int j=0; j<Examples[i].size(); j++)
  //  {
  //    cout << setw(7) << Examples[i][j];
  //  }
  //  cout << endl;
  //
  //  for(int j=0; j<tempData[i].size(); j++)
  //  {
  //    cout << setw(7) << tempData[i][j];
  //  }
  //  cout << endl << endl;
  //}
  CorrelationVect = vector< vector< float > >(nvar);
  for( int k=0; k<nvar; k++) {
    CorrelationVect[k] = vector< float >( nl );
    cout << "# Calculate correlation for all lags" << endl;
    int n = Examples.size();
    vector<float> sum_x(nl);
    vector<float> sum_y(nl);
    vector<float> sum_xsq(nl);
    vector<float> sum_ysq(nl);
    vector<float> sum_xy(nl);
    for(int j=0; j<nl; j++) {
      sum_x[j] = 0.0;
      sum_y[j] = 0.0;
      sum_xsq[j] = 0.0;
      sum_ysq[j] = 0.0;
      sum_xy[j] = 0.0;
      for(int i=0; i<Examples.size(); i++) {
        sum_x[j] += target[i];
        sum_y[j] += tempData[i][k*nl+j];
        sum_xsq[j] += target[i]*target[i];
        sum_ysq[j] += tempData[i][k*nl+j]*tempData[i][k*nl+j];
        sum_xy[j] += target[i]*tempData[i][k*nl+j];
        //sum_x[j] += Examples[i][0];
        //sum_y[j] += Examples[i][k*nl+j];
        //sum_xsq[j] += Examples[i][0]*Examples[i][0];
        //sum_ysq[j] += Examples[i][k*nl+j]*Examples[i][k*nl+j];
        //sum_xy[j] += Examples[i][0]*Examples[i][k*nl+j];
      }
      float numerator = n*sum_xy[j]-sum_x[j]*sum_y[j];
      float denominator = sqrt(n*sum_xsq[j] - sum_x[j]*sum_x[j]) * sqrt(n*sum_ysq[j] - sum_y[j]*sum_y[j]);
      CorrelationVect[k][j] = numerator/denominator;
    }
  }
  vector< vector< float > > temp( nl );
  for(int i=0; i< nl; i++) {
    temp[i] = vector< float >(nvar);
    for(int j=0; j<nvar; j++) {
      temp[i][j] = CorrelationVect[j][i];
    }
  }
  //cout << "# CorrelationVect is : " << CorrelationVect.size() << " by " << CorrelationVect[0].size() << endl;
  //cout << "# temp is : " << temp.size() << " by " << temp[0].size() << endl;
  CorrelationVect = temp;
  //cout << "# CorrelationVect is : " << CorrelationVect.size() << " by " << CorrelationVect[0].size() << endl;
}
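// --- Hedged sketch, not from the source: the per-lag coefficient computed
// --- above is the textbook Pearson correlation from running sums,
// ---   r = (n*Sxy - Sx*Sy) / ( sqrt(n*Sxx - Sx^2) * sqrt(n*Syy - Sy^2) ),
// --- shown here as a self-contained helper:
#include <cmath>
#include <vector>

static float pearson( const std::vector<float>& x, const std::vector<float>& y )
{
  const int n = x.size();
  float sx = 0.f, sy = 0.f, sxx = 0.f, syy = 0.f, sxy = 0.f;
  for( int i = 0; i < n; i++ ){
    sx  += x[i];         sy  += y[i];
    sxx += x[i]*x[i];    syy += y[i]*y[i];
    sxy += x[i]*y[i];
  }
  return ( n*sxy - sx*sy )
       / ( std::sqrt( n*sxx - sx*sx ) * std::sqrt( n*syy - sy*sy ) );
}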