//-------------------------------------------------------------- size_t DataSet::loadFragment(const std::string & filePath, const std::string & particleType) { static const int stride = 1; ofxHDF5File h5File; h5File.open(filePath, true); ofxHDF5GroupPtr h5Group = h5File.loadGroup(particleType); // Load the coordinate data and convert angles to radians. auto coordDataSet = h5Group->loadDataSet("Coordinates"); int coordCount = coordDataSet->getDimensionSize(0) / stride; coordDataSet->setHyperslab(0, coordCount, stride); vector<glm::vec3> coordData(coordCount); coordDataSet->read(coordData.data()); // Load the mass data. auto massDataSet = h5Group->loadDataSet("Masses"); int massCount = massDataSet->getDimensionSize(0) / stride; massDataSet->setHyperslab(0, massCount, stride); vector<float> massData(massCount); massDataSet->read(massData.data()); // Load the star formation rate data. auto sfrDataSet = h5Group->loadDataSet("StarFormationRate"); int sfrCount = sfrDataSet->getDimensionSize(0) / stride; sfrDataSet->setHyperslab(0, sfrCount, stride); vector<float> sfrData(sfrCount); sfrDataSet->read(sfrData.data()); // Add valid points to the data set. size_t total = 0; for (int i = 0; i < coordData.size(); ++i) { if (coordData[i].z > 0.0f) { this->coordinates.push_back(glm::vec3(ofDegToRad(coordData[i].x), ofDegToRad(coordData[i].y), coordData[i].z)); this->minRadius = std::min(this->minRadius, coordData[i].z); this->maxRadius = std::max(this->maxRadius, coordData[i].z); this->masses.push_back(massData[i]); if (particleType == "PartType6") { this->starFormationRates.push_back(sfrData[i]); } else { // These are stars so just put in dummy data. this->starFormationRates.push_back(-1.0f); } ++total; } } return total; }
// Sample variance computed from running sums:
//   ( sum(x^2) - sum(x)^2 / n ) / ( n - 1 )
// The result is not a finite number when n < 2, because a divisor is then zero.
static float K_MeansSampleVariance( const float sum, const float sumSq, const float n )
{
  return ( sumSq - ( sum * sum ) / n ) / ( n - 1.0f );
}

/*****************************************************************************************
 * vector< int > K_MeansPredict::Train( const vector< vector< float > >& Data, const float stopDist, const int stopIter, const int fast )
 *
 * Purpose: Train predictor
 *
 * input:
 *   Data:     one row per example; column 0 is the key value, the remaining
 *             columns are the coordinates used for clustering
 *   stopDist: Distance stopping criteria (passed on to CreateClusters)
 *   stopIter: Max Iteration stopping criteria (passed on to CreateClusters)
 *   fast:     when equal to 1, return right after clustering; no statistics
 *             are computed or stored and nothing is printed
 *
 * return:
 *   vector of cluster membership (one cluster index per row of Data)
 *
 * side effects (only when fast != 1):
 *   fills _key_supports, _key_means, _key_variances, _errMean, _lowerConfBound,
 *   _upperConfBound, _totalErrMean and _totalBoundStub, and prints a training
 *   and error summary to cout
 *
 * notes:
 *   a cluster with fewer than two members yields non-finite mean/variance
 *   values, because the divisors n and n-1 are then zero
 *
 * 01.07.2006 djh added stopping criterion parameters
 *                  stopDist minimum euclidean distance
 *                  stopIter maximum iterations
 *                extra error output
 * 03.06.2006 djh replaced _totalUpper/_totalLowerConfBound with _totalBoundStub
 *
 ******************************************************************************************/
vector< int > K_MeansPredict::Train( const vector< vector< float > >& Data, const float stopDist, const int stopIter, const int fast ){

  const size_t numExamples = Data.size();

  // Split every row into its key value (column 0) and its coordinates (columns 1..end).
  vector< Coord< float > > coordData( numExamples );
  vector< float > dataKey( numExamples );
  for( size_t i=0; i<numExamples; i++ ){
    vector< float > tempCoords( Data[i].size()-1 );
    dataKey[i] = Data[i][0];
    for( size_t j=1; j<Data[i].size(); j++ ){
      tempCoords[j-1] = Data[i][j];
    }
    coordData[i] = Coord< float >( tempCoords );
  }

  // calculate clusters
  float dist = 0.0f;
  int numIter = 0;
  vector< int > clusterMap = CreateClusters( coordData, stopDist, stopIter, dist, numIter );
  if( fast == 1 ){
    return( clusterMap );
  }
  cout << "# Training:\n";
  cout << "# Training required " << numIter << " rounds, the max Euclid. Dist. is: " << dist << endl;

  // calculate cluster stats: per-cluster member count, sum and sum of squares of the keys
  vector< float > sum_x( _k, 0.0f );
  vector< float > sum_x2( _k, 0.0f );
  _key_supports = vector< int >( _k, 0 );
  for( size_t i=0; i<numExamples; i++ ){
    const int cluster = clusterMap[i];
    // The key belongs to example i; the cluster index only selects the accumulator.
    const float key = dataKey[i];
    _key_supports[ cluster ]++;
    sum_x[ cluster ]  += key;
    sum_x2[ cluster ] += key * key;
  }

  // compute mean and variance of the keys in every cluster
  _key_means     = vector< float >( _k, 0.0f );
  _key_variances = vector< float >( _k, 0.0f );
  for( int c=0; c<_k; c++ ){
    const float n = float( _key_supports[c] );
    _key_means[c]     = sum_x[c] / n;
    _key_variances[c] = K_MeansSampleVariance( sum_x[c], sum_x2[c], n );
  }

  // Calc error means and variances. The error of an example is the mean key of
  // the cluster FindClusterIdx assigns it to, minus the example's own key.
  vector< float > errSum( _k, 0.0f );
  vector< float > errSumSq( _k, 0.0f );
  float tot_sum_x  = 0.0f;
  float tot_sum_x2 = 0.0f;
  for( size_t i=0; i<coordData.size(); i++ ){
    const int clusterIdx = FindClusterIdx( coordData[i] );
    const float err = _key_means[ clusterIdx ] - dataKey[i];
    const float errSq = err * err;
    errSum[ clusterIdx ]   += err;
    errSumSq[ clusterIdx ] += errSq;
    tot_sum_x  += err;
    tot_sum_x2 += errSq;
  }

  // Per-cluster error mean and bounds: mean -/+ t * sqrt( var * (1 + 1/n) ).
  _errMean        = vector< float >( _k );
  _lowerConfBound = vector< float >( _k );
  _upperConfBound = vector< float >( _k );
  for( int c=0; c<_k; c++ ){
    const float n = float( _key_supports[c] );
    _errMean[c] = errSum[c] / n;
    const float errVar = K_MeansSampleVariance( errSum[c], errSumSq[c], n );
    const float t_val = TDist( _key_supports[c] );
    const float halfWidth = t_val * sqrt( errVar * ( 1.0 + ( 1.0 / n ) ) );
    _lowerConfBound[c] = _errMean[c] - halfWidth;
    _upperConfBound[c] = _errMean[c] + halfWidth;
  }

  // Overall error statistics across all examples.
  const float numTotal = float( coordData.size() );
  _totalErrMean = tot_sum_x / numTotal;
  const float totalErrVar = K_MeansSampleVariance( tot_sum_x, tot_sum_x2, numTotal );
  // Only the sqrt term is stored; the t-value factor is not applied here
  // (see the 03.06.2006 history entry).
  _totalBoundStub = sqrt( totalErrVar * ( 1.0 + ( 1.0 / numTotal ) ) );
  //_totalLowerConfBound = _totalErrMean - TDist( coordData.size() )*_totalBoundStub;
  //_totalUpperConfBound = _totalErrMean + TDist( coordData.size() )*_totalBoundStub;

  // report errors
  cout << "# Error:\n";
  cout << "# Mean Squared Error (MSE) is: " << tot_sum_x2/numTotal << endl;
  cout << "# Error Mean is : " << _totalErrMean << endl;
  cout << "# Error Variance is : " << totalErrVar << endl;

  // return labels
  return( clusterMap );
}