bool testpca(bool silent) { bool result; int passcount; int maxn; int maxm; double threshold; int m; int n; int i; int j; int k; int info; ap::real_1d_array means; ap::real_1d_array s; ap::real_1d_array t2; ap::real_1d_array t3; ap::real_2d_array v; ap::real_2d_array x; double t; double h; double tmean; double tmeans; double tstddev; double tstddevs; double tmean2; double tmeans2; double tstddev2; double tstddevs2; bool pcaconverrors; bool pcaorterrors; bool pcavarerrors; bool pcaopterrors; bool waserrors; // // Primary settings // maxm = 10; maxn = 100; passcount = 1; threshold = 1000*ap::machineepsilon; waserrors = false; pcaconverrors = false; pcaorterrors = false; pcavarerrors = false; pcaopterrors = false; // // Test 1: N random points in M-dimensional space // for(m = 1; m <= maxm; m++) { for(n = 1; n <= maxn; n++) { // // Generate task // x.setbounds(0, n-1, 0, m-1); means.setbounds(0, m-1); for(j = 0; j <= m-1; j++) { means(j) = 1.5*ap::randomreal()-0.75; } for(i = 0; i <= n-1; i++) { for(j = 0; j <= m-1; j++) { x(i,j) = means(j)+(2*ap::randomreal()-1); } } // // Solve // pcabuildbasis(x, n, m, info, s, v); if( info!=1 ) { pcaconverrors = true; continue; } // // Orthogonality test // for(i = 0; i <= m-1; i++) { for(j = 0; j <= m-1; j++) { t = ap::vdotproduct(&v(0, i), v.getstride(), &v(0, j), v.getstride(), ap::vlen(0,m-1)); if( i==j ) { t = t-1; } pcaorterrors = pcaorterrors||ap::fp_greater(fabs(t),threshold); } } // // Variance test // t2.setbounds(0, n-1); for(k = 0; k <= m-1; k++) { for(i = 0; i <= n-1; i++) { t = ap::vdotproduct(&x(i, 0), 1, &v(0, k), v.getstride(), ap::vlen(0,m-1)); t2(i) = t; } calculatemv(t2, n, tmean, tmeans, tstddev, tstddevs); if( n!=1 ) { t = ap::sqr(tstddev)*n/(n-1); } else { t = 0; } pcavarerrors = pcavarerrors||ap::fp_greater(fabs(t-s(k)),threshold); } for(k = 0; k <= m-2; k++) { pcavarerrors = pcavarerrors||ap::fp_less(s(k),s(k+1)); } // // Optimality: different perturbations in V[..,0] can't // increase variance of projection - can only decrease. // t2.setbounds(0, n-1); t3.setbounds(0, n-1); for(i = 0; i <= n-1; i++) { t = ap::vdotproduct(&x(i, 0), 1, &v(0, 0), v.getstride(), ap::vlen(0,m-1)); t2(i) = t; } calculatemv(t2, n, tmean, tmeans, tstddev, tstddevs); for(k = 0; k <= 2*m-1; k++) { h = 0.001; if( k%2!=0 ) { h = -h; } ap::vmove(&t3(0), 1, &t2(0), 1, ap::vlen(0,n-1)); ap::vadd(&t3(0), 1, &x(0, k/2), x.getstride(), ap::vlen(0,n-1), h); t = 0; for(j = 0; j <= m-1; j++) { if( j!=k/2 ) { t = t+ap::sqr(v(j,0)); } else { t = t+ap::sqr(v(j,0)+h); } } t = 1/sqrt(t); ap::vmul(&t3(0), 1, ap::vlen(0,n-1), t); calculatemv(t3, n, tmean2, tmeans2, tstddev2, tstddevs2); pcaopterrors = pcaopterrors||ap::fp_greater(tstddev2,tstddev+threshold); } } } // // Special test for N=0 // for(m = 1; m <= maxm; m++) { // // Solve // pcabuildbasis(x, 0, m, info, s, v); if( info!=1 ) { pcaconverrors = true; continue; } // // Orthogonality test // for(i = 0; i <= m-1; i++) { for(j = 0; j <= m-1; j++) { t = ap::vdotproduct(&v(0, i), v.getstride(), &v(0, j), v.getstride(), ap::vlen(0,m-1)); if( i==j ) { t = t-1; } pcaorterrors = pcaorterrors||ap::fp_greater(fabs(t),threshold); } } } // // Final report // waserrors = pcaconverrors||pcaorterrors||pcavarerrors||pcaopterrors; if( !silent ) { printf("PCA TEST\n"); printf("TOTAL RESULTS: "); if( !waserrors ) { printf("OK\n"); } else { printf("FAILED\n"); } printf("* CONVERGENCE "); if( !pcaconverrors ) { printf("OK\n"); } else { printf("FAILED\n"); } printf("* ORTOGONALITY "); if( !pcaorterrors ) { printf("OK\n"); } else { printf("FAILED\n"); } printf("* VARIANCE REPORT "); if( !pcavarerrors ) { printf("OK\n"); } else { printf("FAILED\n"); } printf("* OPTIMALITY "); if( !pcaopterrors ) { printf("OK\n"); } else { printf("FAILED\n"); } if( waserrors ) { printf("TEST SUMMARY: FAILED\n"); } else { printf("TEST SUMMARY: PASSED\n"); } printf("\n\n"); } result = !waserrors; return result; }
int main(int argc, const char * argv[]) { FILE * pFile; FILE * outputFile; bool debug = false; bool debugEig = true; alglib::real_2d_array ptInput; //Defaults (for my mac) const char * rootDirectory = "/Users/AdamDossa/Documents/Columbia/GRA/Babel/"; const char * currentDirectory = "/Users/AdamDossa/Documents/XCode/Babel_SGD/Babel_SGD/log/"; //Get root directory from args or use default if (argc > 1) { rootDirectory = argv[1]; } if (argc > 2) { currentDirectory = argv[2]; } //We want to find an unused log file name (to some limit) char logFileName[200]; for (int i = 0; i < 100; i++) { snprintf(logFileName, sizeof(char) * 200,"%s/log/log_pca-%d.txt.%d", currentDirectory, NUM_FILES, i); outputFile = fopen(logFileName,"r"); if (outputFile) { continue; } else { outputFile = fopen(logFileName,"w"); break; } } fprintf(outputFile,"Log file: %s\n", logFileName); //First calculate the number of training examples (since not every file is guaranteed to have 250 rows) fprintf(outputFile, "Calculating number of training examples\n"); int noOfTrainingExamples = 0; for (int i = 0; i < NUM_FILES; i++) { char fName[200]; snprintf(fName, sizeof(char) * 200,"%s/labels/train.%d.lab", rootDirectory, i); pFile = fopen(fName,"rb"); int n; fread(&n,4,1,pFile); n = ntohl(n); noOfTrainingExamples += n; //printf("File: %d No: %f\n",i,((float) noOfTrainingExamples) / 250.0f); fclose(pFile); } fprintf(outputFile, "No of training examples: %d\n", noOfTrainingExamples); fflush(outputFile); //Set input size ptInput.setlength(noOfTrainingExamples, FEATURE_COLS); //Now read in the features - hold these in an array - assume same number as training examples fprintf(outputFile,"Reading in features\n"); int readSoFar = 0; //double * features = (double *) malloc(sizeof(double) * noOfTrainingExamples * FEATURE_COLS); //double featureMeans[FEATURE_COLS]; // for (int i = 0; i< FEATURE_COLS; i++) // { // featureMeans[i] = 0.0; // } for (int i = 0; i < NUM_FILES; i++) { char fName[200]; snprintf(fName, sizeof(char) * 200,"%s/features/train.%d.fea", rootDirectory, i); pFile = fopen(fName,"rb"); int n; fread(&n,4,1,pFile); n = ntohl(n); for (int j = 0; j < n; j++) { int m; fread(&m,4,1,pFile); m = ntohl(m); float * tempFeatures = (float *) malloc(sizeof(float) * m); fread(&tempFeatures[0],sizeof(float),m,pFile); for (int k = 0; k < m; k++) { ptInput[readSoFar + j][k] = static_cast<double>(bin2flt(&tempFeatures[k])); // featureMeans[k] += features[readSoFar + k]; } free(tempFeatures); } readSoFar += n; fclose(pFile); } // for (int i = 0; i < FEATURE_COLS; i++) // { // featureMeans[i] = featureMeans[i] / noOfTrainingExamples; // } // if (debug) { // for (int i = 0; i < (noOfTrainingExamples * FEATURE_COLS); i++) // { // fprintf(outputFile,"Train: Feature coordinate: %d has value: %f\n",i,features[i]); // } // for (int i = 0; i < FEATURE_COLS; i++) // { // fprintf(outputFile, "Train: Feature coordinate: %d has mean: %f\n",i,featureMeans[i]); // } // } fflush(outputFile); //Now center data using featureMeans - not necessary for PCABuildBasis // for (int i = 0; i < noOfTrainingExamples; i++) // { // for (int j = 0; j < FEATURE_COLS; j++) // { // features[(i * FEATURE_COLS) + j] = features[(i * FEATURE_COLS) + j] - featureMeans[j]; // } // } // if (debug) { // for (int i = 0; i < (noOfTrainingExamples * FEATURE_COLS); i++) // { // fprintf(outputFile,"Train: Feature coordinate: %d has centered value: %f\n",i,features[i]); // } // } // fflush(outputFile); //Now run PCA fprintf(outputFile,"Running PCA\n"); fflush(outputFile); //Run PCA on input // alglib::real_2d_array ptInput; // ptInput.setcontent(noOfTrainingExamples, FEATURE_COLS , features); // free(features); fprintf(outputFile, "ptInput created\n"); fflush(outputFile); alglib::ae_int_t info; alglib::real_1d_array eigValues; alglib::real_2d_array eigVectors; if (debug) { for (int i = 0; i < noOfTrainingExamples; i++) { for (int j = 0; j < FEATURE_COLS; j++) { fprintf(outputFile, "ptInput: %d, %d, %f\n", i,j, ptInput[i][j]); } } } try { pcabuildbasis(ptInput, noOfTrainingExamples, FEATURE_COLS, info, eigValues, eigVectors); } catch (alglib::ap_error& err) { std::cout << err.msg; } //pcabuildbasis(ptInput, noOfTrainingExamples, FEATURE_COLS, info, eigValues, eigVectors); if (debugEig) { for (int i = 0; i < FEATURE_COLS; i++) { fprintf(outputFile, "EIGVALS: %f\n", eigValues[i]); } } fflush(outputFile); //Now compute scores // alglib::real_2d_array scores; // scores.setlength(noOfTrainingExamples, FEATURE_COLS); // alglib::rmatrixgemm(noOfTrainingExamples, FEATURE_COLS ,FEATURE_COLS, 1, ptInput, 0,0,0, eigVectors, 0,0,0, 0, scores,0,0); // if (debug) // { // for (int i = 0; i < noOfTrainingExamples; i++) // { // for (int j = 0; j < FEATURE_COLS; j++) // { // fprintf(outputFile, "PCA SCORES: %d, %d, %f\n", i,j,scores[i][j]); // } // } // } // fclose(outputFile); //Write out loadings to a file char fName[200]; snprintf(fName, sizeof(char) * 200,"%s/loadings/loadings-%d.load", currentDirectory, NUM_FILES); pFile = fopen(fName,"wb"); for (int k = 0; k < FEATURE_COLS; k++) { for (int j = 0; j < FEATURE_COLS; j++) { fwrite(&eigVectors[k][j],sizeof(double),1,pFile); } } fclose(pFile); fclose(outputFile); return 0; }
bool PCAWrapper::train(vector<vector<double> > &inputData, int pDim, string outputFile) { if (inputData.size() < 1 || inputData[0].size() < 1) return false; size_t numSamples = inputData.size(); oDim = inputData[0].size(); double *dataMat = new double[numSamples * oDim]; aveVec = new double[oDim]; for (int i = 0; i < oDim; i++) aveVec[i] = 0.0; for (int i = 0; i < numSamples; i++) for (int j = 0; j < oDim; j++) { dataMat[i*oDim+j] = inputData[i][j]; aveVec[j] += inputData[i][j]; } for (int i = 0; i < oDim; i++) aveVec[i] /= (double) numSamples; alglib::real_2d_array ptInput; ptInput.setcontent(numSamples, oDim, dataMat); alglib::ae_int_t info; alglib::real_1d_array eigValues; alglib::real_2d_array eigVectors; pcabuildbasis(ptInput, numSamples, oDim, info, eigValues, eigVectors); delete [] dataMat; size_t cols = eigVectors.cols(); if (cols < pDim) pDim = cols; this->pDim = pDim; projMat = new double[pDim * oDim]; eigVec = new double[pDim]; for (int i = 0; i < pDim; i++) { eigVec[i] = 1 / sqrt(eigValues[i]); for (int j = 0; j < oDim; j++) projMat[i*oDim + j] = eigVectors[j][i]; } // save to file ofstream fout; fout.open(outputFile.c_str()); if (!fout.is_open()) { cout<<"Cannot save projection matrix to "<<outputFile<<endl; return false; } fout<<oDim<<" "<<pDim<<endl; fout<<projMat[0]; for (int i = 1; i < oDim*pDim; i++) fout<<" "<<projMat[i]; fout<<endl; fout<<aveVec[0]; for (int i = 1; i < oDim; i++) fout<<" "<<aveVec[i]; fout<<endl; fout<<eigVec[0]; for (int i = 1; i < pDim; i++) fout<<" "<<eigVec[i]; fout<<endl; fout.close(); return true; }