Ejemplo n.º 1
0
bool testpca(bool silent)
{
    bool result;
    int passcount;
    int maxn;
    int maxm;
    double threshold;
    int m;
    int n;
    int i;
    int j;
    int k;
    int info;
    ap::real_1d_array means;
    ap::real_1d_array s;
    ap::real_1d_array t2;
    ap::real_1d_array t3;
    ap::real_2d_array v;
    ap::real_2d_array x;
    double t;
    double h;
    double tmean;
    double tmeans;
    double tstddev;
    double tstddevs;
    double tmean2;
    double tmeans2;
    double tstddev2;
    double tstddevs2;
    bool pcaconverrors;
    bool pcaorterrors;
    bool pcavarerrors;
    bool pcaopterrors;
    bool waserrors;

    
    //
    // Primary settings
    //
    maxm = 10;
    maxn = 100;
    passcount = 1;
    threshold = 1000*ap::machineepsilon;
    waserrors = false;
    pcaconverrors = false;
    pcaorterrors = false;
    pcavarerrors = false;
    pcaopterrors = false;
    
    //
    // Test 1: N random points in M-dimensional space
    //
    for(m = 1; m <= maxm; m++)
    {
        for(n = 1; n <= maxn; n++)
        {
            
            //
            // Generate task
            //
            x.setbounds(0, n-1, 0, m-1);
            means.setbounds(0, m-1);
            for(j = 0; j <= m-1; j++)
            {
                means(j) = 1.5*ap::randomreal()-0.75;
            }
            for(i = 0; i <= n-1; i++)
            {
                for(j = 0; j <= m-1; j++)
                {
                    x(i,j) = means(j)+(2*ap::randomreal()-1);
                }
            }
            
            //
            // Solve
            //
            pcabuildbasis(x, n, m, info, s, v);
            if( info!=1 )
            {
                pcaconverrors = true;
                continue;
            }
            
            //
            // Orthogonality test
            //
            for(i = 0; i <= m-1; i++)
            {
                for(j = 0; j <= m-1; j++)
                {
                    t = ap::vdotproduct(&v(0, i), v.getstride(), &v(0, j), v.getstride(), ap::vlen(0,m-1));
                    if( i==j )
                    {
                        t = t-1;
                    }
                    pcaorterrors = pcaorterrors||ap::fp_greater(fabs(t),threshold);
                }
            }
            
            //
            // Variance test
            //
            t2.setbounds(0, n-1);
            for(k = 0; k <= m-1; k++)
            {
                for(i = 0; i <= n-1; i++)
                {
                    t = ap::vdotproduct(&x(i, 0), 1, &v(0, k), v.getstride(), ap::vlen(0,m-1));
                    t2(i) = t;
                }
                calculatemv(t2, n, tmean, tmeans, tstddev, tstddevs);
                if( n!=1 )
                {
                    t = ap::sqr(tstddev)*n/(n-1);
                }
                else
                {
                    t = 0;
                }
                pcavarerrors = pcavarerrors||ap::fp_greater(fabs(t-s(k)),threshold);
            }
            for(k = 0; k <= m-2; k++)
            {
                pcavarerrors = pcavarerrors||ap::fp_less(s(k),s(k+1));
            }
            
            //
            // Optimality: different perturbations in V[..,0] can't
            // increase variance of projection - can only decrease.
            //
            t2.setbounds(0, n-1);
            t3.setbounds(0, n-1);
            for(i = 0; i <= n-1; i++)
            {
                t = ap::vdotproduct(&x(i, 0), 1, &v(0, 0), v.getstride(), ap::vlen(0,m-1));
                t2(i) = t;
            }
            calculatemv(t2, n, tmean, tmeans, tstddev, tstddevs);
            for(k = 0; k <= 2*m-1; k++)
            {
                h = 0.001;
                if( k%2!=0 )
                {
                    h = -h;
                }
                ap::vmove(&t3(0), 1, &t2(0), 1, ap::vlen(0,n-1));
                ap::vadd(&t3(0), 1, &x(0, k/2), x.getstride(), ap::vlen(0,n-1), h);
                t = 0;
                for(j = 0; j <= m-1; j++)
                {
                    if( j!=k/2 )
                    {
                        t = t+ap::sqr(v(j,0));
                    }
                    else
                    {
                        t = t+ap::sqr(v(j,0)+h);
                    }
                }
                t = 1/sqrt(t);
                ap::vmul(&t3(0), 1, ap::vlen(0,n-1), t);
                calculatemv(t3, n, tmean2, tmeans2, tstddev2, tstddevs2);
                pcaopterrors = pcaopterrors||ap::fp_greater(tstddev2,tstddev+threshold);
            }
        }
    }
    
    //
    // Special test for N=0
    //
    for(m = 1; m <= maxm; m++)
    {
        
        //
        // Solve
        //
        pcabuildbasis(x, 0, m, info, s, v);
        if( info!=1 )
        {
            pcaconverrors = true;
            continue;
        }
        
        //
        // Orthogonality test
        //
        for(i = 0; i <= m-1; i++)
        {
            for(j = 0; j <= m-1; j++)
            {
                t = ap::vdotproduct(&v(0, i), v.getstride(), &v(0, j), v.getstride(), ap::vlen(0,m-1));
                if( i==j )
                {
                    t = t-1;
                }
                pcaorterrors = pcaorterrors||ap::fp_greater(fabs(t),threshold);
            }
        }
    }
    
    //
    // Final report
    //
    waserrors = pcaconverrors||pcaorterrors||pcavarerrors||pcaopterrors;
    if( !silent )
    {
        printf("PCA TEST\n");
        printf("TOTAL RESULTS:                           ");
        if( !waserrors )
        {
            printf("OK\n");
        }
        else
        {
            printf("FAILED\n");
        }
        printf("* CONVERGENCE                            ");
        if( !pcaconverrors )
        {
            printf("OK\n");
        }
        else
        {
            printf("FAILED\n");
        }
        printf("* ORTOGONALITY                           ");
        if( !pcaorterrors )
        {
            printf("OK\n");
        }
        else
        {
            printf("FAILED\n");
        }
        printf("* VARIANCE REPORT                        ");
        if( !pcavarerrors )
        {
            printf("OK\n");
        }
        else
        {
            printf("FAILED\n");
        }
        printf("* OPTIMALITY                             ");
        if( !pcaopterrors )
        {
            printf("OK\n");
        }
        else
        {
            printf("FAILED\n");
        }
        if( waserrors )
        {
            printf("TEST SUMMARY: FAILED\n");
        }
        else
        {
            printf("TEST SUMMARY: PASSED\n");
        }
        printf("\n\n");
    }
    result = !waserrors;
    return result;
}
Ejemplo n.º 2
0
int main(int argc, const char * argv[])
{

    FILE * pFile;
    FILE * outputFile;
    bool debug = false;
    bool debugEig = true;
    alglib::real_2d_array ptInput;
    
    //Defaults (for my mac)
    const char * rootDirectory = "/Users/AdamDossa/Documents/Columbia/GRA/Babel/";
    const char * currentDirectory = "/Users/AdamDossa/Documents/XCode/Babel_SGD/Babel_SGD/log/";
    
    //Get root directory from args or use default
    if (argc > 1)
    {
        rootDirectory = argv[1];
    }
    if (argc > 2)
    {
        currentDirectory = argv[2];
    }
    
    //We want to find an unused log file name (to some limit)
    char logFileName[200];
    for (int i = 0; i < 100; i++)
    {
        snprintf(logFileName, sizeof(char) * 200,"%s/log/log_pca-%d.txt.%d", currentDirectory, NUM_FILES, i);
        outputFile = fopen(logFileName,"r");
        if (outputFile)
        {
            continue;
        } else {
            outputFile = fopen(logFileName,"w");
            break;
        }
    }
    fprintf(outputFile,"Log file: %s\n", logFileName);
    
    //First calculate the number of training examples (since not every file is guaranteed to have 250 rows)
    fprintf(outputFile, "Calculating number of training examples\n");
    int noOfTrainingExamples = 0;
    for (int i = 0; i < NUM_FILES; i++) {
        char fName[200];
        snprintf(fName, sizeof(char) * 200,"%s/labels/train.%d.lab", rootDirectory, i);
        pFile = fopen(fName,"rb");
        int n;
        fread(&n,4,1,pFile);
        n = ntohl(n);
        noOfTrainingExamples += n;
        //printf("File: %d No: %f\n",i,((float) noOfTrainingExamples) / 250.0f);
        fclose(pFile);
    }
    fprintf(outputFile, "No of training examples: %d\n", noOfTrainingExamples);
    fflush(outputFile);
    
    //Set input size
    ptInput.setlength(noOfTrainingExamples, FEATURE_COLS);
    
    //Now read in the features - hold these in an array - assume same number as training examples
    fprintf(outputFile,"Reading in features\n");
    int readSoFar = 0;
    //double * features = (double *) malloc(sizeof(double) * noOfTrainingExamples * FEATURE_COLS);
    //double featureMeans[FEATURE_COLS];
//    for (int i = 0; i< FEATURE_COLS; i++)
//    {
//        featureMeans[i] = 0.0;
//    }
    for (int i = 0; i < NUM_FILES; i++) {
        char fName[200];
        snprintf(fName, sizeof(char) * 200,"%s/features/train.%d.fea", rootDirectory, i);
        pFile = fopen(fName,"rb");
        int n;
        fread(&n,4,1,pFile);
        n = ntohl(n);
        for (int j = 0; j < n; j++)
        {
            int m;
            fread(&m,4,1,pFile);
            m = ntohl(m);
            float * tempFeatures = (float *) malloc(sizeof(float) * m);
            fread(&tempFeatures[0],sizeof(float),m,pFile);
            for (int k = 0; k < m; k++)
            {
                ptInput[readSoFar + j][k] = static_cast<double>(bin2flt(&tempFeatures[k]));
//                featureMeans[k] += features[readSoFar + k];
            }
            free(tempFeatures);
        }
        readSoFar += n;
        fclose(pFile);
    }
//    for (int i = 0; i < FEATURE_COLS; i++)
//    {
//        featureMeans[i] = featureMeans[i] / noOfTrainingExamples;
//    }
//    if (debug) {
//        for (int i = 0; i < (noOfTrainingExamples * FEATURE_COLS); i++)
//        {
//            fprintf(outputFile,"Train: Feature coordinate: %d has value: %f\n",i,features[i]);
//        }
//        for (int i = 0; i < FEATURE_COLS; i++)
//        {
//            fprintf(outputFile, "Train: Feature coordinate: %d has mean: %f\n",i,featureMeans[i]);
//        }
//    }
    fflush(outputFile);
    
    //Now center data using featureMeans - not necessary for PCABuildBasis
//    for (int i = 0; i < noOfTrainingExamples; i++)
//    {
//        for (int j = 0; j < FEATURE_COLS; j++)
//        {
//            features[(i * FEATURE_COLS) + j] = features[(i * FEATURE_COLS) + j] - featureMeans[j];
//        }
//    }
//    if (debug) {
//        for (int i = 0; i < (noOfTrainingExamples * FEATURE_COLS); i++)
//        {
//            fprintf(outputFile,"Train: Feature coordinate: %d has centered value: %f\n",i,features[i]);
//        }
//    }
//    fflush(outputFile);
    
    //Now run PCA
    fprintf(outputFile,"Running PCA\n");
    fflush(outputFile);
    
    //Run PCA on input
//    alglib::real_2d_array ptInput;
//    ptInput.setcontent(noOfTrainingExamples, FEATURE_COLS , features);
//    free(features);
    fprintf(outputFile, "ptInput created\n");
    fflush(outputFile);
    
    alglib::ae_int_t info;
    alglib::real_1d_array eigValues;
    alglib::real_2d_array eigVectors;
    if (debug) {
        for (int i = 0; i < noOfTrainingExamples; i++)
        {
            for (int j = 0; j < FEATURE_COLS; j++)
            {
                fprintf(outputFile, "ptInput: %d, %d, %f\n", i,j, ptInput[i][j]);
            }
        }
    }
    try {
        pcabuildbasis(ptInput, noOfTrainingExamples, FEATURE_COLS, info, eigValues, eigVectors);
    } catch (alglib::ap_error& err) {
        std::cout << err.msg;
    }
    //pcabuildbasis(ptInput, noOfTrainingExamples, FEATURE_COLS, info, eigValues, eigVectors);
    if (debugEig) {
        for (int i = 0; i < FEATURE_COLS; i++)
        {
            fprintf(outputFile, "EIGVALS: %f\n", eigValues[i]);
        }
    }
    fflush(outputFile);
    
    //Now compute scores
//    alglib::real_2d_array scores;
//    scores.setlength(noOfTrainingExamples, FEATURE_COLS);
//    alglib::rmatrixgemm(noOfTrainingExamples, FEATURE_COLS ,FEATURE_COLS, 1, ptInput, 0,0,0, eigVectors, 0,0,0, 0, scores,0,0);
//    if (debug)
//    {
//        for (int i = 0; i < noOfTrainingExamples; i++)
//        {
//            for (int j = 0; j < FEATURE_COLS; j++)
//            {
//                fprintf(outputFile, "PCA SCORES: %d, %d, %f\n", i,j,scores[i][j]);
//            }
//        }
//    }
//    fclose(outputFile);
    
    //Write out loadings to a file
    char fName[200];
    snprintf(fName, sizeof(char) * 200,"%s/loadings/loadings-%d.load", currentDirectory, NUM_FILES);
    pFile = fopen(fName,"wb");
    for (int k = 0; k < FEATURE_COLS; k++)
    {
        for (int j = 0; j < FEATURE_COLS; j++)
        {
            fwrite(&eigVectors[k][j],sizeof(double),1,pFile);
        }
    }
    fclose(pFile);
    
    fclose(outputFile);
    return 0;
}
Ejemplo n.º 3
0
/**
 * Fits a PCA basis to inputData and saves the resulting projection to a file.
 *
 * On success the members oDim, pDim, aveVec (per-feature mean), projMat
 * (pDim x oDim, one principal direction per row) and eigVec
 * (1/sqrt(eigenvalue) per kept component) are set, and the same values are
 * written as text to outputFile.
 *
 * @param inputData  one row per sample; row 0 defines the feature count and
 *                   every other row must have at least that many values
 * @param pDim       number of components to keep; values larger than the
 *                   number of available components, and negative values,
 *                   keep all components; 0 is rejected
 * @param outputFile path of the text file to write
 * @return false if the input is empty or has a too-short row, pDim is 0,
 *         the PCA solver does not report success, or the file cannot be
 *         written; true otherwise
 *
 * NOTE(review): aveVec, projMat and eigVec are overwritten without releasing
 * earlier buffers; whether they are initialised and who frees them is not
 * visible here - confirm against the class definition.
 * NOTE(review): a zero eigenvalue makes the corresponding eigVec entry
 * infinite; this is unchanged.
 */
bool PCAWrapper::train(vector<vector<double> > &inputData, int pDim, string outputFile) {
    if (inputData.size() < 1 || inputData[0].size() < 1)
        return false;
    // Zero components would leave projMat/eigVec empty while their first
    // element is still written out below.
    if (pDim == 0)
        return false;

    const int sampleCount = static_cast<int>(inputData.size());
    const int featureCount = static_cast<int>(inputData[0].size());

    // Every row is indexed up to featureCount; reject rows that are too short.
    for (int i = 1; i < sampleCount; i++)
        if (inputData[i].size() < inputData[0].size())
            return false;

    oDim = featureCount;
    aveVec = new double[featureCount];
    for (int j = 0; j < featureCount; j++)
        aveVec[j] = 0.0;

    // Fill the ALGLIB matrix directly, so no temporary buffer can leak if
    // the library throws.
    alglib::real_2d_array ptInput;
    ptInput.setlength(sampleCount, featureCount);
    for (int i = 0; i < sampleCount; i++)
        for (int j = 0; j < featureCount; j++)  {
            ptInput[i][j] = inputData[i][j];
            aveVec[j] += inputData[i][j];
        }
    for (int j = 0; j < featureCount; j++)
        aveVec[j] /= (double) sampleCount;

    alglib::ae_int_t info = 0;
    alglib::real_1d_array eigValues;
    alglib::real_2d_array eigVectors;
    pcabuildbasis(ptInput, sampleCount, featureCount, info, eigValues, eigVectors);
    // 1 is the only success code; anything else leaves the outputs unusable.
    if (info != 1)
        return false;

    // Clamp to the number of available components. A negative request keeps
    // everything, which is what the former unsigned comparison did implicitly.
    const int available = static_cast<int>(eigVectors.cols());
    if (pDim < 0 || available < pDim)
        pDim = available;
    if (pDim < 1)
        return false;
    this->pDim = pDim;

    projMat = new double[static_cast<size_t>(pDim) * featureCount];
    eigVec = new double[pDim];

    for (int i = 0; i < pDim; i++)  {
        eigVec[i] = 1 / sqrt(eigValues[i]);
        // Component i is column i of eigVectors; store it as row i.
        for (int j = 0; j < featureCount; j++)
            projMat[static_cast<size_t>(i) * featureCount + j] = eigVectors[j][i];
    }

    // save to file
    ofstream fout;
    fout.open(outputFile.c_str());
    if (!fout.is_open())    {
        cout<<"Cannot save projection matrix to "<<outputFile<<endl;
        return false;
    }
    // Line 1: dimensions; line 2: projMat; line 3: aveVec; line 4: eigVec.
    fout<<oDim<<" "<<pDim<<endl;
    const size_t projCount = static_cast<size_t>(pDim) * featureCount;
    fout<<projMat[0];
    for (size_t i = 1; i < projCount; i++)
        fout<<" "<<projMat[i];
    fout<<endl;
    fout<<aveVec[0];
    for (int i = 1; i < featureCount; i++)
        fout<<" "<<aveVec[i];
    fout<<endl;
    fout<<eigVec[0];
    for (int i = 1; i < pDim; i++)
        fout<<" "<<eigVec[i];
    fout<<endl;
    fout.close();
    // Report a failed write or close instead of claiming success.
    if (!fout)  {
        cout<<"Cannot save projection matrix to "<<outputFile<<endl;
        return false;
    }

    return true;
}