int main(void)
{
    int M = 8192;
    int N = M;
    
    culaStatus status;

    float* A = NULL;
    float* B = NULL;

    A = (float*)malloc(M*N*sizeof(float));
    B = (float*)malloc(N*sizeof(float));
    if(!A || !B) exit(EXIT_FAILURE);
    memset(A, 0, M*N*sizeof(float));
    memset(B, 0, N*sizeof(float));

    status = culaInitialize();

    status = culaSgeqrf(M, N, A, M, B);
    
    culaShutdown();

    free(A);
    free(B);

    return EXIT_SUCCESS;
}
Esempio n. 2
0
/*
 * Runs culaSpotrf('U', ...) on the N x N single-precision matrix A in place
 * and then zeroes every element A[i + j*LDA] with j < i.
 *
 *   N   - order of the matrix
 *   A   - host buffer holding the matrix; overwritten
 *   LDA - leading dimension handed to CULA; also the column stride used when
 *         zeroing, so the same elements are addressed in both steps
 *
 * CULA is initialized and shut down inside this call. A failure of
 * culaInitialize or culaSpotrf is handed to checkStatus.
 */
void chofactor(int N, float* A, int LDA)
{
    int i, j;
    culaStatus status;

    status = culaInitialize();
    checkStatus(status);

    /* NaN pre-check; as before, its result is not acted upon. */
    (void)culaSgeNancheck(LDA, N, A, LDA);

    status = culaSpotrf('U', N, A, LDA);
    checkStatus(status);
    culaShutdown();

    /* Clear the entries below the diagonal, walking each column with the
     * leading dimension LDA (not N) as the stride. */
    for (j = 0; j < N; j++) {
        float* column = A + (size_t)j * (size_t)LDA;
        for (i = j + 1; i < N; i++) {
            column[i] = 0.0f;
        }
    }
}
// function to read the adjacency submatrices from file (when stored in two-column or three-column sparse matrix format with floating point entries); note: this reads, for example, G_XA^T instead of G_XA
//
// file_name - path of the text file to read
// G_name    - matrix name, used only in the progress and error messages
// G_ptr     - destination buffer; each record is stored at offset (c_idx-1)*N + (r_idx-1)
// N         - stride between consecutive c_idx values (NA or NB or NC)
//
// Records are read until end of file or until a token cannot be parsed as a number.
// If the file cannot be opened, or a record carries an index below 1 or an r_idx above N,
// a message is printed, culaShutdown() is called and the process exits with status 1.
void read_G(char *file_name, char *G_name, double *G_ptr, int N) // input file name, matrix name, pointer to matrix buffer, (NA or NB or NC)
{
  double r_idx, c_idx; // row and column indices - matlab style
  printf("reading %s\n", G_name); fflush(stdout);
  FILE *file_ptr = fopen(file_name, "r"); // opening G_name
  if(file_ptr == NULL) // exception handling if reading G_name fails
  {
    printf("reading %s adjacency submatrix failed\n", G_name);
    culaShutdown();
    exit(1);
  }
  // the loop is driven by the fscanf results rather than feof(), so a failed read never
  // leads to a stale or uninitialized index pair being processed
  // note: since we need (NA or NB or NC) \times NX, we read the column index first, then usual column-major
  // now, NROWS = (NA or NB or NC) (y-axis), NCOLS = NX (x-axis)
  while(fscanf(file_ptr, "%lf", &c_idx) == 1 && fscanf(file_ptr, "%lf", &r_idx) == 1) // reading G_name
  {
    // reject indices that would address memory before the buffer or spill into another column
    // (the negated comparisons also catch NaN)
    if(!(c_idx >= 1) || !(r_idx >= 1) || !(r_idx <= N))
    {
      printf("reading %s adjacency submatrix failed: index pair (%.0f, %.0f) is out of range\n", G_name, c_idx, r_idx);
      fclose(file_ptr);
      culaShutdown();
      exit(1);
    }
# ifdef EXPECTED
    {
      double value; // third column of the record
      if(fscanf(file_ptr, "%lf", &value) != 1)
        break; // truncated record: nothing is stored for it
      *(G_ptr+(size_t)((c_idx-1)*N+(r_idx-1))) = value;
    }
# endif
# ifdef BINARY
    *(G_ptr+(size_t)((c_idx-1)*N+(r_idx-1))) = 1;
# endif
    /*
    // optional printing to check file read is faithful (used for debugging)
    printf("%lf\n", c_idx);
    printf("%lf\n", r_idx);
    printf("%lf\n", *(G_ptr+(int)((c_idx-1)*N+(r_idx-1))));
    culaShutdown();
    exit(0);
    */
  }
  fclose(file_ptr);
}
Esempio n. 4
0
/*
 * Calls culaSpotrs('U', N, Nrhs, C, ldc, b, ldb) between a culaInitialize /
 * culaShutdown pair. The status of culaInitialize and of culaSpotrs is each
 * passed to checkStatus.
 *
 *   N, Nrhs - forwarded to culaSpotrs as the system order and right-hand-side count
 *   C, ldc  - matrix argument and its leading dimension
 *   b, ldb  - right-hand-side argument and its leading dimension
 */
void chosolve(int N, int Nrhs, float* C, int ldc, float* b, int ldb)
{
    checkStatus(culaInitialize());

    checkStatus(culaSpotrs('U', N, Nrhs, C, ldc, b, ldb));

    culaShutdown();
}
/*
 * CULA status check: returns silently when status is zero; otherwise prints
 * the text obtained from culaGetErrorInfoString, shuts CULA down and exits
 * the process with EXIT_FAILURE.
 */
void checkStatus(culaStatus status)
{
    if(status)
    {
        char message[256];

        culaGetErrorInfoString(status, culaGetErrorInfo(), message, sizeof(message));
        printf("%s\n", message);

        culaShutdown();
        exit(EXIT_FAILURE);
    }
}
// cula exception handler - the char * arguments must not be NULL when calling
// does nothing when cula_err is culaNoError; otherwise prints the user-supplied names together
// with the cula message, flushes stdout, shuts cula down and exits with status 2
void cula_exception(culaStatus cula_err, char *cula_func, char *term) // error status, cula function that fails, term in the algorithm
{
  char text[256]; // receives the cula description of the failure

  if(cula_err == culaNoError)
    return;

  culaGetErrorInfoString(cula_err, culaGetErrorInfo(), text, sizeof(text));
  printf("(cula error) user message: %s for %s failed; cula message: %s\n", cula_func, term, text);
  fflush(stdout);
  culaShutdown();
  exit(2);
}
/* Main */
/*
 * MEX gateway that applies CUGEMM to MATLAB arrays.
 *
 * Inputs (prhs):
 *   [0] ta     - 0 selects 'N' for A, anything else selects 'T'
 *   [1] tb     - 0 selects 'N' for B, anything else selects 'T'
 *   [2] alpha  - scalar
 *   [3] beta   - scalar
 *   [4] A, [5] B, [6] C - arrays that must satisfy MXISDOUBLE
 * Output (plhs):
 *   [0] MM x NN array holding the contents of the C buffer after the CUGEMM calls
 *
 * The GEMM runs on the output array, which is pre-filled with a copy of C, so
 * the input prhs[6] is never written to. Errors are raised with mexErrMsgTxt.
 */
void mexFunction( int nlhs, mxArray *plhs[],
    int nrhs, const mxArray *prhs[]) {

	if (nrhs != 7) {
		mexErrMsgTxt("sgemm requires 7 input arguments");
	} else if (nlhs != 1) {
		mexErrMsgTxt("sgemm requires 1 output argument");
	}

	if ( !MXISDOUBLE(prhs[4]) ||
		!MXISDOUBLE(prhs[5]) ||
		!MXISDOUBLE(prhs[6]))   {
		mexErrMsgTxt("Input arrays must be single precision.");
	}

	int ta = (int) mxGetScalar(prhs[0]);
	int tb = (int) mxGetScalar(prhs[1]);
	DOUBLE alpha = (DOUBLE) mxGetScalar(prhs[2]);
	DOUBLE beta = (DOUBLE) mxGetScalar(prhs[3]);
	DOUBLE *h_A = (DOUBLE*) mxGetData(prhs[4]);
	DOUBLE *h_B = (DOUBLE*) mxGetData(prhs[5]);
	DOUBLE *h_C = (DOUBLE*) mxGetData(prhs[6]);

	int M = mxGetM(prhs[4]);   /* gets number of rows of A */
	int K = mxGetN(prhs[4]);   /* gets number of columns of A */
	int L = mxGetM(prhs[5]);   /* gets number of rows of B */
	int N = mxGetN(prhs[5]);   /* gets number of columns of B */

	/* MM x KK is the shape of op(A); KB x NN is the shape of op(B). */
	char transa, transb;
	int MM, KK, NN, KB;
	if (ta == 0) {
		transa = 'N';
		MM=M;
		KK=K;
	} else {
		transa = 'T';
		MM=K;
		KK=M;
	}

	if (tb == 0) {
		transb = 'N';
		NN=N;
		KB=L;
	} else {
		transb = 'T';
		NN=L;
		KB=N;
	}

	/* Reject shapes that would make GEMM or the copy run past a buffer. */
	if (KB != KK) {
		mexErrMsgTxt("sgemm: inner matrix dimensions must agree");
	}
	if ((size_t) mxGetM(prhs[6]) * (size_t) mxGetN(prhs[6]) <
		(size_t) MM * (size_t) NN) {
		mexErrMsgTxt("sgemm: C has fewer elements than the product");
	}

/* Left hand side matrix set up */
	mwSize dims0[2];
	dims0[0]=MM;
	dims0[1]=NN;
	plhs[0] = mxCreateNumericArray(2,dims0,mxPRECISION_CLASS,mxREAL);
	DOUBLE *h_C_out = (DOUBLE*) mxGetData(plhs[0]);

	/* Seed the output with C so the GEMM can accumulate into it without
	 * touching the read-only input array. */
	memcpy((void *) h_C_out, (const void *) h_C,
		(size_t) MM * (size_t) NN * sizeof(h_C[0]));

	if (culaInitialize() != culaNoError) {
		culaShutdown();
		mexErrMsgTxt("sgemm: culaInitialize failed");
	}

	/* NOTE(review): the call is repeated 100000 times, which looks like a
	 * timing loop; with beta != 0 every pass feeds the previous result back
	 * in. Kept as-is so the returned values do not change - confirm intent. */
	int iter;
	for (iter = 0; iter < 100000; ++iter) {
	/* Performs operation using cublas */
		CUGEMM(transa, transb, MM, NN, KK, alpha, h_A, M, h_B, L, beta, h_C_out, MM);
	}

	/* Shutdown */
	culaShutdown();
}
Esempio n. 8
0
/*
 * SGESV example: calls culaSgesv on an N x N identity matrix A with a single
 * right-hand side. X starts as a copy of the random vector B and is compared
 * against B afterwards, element by element, with tolerance 1e-6.
 *
 * N is 8192 when NDEBUG is defined and 1024 otherwise. The process exits with
 * EXIT_FAILURE if a host allocation fails; CULA failures go through
 * checkStatus. All four host buffers are released before returning.
 */
void culaFloatExample()
{
#ifdef NDEBUG
    int N = 8192;
#else
    int N = 1024;
#endif
    int NRHS = 1;
    int i;
    /* Element counts in size_t so byte sizes and offsets cannot overflow int. */
    size_t mat_count = (size_t)N * (size_t)N;
    size_t vec_count = (size_t)N;

    culaStatus status;
    
    culaFloat* A = NULL;
    culaFloat* B = NULL;
    culaFloat* X = NULL;
    culaInt* IPIV = NULL;

    culaFloat one = 1.0f;
    culaFloat thresh = 1e-6f;
    culaFloat diff;

    printf("-------------------\n");
    printf("       SGESV\n");
    printf("-------------------\n");

    printf("Allocating Matrices\n");
    A = (culaFloat*)malloc(mat_count*sizeof(culaFloat));
    B = (culaFloat*)malloc(vec_count*sizeof(culaFloat));
    X = (culaFloat*)malloc(vec_count*sizeof(culaFloat));
    IPIV = (culaInt*)malloc(vec_count*sizeof(culaInt));
    /* X is checked as well: it is written by memcpy and by culaSgesv below. */
    if(!A || !B || !X || !IPIV)
    {
        free(A);
        free(B);
        free(X);
        free(IPIV);
        exit(EXIT_FAILURE);
    }

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);

    // Set A to the identity matrix
    memset(A, 0, mat_count*sizeof(culaFloat));
    for(i = 0; i < N; ++i)
        A[(size_t)i*N+i] = one;
    
    // Fill B with rand() values and start X as a copy of B
    for(i = 0; i < N; ++i)
        B[i] = (culaFloat)rand();
    memcpy(X, B, vec_count*sizeof(culaFloat));

    memset(IPIV, 0, vec_count*sizeof(culaInt));

    printf("Calling culaSgesv\n");
    status = culaSgesv(N, NRHS, A, N, IPIV, X, N);
    checkStatus(status);

    printf("Verifying Result\n");
    for(i = 0; i < N; ++i)
    {
        diff = X[i] - B[i];
        if(diff < 0.0f)
            diff = -diff;
        if(diff > thresh)
            printf("Result check failed:  i=%d  X[i]=%f  B[i]=%f\n", i, X[i], B[i]);
    }
    
    printf("Shutting down CULA\n\n");
    culaShutdown();

    free(A);
    free(B);
    free(X);
    free(IPIV);
}
Esempio n. 9
0
/*
 * ZGESV example: calls culaZgesv on an N x N identity matrix A with a single
 * right-hand side. X starts as a copy of the random complex vector B and is
 * compared against B afterwards (real part, imaginary part and magnitude)
 * with tolerance 1e-6.
 *
 * N is 1024 when NDEBUG is defined and 128 otherwise. If culaZgesv reports
 * culaInsufficientComputeCapability the example is skipped. The process exits
 * with EXIT_FAILURE if a host allocation fails; other CULA failures go
 * through checkStatus. All four host buffers are released on every return path.
 */
void culaDoubleComplexExample()
{
#ifdef NDEBUG
    int N = 1024;
#else
    int N = 128;
#endif
    int NRHS = 1;
    int i;
    /* Element counts in size_t so byte sizes and offsets cannot overflow int. */
    size_t mat_count = (size_t)N * (size_t)N;
    size_t vec_count = (size_t)N;

    culaStatus status;
    
    culaDoubleComplex* A = NULL;
    culaDoubleComplex* B = NULL;
    culaDoubleComplex* X = NULL;
    culaInt* IPIV = NULL;

    culaDoubleComplex one = { 1.0, 0.0 };
    culaDouble thresh = 1e-6;
    culaDouble diffr;
    culaDouble diffc;
    culaDouble diffabs;

    printf("-------------------\n");
    printf("       ZGESV\n");
    printf("-------------------\n");

    printf("Allocating Matrices\n");
    A = (culaDoubleComplex*)malloc(mat_count*sizeof(culaDoubleComplex));
    B = (culaDoubleComplex*)malloc(vec_count*sizeof(culaDoubleComplex));
    X = (culaDoubleComplex*)malloc(vec_count*sizeof(culaDoubleComplex));
    IPIV = (culaInt*)malloc(vec_count*sizeof(culaInt));
    /* X is checked as well: it is written by memcpy and by culaZgesv below. */
    if(!A || !B || !X || !IPIV)
    {
        free(A);
        free(B);
        free(X);
        free(IPIV);
        exit(EXIT_FAILURE);
    }

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);

    // Set A to the identity matrix
    memset(A, 0, mat_count*sizeof(culaDoubleComplex));
    for(i = 0; i < N; ++i)
        A[(size_t)i*N+i] = one;
    
    // Fill B with rand() values and start X as a copy of B
    for(i = 0; i < N; ++i)
    {
        B[i].x = (culaDouble)rand();
        B[i].y = (culaDouble)rand();
    }
    memcpy(X, B, vec_count*sizeof(culaDoubleComplex));

    memset(IPIV, 0, vec_count*sizeof(culaInt));

    printf("Calling culaZgesv\n");
    status = culaZgesv(N, NRHS, A, N, IPIV, X, N);
    if(status == culaInsufficientComputeCapability)
    {
        printf("No Double precision support available, skipping example\n");
        free(A);
        free(B);
        free(X);
        free(IPIV);
        culaShutdown();
        return;
    }
    checkStatus(status);

    printf("Verifying Result\n");
    for(i = 0; i < N; ++i)
    {
        diffr = X[i].x - B[i].x;
        diffc = X[i].y - B[i].y;
        diffabs = (culaDouble)sqrt(X[i].x*X[i].x+X[i].y*X[i].y)
                - (culaDouble)sqrt(B[i].x*B[i].x+B[i].y*B[i].y);
        if(diffr < 0.0)
            diffr = -diffr;
        if(diffc < 0.0)
            diffc = -diffc;
        if(diffabs < 0.0)
            diffabs = -diffabs;
        if(diffr > thresh || diffc > thresh || diffabs > thresh)
            printf("Result check failed:  i=%d  X[i]=(%f,%f)  B[i]=(%f,%f)\n", i, X[i].x, X[i].y, B[i].x, B[i].y);
    }
    
    printf("Shutting down CULA\n\n");
    culaShutdown();

    free(A);
    free(B);
    free(X);
    free(IPIV);
}
Esempio n. 10
0
/*
 * DGESV example: calls culaDgesv on an N x N identity matrix A with a single
 * right-hand side. X starts as a copy of the random vector B and is compared
 * against B afterwards, element by element, with tolerance 1e-6.
 *
 * N is 4096 when NDEBUG is defined and 512 otherwise. If culaDgesv reports
 * culaInsufficientComputeCapability the example is skipped. The process exits
 * with EXIT_FAILURE if a host allocation fails; other CULA failures go
 * through checkStatus. All four host buffers are released on every return path.
 */
void culaDoubleExample()
{
#ifdef NDEBUG
    int N = 4096;
#else
    int N = 512;
#endif
    int NRHS = 1;
    int i;
    /* Element counts in size_t so byte sizes and offsets cannot overflow int. */
    size_t mat_count = (size_t)N * (size_t)N;
    size_t vec_count = (size_t)N;

    culaStatus status;
    
    culaDouble* A = NULL;
    culaDouble* B = NULL;
    culaDouble* X = NULL;
    culaInt* IPIV = NULL;

    culaDouble one = 1.0;
    culaDouble thresh = 1e-6;
    culaDouble diff;
    
    printf("-------------------\n");
    printf("       DGESV\n");
    printf("-------------------\n");

    printf("Allocating Matrices\n");
    A = (culaDouble*)malloc(mat_count*sizeof(culaDouble));
    B = (culaDouble*)malloc(vec_count*sizeof(culaDouble));
    X = (culaDouble*)malloc(vec_count*sizeof(culaDouble));
    IPIV = (culaInt*)malloc(vec_count*sizeof(culaInt));
    /* X is checked as well: it is written by memcpy and by culaDgesv below. */
    if(!A || !B || !X || !IPIV)
    {
        free(A);
        free(B);
        free(X);
        free(IPIV);
        exit(EXIT_FAILURE);
    }

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);

    // Set A to the identity matrix
    memset(A, 0, mat_count*sizeof(culaDouble));
    for(i = 0; i < N; ++i)
        A[(size_t)i*N+i] = one;
    
    // Fill B with rand() values and start X as a copy of B
    for(i = 0; i < N; ++i)
        B[i] = (culaDouble)rand();
    memcpy(X, B, vec_count*sizeof(culaDouble));

    memset(IPIV, 0, vec_count*sizeof(culaInt));

    printf("Calling culaDgesv\n");
    status = culaDgesv(N, NRHS, A, N, IPIV, X, N);
    if(status == culaInsufficientComputeCapability)
    {
        printf("No Double precision support available, skipping example\n");
        free(A);
        free(B);
        free(X);
        free(IPIV);
        culaShutdown();
        return;
    }
    checkStatus(status);

    printf("Verifying Result\n");
    for(i = 0; i < N; ++i)
    {
        diff = X[i] - B[i];
        if(diff < 0.0)
            diff = -diff;
        if(diff > thresh)
            printf("Result check failed:  i=%d  X[i]=%f  B[i]=%f\n", i, X[i], B[i]);
    }
    
    printf("Shutting down CULA\n\n");
    culaShutdown();

    free(A);
    free(B);
    free(X);
    free(IPIV);
}
Esempio n. 11
0
/*
 * CGESV example: calls culaCgesv on an N x N identity matrix A with a single
 * right-hand side. X starts as a copy of the random complex vector B and is
 * compared against B afterwards (real part, imaginary part and magnitude)
 * with tolerance 1e-6.
 *
 * N is 4096 when NDEBUG is defined and 512 otherwise. The process exits with
 * EXIT_FAILURE if a host allocation fails; CULA failures go through
 * checkStatus. All four host buffers are released before returning.
 */
void culaFloatComplexExample()
{
#ifdef NDEBUG
    int N = 4096;
#else
    int N = 512;
#endif
    int NRHS = 1;
    int i;
    /* Element counts in size_t so byte sizes and offsets cannot overflow int. */
    size_t mat_count = (size_t)N * (size_t)N;
    size_t vec_count = (size_t)N;

    culaStatus status;
    
    culaFloatComplex* A = NULL;
    culaFloatComplex* B = NULL;
    culaFloatComplex* X = NULL;
    culaInt* IPIV = NULL;

    culaFloatComplex one = { 1.0f, 0.0f };
    culaFloat thresh = 1e-6f;
    culaFloat diffr;
    culaFloat diffc;
    culaFloat diffabs;

    printf("-------------------\n");
    printf("       CGESV\n");
    printf("-------------------\n");

    printf("Allocating Matrices\n");
    A = (culaFloatComplex*)malloc(mat_count*sizeof(culaFloatComplex));
    B = (culaFloatComplex*)malloc(vec_count*sizeof(culaFloatComplex));
    X = (culaFloatComplex*)malloc(vec_count*sizeof(culaFloatComplex));
    IPIV = (culaInt*)malloc(vec_count*sizeof(culaInt));
    /* X is checked as well: it is written by memcpy and by culaCgesv below. */
    if(!A || !B || !X || !IPIV)
    {
        free(A);
        free(B);
        free(X);
        free(IPIV);
        exit(EXIT_FAILURE);
    }

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);

    // Set A to the identity matrix
    memset(A, 0, mat_count*sizeof(culaFloatComplex));
    for(i = 0; i < N; ++i)
        A[(size_t)i*N+i] = one;
    
    // Fill B with rand() values and start X as a copy of B
    for(i = 0; i < N; ++i)
    {
        B[i].x = (culaFloat)rand();
        B[i].y = (culaFloat)rand();
    }
    memcpy(X, B, vec_count*sizeof(culaFloatComplex));

    memset(IPIV, 0, vec_count*sizeof(culaInt));

    printf("Calling culaCgesv\n");
    status = culaCgesv(N, NRHS, A, N, IPIV, X, N);
    checkStatus(status);

    printf("Verifying Result\n");
    for(i = 0; i < N; ++i)
    {
        diffr = X[i].x - B[i].x;
        diffc = X[i].y - B[i].y;
        diffabs = (culaFloat)sqrt(X[i].x*X[i].x+X[i].y*X[i].y)
                - (culaFloat)sqrt(B[i].x*B[i].x+B[i].y*B[i].y);
        if(diffr < 0.0f)
            diffr = -diffr;
        if(diffc < 0.0f)
            diffc = -diffc;
        if(diffabs < 0.0f)
            diffabs = -diffabs;
        if(diffr > thresh || diffc > thresh || diffabs > thresh)
            printf("Result check failed:  i=%d  X[i]=(%f,%f)  B[i]=(%f,%f)\n", i, X[i].x, X[i].y, B[i].x, B[i].y);
    }
    
    printf("Shutting down CULA\n\n");
    culaShutdown();

    free(A);
    free(B);
    free(X);
    free(IPIV);
}
Esempio n. 12
0
// function for pre-processing; here, exit(1) stands for file i/o error, exit(2) stands for cula error
//
// overview of the active (uncommented) steps, in order:
//   1. select gpu 1 and initialize cula
//   2. fill the ones vector (length NX) and the diagonal of the NA x NA identity buffer
//   3. read G_XA, G_XB, G_XC from FILE_A, FILE_B, FILE_C via read_G
//   4. compute the mean vectors mu_A, mu_B, mu_C
//   5. compute Z_B_num, Z_B_den, the nystrom pseudoinverse of Z_B_den, then Z_B; compute Z_C_num, then Z_C (reusing the same pinv buffer)
//   6. compute the tilde matrices G_XB_til, G_XC_til and the second moment M2_al0
//   7. compute the whitening matrix W via nystrom_whitening
//   8. compute the whitened adjacency matrices and whitened mean vectors
//   9. shut cula down
// all results are written into the *_buff buffers referenced below, which are declared outside this function
// every cula / helper status is routed through cula_exception; returns 0 when the end is reached
int pre_proc()
{
  cula_exception(culaSelectDevice(1), "culaSelectDevice", "pre_proc gpu selection"); // select which gpu to execute on; 0 is master or primary, 1 is slave or secondary
  printf("calling culaInitialize\n"); fflush(stdout);
  cula_exception(culaInitialize(), "culaInitialize", "pre_proc.c");

  // pointers to global buffers
  double *G_XA_ptr = G_XA_buff, *G_XB_ptr = G_XB_buff, *G_XC_ptr = G_XC_buff; // adjacency matrix pointers
  double *G_XB_til_ptr = G_XB_til_buff, *G_XC_til_ptr = G_XC_til_buff; // tilde pointers
  double *G_XA_white_ptr = G_XA_white_buff, *G_XB_til_white_ptr = G_XB_til_white_buff, *G_XC_til_white_ptr = G_XC_til_white_buff; // whitened matrix related pointers
  double *M2_al0_ptr = M2_al0_buff; // moment 2 pointer
  double *G_XA_al0_ptr = G_XA_al0_buff, *G_XB_al0_ptr = G_XB_al0_buff, *G_XC_al0_ptr = G_XC_al0_buff; // alpha0 matrix pointers
  double *mu_A_ptr = mu_A_buff, *mu_B_ptr = mu_B_buff, *mu_C_ptr = mu_C_buff; // mean vector pointers
  double *mu_B_til_ptr = mu_B_til_buff, *mu_C_til_ptr = mu_C_til_buff; // mean tilde vector pointers
  double *mu_A_white_ptr = mu_A_white_buff, *mu_B_til_white_ptr = mu_B_til_white_buff, *mu_C_til_white_ptr = mu_C_til_white_buff; // pointers to buffers for whitened mean vectors
  double *mu_A_mu_A_T_ptr = mu_A_mu_A_T_buff; // outer product of mean vector A with itself
  double *Z_B_ptr = Z_B_buff, *Z_B_num_ptr = Z_B_num_buff, *Z_B_den_ptr = Z_B_den_buff, *Z_C_ptr = Z_C_buff, *Z_C_num_ptr = Z_C_num_buff;//, *Z_C_den_ptr = Z_C_den_buff;
  double *ones_ptr = ones_buff, *eye_ptr = eye_buff; // pointer to vector of ones for computing mean of size NX; identity matrix pointer
  double *pinv_ptr = pinv_buff;//, *l_svec_mat_ptr = l_svec_mat_buff, *r_svec_mat_T_ptr = r_svec_mat_T_buff, *sval_vec_ptr = sval_vec_buff, *sval_mat_ptr = sval_mat_buff; // pointers to buffers for pseudoinverse computation
  double *W_ptr = W_buff; // pointers to buffer for whitening matrix using second method (pairs)
  int i, j; // for looping / indexing
  //double *superb = sup_buff; // pointer to superb for mkl svd
  //mkl_set_num_threads(16); // number of cpu threads


  printf("initializing all ones vector of length NX\n"); fflush(stdout);
  for(i=0; i<NX; i++) // using this loop to initialize because memset 1 for double does not hold
    *(ones_ptr+i) = 1;
  printf("initializing identity matrix of size NA x NA\n"); fflush(stdout);
  // NOTE(review): only the diagonal entries are written here; presumably eye_buff starts out zeroed - confirm
  for(i=0; i<NA*NA; i+=NA+1) // identity for cula products
    *(eye_ptr+i) = 1;


  // read the adjacency submatrices from the dataset
  read_G((char *)FILE_A, "G_XA", G_XA_ptr, (int)NA);
  read_G((char *)FILE_B, "G_XB", G_XB_ptr, (int)NB);
  read_G((char *)FILE_C, "G_XC", G_XC_ptr, (int)NC);


  // compute mean vectors
  printf("computing mu_A\n"); fflush(stdout);
/////  cula_exception(culaDgemv('n', NA, NX, 1/(double)(NX), G_XA_ptr, NA, ones_ptr, 1, 0, mu_A_ptr, 1), "culaDgemv", "mu_A"); // compute mean vector A
  cula_exception(culaBlockDgemm('n', 'n', NA, 1, NX, 1/(double)(NX), G_XA_ptr, NA, ones_ptr, NX, 0, mu_A_ptr, NA), "culaBlockDgemm", "mu_A"); // compute mean vector A using partitioned matrix multiplication

  printf("computing mu_B\n"); fflush(stdout);
/////  cula_exception(culaDgemv('n', NB, NX, 1/(double)(NX), G_XB_ptr, NB, ones_ptr, 1, 0, mu_B_ptr, 1), "culaDgemv", "mu_B"); // compute mean vector B
  cula_exception(culaBlockDgemm('n', 'n', NB, 1, NX, 1/(double)(NX), G_XB_ptr, NB, ones_ptr, NX, 0, mu_B_ptr, NB), "culaBlockDgemm", "mu_B"); // compute mean vector B using partitioned matrix multiplication

  printf("computing mu_C\n"); fflush(stdout);
/////  cula_exception(culaDgemv('n', NC, NX, 1/(double)(NX), G_XC_ptr, NC, ones_ptr, 1, 0, mu_C_ptr, 1), "culaDgemv", "mu_C"); // compute mean vector C
  cula_exception(culaBlockDgemm('n', 'n', NC, 1, NX, 1/(double)(NX), G_XC_ptr, NC, ones_ptr, NX, 0, mu_C_ptr, NC), "culaBlockDgemm", "mu_C"); // compute mean vector C using partitioned matrix multiplication


  // compute Z_B
  printf("computing Z_B_num\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 't', NA, NC, NX, 1/(double)(NX), G_XA_ptr, NA, G_XC_ptr, NC, 0, Z_B_num_ptr, NA), "culaBlockDgemm", "Z_B_num");

  printf("computing Z_B_den\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 't', NB, NC, NX, 1/(double)(NX), G_XB_ptr, NB, G_XC_ptr, NC, 0, Z_B_den_ptr, NB), "culaBlockDgemm", "Z_B_den");


  // note: if the estimated Pi's are 0's, then print and check Z_B_num and den here; if G_XA and G_XC are sparse, then product is even sparser; so, input dataset must be reclustered to have sufficient density; also check sizes to be handled by cula
/****** // this commented pseudoinverse using svd was used for synthetic, facebook and yelp datasets
  printf("computing SVD for Z_B_den\n"); fflush(stdout);
  cula_exception(culaDgesvd('A', 'A', NB, NC, Z_B_den_ptr, NB, sval_vec_ptr, l_svec_mat_ptr, NB, r_svec_mat_T_ptr, NC), "culaDgesvd", "Z_B_den");

///  if mkl svd is needed, using the following 3 lines instead of its cula counterpart
///  gettimeofday(&start_timeval_svd1, NULL);  // Measuring start time for svd1
///  LAPACKE_dgesvd(CblasColMajor, 'A', 'A', NB, NC, Z_B_den_ptr, NB, sval_vec_ptr, l_svec_mat_ptr, NB, r_svec_mat_T_ptr, NC, superb);
///  gettimeofday(&stop_timeval_svd1, NULL);  // Measuring stop time for svd1

  printf("for loop to copy singular values from SVD of Z_B_den\n"); fflush(stdout);
  for(i=0; i<NB; i++) // aliter: use cuda kernel from wrappers.cu but take care of the if check
  {
    if(i>NC)
      break;
    else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL)
      *(sval_mat_ptr+(NB*i+i)) = 1/(*(sval_vec_ptr+i));
  }

  printf("computing pseudoinverse for Z_B_den (step 1)\n"); fflush(stdout);
  cula_exception(culaDgemm('n', 'n', NB, NC, NB, 1, l_svec_mat_ptr, NB, sval_mat_ptr, NB, 0, pinv_ptr, NB), "culaBlockDgemm", "Z_B_den^+ (1), i.e., U*S_-1"); // U*S_-1

  printf("computing pseudoinverse for Z_B_den (step 2)\n"); fflush(stdout);
  cula_exception(culaDgemm('n', 'n', NB, NC, NC, 1, pinv_ptr, NB, r_svec_mat_T_ptr, NC, 0, pinv_ptr, NB), "culaBlockDgemm", "Z_B_den^+ (2), i.e., (U*S_-1)*VT"); // (U*S_-1)*VT - note: this is not the pseudoinverse yet; this is transposed later on directly in the subsequent gemm to get V * S_-1 * U^T
******/

/* // the following 2 lines use the iterative method; very slow even with partitioned gemm
  printf("computing pseudoinverse of Z_B_den using ben-israel iterations\n");
  cula_exception(pinv(NB, NC, Z_B_den_ptr, pinv_ptr), "ben-israel pinv (c malloc must have failed)", "pinv(Z_B_den)");
*/


/****** // this was the code that used pinv_ptr before transposing; this was used for facebook, yelp before partitioning, nystrom, etc.; note: look at the comment block associated with the memset statements below
    cula_exception(culaDgemm('n', 't', NA, NB, NC, 1, Z_B_num_ptr, NA, pinv_ptr, NB, 0, Z_B_ptr, NA), "culaDgemm", "Z_B"); // note: here 2nd arg is 't' because pinv is not yet transposed
******/

/****** // resetting buffers to 0 for reusing below; note: if uncommented, take care that this comes after the pinv_ptr values are used, i.e., insert in the appropriate location
  memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(pinv_ptr, 0, NB*NC*sizeof(double));
******/

  printf("computing pseudoinverse of Z_B_den using nystrom\n");
  cula_exception(pinv_nys_asym(NB, NC, NCOM, Z_B_den_ptr, pinv_ptr), "nystrom pinv (c malloc could have failed)", "pinv(Z_B_den)");

  printf("computing Z_B\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 'n', NA, NB, NC, 1, Z_B_num_ptr, NA, pinv_ptr, NC, 0, Z_B_ptr, NA), "culaBlockDgemm", "Z_B"); // note: here 2nd arg was 't' in the previous version of the code because pinv was not yet transposed; this was changed for the nystrom because 2nd arg is n not t as it is already transposed when it is returned from pin_nys_asym function

/******
  // reset buffers
  printf("resetting buffers used for pseudoinverse of Z_B_den\n"); fflush(stdout);
  memset(sval_vec_ptr, 0, NMAX*sizeof(double));
  memset(sval_mat_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(l_svec_mat_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(r_svec_mat_T_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
******/

  // compute Z_C
  printf("computing Z_C_num\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 't', NA, NB, NX, 1/(double)(NX), G_XA_ptr, NA, G_XB_ptr, NB, 0, Z_C_num_ptr, NA), "culaBlockDgemm", "Z_C_num");

/* // not needed: same as transpose of Z_B_den; can be used as debugging check
  printf("computing Z_C_den\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 't', NC, NB, NX, 1/(double)(NX), G_XC_ptr, NC, G_XB_ptr, NB, 0, Z_C_den_ptr, NC), "culaBlockDgemm", "Z_C_den");
*/

/****** // this commented pseudoinverse using svd was used for synthetic, facebook and yelp datasets
  printf("computing svd for Z_C_den\n"); fflush(stdout);
  cula_exception(culaDgesvd('A', 'A', NC, NB, Z_C_den_ptr, NC, sval_vec_ptr, l_svec_mat_ptr, NC, r_svec_mat_T_ptr, NB), "culaDgesvd", "Z_C_den");

///  if mkl svd is needed, using the following 3 lines instead of its cula counterpart
///  gettimeofday(&start_timeval_svd2, NULL);  // Measuring start time for svd2
///  LAPACKE_dgesvd(CblasColMajor, 'A', 'A', NC, NB, Z_C_den_ptr, NC, sval_vec_ptr, l_svec_mat_ptr, NC, r_svec_mat_T_ptr, NB, superb);
///  gettimeofday(&stop_timeval_svd2, NULL);  // Measuring stop time for svd2

  printf("for loop to copy singular values from svd of Z_C_den\n"); fflush(stdout);
  for(i=0; i<NC; i++) // aliter: could use cuda kernel from wrappers.cu but take care of the if check
  {
    if(i>NB)
      break;
    else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL)
      *(sval_mat_ptr+(NC*i+i)) = 1/(*(sval_vec_ptr+i));
  }

  printf("computing pseudoinverse for Z_C_den (step 1)\n"); fflush(stdout);
  cula_exception(culaDgemm('n', 'n', NC, NB, NC, 1, l_svec_mat_ptr, NC, sval_mat_ptr, NC, 0, pinv_ptr, NC), "culaBlockDgemm", "Z_C_den^+ (1), i.e., U*S_-1"); // U*S_-1

  printf("computing pseudoinverse for Z_C_den (step 2)\n"); fflush(stdout);
  cula_exception(culaDgemm('n', 'n', NC, NB, NB, 1, pinv_ptr, NC, r_svec_mat_T_ptr, NB, 0, pinv_ptr, NC), "culaBlockDgemm", "Z_C_den^+ (2), i.e., (U*S_-1)*VT"); // (U*S_-1)*VT - note: this is not the pseudoinverse yet; this is transposed later on directly in the subsequent gemm to get V * S_-1 * U^T
******/

/* // ben-israel iterations
  pinv(NB, NC, Z_C_den_ptr, pinv_ptr);
*/

  // pinv_ptr still holds the result written by pinv_nys_asym above; it is passed here with the 't' flag
  printf("computing Z_C\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 't', NA, NC, NB, 1, Z_C_num_ptr, NA, pinv_ptr, NC, 0, Z_C_ptr, NA), "culaBlockDgemm", "Z_C"); // note: here 2nd arg is 't' because pinv is not yet transposed; this was changed for the nystrom because 2nd arg is t since we have to transpose pinv(Z_B_den)

/******
  // reset buffers
  printf("resetting buffers used for pseudoinverse of Z_C_den\n"); fflush(stdout);
  memset(sval_vec_ptr, 0, NMAX*sizeof(double));
  memset(sval_mat_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(l_svec_mat_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(r_svec_mat_T_ptr, 0, NMAX*NMAX*sizeof(double));
  memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
******/


  // compute tilde matrices
  printf("computing G_XB_til\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 't', NX, NA, NB, 1, G_XB_ptr, NB, Z_B_ptr, NA, 0, G_XB_til_ptr, NX), "culaBlockDgemm", "G_XB_til"); // 1st arg is t as G_XB is NB x NX; 2nd arg is Z_B which is NA \times NB

  printf("computing G_XC_til\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 't', NX, NA, NC, 1, G_XC_ptr, NC, Z_C_ptr, NA, 0, G_XC_til_ptr, NX), "culaBlockDgemm", "G_XC_til"); // 1st arg is t as G_XC is NC x NX; 2nd arg is Z_C which is NA \times NC


  // compute M2_al0
  printf("computing M2_al0 (step 1)\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 'n', NA, NA, NX, 1/(double)(NX), G_XC_til_ptr, NX, G_XB_til_ptr, NX, 0, M2_al0_ptr, NA), "culaBlockDgemm", "M2_al0 (step 1)"); // intermediate M2

  printf("computing mu_A \\otimes mu_A\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 'n', NA, NA, 1, 1, mu_A_ptr, NA, mu_A_ptr, 1, 0, mu_A_mu_A_T_ptr, NA), "culaBlockDgemm", "mu_A \\otimes mu_A"); // outer product for shifted term

  printf("for loop for setting the diagonal of mu_A \\otimes mu_A to zero\n"); fflush(stdout);
  for(i=0; i<NA; i++)
    *(mu_A_mu_A_T_ptr+(NA*i+i)) = 0; // setting the diagonal to 0 for centering

  // the identity buffer is the left factor here, with beta = 1 on the existing contents of M2_al0
  printf("computing M2_al0 (step 2/final step)\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 'n', NA, NA, NA, -ALPHA_0/(ALPHA_0+1), eye_ptr, NA, mu_A_mu_A_T_ptr, NA, 1, M2_al0_ptr, NA), "culaBlockDgemm", "M2_al0"); // M2_al0 (NA x NA) computed


  printf("computing whitening matrix using the nystrom method\n"); fflush(stdout);
  cula_exception(nystrom_whitening(NA, NCOM, M2_al0_ptr, W_ptr), "nystrom whitening (c malloc could have failed)", "W");


/******
  cula_exception(culaDgesvd('A', 'A', NA, NA, M2_al0_ptr, NA, sval_vec_ptr, l_svec_mat_ptr, NA, r_svec_mat_T_ptr, NA), "culaDgesvd", "M2_al0"); // k-svd: remember when comparing to matlab's output that if x is singular vector, so is -x; also here, we are actually doing full svd and selecting NCOM singular values

  printf("for loop to copy singular values from svd of M2_al0\n"); fflush(stdout);
  for(i=0; i<NA; i++)
  {
    if(i>NA)
      break;
    else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL)
      *(sval_mat_ptr+(NA*i+i)) = 1/sqrt(*(sval_vec_ptr+i));
  }

  // computing the whitening matrix W
  printf("computing whitening matrix W\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('n', 'n', NA, NCOM, NA, 1, l_svec_mat_ptr, NA, sval_mat_ptr, NA, 0, W_ptr, NA), "culaBlockDgemm", "W"); // whitening matrix W computed
******/


/*
  // uncomment these lines to debug for small test cases to check if the whitening matrix is non-zero / computed correctly
  printf("whitening matrix debug check\n");
  fflush(stdout);
  print_mat(1, NA, W_ptr);
  culaShutdown();
  exit(2);
*/


  // generate the whitened data for stochastic method coded in another c source file - the adjacency matrix buffers are externed in the other source file
  printf("computing whitened adjacency matrix for A\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 'n', NCOM, NX, NA, 1, W_ptr, NA, G_XA_ptr, NA, 0, G_XA_white_ptr, NCOM), "culaBlockDgemm", "G_XA_white"); // G_XA_white

  printf("computing whitened adjacency matrix for B\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 't', NCOM, NX, NA, 1, W_ptr, NA, G_XB_til_ptr, NX, 0, G_XB_til_white_ptr, NCOM), "culaBlockDgemm", "G_XB_til_white"); // G_XB_til_white

  printf("computing whitened adjacency matrix for C\n"); fflush(stdout);
  cula_exception(culaBlockDgemm('t', 't', NCOM, NX, NA, 1, W_ptr, NA, G_XC_til_ptr, NX, 0, G_XC_til_white_ptr, NCOM), "culaBlockDgemm", "G_XC_til_white"); // G_XC_til_white


  printf("computing whitened mean vector for A\n"); fflush(stdout);
/////  cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_A_ptr, 1, 0, mu_A_white_ptr, 1), "culaDgemv", "mu_A_white"); // compute whitened mean vector A
  cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_A_ptr, NA, 0, mu_A_white_ptr, NCOM), "culaBlockDgemm", "mu_A_white"); // compute whitened mean vector A using partitioned matrix multiplication

  printf("computing whitened mean vector for B (step 1)\n"); fflush(stdout);
/////  cula_exception(culaDgemv('n', NA, NB, 1, Z_B_ptr, NA, mu_B_ptr, 1, 0, mu_B_til_ptr, 1), "culaDgemv", "mu_B_til");
  cula_exception(culaBlockDgemm('n', 'n', NA, 1, NB, 1, Z_B_ptr, NA, mu_B_ptr, NB, 0, mu_B_til_ptr, NA), "culaBlockDgemm", "mu_B_til");

  printf("computing whitened mean vector for B\n"); fflush(stdout);
/////  cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_B_til_ptr, 1, 0, mu_B_til_white_ptr, 1), "culaDgemv", "mu_B_til_white"); // compute whitened mean vector B
  cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_B_til_ptr, NA, 0, mu_B_til_white_ptr, NCOM), "culaBlockDgemm", "mu_B_white"); // compute whitened mean vector B using partitioned matrix multiplication

  printf("computing whitened mean vector for C (step 1)\n"); fflush(stdout);
/////  cula_exception(culaDgemv('n', NA, NC, 1, Z_C_ptr, NA, mu_C_ptr, 1, 0, mu_C_til_ptr, 1), "culaDgemv", "mu_C_til");
  cula_exception(culaBlockDgemm('n', 'n', NA, 1, NC, 1, Z_C_ptr, NA, mu_C_ptr, NC, 0, mu_C_til_ptr, NA), "culaBlockDgemm", "mu_C_til");

  printf("computing whitened mean vector for C\n"); fflush(stdout);
/////  cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_C_til_ptr, 1, 0, mu_C_til_white_ptr, 1), "culaDgemv", "mu_C_til_white"); // compute whitened mean vector C
  cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_C_til_ptr, NA, 0, mu_C_til_white_ptr, NCOM), "culaBlockDgemm", "mu_C_white"); // compute whitened mean vector C using partitioned matrix multiplication


  printf("pre-processing completed; ready for stochastic method\n");
  culaShutdown();
  return 0;
}
/*
 * Stops the stopwatch started at `start` and prints the two status lines
 * emitted after every benchmark stage: the elapsed wall-clock time followed
 * by the "check error" banner that precedes the approximation check.
 */
static void report_elapsed(struct timeval start)
{
    struct timeval end;
    double elapsed_secs;

    gettimeofday(&end, NULL);
    elapsed_secs = get_seconds_frac(start, end);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
}

/*
 * Benchmark driver for the low-rank decomposition routines.
 *
 * Initializes CULA, loads a matrix from a binary file, then runs and times
 * the QB, column ID, two-sided ID and CUR routines in turn, checking each
 * result with the matching use_*_for_approximation helper.
 *
 * Every routine receives the addresses of its output pointers and hands back
 * freshly allocated objects, so each stage releases its outputs as soon as the
 * approximation check is done.  Without that, the outputs of one stage are
 * overwritten (and leaked) by the next one.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE if the matrix pointer
 * returned by the loader is NULL.
 */
int main()
{
    int m, n, k, p, q, s, frank, kstep, nstep, culaVersion;
    double TOL;
    mat *M = NULL, *T = NULL, *S = NULL, *C = NULL, *U = NULL, *R = NULL, *Q = NULL, *B = NULL;
    vec *Icol = NULL, *Irow = NULL;
    char *M_file = "../../matrix_data/A_mat_6kx12k.bin";
    //char *M_file = "../data/A_mat_2kx4k.bin";
    //char *M_file = "../data/A_mat_1kx2k.bin";
    //char *M_file = "../data/A_mat_10x8.bin";
    struct timeval start_timeval;
    culaStatus status;

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);

    culaVersion = culaGetVersion();
    printf("culaVersion is %d\n", culaVersion);
 

    printf("loading matrix from %s\n", M_file);
    M = matrix_load_from_binary_file(M_file);
    if (M == NULL) {
        /* Guard the dereferences below in case the loader signals failure with NULL. */
        fprintf(stderr, "failed to load matrix from %s\n", M_file);
        culaShutdown();
        return EXIT_FAILURE;
    }
    m = M->nrows;
    n = M->ncols;
    printf("sizes of M are %d by %d\n", m, n);
    printf("norm(M,fro) = %f\n", get_matrix_frobenius_norm(M));

    // now test rank k ID of M..
    k = 1000; // rank
    p = 20; // oversampling
    q = 0; // power scheme power
    s = 2; // power scheme orthogonalization amount
    kstep = 500; //block step size
    nstep = 2;
    TOL = 0;
    
    // NOTE: the routine in this stage is disabled; only the (empty) timed region is reported
    printf("\ncalling rank %d column ID routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //id_decomp_fixed_rank_or_prec(M, k, 0, &frank, &Icol, &T);
    report_elapsed(start_timeval);
    //use_id_decomp_for_approximation(M, T, Icol, k);


    // NOTE: the routine in this stage is disabled; only the (empty) timed region is reported
    printf("\ncalling rank %d old QB routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //randQB_pb(M, kstep, nstep, q, s, &Q, &B);
    report_elapsed(start_timeval);
    //use_QB_decomp_for_approximation(M, Q, B);


    printf("\ncalling rank %d new QB routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    randQB_pb_new(M, kstep, nstep, TOL, q, s, &frank, &Q, &B);
    report_elapsed(start_timeval);
    use_QB_decomp_for_approximation(M, Q, B);
    matrix_delete(Q); matrix_delete(B);
    Q = NULL; B = NULL;



    printf("\ncalling rank %d block randomized column ID routine..\n", k);
    p = kstep; // p should be \geq kstep
    gettimeofday(&start_timeval, NULL);
    id_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &Icol, &T);
    report_elapsed(start_timeval);
    use_id_decomp_for_approximation(M, T, Icol, k);
    // release before the next stage overwrites these pointers
    matrix_delete(T); vector_delete(Icol);
    T = NULL; Icol = NULL;



    printf("\ncalling rank %d randomized column ID routine..\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    id_rand_decomp_fixed_rank(M, k, p, q, s, &Icol, &T);
    report_elapsed(start_timeval);
    use_id_decomp_for_approximation(M, T, Icol, k);
    matrix_delete(T); vector_delete(Icol);
    T = NULL; Icol = NULL;


    // NOTE: the routine in this stage is disabled; only the (empty) timed region is reported
    printf("\ncalling rank %d two sided ID routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //id_two_sided_decomp_fixed_rank_or_prec(M, k, TOL, &frank, &Icol, &Irow, &T, &S);
    report_elapsed(start_timeval);
    //use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);

    printf("\ncalling rank %d randomized two sided ID routine..\n", k);
    p = 20;
    gettimeofday(&start_timeval, NULL);
    id_two_sided_rand_decomp_fixed_rank(M, k, p, q, s, &Icol, &Irow, &T, &S);
    report_elapsed(start_timeval);
    use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);
    matrix_delete(T); matrix_delete(S);
    vector_delete(Icol); vector_delete(Irow);
    T = NULL; S = NULL; Icol = NULL; Irow = NULL;

    printf("\ncalling rank %d block randomized two sided ID routine..\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    id_two_sided_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &Icol, &Irow, &T, &S);
    report_elapsed(start_timeval);
    use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);
    matrix_delete(T); matrix_delete(S);
    vector_delete(Icol); vector_delete(Irow);
    T = NULL; S = NULL; Icol = NULL; Irow = NULL;


    // NOTE: the routine in this stage is disabled; only the (empty) timed region is reported
    printf("\ncalling rank %d CUR routine\n", k);
    gettimeofday(&start_timeval, NULL);
    //cur_decomp_fixed_rank_or_prec(M, k, TOL, &frank, &C, &U, &R);
    report_elapsed(start_timeval);
    //use_cur_decomp_for_approximation(M, C, U, R);

    printf("\ncalling rank %d randomized CUR routine\n", k);
    p = 20;
    gettimeofday(&start_timeval, NULL);
    cur_rand_decomp_fixed_rank(M, k, p, q, s, &C, &U, &R);
    report_elapsed(start_timeval);
    use_cur_decomp_for_approximation(M, C, U, R);
    matrix_delete(C); matrix_delete(U); matrix_delete(R);
    C = NULL; U = NULL; R = NULL;
 
    printf("\ncalling rank %d block randomized CUR routine\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    cur_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &C, &U, &R);
    report_elapsed(start_timeval);
    use_cur_decomp_for_approximation(M, C, U, R);
    matrix_delete(C); matrix_delete(U); matrix_delete(R);
    C = NULL; U = NULL; R = NULL;
    

    // delete and exit (all per-stage outputs were already released above)
    printf("delete and exit..\n");
    matrix_delete(M);

    printf("Shutting down CULA\n");
    culaShutdown();

    return EXIT_SUCCESS;
}
Esempio n. 14
0
int main(int argc, char* argv[]){

	//float beta;
	//int row,col;

	string file_name = FILE_NAME;

	HostMatrix<float> X;
	HostMatrix<float> X_test; 
	HostMatrix<float> Y;
	HostMatrix<float> Y_test;

	HostMatrix<float> Input;
	HostMatrix<float> Target;

	std::map<string,int> Classes;
	std::map<int,string> ClassesLookup;

	readFile(file_name,Input,Target,Classes,ClassesLookup);

	int kfold = 1;
	int correct_instances = 0;
	int incorrect_instances = 0;
	int total_instances = 0;

	int **confusionMatrix;

	confusionMatrix = (int**) malloc(sizeof(int*)*Classes.size());

	for(int i = 0; i < (int)Classes.size(); i++){
		confusionMatrix[i] = (int*) malloc(sizeof(int)*Classes.size());
		memset(confusionMatrix[i],0,sizeof(int)*Classes.size());
	}


	float Pet_mean = 0;
	float Ped_mean = 0;

	unsigned int seed = (unsigned)time(0);

	/***************RUN INFORMATION*************/

	writeHeader(Input,Classes.size(),seed);

	/*******************************************/


	if(!InitCUDA()) {
		return 1;
	}

	culaStatus status;
	status = culaInitialize();


	std::cout << "Starting " << std::endl;


	float center_time = 0;
	float width_time = 0;
	float weight_time = 0;
	float scaling_time = 0;

	unsigned int time_total = 0;
	unsigned int testing_time = 0;
	unsigned int training_time = 0;

	clock_t initialTimeTotal = clock();

	do{	
		X = crossvalidationTrain(Input,KFOLDS,kfold);
		X_test = crossvalidationTest(Input,KFOLDS,kfold);
		Y = crossvalidationTrain(Target,KFOLDS,kfold);
		Y_test = crossvalidationTest(Target,KFOLDS,kfold);

		HostMatrix<float> Weights;
		HostMatrix<float> Centers;

		/*Train Network*/

		clock_t initialTime = clock();
		RadialBasisFunction RBF(NETWORK_SIZE,RNEIGHBOURS,SCALING_FACTOR,Classes.size());
		RBF.SetSeed(seed);
		RBF.Train(X,Y);
		training_time = (clock() - initialTime);

		center_time += RBF.times[0];
		width_time += RBF.times[1];
		weight_time += RBF.times[2];
		scaling_time += RBF.times[3];

		/*Test Network*/

		initialTime = clock();
		std::cout << "Testing" << std::endl;
		HostMatrix<float> out_test;


		out_test = RBF.Test(X_test);

		for(int i = 0; i< X_test.Rows();i++){

			float max = 0;
			float out_class = 0;
			for(int j = 0; j < (int) Classes.size(); j++){
				if(out_test(i,j) > max){
					out_class = (float)j;
					max = out_test(i,j);
				}
			}

			out_test(i,0) = out_class+1;

		}

		for (int i = 0; i < out_test.Rows(); i++)
		{

			out_test(i,0) = (float)round(out_test(i,0));     

			if(out_test(i,0) <= 0) out_test(i,0) = 1;

			if(out_test(i,0) > Classes.size()) out_test(i,0) = (float)Classes.size();

			std::cout << Y_test(i,0) << " " << out_test(i,0) << std::endl;
		}


		correct_instances += out_test.Rows() - error_calc(Y_test,out_test);
		incorrect_instances += error_calc(Y_test,out_test);
		total_instances += out_test.Rows();

		/*Add values to Confusion Matrix*/
		for(int i = 0; i < Y_test.Rows(); i++){
			confusionMatrix[((int)Y_test(i,0))-1][((int)out_test(i,0))-1] = confusionMatrix[((int)Y_test(i,0))-1][((int)out_test(i,0))-1] + 1;
		}

		testing_time = (clock() - initialTime);

		/*Increment fold number, for use in crossvalidation*/
		kfold++;
	}while(kfold <= KFOLDS);

	time_total  = (clock() - initialTimeTotal);

	/*****************MEASURES****************/

	measures(correct_instances,total_instances,incorrect_instances,confusionMatrix,Classes,ClassesLookup);

	writeFooter(center_time,width_time,weight_time,scaling_time,training_time,testing_time,time_total);

	culaShutdown();
	cudaThreadExit();

	return 0;
}