/*
 * Driver that runs culaSgeqrf on a zero-filled M x N single-precision matrix
 * (M = N = 8192), with a length-N buffer passed as the last argument.
 *
 * Returns EXIT_SUCCESS only when both culaInitialize and culaSgeqrf report a
 * zero status; any non-zero status yields EXIT_FAILURE. An allocation failure
 * terminates the process with EXIT_FAILURE.
 */
int main(void)
{
    int M = 8192;
    int N = M;
    /* Element counts are kept in size_t so the byte sizes are never computed in int. */
    size_t a_count = (size_t)M * (size_t)N;
    size_t b_count = (size_t)N;
    int exit_code = EXIT_SUCCESS;
    culaStatus status;
    float* A = NULL;
    float* B = NULL;

    A = (float*)malloc(a_count * sizeof(float));
    B = (float*)malloc(b_count * sizeof(float));
    if (!A || !B)
        exit(EXIT_FAILURE);

    memset(A, 0, a_count * sizeof(float));
    memset(B, 0, b_count * sizeof(float));

    /* Only attempt the factorization when initialization succeeded. */
    status = culaInitialize();
    if (!status)
        status = culaSgeqrf(M, N, A, M, B);

    /* A non-zero status from either call is surfaced through the exit code. */
    if (status)
        exit_code = EXIT_FAILURE;

    culaShutdown();
    free(A);
    free(B);
    return exit_code;
}
/*
 * Factorizes A in place with culaSpotrf (uplo 'U') and then zeroes the
 * entries A[j*LDA + i] for j < i, i.e. the triangle on the other side of the
 * diagonal from the one the 'U' factorization writes.
 *
 *   N    order of the matrix
 *   A    matrix storage, overwritten with the result
 *   LDA  leading dimension of A (distance between consecutive columns/rows
 *        in memory); must be used as the stride for every element access
 *
 * CULA is initialized and shut down inside this call. Any failure reported
 * by culaInitialize or culaSpotrf is handed to checkStatus.
 */
void chofactor(int N, float* A, int LDA)
{
    int i, j;
    culaStatus status;

    status = culaInitialize();
    checkStatus(status);

    /* NaN pre-check: its status is intentionally not acted on, exactly as
       before (it is overwritten by the factorization status below).
       NOTE(review): the row count passed here is LDA rather than N - confirm
       that checking the padding rows is intended. */
    status = culaSgeNancheck(LDA, N, A, LDA);

    status = culaSpotrf('U', N, A, LDA);
    checkStatus(status);
    culaShutdown();

    /* Clear the unused triangle. The stride is LDA, not N: indexing with N
       touched the wrong elements whenever the matrix was padded (LDA != N). */
    for (i = 0; i < N; i++) {
        for (j = 0; j < i; j++) {
            A[j * LDA + i] = 0.;
        }
    }
}
// function to read the adjacency submatrices from file (when stored in two-column or three-column sparse matrix format with floating point entries); note: this reads, for example, G_XA^T instead of G_XA
//
// file_name : path of the text file to read
// G_name    : matrix name, used only in the log / error messages
// G_ptr     : destination buffer; the entry for a record is stored at offset (c_idx-1)*N + (r_idx-1)
// N         : (NA or NB or NC), used as the stride in the offset above
//
// each record is "<c_idx> <r_idx>" and, when EXPECTED is defined, a third value that is stored in the buffer;
// when BINARY is defined the addressed entry is set to 1
// if the file cannot be opened, a message is printed, culaShutdown() is called and the process exits with code 1
// reading stops at end of file or at the first record that cannot be parsed
void read_G(char *file_name, char *G_name, double *G_ptr, int N) // input file name, matrix name, pointer to matrix buffer, (NA or NB or NC)
{
    double r_idx, c_idx; // row and column indices - matlab style
    FILE *file_ptr;

    printf("reading %s\n", G_name);
    fflush(stdout);

    file_ptr = fopen(file_name, "r"); // opening G_name
    if(file_ptr == NULL) // exception handling if reading G_name fails
    {
        printf("reading %s adjacency submatrix failed\n", G_name);
        culaShutdown();
        exit(1);
    }

    // the loop is driven by the fscanf results instead of feof(): feof() only becomes true after a read has
    // already failed, so the old loop used stale (or, for an empty file, uninitialized) indices on its last
    // pass and never terminated when the file contained a token that is not a number
    // note: since we need (NA or NB or NC) \times NX, we read the column index first, then usual column-major
    // now, NROWS = (NA or NB or NC) (y-axis), NCOLS = NX (x-axis)
    while(fscanf(file_ptr, "%lf", &c_idx) == 1 && fscanf(file_ptr, "%lf", &r_idx) == 1) // reading G_name
    {
# ifdef EXPECTED
        if(fscanf(file_ptr, "%lf", G_ptr+(int)((c_idx-1)*N+(r_idx-1))) != 1)
            break; // truncated record: stop instead of spinning on the same bad input
# endif
# ifdef BINARY
        *(G_ptr+(int)((c_idx-1)*N+(r_idx-1))) = 1;
# endif
    }

    fclose(file_ptr);
}
/*
 * Calls culaSpotrs with uplo 'U' on the supplied matrices.
 *
 *   N, Nrhs  sizes forwarded unchanged to culaSpotrs
 *   C, ldc   first matrix argument and its leading dimension
 *   b, ldb   second matrix argument and its leading dimension
 *
 * CULA is initialized before and shut down after the call; the status of
 * culaInitialize and of culaSpotrs is each passed to checkStatus.
 */
void chosolve(int N, int Nrhs, float* C, int ldc, float* b, int ldb)
{
    /* Bring CULA up; checkStatus deals with a failed start. */
    checkStatus(culaInitialize());

    /* Run the solve and let checkStatus inspect its result. */
    checkStatus(culaSpotrs('U', N, Nrhs, C, ldc, b, ldb));

    culaShutdown();
}
/*
 * cula error status
 *
 * Does nothing when status is zero. Otherwise fetches the CULA error text
 * for this status, prints it on stdout, shuts CULA down and terminates the
 * process with EXIT_FAILURE.
 */
void checkStatus(culaStatus status)
{
    if (status) {
        char message[256];

        culaGetErrorInfoString(status, culaGetErrorInfo(), message, sizeof(message));
        printf("%s\n", message);

        culaShutdown();
        exit(EXIT_FAILURE);
    }
}
// function for cula exception handling - make sure that the char * arguments are not NULL while calling void cula_exception(culaStatus cula_err, char *cula_func, char *term) // error status, cula function that fails, term in the algorithm { int cula_info; // identifier for the cula error char cula_msg[256]; // buffer for storing the cula excepetion message if(cula_err != culaNoError) { cula_info = culaGetErrorInfo(); culaGetErrorInfoString(cula_err, cula_info, cula_msg, sizeof(cula_msg)); printf("(cula error) user message: %s for %s failed; cula message: %s\n", cula_func, term, cula_msg); fflush(stdout); culaShutdown(); exit(2); } }
/* Main */
/*
 * MEX gateway that runs the CUGEMM macro on MATLAB arrays.
 *
 * Inputs (prhs):
 *   [0] ta     0 selects 'N' for A, anything else selects 'T'
 *   [1] tb     0 selects 'N' for B, anything else selects 'T'
 *   [2] alpha  scalar
 *   [3] beta   scalar
 *   [4] A      M x K array
 *   [5] B      L x N array
 *   [6] C      array whose contents seed the result; must be MM x NN
 * Output (plhs):
 *   [0] MM x NN array holding C after the CUGEMM passes
 *
 * MM, KK and NN are the dimensions after applying ta / tb (see below).
 * DOUBLE, MXISDOUBLE, mxPRECISION_CLASS and CUGEMM are macros defined
 * outside this block; presumably CUGEMM is a GEMM of the matching precision.
 */
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if ( !MXISDOUBLE(prhs[4]) || !MXISDOUBLE(prhs[5]) || !MXISDOUBLE(prhs[6])) {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    DOUBLE alpha = (DOUBLE) mxGetScalar(prhs[2]);
    DOUBLE beta = (DOUBLE) mxGetScalar(prhs[3]);
    DOUBLE *h_A = (DOUBLE*) mxGetData(prhs[4]);
    DOUBLE *h_B = (DOUBLE*) mxGetData(prhs[5]);
    /* C is an input array and is only read here; it is never written to. */
    const DOUBLE *h_C = (const DOUBLE*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]); /* gets number of rows of A */
    int K = mxGetN(prhs[4]); /* gets number of columns of A */
    int L = mxGetM(prhs[5]); /* gets number of rows of B */
    int N = mxGetN(prhs[5]); /* gets number of columns of B */
    int C_rows = mxGetM(prhs[6]); /* gets number of rows of C */
    int C_cols = mxGetN(prhs[6]); /* gets number of columns of C */

    char transa, transb;
    int MM, KK, NN;
    int B_inner; /* dimension of op(B) that must agree with KK */

    if (ta == 0) {
        transa = 'N';
        MM=M;
        KK=K;
    } else {
        transa = 'T';
        MM=K;
        KK=M;
    }

    if (tb == 0) {
        transb = 'N';
        NN=N;
        B_inner=L;
    } else {
        transb = 'T';
        NN=L;
        B_inner=N;
    }

    /* Reject shapes that would make the GEMM or the copy below run past the
       end of an input buffer. */
    if (B_inner != KK) {
        mexErrMsgTxt("sgemm: inner dimensions of op(A) and op(B) do not agree");
    }
    if (C_rows != MM || C_cols != NN) {
        mexErrMsgTxt("sgemm: C must have the same size as op(A)*op(B)");
    }

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0]=MM;
    dims0[1]=NN;
    plhs[0] = mxCreateNumericArray(2,dims0,mxPRECISION_CLASS,mxREAL);
    DOUBLE *h_C_out = (DOUBLE*) mxGetData(plhs[0]);

    /* Seed the output with C and run the GEMM on the output buffer, so the
       MATLAB input prhs[6] is left untouched. The values produced are the
       same as running in place on C and copying afterwards. */
    memcpy((void *) h_C_out, (const void *) h_C, sizeof(h_C[0]) * MM * NN);

    culaInitialize();

    /* NOTE(review): the call is repeated 100000 times, which looks like
       benchmarking scaffolding; every pass reads the C left by the previous
       pass, so with beta != 0 the result is not a single GEMM - confirm. */
    int iter;
    for (iter = 0; iter < 100000; ++iter) {
        /* Performs operation using cublas */
        CUGEMM(transa, transb, MM, NN, KK, alpha, h_A, M, h_B, L, beta, h_C_out, MM);
    }

    /* Shutdown */
    culaShutdown();
}
void culaFloatExample() { #ifdef NDEBUG int N = 8192; #else int N = 1024; #endif int NRHS = 1; int i; culaStatus status; culaFloat* A = NULL; culaFloat* B = NULL; culaFloat* X = NULL; culaInt* IPIV = NULL; culaFloat one = 1.0f; culaFloat thresh = 1e-6f; culaFloat diff; printf("-------------------\n"); printf(" SGESV\n"); printf("-------------------\n"); printf("Allocating Matrices\n"); A = (culaFloat*)malloc(N*N*sizeof(culaFloat)); B = (culaFloat*)malloc(N*sizeof(culaFloat)); X = (culaFloat*)malloc(N*sizeof(culaFloat)); IPIV = (culaInt*)malloc(N*sizeof(culaInt)); if(!A || !B || !IPIV) exit(EXIT_FAILURE); printf("Initializing CULA\n"); status = culaInitialize(); checkStatus(status); // Set A to the identity matrix memset(A, 0, N*N*sizeof(culaFloat)); for(i = 0; i < N; ++i) A[i*N+i] = one; // Set B to a random matrix (see note at top) for(i = 0; i < N; ++i) B[i] = (culaFloat)rand(); memcpy(X, B, N*sizeof(culaFloat)); memset(IPIV, 0, N*sizeof(culaInt)); printf("Calling culaSgesv\n"); status = culaSgesv(N, NRHS, A, N, IPIV, X, N); checkStatus(status); printf("Verifying Result\n"); for(i = 0; i < N; ++i) { diff = X[i] - B[i]; if(diff < 0.0f) diff = -diff; if(diff > thresh) printf("Result check failed: i=%d X[i]=%f B[i]=%f", i, X[i], B[i]); } printf("Shutting down CULA\n\n"); culaShutdown(); free(A); free(B); free(IPIV); }
void culaDoubleComplexExample() { #ifdef NDEBUG int N = 1024; #else int N = 128; #endif int NRHS = 1; int i; culaStatus status; culaDoubleComplex* A = NULL; culaDoubleComplex* B = NULL; culaDoubleComplex* X = NULL; culaInt* IPIV = NULL; culaDoubleComplex one = { 1.0, 0.0 }; culaDouble thresh = 1e-6; culaDouble diffr; culaDouble diffc; culaDouble diffabs; printf("-------------------\n"); printf(" ZGESV\n"); printf("-------------------\n"); printf("Allocating Matrices\n"); A = (culaDoubleComplex*)malloc(N*N*sizeof(culaDoubleComplex)); B = (culaDoubleComplex*)malloc(N*sizeof(culaDoubleComplex)); X = (culaDoubleComplex*)malloc(N*sizeof(culaDoubleComplex)); IPIV = (culaInt*)malloc(N*sizeof(culaInt)); if(!A || !B || !IPIV) exit(EXIT_FAILURE); printf("Initializing CULA\n"); status = culaInitialize(); checkStatus(status); // Set A to the identity matrix memset(A, 0, N*N*sizeof(culaDoubleComplex)); for(i = 0; i < N; ++i) A[i*N+i] = one; // Set B to a random matrix (see note at top) for(i = 0; i < N; ++i) { B[i].x = (culaDouble)rand(); B[i].y = (culaDouble)rand(); } memcpy(X, B, N*sizeof(culaDoubleComplex)); memset(IPIV, 0, N*sizeof(culaInt)); printf("Calling culaZgesv\n"); status = culaZgesv(N, NRHS, A, N, IPIV, X, N); if(status == culaInsufficientComputeCapability) { printf("No Double precision support available, skipping example\n"); free(A); free(B); free(IPIV); culaShutdown(); return; } checkStatus(status); printf("Verifying Result\n"); for(i = 0; i < N; ++i) { diffr = X[i].x - B[i].x; diffc = X[i].y - B[i].y; diffabs = (culaDouble)sqrt(X[i].x*X[i].x+X[i].y*X[i].y) - (culaDouble)sqrt(B[i].x*B[i].x+B[i].y*B[i].y); if(diffr < 0.0) diffr = -diffr; if(diffc < 0.0) diffc = -diffc; if(diffabs < 0.0) diffabs = -diffabs; if(diffr > thresh || diffc > thresh || diffabs > thresh) printf("Result check failed: i=%d X[i]=(%f,%f) B[i]=(%f,%f)", i, X[i].x, X[i].y, B[i].x, B[i].y); } printf("Shutting down CULA\n\n"); culaShutdown(); free(A); free(B); free(IPIV); }
void culaDoubleExample() { #ifdef NDEBUG int N = 4096; #else int N = 512; #endif int NRHS = 1; int i; culaStatus status; culaDouble* A = NULL; culaDouble* B = NULL; culaDouble* X = NULL; culaInt* IPIV = NULL; culaDouble one = 1.0; culaDouble thresh = 1e-6; culaDouble diff; printf("-------------------\n"); printf(" DGESV\n"); printf("-------------------\n"); printf("Allocating Matrices\n"); A = (culaDouble*)malloc(N*N*sizeof(culaDouble)); B = (culaDouble*)malloc(N*sizeof(culaDouble)); X = (culaDouble*)malloc(N*sizeof(culaDouble)); IPIV = (culaInt*)malloc(N*sizeof(culaInt)); if(!A || !B || !IPIV) exit(EXIT_FAILURE); printf("Initializing CULA\n"); status = culaInitialize(); checkStatus(status); // Set A to the identity matrix memset(A, 0, N*N*sizeof(culaDouble)); for(i = 0; i < N; ++i) A[i*N+i] = one; // Set B to a random matrix (see note at top) for(i = 0; i < N; ++i) B[i] = (culaDouble)rand(); memcpy(X, B, N*sizeof(culaDouble)); memset(IPIV, 0, N*sizeof(culaInt)); printf("Calling culaDgesv\n"); status = culaDgesv(N, NRHS, A, N, IPIV, X, N); if(status == culaInsufficientComputeCapability) { printf("No Double precision support available, skipping example\n"); free(A); free(B); free(IPIV); culaShutdown(); return; } checkStatus(status); printf("Verifying Result\n"); for(i = 0; i < N; ++i) { diff = X[i] - B[i]; if(diff < 0.0) diff = -diff; if(diff > thresh) printf("Result check failed: i=%d X[i]=%f B[i]=%f", i, X[i], B[i]); } printf("Shutting down CULA\n\n"); culaShutdown(); free(A); free(B); free(IPIV); }
void culaFloatComplexExample() { #ifdef NDEBUG int N = 4096; #else int N = 512; #endif int NRHS = 1; int i; culaStatus status; culaFloatComplex* A = NULL; culaFloatComplex* B = NULL; culaFloatComplex* X = NULL; culaInt* IPIV = NULL; culaFloatComplex one = { 1.0f, 0.0f }; culaFloat thresh = 1e-6f; culaFloat diffr; culaFloat diffc; culaFloat diffabs; printf("-------------------\n"); printf(" CGESV\n"); printf("-------------------\n"); printf("Allocating Matrices\n"); A = (culaFloatComplex*)malloc(N*N*sizeof(culaFloatComplex)); B = (culaFloatComplex*)malloc(N*sizeof(culaFloatComplex)); X = (culaFloatComplex*)malloc(N*sizeof(culaFloatComplex)); IPIV = (culaInt*)malloc(N*sizeof(culaInt)); if(!A || !B || !IPIV) exit(EXIT_FAILURE); printf("Initializing CULA\n"); status = culaInitialize(); checkStatus(status); // Set A to the identity matrix memset(A, 0, N*N*sizeof(culaFloatComplex)); for(i = 0; i < N; ++i) A[i*N+i] = one; // Set B to a random matrix (see note at top) for(i = 0; i < N; ++i) { B[i].x = (culaFloat)rand(); B[i].y = (culaFloat)rand(); } memcpy(X, B, N*sizeof(culaFloatComplex)); memset(IPIV, 0, N*sizeof(culaInt)); printf("Calling culaCgesv\n"); status = culaCgesv(N, NRHS, A, N, IPIV, X, N); checkStatus(status); printf("Verifying Result\n"); for(i = 0; i < N; ++i) { diffr = X[i].x - B[i].x; diffc = X[i].y - B[i].y; diffabs = (culaFloat)sqrt(X[i].x*X[i].x+X[i].y*X[i].y) - (culaFloat)sqrt(B[i].x*B[i].x+B[i].y*B[i].y); if(diffr < 0.0f) diffr = -diffr; if(diffc < 0.0f) diffc = -diffc; if(diffabs < 0.0f) diffabs = -diffabs; if(diffr > thresh || diffc > thresh || diffabs > thresh) printf("Result check failed: i=%d X[i]=(%f,%f) B[i]=(%f,%f)", i, X[i].x, X[i].y, B[i].x, B[i].y); } printf("Shutting down CULA\n\n"); culaShutdown(); free(A); free(B); free(IPIV); }
// function for pre-processing; here, exit(1) stands for file i/o error, exit(2) stands for cula error
//
// overview of what the visible code does, in order:
//   1. selects cula device 1 and initializes cula
//   2. takes local pointers to the global *_buff buffers (declared outside this function)
//   3. fills the ones vector (length NX) and the diagonal of the NA x NA identity buffer
//   4. reads the three adjacency submatrices with read_G
//   5. computes mu_A, mu_B, mu_C, then Z_B (via pinv_nys_asym) and Z_C, the tilde matrices, and M2_al0
//   6. computes the whitening matrix W with nystrom_whitening
//   7. computes the whitened adjacency matrices and whitened mean vectors
//   8. shuts cula down and returns 0
// every cula / helper status is passed through cula_exception; all matrix products use culaBlockDgemm
// the large commented-out regions are earlier variants (svd-based pseudoinverse, ben-israel iterations, buffer resets) kept for reference
// NOTE(review): G_XA_al0_ptr, G_XB_al0_ptr, G_XC_al0_ptr and j are declared but not used in the live code below
int pre_proc()
{
    cula_exception(culaSelectDevice(1), "culaSelectDevice", "pre_proc gpu selection"); // select which gpu to execute on; 0 is master or primary, 1 is slave or secondary
    printf("calling culaInitialize\n"); fflush(stdout);
    cula_exception(culaInitialize(), "culaInitialize", "pre_proc.c");

    // pointers to global buffers
    double *G_XA_ptr = G_XA_buff, *G_XB_ptr = G_XB_buff, *G_XC_ptr = G_XC_buff; // adjacency matrix pointers
    double *G_XB_til_ptr = G_XB_til_buff, *G_XC_til_ptr = G_XC_til_buff; // tilde pointers
    double *G_XA_white_ptr = G_XA_white_buff, *G_XB_til_white_ptr = G_XB_til_white_buff, *G_XC_til_white_ptr = G_XC_til_white_buff; // whitened matrix related pointers
    double *M2_al0_ptr = M2_al0_buff; // moment 2 pointer
    double *G_XA_al0_ptr = G_XA_al0_buff, *G_XB_al0_ptr = G_XB_al0_buff, *G_XC_al0_ptr = G_XC_al0_buff; // alpha0 matrix pointers
    double *mu_A_ptr = mu_A_buff, *mu_B_ptr = mu_B_buff, *mu_C_ptr = mu_C_buff; // mean vector pointers
    double *mu_B_til_ptr = mu_B_til_buff, *mu_C_til_ptr = mu_C_til_buff; // mean tilde vector pointers
    double *mu_A_white_ptr = mu_A_white_buff, *mu_B_til_white_ptr = mu_B_til_white_buff, *mu_C_til_white_ptr = mu_C_til_white_buff; // pointers to buffers for whitened mean vectors
    double *mu_A_mu_A_T_ptr = mu_A_mu_A_T_buff; // outer product of mean vector A with itself
    double *Z_B_ptr = Z_B_buff, *Z_B_num_ptr = Z_B_num_buff, *Z_B_den_ptr = Z_B_den_buff, *Z_C_ptr = Z_C_buff, *Z_C_num_ptr = Z_C_num_buff;//, *Z_C_den_ptr = Z_C_den_buff;
    double *ones_ptr = ones_buff, *eye_ptr = eye_buff; // pointer to vector of ones for computing mean of size NX; identity matrix pointer
    double *pinv_ptr = pinv_buff;//, *l_svec_mat_ptr = l_svec_mat_buff, *r_svec_mat_T_ptr = r_svec_mat_T_buff, *sval_vec_ptr = sval_vec_buff, *sval_mat_ptr = sval_mat_buff; // pointers to buffers for pseudoinverse computation
    double *W_ptr = W_buff; // pointers to buffer for whitening matrix using second method (pairs)
    int i, j; // for looping / indexing
    //double *superb = sup_buff; // pointer to superb for mkl svd
    //mkl_set_num_threads(16); // number of cpu threads

    printf("initializing all ones vector of length NX\n"); fflush(stdout);
    for(i=0; i<NX; i++) // using this loop to initialize because memset 1 for double does not hold
        *(ones_ptr+i) = 1;
    printf("initializing identity matrix of size NA x NA\n"); fflush(stdout);
    // stepping by NA+1 visits exactly the diagonal entries of the NA x NA buffer; the off-diagonal entries are not written here
    for(i=0; i<NA*NA; i+=NA+1) // identity for cula products
        *(eye_ptr+i) = 1;

    // read the adjacency submatrices from the dataset
    read_G((char *)FILE_A, "G_XA", G_XA_ptr, (int)NA);
    read_G((char *)FILE_B, "G_XB", G_XB_ptr, (int)NB);
    read_G((char *)FILE_C, "G_XC", G_XC_ptr, (int)NC);

    // compute mean vectors
    // each mean is (1/NX) * G * ones, written as a gemm with a single output column
    printf("computing mu_A\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('n', NA, NX, 1/(double)(NX), G_XA_ptr, NA, ones_ptr, 1, 0, mu_A_ptr, 1), "culaDgemv", "mu_A"); // compute mean vector A
    cula_exception(culaBlockDgemm('n', 'n', NA, 1, NX, 1/(double)(NX), G_XA_ptr, NA, ones_ptr, NX, 0, mu_A_ptr, NA), "culaBlockDgemm", "mu_A"); // compute mean vector A using partitioned matrix multiplication
    printf("computing mu_B\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('n', NB, NX, 1/(double)(NX), G_XB_ptr, NB, ones_ptr, 1, 0, mu_B_ptr, 1), "culaDgemv", "mu_B"); // compute mean vector B
    cula_exception(culaBlockDgemm('n', 'n', NB, 1, NX, 1/(double)(NX), G_XB_ptr, NB, ones_ptr, NX, 0, mu_B_ptr, NB), "culaBlockDgemm", "mu_B"); // compute mean vector B using partitioned matrix multiplication
    printf("computing mu_C\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('n', NC, NX, 1/(double)(NX), G_XC_ptr, NC, ones_ptr, 1, 0, mu_C_ptr, 1), "culaDgemv", "mu_C"); // compute mean vector C
    cula_exception(culaBlockDgemm('n', 'n', NC, 1, NX, 1/(double)(NX), G_XC_ptr, NC, ones_ptr, NX, 0, mu_C_ptr, NC), "culaBlockDgemm", "mu_C"); // compute mean vector C using partitioned matrix multiplication

    // compute Z_B
    // Z_B_num = (1/NX) * G_XA * G_XC^T and Z_B_den = (1/NX) * G_XB * G_XC^T; Z_B = Z_B_num * pinv(Z_B_den)
    printf("computing Z_B_num\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 't', NA, NC, NX, 1/(double)(NX), G_XA_ptr, NA, G_XC_ptr, NC, 0, Z_B_num_ptr, NA), "culaBlockDgemm", "Z_B_num");
    printf("computing Z_B_den\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 't', NB, NC, NX, 1/(double)(NX), G_XB_ptr, NB, G_XC_ptr, NC, 0, Z_B_den_ptr, NB), "culaBlockDgemm", "Z_B_den");
    // note: if the estimated Pi's are 0's, then print and check Z_B_num and den here; if G_XA and G_XC are sparse, then product is even sparser; so, input dataset must be reclustered to have sufficient density; also check sizes to be handled by cula

    /******
    // this commented pseudoinverse using svd was used for synthetic, facebook and yelp datasets
    printf("computing SVD for Z_B_den\n"); fflush(stdout);
    cula_exception(culaDgesvd('A', 'A', NB, NC, Z_B_den_ptr, NB, sval_vec_ptr, l_svec_mat_ptr, NB, r_svec_mat_T_ptr, NC), "culaDgesvd", "Z_B_den");
    /// if mkl svd is needed, using the following 3 lines instead of its cula counterpart
    /// gettimeofday(&start_timeval_svd1, NULL); // Measuring start time for svd1
    /// LAPACKE_dgesvd(CblasColMajor, 'A', 'A', NB, NC, Z_B_den_ptr, NB, sval_vec_ptr, l_svec_mat_ptr, NB, r_svec_mat_T_ptr, NC, superb);
    /// gettimeofday(&stop_timeval_svd1, NULL); // Measuring stop time for svd1
    printf("for loop to copy singular values from SVD of Z_B_den\n"); fflush(stdout);
    for(i=0; i<NB; i++) // aliter: use cuda kernel from wrappers.cu but take care of the if check
    {
        if(i>NC) break;
        else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL) *(sval_mat_ptr+(NB*i+i)) = 1/(*(sval_vec_ptr+i));
    }
    printf("computing pseudoinverse for Z_B_den (step 1)\n"); fflush(stdout);
    cula_exception(culaDgemm('n', 'n', NB, NC, NB, 1, l_svec_mat_ptr, NB, sval_mat_ptr, NB, 0, pinv_ptr, NB), "culaBlockDgemm", "Z_B_den^+ (1), i.e., U*S_-1"); // U*S_-1
    printf("computing pseudoinverse for Z_B_den (step 2)\n"); fflush(stdout);
    cula_exception(culaDgemm('n', 'n', NB, NC, NC, 1, pinv_ptr, NB, r_svec_mat_T_ptr, NC, 0, pinv_ptr, NB), "culaBlockDgemm", "Z_B_den^+ (2), i.e., (U*S_-1)*VT"); // (U*S_-1)*VT - note: this is not the pseudoinverse yet; this is transposed later on directly in the subsequent gemm to get V * S_-1 * U^T
    ******/

    /*
    // the following 2 lines use the iterative method; very slow even with partitioned gemm
    printf("computing pseudoinverse of Z_B_den using ben-israel iterations\n");
    cula_exception(pinv(NB, NC, Z_B_den_ptr, pinv_ptr), "ben-israel pinv (c malloc must have failed)", "pinv(Z_B_den)");
    */

    /******
    // this was the code that used pinv_ptr before transposing; this was used for facebook, yelp before partitioning, nystrom, etc.; note: look at the comment block associated with the memset statements below
    cula_exception(culaDgemm('n', 't', NA, NB, NC, 1, Z_B_num_ptr, NA, pinv_ptr, NB, 0, Z_B_ptr, NA), "culaDgemm", "Z_B"); // note: here 2nd arg is 't' because pinv is not yet transposed
    ******/

    /******
    // resetting buffers to 0 for reusing below; note: if uncommented, take care that this comes after the pinv_ptr values are used, i.e., insert in the appropriate location
    memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(pinv_ptr, 0, NB*NC*sizeof(double));
    ******/

    // live path: pseudoinverse of Z_B_den through the nystrom helper; its return value is checked like a cula status
    printf("computing pseudoinverse of Z_B_den using nystrom\n");
    cula_exception(pinv_nys_asym(NB, NC, NCOM, Z_B_den_ptr, pinv_ptr), "nystrom pinv (c malloc could have failed)", "pinv(Z_B_den)");
    printf("computing Z_B\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 'n', NA, NB, NC, 1, Z_B_num_ptr, NA, pinv_ptr, NC, 0, Z_B_ptr, NA), "culaBlockDgemm", "Z_B"); // note: here 2nd arg was 't' in the previous version of the code because pinv was not yet transposed; this was changed for the nystrom because 2nd arg is n not t as it is already transposed when it is returned from pin_nys_asym function

    /******
    // reset buffers
    printf("resetting buffers used for pseudoinverse of Z_B_den\n"); fflush(stdout);
    memset(sval_vec_ptr, 0, NMAX*sizeof(double));
    memset(sval_mat_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(l_svec_mat_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(r_svec_mat_T_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
    ******/

    // compute Z_C
    // Z_C_num = (1/NX) * G_XA * G_XB^T; Z_C reuses pinv_ptr (the pseudoinverse of Z_B_den computed above), transposed inside the gemm
    printf("computing Z_C_num\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 't', NA, NB, NX, 1/(double)(NX), G_XA_ptr, NA, G_XB_ptr, NB, 0, Z_C_num_ptr, NA), "culaBlockDgemm", "Z_C_num");

    /*
    // not needed: same as transpose of Z_B_den; can be used as debugging check
    printf("computing Z_C_den\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 't', NC, NB, NX, 1/(double)(NX), G_XC_ptr, NC, G_XB_ptr, NB, 0, Z_C_den_ptr, NC), "culaBlockDgemm", "Z_C_den");
    */

    /******
    // this commented pseudoinverse using svd was used for synthetic, facebook and yelp datasets
    printf("computing svd for Z_C_den\n"); fflush(stdout);
    cula_exception(culaDgesvd('A', 'A', NC, NB, Z_C_den_ptr, NC, sval_vec_ptr, l_svec_mat_ptr, NC, r_svec_mat_T_ptr, NB), "culaDgesvd", "Z_C_den");
    /// if mkl svd is needed, using the following 3 lines instead of its cula counterpart
    /// gettimeofday(&start_timeval_svd2, NULL); // Measuring start time for svd2
    /// LAPACKE_dgesvd(CblasColMajor, 'A', 'A', NC, NB, Z_C_den_ptr, NC, sval_vec_ptr, l_svec_mat_ptr, NC, r_svec_mat_T_ptr, NB, superb);
    /// gettimeofday(&stop_timeval_svd2, NULL); // Measuring stop time for svd2
    printf("for loop to copy singular values from svd of Z_C_den\n"); fflush(stdout);
    for(i=0; i<NC; i++) // aliter: could use cuda kernel from wrappers.cu but take care of the if check
    {
        if(i>NB) break;
        else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL) *(sval_mat_ptr+(NC*i+i)) = 1/(*(sval_vec_ptr+i));
    }
    printf("computing pseudoinverse for Z_C_den (step 1)\n"); fflush(stdout);
    cula_exception(culaDgemm('n', 'n', NC, NB, NC, 1, l_svec_mat_ptr, NC, sval_mat_ptr, NC, 0, pinv_ptr, NC), "culaBlockDgemm", "Z_C_den^+ (1), i.e., U*S_-1"); // U*S_-1
    printf("computing pseudoinverse for Z_C_den (step 2)\n"); fflush(stdout);
    cula_exception(culaDgemm('n', 'n', NC, NB, NB, 1, pinv_ptr, NC, r_svec_mat_T_ptr, NB, 0, pinv_ptr, NC), "culaBlockDgemm", "Z_C_den^+ (2), i.e., (U*S_-1)*VT"); // (U*S_-1)*VT - note: this is not the pseudoinverse yet; this is transposed later on directly in the subsequent gemm to get V * S_-1 * U^T
    ******/

    /*
    // ben-israel iterations
    pinv(NB, NC, Z_C_den_ptr, pinv_ptr);
    */

    printf("computing Z_C\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 't', NA, NC, NB, 1, Z_C_num_ptr, NA, pinv_ptr, NC, 0, Z_C_ptr, NA), "culaBlockDgemm", "Z_C"); // note: here 2nd arg is 't' because pinv is not yet transposed; this was changed for the nystrom because 2nd arg is t since we have to transpose pinv(Z_B_den)

    /******
    // reset buffers
    printf("resetting buffers used for pseudoinverse of Z_C_den\n"); fflush(stdout);
    memset(sval_vec_ptr, 0, NMAX*sizeof(double));
    memset(sval_mat_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(l_svec_mat_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(r_svec_mat_T_ptr, 0, NMAX*NMAX*sizeof(double));
    memset(pinv_ptr, 0, NMAX*NMAX*sizeof(double));
    ******/

    // compute tilde matrices
    printf("computing G_XB_til\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 't', NX, NA, NB, 1, G_XB_ptr, NB, Z_B_ptr, NA, 0, G_XB_til_ptr, NX), "culaBlockDgemm", "G_XB_til"); // 1st arg is t as G_XB is NB x NX; 2nd arg is Z_B which is NA \times NB
    printf("computing G_XC_til\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 't', NX, NA, NC, 1, G_XC_ptr, NC, Z_C_ptr, NA, 0, G_XC_til_ptr, NX), "culaBlockDgemm", "G_XC_til"); // 1st arg is t as G_XC is NC x NX; 2nd arg is Z_C which is NA \times NC

    // compute M2_al0
    // step 1 stores (1/NX) * G_XC_til^T * G_XB_til; step 2 adds -ALPHA_0/(ALPHA_0+1) * eye * (mu_A mu_A^T with zeroed diagonal) on top (beta = 1)
    printf("computing M2_al0 (step 1)\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 'n', NA, NA, NX, 1/(double)(NX), G_XC_til_ptr, NX, G_XB_til_ptr, NX, 0, M2_al0_ptr, NA), "culaBlockDgemm", "M2_al0 (step 1)"); // intermediate M2
    printf("computing mu_A \\otimes mu_A\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 'n', NA, NA, 1, 1, mu_A_ptr, NA, mu_A_ptr, 1, 0, mu_A_mu_A_T_ptr, NA), "culaBlockDgemm", "mu_A \\otimes mu_A"); // outer product for shifted term
    printf("for loop for setting the diagonal of mu_A \\otimes mu_A to zero\n"); fflush(stdout);
    for(i=0; i<NA; i++)
        *(mu_A_mu_A_T_ptr+(NA*i+i)) = 0; // setting the diagonal to 0 for centering
    printf("computing M2_al0 (step 2/final step)\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 'n', NA, NA, NA, -ALPHA_0/(ALPHA_0+1), eye_ptr, NA, mu_A_mu_A_T_ptr, NA, 1, M2_al0_ptr, NA), "culaBlockDgemm", "M2_al0"); // M2_al0 (NA x NA) computed

    // whitening matrix W from M2_al0 through the nystrom helper; its return value is checked like a cula status
    printf("computing whitening matrix using the nystrom method\n"); fflush(stdout);
    cula_exception(nystrom_whitening(NA, NCOM, M2_al0_ptr, W_ptr), "nystrom whitening (c malloc could have failed)", "W");

    /******
    cula_exception(culaDgesvd('A', 'A', NA, NA, M2_al0_ptr, NA, sval_vec_ptr, l_svec_mat_ptr, NA, r_svec_mat_T_ptr, NA), "culaDgesvd", "M2_al0"); // k-svd: remember when comparing to matlab's output that if x is singular vector, so is -x; also here, we are actually doing full svd and selecting NCOM singular values
    printf("for loop to copy singular values from svd of M2_al0\n"); fflush(stdout);
    for(i=0; i<NA; i++)
    {
        if(i>NA) break;
        else if(fabs(*(sval_vec_ptr+i)) > PINV_TOL) *(sval_mat_ptr+(NA*i+i)) = 1/sqrt(*(sval_vec_ptr+i));
    }
    // computing the whitening matrix W
    printf("computing whitening matrix W\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('n', 'n', NA, NCOM, NA, 1, l_svec_mat_ptr, NA, sval_mat_ptr, NA, 0, W_ptr, NA), "culaBlockDgemm", "W"); // whitening matrix W computed
    ******/

    /*
    // uncomment these lines to debug for small test cases to check if the whitening matrix is non-zero / computed correctly
    printf("whitening matrix debug check\n"); fflush(stdout);
    print_mat(1, NA, W_ptr);
    culaShutdown();
    exit(2);
    */

    // generate the whitened data for stochastic method coded in another c source file - the adjacency matrix buffers are externed in the other source file
    // each whitened quantity is W^T times the corresponding matrix / vector, giving NCOM rows
    printf("computing whitened adjacency matrix for A\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 'n', NCOM, NX, NA, 1, W_ptr, NA, G_XA_ptr, NA, 0, G_XA_white_ptr, NCOM), "culaBlockDgemm", "G_XA_white"); // G_XA_white
    printf("computing whitened adjacency matrix for B\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 't', NCOM, NX, NA, 1, W_ptr, NA, G_XB_til_ptr, NX, 0, G_XB_til_white_ptr, NCOM), "culaBlockDgemm", "G_XB_til_white"); // G_XB_til_white
    printf("computing whitened adjacency matrix for C\n"); fflush(stdout);
    cula_exception(culaBlockDgemm('t', 't', NCOM, NX, NA, 1, W_ptr, NA, G_XC_til_ptr, NX, 0, G_XC_til_white_ptr, NCOM), "culaBlockDgemm", "G_XC_til_white"); // G_XC_til_white

    printf("computing whitened mean vector for A\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_A_ptr, 1, 0, mu_A_white_ptr, 1), "culaDgemv", "mu_A_white"); // compute whitened mean vector A
    cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_A_ptr, NA, 0, mu_A_white_ptr, NCOM), "culaBlockDgemm", "mu_A_white"); // compute whitened mean vector A using partitioned matrix multiplication
    printf("computing whitened mean vector for B (step 1)\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('n', NA, NB, 1, Z_B_ptr, NA, mu_B_ptr, 1, 0, mu_B_til_ptr, 1), "culaDgemv", "mu_B_til");
    cula_exception(culaBlockDgemm('n', 'n', NA, 1, NB, 1, Z_B_ptr, NA, mu_B_ptr, NB, 0, mu_B_til_ptr, NA), "culaBlockDgemm", "mu_B_til");
    printf("computing whitened mean vector for B\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_B_til_ptr, 1, 0, mu_B_til_white_ptr, 1), "culaDgemv", "mu_B_til_white"); // compute whitened mean vector B
    cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_B_til_ptr, NA, 0, mu_B_til_white_ptr, NCOM), "culaBlockDgemm", "mu_B_white"); // compute whitened mean vector B using partitioned matrix multiplication
    printf("computing whitened mean vector for C (step 1)\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('n', NA, NC, 1, Z_C_ptr, NA, mu_C_ptr, 1, 0, mu_C_til_ptr, 1), "culaDgemv", "mu_C_til");
    cula_exception(culaBlockDgemm('n', 'n', NA, 1, NC, 1, Z_C_ptr, NA, mu_C_ptr, NC, 0, mu_C_til_ptr, NA), "culaBlockDgemm", "mu_C_til");
    printf("computing whitened mean vector for C\n"); fflush(stdout);
    ///// cula_exception(culaDgemv('t', NA, NCOM, 1, W_ptr, NA, mu_C_til_ptr, 1, 0, mu_C_til_white_ptr, 1), "culaDgemv", "mu_C_til_white"); // compute whitened mean vector C
    cula_exception(culaBlockDgemm('t', 'n', NCOM, 1, NA, 1, W_ptr, NA, mu_C_til_ptr, NA, 0, mu_C_til_white_ptr, NCOM), "culaBlockDgemm", "mu_C_white"); // compute whitened mean vector C using partitioned matrix multiplication

    printf("pre-processing completed; ready for stochastic method\n");
    culaShutdown();
    return 0;
}
/*
 * Timing driver for the low-rank decomposition routines.
 *
 * Initializes CULA, loads the matrix stored in M_file, prints its size and
 * Frobenius norm, then for each routine prints a banner, times the call with
 * gettimeofday / get_seconds_frac, and calls the matching use_*_for_approximation
 * helper. Several routines are commented out; their banner and timing
 * statements still run, so those sections time an empty interval.
 *
 * Returns EXIT_SUCCESS after deleting M, T, S, Icol, Irow and shutting CULA down.
 *
 * NOTE(review): T, S, Icol, Irow, Q, B, C, U and R are passed by address to
 * several routines in turn and only the final T/S/Icol/Irow are deleted at the
 * end; if each routine allocates its outputs, the earlier results (and Q, B,
 * C, U, R) are never released - confirm against the routine implementations.
 * NOTE(review): i, j, l, normM, normU, normS, normV, normP, percent_error,
 * start_time and end_time are declared but not used in the live code.
 */
int main()
{
    int i, j, m, n, k, l, p, q, s, frank, kstep, nstep, culaVersion;
    double TOL,normM,normU,normS,normV,normP,percent_error,elapsed_secs;
    mat *M, *T, *S, *C, *U, *R, *Q, *B;
    vec *Icol, *Irow;
    time_t start_time, end_time;
    /* input matrix file; the alternatives below are smaller test inputs */
    char *M_file = "../../matrix_data/A_mat_6kx12k.bin";
    //char *M_file = "../data/A_mat_2kx4k.bin";
    //char *M_file = "../data/A_mat_1kx2k.bin";
    //char *M_file = "../data/A_mat_10x8.bin";
    struct timeval start_timeval, end_timeval;

    culaStatus status;

    printf("Initializing CULA\n");
    status = culaInitialize();
    checkStatus(status);
    culaVersion = culaGetVersion();
    printf("culaVersion is %d\n", culaVersion);

    printf("loading matrix from %s\n", M_file);
    M = matrix_load_from_binary_file(M_file);
    m = M->nrows;
    n = M->ncols;
    printf("sizes of M are %d by %d\n", m, n);
    printf("norm(M,fro) = %f\n", get_matrix_frobenius_norm(M));

    // now test rank k ID of M..
    k = 1000; // rank
    p = 20; // oversampling
    q = 0; // power scheme power
    s = 2; // power scheme orthogonalization amount
    kstep = 500; //block step size
    nstep = 2;
    TOL = 0;

    /* column ID: the routine call is disabled, only the banner and timing run */
    printf("\ncalling rank %d column ID routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //id_decomp_fixed_rank_or_prec(M, k, 0, &frank, &Icol, &T);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    //use_id_decomp_for_approximation(M, T, Icol, k);

    /* old QB: the routine call is disabled, only the banner and timing run */
    printf("\ncalling rank %d old QB routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //randQB_pb(M, kstep, nstep, q, s, &Q, &B);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    //use_QB_decomp_for_approximation(M, Q, B);

    /* new QB: writes frank, Q and B */
    printf("\ncalling rank %d new QB routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    randQB_pb_new(M, kstep, nstep, TOL, q, s, &frank, &Q, &B);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_QB_decomp_for_approximation(M, Q, B);

    /* block randomized column ID: writes frank, Icol and T */
    printf("\ncalling rank %d block randomized column ID routine..\n", k);
    p = kstep; // p should be \geq kstep
    gettimeofday(&start_timeval, NULL);
    id_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &Icol, &T);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_id_decomp_for_approximation(M, T, Icol, k);

    /* randomized column ID: writes Icol and T again */
    printf("\ncalling rank %d randomized column ID routine..\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    id_rand_decomp_fixed_rank(M, k, p, q, s, &Icol, &T);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_id_decomp_for_approximation(M, T, Icol, k);

    // delete and exit
    /* disabled early exit, kept for runs that stop after the column ID tests */
    //printf("delete and exit..\n");
    //matrix_delete(M); matrix_delete(T); matrix_delete(S);
    //vector_delete(Icol); vector_delete(Irow);
    //printf("Shutting down CULA\n");
    //culaShutdown();
    //return EXIT_SUCCESS;

    /* two sided ID: the routine call is disabled, only the banner and timing run */
    printf("\ncalling rank %d two sided ID routine..\n", k);
    gettimeofday(&start_timeval, NULL);
    //id_two_sided_decomp_fixed_rank_or_prec(M, k, TOL, &frank, &Icol, &Irow, &T, &S);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    //use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);

    /* randomized two sided ID: writes Icol, Irow, T and S */
    printf("\ncalling rank %d randomized two sided ID routine..\n", k);
    p = 20;
    gettimeofday(&start_timeval, NULL);
    id_two_sided_rand_decomp_fixed_rank(M, k, p, q, s, &Icol, &Irow, &T, &S);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);

    /* block randomized two sided ID: writes frank, Icol, Irow, T and S */
    printf("\ncalling rank %d block randomized two sided ID routine..\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    id_two_sided_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &Icol, &Irow, &T, &S);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_id_two_sided_decomp_for_approximation(M, T, S, Icol, Irow, k);

    /* CUR: the routine call is disabled, only the banner and timing run */
    printf("\ncalling rank %d CUR routine\n", k);
    gettimeofday(&start_timeval, NULL);
    //cur_decomp_fixed_rank_or_prec(M, k, TOL, &frank, &C, &U, &R);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    //use_cur_decomp_for_approximation(M, C, U, R);

    /* randomized CUR: writes C, U and R */
    printf("\ncalling rank %d randomized CUR routine\n", k);
    p = 20;
    gettimeofday(&start_timeval, NULL);
    cur_rand_decomp_fixed_rank(M, k, p, q, s, &C, &U, &R);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_cur_decomp_for_approximation(M, C, U, R);

    /* block randomized CUR: writes frank, C, U and R */
    printf("\ncalling rank %d block randomized CUR routine\n", k);
    p = kstep;
    gettimeofday(&start_timeval, NULL);
    cur_blockrand_decomp_fixed_rank_or_prec(M, k, p, TOL, kstep, q, s, &frank, &C, &U, &R);
    gettimeofday(&end_timeval, NULL);
    elapsed_secs = get_seconds_frac(start_timeval,end_timeval);
    printf("elapsed time is: %4.1f\n", elapsed_secs);
    printf("check error\n");
    use_cur_decomp_for_approximation(M, C, U, R);

    // delete and exit
    printf("delete and exit..\n");
    matrix_delete(M); matrix_delete(T); matrix_delete(S);
    vector_delete(Icol); vector_delete(Irow);

    printf("Shutting down CULA\n");
    culaShutdown();

    return EXIT_SUCCESS;
}
int main(int argc, char* argv[]){ //float beta; //int row,col; string file_name = FILE_NAME; HostMatrix<float> X; HostMatrix<float> X_test; HostMatrix<float> Y; HostMatrix<float> Y_test; HostMatrix<float> Input; HostMatrix<float> Target; std::map<string,int> Classes; std::map<int,string> ClassesLookup; readFile(file_name,Input,Target,Classes,ClassesLookup); int kfold = 1; int correct_instances = 0; int incorrect_instances = 0; int total_instances = 0; int **confusionMatrix; confusionMatrix = (int**) malloc(sizeof(int*)*Classes.size()); for(int i = 0; i < (int)Classes.size(); i++){ confusionMatrix[i] = (int*) malloc(sizeof(int)*Classes.size()); memset(confusionMatrix[i],0,sizeof(int)*Classes.size()); } float Pet_mean = 0; float Ped_mean = 0; unsigned int seed = (unsigned)time(0); /***************RUN INFORMATION*************/ writeHeader(Input,Classes.size(),seed); /*******************************************/ if(!InitCUDA()) { return 1; } culaStatus status; status = culaInitialize(); std::cout << "Starting " << std::endl; float center_time = 0; float width_time = 0; float weight_time = 0; float scaling_time = 0; unsigned int time_total = 0; unsigned int testing_time = 0; unsigned int training_time = 0; clock_t initialTimeTotal = clock(); do{ X = crossvalidationTrain(Input,KFOLDS,kfold); X_test = crossvalidationTest(Input,KFOLDS,kfold); Y = crossvalidationTrain(Target,KFOLDS,kfold); Y_test = crossvalidationTest(Target,KFOLDS,kfold); HostMatrix<float> Weights; HostMatrix<float> Centers; /*Train Network*/ clock_t initialTime = clock(); RadialBasisFunction RBF(NETWORK_SIZE,RNEIGHBOURS,SCALING_FACTOR,Classes.size()); RBF.SetSeed(seed); RBF.Train(X,Y); training_time = (clock() - initialTime); center_time += RBF.times[0]; width_time += RBF.times[1]; weight_time += RBF.times[2]; scaling_time += RBF.times[3]; /*Test Network*/ initialTime = clock(); std::cout << "Testing" << std::endl; HostMatrix<float> out_test; out_test = RBF.Test(X_test); for(int i = 0; i< 
X_test.Rows();i++){ float max = 0; float out_class = 0; for(int j = 0; j < (int) Classes.size(); j++){ if(out_test(i,j) > max){ out_class = (float)j; max = out_test(i,j); } } out_test(i,0) = out_class+1; } for (int i = 0; i < out_test.Rows(); i++) { out_test(i,0) = (float)round(out_test(i,0)); if(out_test(i,0) <= 0) out_test(i,0) = 1; if(out_test(i,0) > Classes.size()) out_test(i,0) = (float)Classes.size(); std::cout << Y_test(i,0) << " " << out_test(i,0) << std::endl; } correct_instances += out_test.Rows() - error_calc(Y_test,out_test); incorrect_instances += error_calc(Y_test,out_test); total_instances += out_test.Rows(); /*Add values to Confusion Matrix*/ for(int i = 0; i < Y_test.Rows(); i++){ confusionMatrix[((int)Y_test(i,0))-1][((int)out_test(i,0))-1] = confusionMatrix[((int)Y_test(i,0))-1][((int)out_test(i,0))-1] + 1; } testing_time = (clock() - initialTime); /*Increment fold number, for use in crossvalidation*/ kfold++; }while(kfold <= KFOLDS); time_total = (clock() - initialTimeTotal); /*****************MEASURES****************/ measures(correct_instances,total_instances,incorrect_instances,confusionMatrix,Classes,ClassesLookup); writeFooter(center_time,width_time,weight_time,scaling_time,training_time,testing_time,time_total); culaShutdown(); cudaThreadExit(); return 0; }