void lutsolve(Matrix A, Vector x, char uplo) { char trans='N'; char diag='N'; int one=1; int info; dtrtrs(&uplo, &trans, &diag, &A->rows, &one, A->data[0], &A->rows, x->data, &A->rows, &info); }
/************************************ Given the factorization LB = U for some B, solve the problem Bx = vec for x Solve using LAPACK functions. ************************************/ void LU_Solve1(PT_Matrix pL, PT_Matrix pUt, double *vec, double *x, ptrdiff_t *info) { ptrdiff_t n, incx=1; char U='U', N='N',T='T'; double alpha=1.0, beta=0.0; n = Matrix_Rows(pL); /* solve using lapack */ /* compute x = L*vec */ dgemv(&T, &n, &n, &alpha, pL->A, &(pL->rows_alloc), vec, &incx, &beta, x, &incx); /* solve U*xnew = x using lapack function that also checks for singularity */ dtrtrs(&U, &N, &N, &n, &incx, pUt->A, &(pUt->rows_alloc), x, &n, info); /* printf("my x=\n"); */ /* Vector_Print_raw(x,n); */ }
double nmf_neals(double * a, double * w0, double * h0, int * pm, int * pn, \ int * pk, int * maxiter, const double * pTolX, const double * pTolFun) { // code added to be able to call from R int m = * pm; int n = * pn; int k = * pk; const double TolX = * pTolX; const double TolFun = * pTolFun; // also: changed w0, h0 to simple pointer (instead of double) // // end code added #ifdef PROFILE_NMF_NEALS struct timeval start, end; gettimeofday(&start, 0); #endif #if DEBUG_LEVEL >= 2 printf("Entering nmf_neals\n"); #endif #ifdef ERROR_CHECKING errno = 0; #endif double * help1 = (double*) malloc(sizeof(double)*k*k); double * help2 = (double*) malloc(sizeof(double)*k*n); double * help3 = (double*) malloc(sizeof(double)*k*m); //----------------------------------------- // definition of necessary dynamic data structures //...for calculating matrix h double* h = (double*) malloc(sizeof(double)*k*n); int* jpvt_h = (int*) malloc(sizeof(int)*k); int info; //...for calculating matrix w double* w = (double*) malloc(sizeof(double)*m*k); //---------------- //...for calculating the norm of A-W*H double* d = (double*) malloc(sizeof(double)*m*n); //d = a - w*h double dnorm0 = 0; double dnorm = 0; const double eps = dlamch('E'); //machine precision epsilon const double sqrteps = sqrt(eps); //squareroot of epsilon //------------------- #ifdef ERROR_CHECKING if (errno) { perror("Error allocating memory in nmf_neals"); free(help1); free(help2); free(help3); free(h); free(jpvt_h); free(w); free(d); return -1; } #endif // declaration of data structures for switch to als algorithm // ---------------------------------------------------------- int als_data_allocated = 0; // indicates wheter data structures were already allocated // factor matrices for factorizing matrix w double * q; double * r; // factor matrices for factorizing matrix h double * q_h; double * r_h; double* tau_h; //stores elementary reflectors of factor matrix Q double* work_w; //work array for factorization of matrix w int lwork_w; double* work_h; //work array for factorization of matrix h int lwork_h; double * work_qta; //work array for dorgqr int lwork_qta; double * work_qth; //work array for dorgqr int lwork_qth; //query for optimal workspace size for routine dgeqp3... double querysize; //Loop-Indices int iter, i; //variable for storing if fallback happened in current iteration int fallback; // factorisation step in a loop from 1 to maxiter for (iter = 1; iter <= *maxiter; ++iter) { //no fallback in this iteration so far fallback = 0; // calculating matrix h //---------------- //help1 = w0'*w0 dgemm('T', 'N', k, k, m, 1.0, w0, m, w0, m, 0., help1, k); //help2 = w0'*a dgemm('T', 'N', k, n, m, 1.0, w0, m, a, m, 0., help2, k); //LU-Factorisation of help1 to solve equation help1 * x = help2 dgesv(k, n, help1, k, jpvt_h, help2, k, &info); // if factor matrix U is singular -> switch back to als algorithm to compute h if( info > 0) { //set fallback to 1 to indicate that fallback happened fallback = 1; // do dynamic data structures need to be allocated? if (!als_data_allocated) { als_data_allocated = 1; // factor matrices for factorizing matrix w q = (double*) malloc(sizeof(double)*m*k); r = (double*) malloc(sizeof(double)*m*k); // factor matrices for factorizing matrix h q_h = (double*) malloc(sizeof(double)*n*k); r_h = (double*) malloc(sizeof(double)*n*k); tau_h = (double*) malloc(sizeof(double)*k); //stores elementary reflectors of factor matrix Q //query for optimal workspace size for routine dgeqp3... //for matrix w dgeqp3(m, k, q, m, jpvt_h, tau_h, &querysize, -1, &info); lwork_w = (int) querysize; work_w = (double*) malloc(sizeof(double)*lwork_w); //work array for factorization of matrix help1 (dgeqp3) //for matrix h dgeqp3(n, k, q_h, n, jpvt_h, tau_h, &querysize, -1, &info); lwork_h = (int) querysize; work_h = (double*) malloc(sizeof(double)*lwork_h); //work array for factorization of matrix h //query for optimal workspace size for routine dorgqr... //for matrix w dorgqr(m, k, k, q, m, tau_h, &querysize, -1, &info); lwork_qta = (int)querysize; work_qta = (double*) malloc(sizeof(double)*lwork_qta); //work array for dorgqr //for matrix h dorgqr(n, k, k, q_h, n, tau_h, &querysize, -1, &info); lwork_qth = (int)querysize; work_qth = (double*) malloc(sizeof(double)*lwork_qth); } // calculating matrix h //---------------- //re-initialization //copy *w0 to q dlacpy('A', m, k, w0, m, q, m); //initialise jpvt_h to 0 -> every column free for (i = 0; i<k; ++i) jpvt_h[i] = 0; // Q-R factorization with column pivoting dgeqp3(m, k, q, m, jpvt_h, tau_h, work_w, lwork_w, &info); //copying upper triangular factor-matrix r out of q into r dlacpy('U', m, k, q, m, r, k); //Begin of least-squares-solution to w0 * x = a //generate explicit matrix q (m times k) and calculate q' * a dorgqr(m, k, k, q, m, tau_h, work_qta, lwork_qta, &info); dgemm('T', 'N', k, n, m, 1.0, q, m, a, m, 0.0, q_h, k); //solve R * x = (Q'*A) dtrtrs('U','N','N',k,n,r,k,q_h,k,&info); //copy matrix q to h, but permutated according to jpvt_h for (i=0; i<k; ++i) { dcopy(n, q_h + i, k, h + jpvt_h[i] - 1, k); } //transform negative and very small positive values to zero for performance reasons and to keep the non-negativity constraint for (i=0; i<k*n; ++i) { if (h[i] < ZERO_THRESHOLD) h[i] = 0.; } } else { //h = max(ZERO_THRESHOLD, help1\help2) for (i=0; i < k*n; ++i) h[i] = (help2[i] > ZERO_THRESHOLD ? help2[i] : 0.); } // calculating matrix w = max(0, help1\help3)' //---------------------------- //help1 = h*h' dgemm('N', 'T', k, k, n, 1.0, h, k, h, k, 0., help1, k); //help3 = h*a' dgemm('N', 'T', k, m, n, 1.0, h, k, a, m, 0., help3, k); //LU-Factorisation of help1 dgesv(k, m, help1, k, jpvt_h, help3, k, &info); // if( info > 0) { // do dynamic data structures need to be allocated? if (!als_data_allocated) { als_data_allocated = 1; // factor matrices for factorizing matrix w q = (double*) malloc(sizeof(double)*m*k); r = (double*) malloc(sizeof(double)*m*k); // factor matrices for factorizing matrix h q_h = (double*) malloc(sizeof(double)*n*k); r_h = (double*) malloc(sizeof(double)*n*k); tau_h = (double*) malloc(sizeof(double)*k); //stores elementary reflectors of factor matrix Q //query for optimal workspace size for routine dgeqp3... //for matrix w dgeqp3(m, k, q, m, jpvt_h, tau_h, &querysize, -1, &info); lwork_w = (int) querysize; work_w = (double*) malloc(sizeof(double)*lwork_w); //work array for factorization of matrix help1 (dgeqp3) //..for matrix h dgeqp3(n, k, q_h, n, jpvt_h, tau_h, &querysize, -1, &info); lwork_h = (int) querysize; work_h = (double*) malloc(sizeof(double)*lwork_h); //work array for factorization of matrix h //query for optimal workspace size for routine dorgqr... //for matrix w dorgqr(m, k, k, q, m, tau_h, &querysize, -1, &info); lwork_qta = (int)querysize; work_qta = (double*) malloc(sizeof(double)*lwork_qta); //work array for dorgqr // ... for matrix h dorgqr(n, k, k, q_h, n, tau_h, &querysize, -1, &info); lwork_qth = (int)querysize; work_qth = (double*) malloc(sizeof(double)*lwork_qth); } //calculating matrix w //copy original matrix h to q_h, but transposed for (i=0; i<k; ++i) { dcopy(n, h + i, k, q_h + i*n, 1); } //initialise jpvt_a to 0 -> every column free for (i = 0; i<k; ++i) jpvt_h[i] = 0; //Q-R factorization dgeqp3(n, k, q_h, n, jpvt_h, tau_h, work_h, lwork_h, &info); //copying upper triangular factor-matrix r_h out of q into r_h dlacpy('U', n, k, q_h, n, r_h, k); //Begin of least-squares-solution to w0 * x = a //generate explicit matrix q (n times k) and calculate *a = q' * a' dorgqr(n, k, k, q_h, n, tau_h, work_qth, lwork_qth, &info); dgemm('T', 'T', k, m, n, 1.0, q_h, n, a, m, 0.0, q, k); //solve R_h * x = (Q'*A') dtrtrs('U', 'N', 'N', k, m, r_h, k, q, k, &info); //jpvt_h*(R\(Q'*A')) permutation and transposed copy to w for (i=0; i<k; ++i) { dcopy(m, q + i, k, w + m * (jpvt_h[i] - 1), 1); } //transform negative and very small positive values to zero for performance reasons and to keep the non-negativity constraint for (i=0; i<k*m; ++i) { if (w[i] < ZERO_THRESHOLD) w[i] = 0.; } } else { //w = max(0, help3)' for (i=0; i<k; ++i) { dcopy(m, help3 + i, k, w + i*m, 1); } for (i=0; i<m*k; ++i) { if (w[i] < ZERO_THRESHOLD) w[i] = 0.; } } // calculating the norm of D = A-W*H dnorm = calculateNorm(a, w, h, d, m, n, k); // calculating change in w -> dw //---------------------------------- double dw; dw = calculateMaxchange(w, w0, m, k, sqrteps); // calculating change in h -> dh //----------------------------------- double dh; dh = calculateMaxchange(h, h0, k, n, sqrteps); //Max-Change = max(dh, dw) = delta double delta; delta = (dh > dw) ? dh : dw; // storing the matrix results of the current iteration swap(&w0, &w); swap(&h0, &h); // storing the norm results of the current iteration dnorm0 = dnorm; #if DEBUG_LEVEL >= 1 printf("iter: %.6d\t dnorm: %.16f\t delta: %.16f\n", iter, dnorm, delta); #endif //Check for Convergence if (iter > 1) { if (delta < TolX) { *maxiter = iter; break; } else if (dnorm <= TolFun*dnorm0) { *maxiter = iter; break; } } } //end of loop from 1 to maxiter #if DEBUG_LEVEL >= 2 printf("Exiting nmf_neals\n"); #endif #ifdef PROFILE_NMF_NEALS gettimeofday(&end, 0); outputTiming("", start, end); #endif // freeing memory if used free(help1); free(help2); free(help3); free(h); free(jpvt_h); free(w); free(d); if(als_data_allocated) { free(q); free(r); free(q_h); free(r_h); free(work_h); free(work_w); free(tau_h); free(work_qta); free(work_qth); } // returning calculated norm return dnorm; }
// Sample factor vectors // Function written from perspective of sampling user factor vectors with cross-topics // Switch roles of user-item inputs to sample item factor vectors void sampleTopicFactorVectors(uint32_t* items, double* resids, const mxArray* exampsByUser, int KU, int KM, int numUsers, int numItems, double invSigmaSqd, ptrdiff_t numTopicFacs, double* LambdaU, double* muU, double* c, double* d, uint32_t* zU, uint32_t* zM){ // Array of random number generators gsl_rng** rngs = getRngArray(); // Extract internals of jagged arrays uint32_t** userExamps; mwSize* userLens; unpackJagged(exampsByUser, &userExamps, &userLens, numUsers); ptrdiff_t numTopicFacsSqd = numTopicFacs*numTopicFacs; ptrdiff_t numTopicFacsTimesNumItems = numTopicFacs*numItems; ptrdiff_t numTopicFacsTimesNumUsers = numTopicFacs*numUsers; // BLAS constants char uplo[] = "U"; char trans[] = "N"; char diag[] = "N"; ptrdiff_t oneInt = 1; double oneDbl = 1; double zeroDbl = 0; // Compute muBase = LambdaU*muU double* muBase = mxMalloc(numTopicFacs*sizeof(*muBase)); dsymv(uplo, &numTopicFacs, &oneDbl, LambdaU, &numTopicFacs, muU, &oneInt, &zeroDbl, muBase, &oneInt); // Allocate memory for new mean and precision parameters double** muNew[MAX_NUM_THREADS]; double** LambdaNew[MAX_NUM_THREADS]; for(int thread = 0; thread < MAX_NUM_THREADS; thread++){ muNew[thread] = mxMalloc(KM*sizeof(**muNew)); LambdaNew[thread] = mxMalloc(KM*sizeof(**LambdaNew)); for(int i = 0; i < KM; i++){ muNew[thread][i] = mxMalloc(numTopicFacs*sizeof(***muNew)); LambdaNew[thread][i] = mxMalloc(numTopicFacsSqd*sizeof(***LambdaNew)); } } #pragma omp parallel for for(int u = 0; u < numUsers; u++){ int thread = omp_get_thread_num(); for(int i = 0; i < KM; i++){ // Initialize new mean to muBase dcopy(&numTopicFacs, muBase, &oneInt, muNew[thread][i], &oneInt); // Initialize new precision to LambdaU dcopy(&numTopicFacsSqd, LambdaU, &oneInt, LambdaNew[thread][i], &oneInt); } // Iterate over user's examples mxArray* exampsArray = mxGetCell(exampsByUser, u); mwSize len = mxGetN(exampsArray); uint32_t* examps = (uint32_t*) mxGetData(exampsArray); for(int j = 0; j < len; j++){ uint32_t e = examps[j]-1; int m = items[e]-1; int userTop = zU[e]-1; int itemTop = zM[e]-1; // Item vector for this rated item double* dVec = d + m*numTopicFacs + userTop*numTopicFacsTimesNumItems; // Compute posterior sufficient statistics for factor vector // Add resid * dVec/sigmaSqd to muNew double resid = resids[e]; resid *= invSigmaSqd; daxpy(&numTopicFacs, &resid, dVec, &oneInt, muNew[thread][itemTop], &oneInt); // Add (dVec * dVec^t)/sigmaSqd to LambdaNew // Exploit symmetric structure of LambdaNew dsyr(uplo, &numTopicFacs, &invSigmaSqd, dVec, &oneInt, LambdaNew[thread][itemTop], &numTopicFacs); } for(int i = 0; i < KM; i++){ // Compute upper Cholesky factor of LambdaNew ptrdiff_t info; dpotrf(uplo, &numTopicFacs, LambdaNew[thread][i], &numTopicFacs, &info); // Solve for (LambdaNew)^-1*muNew using Cholesky factor dpotrs(uplo, &numTopicFacs, &oneInt, LambdaNew[thread][i], &numTopicFacs, muNew[thread][i], &numTopicFacs, &info); // Sample vector of N(0,1) variables gsl_rng* rng = rngs[thread]; double* cVec = c + u*numTopicFacs + i*numTopicFacsTimesNumUsers; for(int f = 0; f < numTopicFacs; f++) cVec[f] = gsl_ran_gaussian(rng, 1); // Solve for (chol(LambdaNew,'U'))^-1*N(0,1) dtrtrs(uplo, trans, diag, &numTopicFacs, &oneInt, LambdaNew[thread][i], &numTopicFacs, cVec, &numTopicFacs, &info); // Add muNew to aVec daxpy(&numTopicFacs, &oneDbl, muNew[thread][i], &oneInt, cVec, &oneInt); } } // Clean up mxFree(userExamps); mxFree(userLens); mxFree(muBase); for(int thread = 0; thread < MAX_NUM_THREADS; thread++){ for(int i = 0; i < KM; i++){ mxFree(muNew[thread][i]); mxFree(LambdaNew[thread][i]); } mxFree(muNew[thread]); mxFree(LambdaNew[thread]); } }
int main(){ double *A, *A2, *b, *b2, *L, *T, *P, *D, *SD1, *SD2, *PT, *WORK; int SIZE, m, POSX, POSY, NUML, NMBR, i, j, NRHS=1, INFO, *IPIV, NUMM; char NAME[20], FNAM[30], UPLO='L', TRANS='N', DIAG='U'; FILE *FILI,*FILI2,*FILO, *FIL3; FIL3 = fopen("INPUT.dat","r"); // LECTURE DU NOMBRE DE MATRICE A TRAITER fscanf(FIL3,"%d",&NMBR); for(NUMM = 0; NUMM<NMBR;NUMM++){ // LECTURE DU NOM DE LA MATRICE ACTUELLE ET OUVERTURE DES FICHIERS fscanf(FIL3,"%s",NAME); FILI = fopen(NAME,"r"); if(FILI==NULL){ printf("Fichier %s introuvable !!!\n",NAME); continue; } sprintf(HEAD,"rhs_%s.dat",NAME); FILI2 = fopen(FNAM,"r"); sprintf(HEAD,"result_%s.dat",NAME); FILO = fopen(FNAM,"w+"); // ALLOCATION DES DIFFERENTS TABLEAUX fscanf(FILI,"%d %d %d",&SIZE, &m, &NUML); A = (double *) calloc(SIZE*SIZE,sizeof(double)); A2 = (double *) calloc(SIZE*SIZE,sizeof(double)); b = (double *) calloc(SIZE,sizeof(double)); b2 = (double *) calloc(SIZE,sizeof(double)); P = (double *) calloc(SIZE*SIZE,sizeof(double)); PT = (double *) calloc(SIZE*SIZE,sizeof(double)); L = (double *) calloc(SIZE*SIZE,sizeof(double)); T = (double *) calloc(SIZE*SIZE,sizeof(double)); D = (double *) calloc(SIZE,sizeof(double)); SD1 = (double *) calloc(SIZE-1,sizeof(double)); SD2 = (double *) calloc(SIZE-1,sizeof(double)); WORK = (double *) calloc(SIZE,sizeof(double)); IPIV = (int*) calloc(SIZE,sizeof(int)); // LECTURE DES VALEURS DE A for(i=0;i<NUML;i++){ fscanf(FILI,"%d %d",&POSX, &POSY); fscanf(FILI,"%lf",A+POSX-1+SIZE*(POSY-1)); A2[(POSX-1)+SIZE*(POSY-1)] = A[(POSX-1)+SIZE*(POSY-1)]; } // LECTURE DES VALEURS DE b for(i=0;i<SIZE;i++){ fscanf(FILI2,"%lf",b+i); b2[i] = b[i]; } // METHODE DE AASEN // CALCUL DES MATRICES T,L ET P aasen(SIZE,A,T,L,P); // RESOLUTION DES AUTRES EQUATIONS b = gaxPD(P,SIZE,SIZE,b); for(i=0;i<SIZE;i++) D[i] = T[i+SIZE*i]; for(i=0;i<SIZE-1;i++) SD1[i] = T[i+SIZE*(i+1)], SD2[i] = T[i+1+SIZE*i]; for(i=0;i<SIZE;i++) for(j=0;j<SIZE;j++) PT[i+SIZE*j]=P[j+SIZE*i]; dtrtrs(&UPLO, &TRANS, &DIAG, &SIZE, &NRHS, L, &SIZE, b, &SIZE, &INFO); dgtsv(&SIZE, &NRHS, SD1, D, SD2, b, &SIZE, &INFO); TRANS = 'T'; dtrtrs(&UPLO,&TRANS,&DIAG,&SIZE,&NRHS,L,&SIZE,b,&SIZE,&INFO); // CALCUL DE LA SOLUTION FINALE b = gaxPD(PT,SIZE,SIZE,b); // METHODE DE BUNCH-PARLETT dsytrf_(&UPLO, &SIZE, A2, &SIZE, IPIV, WORK, &SIZE, &INFO); dsytrs_(&UPLO, &SIZE, &NRHS, A2, &SIZE, IPIV, b2, &SIZE, &INFO); // ECRITURE DANS LE FICHIER DE SORTIE for(i=0;i<SIZE;i++) fprintf(FILO,"%lf\t%lf\SIZE",b[i],b2[i]); // LIBERATION DE LA MEMOIRE ET FERMETURE DES FICHIERS free(A);free(b);free(A2);free(b2);free(P);free(PT);free(L);free(T);free(D);free(SD1); free(SD2); fclose(FILI); fclose(FILI2); fclose(FILO); } return(0); }