void MedSTC::learn_svm(char *model_dir, const double &dC, const double &dEll) { char model_root[512]; sprintf(model_root, "%s/final", model_dir); load_model( model_root ); m_dC = dC; m_dDeltaEll = dEll; Params *param = new Params(); param->DELTA_ELL = m_dDeltaEll; param->LAMBDA = m_dLambda; param->RHO = m_dRho; param->INITIAL_C = m_dC; param->NLABELS = m_nLabelNum; param->NTOPICS = m_nK; param->SVM_ALGTYPE = 2; char filename[512]; get_train_filename( filename, model_dir, param ); svmStructSolver( filename, param, m_dMu ); // for testing. int nDataNum = 0; double dAcc = 0; get_test_filename( filename, model_dir, param ); readLowDimData( filename, nDataNum ); for ( int d=0; d<nDataNum; d++ ) { int predLabel = predict( theta_[d] ); if ( label_[d] == predLabel ) dAcc ++; } dAcc /= nDataNum; FILE *fileptr = fopen("overall-res.txt", "a"); fprintf(fileptr, "setup (K: %d; C: %.3f; fold: %d; ell: %.2f; lambda: %.2f; rho: %.4f; svm_alg: %d; maxIt: %d): accuracy %.3f; avgNonZeroWrdCode: %.5f\n", m_nK, m_dC, 0, dEll, m_dLambda, m_dRho, param->SVM_ALGTYPE, 0, dAcc, 0.0); fclose(fileptr); save_model( model_root, -1 ); for ( int d=0; d<nDataNum; d++ ) { free( theta_[d] ); } free( theta_ ); free( label_ ); }
/*
 * Unit test for km_readline_realloc() (tinytest-style).
 *
 * Control flow note: every tt_int_op / tt_abort_msg / tt_skip macro records
 * the result and jumps to the `end:` label on failure, so the cleanup code
 * after `end:` always runs.  `km_test_err` is a global set by
 * test_err_handler; it is reset to 0 after each call under test.
 *
 * ptr is the tinytest per-test setup argument; unused here.
 */
void test_km_readline_realloc (void *ptr)
{
    char *lorem_fn = NULL;              /* path to the fixture file */
    char *buf = NULL;                   /* normally-sized read buffer */
    char *smallbuf = NULL;              /* deliberately tiny buffer to force a realloc */
    const size_t smallbuf_len = 1<<4;   /* 16 bytes — smaller than any fixture line */
    FILE *fp = NULL;
    ssize_t ret = 0;
    size_t line_num;
    char *nulcp = NULL;                 /* NULL buffer for the failure-path tests */
    FILE *nulfp = NULL;                 /* NULL stream for the failure-path tests */
    char *tmpcp = NULL;                 /* NOTE(review): unused in this test */
    size_t tmpsz = 0;
    size_t our_bufsize = bufsize;       /* working copies: the callee may grow these */
    size_t our_smallbuf_len = smallbuf_len;
    km_test_err = 0;
    /* This should always work, so long as you run it from the right dir */
    lorem_fn = get_test_filename("loremipsum.txt");
    if (lorem_fn == NULL) tt_abort_msg("Broken test - get_test_filename failed\n");
    if ((fp = fopen(lorem_fn, "r")) == NULL) {
        fprintf(stderr, "Could not open test file '%s' -- %s\n", lorem_fn, strerror(errno));
        tt_skip();                      /* fixture missing: skip, don't fail */
    } else {
        buf = calloc(our_bufsize, sizeof(*buf));
        smallbuf = calloc(our_smallbuf_len, sizeof(*smallbuf));
    }
    /* Happy path: read every fixture line and check content, length,
     * return value, and that the buffer was not resized. */
    for (line_num = 0; line_num < n_loremipsum_lines; line_num++) {
        ret = km_readline_realloc(&buf, fp, &our_bufsize, &test_err_handler);
        tt_int_op(km_test_err, ==, 0);
        tt_int_op(strncmp(buf, loremipsum_lines[line_num], our_bufsize), ==, 0);
        tt_int_op(strlen(buf), ==, loremipsum_line_lens[line_num]);
        tt_int_op(ret, ==, loremipsum_line_lens[line_num]);
        tt_int_op(our_bufsize, ==, bufsize);
        km_test_err = 0;
    }
    /* One read past the last line must report EOF. */
    ret = km_readline_realloc(&buf, fp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 0);
    /* check it leaves \0 in buf */
    tt_int_op(strncmp(buf, "", our_bufsize), ==, 0);
    tt_int_op(strlen(buf), ==, 0);
    tt_int_op(ret, ==, EOF);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Naughty tests that try and make it fail */
    rewind(fp);
    /* Null buf */
    ret = km_readline_realloc(&nulcp, fp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);      /* error code 3 = bad argument (per handler) */
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Null fp */
    ret = km_readline_realloc(&buf, nulfp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Both buf & fp null */
    ret = km_readline_realloc(&nulcp, nulfp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Test that should require it to resize the buffer */
    rewind(fp);
    ret = km_readline_realloc(&smallbuf, fp, &our_smallbuf_len, &test_err_handler);
    tt_int_op(km_test_err, ==, 0);
    tt_int_op(ret, ==, loremipsum_line_lens[0]);
    tt_int_op(strlen(smallbuf), ==, loremipsum_line_lens[0]);
    tmpsz = loremipsum_line_lens[0];
    /* Buffer must have grown to the next rounded-up size. */
    tt_int_op(our_smallbuf_len, ==, kmroundupz(tmpsz));
end:
    /* Cleanup target for all tt_* macros above. */
    if (lorem_fn != NULL) free(lorem_fn);
    if (buf != NULL) free(buf);
    if (smallbuf != NULL) free(smallbuf);
    if (fp != NULL) fclose(fp);
}
double MedSTC::sparse_coding(char* model_dir, Corpus* pC, Params *param) { char model_root[512]; sprintf(model_root, "%s/final", model_dir); load_model(model_root); init_param( pC ); // remove unseen words Document* doc = NULL; if ( pC->num_terms > m_nNumTerms ) { for ( int i=0; i<pC->num_docs; i ++ ) { doc = &(pC->docs[i]); for ( int k=0; k<doc->length; k++ ) if ( doc->words[k] >= m_nNumTerms ) doc->words[k] = m_nNumTerms - 1; } } // allocate memory int max_length = pC->max_corpus_length(); double **phi = (double**)malloc(sizeof(double*)*max_length); for (int n=0; n<max_length; n++) { phi[n] = (double*)malloc(sizeof(double) * m_nK); } double **theta = (double**)malloc(sizeof(double*)*(pC->num_docs)); for (int d=0; d<pC->num_docs; d++) { theta[d] = (double*)malloc(sizeof(double)*m_nK); } double **avgTheta = (double**)malloc(sizeof(double*)*m_nLabelNum); for ( int k=0; k<m_nLabelNum; k++ ) { avgTheta[k] = (double*)malloc(sizeof(double)*m_nK); memset(avgTheta[k], 0, sizeof(double)*m_nK); } vector<vector<double> > avgWrdCode(m_nNumTerms); vector<int> wrdCount(m_nNumTerms, 0); for ( int i=0; i<m_nNumTerms; i++ ) { avgWrdCode[i].resize( m_nK, 0 ); } vector<int> perClassDataNum(m_nLabelNum, 0); char filename[100]; sprintf(filename, "%s/evl-slda-obj.dat", model_dir); FILE* fileptr = fopen(filename, "w"); double dEntropy = 0, dobj = 0, dNonZeroWrdCode = 0, dNonZeroDocCode = 0; int nTotalWrd = 0; for (int d=0; d<pC->num_docs; d++) { doc = &(pC->docs[d]); // initialize phi. 
for (int n=0; n<doc->length; n++) { double *phiPtr = phi[n]; for ( int k=0; k<m_nK; k++ ) { phiPtr[k] = 1.0 / m_nK; } } dobj = sparse_coding( doc, d, param, theta[d], phi ); // do prediction doc->predlabel = predict(theta[d]); doc->scores = (double*) malloc(sizeof(double)*m_nLabelNum);; predict_scores(doc->scores,theta[d]); doc->lhood = dobj; fprintf(fileptr, "%5.5f\n", dobj); //dEntropy += safe_entropy( exp[d], m_nK ); int gndLabel = doc->gndlabel; perClassDataNum[gndLabel] ++; for ( int k=0; k<m_nK; k++ ) { for ( int n=0; n<doc->length; n++ ) { //fprintf( wrdfptr, "%.10f ", phi[n][k] ); if ( phi[n][k] > 0/*1e-10*/ ) dNonZeroWrdCode ++; } //fprintf( wrdfptr, "\n" ); avgTheta[gndLabel][k] += theta[d][k]; if ( theta[d][k] > 0 ) dNonZeroDocCode ++; } nTotalWrd += doc->length; //fprintf( wrdfptr, "\n" ); //fflush( wrdfptr ); dEntropy += safe_entropy( theta[d], m_nK ); //// the average distribution of each word on the topics. //for ( int n=0; n<doc->length; n++ ) { // int wrd = doc->words[n]; // wrdCount[wrd] ++; // for ( int k=0; k<m_nK; k++ ) { // avgWrdCode[wrd][k] += phi[n][k]; // } //} } fclose( fileptr ); /* save theta & average theta. */ sprintf(filename, "%s/evl-theta.dat", model_dir); save_theta(filename, theta, pC->num_docs, m_nK); sprintf(filename, "%s/evl-avgTheta.dat", model_dir); for ( int m=0; m<m_nLabelNum; m++ ) { int dataNum = perClassDataNum[m]; for ( int k=0; k<m_nK; k++ ) { avgTheta[m][k] /= dataNum; } } printf_mat(filename, avgTheta, m_nLabelNum, m_nK); /* save the average topic distribution for each word. */ sprintf(filename, "%s/evl-avgWrdCode.dat", model_dir); fileptr = fopen( filename, "w" ); for ( int i=0; i<m_nNumTerms; i++ ) { double dNorm = wrdCount[i]; for ( int k=0; k<m_nK; k++ ) { double dval = avgWrdCode[i][k]; if ( dNorm > 0 ) dval /= dNorm; fprintf( fileptr, "%.10f ", dval ); } fprintf( fileptr, "\n" ); } fclose( fileptr ); //printf_mat( filename, avgWrdCode, m_nNumTerms, m_nK ); /* save the low dimension representation. 
*/ get_test_filename(filename, model_dir, param); outputLowDimData(filename, pC, theta); /* save the prediction performance. */ sprintf(filename, "%s/evl-performance.dat", model_dir); double dAcc = save_prediction(filename, pC); // free memory for (int i=0; i<pC->num_docs; i++ ) { free( theta[i] ); } for ( int n=0; n<max_length; n++ ) { free( phi[n] ); } for ( int k=0; k<m_nLabelNum; k++ ) { free( avgTheta[k] ); } free( theta ); free( phi ); free( avgTheta ); return dAcc; }