Esempio n. 1
0
void MedSTC::learn_svm(char *model_dir, const double &dC, const double &dEll)
{
	char model_root[512];
	sprintf(model_root, "%s/final", model_dir);
	load_model( model_root );
	m_dC = dC;
	m_dDeltaEll = dEll;

	Params *param = new Params();
	param->DELTA_ELL = m_dDeltaEll;
	param->LAMBDA = m_dLambda;
	param->RHO = m_dRho;
	param->INITIAL_C = m_dC;
	param->NLABELS = m_nLabelNum;
	param->NTOPICS = m_nK;
	param->SVM_ALGTYPE = 2;

	char filename[512];
	get_train_filename( filename, model_dir, param );
	svmStructSolver( filename, param, m_dMu );
	
	// for testing.
	int nDataNum = 0;
	double dAcc = 0;
	get_test_filename( filename, model_dir, param );
	readLowDimData( filename, nDataNum );

	for ( int d=0; d<nDataNum; d++ ) {
		int predLabel = predict( theta_[d] );
		if ( label_[d] == predLabel ) dAcc ++;
	}
	dAcc /= nDataNum;

	
	FILE *fileptr = fopen("overall-res.txt", "a");
	fprintf(fileptr, "setup (K: %d; C: %.3f; fold: %d; ell: %.2f; lambda: %.2f; rho: %.4f; svm_alg: %d; maxIt: %d): accuracy %.3f; avgNonZeroWrdCode: %.5f\n", 
		m_nK, m_dC, 0, dEll, m_dLambda, m_dRho, param->SVM_ALGTYPE, 0, dAcc, 0.0);
	fclose(fileptr);

	save_model( model_root, -1 );

	for ( int d=0; d<nDataNum; d++ ) {
		free( theta_[d] );
	}
	free( theta_ );
	free( label_ );
}
Esempio n. 2
0
/*
 * Unit test for km_readline_realloc().
 *
 * Reads every line of "loremipsum.txt" and compares the content, the length
 * and the return value against the loremipsum_* fixtures, then checks the
 * result at end of file, the error reporting for NULL arguments, and finally
 * that a deliberately small buffer is grown to kmroundupz(line length).
 *
 * ptr is the tinytest user-data argument; it is not used here.
 */
void
test_km_readline_realloc (void *ptr)
{
    char *lorem_fn = NULL;
    char *buf = NULL;
    char *smallbuf = NULL;
    const size_t smallbuf_len = 1<<4;
    FILE *fp = NULL;
    ssize_t ret = 0;
    size_t line_num;
    char *nulcp = NULL;
    FILE *nulfp = NULL;
    size_t tmpsz = 0;
    size_t our_bufsize = bufsize;
    size_t our_smallbuf_len = smallbuf_len;

    (void) ptr;
    km_test_err = 0;
    /* This should always work, so long as you run it from the right dir */
    lorem_fn = get_test_filename("loremipsum.txt");
    if (lorem_fn == NULL)
        tt_abort_msg("Broken test - get_test_filename failed\n");
    if ((fp = fopen(lorem_fn, "r")) == NULL) {
        fprintf(stderr, "Could not open test file '%s' -- %s\n",
                lorem_fn, strerror(errno));
        tt_skip();
    } else {
        buf = calloc(our_bufsize, sizeof(*buf));
        smallbuf = calloc(our_smallbuf_len, sizeof(*smallbuf));
        /* Fail cleanly instead of handing NULL to strncmp()/strlen() below */
        if (buf == NULL || smallbuf == NULL)
            tt_abort_msg("Broken test - could not allocate line buffers\n");
    }
    /* Every line must round-trip without growing the buffer */
    for (line_num = 0; line_num < n_loremipsum_lines; line_num++) {
        ret = km_readline_realloc(&buf, fp, &our_bufsize, &test_err_handler);
        tt_int_op(km_test_err, ==, 0);
        tt_int_op(strncmp(buf, loremipsum_lines[line_num], our_bufsize), ==, 0);
        tt_int_op(strlen(buf), ==, loremipsum_line_lens[line_num]);
        tt_int_op(ret, ==, loremipsum_line_lens[line_num]);
        tt_int_op(our_bufsize, ==, bufsize);
        km_test_err = 0;
    }
    /* One more read past the last line: expect EOF and an empty buffer */
    ret = km_readline_realloc(&buf, fp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 0);
    /* check it leaves  \0 in buf */
    tt_int_op(strncmp(buf, "", our_bufsize), ==, 0);
    tt_int_op(strlen(buf), ==, 0);
    tt_int_op(ret, ==, EOF);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Naughty tests that try and make it fail */
    rewind(fp);
    /* Null buf */
    ret = km_readline_realloc(&nulcp, fp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Null fp */
    ret = km_readline_realloc(&buf, nulfp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Both buf & fp null */
    ret = km_readline_realloc(&nulcp, nulfp, &our_bufsize, &test_err_handler);
    tt_int_op(km_test_err, ==, 3);
    tt_int_op(ret, ==, -2);
    tt_int_op(our_bufsize, ==, bufsize);
    km_test_err = 0;
    /* Test that should require it to resize the buffer */
    rewind(fp);
    ret = km_readline_realloc(&smallbuf, fp, &our_smallbuf_len,
            &test_err_handler);
    tt_int_op(km_test_err, ==, 0);
    tt_int_op(ret, ==, loremipsum_line_lens[0]);
    tt_int_op(strlen(smallbuf), ==, loremipsum_line_lens[0]);
    tmpsz = loremipsum_line_lens[0];
    tt_int_op(our_smallbuf_len, ==, kmroundupz(tmpsz));
end:
    /* Reached on success and from every failing/aborting/skipping macro above */
    if (lorem_fn != NULL) free(lorem_fn);
    if (buf != NULL) free(buf);
    if (smallbuf != NULL) free(smallbuf);
    if (fp != NULL) fclose(fp);
}
Esempio n. 3
0
double MedSTC::sparse_coding(char* model_dir, Corpus* pC, Params *param)
{
	char model_root[512];
	sprintf(model_root, "%s/final", model_dir);
	load_model(model_root);
	init_param( pC );

	// remove unseen words
	Document* doc = NULL;
	if ( pC->num_terms > m_nNumTerms ) {
		for ( int i=0; i<pC->num_docs; i ++ ) {
			doc = &(pC->docs[i]);
			for ( int k=0; k<doc->length; k++ )
				if ( doc->words[k] >= m_nNumTerms )
					doc->words[k] = m_nNumTerms - 1;
		}
	}

	// allocate memory
	int max_length = pC->max_corpus_length();
	double **phi = (double**)malloc(sizeof(double*)*max_length);
	for (int n=0; n<max_length; n++) {
		phi[n] = (double*)malloc(sizeof(double) * m_nK);
	}
	double **theta = (double**)malloc(sizeof(double*)*(pC->num_docs));
	for (int d=0; d<pC->num_docs; d++) {
		theta[d] = (double*)malloc(sizeof(double)*m_nK);
	}
	double **avgTheta = (double**)malloc(sizeof(double*)*m_nLabelNum);
	for ( int k=0; k<m_nLabelNum; k++ ) {
		avgTheta[k] = (double*)malloc(sizeof(double)*m_nK);
		memset(avgTheta[k], 0, sizeof(double)*m_nK);
	}
	vector<vector<double> > avgWrdCode(m_nNumTerms);
	vector<int> wrdCount(m_nNumTerms, 0);
	for ( int i=0; i<m_nNumTerms; i++ ) {
		avgWrdCode[i].resize( m_nK, 0 );
	}
	vector<int> perClassDataNum(m_nLabelNum, 0);

	
	char filename[100];
	sprintf(filename, "%s/evl-slda-obj.dat", model_dir);
	FILE* fileptr = fopen(filename, "w");
	
	
	
	double dEntropy = 0, dobj = 0, dNonZeroWrdCode = 0, dNonZeroDocCode = 0;
	int nTotalWrd = 0;
	
	for (int d=0; d<pC->num_docs; d++) {

		doc = &(pC->docs[d]);
		// initialize phi.
		for (int n=0; n<doc->length; n++) {
			double *phiPtr = phi[n];
			for ( int k=0; k<m_nK; k++ ) {
				phiPtr[k] = 1.0 / m_nK;
			}
		}
		dobj = sparse_coding( doc, d, param, theta[d], phi );

		// do prediction
		doc->predlabel = predict(theta[d]);
		
		doc->scores = (double*) malloc(sizeof(double)*m_nLabelNum);;
		predict_scores(doc->scores,theta[d]);
		
		doc->lhood = dobj;
		fprintf(fileptr, "%5.5f\n", dobj);

		//dEntropy += safe_entropy( exp[d], m_nK );
		int gndLabel = doc->gndlabel;
		perClassDataNum[gndLabel] ++;
		for ( int k=0; k<m_nK; k++ ) {
			for ( int n=0; n<doc->length; n++ ) {
				//fprintf( wrdfptr, "%.10f ", phi[n][k] );
				if ( phi[n][k] > 0/*1e-10*/ ) dNonZeroWrdCode ++;
			}
			//fprintf( wrdfptr, "\n" );
			avgTheta[gndLabel][k] += theta[d][k];
			if ( theta[d][k] > 0 ) dNonZeroDocCode ++;
		}
		nTotalWrd += doc->length;
		//fprintf( wrdfptr, "\n" );
		//fflush( wrdfptr );

		dEntropy += safe_entropy( theta[d], m_nK );

		//// the average distribution of each word on the topics.
		//for ( int n=0; n<doc->length; n++ ) {
		//	int wrd = doc->words[n];
		//	wrdCount[wrd] ++;
		//	for ( int k=0; k<m_nK; k++ ) {
		//		avgWrdCode[wrd][k] += phi[n][k];
		//	}
		//}
	}
	
	fclose( fileptr );
	
	
	

	/* save theta & average theta. */
	sprintf(filename, "%s/evl-theta.dat", model_dir);
	save_theta(filename, theta, pC->num_docs, m_nK);
	sprintf(filename, "%s/evl-avgTheta.dat", model_dir);
	for ( int m=0; m<m_nLabelNum; m++ ) {
		int dataNum = perClassDataNum[m];
		for ( int k=0; k<m_nK; k++ ) {
			avgTheta[m][k] /= dataNum;
		}
	}
	printf_mat(filename, avgTheta, m_nLabelNum, m_nK);

	/* save the average topic distribution for each word. */
	sprintf(filename, "%s/evl-avgWrdCode.dat", model_dir);
	fileptr = fopen( filename, "w" );
	for ( int i=0; i<m_nNumTerms; i++ ) {
		double dNorm = wrdCount[i];
		for ( int k=0; k<m_nK; k++ ) {
			double dval = avgWrdCode[i][k];
			if ( dNorm > 0 ) dval /= dNorm;
			fprintf( fileptr, "%.10f ", dval );
		}
		fprintf( fileptr, "\n" );
	}
	fclose( fileptr );
	//printf_mat( filename, avgWrdCode, m_nNumTerms, m_nK );

	/* save the low dimension representation. */
	get_test_filename(filename, model_dir, param);
	outputLowDimData(filename, pC, theta);

	/* save the prediction performance. */
	sprintf(filename, "%s/evl-performance.dat", model_dir);
	double dAcc = save_prediction(filename, pC);

	// free memory
	for (int i=0; i<pC->num_docs; i++ ) {
		free( theta[i] );
	}
	for ( int n=0; n<max_length; n++ ) {
		free( phi[n] );
	}
	for ( int k=0; k<m_nLabelNum; k++ ) {
		free( avgTheta[k] );
	}
	free( theta );
	free( phi );
	free( avgTheta );

	return dAcc;
}