extern int fp_Read( void *handle, const char *fname, int maxngrams )
{
	fp_t *h = (fp_t *)handle;
	FILE *fp;
	char line[1024];
	int cnt = 0;

	fp = fopen( fname, "r" );
	if (!fp) {
#ifdef VERBOSE
		fprintf( stderr, "Failed to open fingerprint file '%s'\n", fname);
#endif
		return 0;
	}
	
	h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));

	while (cnt < maxngrams && wg_getline(line,1024,fp)) {

		char *p;

		wg_trim(line, line);

		p = strpbrk( line, " \t" );
		if ( p ) {
			*p = '\0';
		}

		if ( strlen(line) > MAXNGRAMSIZE ) {
			continue;
		}

		strcpy( h->fprint[cnt].str, line );
		h->fprint[cnt].rank = cnt;

		cnt++;
	}

	h->size = cnt;

	/*** Sort n-grams, for easy comparison later on ***/
	qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); 

	fclose(fp);

	return 1;
}
Beispiel #2
0
/*
 * textcat_Init - build a classifier handle from a configuration file.
 *
 *   conffile   path of the configuration file, opened for reading as text.
 *
 * Everything from a '#' to the end of a line is ignored. Each remaining
 * line is split with wg_split into at most 4 fields; lines yielding fewer
 * than 2 fields are skipped. Field 0 is the fingerprint file handed to
 * fp_Read (at most 400 n-grams are loaded), field 1 is the name handed to
 * fp_Init.
 *
 * Returns the new handle, or NULL when the configuration file cannot be
 * opened or any fingerprint fails to initialise or load. On failure all
 * partially built state is handed to textcat_Done before returning.
 */
extern void *textcat_Init( const char *conffile )
{
	textcat_t *h;
	char line[1024];
	FILE *fp;

	fp = fopen( conffile, "r" );
	if ( !fp ) {
#ifdef VERBOSE
		fprintf( stderr, "Failed to open config file '%s'\n", conffile);
#endif
		return NULL;
	}

	h = (textcat_t *)wg_malloc(sizeof(textcat_t));
	h->size = 0;
	h->maxsize = 16;
	h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );

	while ( wg_getline( line, 1024, fp ) ) {
		char *p;
		char *segment[4];

		/*** Skip comments ***/
#ifdef HAVE_STRCHR
		p = strchr(line,'#');
#else
		p = index(line,'#');
#endif
		if ( p ) {
			*p = '\0';
		}

		if ( wg_split( segment, line, line, 4) < 2 ) {
			continue;
		}

		/*** Ensure enough space ***/
		if ( h->size == h->maxsize ) {
			h->maxsize *= 2;
			h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
		}

		/*** Load data ***/
		h->fprint[ h->size ] = fp_Init( segment[1] );
		if ( h->fprint[ h->size ] == NULL ) {
			goto ERROR;
		}

		/*
		 * Count the new fingerprint before reading it, so that the
		 * cleanup path below also releases it when fp_Read fails.
		 */
		h->size++;
		if ( fp_Read( h->fprint[h->size - 1], segment[0], 400 ) == 0 ) {
			goto ERROR;
		}
	}

	fclose(fp);
	return h;

 ERROR:
	/*
	 * Single cleanup path for every failure after h was allocated.
	 * NOTE(review): assumes textcat_Done releases entries [0, size),
	 * the table and h itself -- confirm against its definition.
	 */
	textcat_Done(h);
	fclose(fp);
	return NULL;

}


/*
 * textcat_Classify - score a text buffer against every loaded fingerprint.
 *
 *   handle   classifier handle (used as a textcat_t).
 *   buffer   text to classify.
 *   size     number of bytes of buffer passed on to fp_Create.
 *
 * A lower score is a better match. The cut-off is the best score seen
 * multiplied by THRESHOLDVALUE; every category scoring below the final
 * cut-off is a candidate.
 *
 * Returns one of:
 *   _TEXTCAT_RESULT_SHORT   fp_Create reported 0 for the buffer;
 *   _TEXTCAT_RESULT_UNKOWN  more than MAXCANDIDATES categories qualify, or
 *                           the candidate array could not be allocated;
 *   h->output               otherwise, filled with "[name]" for each
 *                           candidate in cmpcandidates order, limited to
 *                           MAXOUTPUTSIZE. This buffer belongs to the
 *                           handle and is overwritten by the next call.
 */
extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
{
	textcat_t *h = (textcat_t *)handle;
	uint4 i, cnt = 0;
	int minscore = MAXSCORE;
	int threshold = minscore;
	char *result = h->output;

#ifdef HAVE_ALLOCA
	candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
#else
	candidate_t *candidates = (candidate_t *)malloc( sizeof(candidate_t) * h->size );
#define SHOULD_FREE 1
#endif

	void *unknown;

	unknown = fp_Init(NULL);
	if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
		/*** Too little information ***/
		result = _TEXTCAT_RESULT_SHORT;
		goto READY;
	}

#ifdef SHOULD_FREE
	/*
	 * Heap allocation can fail. malloc(0) may legitimately return NULL,
	 * so only a non-empty request counts as an error.
	 */
	if ( candidates == NULL && h->size > 0 ) {
		result = _TEXTCAT_RESULT_UNKOWN;
		goto READY;
	}
#endif

	/*** Calculate the score for each category. ***/
	for (i=0; i<h->size; i++) {
		/* presumably the cut-off lets fp_Compare stop early -- confirm */
		int score = fp_Compare( h->fprint[i], unknown, threshold );
		candidates[i].score = score;
		candidates[i].name = fp_Name( h->fprint[i] );
		if ( score < minscore ) {
			minscore = score;
			threshold = (int)( (double)score * THRESHOLDVALUE );
		}
	}

	/*** Find the best performers ***/
	for (i=0; i<h->size; i++) {
		if ( candidates[i].score < threshold ) {

			if ( ++cnt == MAXCANDIDATES+1 ) {
				break;
			}

			/*
			 * Compact the winners to the front of the array. Source
			 * and destination are the same element whenever no entry
			 * has been skipped yet; struct assignment is well defined
			 * for that exact overlap, memcpy is not.
			 */
			candidates[cnt-1] = candidates[i];

		}
	}

	/*** The verdict ***/
	if ( cnt == MAXCANDIDATES+1 ) {
		result = _TEXTCAT_RESULT_UNKOWN;
	}
	else {
		char *p = result;
		char *plimit = result+MAXOUTPUTSIZE;
		
		qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );

		*p = '\0';
		for (i=0; i<cnt; i++) {
			p = wg_strgmov( p, "[", plimit );
			p = wg_strgmov( p, candidates[i].name, plimit );
			p = wg_strgmov( p, "]", plimit );
		}
	}
 READY:
	fp_Done(unknown);
#ifdef SHOULD_FREE 
	free(candidates);
#undef SHOULD_FREE
#endif
	return result;
}