extern int fp_Read( void *handle, const char *fname, int maxngrams ) { fp_t *h = (fp_t *)handle; FILE *fp; char line[1024]; int cnt = 0; fp = fopen( fname, "r" ); if (!fp) { #ifdef VERBOSE fprintf( stderr, "Failed to open fingerprint file '%s'\n", fname); #endif return 0; } h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); while (cnt < maxngrams && wg_getline(line,1024,fp)) { char *p; wg_trim(line, line); p = strpbrk( line, " \t" ); if ( p ) { *p = '\0'; } if ( strlen(line) > MAXNGRAMSIZE ) { continue; } strcpy( h->fprint[cnt].str, line ); h->fprint[cnt].rank = cnt; cnt++; } h->size = cnt; /*** Sort n-grams, for easy comparison later on ***/ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); fclose(fp); return 1; }
extern void *textcat_Init( const char *conffile ) { textcat_t *h; char line[1024]; FILE *fp; fp = fopen( conffile, "r" ); if ( !fp ) { #ifdef VERBOSE fprintf( stderr, "Failed to open config file '%s'\n", conffile); #endif return NULL; } h = (textcat_t *)wg_malloc(sizeof(textcat_t)); h->size = 0; h->maxsize = 16; h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); while ( wg_getline( line, 1024, fp ) ) { char *p; char *segment[4]; int res; /*** Skip comments ***/ #ifdef HAVE_STRCHR if (( p = strchr(line,'#') )) { #else if (( p = index(line,'#') )) { #endif *p = '\0'; } if ((res = wg_split( segment, line, line, 4)) < 2 ) { continue; } /*** Ensure enough space ***/ if ( h->size == h->maxsize ) { h->maxsize *= 2; h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); } /*** Load data ***/ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { goto ERROR; } if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { textcat_Done(h); goto ERROR; } h->size++; } fclose(fp); return h; ERROR: fclose(fp); return NULL; } extern char *textcat_Classify( void *handle, const char *buffer, size_t size ) { textcat_t *h = (textcat_t *)handle; uint4 i, cnt = 0; int minscore = MAXSCORE; int threshold = minscore; char *result = h->output; #ifdef HAVE_ALLOCA candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size ); #else candidate_t *candidates = (candidate_t *)malloc( sizeof(candidate_t) * h->size ); #define SHOULD_FREE 1 #endif void *unknown; unknown = fp_Init(NULL); if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) { /*** Too little information ***/ result = _TEXTCAT_RESULT_SHORT; goto READY; } /*** Calculate the score for each category. ***/ for (i=0; i<h->size; i++) { int score = fp_Compare( h->fprint[i], unknown, threshold ); candidates[i].score = score; candidates[i].name = fp_Name( h->fprint[i] ); if ( score < minscore ) { minscore = score; threshold = (int)( (double)score * THRESHOLDVALUE ); } } /*** Find the best performers ***/ for (i=0; i<h->size; i++) { if ( candidates[i].score < threshold ) { if ( ++cnt == MAXCANDIDATES+1 ) { break; } memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) ); } } /*** The verdict ***/ if ( cnt == MAXCANDIDATES+1 ) { result = _TEXTCAT_RESULT_UNKOWN; } else { char *p = result; char *plimit = result+MAXOUTPUTSIZE; qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); *p = '\0'; for (i=0; i<cnt; i++) { p = wg_strgmov( p, "[", plimit ); p = wg_strgmov( p, candidates[i].name, plimit ); p = wg_strgmov( p, "]", plimit ); } } READY: fp_Done(unknown); #ifdef SHOULD_FREE free(candidates); #undef SHOULD_FREE #endif return result; }