/*
 * Stem_snowball -- stem a word with the Snowball stemmer attached to fi.
 *
 * fi      supplies the Snowball environment (fi->snowball_options) and the
 *         language stemming callback (fi->stemmer->lang_stem).
 * inword  NUL-terminated word to stem.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * If the stemmer leaves a zero-length result, ->error is set to
 * STEM_TO_NOTHING.  Otherwise string_list[0] is replaced with an
 * emalloc()'d, NUL-terminated copy of the stemmer output and free_strings
 * is set to flag that the string was allocated here.
 */
static FUZZY_WORD *Stem_snowball( FUZZY_OBJECT *fi, const char *inword)
{
    struct SN_env *env    = fi->snowball_options;
    FUZZY_WORD    *result = create_fuzzy_word( inword, 1 );
    char          *stemmed;

    /* Hand the word to the stemmer, then run the language-specific rules. */
    SN_set_current( env, strlen(inword), (const symbol *)inword );
    fi->stemmer->lang_stem( env );

    /* An empty result means there is nothing to hand back. */
    if ( env->l == 0 )
    {
        result->error = STEM_TO_NOTHING;
        return result;
    }

    result->free_strings = 1;   /* the string below is heap allocated */

    /* The stemmer buffer is copied by length (env->l), so terminate the copy here. */
    stemmed = emalloc( env->l + 1 );
    memcpy( stemmed, env->p, env->l );
    stemmed[env->l] = '\0';

    result->string_list[0] = stemmed;
    return result;
}
/*
 * double_metaphone -- produce one or two metaphone codes for inword.
 *
 * fi      fi->stemmer->fuzzy_mode selects whether the secondary code may be
 *         returned (only when it equals FUZZY_DOUBLE_METAPHONE).
 * inword  NUL-terminated word to encode.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 2 ).
 *   - If the primary code is empty, both codes are freed and the result is
 *     returned untouched.
 *   - Otherwise string_list[0] becomes the primary code and free_strings
 *     is set.
 *   - The secondary code is appended (list_size incremented) only when
 *     double metaphone is enabled, the code is non-empty, and it differs
 *     from the primary code.
 *
 * Both strings filled in by DoubleMetaphone() are owned by this function:
 * each one is either stored in the result or released with efree().
 */
static FUZZY_WORD *double_metaphone( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *fw = create_fuzzy_word( inword, 2 ); /* create place to store the codes */
    char *codes[2];

    DoubleMetaphone( inword, codes );

    if ( !(*codes[0]) )  /* was there at least one conversion? */
    {
        efree( codes[0] );
        efree( codes[1] );
        return fw;
    }

    fw->free_strings = 1;
    fw->string_list[0] = codes[0];

    /*
     * Keep the second code only when double metaphone is enabled and the
     * code is non-empty and different from the first.  In every other case
     * it must be freed here; returning early without freeing it leaked
     * codes[1] whenever double metaphone was disabled.
     */
    if ( FUZZY_DOUBLE_METAPHONE == fi->stemmer->fuzzy_mode
         && *codes[1]
         && strcmp( codes[0], codes[1] ) != 0 )
    {
        fw->list_size++;
        fw->string_list[1] = codes[1];
    }
    else
    {
        efree( codes[1] );
    }

    return fw;
}
/*
 * Stem -- apply the rule tables in all_steps to a copy of inword.
 *
 * fi      not used by this routine.
 * inword  NUL-terminated word; no case folding is done here, so the word
 *         should already be lower case.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * On success string_list[0] is an estrdup()'d copy of the stem and
 * free_strings is set.  On failure ->error is set to one of:
 *   STEM_WORD_TOO_BIG  input or stem is MAXWORDLEN characters or longer,
 *                      or ReplaceEnd() reported that code
 *   STEM_NOT_ALPHA     the word contains a non-alphabetic character
 *   STEM_TO_NOTHING    the stem is shorter than two characters
 */
static FUZZY_WORD *Stem( FUZZY_OBJECT *fi, const char *inword)
{
    char *end;                  /* cursor used to scan the working copy */
    char word[MAXWORDLEN+1];
    int  length;
    int  rule_result;           /* which rule is fired in replacing an end */
    int  i;
    FUZZY_WORD *fw = create_fuzzy_word( inword, 1 );

    /* Make sure the word is not too large from the start. */
    if ( strlen( inword ) >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    /* make working copy (length was bounded above, so strcpy cannot overflow) */
    strcpy( word, inword );

    /* Part 1: Check to ensure the word is all alphabetic */
    /* no longer converts to lower case -- word should be lower before calling */
    for ( end = word; *end; end++ )
    {
        /*
         * <ctype.h> functions require a value representable as unsigned
         * char (or EOF).  Casting a negative char straight to a wider type
         * is undefined behaviour, so go through unsigned char.
         */
        if ( !isalpha( (unsigned char) *end ) )
        {
            fw->error = STEM_NOT_ALPHA;
            return fw;
        }
    }

    /* Part 2: Run through the Porter algorithm */
    for ( i = 0; i < (int)(sizeof(all_steps)/sizeof(all_steps[0])); i++ )
    {
        rule_result = ReplaceEnd( word, all_steps[i] );

        /* Rules 106 and 107 trigger the follow-up step1b1 rule table. */
        if ( (rule_result == 106) || (rule_result == 107) )
            rule_result = ReplaceEnd( word, step1b1_rules );

        if ( rule_result == STEM_WORD_TOO_BIG )
        {
            fw->error = rule_result;
            return fw;
        }
    }

    length = strlen( word );

    /* Stem must be two chars or more in length */
    if ( length <= 1 )
    {
        fw->error = STEM_TO_NOTHING;
        return fw;
    }

    if ( length >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    fw->free_strings = 1;       /* string_list[0] is heap allocated below */
    fw->string_list[0] = estrdup( word );
    return fw;
}
/*
 * no_stem -- pass-through used when no fuzzy transformation is wanted.
 *
 * fi      not used by this routine.
 * inword  NUL-terminated word.
 *
 * Returns whatever create_fuzzy_word( inword, 1 ) produces; nothing else
 * is done to the word here.
 */
static FUZZY_WORD *no_stem( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *unchanged = create_fuzzy_word( inword, 1 );

    return unchanged;
}
/*
 * soundex -- build a four character Soundex-style code for inword.
 *
 * fi      not used by this routine.
 * inword  NUL-terminated word to encode.
 *
 * The word is copied, lower-cased, and stripped of everything that is not
 * an ASCII letter.  The first remaining letter is kept as-is (lower case)
 * and up to three group digits from soundTable follow; unused positions
 * stay '0'.  A letter whose group equals the previous letter's group is
 * skipped, and letters with no group (table value 0) reset that comparison.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * On success string_list[0] is an estrdup()'d code and free_strings is
 * set.  On failure ->error is set to one of:
 *   STEM_WORD_TOO_BIG  input is MAXWORDLEN characters or longer
 *   STEM_NOT_ALPHA     first character is not a letter, or no ASCII
 *                      letters remain after stripping
 *   STEM_TOO_SMALL     input is shorter than three characters
 */
FUZZY_WORD *soundex( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *fw = create_fuzzy_word( inword, 1 ); /* create place to store the code */
    char word[MAXWORDLEN+1];

    /* Misc Stuff */
    char u, l;
    int  i, j, n;

    /* Resultant Sound Code */
    char soundCode[5] = "0000";

    /* Group Number Lookup Table, indexed by (letter - 'a'); 0 means "no group" */
    static const char soundTable[26] =
    {
        0,      /* A */
        '1',    /* B */
        '2',    /* C */
        '3',    /* D */
        0,      /* E */
        '1',    /* F */
        '2',    /* G */
        0,      /* H */
        0,      /* I */
        '2',    /* J */
        '2',    /* K */
        '4',    /* L */
        '5',    /* M */
        '5',    /* N */
        0,      /* O */
        '1',    /* P */
        '2',    /* Q */
        '6',    /* R */
        '2',    /* S */
        '3',    /* T */
        0,      /* U */
        '1',    /* V */
        0,      /* W */
        '2',    /* X */
        0,      /* Y */
        '2'     /* Z */
    };

    /* Make sure the word is not too large from the start. */
    if ( strlen( inword ) >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    /* make working copy (length was bounded above, so strcpy cannot overflow) */
    strcpy( word, inword );

#ifdef _DEBUG
    /* Debug to console */
    printf("# %15s: %s ", "soundex.c", word);
#endif

    /* Make sure it actually starts with a letter */
    if ( !isalpha( (int)((unsigned char)word[0]) ) )
    {
        fw->error = STEM_NOT_ALPHA;
        return fw;
    }

#ifdef _DEBUG
    /* Debug to console */
    printf("isalpha, ");
#endif

    /* Get string length and make sure it is at least 3 characters */
    if ( (n = (int)strlen(word)) < 3 )
    {
        fw->error = STEM_TOO_SMALL;
        return fw;
    }

#ifdef _DEBUG
    /* Debug to console */
    printf("=>3, ");
#endif

    /*
     * If it looks like a 4 character soundex code we don't want to touch it.
     * Note (moseely): looking like a soundex code does not make it one; the
     * source is not supposed to be soundex already, so this section is
     * disabled unless skip_section is defined.
     */
#ifdef skip_section
    if ( n == 4 )
    {
        if ( isdigit( (int)(unsigned char)word[1] )
             && isdigit( (int)(unsigned char)word[2] )
             && isdigit( (int)(unsigned char)word[3] ) )
            return fw;  /* hand back the untouched result; this function returns a FUZZY_WORD *, not a status */
    }
#endif

    /* Convert chars to lower case and strip non-letter chars (ASCII a-z only) */
    j = 0;
    for ( i = 0; i < n; i++ )
    {
        u = tolower( (unsigned char)word[i] );
        if ( (u >= 'a') && (u <= 'z') )
        {
            word[j] = u;
            j++;
        }
    }

    /* terminate string */
    word[j] = 0;

    /*
     * isalpha() above is locale dependent and may accept letters outside
     * a-z that the loop then strips.  If nothing is left, word[0] is NUL
     * and indexing soundTable with it would read outside the table.
     */
    if ( j == 0 )
    {
        fw->error = STEM_NOT_ALPHA;
        return fw;
    }

    /* length of the stripped word */
    n = j;

    soundCode[0] = word[0];                 /* remember first char */
    l = soundTable[ word[0] - 'a' ];        /* group of the previous letter */
    j = 1;

    /* build soundex string */
    for ( i = 1; i < n && j < 4; i++ )
    {
        u = soundTable[ word[i] - 'a' ];

        if ( u != l )
        {
            if ( u != 0 )
                soundCode[j++] = u;

            l = u;
        }
    }

    fw->free_strings = 1;  /* flag that we are creating a string */
    fw->string_list[0] = estrdup( soundCode );
    return fw;
}