/*
 * Stem_snowball -- stem a word with the Snowball stemmer attached to fi.
 *
 * fi      supplies the Snowball environment (fi->snowball_options) and the
 *         stemming routine (fi->stemmer->lang_stem).
 * inword  NUL-terminated word to stem.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * If the stemmer leaves a zero-length result, fw->error is set to
 * STEM_TO_NOTHING.  Otherwise string_list[0] points at a newly emalloc()'d,
 * NUL-terminated copy of the stem and free_strings is set to 1.
 */
static FUZZY_WORD *Stem_snowball( FUZZY_OBJECT *fi, const char *inword)
{
    struct SN_env *env = fi->snowball_options;
    FUZZY_WORD *result = create_fuzzy_word( inword, 1 );
    char *stemmed;

    /* Hand the word to the Snowball environment, then run the stemmer on it. */
    SN_set_current( env, strlen(inword), (const symbol *)inword );
    fi->stemmer->lang_stem( env );

    if ( env->l != 0 )
    {
        result->free_strings = 1;   /* string_list[0] will be heap memory */

        /* env->p is copied by length (env->l bytes) and terminated here. */
        stemmed = emalloc( env->l + 1 );
        memcpy( stemmed, env->p, env->l );
        stemmed[env->l] = '\0';

        result->string_list[0] = stemmed;
    }
    else
    {
        /* The stemmer consumed the whole word. */
        result->error = STEM_TO_NOTHING;
    }

    return result;
}
/*
 * double_metaphone -- convert a word to its metaphone code(s).
 *
 * fi      fi->stemmer->fuzzy_mode selects whether the secondary code is
 *         reported (only when it equals FUZZY_DOUBLE_METAPHONE).
 * inword  NUL-terminated word to convert.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 2 ).
 * If the primary code is empty, both codes are freed and the fuzzy word is
 * returned as created.  Otherwise string_list[0] takes ownership of the
 * primary code and free_strings is set to 1.  The secondary code is stored
 * in string_list[1] (and list_size incremented) only when double metaphone
 * mode is on, the code is non-empty, and it differs from the primary code;
 * in every other case it is freed here.
 */
static FUZZY_WORD *double_metaphone( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *fw = create_fuzzy_word( inword, 2 ); /* room for two codes */
    char *codes[2];

    DoubleMetaphone( inword, codes );

    if ( !(*codes[0]) )  /* was there at least one conversion? */
    {
        efree( codes[0] );
        efree( codes[1] );
        return fw;
    }

    fw->free_strings = 1;
    fw->string_list[0] = codes[0];

    /*
     * Keep the second metaphone only if double metaphone is enabled and the
     * code is non-empty and different from the first.  Otherwise release it:
     * previously the "mode not enabled" path returned without freeing
     * codes[1], leaking it on every call.
     */
    if ( FUZZY_DOUBLE_METAPHONE == fi->stemmer->fuzzy_mode
         && *codes[1]
         && strcmp( codes[0], codes[1] ) != 0 )
    {
        fw->list_size++;
        fw->string_list[1] = codes[1];
    }
    else
    {
        efree( codes[1] );
    }

    return fw;
}
/*
 * Stem -- run a word through the Porter rule tables in all_steps.
 *
 * fi      not used by this function.
 * inword  NUL-terminated word; it is not lower-cased here, so the caller is
 *         expected to pass it in lower case.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * On success string_list[0] is an estrdup()'d copy of the stem and
 * free_strings is 1.  On failure fw->error is set to:
 *   STEM_WORD_TOO_BIG  the input or the stem is MAXWORDLEN characters or
 *                      longer, or ReplaceEnd() returned that value
 *   STEM_NOT_ALPHA     the input contains a non-alphabetic character
 *   STEM_TO_NOTHING    the stem is shorter than two characters
 */
static FUZZY_WORD *Stem( FUZZY_OBJECT *fi, const char *inword)
{
    char   *end;                /* scans the working copy */
    char    word[MAXWORDLEN+1]; /* working copy, modified in place */
    int     length;
    int     rule_result;        /* which rule is fired in replacing an end */
    int     i;

    FUZZY_WORD *fw = create_fuzzy_word( inword, 1 );

    /* Make sure the word is not too large from the start. */
    if ( strlen( inword ) >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    /* make working copy (length was checked above) */
    strcpy( word, inword );

    /* Part 1: Check to ensure the word is all alphabetic */
    /* no longer converts to lower case -- word should be lower before calling */

    for ( end = word; *end; end++ )
    {
        /*
         * <ctype.h> functions require a value representable as unsigned
         * char (or EOF).  Casting a negative char straight to unsigned int
         * sign-extends it out of that range, so go through unsigned char.
         */
        if ( !isalpha( (unsigned char) *end ) )
        {
            fw->error = STEM_NOT_ALPHA;
            return fw;
        }
    }

    /*  Part 2: Run through the Porter algorithm */

    for (i = 0; i < (int)(sizeof(all_steps)/sizeof(all_steps[0])); i++)
    {
        rule_result = ReplaceEnd(word, all_steps[i]);

        /*
         * Results 106 and 107 trigger the step1b1 follow-up rules.
         * NOTE(review): presumably these are rule ids from the step 1b
         * table -- confirm against the rule definitions.
         */
        if ((rule_result == 106) || (rule_result == 107))
            rule_result = ReplaceEnd(word, step1b1_rules);

        if ( rule_result == STEM_WORD_TOO_BIG )
        {
            fw->error = rule_result;
            return fw;
        }
    }

    length = (int) strlen( word );

    /* Stem must be two chars or more in length */
    if ( length <= 1 )
    {
        fw->error = STEM_TO_NOTHING;
        return fw;
    }

    if ( length >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    fw->free_strings = 1;       /* string_list[0] is heap memory */
    fw->string_list[0] = estrdup( word );

    return fw;
}
/*
 * no_stem -- fuzzy routine that performs no conversion.
 *
 * fi      not used.
 * inword  NUL-terminated word.
 *
 * Returns the FUZZY_WORD produced by create_fuzzy_word( inword, 1 ) with no
 * further processing.
 */
static FUZZY_WORD *no_stem( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *plain;

    (void) fi;  /* same signature as the other fuzzy routines; no options needed */

    plain = create_fuzzy_word( inword, 1 );
    return plain;
}
Beispiel #5
0
/*
 * soundex -- compute a four-character sound code for a word.
 *
 * fi      not used by this function.
 * inword  NUL-terminated word.
 *
 * Returns the FUZZY_WORD obtained from create_fuzzy_word( inword, 1 ).
 * On success string_list[0] is an estrdup()'d code made of the first
 * (lower-cased) letter followed by up to three group digits, padded with
 * '0', and free_strings is 1.  On failure fw->error is set to:
 *   STEM_WORD_TOO_BIG  the input is MAXWORDLEN characters or longer
 *   STEM_NOT_ALPHA     the first character is not alphabetic, or no
 *                      'a'..'z' letters remain after filtering
 *   STEM_TOO_SMALL     the input is shorter than three characters
 *
 * The letter filter and table lookup assume an ASCII-compatible character
 * set ('a'..'z' contiguous).
 */
FUZZY_WORD *soundex( FUZZY_OBJECT *fi, const char *inword)
{
    FUZZY_WORD *fw = create_fuzzy_word( inword, 1 ); /* create place to store the code */
    char word[MAXWORDLEN+1];
    /* Misc Stuff  */
    char u, l;
    int i, j, n;
    /* Resultant Sound Code: four '0' pad characters plus the terminator */
    char soundCode[5] = "0000";
    /* Group Number Lookup Table, indexed by (letter - 'a'); 0 = no group */
    static const char soundTable[26] =
    {0,      /* A  */
     '1',    /* B  */
     '2',    /* C  */
     '3',    /* D  */
     0,      /* E  */
     '1',    /* F  */
     '2',    /* G  */
     0,      /* H  */
     0,      /* I  */
     '2',    /* J  */
     '2',    /* K  */
     '4',    /* L  */
     '5',    /* M  */
     '5',    /* N  */
     0,      /* O  */
     '1',    /* P  */
     '2',    /* Q  */
     '6',    /* R  */
     '2',    /* S  */
     '3',    /* T  */
     0,      /* U  */
     '1',    /* V  */
     0,      /* W  */
     '2',    /* X  */
     0,      /* Y  */
     '2'};   /* Z  */

    /* Make sure the word is not too large from the start. */
    if ( strlen( inword ) >= MAXWORDLEN )
    {
        fw->error = STEM_WORD_TOO_BIG;
        return fw;
    }

    /* make working copy (length was checked above) */
    strcpy( word, inword );

#ifdef _DEBUG
    /* Debug to console  */
    printf("# %15s: %s ", "soundex.c", word);
#endif

    /* Make sure it actually starts with a letter  */
    if ( !isalpha( (int)((unsigned char)word[0]) ) )
    {
        fw->error = STEM_NOT_ALPHA;
        return fw;
    }

#ifdef _DEBUG
    /* Debug to console  */
    printf("isalpha, ");
#endif

    /* Get string length and make sure it is at least 3 characters  */
    if ( (n = (int)strlen(word)) < 3 )
    {
        fw->error = STEM_TOO_SMALL;
        return fw;
    }

#ifdef _DEBUG
    /* Debug to console  */
    printf("=>3, ");
#endif

    /* If looks like a 4 digit soundex code we don't want to touch it. */

    /* Hmm.  Just because it looks like a duck, doesn't mean it is one.
     * The source is supposed to not be soundex, so this doesn't make a lot
     * of sense.  - moseely */
#ifdef skip_section

    if ( (n = (int)strlen(word)) == 4 )
    {
        if ( isdigit( (int)(unsigned char)word[1] )
          && isdigit( (int)(unsigned char)word[2] )
          && isdigit( (int)(unsigned char)word[3] ) )
            return fw;  /* hand back fw as created; the function returns a
                         * pointer, so an integer status cannot be returned */
    }
#endif

    /* Convert chars to lower case and strip non-letter chars  */
    j = 0;
    for (i = 0; i < n; i++)
    {
        int c = tolower( (unsigned char)word[i] );

        if ( c >= 'a' && c <= 'z' )
            word[j++] = (char)c;
    }

    /* terminate string  */
    word[j] = '\0';

    /*
     * The isalpha() test above is locale dependent, so the first character
     * may be a letter outside 'a'..'z' and the filter can remove everything.
     * Without this guard word[0] would be NUL and the table lookup below
     * would index soundTable[-'a'], reading out of bounds.
     */
    if ( j == 0 )
    {
        fw->error = STEM_NOT_ALPHA;
        return fw;
    }

    /* Length of the filtered word  */
    n = j;

    soundCode[0] = word[0];

    /* remember the group of the first char  */
    l = soundTable[word[0] - 'a'];

    j = 1;

    /* build soundex string: emit a digit whenever the group changes to a
     * non-zero group, until three digits have been written  */
    for (i = 1; i < n && j < 4; i++)
    {
        u = soundTable[word[i] - 'a'];

        if ( u != l )
        {
            if ( u != 0 )
                soundCode[j++] = u;
            l = u;
        }
    }

    fw->free_strings = 1; /* flag that we are creating a string */
    fw->string_list[0] = estrdup( soundCode );
    return fw;
}