/**
 * Tokenizes the unicode compound word 'word' and writes the numbers of the
 * resulting tokens into the array 'tokens'. When case variants are allowed,
 * a single token may be replaced by a token list delimited by the special
 * values BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The whole
 * sequence is terminated by END_TOKEN_LIST.
 *
 * 'tokens' is assumed to be large enough. 'tok' holds the text tokens.
 * 'tokenization_mode' selects character-by-character tokenization or not.
 */
void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet,
                            struct string_hash* tok,TokenizationPolicy tokenization_mode) {
struct list_ustring* current=tokenize(word,tokenization_mode,alphabet);
int count=0;
while (current!=NULL) {
   int index=get_value_index(current->string,tok,DONT_INSERT);
   if (!is_letter(current->string[0],alphabet) && index!=-1) {
      /* A non-letter token that exists among the text tokens: we just
       * emit its number directly. */
      tokens[count++]=index;
   } else {
      /* Letter tokens get all their case variants. A compound word token
       * missing from the text tokens MUST NOT be treated as an error: if
       * the dictionary has "a priori" but the text only contains "PRIORI",
       * that is legitimate. We therefore emit an empty variant list rather
       * than failing, so that Locate is not blocked needlessly. */
      tokens[count++]=BEGIN_CASE_VARIANT_LIST;
      struct list_int* variants=get_token_list_for_sequence(current->string,alphabet,tok);
      for (struct list_int* v=variants;v!=NULL;v=v->next) {
         tokens[count++]=v->n;
      }
      free_list_int(variants); // s.n.
      tokens[count++]=END_CASE_VARIANT_LIST;
   }
   struct list_ustring* next=current->next;
   free_list_ustring_element(current);
   current=next;
}
/* Finally, we close the token sequence. */
tokens[count]=END_TOKEN_LIST;
}
/**
 * Takes a compound word and tokenizes it according to the given text tokens.
 * The result is an integer sequence stored in 'token_sequence': each integer
 * is a token number, and the sequence is ended by -1.
 *
 * Example: "sans raison" may be turned into (121,1,1643,-1)
 *
 * Returns 1 on success, or 0 if a token of the compound word is not present
 * in the given text tokens (an error message is printed in that case).
 *
 * WARNING: every token of the compound word is supposed to be present
 * in the given text tokens.
 */
int build_token_sequence(unichar* compound_word,struct text_tokens* tokens,int* token_sequence) {
struct list_ustring* list=tokenize(compound_word,WORD_BY_WORD_TOKENIZATION,NULL);
struct list_ustring* tmp;
int i=0;
while (list!=NULL) {
   token_sequence[i]=get_token_number(list->string,tokens);
   if (token_sequence[i]==-1) {
      error("Unknown token <%S> in build_token_sequence\n",list->string);
      /* Fix: free the remaining list elements before bailing out, otherwise
       * the whole tail of the tokenization is leaked on this error path. */
      while (list!=NULL) {
         tmp=list;
         list=list->next;
         free_list_ustring_element(tmp);
      }
      return 0;
   }
   i++;
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* We put the final -1 */
token_sequence[i]=-1;
return 1;
}