/* Example 1 */
/**
 * Tokenizes the compound word 'word' (a unicode string) and writes the token
 * numbers into the array 'tokens'. When case variants are allowed, a single
 * token may be replaced by a list of token numbers enclosed between the special
 * markers BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The whole output
 * sequence is terminated by END_TOKEN_LIST.
 *
 * 'tokens' is assumed to be large enough to hold the result. 'tok' holds the
 * text tokens. 'tokenization_mode' selects character-by-character tokenization
 * or word-by-word tokenization.
 */
void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet,
                            struct string_hash* tok,TokenizationPolicy tokenization_mode) {

int pos=0;
struct list_ustring* token_list=tokenize(word,tokenization_mode,alphabet);
while (token_list!=NULL) {
   int index=get_value_index(token_list->string,tok,DONT_INSERT);
   /* A token of a compound word that is not a token of the text MUST NOT
    * be ignored. For instance, with the compound word "a priori" and a text
    * that only contains "PRIORI", nothing is wrong; the real error case is
    * when the text contains no case equivalent of "priori" at all. In that
    * situation we emit an empty variant list instead of raising an error,
    * because a dictionary token accidentally absent from the text would
    * otherwise block Locate for no good reason. */
   if (is_letter(token_list->string[0],alphabet) || index==-1) {
      /* A token made of letters gets all its case variants. A non-letter
       * token that is missing from the text tokens is also handled here,
       * producing an empty variant list. */
      tokens[pos++]=BEGIN_CASE_VARIANT_LIST;
      struct list_int* variants=get_token_list_for_sequence(token_list->string,alphabet,tok);
      for (struct list_int* v=variants;v!=NULL;v=v->next) {
         tokens[pos++]=v->n;
      }
      /* The variant list was allocated for us, so we must release it */
      free_list_int(variants);
      tokens[pos++]=END_CASE_VARIANT_LIST;
   } else {
      /* A non-letter single character that is a text token: just emit its
       * number */
      tokens[pos++]=index;
   }
   struct list_ustring* next=token_list->next;
   free_list_ustring_element(token_list);
   token_list=next;
}
/* Finally, we terminate the token sequence */
tokens[pos]=END_TOKEN_LIST;
}
/**
 * Takes a compound word and tokenizes it according to the given text tokens.
 * The result is an integer sequence stored in 'token_sequence': each integer
 * is a token number, and the sequence is terminated by -1.
 *
 * Example: "sans raison" may be turned into (121,1,1643,-1)
 *
 * Returns 1 on success, or 0 if a token of the compound word is not present
 * in the given text tokens (an error message is printed in that case).
 *
 * WARNING: every token of the compound word is supposed to be present
 *          in the given text tokens.
 */
int build_token_sequence(unichar* compound_word,struct text_tokens* tokens,int* token_sequence) {
struct list_ustring* list=tokenize(compound_word,WORD_BY_WORD_TOKENIZATION,NULL);
struct list_ustring* tmp;
int i=0;
while (list!=NULL) {
   token_sequence[i]=get_token_number(list->string,tokens);
   if (token_sequence[i]==-1) {
      error("Unknown token <%S> in build_token_sequence\n",list->string);
      /* Free the remaining elements of the list before returning, otherwise
       * the list allocated by tokenize() would leak on this error path */
      while (list!=NULL) {
         tmp=list;
         list=list->next;
         free_list_ustring_element(tmp);
      }
      return 0;
   }
   i++;
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* We put the final -1 */
token_sequence[i]=-1;
return 1;
}