/**
 * Records one more occurrence of 'key' in 'table'. The value slot is used
 * as a counter stored directly in the pointer: when get_value() yields NULL
 * for the key, the key is inserted with the value (void*)1; otherwise the
 * stored pointer is advanced by one.
 */
void add_key_table(const unichar* key,struct string_hash_ptr* table){
void* current=get_value(key,table);
if (current==NULL) {
	/* Nothing counted so far: (void*)1 stands for a count of one */
	get_value_index(key,table,INSERT_IF_NEEDED,(void*)1);
	return;
}
/* Stepping the pointer as a char* adds exactly 1 to the counter */
int slot=get_value_index(key,table);
table->value[slot]=(void*)(((char*)current)+1);
}
/**
 * Tests whether the keyword 'list' is forbidden according to 'lemmas'.
 * If its sequence contains a '.', i.e. it has the form XXX.YYY, the
 * sequence truncated at the last '.' is looked up; otherwise the whole
 * sequence is looked up.
 *
 * Returns 1 if the lookup finds something, 0 otherwise, including when
 * 'list' or its sequence is NULL. Nothing is inserted into 'lemmas'.
 */
int has_forbidden_lemma(KeyWord* list,struct string_hash* lemmas) {
if (list==NULL) return 0;
if (list->sequence==NULL) return 0;
int dot=last_index_of(list->sequence,(unichar)'.');
int found;
if (dot!=-1) {
	/* Lemmatized keyword: we work on a copy cut at the last dot */
	Ustring* prefix=new_Ustring(list->sequence);
	truncate(prefix,dot);
	found=get_value_index(prefix->str,lemmas,DONT_INSERT);
	free_Ustring(prefix);
} else {
	/* Not lemmatized: the sequence itself is the candidate lemma */
	found=get_value_index(list->sequence,lemmas,DONT_INSERT);
}
return found!=-1;
}
/**
 * Opens the .fst2 file 'fname' in output mode and returns a freshly
 * allocated Elag_fst_file_out describing it, or NULL if the file cannot
 * be opened. An allocation failure or a 'type' outside [0;FST_BAD_TYPE[
 * is reported through the fatal error functions.
 *
 * On success, the returned structure has its own copy of 'fname', an
 * automaton count of 0 and a label table that already contains EPSILON.
 */
Elag_fst_file_out* fst_file_out_open(const VersatileEncodingConfig* vec,const char* fname,int type) {
Elag_fst_file_out* out=(Elag_fst_file_out*)malloc(sizeof(Elag_fst_file_out));
if (out==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
if (!(type>=0 && type<FST_BAD_TYPE)) {
   fatal_error("fst_file_out_open: bad FST_TYPE\n");
}
out->f=u_fopen(vec,fname,U_WRITE);
if (out->f==NULL) {
   error("fst_out_open: unable to open '%s'\n",fname);
   free(out);
   return NULL;
}
/* We remember where the ten-zero line starts.
 * NOTE(review): presumably a placeholder for a value patched in later
 * through 'fstart' - confirm against the code that closes the file */
out->fstart=ftell(out->f);
u_fprintf(out->f,"0000000000\n");
out->name=strdup(fname);
if (out->name==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
out->type=type;
out->nb_automata=0;
out->labels=new_string_hash(16);
/* <E> is the first label inserted, so that this special tag gets #0 */
get_value_index(EPSILON,out->labels);
return out;
}
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) {
unichar* lower=u_strdup(e->inflected);
u_tolower(lower);
KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords);
free(lower);
if (k_inflected==NULL) return;
Ustring* tmp=new_Ustring(64);
u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]);
KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords);
if (k_lemma==NULL) {
	k_lemma=new_KeyWord(0,tmp->str,NULL);
	k_lemma->lemmatized=LEMMATIZED_KEYWORD;
	get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma);
}
/* Now, we look for all the case compatible tokens, and we add
 * their weights to the new lemmatized element
 */
while (k_inflected!=NULL) {
	if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) {
		/* We have a match */
		k_lemma->weight+=k_inflected->weight;
		k_inflected->lemmatized=1;
	}
	k_inflected=k_inflected->next;
}
free_Ustring(tmp);
}
/**
 * Loads a compound word file, adding each word to the keywords.
 */
void load_compound_words(char* name,VersatileEncodingConfig* vec,
		struct string_hash_ptr* keywords) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return;
Ustring* line=new_Ustring(256);
Ustring* lower=new_Ustring(256);
while (EOF!=readline(line,f)) {
	if (line->str[0]=='{') {
		/* We skip tags */
		continue;
	}
	u_strcpy(lower,line->str);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)keywords->value[index];
	add_keyword(&value,line->str,1);
	keywords->value[index]=value;
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
}
/**
 * This function optimizes a pattern of the form "eat".
 */
void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph,
               struct locate_parameters* p,Abstract_allocator prv_alloc) {
/* Whatever happens, this pattern will be turned into a token list */
tag[i]->type=TOKEN_LIST_TAG;
unichar* opt_token=tag[i]->pattern->inflected;
/* First, we check if this token pattern can recognize some tag tokens */
struct list_int* list=p->tag_token_list;
while (list!=NULL) {
   struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1);
   if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK) && is_equal_or_uppercase(opt_token,entry->inflected,alph)) ||
       !u_strcmp(opt_token,entry->inflected)) {
      tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc);
   }
   free_dela_entry(entry);
   list=list->next;
}
/* Then, we look for normal tokens */
if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) {
   /* If no case variants are allowed, then we just have to insert the number
    * of the token, but only if this token in the text ones. */
   int token_number;
   if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) {
      tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc);
   }
   return;
}
/* Here, we have to get all the case variants of the token. */
tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc);
}
Example #7
0
/*
** Returns the integer value of an argument of kind 'type' whose raw
** bytes start at 'values':
** - A_REG: the content of register number values[0] when is_valid_reg()
**   accepts it, 0 otherwise;
** - A_DIR: oct_to_int(values);
** - A_IND: oct_to_int() of what get_value_index() returns for the address
**   (pc + value % IDX_MOD) reduced by my_mod() to MEM_SIZE.
** Any other 'type' gives 0.
*/
int		get_val(char type, char *values, t_arena *arena, t_proc *proc)
{
  t_conv	val;
  char		*ind_val;
  int		addr;

  val.integer = 0;
  if (type == A_REG && is_valid_reg(type, values[0]))
    val.integer = proc->reg[values[0] - 1];
  if (type != A_DIR && type != A_IND)
    return (val.integer);
  val.integer = oct_to_int(values);
  if (type == A_IND)
    {
      /* the value just read is an offset relative to the pc */
      addr = my_mod((proc->pc + (val.integer % IDX_MOD)), MEM_SIZE);
      ind_val = get_value_index(&addr, arena);
      val.integer = oct_to_int(ind_val);
    }
  return (val.integer);
}
/**
 * Loads the initial keyword list from a tok_by_freq.txt file,
 * and turns all those tokens in a list whose primary key is the
 * lower case token:
 * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50)
 */
struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return NULL;
Ustring* line=new_Ustring(128);
Ustring* lower=new_Ustring(128);
struct string_hash_ptr* res=new_string_hash_ptr(1024);
int val,pos;
/* We skip the first line of the file, containing the number
 * of tokens
 */
if (EOF==readline(line,f)) {
	fatal_error("Invalid empty file %s\n",name);
}
while (EOF!=readline(line,f)) {
	if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) {
		fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
	}
	u_strcpy(lower,line->str+pos);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)res->value[index];
	res->value[index]=new_KeyWord(val,line->str+pos,value);
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
return res;
}
/**
 * This function inserts a new element for the given variable at the head
 * of the given variable list. The element is built from the index of
 * 'variable' in var->variable_index, looked up without insertion, and from
 * the given transition.
 * No test is done to check whether there is already a transition with the
 * given variable, because it cannot happen if the grammar is deterministic.
 */
void add_output_variable(OutputVariables* var,unichar* variable,Transition* transition,
		struct opt_variable** variable_list,Abstract_allocator prv_alloc) {
struct opt_variable* cell=new_opt_variable(get_value_index(variable,var->variable_index,DONT_INSERT),transition,prv_alloc);
/* Head insertion */
cell->next=*variable_list;
*variable_list=cell;
}
/**
 * Writes two "label_index destination " pairs to fstf->f, one for the
 * label <MOT> and one for the label <!MOT>, both going to state 'to'.
 * The label indexes are obtained from fstf->labels.
 */
void LEXIC_trans_write(Elag_fst_file_out * fstf, int to) {

  static const char* const tags[] = { "<MOT>", "<!MOT>" };
  /* 8 unichars are enough for the longest tag and its final 0 */
  unichar label[8];

  for (int k = 0; k < 2; k++) {
    u_strcpy(label, tags[k]);
    int idx = get_value_index(label, fstf->labels);
    u_fprintf(fstf->f, "%d %d ", idx, to);
  }
}
Example #11
0
/**
 * Deprecated plural evaluation: returns one value per element of 'o',
 * computed by get_value_index() on the model of the first element and the
 * index of each element. An empty input gives an empty result.
 */
Ints ClassnamePredicate::get_value(const PLURALVARIABLETYPE &o) const {
  IMPKERNEL_DEPRECATED_METHOD_DEF(2.1, "Use index version");
  if (o.empty()) {
    return Ints();
  }
  Ints values(o.size());
  // every element is evaluated against the model of the first one
  Model *model = internal::get_model(o[0]);
  for (unsigned int pos = 0; pos < o.size(); ++pos) {
    values[pos] += get_value_index(model, internal::get_index(o[pos]));
  }
  return values;
}
Example #12
0
/**
 * Loads the lines of a text file into a string_hash and returns it, or NULL
 * if the file can not be opened. We arbitrary fix the limit of a line to 4096
 * characters. Each line is splitted into a key and a value, according to a
 * given separator character. An error message will be printed if a line does not
 * contain the separator character, if an empty line is found, or if a line contains
 * an empty key. In case of empty values, the empty string will be used.
 * Note that keys and values can contain characters protected with the \ character,
 * including protected new lines like:
 *
 * 123\
 * =ONE_TWO_THREE_NEW_LINE
 *
 */
struct string_hash* load_key_value_list(const char* name,int mask_encoding_compatibility_input,unichar separator) {
U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ);
if (f==NULL) return NULL;
struct string_hash* hash=new_string_hash();
unichar temp[4096];
unichar key[4096];
unichar value[4096];
/* We build a string with the separator character */
unichar stop[2];
stop[0]=separator;
stop[1]='\0';
int code;
while (EOF!=(code=u_fgets2(temp,f))) {
   if (code==0) {
      error("Empty line\n");
   }
   else {
      /* First, we try to read a non empty key */
      int pos=0;
      code=parse_string(temp,&pos,key,stop);
      if (code==P_BACKSLASH_AT_END) {
         error("Backslash at end of line:<%S>\n\n",temp);
      }
      else if (pos==0 &&temp[pos]=='\0') {
         /* Empty line */
    	  continue;
      }
      else if (pos==0) {
         /* If the line starts with the separator */
         error("Line with empty key:\n<%S>\n",temp);
      }
      else {
         /* We jump over the separator */
         pos++;
         /* We initialize 'value' with the empty string in case it is not
          * defined in the file */
         value[0]='\0';
         if(P_BACKSLASH_AT_END==parse_string(temp,&pos,value,P_EMPTY)) {
            error("Backslash at end of line:\n<%S>\n",temp);
         }
         else {
            /* If we have a valid (key,value) pair, we insert it into the string_hash */
            get_value_index(key,hash,INSERT_IF_NEEDED,value);
         }
      }
   }
}
u_fclose(f);
return hash;
}
/**
 * We remove every keyword that is tagged with the forbidden code. If
 * a forbidden keyword has several tags, all of them are removed:
 *
 * the,.DET + the,.XXX => all 'the' keywords are removed
 */
struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) {
struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE);
Ustring* tmp=new_Ustring();
for (int i=0;i<keywords->size;i++) {
	KeyWord* list=(KeyWord*)(keywords->value[i]);
	while (list!=NULL) {
		if (get_forbidden_keyword(list,code,tmp)) {
			get_value_index(tmp->str,hash);
		}
		list=list->next;
	}
}
free_Ustring(tmp);
return hash;
}
Example #14
0
/**
 * Loads the lines of a text file info a string_hash and returns it, or NULL
 * if the file can not be opened. We arbitrary fix the limit of a line to 4096
 * characters. For each line, we ignore the carriage return, if any, and we use
 * the remaining string as key and value. An error message will be printed if
 * an empty line is found.
 */
struct string_hash* load_key_list(const char* name,int mask_encoding_compatibility_input) {
U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ);
if (f==NULL) return NULL;
struct string_hash* hash=new_string_hash(DONT_USE_VALUES);
unichar temp[4096];
while (EOF!=u_fgets_limit2(temp,4096,f)) {
   if (temp[0]=='\0') {
      error("Empty line in %s\n",name);
   } else {
      get_value_index(temp,hash);
   }
}
u_fclose(f);
return hash;
}
/**
 * Writes to fstf->f one "label_index destination " pair per digit from
 * '0' to '9', all going to state 'to'. The label indexes are obtained
 * from fstf->labels.
 */
void CHFA_trans_write(Elag_fst_file_out * fstf, int to) {

  /* One-character string, reused for every digit */
  unichar label[2];

  label[0] = '0';
  label[1] = 0;

  while (label[0] <= '9') {
    int idx = get_value_index(label, fstf->labels);
    u_fprintf(fstf->f, "%d %d ", idx, to);
    label[0]++;
  }
}
Example #16
0
/**
 * Adds the given DELA entry to the given tree, in the entry list that has
 * the index of its inflected form. The insertion itself is delegated to
 * insert_if_not_present(): according to the original contract, an entry
 * that is already present is freed, and otherwise it is put in the tree
 * so that IT MUST NOT BE FREED!
 */
void add_entry(struct DELA_tree* tree,struct dela_entry* entry) {
int n=get_value_index(entry->inflected,tree->inflected_forms);
if (n!=tree->size) {
   /* The inflected form already has its entry list */
   tree->dela_entries[n]=insert_if_not_present(entry,tree->dela_entries[n]);
   return;
}
/* Here, the inflected form is a new one, so we need one more list */
if (n==tree->capacity) {
   /* The array is full: we double its capacity */
   tree->capacity=2*tree->capacity;
   tree->dela_entries=(struct dela_entry_list**)realloc(tree->dela_entries,tree->capacity*sizeof(struct dela_entry_list*));
   if (tree->dela_entries==NULL) {
      fatal_alloc_error("add_entry");
   }
}
tree->dela_entries[n]=NULL;
(tree->size)++;
tree->dela_entries[n]=insert_if_not_present(entry,tree->dela_entries[n]);
}
/**
 * This function takes a unicode string 'word' representing a compound word
 * and tokenizes it. The output is the array 'tokens', filled with the
 * numbers of the tokens that constitute the word. A token can be replaced
 * by the list of its case variants, delimited by the special values
 * BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The whole sequence is
 * ended by END_TOKEN_LIST.
 *
 * The array 'tokens' is supposed to be large enough. 'tok' represents the
 * text tokens. 'tokenization_mode' indicates if the word must be tokenized
 * character by character or not.
 */
void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet,
                            struct string_hash* tok,TokenizationPolicy tokenization_mode) {

int count=0;
struct list_ustring* cell=tokenize(word,tokenization_mode,alphabet);
while (cell!=NULL) {
   int number=get_value_index(cell->string,tok,DONT_INSERT);
   if (!is_letter(cell->string[0],alphabet) && number!=-1) {
      /* A non letter token that belongs to the text: its number is enough */
      tokens[count++]=number;
   } else {
      /* A token that starts with a letter is replaced by all its case
       * variants. A token of the compound word that is not a token of
       * the text MUST NOT be ignored: with the compound word "a priori"
       * and a text that only contains "PRIORI", the variant list does
       * the job. When the text has no case equivalent at all, the list
       * is just empty. No error is raised, since a dictionary token that
       * is absent from the text should not block the Locate. */
      tokens[count++]=BEGIN_CASE_VARIANT_LIST;
      struct list_int* variants=get_token_list_for_sequence(cell->string,alphabet,tok);
      for (struct list_int* v=variants;v!=NULL;v=v->next) {
         tokens[count++]=v->n;
      }
      free_list_int(variants);
      tokens[count++]=END_CASE_VARIANT_LIST;
   }
   /* The cell is freed once we have moved to the next one */
   struct list_ustring* done=cell;
   cell=cell->next;
   free_list_ustring_element(done);
}
/* Finally, we end the token list. */
tokens[count]=END_TOKEN_LIST;
}
/**
 * Writes to fstf->f one "label_index destination " pair for each
 * character of PUNC_TAB except '{', all going to state 'to'. The label
 * indexes are obtained from fstf->labels.
 */
void PNC_trans_write(Elag_fst_file_out * fstf, int to) {

  /* One-character string, reused for every punctuation mark */
  unichar label[4];

  label[1] = 0;

  for (const unichar * cursor = PUNC_TAB; *cursor != 0; cursor++) {
    if (*cursor == '{') {
      continue;
    }
    label[0] = *cursor;
    int idx = get_value_index(label, fstf->labels);
    u_fprintf(fstf->f, "%d %d ", idx, to);
  }

}
/**
 * This function constructs and returns a token tree from a normalization
 * grammar, or NULL if the grammar cannot be loaded.
 * Tokens are represented by integers.
 */
struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar,
		const Alphabet* alph,struct text_tokens* tok) {
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free);
if (fst2==NULL) return NULL;
/* All the text tokens are put in a temporary string_hash that is used
 * during the exploration to speed up the lookups */
struct string_hash* token_index=new_string_hash(DONT_USE_VALUES);
int i=0;
while (i<tok->N) {
   get_value_index(tok->token[i],token_index);
   i++;
}
struct normalization_tree* root=new_normalization_tree();
explore_normalization_fst2(fst2,fst2->initial_states[1],root,token_index,U_EMPTY,alph,NULL);
/* Neither the grammar nor the temporary string_hash is needed anymore */
free_abstract_Fst2(fst2,&fst2_free);
free_string_hash(token_index);
return root;
}
/**
 * Loads the tags of the given .fst2 file into fst->symbols. Returns 0 in
 * case of success; -1 otherwise.
 * Note that the position in the file is unchanged after a call to this
 * function: it is saved on entry and restored before every return, the
 * error ones included.
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
/* We backup the position in the file, and we come back at the
 * beginning of the file */
long fpos=ftell(fst->f);
rewind(fst->f);
/* Now, we go to the tags section, skipping all the automata */
unichar buf[MAXBUF];
int i=0;
int len;
while (i<fst->nb_automata) {
   if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
      error("load_fst_tags: %s: unexpected EOF\n",fst->name);
      /* We honor the position contract on failure too */
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* A line starting with 'f' followed by a space ends an automaton */
   if (buf[0]=='f' && isspace(buf[1])) {
      i++;
   }
   /* If we have read the beginning of a long line, we skip the rest of the line */
   while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
      len=u_fgets(buf,MAXBUF,fst->f);
   }
}
Ustring* ustr=new_Ustring(64);
/* NOTE(review): this loop only stops on a readline() result of 0 or on a
 * line starting with 'f'. If readline() reports the end of file with a
 * non-zero EOF, that case reaches the symbol check below - confirm that
 * this is intended. */
while (readline(ustr,fst->f) && ustr->str[0]!='f') {
   if (ustr->str[0]!='%' && ustr->str[0]!='@') {
      error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
      /* The line buffer must not leak, and the position must be restored */
      free_Ustring(ustr);
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* +1 because we ignore the % or @ at the beginning of the line */
   symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
   /* If 'symbol' is NULL, then an error message has already
    * been printed. Moreover, we want to associate NULL to the
    * string, so that we don't exit the function. Whatever it is,
    * we add the symbol to the symbols of the .fst2 */
   get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
}
if (*ustr->str==0) {
   fatal_error("load_fst_tags: unexpected EOF\n");
}
free_Ustring(ustr);
/* We set back the position in the file */
fseek(fst->f,fpos,SEEK_SET);
return 0;
}
/**
 * Builds the string "str1,str2" and returns the index that
 * get_value_index() gives for it in 'hash'.
 *
 * The string is built in a stack buffer when it fits in
 * DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE unichars, and in a temporary
 * heap buffer otherwise. A failed allocation raises a fatal error.
 */
static int get_value_index_for_string_colon_string(const unichar* str1,const unichar* str2,struct string_hash* hash) {
   unichar tmp_default[DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE];
   unichar* tmp=tmp_default;
   unichar* allocated_buffer=NULL;
   /* +2: one unichar for the ',' and one for the final '\0' */
   int nb_unichar_buffer=u_strlen(str1)+u_strlen(str2)+2;
   if (nb_unichar_buffer>DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE) {
      /* The buffer contains unichars, so its size is a number of
       * unichars, not a number of pointers */
      allocated_buffer=(unichar*)malloc(sizeof(unichar)*nb_unichar_buffer);
      if (allocated_buffer==NULL) {
         fatal_alloc_error("get_value_index_for_string_colon_string");
      }
      tmp=allocated_buffer;
   }
   u_sprintf(tmp,"%S,%S",str1,str2);
   int value=get_value_index(tmp,hash);
   /* free(NULL) does nothing, so no test is needed for the stack case */
   free(allocated_buffer);
   return value;
}
/**
 * This function adds the given token to the given token tree, if not
 * already present. Then, it puts a new transition (tag_number,dest_state)
 * at the head of the transition list of this token.
 */
void add_tag(unichar* token,int tag_number,int dest_state,struct fst2txt_token_tree* tree, Abstract_allocator prv_alloc) {
int n=get_value_index(token,tree->hash);
const int is_new_token=(n==tree->size);
if (is_new_token) {
   /* The token was not in the tree, so it needs its own transition list */
   if (tree->size==tree->capacity) {
      /* The transition array is full: we double it. The old size given
       * to realloc_cb is half of the new capacity */
      tree->capacity=2*tree->capacity;
      tree->transition_array=(Transition**)realloc_cb(tree->transition_array,(tree->capacity/2)*sizeof(Transition*),tree->capacity*sizeof(Transition*),prv_alloc);
      if (tree->transition_array==NULL) {
         fatal_alloc_error("add_tag");
      }
   }
   (tree->size)++;
   /* The new list starts empty */
   tree->transition_array[n]=NULL;
}
/* We assume that the transition is not already in the list, because
 * it would mean that the fst2 is not deterministic. */
Transition** slot=&(tree->transition_array[n]);
*slot=new_Transition(tag_number,dest_state,*slot,prv_alloc);
}
/**
 * This function explores a dictionary tree in order to insert an entry.
 * 'inflected' is the inflected form to insert, and 'pos' is the current position
 * in the string 'inflected'. 'node' is the current node in the dictionary tree.
 * 'infos' is used to access to constant parameters.
 */
static void add_entry_to_dictionary_tree(const unichar* inflected,int pos,struct dictionary_node* node,
                                  struct info* infos,int /*line*/, Abstract_allocator prv_alloc) {
for (;;) {
if (inflected[pos]=='\0') {
   /* If we have reached the end of 'inflected', then we are in the
    * node where the INF code must be inserted */
   int N=get_value_index(infos->INF_code,infos->INF_code_list);
   if (node->single_INF_code_list==NULL) {
      /* If there is no INF code in the node, then
       * we add one and we return */
      node->single_INF_code_list=new_list_int(N,prv_alloc);
      node->INF_code=N;
      return;
   }
   /* If there is an INF code list in the node ...*/
   if (is_in_list(N,node->single_INF_code_list)) {
      /* If the INF code has already been taken into account for this node
       * (case of duplicates), we do nothing */
      return;
   }
   /* Otherwise, we add it to the INF code list */
   node->single_INF_code_list=head_insert(N,node->single_INF_code_list,prv_alloc);
	/* And we update the global INF line for this node */
   node->INF_code=get_value_index_for_string_colon_string(infos->INF_code_list->value[node->INF_code],infos->INF_code,infos->INF_code_list);
   return;
}
/* If we are not at the end of 'inflected', then we look for
 * the correct outgoing transition and we follow it */
struct dictionary_node_transition* t=get_transition(inflected[pos],&node,prv_alloc);
if (t->node==NULL) {
   /* We create the node if necessary */
   t->node=new_dictionary_node(prv_alloc);
   (t->node->incoming)++;
}

node=t->node;
pos++;
}
}
Example #24
0
/**
 * Adds to 'automaton' a transition from state 'from' to state 'to'.
 *
 * If 'label' is SYMBOL_DEF, 'to' becomes the default state of 'from'; it
 * is a fatal error if 'from' already has one. Otherwise, one outgoing
 * transition is added for each symbol of the list 'label', tagged with
 * the index of the symbol in 'symbols'.
 */
void add_transition(SingleGraph automaton,struct string_hash_ptr* symbols,int from,
                    symbol_t* label,int to) {
if (label==SYMBOL_DEF) {
   if (automaton->states[from]->default_state!=-1) {
      fatal_error("add_transition: more than one default transition\n");
   }
   automaton->states[from]->default_state=to;
   return;
}
for (symbol_t* s=label;s!=NULL;s=s->next) {
   if (s==SYMBOL_DEF) {
      fatal_error("add_transition: unexpected default transition\n");
   }
   /* The symbol is identified by its string representation, so that
    * there are no duplicates in the value array */
   Ustring* text=new_Ustring();
   symbol_to_str(s,text);
   int n=get_value_index(text->str,symbols,INSERT_IF_NEEDED,s);
   free_Ustring(text);
   add_outgoing_transition(automaton->states[from],n,to);
}
}
Example #25
0
/**
 * Tests if s is a code pattern (V:Kms, N+Hum, ...).
 * 'semantic_codes' is a string_hash that contains all the possible
 * grammatical/semantic codes. If it is NULL and if nothing in 's' helps to
 * guess whether we have a code or a lemma, AMBIGUOUS_PATTERN is returned.
 * If 'tilde_negation_operator' is not null, '~' is a stop character
 * instead of '-'.
 *
 * A NULL or empty 's' and a backslash at the end of 's' are fatal errors.
 */
enum pattern_type is_code_pattern(const unichar* s,struct string_hash* semantic_codes,int tilde_negation_operator) {
if (s==NULL || s[0]=='\0') {
   fatal_error("NULL or empty pattern in is_code_pattern\n");
}
int i=0;
unichar tmp[2048];
/* We read 's' up to the first stop character, if any */
int code;
if (tilde_negation_operator) {
   code=parse_string(s,&i,tmp,P_PLUS_TILDE_COLON);
} else {
   code=parse_string(s,&i,tmp,P_PLUS_MINUS_COLON);
}
if (code==P_BACKSLASH_AT_END) {
   fatal_error("Backslash at end of a pattern\n");
}
if (s[i]!='\0') {
   /* We stopped on '+', ':' or the negation operator: it is a code pattern */
   return CODE_PATTERN;
}
if (semantic_codes==NULL) {
   /* Without grammatical codes, we can't decide */
   return AMBIGUOUS_PATTERN;
}
/* Otherwise, 's' is a code if and only if it is a known one */
return (get_value_index(s,semantic_codes,DONT_INSERT)!=-1) ? CODE_PATTERN : LEMMA_PATTERN;
}
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 * 
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                  struct tfst_simple_match_list* element,
                                  vector_ptr* items,int current_item,Ustring* s,
                                  int last_text_dependent_tfst_tag,
                                  struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
if (item->debug_output!=NULL) {
	/* If we have a debug output, we deal it */
	u_strcat(s,item->debug_output);
	explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts);
	s->len=len;
	s->str[len]='\0';
	return;
}


unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;

unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
	/* If we have a capture variable $:X$, we must save the previous value
	 * for this dictionary variable */
	old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}

Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
	   /* We only have to process the output only once,
	    * since it will have the same effect on all tfst tags.
	    *
	    * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
	    * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
	   if (capture) {
		   /* If we have a capture variable, then we have to check whether the tfst tag
	   	    * is a tagged token or not */
	   	   int tfst_tag_number=text_tags->n;
	   	   int fst2_tag_number=item->fst2_transition->tag_number;
	   	   if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
	   		   goto restore_dic_variable;
	   	   }
	   } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* We do not take into account matches with variable errors if the
          * process_output_for_tfst_match function has decided that backtracking
          * was necessary, either because of a variable error of because of a
          * $a.SET$ or $a.UNSET$ test */
		  goto restore_dic_variable;
      }
   }
   int last_tag=last_text_dependent_tfst_tag;
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
          /* If we an output variable start $|a( */
          int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index);

		  Ustring* old_value = new_Ustring();
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  // now old_value contain the backup

          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);

		  // restore the good content from backup
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  free_Ustring(old_value);

          goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
          /* If we an output variable end $|a) */
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our 
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* We add the address of the start field to our list */
         (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         /* After the exploration, there are 2 cases:
          * 1) *var_starts is NULL: nothing to do
          * 2) *var_starts is not NULL: we reached the end of the items without findind any
          *                             text dependent match, so we can free the list */
         free_list_pointer(*var_starts);
         (*var_starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag' */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the 
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because is not, we will have an 
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
         
         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) {
    	  fatal_error("problem $[\n");
      }
   } else {
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      /* If the current text tag is not a text independent one */
      
      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      int previous_start_token,previous_start_char; 
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just increase
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         //error("current item=%d\n",text_tags->n);
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* Here we have to insert the text that is between current_start and current_end,
       * and then, the output of the fst2 transition */
      if (infos->output_policy==MERGE_OUTPUTS) {
    	  insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                 current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
         ,&ptr2 /* We have encountered a text dependent tag, so there is no
                 * more pending start tag like $a( */
         );
   element->m=saved_element;
   /* If there was a $* tag pending */
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */ 
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
	/* If we have a capture variable $:X$, we must restore the previous value
	 * for this dictionary variable */
	set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}
   }
   return;
}
/* Here, we have to get all the case variants of the token. */
tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc);
}


/**
 * This function checks if a pattern of the form "<eat>", "<eat.V>" or "<eaten,eat.V>"
 * can match the given tag token like "{today,.ADV}".
 */
void optimize_full_pattern_for_tag(unichar* tag_token,int i,Fst2Tag* tag,Alphabet* alph,
               struct locate_parameters* p,Abstract_allocator prv_alloc) {
DISCARD_UNUSED_PARAMETER(alph)
int token_number=get_value_index(tag_token,p->tokens);
struct dela_entry* entry=tokenize_tag_token(tag_token,1);
struct pattern* pattern=tag[i]->pattern;
if ((pattern->type==LEMMA_PATTERN) || (pattern->type==INFLECTED_AND_LEMMA_PATTERN)) {
   /* If the pattern has a constraint on the lemma, we check it */
   if (u_strcmp(entry->lemma,pattern->lemma)) {
      free_dela_entry(entry,prv_alloc);
      return;
   }
}
if ((pattern->type==LEMMA_AND_CODE_PATTERN) || (pattern->type==FULL_PATTERN)) {
   /* If the pattern contains a constraint on grammatical/semantic/inflectional
    * codes, then it has been put in the pattern tree, and so, this pattern
    * was tried on the current tag token in the 'check_patterns_for_tag_tokens'
    * function. Then, we just have to test if the tag token matches this pattern. */
   if (p->matching_patterns==NULL || p->matching_patterns[token_number]==NULL ||
/**
 * Appends the automaton A as a new graph to the given .fst2 output file.
 *
 * The graph is written as a header line "-<number> <name>", followed by one
 * line per state. Each state line starts with 't' for a final state or ':'
 * otherwise, followed by "<label index> <destination state>" pairs, one per
 * outgoing transition. The graph is closed by a "f " line.
 *
 * Label indices are obtained from fstf->labels through get_value_index().
 * When fstf->type is FST_LOCATE, the tags "<PNC>", "<CHFA>", "<NB>" and "<.>"
 * are not written as plain labels but delegated to PNC_trans_write(),
 * CHFA_trans_write() and LEXIC_trans_write().
 *
 * On completion, fstf->nb_automata is incremented. An unknown fstf->type is
 * reported through fatal_error().
 */
void fst_file_write(Elag_fst_file_out* fstf,const Fst2Automaton* A) {
Ustring* label=new_Ustring();
/* We select the function that turns a symbol into a tag string,
 * according to the kind of .fst2 we are producing */
void (*to_label)(const symbol_t*,Ustring*)=NULL;
if (fstf->type==FST_TEXT) {
   to_label=symbol_to_text_label;
} else if (fstf->type==FST_GRAMMAR) {
   to_label=symbol_to_grammar_label;
} else if (fstf->type==FST_LOCATE) {
   to_label=symbol_to_locate_label;
} else {
   fatal_error("fst_file_write: invalid fstf->type: %d\n",fstf->type);
}
/* Graph header: number and name */
u_fprintf(fstf->f,"-%d %S\n",fstf->nb_automata+1,A->name);
int label_index;
unichar deflabel[]={'<','d','e','f','>',0};
int q=0;
while (q<A->automaton->number_of_states) {
   SingleGraphState state=A->automaton->states[q];
   u_fprintf(fstf->f,"%C ",is_final_state(state)?'t':':');
   Transition* t=state->outgoing_transitions;
   while (t!=NULL) {
      if (t->tag_number!=-1) {
         to_label(t->label,label);
      } else {
         /* "EMPTY" transition, created because the automaton was
          * emptied at trim time */
         u_strcpy(label,"EMPTY");
      }
      /* 'special' is set when the transition has been fully handled by
       * one of the Locate-specific writers */
      int special=0;
      if (fstf->type==FST_LOCATE) {
         special=1;
         if (u_strcmp(label->str, "<PNC>") == 0) {
            PNC_trans_write(fstf, t->state_number);
         } else if (u_strcmp(label->str, "<CHFA>") == 0 || u_strcmp(label->str, "<NB>") == 0) {
            CHFA_trans_write(fstf, t->state_number);
         } else if (u_strcmp(label->str, "<.>") == 0) {
            LEXIC_trans_write(fstf, t->state_number);
         } else {
            special=0;
         }
      }
      if (!special) {
         /* Normal transition: label index followed by destination state */
         label_index=get_value_index(label->str,fstf->labels);
         u_fprintf(fstf->f,"%d %d ",label_index,t->state_number);
      }
      t=t->next;
   }
   if (state->default_state!=-1) {
      /* A default transition is only expected in grammar automata; we
       * report it otherwise, but we still write it */
      if (fstf->type!=FST_GRAMMAR) {
         error("Unexpected <def> label in text/locate automaton\n");
      }
      label_index=get_value_index(deflabel,fstf->labels);
      u_fprintf(fstf->f,"%d %d ",label_index,state->default_state);
   }
   u_fputc('\n',fstf->f);
   q++;
}
/* End-of-graph marker */
u_fprintf(fstf->f,"f \n");
free_Ustring(label);
fstf->nb_automata++;
}
Example #29
0
/**
 * Value accessor taking an ARGUMENTTYPE directly.
 *
 * Forwards to get_value_index(), passing the results of
 * internal::get_model(vt) and internal::get_index(vt), and returns what
 * that call returns.
 *
 * NOTE(review): the IMPKERNEL_DEPRECATED_METHOD_DEF macro presumably flags
 * this overload as deprecated since version 2.1 with the hint
 * "Use index version" - confirm against the macro definition.
 */
int ClassnamePredicate::get_value(ARGUMENTTYPE vt) const {
  IMPKERNEL_DEPRECATED_METHOD_DEF(2.1, "Use index version");
  return get_value_index(internal::get_model(vt), internal::get_index(vt));
}
Example #30
0
/**
 * Returns a control byte that represents the characteristics of the given token.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
    int i;
    int tmp;
    unsigned char c=0;
    if (token==NULL || token[0]=='\0') {
        fatal_error("NULL or empty token in get_control_byte\n");
    }
    /* We consider that a token starting with a letter is a word */
    if (is_letter(token[0],alph)) {
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK);
        /* If a token is a word, we check if it is in the 'err' word list
         * in order to answer the question <!DIC>. We perform this test in order
         * to avoid taking "priori" as an unknown word if the compound "a priori"
         * is in the text. */
        if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
            set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK);
        }
        if (is_upper(token[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            i=0;
            tmp=0;
            while (token[i]!='\0') {
                if (is_lower(token[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
            return c;
        }
        i=0;
        tmp=0;
        while (token[i]!='\0') {
            if (is_upper(token[i],alph)) {
                tmp=1;
                break;
            }
            i++;
        }
        if (!tmp) {
            set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
        }
        return c;
    }
    /* If the token doesn't start with a letter, we start with
     * checking if it is a tag like {today,.ADV} */
    if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
        /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
        struct dela_entry* temp=tokenize_tag_token(token);
        if (is_upper(temp->inflected[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
        }
        else {
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
            }
        }
        if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) {
            /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
            set_bit_mask(&c,CDIC_TOKEN_BIT_MASK);
        }
        free_dela_entry(temp);
    }
    return c;
}