/**
 * Records one more occurrence of 'key' in 'table'.
 *
 * The value slot of the table is used as a counter stored in a pointer:
 * a key that has no value yet is inserted with the value (void*)1, and
 * for a key that already has a non NULL value, the stored pointer is
 * advanced by one byte.
 */
void add_key_table(const unichar* key,struct string_hash_ptr* table){
    void* current=get_value(key,table);
    if (current==NULL) {
        /* No value yet for this key: we create it with an initial count of 1 */
        get_value_index(key,table,INSERT_IF_NEEDED,(void*)1);
        return;
    }
    /* The key already has a value: we bump the counter kept in its slot */
    int slot=get_value_index(key,table);
    table->value[slot]=(void*)(((char*)current)+1);
}
/**
 * Tests whether the first keyword of 'list' is forbidden according to 'lemmas'.
 *
 * If the keyword sequence contains a '.', only the part located before the
 * last '.' is looked up in 'lemmas'. Otherwise, the whole sequence is looked
 * up. Nothing is ever inserted into 'lemmas'.
 *
 * Returns 1 if the looked up string is present in 'lemmas', 0 otherwise.
 * A NULL list or a NULL sequence yields 0.
 */
int has_forbidden_lemma(KeyWord* list,struct string_hash* lemmas) {
    if (list==NULL) {
        return 0;
    }
    if (list->sequence==NULL) {
        return 0;
    }
    int dot=last_index_of(list->sequence,(unichar)'.');
    if (dot!=-1) {
        /* Keyword of the form XXX.YYY: we only keep XXX for the lookup */
        Ustring* prefix=new_Ustring(list->sequence);
        truncate(prefix,dot);
        int found=get_value_index(prefix->str,lemmas,DONT_INSERT);
        free_Ustring(prefix);
        return found!=-1;
    }
    /* No '.' in the keyword: the sequence itself is tested */
    return (get_value_index(list->sequence,lemmas,DONT_INSERT)!=-1);
}
/**
 * Creates an Elag_fst_file_out structure bound to the file 'fname' opened
 * in write mode.
 *
 * 'type' must satisfy 0 <= type < FST_BAD_TYPE, otherwise fatal_error is
 * called. A placeholder line "0000000000" is written at the beginning of the
 * file, and the position before it is remembered in 'fstart'. The label
 * table is created and EPSILON is registered first in it.
 *
 * Returns the new structure, or NULL (after printing an error message) if
 * the file cannot be opened.
 */
Elag_fst_file_out* fst_file_out_open(const VersatileEncodingConfig* vec,const char* fname,int type) {
    Elag_fst_file_out* out=(Elag_fst_file_out*)malloc(sizeof(Elag_fst_file_out));
    if (out==NULL) {
        fatal_alloc_error("fst_file_out_open");
    }
    if (!(type>=0 && type<FST_BAD_TYPE)) {
        fatal_error("fst_file_out_open: bad FST_TYPE\n");
    }
    out->f=u_fopen(vec,fname,U_WRITE);
    if (out->f==NULL) {
        error("fst_out_open: unable to open '%s'\n",fname);
        free(out);
        return NULL;
    }
    /* We remember where the placeholder line starts before writing it */
    out->fstart=ftell(out->f);
    u_fprintf(out->f,"0000000000\n");
    out->name=strdup(fname);
    if (out->name==NULL) {
        fatal_alloc_error("fst_file_out_open");
    }
    out->type=type;
    out->nb_automata=0;
    out->labels=new_string_hash(16);
    /* EPSILON is registered before any other label, in a table
     * that has just been created */
    get_value_index(EPSILON,out->labels);
    return out;
}
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) { unichar* lower=u_strdup(e->inflected); u_tolower(lower); KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords); free(lower); if (k_inflected==NULL) return; Ustring* tmp=new_Ustring(64); u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]); KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords); if (k_lemma==NULL) { k_lemma=new_KeyWord(0,tmp->str,NULL); k_lemma->lemmatized=LEMMATIZED_KEYWORD; get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma); } /* Now, we look for all the case compatible tokens, and we add * their weights to the new lemmatized element */ while (k_inflected!=NULL) { if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) { /* We have a match */ k_lemma->weight+=k_inflected->weight; k_inflected->lemmatized=1; } k_inflected=k_inflected->next; } free_Ustring(tmp); }
/** * Loads a compound word file, adding each word to the keywords. */ void load_compound_words(char* name,VersatileEncodingConfig* vec, struct string_hash_ptr* keywords) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return; Ustring* line=new_Ustring(256); Ustring* lower=new_Ustring(256); while (EOF!=readline(line,f)) { if (line->str[0]=='{') { /* We skip tags */ continue; } u_strcpy(lower,line->str); u_tolower(lower->str); int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)keywords->value[index]; add_keyword(&value,line->str,1); keywords->value[index]=value; } free_Ustring(line); free_Ustring(lower); u_fclose(f); }
/** * This function optimizes a pattern of the form "eat". */ void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph, struct locate_parameters* p,Abstract_allocator prv_alloc) { /* Whatever happens, this pattern will be turned into a token list */ tag[i]->type=TOKEN_LIST_TAG; unichar* opt_token=tag[i]->pattern->inflected; /* First, we check if this token pattern can recognize some tag tokens */ struct list_int* list=p->tag_token_list; while (list!=NULL) { struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1); if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK) && is_equal_or_uppercase(opt_token,entry->inflected,alph)) || !u_strcmp(opt_token,entry->inflected)) { tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc); } free_dela_entry(entry); list=list->next; } /* Then, we look for normal tokens */ if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) { /* If no case variants are allowed, then we just have to insert the number * of the token, but only if this token in the text ones. */ int token_number; if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) { tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc); } return; } /* Here, we have to get all the case variants of the token. */ tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc); }
/*
** Computes the integer value of an argument of kind 'type' whose raw bytes
** are given in 'values'.
**
** - A_REG: content of the register numbered values[0] (1-based) if
**   is_valid_reg accepts it, 0 otherwise.
** - A_DIR: oct_to_int(values).
** - A_IND: oct_to_int(values) is reduced modulo IDX_MOD, added to proc->pc
**   and wrapped with my_mod on MEM_SIZE; the value is then decoded with
**   oct_to_int from what get_value_index returns for this address.
** - any other type: 0.
*/
int get_val(char type, char *values, t_arena *arena, t_proc *proc)
{
  t_conv result;

  result.integer = 0;
  if (type == A_REG)
    {
      if (!is_valid_reg(type, values[0]))
        result.integer = 0;
      else
        result.integer = proc->reg[values[0] - 1];
    }
  if (type != A_DIR && type != A_IND)
    return (result.integer);
  result.integer = oct_to_int(values);
  if (type == A_IND)
    {
      int target;
      char *stored;

      target = my_mod((proc->pc + (result.integer % IDX_MOD)), MEM_SIZE);
      stored = get_value_index(&target, arena);
      result.integer = oct_to_int(stored);
    }
  return (result.integer);
}
/** * Loads the initial keyword list from a tok_by_freq.txt file, * and turns all those tokens in a list whose primary key is the * lower case token: * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50) */ struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return NULL; Ustring* line=new_Ustring(128); Ustring* lower=new_Ustring(128); struct string_hash_ptr* res=new_string_hash_ptr(1024); int val,pos; /* We skip the first line of the file, containing the number * of tokens */ if (EOF==readline(line,f)) { fatal_error("Invalid empty file %s\n",name); } while (EOF!=readline(line,f)) { if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) { fatal_error("Invalid line in file %s:\n%S\n",name,line->str); } u_strcpy(lower,line->str+pos); u_tolower(lower->str); int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)res->value[index]; res->value[index]=new_KeyWord(val,line->str+pos,value); } free_Ustring(line); free_Ustring(lower); u_fclose(f); return res; }
/**
 * Pushes a new element for 'variable' at the head of '*variable_list'.
 *
 * The element is built from the index of 'variable' in var->variable_index
 * (looked up without insertion) and from 'transition'. No check is done to
 * see whether the variable already has a transition in the list, since it
 * cannot happen if the grammar is deterministic.
 */
void add_output_variable(OutputVariables* var,unichar* variable,Transition* transition,
                         struct opt_variable** variable_list,Abstract_allocator prv_alloc) {
    int index=get_value_index(variable,var->variable_index,DONT_INSERT);
    struct opt_variable* head=new_opt_variable(index,transition,prv_alloc);
    /* Head insertion */
    head->next=*variable_list;
    *variable_list=head;
}
/**
 * Writes two transitions going to state 'to' into the output file of 'fstf':
 * one labelled "<MOT>" and one labelled "<!MOT>", in that order. Each label
 * is registered in fstf->labels to get the number that is written.
 */
void LEXIC_trans_write(Elag_fst_file_out * fstf, int to) {
    static const char* const lexical_tags[]={"<MOT>","<!MOT>"};
    unichar label[8];
    for (int k=0;k<2;k++) {
        u_strcpy(label,lexical_tags[k]);
        int idx=get_value_index(label,fstf->labels);
        u_fprintf(fstf->f, "%d %d ", idx, to);
    }
}
/**
 * Deprecated: computes the predicate value for every element of 'o'.
 *
 * Returns an empty list if 'o' is empty. Otherwise, the model is taken from
 * the first element, and the i-th returned value is the result of
 * get_value_index for the index of the i-th element.
 */
Ints ClassnamePredicate::get_value(const PLURALVARIABLETYPE &o) const {
  IMPKERNEL_DEPRECATED_METHOD_DEF(2.1, "Use index version");
  if (o.empty()) {
    return Ints();
  }
  Ints values(o.size());
  Model *model = internal::get_model(o[0]);
  unsigned int k = 0;
  while (k < o.size()) {
    values[k] += get_value_index(model, internal::get_index(o[k]));
    ++k;
  }
  return values;
}
/** * Loads the lines of a text file into a string_hash and returns it, or NULL * if the file can not be opened. We arbitrary fix the limit of a line to 4096 * characters. Each line is splitted into a key and a value, according to a * given separator character. An error message will be printed if a line does not * contain the separator character, if an empty line is found, or if a line contains * an empty key. In case of empty values, the empty string will be used. * Note that keys and values can contain characters protected with the \ character, * including protected new lines like: * * 123\ * =ONE_TWO_THREE_NEW_LINE * */ struct string_hash* load_key_value_list(const char* name,int mask_encoding_compatibility_input,unichar separator) { U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ); if (f==NULL) return NULL; struct string_hash* hash=new_string_hash(); unichar temp[4096]; unichar key[4096]; unichar value[4096]; /* We build a string with the separator character */ unichar stop[2]; stop[0]=separator; stop[1]='\0'; int code; while (EOF!=(code=u_fgets2(temp,f))) { if (code==0) { error("Empty line\n"); } else { /* First, we try to read a non empty key */ int pos=0; code=parse_string(temp,&pos,key,stop); if (code==P_BACKSLASH_AT_END) { error("Backslash at end of line:<%S>\n\n",temp); } else if (pos==0 &&temp[pos]=='\0') { /* Empty line */ continue; } else if (pos==0) { /* If the line starts with the separator */ error("Line with empty key:\n<%S>\n",temp); } else { /* We jump over the separator */ pos++; /* We initialize 'value' with the empty string in case it is not * defined in the file */ value[0]='\0'; if(P_BACKSLASH_AT_END==parse_string(temp,&pos,value,P_EMPTY)) { error("Backslash at end of line:\n<%S>\n",temp); } else { /* If we have a valid (key,value) pair, we insert it into the string_hash */ get_value_index(key,hash,INSERT_IF_NEEDED,value); } } } } u_fclose(f); return hash; }
/** * We remove every keyword that is tagged with the forbidden code. If * a forbidden keyword has several tags, all of them are removed: * * the,.DET + the,.XXX => all 'the' keywords are removed */ struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) { struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE); Ustring* tmp=new_Ustring(); for (int i=0;i<keywords->size;i++) { KeyWord* list=(KeyWord*)(keywords->value[i]); while (list!=NULL) { if (get_forbidden_keyword(list,code,tmp)) { get_value_index(tmp->str,hash); } list=list->next; } } free_Ustring(tmp); return hash; }
/** * Loads the lines of a text file info a string_hash and returns it, or NULL * if the file can not be opened. We arbitrary fix the limit of a line to 4096 * characters. For each line, we ignore the carriage return, if any, and we use * the remaining string as key and value. An error message will be printed if * an empty line is found. */ struct string_hash* load_key_list(const char* name,int mask_encoding_compatibility_input) { U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ); if (f==NULL) return NULL; struct string_hash* hash=new_string_hash(DONT_USE_VALUES); unichar temp[4096]; while (EOF!=u_fgets_limit2(temp,4096,f)) { if (temp[0]=='\0') { error("Empty line in %s\n",name); } else { get_value_index(temp,hash); } } u_fclose(f); return hash; }
/**
 * Writes ten transitions going to state 'to' into the output file of
 * 'fstf', one for each digit from '0' to '9'. Each one-character label is
 * registered in fstf->labels to get the number that is written.
 */
void CHFA_trans_write(Elag_fst_file_out * fstf, int to) {
    unichar label[2];
    label[1] = 0;
    for (int digit = 0; digit < 10; digit++) {
        label[0] = (unichar)('0' + digit);
        int idx = get_value_index(label,fstf->labels);
        u_fprintf(fstf->f, "%d %d ", idx, to);
    }
}
/** * Adds the given DELA entry to the given tree. If the entry is already * present in the tree, then it is freed. Otherwise, it is put in the tree * so that IT MUST NOT BE FREED! */ void add_entry(struct DELA_tree* tree,struct dela_entry* entry) { int n=get_value_index(entry->inflected,tree->inflected_forms); if (n==tree->size) { /* If there was no entry list for the given inflected form */ if (n==tree->capacity) { /* If we must double the array capacity */ tree->capacity=2*tree->capacity; tree->dela_entries=(struct dela_entry_list**)realloc(tree->dela_entries,tree->capacity*sizeof(struct dela_entry_list*)); if (tree->dela_entries==NULL) { fatal_alloc_error("add_entry"); } } tree->dela_entries[n]=NULL; (tree->size)++; } tree->dela_entries[n]=insert_if_not_present(entry,tree->dela_entries[n]); }
/** * This function takes a unicode string 'word' representing a compound word, and * tokenizes it into tokens. The output is an array 'tokens' that contains the * numbers of the tokens that constitute the word. If case variants are allowed, * a token can be replaced by a token list delimited by the special values * BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The token list is ended * by END_TOKEN_LIST. * * The array 'tokens' is supposed to be large enough. 'tok' represents the text tokens. * 'tokenization_mode' indicates if the word must be tokenized character by character * or not. */ void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet, struct string_hash* tok,TokenizationPolicy tokenization_mode) { int n_token,j; struct list_ustring* list=tokenize(word,tokenization_mode,alphabet); struct list_ustring* tmp; struct list_int* ptr; n_token=0; while (list!=NULL) { j=get_value_index(list->string,tok,DONT_INSERT); /* If a token of a compound word is not a token of the text, * we MUST NOT ignore it. For instance, if we have the compound * word "a priori" and if the text only contains "PRIORI", it is not * an error case. The error case is when there is no case equivalent of * "priori" in the text. In such a situation, we traduce it by an empty * list. We don't raise an error because if there is by accident a token * in a dictionary that is not in the text, it would block the Locate * without necessity. */ if (is_letter(list->string[0],alphabet) || j==-1) { /* If the current token is made of letters, we look for all * its case variants. If we have a non letter token that is * not in the text tokens, we handle it here to produce an * empty case variant list. */ tokens[n_token++]=BEGIN_CASE_VARIANT_LIST; ptr=get_token_list_for_sequence(list->string,alphabet,tok); struct list_int* ptr_copy=ptr; // s.n. while (ptr!=NULL) { j=ptr->n; tokens[n_token++]=j; ptr=ptr->next; } free_list_int(ptr_copy); // s.n. 
tokens[n_token++]=END_CASE_VARIANT_LIST; } else { /* If we have a non letter single character, we just add its number to * the token array */ tokens[n_token++]=j; } tmp=list; list=list->next; free_list_ustring_element(tmp); } /* Finally, we end the token list. */ tokens[n_token]=END_TOKEN_LIST; }
/**
 * Writes into the output file of 'fstf' one transition going to state 'to'
 * for each character of PUNC_TAB, except '{'. Each one-character label is
 * registered in fstf->labels to get the number that is written.
 */
void PNC_trans_write(Elag_fst_file_out * fstf, int to) {
    unichar label[4];
    label[1] = 0;
    const unichar * cursor = PUNC_TAB;
    while (*cursor) {
        unichar c = *cursor;
        cursor++;
        if (c == '{') {
            /* '{' is the only character that gets no transition */
            continue;
        }
        label[0] = c;
        int idx = get_value_index(label,fstf->labels);
        u_fprintf(fstf->f, "%d %d ", idx, to);
    }
}
/** * This function constructs and returns a token tree from a normalization grammar. * Tokens are represented by integers. */ struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar, const Alphabet* alph,struct text_tokens* tok) { struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free); if (fst2==NULL) { return NULL; } struct string_hash* hash=new_string_hash(DONT_USE_VALUES); /* We create the token tree to speed up the consultation */ for (int i=0;i<tok->N;i++) { get_value_index(tok->token[i],hash); } struct normalization_tree* root=new_normalization_tree(); explore_normalization_fst2(fst2,fst2->initial_states[1],root,hash,U_EMPTY,alph,NULL); free_abstract_Fst2(fst2,&fst2_free); free_string_hash(hash); return root; }
/**
 * Loads the tags of the given .fst2 file. Returns 0 in case of success; -1 otherwise.
 *
 * The function rewinds the file, skips fst->nb_automata automata (each one
 * being ended by a line that starts with 'f' followed by a space), and then
 * reads the symbol lines until a line starting with 'f' is found. Each
 * symbol line must start with '%' or '@'; what follows this character is
 * loaded with load_grammar_symbol and stored in fst->symbols.
 *
 * Note that the position in the file is unchanged after a call to this
 * function, including when -1 is returned.
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
    /* We backup the position in the file, and we come back at the
     * beginning of the file */
    long fpos=ftell(fst->f);
    int status=0;
    /* Declared here so that the jumps to 'done' do not bypass its initialization */
    Ustring* ustr=NULL;
    unichar buf[MAXBUF];
    int i=0;
    int len;
    rewind(fst->f);
    /* Now, we go to the tags section, skipping all the automata */
    while (i<fst->nb_automata) {
        if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
            error("load_fst_tags: %s: unexpected EOF\n",fst->name);
            status=-1;
            goto done;
        }
        if (buf[0]=='f' && isspace(buf[1])) {
            i++;
        }
        /* If we have read the beginning of a long line, we skip the rest of the line */
        while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
            len=u_fgets(buf,MAXBUF,fst->f);
        }
    }
    ustr=new_Ustring(64);
    /* NOTE(review): the loop also stops when readline returns 0, and the test
     * after the loop then reports an unexpected EOF; confirm that this matches
     * the values readline actually returns at end of file */
    while (readline(ustr,fst->f) && ustr->str[0]!='f') {
        if (ustr->str[0]!='%' && ustr->str[0]!='@') {
            error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
            status=-1;
            goto done;
        }
        /* +1 because we ignore the % or @ at the beginning of the line */
        symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
        /* If 'symbol' is NULL, then an error message has already
         * been printed. Moreover, we want to associate NULL to the
         * string, so that we don't exit the function. Whatever it is,
         * we add the symbol to the symbols of the .fst2 */
        get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
    }
    if (*ustr->str==0) {
        fatal_error("load_fst_tags: unexpected EOF\n");
    }
done:
    /* Common exit: the line buffer is released and the position in the file
     * is set back, whether the tags were loaded or not */
    if (ustr!=NULL) {
        free_Ustring(ustr);
    }
    fseek(fst->f,fpos,SEEK_SET);
    return status;
}
static int get_value_index_for_string_colon_string(const unichar* str1,const unichar* str2,struct string_hash* hash) { int value; unichar*allocated_buffer = NULL; unichar tmp_default[DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE]; unichar*tmp=tmp_default; int nb_unichar_buffer=u_strlen(str1)+u_strlen(str2)+2; if (nb_unichar_buffer>DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE) { tmp=allocated_buffer=(unichar*)malloc(sizeof(unichar*)*nb_unichar_buffer); if (allocated_buffer==NULL) { fatal_alloc_error("get_value_index_for_string_colon_string"); } } u_sprintf(tmp,"%S,%S",str1,str2); value=get_value_index(tmp,hash); if (allocated_buffer != NULL) { free(allocated_buffer); } return value; }
/** * This function adds the given token to the given token tree, if not already * present. Then, it adds the given transition to its transition list. */ void add_tag(unichar* token,int tag_number,int dest_state,struct fst2txt_token_tree* tree, Abstract_allocator prv_alloc) { int n=get_value_index(token,tree->hash); if (n==tree->size) { /* If we have to create a new transition list because the token was not already in * the tree. */ if (tree->size==tree->capacity) { /* If necessary, we double the size of the transition array */ tree->capacity=2*tree->capacity; tree->transition_array=(Transition**)realloc_cb(tree->transition_array,(tree->capacity/2)*sizeof(Transition*),tree->capacity*sizeof(Transition*),prv_alloc); if (tree->transition_array==NULL) { fatal_alloc_error("add_tag"); } } (tree->size)++; /* We don't forget to initialize the new transition list */ tree->transition_array[n]=NULL; } /* We add the new transition, assuming that it is not already in the list, becauses * it would mean that the fst2 is not deterministic. */ tree->transition_array[n]=new_Transition(tag_number,dest_state,tree->transition_array[n],prv_alloc); }
/** * This function explores a dictionary tree in order to insert an entry. * 'inflected' is the inflected form to insert, and 'pos' is the current position * in the string 'inflected'. 'node' is the current node in the dictionary tree. * 'infos' is used to access to constant parameters. */ static void add_entry_to_dictionary_tree(const unichar* inflected,int pos,struct dictionary_node* node, struct info* infos,int /*line*/, Abstract_allocator prv_alloc) { for (;;) { if (inflected[pos]=='\0') { /* If we have reached the end of 'inflected', then we are in the * node where the INF code must be inserted */ int N=get_value_index(infos->INF_code,infos->INF_code_list); if (node->single_INF_code_list==NULL) { /* If there is no INF code in the node, then * we add one and we return */ node->single_INF_code_list=new_list_int(N,prv_alloc); node->INF_code=N; return; } /* If there is an INF code list in the node ...*/ if (is_in_list(N,node->single_INF_code_list)) { /* If the INF code has already been taken into account for this node * (case of duplicates), we do nothing */ return; } /* Otherwise, we add it to the INF code list */ node->single_INF_code_list=head_insert(N,node->single_INF_code_list,prv_alloc); /* And we update the global INF line for this node */ node->INF_code=get_value_index_for_string_colon_string(infos->INF_code_list->value[node->INF_code],infos->INF_code,infos->INF_code_list); return; } /* If we are not at the end of 'inflected', then we look for * the correct outgoing transition and we follow it */ struct dictionary_node_transition* t=get_transition(inflected[pos],&node,prv_alloc); if (t->node==NULL) { /* We create the node if necessary */ t->node=new_dictionary_node(prv_alloc); (t->node->incoming)++; } node=t->node; pos++; } }
/**
 * Adds transitions from state 'from' to state 'to' in 'automaton'.
 *
 * If 'label' is SYMBOL_DEF, 'to' becomes the default state of 'from'; it is
 * a fatal error if 'from' already has one. Otherwise, one outgoing
 * transition is added for each symbol of the list 'label'. Each symbol is
 * registered in 'symbols' under its string representation, and the number
 * obtained this way is used as the tag of the transition. Finding
 * SYMBOL_DEF inside the list is a fatal error.
 */
void add_transition(SingleGraph automaton,struct string_hash_ptr* symbols,int from,
                    symbol_t* label,int to) {
    if (label==SYMBOL_DEF) {
        if (automaton->states[from]->default_state!=-1) {
            fatal_error("add_transition: more than one default transition\n");
        }
        automaton->states[from]->default_state=to;
        return;
    }
    for (symbol_t* s=label;s!=NULL;s=s->next) {
        if (s==SYMBOL_DEF) {
            fatal_error("add_transition: unexpected default transition\n");
        }
        /* The string representation of the symbol is used as key, to avoid
         * duplicates in the value array */
        Ustring* text=new_Ustring();
        symbol_to_str(s,text);
        int n=get_value_index(text->str,symbols,INSERT_IF_NEEDED,s);
        free_Ustring(text);
        add_outgoing_transition(automaton->states[from],n,to);
    }
}
/** * Tests if s is a code pattern (V:Kms, N+Hum, ...). * 'semantic_codes' is a string_hash that contains all the possible * grammatical/semantic codes. If NULL, the return value can be * AMBIGUOUS_PATTERN if there no indication that helps to guess if * we have a code or a lemma. */ enum pattern_type is_code_pattern(const unichar* s,struct string_hash* semantic_codes,int tilde_negation_operator) { if ((s==NULL)||(s[0]=='\0')) { fatal_error("NULL or empty pattern in is_code_pattern\n"); } int i=0; unichar tmp[2048]; if (P_BACKSLASH_AT_END==parse_string(s,&i,tmp,tilde_negation_operator ? P_PLUS_TILDE_COLON : P_PLUS_MINUS_COLON)) { fatal_error("Backslash at end of a pattern\n"); } /* If we have found '+' '~' (or '-' is tilde_negation_operator==0) or ':', then we have a code pattern */ if (s[i]!='\0') { return CODE_PATTERN; } /* If we have no grammatical codes, we can't decide */ if (semantic_codes==NULL) { return AMBIGUOUS_PATTERN; } /* Otherwise, we test if the string is a grammatical or semantic code */ if (get_value_index(s,semantic_codes,DONT_INSERT)!=-1) { return CODE_PATTERN; } return LEMMA_PATTERN; }
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * The function processes the item number 'current_item' of 'items' and calls
 * itself on 'current_item+1'. When 'current_item' equals items->nbelems, the
 * content of 's' is used as the output of 'element', which is added to
 * infos->matches; element->output is set back to NULL afterwards.
 *
 * 's' is the output string being built. Its length is saved on entry and
 * restored before each text tag of the item is explored.
 *
 * 'last_text_dependent_tfst_tag' is the number of the last text dependent
 * tfst tag that was met, or -1 if there is none.
 *
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 *
 * For each text tag of the item, a value of -1 denotes a text independent
 * match, which is handled according to the type of the fst2 tag (output
 * variable start/end, variable start/end, left context). Any other value is
 * the number of a tfst tag. If infos->ambiguous_output_policy is
 * IGNORE_AMBIGUOUS_OUTPUTS, only the first text tag is explored.
 *
 * When the fst2 tag output is a capture variable, the previous value of the
 * dictionary variable is cloned on entry and set back before returning.
 *
 * NOTE(review): in the BEGIN_VAR_TAG branch, '*var_starts' is assigned in the
 * same statement that tests 'var_starts==NULL', so a NULL 'var_starts' would
 * be dereferenced there; confirm whether callers may pass NULL.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos, struct tfst_simple_match_list* element, vector_ptr* items,int current_item,Ustring* s, int last_text_dependent_tfst_tag, struct list_pointer* *var_starts) { if (current_item==items->nbelems) { /* If we have finished, we can save the current output */ element->output=s->str; infos->matches=add_element_to_list(infos,infos->matches,element); element->output=NULL; return; } /* We save the length because it will be modified */ int len=s->len; struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]); if (item==NULL) { fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n"); } if (item->debug_output!=NULL) { /* If we have a debug output, we deal it */ u_strcat(s,item->debug_output); explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts); s->len=len; s->str[len]='\0'; return; } unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output; unichar name[MAX_TRANSDUCTION_VAR_LENGTH]; int capture; struct dela_entry* old_value_dela=NULL; capture=is_capture_variable(output,name); if (capture) { /* If we have a capture variable $:X$, we must save the previous value * for this dictionary variable */ old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables)); } Match saved_element=element->m; struct list_int* text_tags=item->text_tag_numbers; int captured_chars=0; /* We explore all the text tags */ while (text_tags!=NULL) { /* First, we restore the output string */ s->len=len; s->str[len]='\0'; captured_chars=0; /* We deal with the fst2 tag output, if any */ if (item->first_time) { /* We only have to process the output only once, * since it will 
have the same effect on all tfst tags. * * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */ if (capture) { /* If we have a capture variable, then we have to check whether the tfst tag * is a tagged token or not */ int tfst_tag_number=text_tags->n; int fst2_tag_number=item->fst2_transition->tag_number; if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) { goto restore_dic_variable; } } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) { /* We do not take into account matches with variable errors if the * process_output_for_tfst_match function has decided that backtracking * was necessary, either because of a variable error of because of a * $a.SET$ or $a.UNSET$ test */ goto restore_dic_variable; } } int last_tag=last_text_dependent_tfst_tag; TfstTag* current_tag=NULL; if (text_tags->n==-1) { /* We have a text independent match */ Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number]; if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) { /* If we an output variable start $|a( */ int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index); Ustring* old_value = new_Ustring(); swap_output_variable_content(infos->output_variables, var_index, old_value); // now old_value contain the backup set_output_variable_pending(infos->output_variables,fst2_tag->variable); explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); unset_output_variable_pending(infos->output_variables,fst2_tag->variable); // restore the good content from backup swap_output_variable_content(infos->output_variables, var_index, old_value); free_Ustring(old_value); goto restore_dic_variable; } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) { /* If we an output variable end $|a) */ unset_output_variable_pending(infos->output_variables,fst2_tag->variable); 
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); set_output_variable_pending(infos->output_variables,fst2_tag->variable); goto restore_dic_variable; } else if (fst2_tag->type==BEGIN_VAR_TAG) { /* If we have a variable start tag $a(, we add it to our * variable tag list */ struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable); int old_value=v->start_in_tokens; /* We add the address of the start field to our list */ (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts)); /* Then, we go on the next item */ explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); /* After the exploration, there are 2 cases: * 1) *var_starts is NULL: nothing to do * 2) *var_starts is not NULL: we reached the end of the items without findind any * text dependent match, so we can free the list */ free_list_pointer(*var_starts); (*var_starts)=NULL; v->start_in_tokens=old_value; /* If we have a $a( tag, we know that we can only have just one text tag * with special value -1 */ goto restore_dic_variable; } else if (fst2_tag->type==END_VAR_TAG) { /* If we have found a $a) tag */ if (last_tag==-1) { /* If we have no tfst tag to use, then it's a variable definition error, * and we have nothing special to do */ explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); goto restore_dic_variable; } else { /* We can set the end of the variable, it's 'last_tag' */ struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable); int old_value=v->end_in_tokens; v->end_in_tokens=last_tag; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); v->end_in_tokens=old_value; goto restore_dic_variable; } } else if (fst2_tag->type==LEFT_CONTEXT_TAG) { /* If we have found a $* tag, we must reset the stack string and 
the * start position, so we save them */ unichar* old_stack=u_strdup(s->str); int old_pos_token=element->m.start_pos_in_token; int old_pos_char=element->m.start_pos_in_char; int old_pos_letter=element->m.start_pos_in_letter; /* We set the new values */ empty(s); element->m.start_pos_in_token=LEFT_CONTEXT_PENDING; /* We must reset last_tag to -1, because is not, we will have an * extra space on the left of the match */ explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts); /* And we restore previous values */ element->m.start_pos_in_token=old_pos_token; element->m.start_pos_in_char=old_pos_char; element->m.start_pos_in_letter=old_pos_letter; u_strcpy(s,old_stack); free(old_stack); /* If we have a $* tag, we know that we can only have just one text tag * with special value -1 */ goto restore_dic_variable; } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) { fatal_error("problem $[\n"); } } else { current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]); /* We update the last tag */ last_tag=text_tags->n; /* If the current text tag is not a text independent one */ /* If there are some pending $a( tags, we set them to the current tag */ if (var_starts!=NULL) { struct list_pointer* ptr=(*var_starts); while (ptr!=NULL) { int* start=(int*)(ptr->pointer); (*start)=text_tags->n; ptr=ptr->next; } } int previous_start_token,previous_start_char; if (last_text_dependent_tfst_tag!=-1) { /* If the item is not the first, we must insert the original text that is * between the end of the previous merged text and the beginning of the * current one, typically to insert spaces */ TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]); previous_start_token=previous_tag->m.end_pos_in_token; previous_start_char=previous_tag->m.end_pos_in_char; /* We start just after the end of the previous match */ if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') { /* If we were not at the end 
of the previous text token, we just inscrease * the char position */ previous_start_char++; } else { /* Otherwise, we go on the next token */ previous_start_token++; previous_start_char=0; } } else { /* Otherwise, we start on the beginning of the current text tag */ //error("current item=%d\n",text_tags->n); previous_start_token=current_tag->m.start_pos_in_token; previous_start_char=current_tag->m.start_pos_in_char; } /* Here we have to insert the text that is between current_start and current_end, * and then, the ouput of the fst2 transition */ if (infos->output_policy==MERGE_OUTPUTS) { insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char, current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char); } } /* Then, we go on the next item */ struct list_pointer* ptr2=NULL; if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) { element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token; element->m.start_pos_in_char=current_tag->m.start_pos_in_char; element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter; } explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag ,&ptr2 /* We have encountered a text dependent tag, so there is no * more pending start tag like $a( */ ); element->m=saved_element; /* If there was a $* tag pending */ free_list_pointer(ptr2); if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) { /* If we don't want ambiguous outputs, then the first path is * enough for our purpose */ goto restore_dic_variable; } text_tags=text_tags->next; remove_chars_from_output_variables(infos->output_variables,captured_chars); /* We reset to 0, because if we exit the while normally, we don't want to * modify output variables twice when reaching the 'restore_dic_variable' * label */ captured_chars=0; } restore_dic_variable: /* We redo this about output variables here, since we may have jumped here directly */ 
remove_chars_from_output_variables(infos->output_variables,captured_chars); if (capture) { /* If we have a capture variable $:X$, we must restore the previous value * for this dictionary variable */ set_dic_variable(name,old_value_dela,&(infos->dic_variables),0); } }
} return; } /* Here, we have to get all the case variants of the token. */ tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc); } /** * This function checks if a pattern of the form "<eat>", "<eat.V>" or "<eaten,eat.V>" * can match the given tag token like "{today,.ADV}". */ void optimize_full_pattern_for_tag(unichar* tag_token,int i,Fst2Tag* tag,Alphabet* alph, struct locate_parameters* p,Abstract_allocator prv_alloc) { DISCARD_UNUSED_PARAMETER(alph) int token_number=get_value_index(tag_token,p->tokens); struct dela_entry* entry=tokenize_tag_token(tag_token,1); struct pattern* pattern=tag[i]->pattern; if ((pattern->type==LEMMA_PATTERN) || (pattern->type==INFLECTED_AND_LEMMA_PATTERN)) { /* If the pattern has a constraint on the lemma, we check it */ if (u_strcmp(entry->lemma,pattern->lemma)) { free_dela_entry(entry,prv_alloc); return; } } if ((pattern->type==LEMMA_AND_CODE_PATTERN) || (pattern->type==FULL_PATTERN)) { /* If the pattern contains a constraint on grammatical/semantic/inflectional * codes, then it has been put in the pattern tree, and so, this pattern * was tried on the current tag token in the 'check_patterns_for_tag_tokens' * function. Then, we just have to test if the tag token matches this pattern. */ if (p->matching_patterns==NULL || p->matching_patterns[token_number]==NULL ||
/**
 * Appends the automaton A as a new graph to the .fst2 output described by
 * fstf, then increments the automaton counter of fstf.
 *
 * Every transition symbol is converted to a textual label with the converter
 * that corresponds to fstf->type; an unknown type is reported through
 * fatal_error. For FST_LOCATE outputs, the labels "<PNC>", "<CHFA>", "<NB>"
 * and "<.>" are delegated to dedicated writers instead of being printed as a
 * regular "label_index target_state" pair.
 */
void fst_file_write(Elag_fst_file_out* fstf,const Fst2Automaton* A) {
Ustring* tag_text=new_Ustring();
/* We pick the symbol -> label converter according to the kind of output */
void (*to_label)(const symbol_t*,Ustring*)=NULL;
if (fstf->type==FST_TEXT) {
   to_label=symbol_to_text_label;
} else if (fstf->type==FST_GRAMMAR) {
   to_label=symbol_to_grammar_label;
} else if (fstf->type==FST_LOCATE) {
   to_label=symbol_to_locate_label;
} else {
   fatal_error("fst_file_write: invalid fstf->type: %d\n",fstf->type);
}
/* Graph header: number and name of the graph */
u_fprintf(fstf->f,"-%d %S\n",fstf->nb_automata+1,A->name);
unichar default_label[]={'<','d','e','f','>',0};
for (int state_no=0;state_no<A->automaton->number_of_states;state_no++) {
   SingleGraphState src=A->automaton->states[state_no];
   /* A state line starts with 't' if the state is final, ':' otherwise */
   const char marker=is_final_state(src)?'t':':';
   u_fprintf(fstf->f,"%C ",marker);
   Transition* trans=src->outgoing_transitions;
   while (trans!=NULL) {
      if (trans->tag_number==-1) {
         /* "EMPTY" transition created because the automaton was
          * emptied at trim time */
         u_strcpy(tag_text,"EMPTY");
      } else {
         symbol_t* sym=trans->label;
         to_label(sym,tag_text);
      }
      /* For a Locate .fst2, some labels are handled by special writers;
       * 'handled' tells whether one of them took care of the transition */
      int handled=0;
      if (fstf->type==FST_LOCATE) {
         handled=1;
         if (u_strcmp(tag_text->str,"<PNC>")==0) {
            PNC_trans_write(fstf,trans->state_number);
         } else if (u_strcmp(tag_text->str,"<CHFA>")==0
                    || u_strcmp(tag_text->str,"<NB>")==0) {
            CHFA_trans_write(fstf,trans->state_number);
         } else if (u_strcmp(tag_text->str,"<.>")==0) {
            LEXIC_trans_write(fstf,trans->state_number);
         } else {
            handled=0;
         }
      }
      if (!handled) {
         /* Normal transition: label index followed by the target state */
         const int label_index=get_value_index(tag_text->str,fstf->labels);
         u_fprintf(fstf->f,"%d %d ",label_index,trans->state_number);
      }
      trans=trans->next;
   }
   if (src->default_state!=-1) {
      /* A default transition is only expected in a grammar automaton; we
       * complain otherwise, but we still write it */
      if (fstf->type!=FST_GRAMMAR) {
         error("Unexpected <def> label in text/locate automaton\n");
      }
      const int def_index=get_value_index(default_label,fstf->labels);
      u_fprintf(fstf->f,"%d %d ",def_index,src->default_state);
   }
   u_fputc('\n',fstf->f);
}
/* Line that closes the list of states of the graph */
u_fprintf(fstf->f,"f \n");
free_Ustring(tag_text);
fstf->nb_automata++;
}
/**
 * Deprecated overload of get_value() taking the argument 'vt' directly.
 *
 * The method is flagged through IMPKERNEL_DEPRECATED_METHOD_DEF (version 2.1,
 * message "Use index version") and then simply forwards to get_value_index(),
 * passing the model and the index obtained from 'vt' with
 * internal::get_model() and internal::get_index(). The value returned by
 * get_value_index() is returned unchanged.
 *
 * NOTE(review): 'ClassnamePredicate' and 'ARGUMENTTYPE' look like placeholder
 * names substituted by a code generator or macro layer -- confirm.
 */
int ClassnamePredicate::get_value(ARGUMENTTYPE vt) const { IMPKERNEL_DEPRECATED_METHOD_DEF(2.1, "Use index version"); return get_value_index(internal::get_model(vt), internal::get_index(vt)); }
/** * Returns a control byte that represents the characteristics of the given token. */ unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) { int i; int tmp; unsigned char c=0; if (token==NULL || token[0]=='\0') { fatal_error("NULL or empty token in get_control_byte\n"); } /* We consider that a token starting with a letter is a word */ if (is_letter(token[0],alph)) { set_bit_mask(&c,MOT_TOKEN_BIT_MASK); /* If a token is a word, we check if it is in the 'err' word list * in order to answer the question <!DIC>. We perform this test in order * to avoid taking "priori" as an unknown word if the compound "a priori" * is in the text. */ if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) { set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK); } if (is_upper(token[0],alph)) { set_bit_mask(&c,PRE_TOKEN_BIT_MASK); i=0; tmp=0; while (token[i]!='\0') { if (is_lower(token[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MAJ_TOKEN_BIT_MASK); } return c; } i=0; tmp=0; while (token[i]!='\0') { if (is_upper(token[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MIN_TOKEN_BIT_MASK); } return c; } /* If the token doesn't start with a letter, we start with * checking if it is a tag like {today,.ADV} */ if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) { /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */ set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK); struct dela_entry* temp=tokenize_tag_token(token); if (is_upper(temp->inflected[0],alph)) { set_bit_mask(&c,PRE_TOKEN_BIT_MASK); i=0; tmp=0; while (temp->inflected[i]!='\0') { if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MAJ_TOKEN_BIT_MASK); } } else { i=0; tmp=0; while (temp->inflected[i]!='\0') { if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) { tmp=1; 
break; } i++; } if (!tmp) { set_bit_mask(&c,MIN_TOKEN_BIT_MASK); } } if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) { /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */ set_bit_mask(&c,CDIC_TOKEN_BIT_MASK); } free_dela_entry(temp); } return c; }