コード例 #1
0
/**
 * Loads a compound word file, adding each word to the keywords.
 */
void load_compound_words(char* name,VersatileEncodingConfig* vec,
		struct string_hash_ptr* keywords) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return;
Ustring* line=new_Ustring(256);
Ustring* lower=new_Ustring(256);
while (EOF!=readline(line,f)) {
	if (line->str[0]=='{') {
		/* We skip tags */
		continue;
	}
	u_strcpy(lower,line->str);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)keywords->value[index];
	add_keyword(&value,line->str,1);
	keywords->value[index]=value;
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
}
コード例 #2
0
/**
 * Loads the initial keyword list from a tok_by_freq.txt file,
 * and turns all those tokens in a list whose primary key is the
 * lower case token:
 * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50)
 */
struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return NULL;
Ustring* line=new_Ustring(128);
Ustring* lower=new_Ustring(128);
struct string_hash_ptr* res=new_string_hash_ptr(1024);
int val,pos;
/* We skip the first line of the file, containing the number
 * of tokens
 */
if (EOF==readline(line,f)) {
	fatal_error("Invalid empty file %s\n",name);
}
while (EOF!=readline(line,f)) {
	if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) {
		fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
	}
	u_strcpy(lower,line->str+pos);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)res->value[index];
	res->value[index]=new_KeyWord(val,line->str+pos,value);
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
return res;
}
コード例 #3
0
/**
 * This function moves outputs from final nodes to transitions leading to final nodes.
 */
static void subsequential_to_normal_transducer(struct dictionary_node* root,
		struct dictionary_node* node,
		struct string_hash* inf_codes,
		int pos,unichar* z,
		Ustring* normalizedOutput) {
struct dictionary_node_transition* tmp=node->trans;
int prefix_set=0;
Ustring* prefix=new_Ustring();
while (tmp!=NULL) {
	z[pos]=tmp->letter;
	z[pos+1]='\0';
	subsequential_to_normal_transducer(root,tmp->node,inf_codes,pos+1,z,normalizedOutput);
	/* First, if the destination state is final, we place its output on the output
	 * of the current transition */

	if (tmp->node->single_INF_code_list!=NULL) {
		//error("<%S>: output=<%S>\n",z,normalizedOutput->str);
		tmp->output=u_strdup(inf_codes->value[tmp->node->INF_code]);
	}
	if (normalizedOutput->len!=0) {
		/* Then, we add the normalized output obtained recursively, if any */
		//error("<%S>: moving normalized output <%S>\n",z,normalizedOutput->str);
		if (tmp->output==NULL) {
			tmp->output=u_strdup(normalizedOutput->str);
		} else {
			tmp->output=(unichar*)realloc(tmp->output,sizeof(unichar)*(1+normalizedOutput->len+u_strlen(tmp->output)));
		}
	}
	if (!prefix_set) {
		prefix_set=1;
		u_strcpy(prefix,tmp->output);
	} else {
		get_longest_common_prefix(prefix,tmp->output);
	}
	tmp=tmp->next;
}
if (node==root || node->single_INF_code_list!=NULL) {
	/* If we are in the initial state or a final one, we let the transitions as they are, since
	 * their outputs can not move more to the left */
	z[pos]='\0';
	free_Ustring(prefix);
	empty(normalizedOutput);
	return;
}
tmp=node->trans;
while (tmp!=NULL) {
	//error("prefix removal: <%S> => ",tmp->output);
	remove_prefix(prefix->len,tmp->output);
	//error("<%S>\n",tmp->output);
	tmp=tmp->next;
}
z[pos]='\0';
u_strcpy(normalizedOutput,prefix);
free_Ustring(prefix);
}
コード例 #4
0
/**
 * Writes every hypothesis of the given list to cfg->out and, depending on
 * cfg->input_op, copies the word to cfg->modified_input.
 *
 * Each hypothesis is a DELAF line. Before being printed, its inflected form
 * is replaced by 'word', and two semantic codes are appended: "SP_ERR" and
 * "SP_INF=xxx" where xxx is the inflected form of the original entry. The
 * line is followed by "/score=N".
 *
 * A hypothesis that cannot be tokenized as a DELAF line is a fatal error.
 */
static void display_hypotheses(unichar* word,SpellCheckHypothesis* list,SpellCheckConfig* cfg) {
	Ustring* buffer=new_Ustring(128);
	/* The word has matched if and only if there is at least one hypothesis */
	int matched=(list!=NULL);
	for (SpellCheckHypothesis* h=list;h!=NULL;h=h->next) {
		struct dela_entry* entry=tokenize_DELAF_line(h->entry);
		if (entry==NULL) {
			fatal_error("Internal error in display_hypotheses; cannot tokenize entry:\n%S\n",h->entry);
		}
		/* We swap the inflected form with the word, keeping the old one
		 * for the SP_INF code */
		unichar* original_form=entry->inflected;
		entry->inflected=u_strdup(word);
		entry->semantic_codes[entry->n_semantic_codes]=u_strdup("SP_ERR");
		entry->n_semantic_codes++;
		u_sprintf(buffer,"SP_INF=%S",original_form);
		entry->semantic_codes[entry->n_semantic_codes]=u_strdup(buffer->str);
		entry->n_semantic_codes++;
		dela_entry_to_string(buffer,entry);
		u_fprintf(cfg->out,"%S/score=%d\n",buffer->str,h->score);
		free(original_form);
		free_dela_entry(entry);
	}
	free_Ustring(buffer);
	/* Now, we may have to print the word to the modified input file */
	switch (cfg->input_op) {
	case 'M':
		/* Matched words are kept: we print the word if it has matched */
		if (matched) {
			u_fprintf(cfg->modified_input,"%S\n",word);
		}
		break;
	case 'U':
		/* Unmatched words are kept: we print the word if it has NOT matched */
		if (!matched) {
			u_fprintf(cfg->modified_input,"%S\n",word);
		}
		break;
	default:
		break;
	}
}
コード例 #5
0
/**
 * This function takes a lexicographic tree with inf codes stored as
 * integer on nodes, and turns it into a real transducer where outputs
 * are stored on transitions.
 */
void move_outputs_on_transitions(struct dictionary_node* root,struct string_hash* inf_codes) {
int pos=0;
unichar z[0x400];
Ustring* normalizedOutput=new_Ustring();
subsequential_to_normal_transducer(root,root,inf_codes,pos,z,normalizedOutput);
free_Ustring(normalizedOutput);
}
コード例 #6
0
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) {
unichar* lower=u_strdup(e->inflected);
u_tolower(lower);
KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords);
free(lower);
if (k_inflected==NULL) return;
Ustring* tmp=new_Ustring(64);
u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]);
KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords);
if (k_lemma==NULL) {
	k_lemma=new_KeyWord(0,tmp->str,NULL);
	k_lemma->lemmatized=LEMMATIZED_KEYWORD;
	get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma);
}
/* Now, we look for all the case compatible tokens, and we add
 * their weights to the new lemmatized element
 */
while (k_inflected!=NULL) {
	if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) {
		/* We have a match */
		k_lemma->weight+=k_inflected->weight;
		k_inflected->lemmatized=1;
	}
	k_inflected=k_inflected->next;
}
free_Ustring(tmp);
}
コード例 #7
0
//
// Reads the words of the 'words' file one per line and tries to analyse
// each of them with analyse_word(). Words whose analysis fails are written
// to 'new_unknown_words'. A progress message is printed every 10000 words,
// and the number of successfully analysed words is printed at the end.
//
void analyse_word_list(Dictionary* d,
			       U_FILE* words,
			       U_FILE* result,
			       U_FILE* debug,
			       U_FILE* new_unknown_words,
			       const Alphabet* alph,
			       const bool* prefix,const bool* suffix,
			       struct utags UTAG,
			       vector_ptr* rules,
			       vector_ptr* entries)
{
  u_printf("Analysing russian unknown words...\n");
  int decomposed=0;
  int processed=0;
  Ustring* word=new_Ustring(MAX_WORD_LENGTH);
  while (EOF!=readline(word,words)) {
    int analysed=analyse_word(word->str,d,debug,result,prefix,suffix,alph,UTAG,rules,entries);
    if (analysed) {
      decomposed++;
    } else {
      // the analysis has failed: the word goes to the new unknown word file
      u_fprintf(new_unknown_words,"%S\n",word->str);
    }
    processed++;
    if ((processed % 10000) == 0) {
      u_printf("%d words done", processed);
    }
  }
  free_Ustring(word);
  u_printf("%d words decomposed as compound words\n",decomposed);
}
コード例 #8
0
/**
 * Loads a match list. Match lists are supposed to have been
 * generated by the Locate program.
 *
 * The first line is a header of the form "#X" where X selects the output
 * policy: 'D' (debug), 'M' (merge), 'R', 'T' or 'X' (replace); anything
 * else, including 'I', means that outputs are ignored. If 'header' is not
 * NULL, it receives the header character. If 'output_policy' is not NULL,
 * it receives the policy.
 *
 * Each following line has the form "a.b.c d.e.f", optionally followed by a
 * space and an output. When outputs are not ignored, a match line that has
 * no output is given an empty output.
 *
 * Returns the list of matches in file order, or NULL if there is none.
 */
struct match_list* load_match_list(U_FILE* f,OutputPolicy *output_policy,unichar *header,Abstract_allocator prv_alloc) {
struct match_list* l=NULL;
struct match_list* end_of_list=NULL;
int start,end,start_char,end_char,start_letter,end_letter;
Ustring* line=new_Ustring();
/* Output used for the matches that have none */
unichar no_output[1]={'\0'};
/* We read the header */
unichar foo=0;
if (header==NULL) {
  header=&foo;
}
u_fscanf(f,"#%C\n",header);
OutputPolicy policy;
switch(*header) {
   case 'D': {
     policy=DEBUG_OUTPUTS;
     /* In debug mode, we have to skip the debug header. n_graphs is
      * initialized so that a failed read does not leave it undefined */
     int n_graphs=0;
     u_fscanf(f,"%d\n",&n_graphs);
     while ((n_graphs--)>-1) {
       /* -1, because we also have to skip the #[IMR] line */
       readline(line,f);
     }
     break;
   }
   case 'M': policy=MERGE_OUTPUTS; break;
   case 'R':
   case 'T':
   case 'X': policy=REPLACE_OUTPUTS; break;
   case 'I':
   default: policy=IGNORE_OUTPUTS; break;
}
if (output_policy!=NULL) {
   (*output_policy)=policy;
}
while (6==u_fscanf(f,"%d.%d.%d %d.%d.%d",&start,&start_char,&start_letter,&end,&end_char,&end_letter)) {
   /* We look if there is an output or not, i.e. a space or a new line */
   int c=u_fgetc(f);
   unichar* output=no_output;
   if (c==' ') {
      /* If we have an output to read */
      readline(line,f);
      /* In debug mode, we have to stop at the char #1 */
      int i=-1;
      while (line->str[++i]!=1 && line->str[i]!='\0') {
      }
      line->str[i]='\0';
      output=line->str;
   }
   /* Without this distinction, a match with no output would reuse the
    * content that 'line' still holds from a previous read */
   if (policy==IGNORE_OUTPUTS) {
      output=NULL;
   }
   struct match_list* m=new_match(start,end,start_char,end_char,start_letter,end_letter,output,-1,NULL,prv_alloc);
   if (l==NULL) {
      l=m;
   } else {
      end_of_list->next=m;
   }
   end_of_list=m;
}
free_Ustring(line);
return l;
}
コード例 #9
0
ファイル: LocateTfstMatches.cpp プロジェクト: adri87/Q-A
/**
 * This function explores the partial matches that constitute the given match in order to produce
 * one or all possible outputs, depending on infos->ambiguous_output_policy.
 * The output(s) is(are) then used to add matches to the infos->matches list.
 */
void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m,
		                          struct tfst_simple_match_list* element) {
/* As m is a reversed list, we first need to get its elements in the right order */
vector_ptr* items=new_vector_ptr(16);
fill_vector(items,m);
Ustring* s=new_Ustring(1024);
/* In MERGE/REPLACE mode, we have to explore the combination of partial matches */
struct list_pointer* ptr=NULL;
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr);
free_list_pointer(ptr);
free_Ustring(s);
free_vector_ptr(items);
}
コード例 #10
0
/**
 * We remove every keyword that is tagged with the forbidden code. If
 * a forbidden keyword has several tags, all of them are removed:
 *
 * the,.DET + the,.XXX => all 'the' keywords are removed
 */
struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) {
struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE);
Ustring* tmp=new_Ustring();
for (int i=0;i<keywords->size;i++) {
	KeyWord* list=(KeyWord*)(keywords->value[i]);
	while (list!=NULL) {
		if (get_forbidden_keyword(list,code,tmp)) {
			get_value_index(tmp->str,hash);
		}
		list=list->next;
	}
}
free_Ustring(tmp);
return hash;
}
コード例 #11
0
/**
 * Tests whether the first keyword of the list is forbidden.
 *
 * If the sequence contains a '.', the part before the last '.' is looked
 * up in 'lemmas'. Otherwise, the keyword is not a lemmatized one of the
 * form XXX.YYY and the whole sequence is looked up.
 *
 * Returns 1 if the lookup succeeds; 0 otherwise, including when the list
 * is NULL or has no sequence.
 */
int has_forbidden_lemma(KeyWord* list,struct string_hash* lemmas) {
	if (list==NULL) {
		return 0;
	}
	if (list->sequence==NULL) {
		return 0;
	}
	int dot=last_index_of(list->sequence,(unichar)'.');
	if (dot!=-1) {
		/* Lemmatized keyword: we only keep what precedes the last dot */
		Ustring* lemma=new_Ustring(list->sequence);
		truncate(lemma,dot);
		int found=(-1!=get_value_index(lemma->str,lemmas,DONT_INSERT));
		free_Ustring(lemma);
		return found;
	}
	/* Non lemmatized keyword: it may be a forbidden lemma itself */
	return (-1!=get_value_index(list->sequence,lemmas,DONT_INSERT));
}
コード例 #12
0
/**
 * Loads the given DELAF and modifies the given keywords accordingly by
 * replacing any non removed token that appear in a DELAF entry
 * by its lemma. If there are ambiguities, several keywords are
 * generated. Doing that may merge keywords by adding their weights:
 * eats/2 + eaten/3 => eat/5
 */
void filter_keywords_with_dic(struct string_hash_ptr* keywords,char* name,
						VersatileEncodingConfig* vec,Alphabet* alphabet) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
	error("Cannot load file %s\n",name);
	return;
}
Ustring* line=new_Ustring(128);
while (EOF!=readline(line,f)) {
	struct dela_entry* e=tokenize_DELAF_line(line->str);
	if (e==NULL) continue;
	lemmatize(e,keywords,alphabet);
	free_dela_entry(e);
}
free_Ustring(line);
u_fclose(f);
}
コード例 #13
0
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 *
 * Each line is expected to contain three integers. The second and the
 * third ones are appended, in that order, to the returned vector, so
 * that line #n is stored at indices 2n and 2n+1. A line that does not
 * contain three integers contributes 0 for each value that could not be
 * read, instead of an undefined or stale value.
 *
 * Returns NULL if the file cannot be opened.
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f;
f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_int* v=new_vector_int();
Ustring* line=new_Ustring();
int a,b,c;
while (EOF!=readline(line,f)) {
	/* We reset the values before parsing since u_sscanf leaves
	 * untouched the ones it cannot read */
	a=0;
	b=0;
	c=0;
	u_sscanf(line->str,"%d%d%d",&a,&b,&c);
	vector_int_add(v,b);
	vector_int_add(v,c);
}
free_Ustring(line);
u_fclose(f);
return (vector_uima_offset*)v;
}
コード例 #14
0
/**
 * Loads the tags of the given .fst2 file. Returns 0 in case of success; -1 otherwise.
 * Note that the position in the file is unchanged after a call to this function,
 * whether it succeeds or returns -1.
 *
 * The automata are skipped by counting the lines that start with 'f'
 * followed by a whitespace. Then, every tag line, which must start with
 * '%' or '@', is turned into a grammar symbol that is stored in
 * fst->symbols under the tag string without its first character. The
 * tag section ends on a line that starts with 'f'.
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
/* We backup the position in the file, and we come back at the
 * beginning of the file */
long fpos=ftell(fst->f);
rewind(fst->f);
/* Now, we go to the tags section, skipping all the automata */
unichar buf[MAXBUF];
int i=0;
int len;
while (i<fst->nb_automata) {
   if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
      error("load_fst_tags: %s: unexpected EOF\n",fst->name);
      /* We restore the position, even in case of error */
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   if (buf[0]=='f' && isspace(buf[1])) {
      i++;
   }
   /* If we have read the beginning of a long line, we skip the rest of the line */
   while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
      len=u_fgets(buf,MAXBUF,fst->f);
   }
}
Ustring* ustr=new_Ustring(64);
int ret=0;
while (readline(ustr,fst->f) && ustr->str[0]!='f') {
   if (ustr->str[0]!='%' && ustr->str[0]!='@') {
      error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
      /* We leave the loop instead of returning at once, so that the
       * buffer is freed and the file position restored below */
      ret=-1;
      break;
   }
   /* +1 because we ignore the % or @ at the beginning of the line */
   symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
   /* If 'symbol' is NULL, then an error message has already
    * been printed. Moreover, we want to associate NULL to the
    * string, so that we don't exit the function. Whatever it is,
    * we add the symbol to the symbols of the .fst2 */
   get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
}
if (ret==0 && *ustr->str==0) {
   fatal_error("load_fst_tags: unexpected EOF\n");
}
free_Ustring(ustr);
/* We set back the position in the file */
fseek(fst->f,fpos,SEEK_SET);
return ret;
}
コード例 #15
0
/**
 * Loads the given DELA into the given DELA tree.
 */
void load_DELA(const VersatileEncodingConfig* vec,const char* name,struct DELA_tree* tree) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   error("Cannot load dictionary %s\n",name);
   return;
}
u_printf("Loading %s...\n",name);
Ustring* line=new_Ustring(4096);
while (EOF!=readline(line,f)) {
   struct dela_entry* entry=tokenize_DELAF_line(line->str,1);
   if (entry!=NULL) {
      add_entry(tree,entry);
   }
   /* We don't need to free the entry, since it's done (if needed)
    * in the insertion function */
}
free_Ustring(line);
u_fclose(f);
}
コード例 #16
0
ファイル: Fst2Automaton.cpp プロジェクト: adri87/Q-A
/**
 * Adds to 'automaton' the transition(s) from state 'from' to state 'to'
 * described by 'label'.
 *
 * If 'label' is SYMBOL_DEF, 'to' becomes the default state of 'from';
 * having already a default state is a fatal error. Otherwise, 'label' is
 * a list of symbols and one transition is added per symbol. Each symbol
 * is registered in 'symbols' under its string representation, and the
 * index obtained is used as the tag number of the transition.
 */
void add_transition(SingleGraph automaton,struct string_hash_ptr* symbols,int from,
                    symbol_t* label,int to) {
	if (label==SYMBOL_DEF) {
		if (automaton->states[from]->default_state!=-1) {
			fatal_error("add_transition: more than one default transition\n");
		}
		automaton->states[from]->default_state=to;
		return;
	}
	for (symbol_t* s=label;s!=NULL;s=s->next) {
		if (s==SYMBOL_DEF) {
			fatal_error("add_transition: unexpected default transition\n");
		}
		/* The string representation of the symbol is the key that
		 * avoids duplicates in the value array */
		Ustring* key=new_Ustring();
		symbol_to_str(s,key);
		int tag_number=get_value_index(key->str,symbols,INSERT_IF_NEEDED,s);
		free_Ustring(key);
		add_outgoing_transition(automaton->states[from],tag_number,to);
	}
}
コード例 #17
0
//
// this function try to analyse an unknown russian word
//
int analyse_word(const unichar* mot,Dictionary* d,U_FILE* debug,U_FILE* result_file,
                 const bool* prefix,const bool* suffix,const Alphabet* alphabet,
                 struct utags UTAG,vector_ptr* rules,vector_ptr* entries)
{
#if DDEBUG > 0
  {
    u_fprintf(debug,"\n  %S\n",mot);
  }
#endif

  unichar decomposition[MAX_DICT_LINE_LENGTH];
  unichar dela_line[MAX_DICT_LINE_LENGTH];
  unichar correct_word[MAX_DICT_LINE_LENGTH];
  decomposition[0]='\0';
  dela_line[0]='\0';
  correct_word[0]='\0';
  struct decomposed_word_list* l = 0;
  Ustring* ustr=new_Ustring();
  explore_state(d->initial_state_offset,correct_word,0,mot,mot,0,decomposition,dela_line,&l,1,0,0,d,
        prefix,suffix,alphabet,debug,UTAG,rules,entries,ustr,0);
  free_Ustring(ustr);
  free_all_dic_entries(entries);
  free_all_rule_lists(rules);
  if ( l == 0 ) {
    return 0;
  }
  struct decomposed_word_list* tmp = l;
  while ( tmp != NULL ) {
	  if (debug!=NULL) {
	     u_fprintf(debug,"%S = %S\n",mot,tmp->element->decomposition);
	  }
	  u_fprintf(result_file,"%S\n",tmp->element->dela_line);
     tmp=tmp->suivant;
  }
  free_decomposed_word_list(l);
  return 1;
}
コード例 #18
0
/**
 * This function reads a file that contains a list of Elag grammar names,
 * and it compiles them into the file 'outname'. However, if the result
 * automaton is too big, it will be saved in several automata inside
 * the output file.
 *
 * Each non empty line of 'rulesname' is a grammar name; relative names are
 * resolved against the directory of 'rulesname'. For each grammar, the
 * corresponding .elg file is loaded if it exists; otherwise the .fst2 is
 * compiled. The automata are intersected one after the other. Each time
 * the result has more than MAX_GRAM_SIZE states, it is saved into
 * "<outname>-<n>.elg" and a new intersection is started. The names of the
 * saved files are written to 'outname', one per line, between < and >.
 *
 * All errors are fatal, including paths that would not fit into
 * FILENAME_MAX characters. Returns 0.
 */
int compile_elag_rules(char* rulesname,char* outname, const VersatileEncodingConfig* vec,language_t* language) {
u_printf("Compilation of %s\n",rulesname);
U_FILE* f=NULL;
U_FILE* frules=u_fopen(ASCII,rulesname,U_READ);
if (frules==NULL) {
   fatal_error("Cannot open file '%s'\n",rulesname);
}
U_FILE* out=u_fopen(ASCII,outname,U_WRITE);
if (out==NULL) {
   fatal_error("cannot open file '%s'\n",outname);
}
/* The result file names are built as "<outname>-<n>.elg". We make sure
 * once for all that they fit into fstoutname: 1 char for '-', at most
 * 11 for the number and 4 for ".elg" */
if (strlen(outname)+16>=(size_t)FILENAME_MAX) {
   fatal_error("Output name too long: '%s'\n",outname);
}
/* Name of the file that contains the result automaton */
char fstoutname[FILENAME_MAX];
int nbRules=0;
char buf[FILENAME_MAX];
time_t start_time=time(0);
Fst2Automaton* res=NULL;
Fst2Automaton* A;
int fst_number=0;
Ustring* ustr=new_Ustring();

char buf2[FILENAME_MAX];
char directory[FILENAME_MAX];
get_path(rulesname,directory);

while (af_fgets(buf,FILENAME_MAX,frules->f)) {
   /* We read one by one the Elag grammar names in the .lst file */
   chomp(buf);
   if (*buf=='\0') {
      /* If we have an empty line */
      continue;
   }
   if (!is_absolute_path(buf)) {
      /* We check that prefixing the name with the directory
       * will not overflow buf */
      if (strlen(directory)+strlen(buf)>=(size_t)FILENAME_MAX) {
         fatal_error("Path too long: '%s%s'\n",directory,buf);
      }
      strcpy(buf2,buf);
      sprintf(buf,"%s%s",directory,buf2);
   }

   u_printf("\n%s...\n",buf);
   remove_extension(buf);
   /* ".fst2" is the longest extension that we may append below */
   if (strlen(buf)+strlen(".fst2")>=(size_t)FILENAME_MAX) {
      fatal_error("Path too long: '%s'\n",buf);
   }
   strcat(buf,".elg");
   if ((f=u_fopen(ASCII,buf,U_READ))==NULL) {
      /* If the .elg file doesn't exist, we create one */
      remove_extension(buf);
      u_printf("Precompiling %s.fst2\n",buf);
      strcat(buf,".fst2");
      elRule* rule=new_elRule(buf,vec,language);
      if (rule==NULL) {
         fatal_error("Unable to read grammar '%s'\n",buf);
      }
      if ((A=compile_elag_rule(rule,language))==NULL) {
         fatal_error("Unable to compile rule '%s'\n",buf);
      }
      free_elRule(rule);
   } else {
      /* If there is already .elg, we use it */
      u_fclose(f);
      A=load_elag_grammar_automaton(vec,buf,language);
      if (A==NULL) {
         fatal_error("Unable to load '%s'\n",buf);
      }
   }
   if (A->automaton->number_of_states==0) {
      /* This is only reported: the grammar is still taken into account */
      error("Grammar %s forbids everything!\n",buf);
   }
   if (res!=NULL) {
      /* If there is already an automaton, we intersect it with the new one */
      SingleGraph tmp=res->automaton;
      res->automaton=elag_intersection(language,tmp,A->automaton,GRAMMAR_GRAMMAR);
      free_SingleGraph(tmp,NULL);
      free_Fst2Automaton(A,NULL);
      trim(res->automaton,NULL);
   } else {
      res=A;
   }
   nbRules++;
   if (res->automaton->number_of_states>MAX_GRAM_SIZE) {
      /* If the automaton is too large, we will split the grammar
       * into several automata */
      elag_minimize(res->automaton,1);
      sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
      u_fprintf(out,"<%s>\n",fstoutname);
      u_printf("Splitting big grammar in '%s' (%d states)\n",fstoutname,res->automaton->number_of_states);
      u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
      free(res->name);
      res->name=u_strdup(ustr->str);
      save_automaton(res,fstoutname,vec,FST_GRAMMAR);
      free_Fst2Automaton(res,NULL);
      res=NULL;
   }
}
if (res!=NULL) {
   /* We save the last automaton, if any */
   sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
   u_fprintf(out,"<%s>\n",fstoutname);
   u_printf("Saving grammar in '%s'(%d states)\n",fstoutname,res->automaton->number_of_states);
   elag_minimize(res->automaton,1);
   u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
   free(res->name);
   res->name=u_strdup(ustr->str);
   save_automaton(res,fstoutname,vec,FST_GRAMMAR);
   free_Fst2Automaton(res,free_symbol);
}
time_t end_time=time(0);
u_fclose(frules);
u_fclose(out);
free_Ustring(ustr);
u_printf("\nDone.\nElapsed time: %.0f s\n",difftime(end_time,start_time));
u_printf("\n%d rule%s from %s compiled in %s (%d automat%s)\n",
         nbRules,(nbRules>1)?"s":"",rulesname,outname,fst_number,
         (fst_number>1)?"a":"on");
return 0;
}
コード例 #19
0
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF, 
		    MultiFlex_ctx* p_multiFlex_ctx, Alphabet* alph,
		    int error_check_status) {
	U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
	unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
	int l; //length of the line scanned
	DLC_entry_T* dlc_entry;
	MU_forms_T MU_forms; //inflected forms of the MWU
	int err;

	//Open DELAS/DELAC
	dlc = u_fopen(p_multiFlex_ctx->vec, DLC, U_READ);
	if (!dlc) {
		return 1;
	}
	//Open DELAF/DELACF
	dlcf = u_fopen(p_multiFlex_ctx->vec, DLCF, U_WRITE);
	if (!dlcf) {
		error("Unable to open file: '%s' !\n", DLCF);
		return 1;
	}
	//Inflect one entry at a time
	Ustring* input_line=new_Ustring(DIC_LINE_SIZE);
	l = readline(input_line,dlc);
	//Omit the final newline
	int flag = 0;
	//If a line is empty the file is not necessarily finished.
	//If the last entry has no newline, we should not skip this entry
	struct dela_entry* DELAS_entry;
	int semitic = 0;
	int current_line=0;
	while (l != EOF) {
	    current_line++;
		DELAS_entry = is_strict_DELAS_line(input_line->str, alph);
		if (DELAS_entry != NULL) {
			/* If we have a strict DELAS line, that is to say, one with
			 * a simple word */
			if (error_check_status==ONLY_COMPOUND_WORDS) {
				error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
				free_dela_entry(DELAS_entry);
				goto next_line;
			}
			SU_forms_T forms;
			SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
			char inflection_code[1024];
			unichar code_gramm[1024];
			/* We take the first grammatical code, and we extract from it the name
			 * of the inflection transducer to use */
			get_inflection_code(DELAS_entry->semantic_codes[0],
					inflection_code, code_gramm, &semitic);
			/* And we inflect the word */
			// Fix bug#8 - "Inflection with Semitic Mode is not working anymore"
			p_multiFlex_ctx->semitic  = semitic;      
			//   err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
			if (DELAS_entry->n_filter_codes != 0) {

				p_multiFlex_ctx->n_filter_codes = DELAS_entry->n_filter_codes;
				p_multiFlex_ctx->filter_polarity = DELAS_entry->filter_polarity;
				p_multiFlex_ctx->filter_codes = DELAS_entry->filter_codes;

				err = SU_inflect(p_multiFlex_ctx,DELAS_entry->lemma, inflection_code,&forms);

				p_multiFlex_ctx->n_filter_codes=0;
			}
			else err = SU_inflect(p_multiFlex_ctx,DELAS_entry->lemma, inflection_code,&forms);


#ifdef REMINDER_WARNING
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
#endif


			/* Then, we print its inflected forms to the output */
			for (int i = 0; i < forms.no_forms; i++) {
			   unichar foo[1024];   
			   if (p_multiFlex_ctx->korean!=NULL) {

			      Hanguls_to_Jamos(forms.forms[i].form,foo,p_multiFlex_ctx->korean,1);
			   } else {
			      u_strcpy(foo,forms.forms[i].form);
			   }
			   
			   u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/,
						DELAS_entry->lemma, code_gramm);
				/* We add the semantic codes, if any */
				for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
					u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
				}
				if (forms.forms[i].local_semantic_code != NULL) {
					u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
				}
				if (forms.forms[i].raw_features != NULL
						&& forms.forms[i].raw_features[0] != '\0') {
					u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
				}
				u_fprintf(dlcf, "\n");
			}
			SU_delete_inflection(&forms);
			free_dela_entry(DELAS_entry);
			/* End of simple word case */
		} else {
			u_fprintf(U_STDERR,"we no have a strict DELAS line\n");
			/* If we have not a simple word DELAS line, we try to analyse it
			 * as a compound word DELAC line */
			if (error_check_status==ONLY_SIMPLE_WORDS) {
				error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
				goto next_line;
			}
			if (p_multiFlex_ctx->config_files_status != CONFIG_FILES_ERROR) {
				/* If this is a compound word, we process it if and only if the
				 * configuration files have been correctly loaded */
				dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
				if (!dlc_entry) {
					fatal_alloc_error("inflect");
				}
				/* Convert a DELAC entry into the internal multi-word format */
				err = DLC_line2entry(alph,p_multiFlex_ctx->pL_MORPHO,input_line->str, dlc_entry, &(p_multiFlex_ctx->D_CLASS_EQUIV));
				if (!err) {
					//Inflect the entry
					MU_init_forms(&MU_forms);
					err = MU_inflect(p_multiFlex_ctx,dlc_entry->lemma,&MU_forms);
					if (!err) {
						int f; //index of the current inflected form
						//Inform the user if no form generated
						if (MU_forms.no_forms == 0) {
							error("No inflected form could be generated for ");
							DLC_print_entry(U_STDERR,p_multiFlex_ctx->pL_MORPHO,dlc_entry);
						}
						//Print inflected forms
						for (f = 0; f < MU_forms.no_forms; f++) {
							//Format the inflected form to the DELACF format
							err = DLC_format_form(p_multiFlex_ctx->pL_MORPHO,output_line, DIC_LINE_SIZE
									- 1, MU_forms.forms[f], dlc_entry,
									&(p_multiFlex_ctx->D_CLASS_EQUIV));
							if (!err) {
								//Print one inflected form at a time to the DELACF file
								u_fprintf(dlcf, "%S\n", output_line);
							}
						}
					}
					MU_delete_inflection(&MU_forms);
					DLC_delete_entry(dlc_entry);
				}
			} else {
				/* We try to inflect a compound word whereas the "Morphology.txt" and/or
				 * "Equivalences.txt" file(s) has/have not been loaded */
				if (!flag) {
					/* We use a flag to print the error message only once */
					error(
							"WARNING: Compound words won't be inflected because configuration files\n");
					error("         have not been correctly loaded.\n");
					flag = 1;
				}
			}
		}
		next_line:
		//Get next entry
		l = readline(input_line,dlc);
		if (l!=EOF) {
			if (input_line->str[0]=='\0') {
				/* If we find an empty line, then we go on */
				goto next_line;
			}
		}
	}
	free_Ustring(input_line);
	u_fclose(dlc);
	u_fclose(dlcf);
	return 0;
}
コード例 #20
0
/**
 * Loads and returns an automaton from the given .fst2.
 * Returns NULL if there is no more automaton to load.
 *
 * The automaton header is expected to be "-N name" where N must be
 * fstf->pos+1. Each following line describes one state: it starts with
 * 't' if the state is final, and it contains pairs made of a tag number
 * and a destination state number. Tag numbers are translated with
 * fstf->renumber, if not NULL, and then used as indices in the symbols
 * of the .fst2. Transitions whose symbol is NULL are ignored.
 * The automaton ends on a line that starts with 'f'.
 *
 * Format errors are fatal. On success, fstf->pos is incremented.
 */
Fst2Automaton* load_automaton(Elag_fst_file_in* fstf) {
if (fstf->pos>=fstf->nb_automata) {
   return NULL;
}
Ustring* ustr=new_Ustring();
readline(ustr,fstf->f);
const unichar* p=ustr->str;
if (p[0]!='-') {
   fatal_error("load_automaton: %s: bad file format\n",fstf->name);
}
p++;
int i=u_parse_int(p,&p);
if (i!=fstf->pos+1) {
   /* We make sure that the automaton number is what it should be */
   fatal_error("load_automaton: %s: parsing error with line '%S' ('-%d ...' expected)\n",fstf->name,ustr->str,fstf->pos+1);
}
/* We skip the separator that follows the number, so that p points on
 * the automaton name. If the line ends right after the number, we must
 * not go past the end of the string: the name is then empty */
if (*p!='\0') {
   p++;
}
Fst2Automaton* A=new_Fst2Automaton(p);
while (readline(ustr,fstf->f) && ustr->str[0]!='f') {
   /* If there is a state to read */
   p=ustr->str;
   SingleGraphState state=add_state(A->automaton);
   if (*p=='t') {
      /* If necessary, we set the state final */
      set_final_state(state);
   }
   /* We puts p on the first digit */
   while (*p!='\0' && !u_is_digit(*p)) {
      p++;
   }
   while (*p!='\0') {
      /* If there is a transition to read */
      int tag_number=u_parse_int(p,&p);
      if (fstf->renumber!=NULL) {
         tag_number=fstf->renumber[tag_number];
      }
      while (*p==' ') {
         p++;
      }
      if (!u_is_digit(*p)) {
         fatal_error("load_automaton: %s: bad file format (line='%S')\n",fstf->name,ustr->str);
      }
      int state_number=u_parse_int(p,&p);
      symbol_t* tmp=(symbol_t*)fstf->symbols->value[tag_number];
      if (tmp!=NULL) {
         /* If it is a good symbol (successfully loaded), we add transition(s).
          * The same call is used whatever the type of the .fst2 is. In a
          * text automaton, a symbol may be a list like:
          *
          * tmp = "{domestique,.N:fs}" => "{domestique,.N:ms}" => NULL
          *
          * NOTE(review): presumably one transition is then added per
          * element of the list; confirm in add_all_outgoing_transitions */
         add_all_outgoing_transitions(state,tmp,state_number);
      }
      while (*p==' ') {
         p++;
      }
   }
}
if (*ustr->str=='\0') {
   fatal_error("load_automaton: unexpected end of file\n");
}
if (A->automaton->number_of_states==0) {
   error("load_automaton: automaton with no state\n");
} else {
   set_initial_state(A->automaton->states[0]);
}
fstf->pos++;
free_Ustring(ustr);
return A;
}
コード例 #21
0
/**
 * Writes the automaton 'A' as the next graph of the given .fst2 file and
 * increments the number of automata of the file.
 *
 * The graph header is "-N name", followed by one line per state: 't' or
 * ':' depending on whether the state is final, then one "tag state" pair
 * per transition. Tag numbers are the indices of the labels in
 * fstf->labels; the label of a symbol is computed according to the type
 * of the file. The graph ends with an "f " line.
 *
 * An invalid file type is a fatal error.
 */
void fst_file_write(Elag_fst_file_out* fstf,const Fst2Automaton* A) {
	Ustring* label=new_Ustring();
	/* The function that builds labels depends on the kind of .fst2 */
	void (*to_label)(const symbol_t*,Ustring*)=NULL;
	switch (fstf->type) {
	case FST_TEXT: to_label=symbol_to_text_label; break;
	case FST_GRAMMAR: to_label=symbol_to_grammar_label; break;
	case FST_LOCATE: to_label=symbol_to_locate_label; break;
	default:
		fatal_error("fst_file_write: invalid fstf->type: %d\n",fstf->type);
	}
	/* Graph number and name */
	u_fprintf(fstf->f,"-%d %S\n",fstf->nb_automata+1,A->name);
	unichar deflabel[]={'<','d','e','f','>',0};
	for (int q=0;q<A->automaton->number_of_states;q++) {
		SingleGraphState state=A->automaton->states[q];
		u_fprintf(fstf->f,"%C ",is_final_state(state)?'t':':');
		for (Transition* t=state->outgoing_transitions;t!=NULL;t=t->next) {
			if (t->tag_number==-1) {
				/* "EMPTY" transition created because the automaton
				 * was emptied at trim time */
				u_strcpy(label,"EMPTY");
			} else {
				to_label(t->label,label);
			}
			/* When saving a Locate .fst2, some labels are written by
			 * dedicated functions instead of the normal output */
			int special=0;
			if (fstf->type==FST_LOCATE) {
				special=1;
				if (u_strcmp(label->str, "<PNC>") == 0) {
					PNC_trans_write(fstf, t->state_number);
				} else if (u_strcmp(label->str, "<CHFA>") == 0 || u_strcmp(label->str, "<NB>") == 0) {
					CHFA_trans_write(fstf, t->state_number);
				} else if (u_strcmp(label->str, "<.>") == 0) {
					LEXIC_trans_write(fstf, t->state_number);
				} else {
					special=0;
				}
			}
			if (!special) {
				/* Normal transition */
				int tag_index=get_value_index(label->str,fstf->labels);
				u_fprintf(fstf->f,"%d %d ",tag_index,t->state_number);
			}
		}
		if (state->default_state!=-1) {
			/* The default transition is reported if the file is not a
			 * grammar, but it is saved anyway */
			if (fstf->type!=FST_GRAMMAR) {
				error("Unexpected <def> label in text/locate automaton\n");
			}
			int def_index=get_value_index(deflabel,fstf->labels);
			u_fprintf(fstf->f,"%d %d ",def_index,state->default_state);
		}
		u_fputc('\n',fstf->f);
	}
	u_fprintf(fstf->f,"f \n");
	free_Ustring(label);
	fstf->nb_automata++;
}
コード例 #22
0
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * The function is recursive: each call deals with items->tab[current_item]
 * and then calls itself with current_item+1. When current_item reaches
 * items->nbelems, the string accumulated in 's' is temporarily attached to
 * 'element' as its output and 'element' is given to add_element_to_list in
 * order to update infos->matches.
 *
 * Everything that is modified before a recursive call (length of 's',
 * element->m, variable bounds, output variables, dictionary variable) is
 * put back to its previous value after the call, so that the next
 * alternative is explored from the same starting point.
 *
 * 'last_text_dependent_tfst_tag' is the number of the last tfst tag met so
 * far that was not a text independent one, or -1 if there is none.
 *
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                  struct tfst_simple_match_list* element,
                                  vector_ptr* items,int current_item,Ustring* s,
                                  int last_text_dependent_tfst_tag,
                                  struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output. The output
    * field only borrows s->str for the duration of the call, which is
    * why it is reset to NULL right after */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
if (item->debug_output!=NULL) {
	/* If we have a debug output, we deal with it: we append it to the
	 * output string, explore the next items, and then truncate the
	 * string back to its original length */
	u_strcat(s,item->debug_output);
	explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts);
	s->len=len;
	s->str[len]='\0';
	return;
}


/* Output of the fst2 tag associated to the current item */
unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;

unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
	/* If we have a capture variable $:X$, we must save the previous value
	 * for this dictionary variable */
	old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}

/* Copy of the match positions, used to undo the changes made to
 * element->m when a left context is pending */
Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
	   /* We have to process the output only once,
	    * since it will have the same effect on all tfst tags.
	    *
	    * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
	    * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
	   if (capture) {
		   /* If we have a capture variable, then we have to check whether the tfst tag
	   	    * is a tagged token or not */
	   	   int tfst_tag_number=text_tags->n;
	   	   int fst2_tag_number=item->fst2_transition->tag_number;
	   	   if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
	   		   goto restore_dic_variable;
	   	   }
	   } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* We do not take into account matches with variable errors if the
          * process_output_for_tfst_match function has decided that backtracking
          * was necessary, either because of a variable error or because of a
          * $a.SET$ or $a.UNSET$ test */
		  goto restore_dic_variable;
      }
   }
   /* Value that will be given as 'last_text_dependent_tfst_tag' to the
    * recursive calls */
   int last_tag=last_text_dependent_tfst_tag;
   /* Stays NULL when the current text tag is a text independent one */
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
          /* If we have an output variable start $|a( */
          int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index);

		  Ustring* old_value = new_Ustring();
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  // now old_value contains the backup

          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);

		  // restore the good content from backup
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  free_Ustring(old_value);

          goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
          /* If we have an output variable end $|a): the variable is not
           * pending during the exploration of the next items, and it is
           * set as pending again afterwards */
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* We add the address of the start field to our list.
          *
          * NOTE(review): var_starts is dereferenced unconditionally on the
          * left-hand side here (and again below when the list is freed), so
          * the NULL test in the second argument cannot protect against a
          * NULL var_starts. Confirm that callers never pass NULL. */
         (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         /* After the exploration, there are 2 cases:
          * 1) *var_starts is NULL: nothing to do
          * 2) *var_starts is not NULL: we reached the end of the items without finding any
          *                             text dependent match, so we can free the list */
         free_list_pointer(*var_starts);
         (*var_starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag' */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because if not, we will have an
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);

         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) {
    	  /* A $[ tag is not supposed to be met here */
    	  fatal_error("problem $[\n");
      }
      /* Any other text independent tag needs no special treatment: we
       * fall through to the exploration of the next item below */
   } else {
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      /* If the current text tag is not a text independent one */

      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      /* Position in the text from which the original text will be copied */
      int previous_start_token,previous_start_char; 
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just increase
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         //error("current item=%d\n",text_tags->n);
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* Here we have to insert the text that is between current_start and current_end,
       * and then, the output of the fst2 transition. The original text is
       * only copied when the output policy is MERGE_OUTPUTS */
      if (infos->output_policy==MERGE_OUTPUTS) {
    	  insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                 current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      /* A $* tag was pending and we have now a text dependent tag: the
       * match starts at the beginning of this tag */
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
         ,&ptr2 /* We have encountered a text dependent tag, so there is no
                 * more pending start tag like $a( */
         );
   /* We undo the start position changes possibly made above */
   element->m=saved_element;
   /* We free the variable start list that may have been built in ptr2
    * by the recursive call */
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
	/* If we have a capture variable $:X$, we must restore the previous value
	 * for this dictionary variable */
	set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}