Пример #1
0
//
// this function analyses russian compound words
//
void analyse_compounds(const Alphabet* alph,
		       Dictionary* d,
			   U_FILE* words,
		       U_FILE* result,
		       U_FILE* debug,
		       U_FILE* new_unknown_words,struct utags UTAG)
{
   bool* prefix;
   bool* suffix;
   vector_ptr* rules=new_vector_ptr(16);
   vector_ptr* entries=new_vector_ptr(16);
  init_tableaux(d->inf,&prefix,&suffix,UTAG);
  analyse_word_list(d,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries);
  free_tableaux(prefix,suffix);
  free_vector_ptr(rules);
  free_vector_ptr(entries);
}
Пример #2
0
//
// this function analyses russian compound words
//
void analyse_compounds(const Alphabet* alph,
		       const unsigned char* bin,
		       const struct INF_codes* inf,
		       U_FILE* words,
		       U_FILE* result,
		       U_FILE* debug,
		       U_FILE* new_unknown_words,struct utags UTAG)
{
   bool* prefix;
   bool* suffix;
   vector_ptr* rules=new_vector_ptr(16);
   vector_ptr* entries=new_vector_ptr(16);
  init_tableaux(inf,&prefix,&suffix,UTAG);
  analyse_word_list(bin,inf,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries);
  free_tableaux(prefix,suffix);
  free_vector_ptr(rules);
  free_vector_ptr(entries);
}
Пример #3
0
/**
 * Frees a locate_parameters structure.
 */
void free_locate_parameters(struct locate_parameters* p) {
    if (p==NULL) return;
    if (p->recyclable_wchart_buffer!=NULL) {
        free(p->recyclable_wchart_buffer);
    }
    if (p->recyclable_unichar_buffer!=NULL) {
        free(p->recyclable_unichar_buffer);
    }
    free_vector_ptr(p->cached_match_vector,NULL);
    free(p);
}
Пример #4
0
/**
 * This function explores the partial matches that constitute the given match in order to produce
 * one or all possible outputs, depending on infos->ambiguous_output_policy.
 * The output(s) is(are) then used to add matches to the infos->matches list.
 */
void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m,
		                          struct tfst_simple_match_list* element) {
/* As m is a reversed list, we first need to get its elements in the right order */
vector_ptr* items=new_vector_ptr(16);
fill_vector(items,m);
Ustring* s=new_Ustring(1024);
/* In MERGE/REPLACE mode, we have to explore the combination of partial matches */
struct list_pointer* ptr=NULL;
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr);
free_list_pointer(ptr);
free_Ustring(s);
free_vector_ptr(items);
}
Пример #5
0
/*
 * This function behaves in the same way that a main one, except that it does
 * not invoke the setBufferMode function.
 */
int main_LocateTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char text[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
int is_korean=0;
int tilde_negation_operator=1;
int selected_negation_operator=0;
int tagging=0;
int single_tags_only=0;
int match_word_boundaries=1;
MatchPolicy match_policy=LONGEST_MATCHES;
OutputPolicy output_policy=IGNORE_OUTPUTS;
AmbiguousOutputPolicy ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
VariableErrorPolicy variable_error_policy=IGNORE_VARIABLE_ERRORS;
int search_limit=NO_MATCH_LIMIT;
char foo;
vector_ptr* injected=new_vector_ptr();
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_LocateTfst,lopts_LocateTfst,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty .tfst name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(text,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'K': is_korean=1;
   	   	   	  match_word_boundaries=0;
              break;
   case 'l': search_limit=NO_MATCH_LIMIT; break;
   case 'g': if (options.vars()->optarg[0]=='\0') {
                error("You must specify an argument for negation operator\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             selected_negation_operator=1;
             if ((strcmp(options.vars()->optarg,"minus")==0) || (strcmp(options.vars()->optarg,"-")==0)) {
                 tilde_negation_operator=0;
             }
             else
             if ((strcmp(options.vars()->optarg,"tilde")!=0) && (strcmp(options.vars()->optarg,"~")!=0)) {
                 error("You must specify a valid argument for negation operator\n");
                 free_vector_ptr(injected);
                 return USAGE_ERROR_CODE;                 
             }
             break;
   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&search_limit,&foo) || search_limit<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid search limit argument: %s\n",options.vars()->optarg);
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             break;
   case 'S': match_policy=SHORTEST_MATCHES; break;
   case 'L': match_policy=LONGEST_MATCHES; break;
   case 'A': match_policy=ALL_MATCHES; break;
   case 'I': output_policy=IGNORE_OUTPUTS; break;
   case 'M': output_policy=MERGE_OUTPUTS; break;
   case 'R': output_policy=REPLACE_OUTPUTS; break;
   case 'X': variable_error_policy=EXIT_ON_VARIABLE_ERRORS; break;
   case 'Y': variable_error_policy=IGNORE_VARIABLE_ERRORS; break;
   case 'Z': variable_error_policy=BACKTRACK_ON_VARIABLE_ERRORS; break;
   case 'b': ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; break;
   case 'z': ambiguous_output_policy=IGNORE_AMBIGUOUS_OUTPUTS; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case 1: tagging=1; break;
   case 2: single_tags_only=1; break;
   case 3: match_word_boundaries=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'v': {
	   unichar* key=u_strdup(options.vars()->optarg);
	   unichar* value=u_strchr(key,'=');
	   if (value==NULL) {
		   error("Invalid variable injection: %s\n",options.vars()->optarg);
       free_vector_ptr(injected);
       return USAGE_ERROR_CODE;       
	   }
	   (*value)='\0';
	   value++;
	   value=u_strdup(value);
	   vector_ptr_add(injected,key);
	   vector_ptr_add(injected,value);
	   break;
   }
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_LocateTfst[index].name);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_vector_ptr(injected);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_vector_ptr(injected);
  return SUCCESS_RETURN_CODE;
}

if (selected_negation_operator==0) {
    get_graph_compatibility_mode_by_file(&vec,&tilde_negation_operator);
}

char grammar[FILENAME_MAX];
char output[FILENAME_MAX];
strcpy(grammar,argv[options.vars()->optind]);
get_path(text,output);
strcat(output,"concord.ind");

int OK=locate_tfst(text,
                   grammar,
                   alphabet,
                   output,
                   &vec,
                   match_policy,
                   output_policy,
                   ambiguous_output_policy,
                   variable_error_policy,
                   search_limit,
                   is_korean,
                   tilde_negation_operator,
                   injected,
                   tagging,
                   single_tags_only,
                   match_word_boundaries);

free_vector_ptr(injected);

return (!OK);
}
Пример #6
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_KeyWords(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;

char tokens[FILENAME_MAX];
char output[FILENAME_MAX]="";
char alph[FILENAME_MAX]="";
char cdic[FILENAME_MAX]="";
unichar* code=u_strdup("XXX");
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(alph,options.vars()->optarg);
             break;
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty forbidden code\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             free(code);
             code=u_strdup(options.vars()->optarg);
             break;
   case 'c': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(cdic,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(code);
             return SUCCESS_RETURN_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_KeyWords[index].name);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind==argc || options.vars()->optind==argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(code);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free(code);
  return SUCCESS_RETURN_CODE;
}

Alphabet* alphabet=NULL;
if (alph[0]!='\0') {
  alphabet=load_alphabet(&vec,alph);
  if (alphabet==NULL) {
    error("Cannot load alphabet file %s\n",alph);
    free(code);
    return DEFAULT_ERROR_CODE;    
  }
}

strcpy(tokens,argv[(options.vars()->optind++)]);
if (output[0]=='\0') {
	get_path(tokens,output);
	strcat(output,"keywords.txt");
}
struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec);
filter_non_letter_keywords(keywords,alphabet);
if (cdic[0]!='\0') {
	load_compound_words(cdic,&vec,keywords);
}

for (;options.vars()->optind!=argc;(options.vars()->optind)++) {
	filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet);
}
merge_case_equivalent_unknown_words(keywords,alphabet);
struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code);
remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas);
free_string_hash(forbidden_lemmas);
vector_ptr* sorted=sort_keywords(keywords);
U_FILE* f_output=u_fopen(&vec,output,U_WRITE);

if (f_output==NULL) {
	error("Cannot write in file %s\n",output);
  free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
  free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
  free_alphabet(alphabet);
  free(code);
  return DEFAULT_ERROR_CODE;  
}

dump_keywords(sorted,f_output);

u_fclose(f_output);
free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
free_alphabet(alphabet);
free(code);

return SUCCESS_RETURN_CODE;
}
Пример #7
0
int main_Tokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int mode=NORMAL;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) {
   switch(val) {
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'c': mode=CHAR_BY_CHAR; break;
   case 'w': mode=NORMAL; break;
   case 't': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty token file name\n");
             }
             strcpy(token_file,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
U_FILE* text;
U_FILE* out;
U_FILE* output;
U_FILE* enter;
char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];
Alphabet* alph=NULL;

get_snt_path(argv[vars->optind],text_cod);
strcat(text_cod,"text.cod");
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tokens.txt");
get_snt_path(argv[vars->optind],enter_pos);
strcat(enter_pos,"enter.pos");
text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (text==NULL) {
   fatal_error("Cannot open text file %s\n",argv[vars->optind]);
}
if (alphabet[0]!='\0') {
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      u_fclose(text);
      return 1;
   }
}
out=u_fopen(BINARY,text_cod,U_WRITE);
if (out==NULL) {
   error("Cannot create file %s\n",text_cod);
   u_fclose(text);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}
enter=u_fopen(BINARY,enter_pos,U_WRITE);
if (enter==NULL) {
   error("Cannot create file %s\n",enter_pos);
   u_fclose(text);
   u_fclose(out);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}


vector_ptr* tokens=new_vector_ptr(4096);
vector_int* n_occur=new_vector_int(4096);
vector_int* n_enter_pos=new_vector_int(4096);
struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
                                            (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);
if (token_file[0]!='\0') {
   load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur);
}

output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot create file %s\n",tokens_txt);
   u_fclose(text);
   u_fclose(out);
   u_fclose(enter);
   if (alph!=NULL) {
      free_alphabet(alph);
   }

   free_hash_table(hashtable);
   free_vector_ptr(tokens,free);
   free_vector_int(n_occur);
   free_vector_int(n_enter_pos);

   return 1;
}
u_fprintf(output,"0000000000\n");

int SENTENCES=0;
int TOKENS_TOTAL=0;
int WORDS_TOTAL=0;
int DIGITS_TOTAL=0;
u_printf("Tokenizing text...\n");
if (mode==NORMAL) {
   normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
else {
   char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
u_printf("\nDone.\n");
save_new_line_positions(enter,n_enter_pos);
u_fclose(enter);
u_fclose(text);
u_fclose(out);
u_fclose(output);
write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems);
// we compute some statistics
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"stats.n");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL);
   u_fclose(output);
}
// we save the tokens by frequence
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_freq.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_frequence(output,tokens,n_occur);
   u_fclose(output);
}
// we save the tokens by alphabetical order
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_alph.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_alph_order(output,tokens,n_occur);
   u_fclose(output);
}
free_hash_table(hashtable);
free_vector_ptr(tokens,free);
free_vector_int(n_occur);
free_vector_int(n_enter_pos);
if (alph!=NULL) {
   free_alphabet(alph);
}
free_OptVars(vars);
return 0;
}
Пример #8
0
/**
 * Frees all the memory associated to the given invoker.
 */
void free_ProgramInvoker(ProgramInvoker* i) {
if (i==NULL) return;
free_vector_ptr(i->args,free);
free(i);
}