Code example #1
//
// This function analyses Russian compound words.
//
void analyse_compounds(const Alphabet* alph,
                       Dictionary* d,
                       U_FILE* words,
                       U_FILE* result,
                       U_FILE* debug,
                       U_FILE* new_unknown_words,
                       struct utags UTAG)
{
   bool* prefix;
   bool* suffix;
   vector_ptr* rules=new_vector_ptr(16);
   vector_ptr* entries=new_vector_ptr(16);
   init_tableaux(d->inf,&prefix,&suffix,UTAG);
   analyse_word_list(d,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries);
   free_tableaux(prefix,suffix);
   free_vector_ptr(rules);
   free_vector_ptr(entries);
}
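All of these examples rely on the same vector_ptr growable pointer array (new_vector_ptr / vector_ptr_add / free_vector_ptr). A minimal lifecycle sketch, with the stored element left as a placeholder and the initial capacity 16 purely illustrative:

/* Minimal lifecycle sketch of the vector_ptr container used throughout
 * these examples; the placeholder element is illustrative only. */
void vector_ptr_lifecycle_sketch() {
   void* element=NULL;                  /* placeholder payload */
   vector_ptr* v=new_vector_ptr(16);    /* growable array of void* */
   vector_ptr_add(v,element);           /* append a pointer */
   free_vector_ptr(v);                  /* frees the vector, not the elements */
   /* free_vector_ptr(v,free) would also release each element, as in code example #8 */
}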
Code example #2
File: GeneralDerivation.cpp Project: adri87/Q-A
//
// This function analyses Russian compound words.
//
void analyse_compounds(const Alphabet* alph,
                       const unsigned char* bin,
                       const struct INF_codes* inf,
                       U_FILE* words,
                       U_FILE* result,
                       U_FILE* debug,
                       U_FILE* new_unknown_words,
                       struct utags UTAG)
{
   bool* prefix;
   bool* suffix;
   vector_ptr* rules=new_vector_ptr(16);
   vector_ptr* entries=new_vector_ptr(16);
   init_tableaux(inf,&prefix,&suffix,UTAG);
   analyse_word_list(bin,inf,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries);
   free_tableaux(prefix,suffix);
   free_vector_ptr(rules);
   free_vector_ptr(entries);
}
Code example #3
File: LocateTfstMatches.cpp Project: adri87/Q-A
/**
 * This function explores the partial matches that constitute the given match in order to produce
 * one or all possible outputs, depending on infos->ambiguous_output_policy.
 * The resulting output(s) are then used to add matches to the infos->matches list.
 */
void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m,
		                          struct tfst_simple_match_list* element) {
/* As m is a reversed list, we first need to get its elements in the right order */
vector_ptr* items=new_vector_ptr(16);
fill_vector(items,m);
Ustring* s=new_Ustring(1024);
/* In MERGE/REPLACE mode, we have to explore the combination of partial matches */
struct list_pointer* ptr=NULL;
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr);
free_list_pointer(ptr);
free_Ustring(s);
free_vector_ptr(items);
}
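The fill_vector helper called above is not included in this excerpt. A hypothetical sketch of such a routine, assuming tfst_match is a singly linked list whose next field chains the partial matches in reverse order:

/* Hypothetical sketch only (not the project's code): restore the original
 * order of a reversed tfst_match list by recursing before appending. */
static void fill_vector_sketch(vector_ptr* items,struct tfst_match* m) {
   if (m==NULL) {
      return;
   }
   fill_vector_sketch(items,m->next);   /* the 'next' field is an assumption */
   vector_ptr_add(items,m);             /* deepest (oldest) element ends up first */
}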
Code example #4
File: ProgramInvoker.cpp Project: adri87/Q-A
/**
 * Allocates, initializes and returns a new program invoker.
 */
ProgramInvoker* new_ProgramInvoker(MAIN_FUNCTION f,const char* name) {
if (name==NULL) {
   fatal_error("NULL program name in new_ProgramInvoker\n");
}
ProgramInvoker* res=(ProgramInvoker*)malloc(sizeof(ProgramInvoker));
if (res==NULL) {
   fatal_alloc_error("new_ProgramInvoker");
}
res->main=f;
res->args=new_vector_ptr(16);
add_argument(res,name);
return res;
}
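A hypothetical usage sketch of this API: only new_ProgramInvoker and add_argument appear in this excerpt, so the invoke and free_ProgramInvoker calls below are assumed companion helpers from the same module, and the file name is illustrative.

/* Hypothetical usage sketch; invoke() and free_ProgramInvoker() are assumptions. */
void run_tokenize_sketch() {
   ProgramInvoker* invoker=new_ProgramInvoker(main_Tokenize,"Tokenize");
   add_argument(invoker,"corpus.snt");   /* hypothetical input file as argv[1] */
   int ret=invoke(invoker);              /* assumed: runs the stored main() on the argument list */
   free_ProgramInvoker(invoker);         /* assumed cleanup counterpart */
   (void)ret;
}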
Code example #5
/**
 * We build an array of single keywords (lists of only one element),
 * sorted by descending weight.
 */
vector_ptr* sort_keywords(struct string_hash_ptr* keywords) {
vector_ptr* res=new_vector_ptr();
for (int i=0;i<keywords->size;i++) {
	KeyWord* k=(KeyWord*)(keywords->value[i]);
	while (k!=NULL) {
		if (k->sequence!=NULL && k->lemmatized!=PART_OF_A_LEMMATIZED_KEYWORD) {
			vector_ptr_add(res,new_KeyWord(k->weight,k->sequence,NULL));
		}
		k=k->next;
	}
}
qsort(res->tab,res->nbelems,sizeof(KeyWord*),(int(*)(const void*,const void*))cmp_keywords);
return res;
}
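The cmp_keywords comparator is not shown here. A sketch of a comparator compatible with the qsort call above, which passes pointers to the array cells (KeyWord**); the weight field name comes from the code above, but its exact type is an assumption, so the sketch compares rather than subtracts:

/* Hypothetical comparator sketch: heavier keywords sort first (descending weight). */
static int cmp_keywords_sketch(const void* a,const void* b) {
   const KeyWord* ka=*(const KeyWord* const*)a;
   const KeyWord* kb=*(const KeyWord* const*)b;
   if (ka->weight < kb->weight) return 1;
   if (ka->weight > kb->weight) return -1;
   return 0;
}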
Code example #6
/*
 * This function behaves in the same way as a main function, except that it does
 * not invoke the setBufferMode function.
 */
int main_LocateTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char text[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
int is_korean=0;
int tilde_negation_operator=1;
int selected_negation_operator=0;
int tagging=0;
int single_tags_only=0;
int match_word_boundaries=1;
MatchPolicy match_policy=LONGEST_MATCHES;
OutputPolicy output_policy=IGNORE_OUTPUTS;
AmbiguousOutputPolicy ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
VariableErrorPolicy variable_error_policy=IGNORE_VARIABLE_ERRORS;
int search_limit=NO_MATCH_LIMIT;
char foo;
vector_ptr* injected=new_vector_ptr();
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_LocateTfst,lopts_LocateTfst,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty .tfst name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(text,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'K': is_korean=1;
             match_word_boundaries=0;
             break;
   case 'l': search_limit=NO_MATCH_LIMIT; break;
   case 'g': if (options.vars()->optarg[0]=='\0') {
                error("You must specify an argument for negation operator\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             selected_negation_operator=1;
             if ((strcmp(options.vars()->optarg,"minus")==0) || (strcmp(options.vars()->optarg,"-")==0)) {
                 tilde_negation_operator=0;
             }
             else
             if ((strcmp(options.vars()->optarg,"tilde")!=0) && (strcmp(options.vars()->optarg,"~")!=0)) {
                 error("You must specify a valid argument for negation operator\n");
                 free_vector_ptr(injected);
                 return USAGE_ERROR_CODE;                 
             }
             break;
   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&search_limit,&foo) || search_limit<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid search limit argument: %s\n",options.vars()->optarg);
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             break;
   case 'S': match_policy=SHORTEST_MATCHES; break;
   case 'L': match_policy=LONGEST_MATCHES; break;
   case 'A': match_policy=ALL_MATCHES; break;
   case 'I': output_policy=IGNORE_OUTPUTS; break;
   case 'M': output_policy=MERGE_OUTPUTS; break;
   case 'R': output_policy=REPLACE_OUTPUTS; break;
   case 'X': variable_error_policy=EXIT_ON_VARIABLE_ERRORS; break;
   case 'Y': variable_error_policy=IGNORE_VARIABLE_ERRORS; break;
   case 'Z': variable_error_policy=BACKTRACK_ON_VARIABLE_ERRORS; break;
   case 'b': ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; break;
   case 'z': ambiguous_output_policy=IGNORE_AMBIGUOUS_OUTPUTS; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case 1: tagging=1; break;
   case 2: single_tags_only=1; break;
   case 3: match_word_boundaries=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'v': {
             unichar* key=u_strdup(options.vars()->optarg);
             unichar* value=u_strchr(key,'=');
             if (value==NULL) {
                error("Invalid variable injection: %s\n",options.vars()->optarg);
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             (*value)='\0';
             value++;
             value=u_strdup(value);
             vector_ptr_add(injected,key);
             vector_ptr_add(injected,value);
             break;
   }
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_LocateTfst[index].name);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_vector_ptr(injected);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_vector_ptr(injected);
  return SUCCESS_RETURN_CODE;
}

if (selected_negation_operator==0) {
    get_graph_compatibility_mode_by_file(&vec,&tilde_negation_operator);
}

char grammar[FILENAME_MAX];
char output[FILENAME_MAX];
strcpy(grammar,argv[options.vars()->optind]);
get_path(text,output);
strcat(output,"concord.ind");

int OK=locate_tfst(text,
                   grammar,
                   alphabet,
                   output,
                   &vec,
                   match_policy,
                   output_policy,
                   ambiguous_output_policy,
                   variable_error_policy,
                   search_limit,
                   is_korean,
                   tilde_negation_operator,
                   injected,
                   tagging,
                   single_tags_only,
                   match_word_boundaries);

free_vector_ptr(injected);

return (!OK);
}
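The "%d%c" trick used for the -n option above is worth isolating. A standalone sketch of the same validation idiom (the function name is illustrative): the extra %c only matches when characters follow the number, so exactly one successful conversion means the whole argument was a clean integer.

#include <stdio.h>

/* Returns 1 and stores the value if arg is a clean positive integer,
 * 0 otherwise (rejects inputs like "45gjh", "abc" or non-positive values). */
static int parse_positive_int_sketch(const char* arg,int* out) {
   char trailing;
   if (1!=sscanf(arg,"%d%c",out,&trailing) || *out<=0) {
      return 0;
   }
   return 1;
}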
Code example #7
File: LocatePattern.cpp Project: adri87/Q-A
/**
 * Allocates, initializes and returns a new locate_parameters structure.
 */
struct locate_parameters* new_locate_parameters() {
    struct locate_parameters* p=(struct locate_parameters*)malloc(sizeof(struct locate_parameters));
    if (p==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->tilde_negation_operator=1;
    p->useLocateCache=1;
    p->token_control=NULL;
    p->matching_patterns=NULL;
    p->current_compound_pattern=0;
    p->pattern_tree_root=NULL;
    /* We use -1 because there may be no space, {S} or {STOP} in the text */
    p->SPACE=-1;
    p->SENTENCE=-1;
    p->STOP=-1;
    p->tag_token_list=NULL;
#ifdef TRE_WCHAR
    p->filters=NULL;
    p->filter_match_index=NULL;
#endif
    p->DLC_tree=NULL;
    p->optimized_states=NULL;
    p->fst2=NULL;
    p->tokens=NULL;
    p->current_origin=-1;
    p->max_count_call=0;
    p->max_count_call_warning=0;
    p->buffer=NULL;
    p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
    p->space_policy=DONT_START_WITH_SPACE;
    p->matching_units=0;
    p->match_policy=LONGEST_MATCHES;
    p->output_policy=IGNORE_OUTPUTS;
    p->ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
    p->variable_error_policy=IGNORE_VARIABLE_ERRORS;
    p->match_list=NULL;
    p->number_of_matches=0;
    p->number_of_outputs=0;
    p->start_position_last_printed_match=-1;
    p->end_position_last_printed_match=-1;
    p->search_limit=0;
    p->input_variables=NULL;
    p->output_variables=NULL;
    p->nb_output_variables=0;
    p->stack=new_stack_unichar(TRANSDUCTION_STACK_SIZE);
    p->alphabet=NULL;
    p->morpho_dic_inf=NULL;
    p->morpho_dic_inf_free=NULL;
    p->morpho_dic_bin=NULL;
    p->morpho_dic_bin_free=NULL;
    p->n_morpho_dics=0;
    p->dic_variables=NULL;
    p->left_ctx_shift=0;
    p->left_ctx_base=0;
    p->protect_dic_chars=0;
    p->graph_depth=0;
    p->korean=NULL;
    p->jamo_tags=NULL;
    p->mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
    p->recyclable_wchart_buffer=(wchar_t*)malloc(sizeof(wchar_t)*SIZE_RECYCLABLE_WCHAR_T_BUFFER);
    if (p->recyclable_wchart_buffer==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->recyclable_unichar_buffer=(unichar*)malloc(sizeof(unichar)*SIZE_RECYCLABLE_UNICHAR_BUFFER);
    if (p->recyclable_unichar_buffer==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->size_recyclable_unichar_buffer = SIZE_RECYCLABLE_UNICHAR_BUFFER;
    p->failfast=NULL;
    p->match_cache_first=NULL;
    p->match_cache_last=NULL;
    p->match_cache=NULL;
    p->prv_alloc=NULL;
    p->prv_alloc_recycle=NULL;
    p->token_error_ctx.last_length=0;
    p->token_error_ctx.last_start=0;
    p->token_error_ctx.n_errors=0;
    p->token_error_ctx.n_matches_at_token_pos__locate=0;
    p->token_error_ctx.n_matches_at_token_pos__morphological_locate=0;
    p->counting_step.count_call=0;
    p->counting_step.count_cancel_trying=0;
    p->explore_depth=0;
    p->backup_memory_reserve=NULL;
    p->cached_match_vector=new_vector_ptr(16);
    p->fnc_locate_trace_step=NULL;
    p->private_param_locate_trace=NULL;
    memset(&(p->arabic),0,sizeof(ArabicTypoRules));
    p->is_in_cancel_state = 0;
    p->is_in_trace_state = 0;
    p->counting_step_count_cancel_trying_real_in_debug_or_trace = 0;
    return p;
}
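The allocate-then-check-then-fatal_alloc_error pattern appears three times inside this constructor. A hypothetical wrapper that factors it out (not part of the project; fatal_alloc_error is the project helper used above, its declaration is not shown here):

#include <stdlib.h>

/* Hypothetical helper sketch: one call instead of repeating
 * malloc + NULL-check + fatal_alloc_error at each allocation site. */
static void* malloc_or_die_sketch(size_t n,const char* caller) {
   void* p=malloc(n);
   if (p==NULL) {
      fatal_alloc_error(caller);   /* project helper: reports the error and exits */
   }
   return p;
}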
Code example #8
File: Tokenize.cpp Project: adri87/Q-A
int main_Tokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int mode=NORMAL;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) {
   switch(val) {
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'c': mode=CHAR_BY_CHAR; break;
   case 'w': mode=NORMAL; break;
   case 't': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty token file name\n");
             }
             strcpy(token_file,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
U_FILE* text;
U_FILE* out;
U_FILE* output;
U_FILE* enter;
char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];
Alphabet* alph=NULL;

get_snt_path(argv[vars->optind],text_cod);
strcat(text_cod,"text.cod");
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tokens.txt");
get_snt_path(argv[vars->optind],enter_pos);
strcat(enter_pos,"enter.pos");
text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (text==NULL) {
   fatal_error("Cannot open text file %s\n",argv[vars->optind]);
}
if (alphabet[0]!='\0') {
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      u_fclose(text);
      return 1;
   }
}
out=u_fopen(BINARY,text_cod,U_WRITE);
if (out==NULL) {
   error("Cannot create file %s\n",text_cod);
   u_fclose(text);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}
enter=u_fopen(BINARY,enter_pos,U_WRITE);
if (enter==NULL) {
   error("Cannot create file %s\n",enter_pos);
   u_fclose(text);
   u_fclose(out);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}


vector_ptr* tokens=new_vector_ptr(4096);
vector_int* n_occur=new_vector_int(4096);
vector_int* n_enter_pos=new_vector_int(4096);
struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
                                            (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);
if (token_file[0]!='\0') {
   load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur);
}

output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot create file %s\n",tokens_txt);
   u_fclose(text);
   u_fclose(out);
   u_fclose(enter);
   if (alph!=NULL) {
      free_alphabet(alph);
   }

   free_hash_table(hashtable);
   free_vector_ptr(tokens,free);
   free_vector_int(n_occur);
   free_vector_int(n_enter_pos);

   return 1;
}
u_fprintf(output,"0000000000\n");

int SENTENCES=0;
int TOKENS_TOTAL=0;
int WORDS_TOTAL=0;
int DIGITS_TOTAL=0;
u_printf("Tokenizing text...\n");
if (mode==NORMAL) {
   normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
else {
   char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
u_printf("\nDone.\n");
save_new_line_positions(enter,n_enter_pos);
u_fclose(enter);
u_fclose(text);
u_fclose(out);
u_fclose(output);
write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems);
// we compute some statistics
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"stats.n");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL);
   u_fclose(output);
}
// we save the tokens by frequency
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_freq.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_frequence(output,tokens,n_occur);
   u_fclose(output);
}
// we save the tokens by alphabetical order
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_alph.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_alph_order(output,tokens,n_occur);
   u_fclose(output);
}
free_hash_table(hashtable);
free_vector_ptr(tokens,free);
free_vector_int(n_occur);
free_vector_int(n_enter_pos);
if (alph!=NULL) {
   free_alphabet(alph);
}
free_OptVars(vars);
return 0;
}
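The error paths in main_Tokenize repeat the same close-and-free sequence with one extra resource each time. A hypothetical refactoring sketch of that pattern (not the project's code) using a single cleanup label; the file names are illustrative and only functions already used above are called:

/* Hypothetical refactoring sketch: one cleanup label instead of repeating the
 * close-and-free sequence in every error branch. */
int tokenize_files_sketch(int mask_encoding_compatibility_input) {
   int ret=1;
   U_FILE* text=NULL;
   U_FILE* out=NULL;
   text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,
                                            "corpus.snt",U_READ);
   if (text==NULL) goto cleanup;
   out=u_fopen(BINARY,"text.cod",U_WRITE);
   if (out==NULL) goto cleanup;
   /* ...tokenization work would happen here... */
   ret=0;
cleanup:
   if (out!=NULL) u_fclose(out);
   if (text!=NULL) u_fclose(text);
   return ret;
}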