/**
 * Creates the file 'fname' in write mode and returns a freshly allocated
 * Elag_fst_file_out descriptor for it, or NULL if the file cannot be opened.
 *
 * A placeholder line made of ten '0' digits is written first, and the file
 * position just before it is saved in 'fstart' (presumably so that the line
 * can be overwritten later -- to be confirmed against the closing code).
 * The descriptor keeps its own copy of 'fname'.
 *
 * Allocation failures and a 'type' outside [0;FST_BAD_TYPE[ are fatal errors.
 */
Elag_fst_file_out* fst_file_out_open(const VersatileEncodingConfig* vec,const char* fname,int type) {
Elag_fst_file_out* out=(Elag_fst_file_out*)malloc(sizeof(Elag_fst_file_out));
if (out==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
if (!(type>=0 && type<FST_BAD_TYPE)) {
   fatal_error("fst_file_out_open: bad FST_TYPE\n");
}
out->f=u_fopen(vec,fname,U_WRITE);
if (out->f==NULL) {
   error("fst_out_open: unable to open '%s'\n",fname);
   free(out);
   return NULL;
}
/* We note where the placeholder line begins before emitting it */
out->fstart=ftell(out->f);
u_fprintf(out->f,"0000000000\n");
out->name=strdup(fname);
if (out->name==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
out->type=type;
out->nb_automata=0;
out->labels=new_string_hash(16);
/* <E> is registered before any other label, so that this special tag
 * is the first entry of the label table */
get_value_index(EPSILON,out->labels);
return out;
}
Beispiel #2
0
/*
 * Returns the hash stored in the string object behind 'string', computing
 * and caching it first when the stored value is 0. A stored value of 0 is
 * thus treated as "not computed yet".
 */
uint32_t need_hash(V string)
{
	NewString *str = toNewString(string);

	if (str->hash != 0)
	{
		return str->hash;
	}
	str->hash = new_string_hash(str->size, str->text);
	return str->hash;
}
Beispiel #3
0
/**
 * Loads the lines of a text file into a string_hash and returns it, or NULL
 * if the file can not be opened. We arbitrary fix the limit of a line to 4096
 * characters. Each line is splitted into a key and a value, according to a
 * given separator character. An error message will be printed if a line does not
 * contain the separator character, if an empty line is found, or if a line contains
 * an empty key. In case of empty values, the empty string will be used.
 * Note that keys and values can contain characters protected with the \ character,
 * including protected new lines like:
 *
 * 123\
 * =ONE_TWO_THREE_NEW_LINE
 *
 */
struct string_hash* load_key_value_list(const char* name,int mask_encoding_compatibility_input,unichar separator) {
U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ);
if (f==NULL) return NULL;
struct string_hash* hash=new_string_hash();
unichar temp[4096];
unichar key[4096];
unichar value[4096];
/* We build a string with the separator character */
unichar stop[2];
stop[0]=separator;
stop[1]='\0';
int code;
while (EOF!=(code=u_fgets2(temp,f))) {
   if (code==0) {
      error("Empty line\n");
   }
   else {
      /* First, we try to read a non empty key */
      int pos=0;
      code=parse_string(temp,&pos,key,stop);
      if (code==P_BACKSLASH_AT_END) {
         error("Backslash at end of line:<%S>\n\n",temp);
      }
      else if (pos==0 &&temp[pos]=='\0') {
         /* Empty line */
    	  continue;
      }
      else if (pos==0) {
         /* If the line starts with the separator */
         error("Line with empty key:\n<%S>\n",temp);
      }
      else {
         /* We jump over the separator */
         pos++;
         /* We initialize 'value' with the empty string in case it is not
          * defined in the file */
         value[0]='\0';
         if(P_BACKSLASH_AT_END==parse_string(temp,&pos,value,P_EMPTY)) {
            error("Backslash at end of line:\n<%S>\n",temp);
         }
         else {
            /* If we have a valid (key,value) pair, we insert it into the string_hash */
            get_value_index(key,hash,INSERT_IF_NEEDED,value);
         }
      }
   }
}
u_fclose(f);
return hash;
}
/**
 * Builds an empty DELA tree: a value-less string_hash for the inflected
 * forms, and an array of 256 entry-list pointers whose cells are left
 * uninitialized ('size' is 0). Any allocation failure is fatal.
 */
struct DELA_tree* new_DELA_tree() {
struct DELA_tree* t=(struct DELA_tree*)malloc(sizeof(struct DELA_tree));
if (t==NULL) {
   fatal_alloc_error("new_DELA_tree");
}
t->inflected_forms=new_string_hash(DONT_USE_VALUES);
t->size=0;
t->capacity=256;
t->dela_entries=(struct dela_entry_list**)malloc(t->capacity*sizeof(struct dela_entry_list*));
if (t->dela_entries!=NULL) {
   return t;
}
fatal_alloc_error("new_DELA_tree");
return t;
}
Beispiel #5
0
/**
 * Loads the lines of a text file info a string_hash and returns it, or NULL
 * if the file can not be opened. We arbitrary fix the limit of a line to 4096
 * characters. For each line, we ignore the carriage return, if any, and we use
 * the remaining string as key and value. An error message will be printed if
 * an empty line is found.
 */
struct string_hash* load_key_list(const char* name,int mask_encoding_compatibility_input) {
U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ);
if (f==NULL) return NULL;
struct string_hash* hash=new_string_hash(DONT_USE_VALUES);
unichar temp[4096];
while (EOF!=u_fgets_limit2(temp,4096,f)) {
   if (temp[0]=='\0') {
      error("Empty line in %s\n",name);
   } else {
      get_value_index(temp,hash);
   }
}
u_fclose(f);
return hash;
}
/**
 * We remove every keyword that is tagged with the forbidden code. If
 * a forbidden keyword has several tags, all of them are removed:
 *
 * the,.DET + the,.XXX => all 'the' keywords are removed
 */
struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) {
struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE);
Ustring* tmp=new_Ustring();
for (int i=0;i<keywords->size;i++) {
	KeyWord* list=(KeyWord*)(keywords->value[i]);
	while (list!=NULL) {
		if (get_forbidden_keyword(list,code,tmp)) {
			get_value_index(tmp->str,hash);
		}
		list=list->next;
	}
}
free_Ustring(tmp);
return hash;
}
Beispiel #7
0
/**
 * Allocates a string_hash_ptr whose 'value' array has room for 'capacity'
 * pointers. The embedded string_hash is created without unichar* values,
 * since the pointers of the 'value' array play that role. The cells of
 * 'value' are left uninitialized and 'size' starts at 0.
 * Any allocation failure is fatal.
 */
struct string_hash_ptr* new_string_hash_ptr(int capacity) {
struct string_hash_ptr* h=(struct string_hash_ptr*)malloc(sizeof(struct string_hash_ptr));
if (h==NULL) {
   fatal_alloc_error("new_string_hash_ptr");
}
/* The normal string hash is only used for its keys */
h->hash=new_string_hash(DONT_USE_VALUES);
h->capacity=capacity;
h->size=0;
void** cells=(void**)malloc(capacity*sizeof(void*));
h->value=cells;
if (cells==NULL) {
   fatal_alloc_error("new_string_hash_ptr");
}
return h;
}
/**
 * Creates an empty token tree with the given allocator: a value-less
 * string_hash plus a transition array of 2 cells (left uninitialized).
 * Any allocation failure is fatal.
 */
struct fst2txt_token_tree* new_fst2txt_token_tree(Abstract_allocator prv_alloc) {
struct fst2txt_token_tree* tree=(struct fst2txt_token_tree*)malloc_cb(sizeof(struct fst2txt_token_tree),prv_alloc);
if (tree==NULL) {
   fatal_alloc_error("new_fst2txt_token_tree");
}
tree->hash=new_string_hash(DONT_USE_VALUES);
/* The initial capacity is kept small because one such structure
 * is created for each state of the fst2 */
tree->capacity=2;
tree->size=0;
tree->transition_array=(Transition**)malloc_cb(tree->capacity*sizeof(Transition*), prv_alloc);
if (tree->transition_array!=NULL) {
   return tree;
}
fatal_alloc_error("new_fst2txt_token_tree");
return tree;
}
/**
 * This function constructs and returns a token tree from a normalization grammar.
 * Tokens are represented by integers.
 */
struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar,
		const Alphabet* alph,struct text_tokens* tok) {
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free);
if (fst2==NULL) {
   return NULL;
}
struct string_hash* hash=new_string_hash(DONT_USE_VALUES);
/* We create the token tree to speed up the consultation */
for (int i=0;i<tok->N;i++) {
   get_value_index(tok->token[i],hash);
}
struct normalization_tree* root=new_normalization_tree();
explore_normalization_fst2(fst2,fst2->initial_states[1],root,hash,U_EMPTY,alph,NULL);
free_abstract_Fst2(fst2,&fst2_free);
free_string_hash(hash);
return root;
}
Beispiel #10
0
/*
 * Small demonstration: the same three words are stored in a string tree
 * and in a string hash, then one word is looked up in each container and
 * a message is printed depending on whether it was found.
 */
int main(void)
{
	string_tree tree;
	string_hash hash;
	it_string_tree tree_hit;
	it_string_hash hash_hit;

	/* Both containers must be set up before anything is inserted. */
	if (!new_string_tree(&tree)) {
		fprintf(stderr, "Error allocating tree.\n");
		exit(1);
	}
	if (!new_string_hash(&hash)) {
		fprintf(stderr, "Error allocating hash.\n");
		exit(1);
	}

	/* Fill the tree first, then the hash, with the same words. */
	insert_string_tree(&tree, "cat");
	insert_string_tree(&tree, "dog");
	insert_string_tree(&tree, "mouse");

	insert_string_hash(&hash, "cat");
	insert_string_hash(&hash, "dog");
	insert_string_hash(&hash, "mouse");

	tree_hit = get_string_tree(&tree, "dog");
	if (tree_hit) {
		printf("%s barks woof!\n", tree_hit->value);
	} else {
		printf("Dog not found.\n");
	}

	hash_hit = get_string_hash(&hash, "cat");
	if (hash_hit) {
		printf("%s say meeow!\n", hash_hit->value);
	} else {
		printf("Cat not found.\n");
	}

	free_string_tree(&tree);
	free_string_hash(&hash);
	return 0;
}
Beispiel #11
0
/**
 * Applies the grammar 'fst2_name' to the tokenized text 'text_cod'.
 *
 * The function maps the text, loads the alphabet (if any), the grammar,
 * the token list, the morphological dictionaries and the dlf/dlc
 * dictionaries, prepares the pattern and state optimizations, and finally
 * calls launch_locate. Matches are written to "concord.ind" and extra
 * information to "concord.n"; both names are appended as is to
 * 'dynamicDir', which must therefore end with a path separator.
 *
 * A value of -1 for 'max_count_call' or 'max_count_call_warning' is
 * replaced by the number of tokens of the text.
 *
 * Returns 1 on success. Returns 0 if the output file cannot be created,
 * if the alphabet, the grammar or the token list cannot be loaded, if
 * filters cannot be compiled/optimized, or if a cancel was requested.
 * A failure to create "concord.n" is only reported; 'info' is then NULL.
 *
 * In every error path, the mapped text is released before
 * free_locate_parameters(p) is called, since releasing it reads fields
 * of 'p'.
 */
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err,
                   const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy,
                   Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input,
                   const char* dynamicDir,TokenizationPolicy tokenization_policy,
                   SpacePolicy space_policy,int search_limit,const char* morpho_dic_list,
                   AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,
                   int is_korean,int max_count_call,int max_count_call_warning,
                   char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) {

    U_FILE* out;
    U_FILE* info;
    struct locate_parameters* p=new_locate_parameters();
    p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
    p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
    /* The text is an array of int token numbers */
    long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
    p->buffer_size=(int)text_size;
    p->tilde_negation_operator=tilde_negation_operator;
    p->useLocateCache=useLocateCache;
    if (max_count_call == -1) {
        max_count_call = (int)text_size;
    }
    if (max_count_call_warning == -1) {
        max_count_call_warning = (int)text_size;
    }
    p->match_policy=match_policy;
    p->tokenization_policy=tokenization_policy;
    p->space_policy=space_policy;
    p->output_policy=output_policy;
    p->search_limit=search_limit;
    p->ambiguous_output_policy=ambiguous_output_policy;
    p->variable_error_policy=variable_error_policy;
    p->protect_dic_chars=protect_dic_chars;
    p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
    p->max_count_call = max_count_call;
    p->max_count_call_warning = max_count_call_warning;
    p->token_filename = tokens;
    char concord[FILENAME_MAX];
    char concord_info[FILENAME_MAX];

    strcpy(concord,dynamicDir);
    strcat(concord,"concord.ind");

    strcpy(concord_info,dynamicDir);
    strcat(concord_info,"concord.n");

    char morpho_bin[FILENAME_MAX];
    strcpy(morpho_bin,dynamicDir);
    strcat(morpho_bin,"morpho.bin");
    if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
        load_arabic_typo_rules(arabic_rules,&(p->arabic));
    }
    out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
    if (out==NULL) {
        /* Nothing to close here: 'out' was never opened */
        error("Cannot write %s\n",concord);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        return 0;
    }
    info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
    if (info==NULL) {
        /* Not fatal: we go on without the information file */
        error("Cannot write %s\n",concord_info);
    }
    /* The first line of the output file encodes the output policy */
    switch(output_policy) {
    case IGNORE_OUTPUTS:
        u_fprintf(out,"#I\n");
        break;
    case MERGE_OUTPUTS:
        u_fprintf(out,"#M\n");
        break;
    case REPLACE_OUTPUTS:
        u_fprintf(out,"#R\n");
        break;
    }
    if (alphabet!=NULL && alphabet[0]!='\0') {
        u_printf("Loading alphabet...\n");
        p->alphabet=load_alphabet(alphabet,is_korean);
        if (p->alphabet==NULL) {
            error("Cannot load alphabet file %s\n",alphabet);
            af_release_mapfile_pointer(p->text_cod,p->buffer);
            af_close_mapfile(p->text_cod);
            free_stack_unichar(p->stack);
            free_locate_parameters(p);
            if (info!=NULL) u_fclose(info);
            u_fclose(out);
            return 0;
        }
    }
    struct string_hash* semantic_codes=new_string_hash();
    extract_semantic_codes(dlf,semantic_codes);
    extract_semantic_codes(dlc,semantic_codes);

    if (is_cancelling_requested() != 0) {
        error("user cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    u_printf("Loading fst2...\n");
    struct FST2_free_info fst2load_free;
    Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
    if (fst2load==NULL) {
        error("Cannot load grammar %s\n",fst2_name);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);


    /* We work on a clone built with our allocator, so the loaded grammar
     * can be released right away */
    p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
    free_abstract_Fst2(fst2load,&fst2load_free);

    if (is_cancelling_requested() != 0) {
        error("User cancel request..\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
    p->filters=new_FilterSet(p->fst2,p->alphabet);
    if (p->filters==NULL) {
        error("Cannot compile filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        /* The mapped text must be released while 'p' is still valid */
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif
    u_printf("Loading token list...\n");
    int n_text_tokens=0;

    p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
    if (p->tokens==NULL) {
        error("Cannot load token list %s\n",tokens);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        /* The mapped text must be released while 'p' is still valid */
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
    Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;

    p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
    /* The allocation must be checked before the array is cleared */
    if (p->match_cache==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));

#ifdef TRE_WCHAR
    p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
    if (p->filter_match_index==NULL) {
        error("Cannot optimize filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_string_hash(p->tokens);
        close_abstract_allocator(locate_abstract_allocator);
        /* The mapped text must be released while 'p' is still valid */
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif

    if (allow_trace!=0) {
        open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
    }
    extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
    u_printf("Loading morphological dictionaries...\n");
    load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
    extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
    p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
    if (p->token_control==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
    if (p->matching_patterns==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    for (int i=0; i<n_text_tokens; i++) {
        p->token_control[i]=0;
        p->matching_patterns[i]=NULL;
    }
    compute_token_controls(p->alphabet,err,p);
    int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
    p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
    u_printf("Computing fst2 tags...\n");
    process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
    p->current_compound_pattern=number_of_patterns;
    p->DLC_tree=new_DLC_tree(p->tokens->size);
    struct lemma_node* root=new_lemma_node();
    u_printf("Loading dlf...\n");
    load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    u_printf("Loading dlc...\n");
    load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    /* We look if tag tokens like "{today,.ADV}" verify some patterns */
    check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
    u_printf("Optimizing fst2 pattern tags...\n");
    optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
    u_printf("Optimizing compound word dictionary...\n");
    optimize_DLC(p->DLC_tree);
    free_string_hash(semantic_codes);
    int nb_input_variable=0;
    p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
    p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);


    Abstract_allocator locate_recycle_abstract_allocator=NULL;
    locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
                                      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
                                      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));

    u_printf("Optimizing fst2...\n");
    p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
    if (is_korean) {
        p->korean=new Korean(p->alphabet);
        p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
    }
    p->failfast=new_bit_array(n_text_tokens,ONE_BIT);

    u_printf("Working...\n");
    p->prv_alloc=locate_work_abstract_allocator;
    p->prv_alloc_recycle=locate_recycle_abstract_allocator;
    launch_locate(out,text_size,info,p);
    if (allow_trace!=0) {
        close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
    }
    free_bit_array(p->failfast);
    free_Variables(p->input_variables);
    free_OutputVariables(p->output_variables);
    af_release_mapfile_pointer(p->text_cod,p->buffer);
    af_close_mapfile(p->text_cod);
    if (info!=NULL) u_fclose(info);
    u_fclose(out);

    if (p->match_cache!=NULL) {
        for (int i=0; i<p->tokens->size; i++) {
            free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
        }
        free_cb(p->match_cache,locate_work_abstract_allocator);
    }
    /* When the allocator frees everything by itself on close, we skip the
     * item by item release of the objects it owns */
    int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;

    if (free_abstract_allocator_item) {
        free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
    }
    free_stack_unichar(p->stack);
    /** Too long to free the DLC tree if it is big
     * free_DLC_tree(p->DLC_tree);
     */
    if (free_abstract_allocator_item) {
        free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
        free_Fst2(p->fst2,locate_abstract_allocator);
        free_list_int(p->tag_token_list,locate_abstract_allocator);
    }
    close_abstract_allocator(locate_abstract_allocator);
    close_abstract_allocator(locate_recycle_abstract_allocator);
    locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;

    /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */
    free_alphabet(p->alphabet);
    if (p->korean!=NULL) {
        delete p->korean;
    }
    if (p->jamo_tags!=NULL) {
        /* jamo tags must be freed before tokens, because we need to know how
         * many jamo tags there are, and this number is the number of tokens */
        for (int i=0; i<p->tokens->size; i++) {
            free(p->jamo_tags[i]);
        }
        free(p->jamo_tags);
    }
    free_string_hash(p->tokens);
    free_lemma_node(root);
    free(p->token_control);
    for (int i=0; i<n_text_tokens; i++) {
        free_bit_array(p->matching_patterns[i]);
    }
    free(p->matching_patterns);
#ifdef TRE_WCHAR
    free_FilterSet(p->filters);
    free_FilterMatchIndex(p->filter_match_index);
#endif
    for (int i=0; i<p->n_morpho_dics; i++) {
        free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
        free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
    }
    free(p->morpho_dic_inf);
    free(p->morpho_dic_inf_free);
    free(p->morpho_dic_bin);
    free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
    free_DLC_tree(p->DLC_tree);
#endif
    free_locate_parameters(p);
    u_printf("Done.\n");
    return 1;
}
Beispiel #12
0
int main_CheckDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

int is_a_DELAF=-1;
int strict_unprotected=0;
int skip_path=0;
char alph[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int space_warnings=1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) {
   switch(val) {
   case 'f': is_a_DELAF=1; break;
   case 's': is_a_DELAF=0; break;
   case 'h': usage(); return 0;
   case 'r': strict_unprotected=1; break;
   case 't': strict_unprotected=0; break;
   case 'n': space_warnings=0; break;
   case 'p': skip_path=1; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("Empty alphabet argument\n");
             }
             strcpy(alph,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (is_a_DELAF==-1 || vars->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}

U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (dic==NULL) {
	fatal_error("Cannot open dictionary %s\n",argv[vars->optind]);
}
Alphabet* alphabet0=NULL;
if (alph[0]!='\0') {
   alphabet0=load_alphabet(alph,1);
}
char output_filename[FILENAME_MAX];
get_path(argv[vars->optind],output_filename);
strcat(output_filename,"CHECK_DIC.TXT");
U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE);
if (out==NULL) {
	u_fclose(dic);
	fatal_error("Cannot create %s\n",output_filename);
}
u_printf("Checking %s...\n",argv[vars->optind]);
unichar line[CHECKDIC_LINE_SIZE];
int line_number=1;
/*
 * We declare and initialize an array in order to know which
 * letters are used in the dictionary.
 */
int i;
char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
if (alphabet==NULL) {
	fatal_alloc_error("CheckDic's main");
}
memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
/*
 * We use two structures for the storage of the codes found in the
 * dictionary. Note that 'semantic_codes' is used to store both grammatical and
 * semantic codes.
 */
struct string_hash* semantic_codes=new_string_hash();
struct string_hash* inflectional_codes=new_string_hash();
struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES);
struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES);
int n_simple_entries=0;
int n_compound_entries=0;
/*
 * We read all the lines and check them.
 */
while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) {
   if (line[0]=='\0') {
		/* If we have an empty line, we print a unicode error message
		 * into the output file */
		u_fprintf(out,"Line %d: empty line\n",line_number);
	}
	else if (line[0]=='/') {
		/* If a line starts with '/', it is a commment line, so
		 * we ignore it */
	}
	else {
		/* If we have a line to check, we check it according to the
		 * dictionary type */
		check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes,
		                inflectional_codes,simple_lemmas,compound_lemmas,
		                &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected);
	}
	/* At regular intervals, we display a message on the standard
	 * output to show that the program is working */
	if (line_number%10000==0) {
		u_printf("%d lines read...\r",line_number);
	}
	line_number++;
}
u_printf("%d lines read\n",line_number-1);
u_fclose(dic);
/*
 * Once we have checked all the lines, we print some informations
 * in the output file.
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"-------------  Stats  -------------\n");
u_fprintf(out,"-----------------------------------\n");
if (skip_path != 0) { 
    char filename_without_path[FILENAME_MAX];
    remove_path(argv[vars->optind],filename_without_path);
    u_fprintf(out,"File: %s\n",filename_without_path);
}
else {
    u_fprintf(out,"File: %s\n",argv[vars->optind]);
}
u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS");
u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":"");
u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":"");
u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":"");
/**
 * We print the list of the characters that are used, with
 * their unicode numbers shown in hexadecimal. This can be useful
 * to detect different characters that are graphically identical
 * like 'A' (upper of latin 'a' or upper of greek alpha ?).
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"----  All chars used in forms  ----\n");
u_fprintf(out,"-----------------------------------\n");
unichar r[4];
unichar r2[7];
r[1]=' ';
r[2]='(';
r[3]='\0';
r2[5]='\n';
r2[6]='\0';
for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) {
	if (alphabet[i]) {
      u_fprintf(out,"%C (%04X)\n",i,i);
	}
}
/*
 * Then we print the list of all grammatical and semantic codes used in the
 * dictionary. If a code contains a non ASCII character, a space or a tabulation,
 * we print a warning.
 */
u_fprintf(out,"-------------------------------------------------------------\n");
u_fprintf(out,"----  %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-------------------------------------------------------------\n");
unichar comment[2000];
for (i=0;i<semantic_codes->size;i++) {
	/* We print the code, followed if necessary by a warning */
	u_fprintf(out,"%S",semantic_codes->value[i]);
	if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
/*
 * Finally, we print the list of inflectional codes,
 * with warnings in the case of non ASCII letters, spaces
 * or tabulations.
 */
u_fprintf(out,"-----------------------------------------------------\n");
u_fprintf(out,"----  %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-----------------------------------------------------\n");


for (i=0;i<inflectional_codes->size;i++) {
	u_fprintf(out,"%S",inflectional_codes->value[i]);
	if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
u_fclose(out);
free_OptVars(vars);
u_printf("Done.\n");
/* Note that we don't free anything since it would only waste time */

free(alphabet);
if (alphabet0!=NULL) {
   free_alphabet(alphabet0);
}
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
/* cleanup for no leak on library */
free_string_hash(semantic_codes);
free_string_hash(inflectional_codes);
free_string_hash(simple_lemmas);
free_string_hash(compound_lemmas);
#endif
return 0;
}
Beispiel #13
0
/**
 * Creates a string_hash with the default capacity
 * (DEFAULT_STRING_HASH_SIZE) and the ENLARGE_IF_NEEDED bound policy,
 * by delegating to the two-argument overload.
 */
struct string_hash* new_string_hash() {
struct string_hash* hash=new_string_hash(DEFAULT_STRING_HASH_SIZE,ENLARGE_IF_NEEDED);
return hash;
}
Beispiel #14
0
/**
 * Creates a string_hash with the given capacity and the
 * ENLARGE_IF_NEEDED bound policy, by delegating to the two-argument
 * overload.
 */
struct string_hash* new_string_hash(int capacity) {
struct string_hash* hash=new_string_hash(capacity,ENLARGE_IF_NEEDED);
return hash;
}
Beispiel #15
0
/**
 * This function reads the text file 'fin', produces a normalized version of its
 * content and writes it into the file 'fout'.
 * The following rules are applied in the given order:
 *
 * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or
 *    a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any,
 *    according to the replacement rules. Otherwise, we let the token unchanged.
 * 2) If there is one or more replacement rules that can apply to the current
 *    position in the input, then we apply the longest one.
 * 3) If we find a separator (space, tab, new line) sequence, we replace it:
 *    - by a new line if the sequence contains one and if 'carriage_return_policy' is
 *      set to KEEP_CARRIAGE_RETURN;
 *    - by a space otherwise.
 * 4) Otherwise, we copy the character that was read to the output.
 *
 * Parameters:
 * - fin: name of the text file to read;
 * - fout: name of the file to create;
 * - encoding_output, bom_output: given to u_fopen_creating_versatile_encoding
 *   when 'fout' is created;
 * - mask_encoding_compatibility_input: given to the functions that open 'fin'
 *   and that load the 'rules' file;
 * - carriage_return_policy: see rule 3 above;
 * - rules: name of the replacement rules file, loaded with load_key_value_list
 *   using the tabulation as key/value separator. It can be NULL or empty. If it
 *   cannot be loaded, an error message is printed and the normalization goes on
 *   with a fresh string_hash.
 *
 * Whatever the content of 'rules', we make sure that there are replacement rules
 * for { and }: if none was given, the defaults are { -> [ and } -> ].
 *
 * Returns 1 if 'fin' cannot be opened or if 'fout' cannot be created; 0 otherwise.
 */
int normalize(const char *fin, const char *fout, 
              Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
              int carriage_return_policy, const char *rules) {
	U_FILE* input;
	input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ);
	if (input == NULL) {
		error("Cannot open file %s\n", fin);
		return 1;
	}

	U_FILE* output;
	output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE);
	if (output == NULL) {
		error("Cannot create file %s\n", fout);
		/* We must not leak the input file that was successfully opened */
		u_fclose(input);
		return 1;
	}

	struct string_hash* replacements=NULL;
	if(rules != NULL && rules[0]!='\0') {
		replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t');
		if (replacements==NULL) {
		   /* Failing to load the rules is not fatal: we report it and we go on
		    * with a fresh string_hash */
		   error("Cannot load replacement rules file %s\n", rules);
		   replacements=new_string_hash();
		}
	}
	/* If there is no replacement rules file, we simulate one */
	else {
	   replacements=new_string_hash();
	}

	/* If there is a replacement rule file, we ensure that there are replacement
	 * rules for { and }. If not, we add our default ones, so that in any case,
	 * we are sure to have rules for { and } */
	unichar key[2];
	unichar value[2];
	u_strcpy(key,"{");
	u_strcpy(value,"[");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);
	u_strcpy(key,"}");
	u_strcpy(value,"]");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);

    /* Everything that is produced goes through this output buffer */
    struct OUTBUF OutBuf;
    OutBuf.pos=0;
	/* 'tmp' receives the candidate {...} sequences read by rule 1 */
	unichar tmp[MAX_TAG_LENGTH];
	//struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input);

    /* We compute the size of the input file, and then we restore the
     * position we had in it */
    long save_pos=ftell(input);
    fseek(input,0,SEEK_END);
    long file_size_input=ftell(input);
    fseek(input,save_pos,SEEK_SET);

    /* The line buffer does not need to be larger than the file itself, and it
     * is bounded by MAX_LINE_BUFFER_SIZE anyway */
    int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE);

    /* We allocate 0x10 extra characters as a safety margin after the buffer */
    unichar *line_read;
    line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar));
    if (line_read==NULL) {
        fatal_alloc_error("normalize");
    }

	/* We define some things that will be used for parsing the buffer */


    static const unichar stop_chars[]= { '{', '}', 0 };
    static const unichar forbidden_chars[]= { '\n', 0 };
    static const unichar open_bracket[]= { '{', 0 };
    static const unichar close_bracket[]= { '}', 0 };
    static const unichar empty_string[]= { 0 };

   /* 'corrupted_file' is used to print the NULL character warning only once */
   int corrupted_file=0;
   int eof_found=0;
   /* First, we fill the buffer */
	
    /* 1 if the last chunk that was read ended with a new line */
    int lastline_was_terminated=0;

    /* Each iteration reads one chunk of input into 'line_read' and normalizes it */
    while (eof_found==0) {
        int current_start_pos=0;
        int found_null=0;
        /* 'buff' is a read-only view on 'line_read' */
        const unichar*buff=line_read;
        int result_read = 0;

        result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null);
        if ((found_null != 0) && (corrupted_file==0)) {
          corrupted_file=1;
          error("Corrupted text file containing NULL characters!\n");
          error("They have been ignored by Normalize, but you should clean your text\n");
        }

        /* A trailing carriage return is turned into a new line */
        if (result_read>0)
            if (line_read[result_read-1]==0x0d)
                line_read[result_read-1]='\n';
        
        if (result_read==EOF)
            break;

        /* If the previous chunk ended with a new line, we skip the separators
         * at the beginning of this one (presumably because the separator
         * sequence they belong to has already been printed -- to be confirmed) */
        if (lastline_was_terminated != 0)
            while (current_start_pos<result_read) {
                if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t'
							    && buff[current_start_pos]!=0x0d
                                && buff[current_start_pos]!='\n')
                                break;
                current_start_pos++;
            }

        /* We note if the chunk we have just read ends with a new line */
        lastline_was_terminated = 0;
        if (result_read > 0)
            if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d))
                lastline_was_terminated = 1;


        /* Now, we normalize the content of the buffer */
        while (current_start_pos<result_read) {
            /* If the chunk does not end with a new line, if the end of file was not
             * reached and if there are at most MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE
             * characters left to process, we move these characters at the beginning
             * of the buffer and we read more input after them, so that a sequence
             * is not cut by the end of the buffer */
            if ((lastline_was_terminated == 0) && (eof_found == 0) && 
                (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read))
            {
                int i;
                int nb_to_keep = result_read-current_start_pos;
                for (i=0;i<nb_to_keep;i++)
                    line_read[i]=line_read[current_start_pos+i];
                int found_null_read=0;
                int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read);

                if ((found_null_read != 0) && (corrupted_file==0)) {
                    corrupted_file=1;
                    error("Corrupted text file containing NULL characters!\n");
                    error("They have been ignored by Normalize, but you should clean your text\n");
                }

                /* Same trailing carriage return conversion as for the first read */
                if (result_read_continue>0)
                    if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d)
                        line_read[(result_read_continue+nb_to_keep)-1]='\n';
                lastline_was_terminated = 0;
                /* At the end of file, there is nothing more to read: we set both
                 * flags so that we will not try to refill the buffer again */
                if (result_read_continue==EOF)
                    eof_found = lastline_was_terminated = 1;

                if (result_read_continue > 0)
                    if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d))
                        lastline_was_terminated = 1;

                /* The buffer now contains the characters we kept, followed by
                 * the ones we have just read, if any */
                result_read = nb_to_keep;
                current_start_pos = 0;

                if (result_read_continue > 0)
                    result_read += result_read_continue;
            }

		if (buff[current_start_pos]=='{') {
			/* If we have a {, we try to find a sequence like {....}, that does not contain
			 * new lines. If the sequence contains protected character, we want to keep them
			 * protected. */
			int old_position=current_start_pos;
			/* If we don't increase the position, the parse will stop on the initial { */
			current_start_pos++;
			tmp[0]='{';
			int code=parse_string(buff,&current_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL);
			if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') {
				/* If we have found a new line or a {, or if there is
				 * a backslash at the end of the buffer, or if we have reached the end
				 * of the buffer, we assume that the initial
				 * { was not a tag beginning, so we print the substitute of { */
				WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
				/* And we rewind the current position after the { */
				current_start_pos=old_position+1;
			}
			else {
				/* If we have read a sequence like {....}, we assume that there won't be
				 * a buffer overflow if we add the } */
				u_strcat(tmp,close_bracket);
				if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) {
					/* If this is a special tag or a valid tag token, we just print
					 * it to the output */
					WriteOufBuf(&OutBuf,tmp,output, 0);
					/* And we skip the } */
					current_start_pos++;
				}
				else {
					/* If we have a non valid tag token, we print the equivalent of {
					 * and we rewind the current position after the { */
					WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
					current_start_pos=old_position+1;
				}
			}
		}
		else {
			/* If we have a character that is not {, first we try to look if there
			 * is a replacement to do */
			int key_length;
			int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements);
			if (index!=NO_VALUE_INDEX) {
				/* If there is something to replace */
				WriteOufBuf(&OutBuf,replacements->value[index],output, 0);
				current_start_pos=current_start_pos+key_length;
			}
			else {
				if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
					/* If we have a separator, we try to read the longest separator sequence
					 * that we can read. By the way, we note if it contains a new line */
					int new_line=0;
					while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t'
							|| buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
						/* Note 1: no bound check is needed, since an unichar buffer is always
						 *        ended by a \0
						 *
						 * Note 2: we don't take into account the case of a buffer ended by
						 *         separator while it's not the end of file: that would mean
						 *         that the text contains something like MARGIN_BEFORE_BUFFER_END
						 *         contiguous separators. Such a text would not be a reasonable one.
						 */
						if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
							new_line=1;
						}
						current_start_pos++;
					}
					if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) {
						/* We print a new line if the sequence contains one and if we are
						 * allowed to; otherwise, we print a space. */
						WriteOufBuf(&OutBuf,'\n',output, 0);
					}
					else {
						WriteOufBuf(&OutBuf,' ',output, 0);
					}
				}
				else {
					/* If, finally, we have a normal character to normalize, we just print it */
                    WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0);
				}
			}
		}
	    }
    }


    /* Final call with an empty string and a last argument set to 1 instead of 0:
     * presumably this flushes what remains in the output buffer -- to be
     * confirmed against WriteOufBuf */
    WriteOufBuf(&OutBuf,empty_string,output, 1);

	free(line_read);
	free_string_hash(replacements);

	u_fclose(input);
	u_fclose(output);
	return 0;
}
Beispiel #16
0
/**
 * Entry point of the PolyLex program: it analyses the unknown words of a word
 * list file with the analyser of the selected language.
 *
 * Options, as returned by the option parser:
 *   'D', 'G', 'N', 'R': language (Dutch, German, Norwegian, Russian); mandatory
 *   'a': alphabet file (optional)
 *   'd': .bin dictionary to use; mandatory. The .inf file name is obtained
 *        by replacing the last 3 characters of this name by "inf"
 *   'o': output dictionary, opened in append mode; mandatory
 *   'i': information (debug) file (optional)
 *   'k': input encoding
 *   'q': output encoding
 *   'V': only verify the arguments, then return
 *   'h': print the usage, then return
 *
 * The only remaining argument must be the word list file. The words that are
 * still unknown are written into a "<word list>.tmp" file that finally replaces
 * the word list file.
 *
 * Returns SUCCESS_RETURN_CODE on success (and also when the word list file
 * cannot be opened, see below), USAGE_ERROR_CODE for invalid arguments or if the
 * alphabet cannot be loaded, and DEFAULT_ERROR_CODE if the dictionary, the
 * temporary file or the result file cannot be opened.
 */
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int language=-1;
char alphabet[FILENAME_MAX]="";
char name_bin[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty dictionary file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(name_bin,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty information file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(info,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   /* We reset the long option index before parsing the next option */
   index=-1;
}

/* There must be exactly one remaining argument: the word list file */
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (name_bin[0]=='\0') {
   error("You must specify the .bin dictionary to use\n");
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output dictionary file name\n");
   return USAGE_ERROR_CODE;
}

if (language==-1) {
   error("You must specify the language\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  /* Nothing has been allocated yet, so there is nothing to free */
  return SUCCESS_RETURN_CODE;
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return USAGE_ERROR_CODE;
   }
}

char name_inf[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   /* For these languages, we look for a ForbiddenWords.txt file in the
    * directory of the .bin dictionary */
   get_path(name_bin,name_inf);
   strcat(name_inf,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(&vec,name_inf);
   if (forbiddenWords==NULL) {
       /* If there was no file, we don't want to block the process */
       forbiddenWords=new_string_hash(DONT_USE_VALUES);
   }
}

/* The .inf name is the .bin name with its last 3 characters replaced by "inf".
 * We must check that there are at least 3 characters, since otherwise
 * strlen(name_bin)-3 would wrap around and we would write out of the array */
size_t name_bin_length=strlen(name_bin);
if (name_bin_length<3) {
   error("Invalid .bin dictionary name %s\n",name_bin);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return USAGE_ERROR_CODE;
}
strcpy(name_inf,name_bin);
name_inf[name_bin_length-3]='\0';
strcat(name_inf,"inf");
Dictionary* d=new_Dictionary(&vec,name_bin,name_inf);
if (d==NULL) {
    error("Cannot load dictionary %s\n",name_bin);
    free_string_hash(forbiddenWords);
    free_alphabet(alph);
    return DEFAULT_ERROR_CODE;
}

/* The words that remain unknown will be written into "<word list>.tmp" */
char tmp[FILENAME_MAX];
strcpy(tmp,argv[options.vars()->optind]);
strcat(tmp,".tmp");

U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[options.vars()->optind]);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex/GramLab IDE interface, if no dictionary was applied
   // so that there is no "err" file
   return SUCCESS_RETURN_CODE;
}

U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* res=u_fopen(&vec,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   /* Each resource must be released exactly once: 'words' must not
    * be closed twice */
   u_fclose(new_unknown_words);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* debug=NULL;
if ((*info)!='\0') {
   debug=u_fopen(&vec,info,U_WRITE);
   if (debug==NULL) {
      /* This is not fatal: we go on without any debug file */
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
  case DUTCH:
    analyse_dutch_unknown_words(alph,
                                d,
                                words,
                                res,
                                debug,
                                new_unknown_words,
                                forbiddenWords);
    break;
  case GERMAN:
    analyse_german_compounds(alph,
                             d,
                             words,
                             res,
                             debug,
                             new_unknown_words);
    break;
  case NORWEGIAN:
    analyse_norwegian_unknown_words(alph,
                                    d,
                                    words,
                                    res,
                                    debug,
                                    new_unknown_words,
                                    forbiddenWords);
    break;
  case RUSSIAN:
     init_russian(&UTAG);
     analyse_compounds(alph,
                       d,
                       words,
                       res,
                       debug,
                       new_unknown_words,
                       UTAG);
     break;
}

free_alphabet(alph);
free_Dictionary(d);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
/* The word list file is replaced by the list of the words that are
 * still unknown */
af_remove(argv[options.vars()->optind]);
af_rename(tmp,argv[options.vars()->optind]);
u_fclose(res);

if (debug!=NULL) {
   u_fclose(debug);
}

return SUCCESS_RETURN_CODE;
}
Beispiel #17
0
//
// this function builds the normalization grammar adapted to the match list
// passed in parameter
//
void build_portuguese_normalization_grammar(const Alphabet* alph,struct match_list* list,const unsigned char* root_bin,
                                            const struct INF_codes* root_inf,const unsigned char* inflected_bin,
                                            const struct INF_codes* inflected_inf,const char* res_grf_name,
                                            Encoding encoding_output, int bom_output,
                                            struct normalization_tree* norm_tree,
                                            struct normalization_tree* nasal_norm_tree) {
DISCARD_UNUSED_PARAMETER(nasal_norm_tree)
struct match_list* L=list;
int N=0;
unichar temp[2000];
unichar prefix[2000];
struct string_hash* hash=new_string_hash();
while (L!=NULL) {
   if (L->output!=NULL) {
      // first, we normalize the sequences by removing all spaces
      u_strcpy_without_space(temp,L->output);
      u_strcpy(L->output,temp);
      // then we check if this sequence has already been processed
      int J=get_value_index(L->output,hash,DONT_INSERT);
      if (J!=-1) {
         // if the sequence has already been analyzed, we do nothing
      }
      else {
         get_value_index(L->output,hash);
         get_bracket_prefix(L->output,prefix);
         if (!u_strcmp(prefix,"FuturConditional")) {
            N=N+replace_match_output_by_normalization_line(L,alph,root_bin,root_inf,inflected_bin,