/**
 * Releases an Elag .fst2 input handle.
 *
 * The handle's name buffer is freed, its underlying file is closed, its
 * symbol table is destroyed (each stored value being released through
 * free_symbols), its renumbering array is freed, and finally the handle
 * itself is freed. Passing NULL is allowed and does nothing.
 */
void fst_file_close_in(Elag_fst_file_in* fstf) {
   if (fstf!=NULL) {
      if (fstf->name) {
         free(fstf->name);
      }
      u_fclose(fstf->f);
      free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);
      if (fstf->renumber) {
         free(fstf->renumber);
      }
      free(fstf);
   }
}
/**
 * Opens the .fst2 file 'fname' for reading and builds the Elag input handle
 * describing it.
 *
 * The first line of the file must start with a digit; it is parsed as the
 * number of automata. The handle is tagged as FST_GRAMMAR, bound to
 * 'language', and its symbol table is filled by load_elag_fst2_tags().
 *
 * Returns the new handle, or NULL (after printing an error message and
 * releasing everything acquired so far) if the file cannot be opened, is
 * empty, does not start with a digit, or if its symbols cannot be loaded.
 * A memory allocation failure is reported through fatal_alloc_error().
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
   Elag_fst_file_in* handle=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
   if (handle==NULL) {
      fatal_alloc_error("load_elag_fst2_file");
   }
   handle->name=strdup(fname);
   if (handle->name==NULL) {
      fatal_alloc_error("load_elag_fst2_file");
   }

   /* These flags record how far the construction went, so that the failure
    * path below undoes exactly what was done. */
   int file_opened=0;
   int symbols_created=0;
   int loaded=0;
   unichar header[MAXBUF];

   handle->f=u_fopen(vec,fname,U_READ);
   if (handle->f==NULL) {
      error("load_fst_file: unable to open '%s' for reading\n",fname);
   } else {
      file_opened=1;
      if (u_fgets(header,MAXBUF,handle->f)==EOF) {
         error("load_fst_file: '%s' is empty\n",fname);
      } else if (!u_is_digit(*header)) {
         error("load_fst_file: %s: bad file format\n",fname);
      } else {
         handle->nb_automata=u_parse_int(header);
         handle->language=language;
         handle->type=FST_GRAMMAR;
         /* Position reached right after the header line */
         handle->pos0=(int)ftell(handle->f);
         handle->symbols=new_string_hash_ptr(64);
         symbols_created=1;
         handle->renumber=NULL;
         if (load_elag_fst2_tags(handle)==-1) {
            error("load_fst_file: %s: cannot load symbols\n",handle->name);
         } else {
            handle->pos=0;
            loaded=1;
         }
      }
   }

   if (loaded) {
      return handle;
   }

   /* Failure: release resources in reverse order of acquisition */
   if (symbols_created) {
      free_string_hash_ptr(handle->symbols,(void(*)(void*))free_symbols);
   }
   if (file_opened) {
      u_fclose(handle->f);
   }
   free(handle->name);
   free(handle);
   return NULL;
}
/** * Computes training by extracting statistics from a tagged corpus file. */ void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){ /* these two hash tables are respectively for simple and compound entries */ struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL; if(rforms_file != NULL){ rforms_table = new_string_hash_ptr(200000); } if(iforms_file != NULL){ iforms_table = new_string_hash_ptr(200000); } /* we initialize a contextual matrix */ struct corpus_entry** context = new_context_matrix(); initialize_context_matrix(context); unichar line[MAX_TAGGED_CORPUS_LINE]; /* check the format of the corpus */ long previous_file_position = ftell(input_text); if(u_fgets(line,input_text) == EOF){ fatal_error("File is empty"); } fseek(input_text,previous_file_position,SEEK_SET); int format_corpus = check_corpus_entry(line); if(format_corpus == 0){ // the corpus is in the Tagger format, one word per line where line=word/tag while(u_fgets(line,input_text) !=EOF){ if(u_strlen(line) == 0){ initialize_context_matrix(context); } else{ corpus_entry* entry = new_corpus_entry(line); if(u_strchr(line,'_')!=NULL && line[0]!='_'){ corpus_entry** entries = extract_simple_words(entry); free_corpus_entry(entry); for(int i=0;entries[i]!=NULL;i++){ push_corpus_entry(entries[i],context); add_statistics(context,rforms_table,iforms_table); } free(entries); } else { push_corpus_entry(entry,context); add_statistics(context,rforms_table,iforms_table); } } } } else { // the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag} unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE)); int current_len,len; unsigned int i; while(u_fgets(line,input_text) != EOF){ current_len = 0, len = 0; /* extract each token of the sentence */ for (;;) { len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}')); tmp = u_strcpy_sized(s,len-1,line+current_len+1); u_strcat(tmp,"\0"); if(u_strcmp(s,"S") == 0) 
break; //particular case: '\},\}.PONCT' if(line[current_len+2] == '}'){ int start = current_len+3; do{ tmp = u_strchr(line+start,'}'); start += 1+u_strlen(line+start)-u_strlen(tmp); } while(*(tmp+1) != ' '); tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1); u_strcat(tmp,"\0"); len += start-current_len-3; } /* format the {XX.YY} into standard tagger format, XX/YY */ unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096)); if(u_strchr(s,',')[1] == ','){ u_strcpy(newline,","); } else u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s); u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1); for(i=0;i<u_strlen(newline);i++){ if(newline[i] == ' ') newline[i] = '_'; } //create corpus entry corpus_entry* entry = new_corpus_entry(newline); if(u_strchr(newline,'_') != NULL && newline[0] != '_'){ corpus_entry** entries = extract_simple_words(entry); free_corpus_entry(entry); for(int j=0;entries[j]!=NULL;j++){ push_corpus_entry(entries[j],context); add_statistics(context,rforms_table,iforms_table); } free(entries); } else { push_corpus_entry(entry,context); add_statistics(context,rforms_table,iforms_table); } free(newline); current_len += len+1; } initialize_context_matrix(context); } free(s); } free_context_matrix(context); /* we fill dictionary files with pairs (tuple,value) and then * we add a special line "CODE\tFEATURES,.value" in order to * specify whether the dictionary contains inflected or raw form tuples*/ unichar* str = u_strdup(""); if(rforms_table != NULL){ write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file); u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0); free_string_hash_ptr(rforms_table,NULL); } if(iforms_table != NULL){ write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file); u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1); free_string_hash_ptr(iforms_table,NULL); } free(str); }
/** * The same than main, but no call to setBufferMode. */ int main_KeyWords(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; char tokens[FILENAME_MAX]; char output[FILENAME_MAX]=""; char alph[FILENAME_MAX]=""; char cdic[FILENAME_MAX]=""; unichar* code=u_strdup("XXX"); int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) { switch(val) { case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output\n"); free(code); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(alph,options.vars()->optarg); break; case 'f': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty forbidden code\n"); free(code); return USAGE_ERROR_CODE; } free(code); code=u_strdup(options.vars()->optarg); break; case 'c': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(cdic,options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free(code); return SUCCESS_RETURN_CODE; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case ':': index==-1 ? 
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_KeyWords[index].name); free(code); return USAGE_ERROR_CODE; break; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free(code); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind==argc || options.vars()->optind==argc-1) { error("Invalid arguments: rerun with --help\n"); free(code); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free(code); return SUCCESS_RETURN_CODE; } Alphabet* alphabet=NULL; if (alph[0]!='\0') { alphabet=load_alphabet(&vec,alph); if (alphabet==NULL) { error("Cannot load alphabet file %s\n",alph); free(code); return DEFAULT_ERROR_CODE; } } strcpy(tokens,argv[(options.vars()->optind++)]); if (output[0]=='\0') { get_path(tokens,output); strcat(output,"keywords.txt"); } struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec); filter_non_letter_keywords(keywords,alphabet); if (cdic[0]!='\0') { load_compound_words(cdic,&vec,keywords); } for (;options.vars()->optind!=argc;(options.vars()->optind)++) { filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet); } merge_case_equivalent_unknown_words(keywords,alphabet); struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code); remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas); free_string_hash(forbidden_lemmas); vector_ptr* sorted=sort_keywords(keywords); U_FILE* f_output=u_fopen(&vec,output,U_WRITE); if (f_output==NULL) { error("Cannot write in file %s\n",output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return DEFAULT_ERROR_CODE; } dump_keywords(sorted,f_output); u_fclose(f_output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); 
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return SUCCESS_RETURN_CODE; }