/**
 * Releases every resource held by the given input .fst2 descriptor:
 * its name, its underlying file, its symbol table, its 'renumber'
 * field and finally the structure itself.
 * Calling it with NULL is a no-op.
 */
void fst_file_close_in(Elag_fst_file_in* fstf) {
if (fstf==NULL) {
   return;
}
/* free() ignores NULL, so the optional fields need no explicit guard */
free(fstf->name);
u_fclose(fstf->f);
free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);
free(fstf->renumber);
free(fstf);
}
/**
 * Opens the .fst2 grammar file 'fname' and builds the Elag_fst_file_in
 * structure describing it, attached to the given language description.
 *
 * The first line of the file must start with a digit; it is parsed as
 * the number of automata. The file position reached after that line is
 * stored in 'pos0', then the symbols are loaded with load_elag_fst2_tags.
 *
 * Returns the new structure, or NULL (after printing an error message
 * and releasing everything acquired so far) if the file cannot be opened,
 * is empty, does not start with a digit, or if its symbols cannot be
 * loaded. Allocation failures are reported through fatal_alloc_error.
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
unichar header[MAXBUF];
Elag_fst_file_in* fstf=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
if (fstf==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
fstf->name=strdup(fname);
if (fstf->name==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
fstf->f=u_fopen(vec,fname,U_READ);
if (fstf->f==NULL) {
   error("load_fst_file: unable to open '%s' for reading\n",fname);
   goto release_struct;
}
/* The header line holds the number of automata */
if (u_fgets(header,MAXBUF,fstf->f)==EOF) {
   error("load_fst_file: '%s' is empty\n",fname);
   goto close_file;
}
if (!u_is_digit(header[0])) {
   error("load_fst_file: %s: bad file format\n",fname);
   goto close_file;
}
fstf->nb_automata=u_parse_int(header);
fstf->language=language;
fstf->type=FST_GRAMMAR;
/* Remember where the data following the header line starts */
fstf->pos0=(int)ftell(fstf->f);
fstf->symbols=new_string_hash_ptr(64);
fstf->renumber=NULL;
if (load_elag_fst2_tags(fstf)==-1) {
   error("load_fst_file: %s: cannot load symbols\n",fstf->name);
   goto release_symbols;
}
fstf->pos=0;
return fstf;

/* Error exits: each label undoes one acquisition and falls through
 * to the ones made before it */
release_symbols:
free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);

close_file:
u_fclose(fstf->f);

release_struct:
free(fstf->name);
free(fstf);
return NULL;
}
/**
 * Computes training by extracting statistics from a tagged corpus file.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
	rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
	iforms_table = new_string_hash_ptr(200000);
}


/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);


unichar line[MAX_TAGGED_CORPUS_LINE];

/* check the format of the corpus */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
	fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);

int format_corpus = check_corpus_entry(line);

if(format_corpus == 0){
	// the corpus is in the Tagger format, one word per line where line=word/tag
	while(u_fgets(line,input_text) !=EOF){
		if(u_strlen(line) == 0){
			initialize_context_matrix(context);
		}
		else{
			corpus_entry* entry = new_corpus_entry(line);
			if(u_strchr(line,'_')!=NULL && line[0]!='_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int i=0;entries[i]!=NULL;i++){
					push_corpus_entry(entries[i],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}
		}
	}
}
else {
	// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
	unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
	int current_len,len;
	unsigned int i;
	while(u_fgets(line,input_text) != EOF){
		current_len = 0, len = 0;
		/* extract each token of the sentence */
		for (;;) {
			len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
			tmp = u_strcpy_sized(s,len-1,line+current_len+1);
			u_strcat(tmp,"\0");
			if(u_strcmp(s,"S") == 0)
				break;

			//particular case: '\},\}.PONCT'
			if(line[current_len+2] == '}'){
				int start = current_len+3;
				do{
					tmp = u_strchr(line+start,'}');
					start += 1+u_strlen(line+start)-u_strlen(tmp);
				}
				while(*(tmp+1) != ' ');
				tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
				u_strcat(tmp,"\0");
				len += start-current_len-3;
			}

			/* format the {XX.YY} into standard tagger format, XX/YY */
			unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
			if(u_strchr(s,',')[1] == ','){
				u_strcpy(newline,",");
			}
			else
				u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
			u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
			for(i=0;i<u_strlen(newline);i++){
				if(newline[i] == ' ')
					newline[i] = '_';
			}

			//create corpus entry
			corpus_entry* entry = new_corpus_entry(newline);
			if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int j=0;entries[j]!=NULL;j++){
					push_corpus_entry(entries[j],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}

			free(newline);
			current_len += len+1;
		}
		initialize_context_matrix(context);
	}
	free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
	write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
	u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
	free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
	write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
	u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
	free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
Example #4
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_KeyWords(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;

char tokens[FILENAME_MAX];
char output[FILENAME_MAX]="";
char alph[FILENAME_MAX]="";
char cdic[FILENAME_MAX]="";
unichar* code=u_strdup("XXX");
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(alph,options.vars()->optarg);
             break;
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty forbidden code\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             free(code);
             code=u_strdup(options.vars()->optarg);
             break;
   case 'c': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(cdic,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(code);
             return SUCCESS_RETURN_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_KeyWords[index].name);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind==argc || options.vars()->optind==argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(code);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free(code);
  return SUCCESS_RETURN_CODE;
}

Alphabet* alphabet=NULL;
if (alph[0]!='\0') {
  alphabet=load_alphabet(&vec,alph);
  if (alphabet==NULL) {
    error("Cannot load alphabet file %s\n",alph);
    free(code);
    return DEFAULT_ERROR_CODE;    
  }
}

strcpy(tokens,argv[(options.vars()->optind++)]);
if (output[0]=='\0') {
	get_path(tokens,output);
	strcat(output,"keywords.txt");
}
struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec);
filter_non_letter_keywords(keywords,alphabet);
if (cdic[0]!='\0') {
	load_compound_words(cdic,&vec,keywords);
}

for (;options.vars()->optind!=argc;(options.vars()->optind)++) {
	filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet);
}
merge_case_equivalent_unknown_words(keywords,alphabet);
struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code);
remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas);
free_string_hash(forbidden_lemmas);
vector_ptr* sorted=sort_keywords(keywords);
U_FILE* f_output=u_fopen(&vec,output,U_WRITE);

if (f_output==NULL) {
	error("Cannot write in file %s\n",output);
  free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
  free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
  free_alphabet(alphabet);
  free(code);
  return DEFAULT_ERROR_CODE;  
}

dump_keywords(sorted,f_output);

u_fclose(f_output);
free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
free_alphabet(alphabet);
free(code);

return SUCCESS_RETURN_CODE;
}