/**
 * Loads the initial keyword list from a tok_by_freq.txt file,
 * and turns all those tokens in a list whose primary key is the
 * lower case token:
 * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50)
 */
struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return NULL;
Ustring* line=new_Ustring(128);
Ustring* lower=new_Ustring(128);
struct string_hash_ptr* res=new_string_hash_ptr(1024);
int val,pos;
/* We skip the first line of the file, containing the number
 * of tokens
 */
if (EOF==readline(line,f)) {
	fatal_error("Invalid empty file %s\n",name);
}
while (EOF!=readline(line,f)) {
	if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) {
		fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
	}
	u_strcpy(lower,line->str+pos);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)res->value[index];
	res->value[index]=new_KeyWord(val,line->str+pos,value);
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
return res;
}
/**
 * Opens the .fst2 file 'fname' and prepares an Elag_fst_file_in
 * descriptor for it: the first line is parsed as the number of
 * automata, the current file position is saved in 'pos0', and the
 * symbols are loaded with load_elag_fst2_tags.
 *
 * @param vec      encoding configuration handed to u_fopen
 * @param fname    path of the .fst2 file; a copy is kept in the result
 * @param language language description stored in the result
 * @return the descriptor, or NULL (after reporting the problem with
 *         error()) if the file cannot be opened, is empty, does not
 *         start with a digit, or its symbols cannot be loaded.
 *         Allocation failures stop the program via fatal_alloc_error.
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
Elag_fst_file_in* grammar=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
if (NULL==grammar) {
   fatal_alloc_error("load_elag_fst2_file");
}
grammar->name=strdup(fname);
if (NULL==grammar->name) {
   fatal_alloc_error("load_elag_fst2_file");
}
grammar->f=u_fopen(vec,fname,U_READ);
if (NULL==grammar->f) {
   error("load_fst_file: unable to open '%s' for reading\n",fname);
   free(grammar->name);
   free(grammar);
   return NULL;
}
/* The header line must exist and must start with a digit */
unichar header[MAXBUF];
int header_is_valid=1;
if (u_fgets(header,MAXBUF,grammar->f)==EOF) {
   error("load_fst_file: '%s' is empty\n",fname);
   header_is_valid=0;
} else if (!u_is_digit(*header)) {
   error("load_fst_file: %s: bad file format\n",fname);
   header_is_valid=0;
}
if (!header_is_valid) {
   u_fclose(grammar->f);
   free(grammar->name);
   free(grammar);
   return NULL;
}
grammar->nb_automata=u_parse_int(header);
grammar->language=language;
grammar->type=FST_GRAMMAR;
/* Position right after the header line */
grammar->pos0=(int)ftell(grammar->f);
grammar->symbols=new_string_hash_ptr(64);
grammar->renumber=NULL;
if (load_elag_fst2_tags(grammar)!=-1) {
   grammar->pos=0;
   return grammar;
}
/* Symbol loading failed: release everything acquired so far, most
 * recent first */
error("load_fst_file: %s: cannot load symbols\n",grammar->name);
free_string_hash_ptr(grammar->symbols,(void(*)(void*))free_symbols);
u_fclose(grammar->f);
free(grammar->name);
free(grammar);
return NULL;
}
/**
 * Computes training by extracting statistics from a tagged corpus file.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
	rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
	iforms_table = new_string_hash_ptr(200000);
}


/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);


unichar line[MAX_TAGGED_CORPUS_LINE];

/* check the format of the corpus */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
	fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);

int format_corpus = check_corpus_entry(line);

if(format_corpus == 0){
	// the corpus is in the Tagger format, one word per line where line=word/tag
	while(u_fgets(line,input_text) !=EOF){
		if(u_strlen(line) == 0){
			initialize_context_matrix(context);
		}
		else{
			corpus_entry* entry = new_corpus_entry(line);
			if(u_strchr(line,'_')!=NULL && line[0]!='_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int i=0;entries[i]!=NULL;i++){
					push_corpus_entry(entries[i],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}
		}
	}
}
else {
	// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
	unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
	int current_len,len;
	unsigned int i;
	while(u_fgets(line,input_text) != EOF){
		current_len = 0, len = 0;
		/* extract each token of the sentence */
		for (;;) {
			len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
			tmp = u_strcpy_sized(s,len-1,line+current_len+1);
			u_strcat(tmp,"\0");
			if(u_strcmp(s,"S") == 0)
				break;

			//particular case: '\},\}.PONCT'
			if(line[current_len+2] == '}'){
				int start = current_len+3;
				do{
					tmp = u_strchr(line+start,'}');
					start += 1+u_strlen(line+start)-u_strlen(tmp);
				}
				while(*(tmp+1) != ' ');
				tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
				u_strcat(tmp,"\0");
				len += start-current_len-3;
			}

			/* format the {XX.YY} into standard tagger format, XX/YY */
			unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
			if(u_strchr(s,',')[1] == ','){
				u_strcpy(newline,",");
			}
			else
				u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
			u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
			for(i=0;i<u_strlen(newline);i++){
				if(newline[i] == ' ')
					newline[i] = '_';
			}

			//create corpus entry
			corpus_entry* entry = new_corpus_entry(newline);
			if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int j=0;entries[j]!=NULL;j++){
					push_corpus_entry(entries[j],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}

			free(newline);
			current_len += len+1;
		}
		initialize_context_matrix(context);
	}
	free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
	write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
	u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
	free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
	write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
	u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
	free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
/* Example #4 */
/* 0 */
/**
 * Convenience overload: creates a string_hash_ptr with
 * DEFAULT_CAPACITY as its initial capacity.
 *
 * A string_hash_ptr always associates a value with each key; when no
 * values are needed, the plain string_hash should be used instead.
 * The 'value' array will be enlarged if needed.
 */
struct string_hash_ptr* new_string_hash_ptr() {
struct string_hash_ptr* table=new_string_hash_ptr(DEFAULT_CAPACITY);
return table;
}