/** * \brief Reads a 'concord.ind' file and returns a fifo list of all matches found and their replacement * * \param[in] concord_file_name the name of the concord.ind file * * \return a fifo list of all the matches found with their replacement sentences. Each element is * stored in a locate_pos structure */ struct fifo *read_concord_file(const char *concord_file_name,int mask_encoding_compatibility_input){ unichar line[4096]; struct fifo *f = new_fifo(); U_FILE *concord_desc_file; concord_desc_file = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, concord_file_name,U_READ); if( concord_desc_file == NULL){ perror("u_fopen\n"); fprintf(stderr,"Cannot open file %s\n",concord_file_name); exit(1); } if(u_fgets(line,4096,concord_desc_file)==EOF){ fatal_error("Malformed concordance file %s",concord_file_name); } while(u_fgets(line,4096,concord_desc_file)!=EOF){ // we don't want the end of line char line[u_strlen(line)-1]='\0'; locate_pos *l = read_concord_line(line); put_ptr(f,l); } u_fclose(concord_desc_file); return f; }
/**
 * This function loads the given tagset file and returns the corresponding tagset_t
 * structure.
 */
tagset_t* load_tagset(U_FILE* f) {
unichar current_line[MAXBUF];
token_t* tokens=NULL;
/* First, we skip empty lines until the language declaration is found */
do {
   if (u_fgets(current_line,MAXBUF,f)==EOF) {
      error("Tagset definition file is empty\n");
      return NULL;
   }
   line_cleanup(current_line);
   tokens=tokenize(current_line);
} while (tokens==NULL);
if (tokens->type!=TOK_NAME || tokens->next==NULL || tokens->next->str==NULL) {
   fatal_error("Tagset language needs a name\n");
}
tagset_t* result=new_tagset_t(tokens->next->str);
/* Then we read every POS section, chaining each one at the head of the list */
int count=0;
for (pos_section_t* section=parse_pos_section(f);section!=NULL;section=parse_pos_section(f)) {
   section->next=result->pos_sections;
   result->pos_sections=section;
   count++;
}
free_token_t(tokens);
u_printf("%d POS definitions loaded.\n",count);
return result;
}
int main_Reg2Grf(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Reg2Grf,lopts_Reg2Grf,&index,vars))) { switch(val) { case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Reg2Grf[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (f==NULL) { fatal_error("Cannot open file %s\n",argv[vars->optind]); } /* We read the regular expression in the file */ unichar exp[REG_EXP_MAX_LENGTH]; if ((REG_EXP_MAX_LENGTH-1)==u_fgets(exp,REG_EXP_MAX_LENGTH,f)) { fatal_error("Too long regular expression\n"); } u_fclose(f); char grf_name[FILENAME_MAX]; get_path(argv[vars->optind],grf_name); strcat(grf_name,"regexp.grf"); if (!reg2grf(exp,grf_name,encoding_output,bom_output)) { return 1; } free_OptVars(vars); u_printf("Expression converted.\n"); return 0; }
/**
 * Loads the tags of the given .fst2 file. Returns 0 in case of success; -1 otherwise.
 * Note that the position in the file is unchanged after a call to this function,
 * including on error paths.
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
/* We backup the position in the file, and we come back at the
 * beginning of the file */
long fpos=ftell(fst->f);
rewind(fst->f);
/* Now, we go to the tags section, skipping all the automata */
unichar buf[MAXBUF];
int i=0;
int len;
while (i<fst->nb_automata) {
   if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
      error("load_fst_tags: %s: unexpected EOF\n",fst->name);
      /* Fix: restore the caller's file position, as the contract promises */
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* A line starting with 'f' followed by a space marks the end of an automaton */
   if (buf[0]=='f' && isspace(buf[1])) {
      i++;
   }
   /* If we have read the beginning of a long line, we skip the rest of the line */
   while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
      len=u_fgets(buf,MAXBUF,fst->f);
   }
}
Ustring* ustr=new_Ustring(64);
while (readline(ustr,fst->f) && ustr->str[0]!='f') {
   if (ustr->str[0]!='%' && ustr->str[0]!='@') {
      error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
      /* Fix: 'ustr' used to leak on this error path, and the file
       * position was not restored */
      free_Ustring(ustr);
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* +1 because we ignore the % or @ at the beginning of the line */
   symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
   /* If 'symbol' is NULL, then an error message has already
    * been printed. Moreover, we want to associate NULL to the
    * string, so that we don't exit the function. Whatever it is,
    * we add the symbol to the symbols of the .fst2 */
   get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
}
if (*ustr->str==0) {
   /* An empty buffer here means we hit EOF before the final 'f' line */
   fatal_error("load_fst_tags: unexpected EOF\n");
}
free_Ustring(ustr);
/* We set back the position in the file */
fseek(fst->f,fpos,SEEK_SET);
return 0;
}
/**
 * This function sets the position in the given .fst2 immediately
 * before the nth automaton. For instance, if we have n=2, the
 * file position will be set at the beginning of the line "-2 .....".
 */
void fst_file_seek(Elag_fst_file_in* fstin,int n) {
if (n<=0 || n>fstin->nb_automata) {
   fatal_error("fst_file_seek(%d): automaton number should be in [1;%d]\n",n,fstin->nb_automata);
}
/* If the target automaton is behind the current position, rewind to
 * the position of the first automaton */
if (n<fstin->pos) {
   fseek(fstin->f,fstin->pos0,SEEK_SET);
   fstin->pos=0;
}
unichar buffer[MAXBUF];
int count;
/* Skip whole automata, one 'f' terminator line at a time, until we
 * stand just before the requested one */
while (fstin->pos<n-1) {
   count=u_fgets(buffer,MAXBUF,fstin->f);
   if (count==EOF) {
      fatal_error("fst_file_seek: %s: unexpected EOF\n",fstin->name);
   }
   if (buffer[0]=='f' && isspace(buffer[1])) {
      fstin->pos++;
   }
   /* If the line did not fit into the buffer, consume its remainder */
   while (count==MAXBUF-1 && buffer[count-1]!='\n') {
      count=u_fgets(buffer,MAXBUF,fstin->f);
   }
}
}
/**
 * Loads a .fst2 file with the given name and type, according to the
 * given language description.
 *
 * Returns a freshly allocated Elag_fst_file_in*, or NULL on failure
 * (after printing an error). The caller owns the returned structure.
 *
 * Cleanup uses a goto chain: each label undoes one acquisition, and the
 * labels must stay in this exact reverse-acquisition order.
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
Elag_fst_file_in* fstf=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
if (fstf==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
fstf->name=strdup(fname);
if (fstf->name==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
if ((fstf->f=u_fopen(vec,fname,U_READ))==NULL) {
   error("load_fst_file: unable to open '%s' for reading\n",fname);
   goto error_fstf;
}
unichar buf[MAXBUF];
/* The first line of a .fst2 holds the number of automata */
if (u_fgets(buf,MAXBUF,fstf->f)==EOF) {
   error("load_fst_file: '%s' is empty\n",fname);
   goto error_f;
}
if (!u_is_digit(*buf)) {
   error("load_fst_file: %s: bad file format\n",fname);
   goto error_f;
}
fstf->nb_automata=u_parse_int(buf);
fstf->language=language;
fstf->type=FST_GRAMMAR;
/* pos0 = file offset of the first automaton, used later for seeking */
fstf->pos0=(int)ftell(fstf->f);
fstf->symbols=new_string_hash_ptr(64);
fstf->renumber=NULL;
if (load_elag_fst2_tags(fstf)==-1) {
   error("load_fst_file: %s: cannot load symbols\n",fstf->name);
   goto error_symbols;
}
/* pos = index of the automaton the read cursor currently precedes */
fstf->pos=0;
return fstf;
/* If an error occurs: unwind in reverse order of acquisition */
error_symbols: free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);
error_f: u_fclose(fstf->f);
error_fstf: free(fstf->name);
free(fstf);
return NULL;
}
// Utility functions - based on equivalents in liolib.c static int read_line(lua_State *L, UFILE *ufile) { luaL_Buffer b; luaL_buffinit(L, &b); for (;;) { size_t l; UChar* p = icu4lua_prepubuffer(&b); if (u_fgets(p, ICU4LUA_UBUFFERSIZE, ufile) == NULL) { icu4lua_pushuresult(&b, UFILE_UV_USTRING_META, UFILE_UV_USTRING_POOL); if (icu4lua_ustrlen(L,-1) == 0) { return 0; } return 1; } l = u_strlen(p); if (l == 0 || p[l-1] != '\n') { icu4lua_addusize(&b, l); } else { icu4lua_addusize(&b, l - 1); icu4lua_pushuresult(&b, UFILE_UV_USTRING_META, UFILE_UV_USTRING_POOL); return 1; } } }
/*
 * Data-driven test for u_sprintf_u()/u_fprintf_u(): every case of the
 * "icuio/printf" test resource supplies a format string, an argument (with a
 * one-character type tag) and the expected result. Each case is formatted
 * twice — once into a UChar buffer, once into a scratch file — and both
 * outputs and both returned lengths are checked against the expectation.
 */
U_CDECL_BEGIN
static void U_CALLCONV DataDrivenPrintf(void) {
#if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_FILE_IO
    UErrorCode errorCode;
    TestDataModule *dataModule;
    TestData *testData;
    const DataMap *testCase;
    DataDrivenLogger logger;
    UChar uBuffer[512];
    char cBuffer[512];
    char cFormat[sizeof(cBuffer)];
    char cExpected[sizeof(cBuffer)];
    UnicodeString tempStr;
    UChar format[512];
    UChar expectedResult[512];
    UChar argument[512];
    int32_t i;
    int8_t i8;
    int16_t i16;
    int32_t i32;
    int64_t i64;
    double dbl;
    int32_t uBufferLenReturned;
    const char *fileLocale = "en_US_POSIX";
    int32_t uFileBufferLenReturned;
    LocalUFILEPointer testFile;  /* smart pointer: closes the UFILE on adoptInstead()/destruction */

    errorCode=U_ZERO_ERROR;
    dataModule=TestDataModule::getTestDataModule("icuio", logger, errorCode);
    if(U_SUCCESS(errorCode)) {
        testData=dataModule->createTestData("printf", errorCode);
        if(U_SUCCESS(errorCode)) {
            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
                if(U_FAILURE(errorCode)) {
                    log_err("error retrieving icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
                /* (Re)create the scratch file for the u_fprintf_u() half of the test */
                testFile.adoptInstead(u_fopen(STANDARD_TEST_FILE, "w", fileLocale, "UTF-8"));
                if (testFile.isNull()) {
                    log_err("Can't open test file - %s\n", STANDARD_TEST_FILE);
                    continue;
                }
                /* Pre-fill the buffer with 0x2A ('*') so that writes past the
                 * returned length can be detected below */
                u_memset(uBuffer, 0x2A, UPRV_LENGTHOF(uBuffer));
                uBuffer[UPRV_LENGTHOF(uBuffer)-1] = 0;
                tempStr=testCase->getString("format", errorCode);
                tempStr.extract(format, UPRV_LENGTHOF(format), errorCode);
                tempStr=testCase->getString("result", errorCode);
                tempStr.extract(expectedResult, UPRV_LENGTHOF(expectedResult), errorCode);
                tempStr=testCase->getString("argument", errorCode);
                tempStr.extract(argument, UPRV_LENGTHOF(argument), errorCode);
                u_austrncpy(cBuffer, format, sizeof(cBuffer));
                if(U_FAILURE(errorCode)) {
                    log_err("error retrieving icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
                log_verbose("Test %d: format=\"%s\"\n", i, cBuffer);
                /* The first char of "argumentType" selects how the argument
                 * string is converted before being passed to the formatter */
                switch (testCase->getString("argumentType", errorCode)[0]) {
                case 0x64:  // 'd' double
                    dbl = atof(u_austrcpy(cBuffer, argument));
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, dbl);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, dbl);
                    break;
                case 0x31:  // '1' int8_t
                    i8 = (int8_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i8);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i8);
                    break;
                case 0x32:  // '2' int16_t
                    i16 = (int16_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i16);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i16);
                    break;
                case 0x34:  // '4' int32_t
                    i32 = (int32_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i32);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i32);
                    break;
                case 0x38:  // '8' int64_t
                    i64 = uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i64);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i64);
                    break;
                case 0x73:  // 's' char *
                    u_austrncpy(cBuffer, argument, sizeof(cBuffer));
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, cBuffer);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, cBuffer);
                    break;
                case 0x53:  // 'S' UChar *
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, argument);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, argument);
                    break;
                default:
                    uBufferLenReturned = 0;
                    uFileBufferLenReturned = 0;
                    log_err("Unknown type %c for test %d\n", testCase->getString("argumentType", errorCode)[0], i);
                }
                /* Check the in-memory result against the expected string */
                if (u_strcmp(uBuffer, expectedResult) != 0) {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    u_austrncpy(cFormat, format, sizeof(cFormat));
                    u_austrncpy(cExpected, expectedResult, sizeof(cExpected));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE string test case %d \"%s\" - Got: \"%s\" Expected: \"%s\"\n",
                            i, cFormat, cBuffer, cExpected);
                }
                /* Verify the returned length: the terminator must sit exactly at
                 * uBufferLenReturned, with the 0x2A fill pattern intact after it */
                if (uBufferLenReturned <= 0) {
                    log_err("FAILURE test case %d - \"%s\" is an empty string.\n",
                            i, cBuffer);
                } else if (uBuffer[uBufferLenReturned-1] == 0
                    || uBuffer[uBufferLenReturned] != 0
                    || uBuffer[uBufferLenReturned+1] != 0x2A
                    || uBuffer[uBufferLenReturned+2] != 0x2A)
                {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE test case %d - \"%s\" wrong amount of characters was written. Got %d.\n",
                            i, cBuffer, uBufferLenReturned);
                }
                /* Reopen for reading (adoptInstead closes the write handle, flushing it) */
                testFile.adoptInstead(u_fopen(STANDARD_TEST_FILE, "r", fileLocale, "UTF-8"));
                if (testFile.isNull()) {
                    log_err("Can't open test file - %s\n", STANDARD_TEST_FILE);
                }
                uBuffer[0]=0;
                u_fgets(uBuffer, UPRV_LENGTHOF(uBuffer), testFile.getAlias());
                /* Check the file-based result against the same expectation */
                if (u_strcmp(uBuffer, expectedResult) != 0) {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    u_austrncpy(cFormat, format, sizeof(cFormat));
                    u_austrncpy(cExpected, expectedResult, sizeof(cExpected));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE file test case %d \"%s\" - Got: \"%s\" Expected: \"%s\"\n",
                            i, cFormat, cBuffer, cExpected);
                }
                /* Both code paths must report the same length */
                if (uFileBufferLenReturned != uBufferLenReturned) {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE uFileBufferLenReturned(%d) != uBufferLenReturned(%d)\n",
                            uFileBufferLenReturned, uBufferLenReturned);
                }

                if(U_FAILURE(errorCode)) {
                    log_err("error running icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
            }
            delete testData;
        }
        delete dataModule;
    }
    else {
        log_data_err("Failed: could not load test icuio data\n");
    }
#endif
}
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with,
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as be matched by this pattern.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
struct string_hash* tokens=parameters->tokens;
U_FILE* f;
unichar line[DIC_LINE_SIZE];
f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
if (f==NULL) {
   /* A missing dictionary is not fatal: we just report it and return */
   error("Cannot open dictionary %s\n",dic_name);
   return;
}
/* We parse all the lines */
int lines=0;
char name[FILENAME_MAX];
remove_path(dic_name,name);
/* NOTE(review): this u_fgets overload takes no buffer size — presumably the
 * unbounded variant, so a line longer than DIC_LINE_SIZE would overflow
 * 'line'; TODO confirm against the Unicode I/O API */
while (EOF!=u_fgets(line,f)) {
   lines++;
   if (lines%10000==0) {
      /* Progress feedback; '\r' keeps the counter on one console line */
      u_printf("%s: %d lines loaded... \r",name,lines);
   }
   if (line[0]=='/') {
      /* NOTE: DLF and DLC files are not supposed to contain comment
       * lines, but we test them, just in the case */
      continue;
   }
   struct dela_entry* entry=tokenize_DELAF_line(line,1);
   if (entry==NULL) {
      /* This case should never happen */
      error("Invalid dictionary line in load_dic_for_locate\n");
      continue;
   }
   /* We add the inflected form to the list of forms associated to the lemma.
    * This will be used to replace patterns like "<be>" by the actual list of
    * forms that can be matched by it, for optimization reasons */
   add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
   /* We get the list of all tokens that can be matched by the inflected form of this
    * this entry, with regards to case variations (see the "extended" example above). */
   struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
   /* We save the list pointer to free it later */
   struct list_int* ptr_copy=ptr;
   /* Here, we will deal with all simple words */
   while (ptr!=NULL) {
      int i=ptr->n;
      /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
      parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         if (list!=NULL) {
            /* If we have some patterns to add */
            if (parameters->matching_patterns[i]==NULL) {
               /* We allocate the pattern bit array, if needed */
               parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
            }
            struct list_pointer* tmp=list;
            while (tmp!=NULL) {
               /* Then we add all the pattern numbers to the bit array */
               set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
               tmp=tmp->next;
            }
            /* Finally, we free the constraint list */
            free_list_pointer(list);
         }
      }
      ptr=ptr->next;
   }
   /* Finally, we free the token list */
   free_list_int(ptr_copy);
   if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
      /* If the inflected form is a compound word */
      if (is_DIC_pattern || is_CDIC_pattern) {
         /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
          * must note that all compound words can be matched by them */
         add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
      }
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         /* We look if the compound word can be matched by some patterns */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         struct list_pointer* tmp=list;
         while (tmp!=NULL) {
            /* If the word is matched by at least one pattern, we store it. */
            int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
            add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            tmp=tmp->next;
         }
         free_list_pointer(list);
      }
   }
   free_dela_entry(entry);
}
if (lines>10000) {
   /* Terminate the progress line only if it was actually printed */
   u_printf("\n");
}
u_fclose(f);
}
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// Reads 'DLC' entry by entry: simple-word entries go through SU_inflect(),
// compound-word entries through MU_inflect(); every generated inflected form
// is written to 'DLCF'.
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF,
            MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph,
            Encoding encoding_output, int bom_output,
            int mask_encoding_compatibility_input, int config_files_status,
            d_class_equiv_T* D_CLASS_EQUIV, int error_check_status,
            Korean* korean,const char* pkgdir) {
U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
int l; //length of the line scanned
DLC_entry_T* dlc_entry;
MU_forms_T MU_forms; //inflected forms of the MWU
int err;
//Open DELAS/DELAC
dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
if (!dlc) {
   return 1;
}
//Open DELAF/DELACF
dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
if (!dlcf) {
   //NOTE(review): 'dlc' is not closed on this error path
   error("Unable to open file: '%s' !\n", DLCF);
   return 1;
}
//Inflect one entry at a time
l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
//Omit the final newline
u_chomp_new_line(input_line);
int flag = 0;
//If a line is empty the file is not necessarily finished.
//If the last entry has no newline, we should not skip this entry
struct dela_entry* DELAS_entry;
int semitic;
int current_line=0;
while (l != EOF) {
   current_line++;
   DELAS_entry = is_strict_DELAS_line(input_line, alph);
   if (DELAS_entry != NULL) {
      /* If we have a strict DELAS line, that is to say, one with
       * a simple word */
      if (error_check_status==ONLY_COMPOUND_WORDS) {
         error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
         free_dela_entry(DELAS_entry);
         goto next_line;
      }
      SU_forms_T forms;
      SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
      char inflection_code[1024];
      unichar code_gramm[1024];
      /* We take the first grammatical code, and we extract from it the name
       * of the inflection transducer to use */
      get_inflection_code(DELAS_entry->semantic_codes[0],
                          inflection_code, code_gramm, &semitic);
      /* And we inflect the word */
      //err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
      err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma,
                       inflection_code, DELAS_entry->filters, &forms, semitic,
                       korean,pkgdir);
/* Build-time reminder left by the authors (French): "put all the entries on a single line" */
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
      /* Then, we print its inflected forms to the output */
      for (int i = 0; i < forms.no_forms; i++) {
         unichar foo[1024];
         if (korean!=NULL) {
            /* In Korean mode, syllables are converted to Jamo letters first */
            Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
         } else {
            u_strcpy(foo,forms.forms[i].form);
         }
         u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/,
                   DELAS_entry->lemma, code_gramm);
         /* We add the semantic codes, if any */
         for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
            u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
         }
         if (forms.forms[i].local_semantic_code != NULL) {
            u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
         }
         if (forms.forms[i].raw_features != NULL
               && forms.forms[i].raw_features[0] != '\0') {
            u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
         }
         u_fprintf(dlcf, "\n");
      }
      SU_delete_inflection(&forms);
      free_dela_entry(DELAS_entry);
      /* End of simple word case */
   } else {
      /* If we have not a simple word DELAS line, we try to analyse it
       * as a compound word DELAC line */
      if (error_check_status==ONLY_SIMPLE_WORDS) {
         error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
         goto next_line;
      }
      if (config_files_status != CONFIG_FILES_ERROR) {
         /* If this is a compound word, we process it if and only if the
          * configuration files have been correctly loaded */
         dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
         if (!dlc_entry) {
            fatal_alloc_error("inflect");
         }
         /* Convert a DELAC entry into the internal multi-word format */
         err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
         if (!err) {
            //Inflect the entry
            MU_init_forms(&MU_forms);
            err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,
                             mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
            if (!err) {
               int f; //index of the current inflected form
               //Inform the user if no form generated
               if (MU_forms.no_forms == 0) {
                  error("No inflected form could be generated for ");
                  DLC_print_entry(pL_MORPHO,dlc_entry);
               }
               //Print inflected forms
               for (f = 0; f < MU_forms.no_forms; f++) {
                  //Format the inflected form to the DELACF format
                  err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE - 1,
                                        MU_forms.forms[f], dlc_entry, D_CLASS_EQUIV);
                  if (!err) {
                     //Print one inflected form at a time to the DELACF file
                     u_fprintf(dlcf, "%S\n", output_line);
                  }
               }
            }
            MU_delete_inflection(&MU_forms);
            DLC_delete_entry(dlc_entry);
         }
      } else {
         /* We try to inflect a compound word whereas the "Morphology.txt" and/or
          * "Equivalences.txt" file(s) has/have not been loaded */
         if (!flag) {
            /* We use a flag to print the error message only once */
            error("WARNING: Compound words won't be inflected because configuration files\n");
            error(" have not been correctly loaded.\n");
            flag = 1;
         }
      }
   }
   next_line:
   //Get next entry
   l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
   if (l!=EOF) {
      //Omit the final newline
      u_chomp_new_line(input_line);
      if (input_line[0]=='\0') {
         /* If we find an empty line, then we go on */
         goto next_line;
      }
   }
}
u_fclose(dlc);
u_fclose(dlcf);
return 0;
}
/**
 * Computes training by extracting statistics from a tagged corpus file.
 *
 * Two corpus formats are supported (auto-detected from the first line):
 *  - Tagger format: one "word/tag" per line, empty line = sentence break;
 *  - Unitex tagged format: one sentence per line, tokens written "{word,lemma.tag}".
 * Statistics are accumulated into 'rforms_table' (raw forms) and/or
 * 'iforms_table' (inflected forms) and finally dumped to the corresponding
 * output files, each with a trailing "CODE\tFEATURES" marker line.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
   rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
   iforms_table = new_string_hash_ptr(200000);
}
/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);
unichar line[MAX_TAGGED_CORPUS_LINE];
/* check the format of the corpus: we read the first line, then seek back
 * so the main loop re-reads it.
 * NOTE(review): this u_fgets overload takes no buffer size — presumably
 * unbounded; a line longer than MAX_TAGGED_CORPUS_LINE would overflow
 * 'line'. TODO confirm. */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
   fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);
int format_corpus = check_corpus_entry(line);
if(format_corpus == 0){
   // the corpus is in the Tagger format, one word per line where line=word/tag
   while(u_fgets(line,input_text) !=EOF){
      if(u_strlen(line) == 0){
         /* empty line = sentence boundary: reset the context window */
         initialize_context_matrix(context);
      }
      else{
         corpus_entry* entry = new_corpus_entry(line);
         if(u_strchr(line,'_')!=NULL && line[0]!='_'){
            /* '_' joins the parts of a compound: also count each simple word */
            corpus_entry** entries = extract_simple_words(entry);
            free_corpus_entry(entry);
            for(int i=0;entries[i]!=NULL;i++){
               push_corpus_entry(entries[i],context);
               add_statistics(context,rforms_table,iforms_table);
            }
            free(entries);
         }
         else {
            push_corpus_entry(entry,context);
            add_statistics(context,rforms_table,iforms_table);
         }
      }
   }
}
else {
   // the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
   unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
   int current_len,len;
   unsigned int i;
   while(u_fgets(line,input_text) != EOF){
      current_len = 0, len = 0;
      /* extract each token of the sentence */
      for (;;) {
         /* length of the current "{...}" token, measured up to the next '}' */
         len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
         tmp = u_strcpy_sized(s,len-1,line+current_len+1);
         /* NOTE(review): appending "\0" is a no-op; presumably meant to
          * ensure termination after u_strcpy_sized */
         u_strcat(tmp,"\0");
         if(u_strcmp(s,"S") == 0)
            /* "{S}" = sentence delimiter: stop scanning this line */
            break;
         //particular case: '\},\}.PONCT'
         if(line[current_len+2] == '}'){
            /* the token itself contains '}', so search for the real closing
             * brace (the one followed by a space) */
            int start = current_len+3;
            do{
               tmp = u_strchr(line+start,'}');
               start += 1+u_strlen(line+start)-u_strlen(tmp);
            } while(*(tmp+1) != ' ');
            tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
            u_strcat(tmp,"\0");
            len += start-current_len-3;
         }
         /* format the {XX.YY} into standard tagger format, XX/YY */
         unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
         if(u_strchr(s,',')[1] == ','){
            /* the word itself is a comma: "{,,...}" */
            u_strcpy(newline,",");
         }
         else
            u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
         /* NOTE(review): u_sprintf reads 'newline' while writing into it —
          * depends on the implementation copying left-to-right; TODO confirm */
         u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
         /* spaces inside a token become '_' in tagger format */
         for(i=0;i<u_strlen(newline);i++){
            if(newline[i] == ' ')
               newline[i] = '_';
         }
         //create corpus entry
         corpus_entry* entry = new_corpus_entry(newline);
         if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
            corpus_entry** entries = extract_simple_words(entry);
            free_corpus_entry(entry);
            for(int j=0;entries[j]!=NULL;j++){
               push_corpus_entry(entries[j],context);
               add_statistics(context,rforms_table,iforms_table);
            }
            free(entries);
         }
         else {
            push_corpus_entry(entry,context);
            add_statistics(context,rforms_table,iforms_table);
         }
         free(newline);
         current_len += len+1;
      }
      /* end of sentence: reset the context window */
      initialize_context_matrix(context);
   }
   free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
   write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
   u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
   free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
   write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
   u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
   free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
/**
 * This function reads a POS section from the given tagset file and returns the
 * corresponding structure.
 *
 * Returns NULL when no further POS section exists (EOF reached while looking
 * for the "POS xxx" header).
 */
pos_section_t* parse_pos_section(U_FILE* f) {
unichar buf[MAXBUF];
unichar line[MAXBUF];
/* We look for a non empty line containing "POS xxx" */
token_t* toks=NULL;
while (toks==NULL) {
   if (u_fgets(line,MAXBUF,f)==EOF) {
      return NULL;
   }
   line_cleanup(line);
   /* tokenize() works on a copy: 'line' is preserved for error messages */
   u_strcpy(buf,line);
   toks=tokenize(buf);
}
if (toks->type!=TOK_POS) {
   fatal_error("Parsing error: 'POS' section expected (%S).\n",line);
}
if (toks->next==NULL || toks->next->str==NULL) {
   fatal_error("POS section needs a name\n");
}
pos_section_t* pos_section=new_pos_section_t(toks->next->str);
free_token_t(toks);
/* Then, we look for all the elements of the POS definition.
 * 'partid' tracks which part ("discr:", "flex:", "cat:", "complete:") the
 * following lines belong to; -1 means the closing keyword was seen.
 * NOTE(review): the >0 condition also stops on a zero-length read, not only
 * on EOF — presumably empty reads cannot occur after line_cleanup; confirm. */
int partid=PART_NUM;
while (partid!=-1 && u_fgets(line,MAXBUF,f)>0) {
   line_cleanup(line);
   u_strcpy(buf,line);
   toks=tokenize(buf);
   if (toks==NULL) {
      /* blank line: nothing to do */
      continue;
   }
   switch (toks->type) {
   /* Section-switching keywords: update 'partid' and drop the tokens */
   case TOK_IGNORE:
      pos_section->ignore=true;
      break;
   case TOK_DISCR:
      partid=PART_DISCR;
      free_token_t(toks);
      break;
   case TOK_FLEX:
      partid=PART_FLEX;
      free_token_t(toks);
      break;
   case TOK_CAT:
      partid=PART_CAT;
      free_token_t(toks);
      break;
   case TOK_COMPLET:
      partid=PART_COMP;
      free_token_t(toks);
      break;
   case TOK_END:
      partid=-1;
      free_token_t(toks);
      break;
   /* We add a tokenized line to the current POS section part */
   case TOK_STR:
   case TOK_ANGLE:
   case TOK_BLANK:
      switch (partid) {
      case PART_DISCR:
         if (pos_section->parts[PART_DISCR]!=NULL) {
            fatal_error("Only one discriminant category can be specified.\n");
         }
         /* no break: a discriminant line is then validated as a category
          * line — presumably an intentional fallthrough; TODO confirm */
      case PART_CAT:
         if (check_cat_line(toks)==-1) {
            fatal_error("Bad cat line format: '%S'\n", line);
         }
         break;
      case PART_FLEX:
         if (check_flex_line(toks)==-1) {
            fatal_error("Bad flex line format: '%S'\n", line);
         }
         break;
      case PART_COMP:
         if (check_complete_line(toks)==-1) {
            fatal_error("Bad complete line format: '%S'\n", line);
         }
         break;
      case PART_NUM:
         /* content line seen before any part keyword */
         fatal_error("No section specified. (line '%S')\n", line);
      default:
         fatal_error("While parsing POS section: what am i doing here?\n");
      }
      pos_section->parts[partid]=tokens_list_append(pos_section->parts[partid],toks);
      break;
   default:
      fatal_error("Error while parsing POS section with line '%S'\n", line);
   }
}
return pos_section;
}