Example #1
0
File: Cassys.cpp  Project: adri87/Q-A
/**
 * \brief Reads a 'concord.ind' file and returns a fifo list of all matches found and their replacement
 *
 * The first line of the file is a header and is skipped.
 *
 * \param[in] concord_file_name the name of the concord.ind file
 * \param[in] mask_encoding_compatibility_input mask of accepted input encodings
 *
 * \return a fifo list of all the matches found with their replacement sentences. Each element is
 * stored in a locate_pos structure
 */
struct fifo *read_concord_file(const char *concord_file_name,int mask_encoding_compatibility_input){
	unichar line[4096];

	struct fifo *f = new_fifo();

	U_FILE *concord_desc_file;
	concord_desc_file = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, concord_file_name,U_READ);
	if( concord_desc_file == NULL){
		perror("u_fopen\n");
		fprintf(stderr,"Cannot open file %s\n",concord_file_name);
		exit(1);
	}

	/* The first line of a concord.ind file is a header line: we skip it */
	if(u_fgets(line,4096,concord_desc_file)==EOF){
		fatal_error("Malformed concordance file %s",concord_file_name);
	}

	while(u_fgets(line,4096,concord_desc_file)!=EOF){

		/* We remove the end-of-line char, if any. The previous code always
		 * chopped the last character, which underflowed the buffer on an
		 * empty line and corrupted a final line without a trailing '\n' */
		int len=u_strlen(line);
		if (len>0 && line[len-1]=='\n') {
			line[len-1]='\0';
		}
		/* NOTE(review): read_concord_line's result is stored without a NULL
		 * check, as before — confirm it cannot return NULL on valid input */
		locate_pos *l = read_concord_line(line);
		put_ptr(f,l);

	}

	u_fclose(concord_desc_file);
	return f;
}
Example #2
0
File: Tagset.cpp  Project: adri87/Q-A
/**
 * Loads a tagset definition file and builds the corresponding tagset_t
 * structure. Returns NULL if the file contains no token at all (an error
 * message is printed in that case).
 */
tagset_t* load_tagset(U_FILE* f) {
unichar buf[MAXBUF];
/* First, we skip empty lines until we find the language declaration */
token_t* tokens=NULL;
do {
   if (EOF==(u_fgets(buf,MAXBUF,f))) {
      error("Tagset definition file is empty\n");
      return NULL;
   }
   line_cleanup(buf);
   tokens=tokenize(buf);
} while (tokens==NULL);
/* The first meaningful line must be "NAME xxx" */
if (tokens->type!=TOK_NAME || tokens->next==NULL || tokens->next->str==NULL) {
   fatal_error("Tagset language needs a name\n");
}
tagset_t* result=new_tagset_t(tokens->next->str);
/* Then we read every POS section, chaining each one at the head of the list */
int n_sections=0;
pos_section_t* section;
while ((section=parse_pos_section(f))!=NULL) {
   section->next=result->pos_sections;
   result->pos_sections=section;
   n_sections++;
}
free_token_t(tokens);
u_printf("%d POS definitions loaded.\n",n_sections);
return result;
}
Example #3
0
File: Reg2Grf.cpp  Project: adri87/Q-A
/**
 * Entry point of the Reg2Grf program: reads a regular expression from the
 * file given on the command line and converts it into a "regexp.grf" graph
 * written in the same directory. Returns 0 on success, 1 otherwise.
 */
int main_Reg2Grf(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;

int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Reg2Grf,lopts_Reg2Grf,&index,vars))) {
   switch(val) {
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Reg2Grf[index].name);
             break; /* defensive: fatal_error does not return */
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

/* Exactly one non-option argument (the expression file) is expected */
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (f==NULL) {
   fatal_error("Cannot open file %s\n",argv[vars->optind]);
}
/* We read the regular expression in the file */
unichar exp[REG_EXP_MAX_LENGTH];
if ((REG_EXP_MAX_LENGTH-1)==u_fgets(exp,REG_EXP_MAX_LENGTH,f)) {
   fatal_error("Too long regular expression\n");
}
u_fclose(f);
/* The output graph is written next to the input file */
char grf_name[FILENAME_MAX];
get_path(argv[vars->optind],grf_name);
strcat(grf_name,"regexp.grf");
if (!reg2grf(exp,grf_name,encoding_output,bom_output)) {
   /* The previous code leaked 'vars' on this error path */
   free_OptVars(vars);
   return 1;
}
free_OptVars(vars);
u_printf("Expression converted.\n");
return 0;
}
/**
 * Loads the tags of the given .fst2 file. Returns 0 in case of success; -1 otherwise.
 * Note that the position in the file is unchanged after a call to this function,
 * including on the error paths.
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
/* We backup the position in the file, and we come back at the
 * beginning of the file */
long fpos=ftell(fst->f);
rewind(fst->f);
/* Now, we go to the tags section, skipping all the automata */
unichar buf[MAXBUF];
int i=0;
int len;
while (i<fst->nb_automata) {
   if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
      error("load_fst_tags: %s: unexpected EOF\n",fst->name);
      /* We restore the file position, as promised by the contract above
       * (the previous code left it at EOF) */
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* A line starting with 'f' followed by a space marks the end of an automaton */
   if (buf[0]=='f' && isspace(buf[1])) {
      i++;
   }
   /* If we have read the beginning of a long line, we skip the rest of the line */
   while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
      len=u_fgets(buf,MAXBUF,fst->f);
   }
}
Ustring* ustr=new_Ustring(64);
while (readline(ustr,fst->f) && ustr->str[0]!='f') {
   if (ustr->str[0]!='%' && ustr->str[0]!='@') {
      error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
      /* The previous code leaked 'ustr' and left the file position moved */
      free_Ustring(ustr);
      fseek(fst->f,fpos,SEEK_SET);
      return -1;
   }
   /* +1 because we ignore the % or @ at the beginning of the line */
   symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
   /* If 'symbol' is NULL, then an error message has already
    * been printed. Moreover, we want to associate NULL to the
    * string, so that we don't exit the function. Whatever it is,
    * we add the symbol to the symbols of the .fst2 */
   get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
}
/* An empty line here means we hit EOF before the final 'f' line */
if (*ustr->str==0) {
   fatal_error("load_fst_tags: unexpected EOF\n");
}
free_Ustring(ustr);
/* We set back the position in the file */
fseek(fst->f,fpos,SEEK_SET);
return 0;
}
/**
 * Sets the position in the given .fst2 file immediately before the nth
 * automaton. For instance, with n=2, the file position is set at the
 * beginning of the line "-2 .....".
 */
void fst_file_seek(Elag_fst_file_in* fstin,int n) {
if (n<=0 || n>fstin->nb_automata) {
   fatal_error("fst_file_seek(%d): automaton number should be in [1;%d]\n",n,fstin->nb_automata);
}
/* If the target automaton is behind the current one, restart from the
 * position of the first automaton */
if (n<fstin->pos) {
   fseek(fstin->f,fstin->pos0,SEEK_SET);
   fstin->pos=0;
}
unichar line[MAXBUF];
int count;
for (;fstin->pos<n-1;) {
   count=u_fgets(line,MAXBUF,fstin->f);
   if (count==EOF) {
      fatal_error("fst_file_seek: %s: unexpected EOF\n",fstin->name);
   }
   /* A line starting with 'f' followed by a space ends an automaton */
   if (line[0]=='f' && isspace(line[1])) {
      fstin->pos++;
   }
   /* A full buffer with no newline means a long line: consume its tail */
   while (count==MAXBUF-1 && line[count-1]!='\n') {
      count=u_fgets(line,MAXBUF,fstin->f);
   }
}
}
/**
 * Loads a .fst2 file with the given name and type, according to the
 * given language description.
 * Returns the new Elag_fst_file_in*, or NULL if the file cannot be opened
 * or is malformed (an error message is printed in that case).
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
Elag_fst_file_in* fstf=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
if (fstf==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
fstf->name=strdup(fname);
if (fstf->name==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
if ((fstf->f=u_fopen(vec,fname,U_READ))==NULL) {
   error("load_fst_file: unable to open '%s' for reading\n",fname);
   goto error_fstf;
}
/* The first line of a .fst2 file contains the number of automata */
unichar buf[MAXBUF];
if (u_fgets(buf,MAXBUF,fstf->f)==EOF) {
   error("load_fst_file: '%s' is empty\n",fname);
   goto error_f;
}
if (!u_is_digit(*buf)) {
   error("load_fst_file: %s: bad file format\n",fname);
   goto error_f;
}
fstf->nb_automata=u_parse_int(buf);
fstf->language=language;
fstf->type=FST_GRAMMAR;
/* pos0 = file position right after the header line, i.e. the beginning
 * of the first automaton; fst_file_seek() rewinds to it */
fstf->pos0=(int)ftell(fstf->f);
fstf->symbols=new_string_hash_ptr(64);
fstf->renumber=NULL;
if (load_elag_fst2_tags(fstf)==-1) {
   error("load_fst_file: %s: cannot load symbols\n",fstf->name);
   goto error_symbols;
}
fstf->pos=0;
return fstf;
/* If an error occurs: the labels below release resources in reverse
 * acquisition order, each case deliberately falling through to the next */
error_symbols: free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);

error_f: u_fclose(fstf->f);

error_fstf: free(fstf->name);

free(fstf);
return NULL;
}
Example #7
0
// Utility functions - based on equivalents in liolib.c
// Reads one line from 'ufile' and pushes it (without the trailing newline)
// as a ustring. Returns 1 if something was read, 0 on EOF with no data.
static int read_line(lua_State *L, UFILE *ufile) {
	luaL_Buffer buffer;
	luaL_buffinit(L, &buffer);
	for (;;) {
		UChar* chunk = icu4lua_prepubuffer(&buffer);
		if (u_fgets(chunk, ICU4LUA_UBUFFERSIZE, ufile) == NULL) {
			/* EOF: push whatever was accumulated; an empty result
			 * means nothing at all was read */
			icu4lua_pushuresult(&buffer, UFILE_UV_USTRING_META, UFILE_UV_USTRING_POOL);
			return (icu4lua_ustrlen(L,-1) == 0) ? 0 : 1;
		}
		size_t chunk_len = u_strlen(chunk);
		if (chunk_len > 0 && chunk[chunk_len-1] == '\n') {
			/* Complete line: keep it minus the newline and stop */
			icu4lua_addusize(&buffer, chunk_len - 1);
			icu4lua_pushuresult(&buffer, UFILE_UV_USTRING_META, UFILE_UV_USTRING_POOL);
			return 1;
		}
		/* Partial line: append the chunk and keep reading */
		icu4lua_addusize(&buffer, chunk_len);
	}
}
Example #8
0
U_CDECL_BEGIN
/**
 * Data-driven test for u_sprintf_u/u_fprintf_u: for each case of the
 * "icuio/printf" test data, formats the argument both into a buffer and
 * into a file, then checks the result string, the returned lengths and
 * that exactly the right number of characters was written.
 */
static void U_CALLCONV DataDrivenPrintf(void)
{
#if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_FILE_IO
    UErrorCode errorCode;
    TestDataModule *dataModule;
    TestData *testData;
    const DataMap *testCase;
    DataDrivenLogger logger;
    UChar uBuffer[512];
    char cBuffer[512];
    char cFormat[sizeof(cBuffer)];
    char cExpected[sizeof(cBuffer)];
    UnicodeString tempStr;
    UChar format[512];
    UChar expectedResult[512];
    UChar argument[512];
    int32_t i;
    int8_t i8;
    int16_t i16;
    int32_t i32;
    int64_t i64;
    double dbl;
    int32_t uBufferLenReturned;

    const char *fileLocale = "en_US_POSIX";
    int32_t uFileBufferLenReturned;
    LocalUFILEPointer testFile;

    errorCode=U_ZERO_ERROR;
    dataModule=TestDataModule::getTestDataModule("icuio", logger, errorCode);
    if(U_SUCCESS(errorCode)) {
        testData=dataModule->createTestData("printf", errorCode);
        if(U_SUCCESS(errorCode)) {
            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
                if(U_FAILURE(errorCode)) {
                    log_err("error retrieving icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
                testFile.adoptInstead(u_fopen(STANDARD_TEST_FILE, "w", fileLocale, "UTF-8"));
                if (testFile.isNull()) {
                    log_err("Can't open test file - %s\n",
                            STANDARD_TEST_FILE);
                    continue;
                }
                /* Fill the buffer with '*' sentinels so we can detect how
                 * many characters were actually written */
                u_memset(uBuffer, 0x2A, UPRV_LENGTHOF(uBuffer));
                uBuffer[UPRV_LENGTHOF(uBuffer)-1] = 0;
                tempStr=testCase->getString("format", errorCode);
                tempStr.extract(format, UPRV_LENGTHOF(format), errorCode);
                tempStr=testCase->getString("result", errorCode);
                tempStr.extract(expectedResult, UPRV_LENGTHOF(expectedResult), errorCode);
                tempStr=testCase->getString("argument", errorCode);
                tempStr.extract(argument, UPRV_LENGTHOF(argument), errorCode);
                u_austrncpy(cBuffer, format, sizeof(cBuffer));
                if(U_FAILURE(errorCode)) {
                    log_err("error retrieving icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
                log_verbose("Test %d: format=\"%s\"\n", i, cBuffer);
                switch (testCase->getString("argumentType", errorCode)[0]) {
                case 0x64:  // 'd' double
                    dbl = atof(u_austrcpy(cBuffer, argument));
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, dbl);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, dbl);
                    break;
                case 0x31:  // '1' int8_t
                    i8 = (int8_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i8);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i8);
                    break;
                case 0x32:  // '2' int16_t
                    i16 = (int16_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i16);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i16);
                    break;
                case 0x34:  // '4' int32_t
                    i32 = (int32_t)uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i32);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i32);
                    break;
                case 0x38:  // '8' int64_t
                    i64 = uto64(argument);
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, i64);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, i64);
                    break;
                case 0x73:  // 's' char *
                    u_austrncpy(cBuffer, argument, sizeof(cBuffer));
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, cBuffer);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, cBuffer);
                    break;
                case 0x53:  // 'S' UChar *
                    uBufferLenReturned = u_sprintf_u(uBuffer, format, argument);
                    uFileBufferLenReturned = u_fprintf_u(testFile.getAlias(), format, argument);
                    break;
                default:
                    uBufferLenReturned = 0;
                    uFileBufferLenReturned = 0;
                    log_err("Unknown type %c for test %d\n", testCase->getString("argumentType", errorCode)[0], i);
                }
                if (u_strcmp(uBuffer, expectedResult) != 0) {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    u_austrncpy(cFormat, format, sizeof(cFormat));
                    u_austrncpy(cExpected, expectedResult, sizeof(cExpected));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE string test case %d \"%s\" - Got: \"%s\" Expected: \"%s\"\n",
                            i, cFormat, cBuffer, cExpected);
                }
                if (uBufferLenReturned <= 0) {
                    log_err("FAILURE test case %d - \"%s\" is an empty string.\n",
                            i, cBuffer);
                }
                else if (uBuffer[uBufferLenReturned-1] == 0
                    || uBuffer[uBufferLenReturned] != 0
                    || uBuffer[uBufferLenReturned+1] != 0x2A
                    || uBuffer[uBufferLenReturned+2] != 0x2A)
                {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE test case %d - \"%s\" wrong amount of characters was written. Got %d.\n",
                            i, cBuffer, uBufferLenReturned);
                }
                /* Reopen the file for reading to verify what was written */
                testFile.adoptInstead(u_fopen(STANDARD_TEST_FILE, "r", fileLocale, "UTF-8"));
                if (testFile.isNull()) {
                    log_err("Can't open test file - %s\n",
                            STANDARD_TEST_FILE);
                    /* Fix: without this 'continue', u_fgets below would
                     * dereference a NULL stream */
                    continue;
                }
                uBuffer[0]=0;
                u_fgets(uBuffer, UPRV_LENGTHOF(uBuffer), testFile.getAlias());
                if (u_strcmp(uBuffer, expectedResult) != 0) {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    u_austrncpy(cFormat, format, sizeof(cFormat));
                    u_austrncpy(cExpected, expectedResult, sizeof(cExpected));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE file test case %d \"%s\" - Got: \"%s\" Expected: \"%s\"\n",
                            i, cFormat, cBuffer, cExpected);
                }
                if (uFileBufferLenReturned != uBufferLenReturned)
                {
                    u_austrncpy(cBuffer, uBuffer, sizeof(cBuffer));
                    cBuffer[sizeof(cBuffer)-1] = 0;
                    log_err("FAILURE uFileBufferLenReturned(%d) != uBufferLenReturned(%d)\n",
                            uFileBufferLenReturned, uBufferLenReturned);
                }

                if(U_FAILURE(errorCode)) {
                    log_err("error running icuio/printf test case %d - %s\n",
                            i, u_errorName(errorCode));
                    errorCode=U_ZERO_ERROR;
                    continue;
                }
            }
            delete testData;
        }
        delete dataModule;
    }
    else {
        log_data_err("Failed: could not load test icuio data\n");
    }
#endif
}
Example #9
0
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with,
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as be matched by this pattern.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        /* A missing dictionary is not fatal: we just report it and return */
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            /* Progress indicator, rewritten in place with '\r' */
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form
         * of this entry, with regards to case variations (see the "extended"
         * example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    if (lines>10000) {
        /* Terminate the '\r' progress line, if it was ever printed */
        u_printf("\n");
    }
    u_fclose(f);
}
Example #10
0
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF, 
		    MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph,
		    Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
		    int config_files_status,
		    d_class_equiv_T* D_CLASS_EQUIV, int error_check_status,
		    Korean* korean,const char* pkgdir) {
	U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
	unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
	unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
	int l; //length of the line scanned
	DLC_entry_T* dlc_entry;
	MU_forms_T MU_forms; //inflected forms of the MWU
	int err;

	//Open DELAS/DELAC
	dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
	if (!dlc) {
		return 1;
	}
	//Open DELAF/DELACF
	dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
	if (!dlcf) {
		error("Unable to open file: '%s' !\n", DLCF);
		return 1;
	}
	//Inflect one entry at a time
	l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
	//Omit the final newline
	u_chomp_new_line(input_line);
	int flag = 0;
	//If a line is empty the file is not necessarily finished.
	//If the last entry has no newline, we should not skip this entry
	struct dela_entry* DELAS_entry;
	int semitic;
	int current_line=0;
	while (l != EOF) {
	   current_line++;
		DELAS_entry = is_strict_DELAS_line(input_line, alph);
		if (DELAS_entry != NULL) {
			/* If we have a strict DELAS line, that is to say, one with
			 * a simple word */
			if (error_check_status==ONLY_COMPOUND_WORDS) {
				error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
				free_dela_entry(DELAS_entry);
				goto next_line;
			}
			SU_forms_T forms;
			SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
			char inflection_code[1024];
			unichar code_gramm[1024];
			/* We take the first grammatical code, and we extract from it the name
			 * of the inflection transducer to use */
			get_inflection_code(DELAS_entry->semantic_codes[0],
					inflection_code, code_gramm, &semitic);
			/* And we inflect the word */
			//   err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
			err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma, inflection_code,
					DELAS_entry->filters, &forms, semitic, korean,pkgdir);
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
			/* Then, we print its inflected forms to the output */
			for (int i = 0; i < forms.no_forms; i++) {
			   
			   unichar foo[1024];   
			   if (korean!=NULL) {
			      Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
			   } else {
			      u_strcpy(foo,forms.forms[i].form);
			   }
			   
			   u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/,
						DELAS_entry->lemma, code_gramm);
				/* We add the semantic codes, if any */
				for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
					u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
				}
				if (forms.forms[i].local_semantic_code != NULL) {
					u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
				}
				if (forms.forms[i].raw_features != NULL
						&& forms.forms[i].raw_features[0] != '\0') {
					u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
				}
				u_fprintf(dlcf, "\n");
			}
			SU_delete_inflection(&forms);
			free_dela_entry(DELAS_entry);
			/* End of simple word case */
		} else {
			/* If we have not a simple word DELAS line, we try to analyse it
			 * as a compound word DELAC line */
			if (error_check_status==ONLY_SIMPLE_WORDS) {
				error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
				goto next_line;
			}
			if (config_files_status != CONFIG_FILES_ERROR) {
				/* If this is a compound word, we process it if and only if the
				 * configuration files have been correctly loaded */
				dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
				if (!dlc_entry) {
					fatal_alloc_error("inflect");
				}
				/* Convert a DELAC entry into the internal multi-word format */
				err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
				if (!err) {
					//Inflect the entry
					MU_init_forms(&MU_forms);
					err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,
							mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
					if (!err) {
						int f; //index of the current inflected form
						//Inform the user if no form generated
						if (MU_forms.no_forms == 0) {
							error("No inflected form could be generated for ");
							DLC_print_entry(pL_MORPHO,dlc_entry);
						}
						//Print inflected forms
						for (f = 0; f < MU_forms.no_forms; f++) {
							//Format the inflected form to the DELACF format
							err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE
									- 1, MU_forms.forms[f], dlc_entry,
									D_CLASS_EQUIV);
							if (!err) {
								//Print one inflected form at a time to the DELACF file
								u_fprintf(dlcf, "%S\n", output_line);
							}
						}
					}
					MU_delete_inflection(&MU_forms);
					DLC_delete_entry(dlc_entry);
				}
			} else {
				/* We try to inflect a compound word whereas the "Morphology.txt" and/or
				 * "Equivalences.txt" file(s) has/have not been loaded */
				if (!flag) {
					/* We use a flag to print the error message only once */
					error(
							"WARNING: Compound words won't be inflected because configuration files\n");
					error("         have not been correctly loaded.\n");
					flag = 1;
				}
			}
		}
		next_line:
		//Get next entry
		l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
		if (l!=EOF) {
			//Omit the final newline
			u_chomp_new_line(input_line);
			if (input_line[0]=='\0') {
				/* If we find an empty line, then we go on */
				goto next_line;
			}
		}
	}
	u_fclose(dlc);
	u_fclose(dlcf);
	return 0;
}
Example #11
0
/**
 * Computes training by extracting statistics from a tagged corpus file.
 * Statistics for raw forms are written to 'rforms_file' (if non-NULL) and
 * statistics for inflected forms to 'iforms_file' (if non-NULL).
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
	rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
	iforms_table = new_string_hash_ptr(200000);
}


/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);


unichar line[MAX_TAGGED_CORPUS_LINE];

/* check the format of the corpus: we peek at the first line and then
 * rewind so the main loop re-reads it */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
	fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);

int format_corpus = check_corpus_entry(line);

if(format_corpus == 0){
	// the corpus is in the Tagger format, one word per line where line=word/tag
	while(u_fgets(line,input_text) !=EOF){
		if(u_strlen(line) == 0){
			/* an empty line ends a sentence: reset the context window */
			initialize_context_matrix(context);
		}
		else{
			corpus_entry* entry = new_corpus_entry(line);
			/* a '_' inside the word marks a compound: split it into its
			 * simple words and feed each one to the context */
			if(u_strchr(line,'_')!=NULL && line[0]!='_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int i=0;entries[i]!=NULL;i++){
					push_corpus_entry(entries[i],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}
		}
	}
}
else {
	// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
	unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
	int current_len,len;
	unsigned int i;
	while(u_fgets(line,input_text) != EOF){
		current_len = 0, len = 0;
		/* extract each token of the sentence */
		for (;;) {
			/* len = offset of the next '}' relative to current_len,
			 * i.e. the length of the current {…} token */
			len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
			tmp = u_strcpy_sized(s,len-1,line+current_len+1);
			u_strcat(tmp,"\0");
			/* {S} is the sentence delimiter: stop here */
			if(u_strcmp(s,"S") == 0)
				break;

			//particular case: '\},\}.PONCT'
			if(line[current_len+2] == '}'){
				int start = current_len+3;
				do{
					tmp = u_strchr(line+start,'}');
					start += 1+u_strlen(line+start)-u_strlen(tmp);
				}
				while(*(tmp+1) != ' ');
				tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
				u_strcat(tmp,"\0");
				len += start-current_len-3;
			}

			/* format the {XX.YY} into standard tagger format, XX/YY */
			unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
			if(u_strchr(s,',')[1] == ','){
				/* the word itself is a comma: ',,' */
				u_strcpy(newline,",");
			}
			else
				u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
			/* NOTE(review): 'newline' is both the destination and an argument
			 * of u_sprintf here — this relies on u_sprintf tolerating
			 * overlapping buffers; confirm against its implementation */
			u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
			/* spaces inside a token are encoded as '_' */
			for(i=0;i<u_strlen(newline);i++){
				if(newline[i] == ' ')
					newline[i] = '_';
			}

			//create corpus entry
			corpus_entry* entry = new_corpus_entry(newline);
			if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int j=0;entries[j]!=NULL;j++){
					push_corpus_entry(entries[j],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}

			free(newline);
			current_len += len+1;
		}
		/* end of sentence: reset the context window */
		initialize_context_matrix(context);
	}
	free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
	write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
	u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
	free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
	write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
	u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
	free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
Example #12
0
File: Tagset.cpp  Project: adri87/Q-A
/**
 * This function reads a POS section from the given tagset file and returns the
 * corresponding structure, or NULL if no POS section is found before the end
 * of the file.
 */
pos_section_t* parse_pos_section(U_FILE* f) {
unichar buf[MAXBUF];
unichar line[MAXBUF];
/* We look for a non empty line containing "POS xxx" */
token_t* toks=NULL;
while (toks==NULL) {
   if (u_fgets(line,MAXBUF,f)==EOF) {
      return NULL;
   }
   line_cleanup(line);
   /* tokenize works in place, so we keep 'line' intact for error messages */
   u_strcpy(buf,line);
   toks=tokenize(buf);
}
if (toks->type!=TOK_POS) {
   fatal_error("Parsing error: 'POS' section expected (%S).\n",line);
}
if (toks->next==NULL || toks->next->str==NULL) {
   fatal_error("POS section needs a name\n");
}
pos_section_t* pos_section=new_pos_section_t(toks->next->str);
free_token_t(toks);
/* Then, we look for all the elements of the POS definition.
 * NOTE(review): u_fgets returning 0 (an empty line) also ends this loop —
 * presumably a POS section is never interrupted by blank lines; confirm */
int partid=PART_NUM;
while (partid!=-1 && u_fgets(line,MAXBUF,f)>0) {
   line_cleanup(line);
   u_strcpy(buf,line);
   toks=tokenize(buf);
   if (toks==NULL) {
      continue;
   }
   switch (toks->type) {
      case TOK_IGNORE:
         pos_section->ignore=true;
         /* The token list is not kept for this case, so it must be freed
          * here (the previous code leaked it) */
         free_token_t(toks);
         break;

      case TOK_DISCR:
         partid=PART_DISCR;
         free_token_t(toks);
         break;

      case TOK_FLEX:
         partid=PART_FLEX;
         free_token_t(toks);
         break;

      case TOK_CAT:
         partid=PART_CAT;
         free_token_t(toks);
         break;

      case TOK_COMPLET:
         partid=PART_COMP;
         free_token_t(toks);
         break;

      case TOK_END:
         partid=-1;
         free_token_t(toks);
         break;

      /* We add a tokenized line to the current POS section part */
      case TOK_STR:
      case TOK_ANGLE:
      case TOK_BLANK:
         switch (partid) {
            case PART_DISCR:
               if (pos_section->parts[PART_DISCR]!=NULL) {
                  fatal_error("Only one discriminant category can be specified.\n");
               }
               /* fall through: a discriminant line has the same format
                * as a category line */
            case PART_CAT:
               if (check_cat_line(toks)==-1) {
                  fatal_error("Bad cat line format: '%S'\n", line);
               }
               break;

            case PART_FLEX:
               if (check_flex_line(toks)==-1) {
                  fatal_error("Bad flex line format: '%S'\n", line);
               }
               break;

            case PART_COMP:
               if (check_complete_line(toks)==-1) {
                  fatal_error("Bad complete line format: '%S'\n", line);
               }
               break;

            case PART_NUM:
               fatal_error("No section specified. (line '%S')\n", line);

            default:	fatal_error("While parsing POS section: what am i doing here?\n");
         }
         /* The token list is kept: ownership passes to the section part */
         pos_section->parts[partid]=tokens_list_append(pos_section->parts[partid],toks);
         break;

      default: fatal_error("Error while parsing POS section with line '%S'\n", line);
   }
}
return pos_section;
}