void protect_text(const char *fileName, const VersatileEncodingConfig* vec){ U_FILE *file_reader = u_fopen(vec, fileName, U_READ); if(file_reader == NULL){ fatal_error("u_fopen"); } unichar *text = read_file(file_reader); unichar *protected_text = protect_lexical_tag(text, false); free(text); u_fclose(file_reader); U_FILE *file_write = u_fopen(vec, fileName, U_WRITE); if(file_write == NULL){ fatal_error("u_fopen"); } int written = u_fwrite(protected_text, u_strlen(protected_text),file_write); if(written != (int)u_strlen(protected_text)){ fatal_error("u_fwrite"); } u_fclose(file_write); free(protected_text); }
/**
 * Looks for the user configuration file: first "$HOME" + DOT_CONFIG_NAME,
 * then ETC_CONFIG_NAME appended to each colon-separated directory of
 * SYSCONFDIR. Returns the first successfully opened OFILE, or NULL.
 * On failure, *status may be set to OLY_ERR_FILE_NOT_FOUND.
 */
static OFILE *
find_config_file (OlyStatus *status)
{
    char        result[BUFSIZ], *token, *watch, *path = NULL;
    const char *basename = ETC_CONFIG_NAME, *sysconf_path = SYSCONFDIR;
    const char *the_colon = ":", *home = getenv("HOME");
    size_t      name_size = (strlen(basename) + 1), result_len = BUFSIZ;
    OFILE      *return_file = NULL;
    void       *free_me;

    if (*status != OLY_OKAY)
    {
        /* BUG FIX: the original read `if (cond) MACRO; { return NULL; }`,
         * so the bare block made the function ALWAYS return NULL. */
        HANDLE_OLY_STATUS(*status);
        return NULL;
    }
    /* BUG FIX: getenv("HOME") may return NULL; the original passed it
     * straight to strlen/strcpy (undefined behavior). */
    if (home != NULL)
    {
        path = strcat(strcpy(
                    omalloc((strlen(home) + strlen(DOT_CONFIG_NAME) + 1)),
                    home), DOT_CONFIG_NAME);
        return_file = u_fopen( path, "rb", NULL, NULL );
        free_me = (void *)path;
        OFREE(free_me);
    }
    if (return_file == NULL)
    {
        path = omalloc( (strlen(sysconf_path) + 1) );
        strcpy(path, sysconf_path);
        for ( token = strtok_r(path, the_colon, &watch); ( token != NULL );
                token = strtok_r(NULL, the_colon, &watch) )
        {
            if ((strlen(token) + name_size) < result_len)
            {
                strcpy( result, token );
                strcat( result, basename );
                return_file = u_fopen( result, "rb",
                        char_default_locale(), char_default_encoding() );
                /* BUG FIX: stop at the first successful open; the original
                 * kept looping and could clobber a valid handle with NULL
                 * (and leak the open file). */
                if (return_file != NULL)
                {
                    break;
                }
            }
            else
            {
                *status = OLY_ERR_FILE_NOT_FOUND;
                *result = '\0';
            }
        }
        free_me = (void *)path;
        OFREE(free_me);
    }
    return return_file;
}
/** * This function reads the given char order file. */ void read_char_order(const VersatileEncodingConfig* vec, const char* name, struct sort_infos* inf) { int c; int current_line = 1; U_FILE* f = u_fopen(vec, name, U_READ); if (f == NULL) { error("Cannot open file %s\n", name); return; } unichar current_canonical = '\0'; int current_priority = 0; while ((c = u_fgetc(f)) != EOF) { if (c != '\n') { /* we ignore the \n char */ if (inf->class_numbers[(unichar) c] != 0) { error("Error in %s: char 0x%x appears several times\n", name, c); } else { inf->class_numbers[(unichar) c] = current_line; if (current_canonical == '\0') { current_canonical = (unichar) c; } inf->canonical[(unichar) c] = current_canonical; inf->priority[(unichar) c] = ++current_priority; } } else { current_line++; current_canonical = '\0'; current_priority = 0; } } u_fclose(f); }
/** * Loads a compound word file, adding each word to the keywords. */ void load_compound_words(char* name,VersatileEncodingConfig* vec, struct string_hash_ptr* keywords) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return; Ustring* line=new_Ustring(256); Ustring* lower=new_Ustring(256); while (EOF!=readline(line,f)) { if (line->str[0]=='{') { /* We skip tags */ continue; } u_strcpy(lower,line->str); u_tolower(lower->str); int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)keywords->value[index]; add_keyword(&value,line->str,1); keywords->value[index]=value; } free_Ustring(line); free_Ustring(lower); u_fclose(f); }
/**
 * Opens a .fst2 file in output mode and returns the associated
 * Elag_fst_file_out structure, or NULL in case of open error.
 * Aborts with a fatal error on an invalid 'type' or allocation failure.
 */
Elag_fst_file_out* fst_file_out_open(const VersatileEncodingConfig* vec,const char* fname,int type) {
/* Validate the argument BEFORE allocating (the original allocated first) */
if (type<0 || type>=FST_BAD_TYPE) {
   fatal_error("fst_file_out_open: bad FST_TYPE\n");
}
Elag_fst_file_out* res=(Elag_fst_file_out*)malloc(sizeof(Elag_fst_file_out));
if (res==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
if ((res->f=u_fopen(vec,fname,U_WRITE))==NULL) {
   error("fst_out_open: unable to open '%s'\n",fname);
   free(res);
   return NULL;
}
res->fstart=ftell(res->f);
/* Placeholder for the automata count, to be overwritten later */
u_fprintf(res->f,"0000000000\n");
res->name=strdup(fname);
if (res->name==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
res->type=type;
res->nb_automata=0;
res->labels=new_string_hash(16);
/* We add <E> to the tags in order to be sure that this special tag will have #0 */
get_value_index(EPSILON,res->labels);
return res;
}
/**
 * Opens a UFILE whose name is given as a Unicode string.
 * The name is first converted to the default codepage; on Windows, if that
 * fails, the wide-character _wfopen API is tried directly.
 */
U_CAPI UFILE* U_EXPORT2
u_fopen_u(const UChar *filename, const char *perm, const char *locale, const char *codepage)
{
    UFILE *result;
    char buffer[256];

    /* NOTE(review): names longer than the buffer are silently truncated */
    u_austrcpy(buffer, filename);
    result = u_fopen(buffer, perm, locale, codepage);
#if U_PLATFORM_USES_ONLY_WIN32_API
    /* Try Windows API _wfopen if the above fails. */
    if (!result) {
        // TODO: test this code path, including wperm.
        wchar_t wperm[40] = {};
        size_t retVal;
        mbstowcs_s(&retVal, wperm, perm, _TRUNCATE);
        FILE *systemFile = _wfopen((const wchar_t *)filename, wperm);
        if (systemFile) {
            result = finit_owner(systemFile, locale, codepage, TRUE);
            if (!result) {
                /* Something bad happened. Maybe the converter couldn't be
                 * opened. BUG FIX: close only when the handle is valid;
                 * the original could reach fclose(NULL) — undefined
                 * behavior — when _wfopen itself had failed. */
                fclose(systemFile);
            }
        }
    }
#endif
    return result; /* not a file leak */
}
/**
 * Opens a UFILE whose name is given as a Unicode string.
 * The name is first converted to the default codepage; on Windows, if that
 * fails, the wide-character _wfopen API is tried directly.
 */
U_CAPI UFILE* U_EXPORT2
u_fopen_u(const UChar *filename, const char *perm, const char *locale, const char *codepage)
{
    UFILE *result;
    char buffer[256];

    u_austrcpy(buffer, filename);
    result = u_fopen(buffer, perm, locale, codepage);
#if U_PLATFORM_USES_ONLY_WIN32_API
    /* Try Windows API _wfopen if the above fails. */
    if (!result) {
        /* BUG FIX: 'perm' is a narrow char string; the original cast it
         * to UChar* and handed _wfopen a bogus wide mode string. Convert
         * it to a proper wchar_t string instead. */
        wchar_t wperm[40] = {};
        size_t retVal;
        mbstowcs_s(&retVal, wperm, perm, _TRUNCATE);
        FILE *systemFile = _wfopen((const wchar_t *)filename, wperm);
        if (systemFile) {
            result = finit_owner(systemFile, locale, codepage, TRUE);
            if (!result) {
                /* Something bad happened. Maybe the converter couldn't be
                 * opened. BUG FIX: close only a valid handle; the original
                 * could call fclose(NULL) when _wfopen failed. */
                fclose(systemFile);
            }
        }
    }
#endif
    return result; /* not a file leak */
}
/**
 * Loads the token sequence of a text: reads the token table from
 * 'tokens_text_name' into *tokens, then reads the binary .cod file (a flat
 * array of int token ids) and returns the corresponding linked list of
 * cassys_tokens_list cells. Exits the process if the .cod file is missing.
 */
cassys_tokens_list *cassys_load_text(const char *tokens_text_name, const char *text_cod_name,
        struct text_tokens **tokens) {
    int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
    *tokens = load_text_tokens(tokens_text_name, mask_encoding_compatibility_input);
    U_FILE *f = u_fopen(BINARY, text_cod_name, U_READ);
    if (f == NULL) {
        perror("fopen\n");
        fprintf(stderr, "Cannot open file %s\n", text_cod_name);
        exit(1);
    }
    cassys_tokens_list *head = NULL;
    cassys_tokens_list *tail = NULL;
    int token_id;
    /* One int per token: append a cell per id, preserving file order */
    while (fread(&token_id, sizeof(int), 1, f) == 1) {
        cassys_tokens_list *cell = new_element((*tokens)->token[token_id], 0);
        if (head == NULL) {
            head = cell;
        } else {
            tail->next_token = cell;
        }
        tail = cell;
    }
    u_fclose(f);
    return head;
}
/** * Loads the initial keyword list from a tok_by_freq.txt file, * and turns all those tokens in a list whose primary key is the * lower case token: * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50) */ struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return NULL; Ustring* line=new_Ustring(128); Ustring* lower=new_Ustring(128); struct string_hash_ptr* res=new_string_hash_ptr(1024); int val,pos; /* We skip the first line of the file, containing the number * of tokens */ if (EOF==readline(line,f)) { fatal_error("Invalid empty file %s\n",name); } while (EOF!=readline(line,f)) { if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) { fatal_error("Invalid line in file %s:\n%S\n",name,line->str); } u_strcpy(lower,line->str+pos); u_tolower(lower->str); int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)res->value[index]; res->value[index]=new_KeyWord(val,line->str+pos,value); } free_Ustring(line); free_Ustring(lower); u_fclose(f); return res; }
/**
 * Returns the size in bytes of the given file, or -1 if not found.
 */
long get_file_size(const char* name) {
U_FILE* f=u_fopen(ASCII,name,U_READ);
if (f==NULL) {
   return -1;
}
/* Seek to the end and read back the position to get the byte count */
fseek(f,0,SEEK_END);
long n=ftell(f);
u_fclose(f);
return n;
}
/** * This function takes two concordance index (in1 and in2) and * produces a HTML file (out) that shows the differences between * those two concordances. */ int diff(const VersatileEncodingConfig* vec,const char* in1,const char* in2,const char* out, const char* font,int size,int diff_only) { char concor1[FILENAME_MAX]; char concor2[FILENAME_MAX]; get_path(in1,concor1); strcat(concor1,"concord-1.txt"); get_path(in2,concor2); strcat(concor2,"concord-2.txt"); /* First, we build the two concordances */ create_text_concordances(vec,in1,in2,concor1,concor2); /* Then, we load the two index */ U_FILE* f1=u_fopen(vec,in1,U_READ); if (f1==NULL) return 0; struct match_list* l1=load_match_list(f1,NULL,NULL); u_fclose(f1); U_FILE* f2=u_fopen(vec,in2,U_READ); if (f2==NULL) { return 0; } struct match_list* l2=load_match_list(f2,NULL,NULL); u_fclose(f2); /* We open the output file in UTF8, because the GUI expects this file * to be that encoded */ U_FILE* output=u_fopen(UTF8,out,U_WRITE); if (output==NULL) { fatal_error("Cannot open output file %s\n",out); return 0; } /* We open the two concordance files */ f1=u_fopen(vec,concor1,U_READ); f2=u_fopen(vec,concor2,U_READ); /* And then we fill the output file with the differences * between the two concordances */ print_diff_HTML_header(output,font,size); compute_concordance_differences(l1,l2,f1,f2,output,diff_only); print_diff_HTML_end(output); free_match_list(l1); free_match_list(l2); u_fclose(f1); u_fclose(f2); u_fclose(output); /* We remove the tmp files */ //af_remove(concor1); //af_remove(concor2); return 1; }
/**
 * This function takes a unicode string representing a regular expression and
 * compiles it into a .grf file. It returns 1 in case of success; 0 otherwise.
 * The resulting graph always has state 0 as initial state and state 1 as
 * final state; the compiled expression is linked between them.
 */
int reg2grf(const unichar* regexp,const char* name_grf, const VersatileEncodingConfig* vec) {
if (regexp[0]=='\0') {
   error("You must specify a non empty regular expression\n");
   return 0;
}
U_FILE* out=u_fopen(vec,name_grf,U_WRITE);
if (out==NULL) {
   error("Cannot open the output file for the regular expression\n");
   return 0;
}
struct reg2grf_info* INFO=new_reg2grf_info();
/* We create the initial and final states that must have numbers 0 and 1 */
add_state(INFO,u_strdup("<E>"));
add_state(INFO,u_strdup(""));
/* We print the grf header (fixed display settings expected by the GUI) */
u_fprintf(out,"#Unigraph\n");
u_fprintf(out,"SIZE 1313 950\n");
u_fprintf(out,"FONT Times New Roman: 12\n");
u_fprintf(out,"OFONT Times New Roman:B 12\n");
u_fprintf(out,"BCOLOR 16777215\n");
u_fprintf(out,"FCOLOR 0\n");
u_fprintf(out,"ACOLOR 12632256\n");
u_fprintf(out,"SCOLOR 16711680\n");
u_fprintf(out,"CCOLOR 255\n");
u_fprintf(out,"DBOXES y\n");
u_fprintf(out,"DFRAME y\n");
u_fprintf(out,"DDATE y\n");
u_fprintf(out,"DFILE y\n");
u_fprintf(out,"DDIR y\n");
u_fprintf(out,"DRIG n\n");
u_fprintf(out,"DRST n\n");
u_fprintf(out,"FITS 100\n");
u_fprintf(out,"PORIENT L\n");
u_fprintf(out,"#\n");
int input_state;
int output_state;
/* Compile the expression into an automaton piece delimited by
 * input_state and output_state */
int result=reg_2_grf(regexp,&input_state,&output_state,INFO);
if (result!=1) {
   /* On failure we remove the partially written .grf file */
   u_fclose(out);
   af_remove(name_grf);
   free_reg2grf_info(INFO);
   if (result==0) {
      error("Syntax error in regular expression\n");
   }
   return 0;
}
/* If the compilation has successed, we must link the resulting automaton piece
 * to the grf's initial and final states */
add_transition(0,input_state,INFO);
add_transition(output_state,1,INFO);
save_states(out,INFO);
free_reg2grf_info(INFO);
u_fclose(out);
return 1;
}
int save_offsets(const VersatileEncodingConfig* vec, const char* filename, const vector_offset* offsets) { U_FILE* f_output_offsets = u_fopen(vec, filename, U_WRITE); if (f_output_offsets == NULL) { error("Cannot create offset file %s\n", filename); return 1; } save_offsets(f_output_offsets, offsets); u_fclose(f_output_offsets); return 0; }
/**
 * Loads an alphabet file and returns the associated 'Alphabet*' structure.
 * If 'korean' is non null, we compute the equivalences between Chinese and Hangul
 * characters.
 * File format (one entry per line): "Xx" declares upper X / lower x,
 * a single char declares a caseless letter, and "#AZ" declares the
 * whole interval [A..Z]. Returns NULL on open error or bad interval.
 */
Alphabet* load_alphabet(const VersatileEncodingConfig* vec,const char* filename,int korean) {
/* If this alphabet was made persistent, reuse the cached structure */
void* a=get_persistent_structure(filename);
if (a!=NULL) {
   return (Alphabet*)a;
}
U_FILE* f;
f=u_fopen(vec,filename,U_READ);
if (f==NULL) {
   return NULL;
}
Alphabet* alphabet=new_alphabet(korean);
int c;
unichar lower,upper;
while ((c=u_fgetc(f))!=EOF) {
   upper=(unichar)c;
   if (upper=='\n') {
      /* We skip empty lines */
      continue;
   }
   if (upper=='#') {
      /* we are in the case of an interval #AZ -> [A..Z] */
      lower=(unichar)u_fgetc(f);
      upper=(unichar)u_fgetc(f);
      if (lower>upper) {
         error("Error in alphabet file: for an interval like #AZ, A must be before Z\n");
         free_alphabet(alphabet);
         u_fclose(f);
         return NULL;
      }
      for (c=lower;c<=upper;c++) {
         /* Every char of the interval counts both as upper (1) and lower (2) case */
         SET_CASE_FLAG_MACRO(c,alphabet,1|2);
         add_letter_equivalence(alphabet,(unichar)c,(unichar)c);
      }
      u_fgetc(f); // reading the \n
   } else {
      SET_CASE_FLAG_MACRO(upper,alphabet,1);
      lower=(unichar)u_fgetc(f);
      if (lower!='\n') {
         SET_CASE_FLAG_MACRO(lower,alphabet,2);
         u_fgetc(f); // reading the \n
         add_letter_equivalence(alphabet,lower,upper);
      } else {
         // we are in the case of a single (no min/maj distinction like in thai)
         SET_CASE_FLAG_MACRO(upper,alphabet,2);
         add_letter_equivalence(alphabet,upper,upper);
      }
   }
}
u_fclose(f);
return alphabet;
}
/**
 * Saves snt offsets to the given file, as a binary file containing integers.
 * Returns 1 in case of success; 0 otherwise.
 */
int save_snt_offsets(vector_int* snt_offsets,const char* name) {
if (snt_offsets==NULL) {
   fatal_error("Unexpected NULL offsets in save_snt_offsets\n");
}
/* snt offsets always come in triples of ints */
if (snt_offsets->nbelems%3 != 0) {
   fatal_error("Invalid offsets in save_snt_offsets\n");
}
U_FILE* out=u_fopen(BINARY,name,U_WRITE);
if (out==NULL) {
   return 0;
}
int n_written=(int)(fwrite(snt_offsets->tab,sizeof(int),snt_offsets->nbelems,out));
u_fclose(out);
return (n_written==snt_offsets->nbelems);
}
/**
 * Appends a formatted debug message to /tmp/formatter-debug.txt.
 * Change this to report differently when a library or commandline tool.
 */
void jni_report( const char *fmt, ... )
{
    va_list ap;
    UChar message[128];
    va_start( ap, fmt );
    u_vsnprintf( message, 128, fmt, ap );
    UFILE *db = u_fopen("/tmp/formatter-debug.txt","a+",NULL,NULL);
    if ( db != NULL )
    {
        /* BUG FIX: 'message' is a UChar string, so it must be printed with
         * %S; %s expects a narrow char* and would misread the buffer. */
        u_fprintf( db, "%S", message );
        u_fclose( db );
    }
    va_end( ap );
}
/**
 * Loads the given offset file. Returns NULL in case of error.
 * Aborts with a fatal error if a line does not hold exactly four integers.
 */
vector_offset* load_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_offset* offsets=new_vector_offset();
int old_start,old_end,new_start,new_end;
int n_read;
/* Each record is made of four integers describing one offset mapping */
while ((n_read=u_fscanf(f,"%d%d%d%d",&old_start,&old_end,&new_start,&new_end))!=EOF) {
   if (n_read!=4) {
      fatal_error("Corrupted offset file %s\n",name);
   }
   vector_offset_add(offsets,old_start,old_end,new_start,new_end);
}
u_fclose(f);
return offsets;
}
/**
 * Lists the files of a pack archive into 'filename_out', written with the
 * given encoding. If 'filename_only' is non null, only names are listed.
 * When no output name is provided, a NULL handle is passed through.
 */
int do_list_file_in_pack_archive_to_file_with_encoding(const char* packFileName, const char* filename_out, Encoding encoding, int filename_only)
{
    U_FILE* fileout = NULL;
    /* Only open an output file when a non-empty name was provided */
    if ((filename_out != NULL) && (*filename_out != '\0')) {
        fileout = u_fopen(encoding, filename_out, U_WRITE);
    }
    int result = do_list_file_in_pack_archive_to_filehandle(packFileName, fileout, filename_only);
    if (fileout != NULL) {
        u_fclose(fileout);
    }
    return result;
}
/**
 * Loads and evaluates the script file named by 'arg'.
 * Exits the process if the file cannot be opened.
 */
static jz_val load(JZ_STATE, jz_args* args, jz_val arg) {
  char* filename = jz_str_to_chars(jz, jz_to_str(jz, arg));
  UFILE* file = u_fopen(filename, "r", NULL, NULL);
  if (file == NULL) {
    fprintf(stderr, "File does not exist: %s\n", filename);
    exit(1);
  }
  free(filename);
  jz_val result = jz_load(jz, file);
  u_fclose(file);
  return result;
}
/** * Loads the given DELAF and modifies the given keywords accordingly by * replacing any non removed token that appear in a DELAF entry * by its lemma. If there are ambiguities, several keywords are * generated. Doing that may merge keywords by adding their weights: * eats/2 + eaten/3 => eat/5 */ void filter_keywords_with_dic(struct string_hash_ptr* keywords,char* name, VersatileEncodingConfig* vec,Alphabet* alphabet) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) { error("Cannot load file %s\n",name); return; } Ustring* line=new_Ustring(128); while (EOF!=readline(line,f)) { struct dela_entry* e=tokenize_DELAF_line(line->str); if (e==NULL) continue; lemmatize(e,keywords,alphabet); free_dela_entry(e); } free_Ustring(line); u_fclose(f); }
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 * Returns NULL if the file cannot be opened or contains a malformed line.
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f;
f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_int* v=new_vector_int();
Ustring* line=new_Ustring();
int token_index,start,end;
while (EOF!=readline(line,f)) {
   /* BUG FIX: the original never checked the u_sscanf return value, so
    * a corrupted line silently inserted garbage offsets */
   if (3!=u_sscanf(line->str,"%d%d%d",&token_index,&start,&end)) {
      error("Invalid line in offset file %s:\n%S\n",name,line->str);
      free_Ustring(line);
      free_vector_int(v);
      u_fclose(f);
      return NULL;
   }
   vector_int_add(v,start);
   vector_int_add(v,end);
}
free_Ustring(line);
u_fclose(f);
/* A vector_uima_offset is a vector_int of (start,end) pairs */
return (vector_uima_offset*)v;
}
/**
 * Loads a CasSys transducer list file: each line names a transducer and its
 * output policy. Returns the linked list of (name, mode) pairs, in file order.
 * Exits the process if the file cannot be opened.
 */
struct transducer_name_and_mode_linked_list *load_transducer_list_file(const char *transducer_list_name) {
U_FILE *file_transducer_list;
struct transducer_name_and_mode_linked_list * res=NULL;
file_transducer_list = u_fopen(ASCII, transducer_list_name,U_READ);
if( file_transducer_list == NULL){
    perror("u_fopen\n");
    fprintf(stderr,"Impossible d'ouvrir le fichier %s\n",transducer_list_name);
    exit(1);
}
char line[1024];
int i=1; /* current line number, used in diagnostics */
while (cassys_fgets(line,1024,file_transducer_list) != NULL){
    char *transducer_file_name;
    OutputPolicy transducer_policy;
    remove_cassys_comments(line);
    transducer_file_name = extract_cassys_transducer_name(line);
    //fprintf(stdout, "transducer name read =%s\n",transducer_file_name);
    transducer_policy = extract_cassys_transducer_policy(line);
    if (transducer_file_name != NULL && transducer_policy != IGNORE_OUTPUTS) {
        res=add_transducer_linked_list_new_name(res,transducer_file_name);
        set_last_transducer_linked_list_mode(res,transducer_policy);
    }
    else {
        /* Diagnose why the line was rejected: no name vs unknown policy */
        if (transducer_file_name == NULL) {
            fprintf(stdout, "Line %d : Empty line\n",i);
        }
        else if (transducer_policy == IGNORE_OUTPUTS) {
            fprintf(stdout, "Line %d : Transducer policy not recognized\n",i);
        }
    }
    free(transducer_file_name);
    i++;
}
u_fclose(file_transducer_list);
return res;
}
/**
 * Loads a .fst2 file with the given name and type, according to the
 * given language description.
 * Reads the header (number of automata), loads the tag symbols, and
 * returns the ready-to-read structure, or NULL on any error.
 */
Elag_fst_file_in* load_elag_fst2_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
Elag_fst_file_in* fstf=(Elag_fst_file_in*)malloc(sizeof(Elag_fst_file_in));
if (fstf==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
fstf->name=strdup(fname);
if (fstf->name==NULL) {
   fatal_alloc_error("load_elag_fst2_file");
}
if ((fstf->f=u_fopen(vec,fname,U_READ))==NULL) {
   error("load_fst_file: unable to open '%s' for reading\n",fname);
   goto error_fstf;
}
unichar buf[MAXBUF];
/* The first line must start with a digit: the number of automata */
if (u_fgets(buf,MAXBUF,fstf->f)==EOF) {
   error("load_fst_file: '%s' is empty\n",fname);
   goto error_f;
}
if (!u_is_digit(*buf)) {
   error("load_fst_file: %s: bad file format\n",fname);
   goto error_f;
}
fstf->nb_automata=u_parse_int(buf);
fstf->language=language;
fstf->type=FST_GRAMMAR;
/* pos0 = file position right after the header line */
fstf->pos0=(int)ftell(fstf->f);
fstf->symbols=new_string_hash_ptr(64);
fstf->renumber=NULL;
if (load_elag_fst2_tags(fstf)==-1) {
   error("load_fst_file: %s: cannot load symbols\n",fstf->name);
   goto error_symbols;
}
fstf->pos=0;
return fstf;
/* If an error occurs: unwind exactly what was acquired so far */
error_symbols: free_string_hash_ptr(fstf->symbols,(void(*)(void*))free_symbols);
error_f: u_fclose(fstf->f);
error_fstf: free(fstf->name);
free(fstf);
return NULL;
}
/** * Loads the given DELA into the given DELA tree. */ void load_DELA(const VersatileEncodingConfig* vec,const char* name,struct DELA_tree* tree) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) { error("Cannot load dictionary %s\n",name); return; } u_printf("Loading %s...\n",name); Ustring* line=new_Ustring(4096); while (EOF!=readline(line,f)) { struct dela_entry* entry=tokenize_DELAF_line(line->str,1); if (entry!=NULL) { add_entry(tree,entry); } /* We don't need to free the entry, since it's done (if needed) * in the insertion function */ } free_Ustring(line); u_fclose(f); }
/**
 * Loads snt offsets from the given binary file.
 * Returns NULL if the file is missing, its size is not a whole number of
 * (int,int,int) triples, or the read is incomplete.
 */
vector_int* load_snt_offsets(const char* name) {
U_FILE* f=u_fopen(BINARY,name,U_READ);
if (f==NULL) return NULL;
long size=get_file_size(f);
if (size%(3*sizeof(int))!=0) {
   u_fclose(f);
   return NULL;
}
vector_int* v=new_vector_int((int)(size/sizeof(int)));
if (size!=0) {
   int n=(int)fread(v->tab,sizeof(int),size/sizeof(int),f);
   if (n!=(int)(size/sizeof(int))) {
      u_fclose(f);
      free_vector_int(v);
      return NULL;
   }
   v->nbelems=v->size;
}
/* BUG FIX: the original only closed the file inside the size!=0 branch,
 * leaking the handle for empty files */
u_fclose(f);
return v;
}
/**
 * Lua binding: opens an ICU UFILE.
 * Stack arguments: filename, [mode = "r"], [codepage], [locale].
 * Returns one userdata wrapping the UFILE*, or nil plus an error message.
 */
static int icu_ufile_open(lua_State *L)
{
    const char *filename = luaL_checkstring(L, 1);
    const char *mode = luaL_optstring(L, 2, "r");
    UFILE *ufile = u_fopen(filename, mode,
                           luaL_optstring(L, 4, NULL),
                           luaL_optstring(L, 3, NULL));
    if (ufile == NULL) {
        lua_pushnil(L);
        lua_pushstring(L, "unable to initialize ufile");
        return 2;
    }
    /* Wrap the handle in a userdata carrying the shared UFILE metatable */
    *(UFILE **)lua_newuserdata(L, sizeof(UFILE *)) = ufile;
    lua_pushvalue(L, UFILE_UV_META);
    lua_setmetatable(L, -2);
    return 1;
}
/**
 * This function reads a file that contains a list of Elag grammar names,
 * and it compiles them into the file 'outname'. However, if the result
 * automaton is too big, it will be saved in several automata inside
 * the output file.
 * For each .lst entry, a precompiled .elg automaton is reused when it
 * exists; otherwise the corresponding .fst2 grammar is compiled first.
 * Successive rule automata are intersected together; whenever the result
 * exceeds MAX_GRAM_SIZE states it is minimized and flushed to a numbered
 * "<outname>-N.elg" file whose name is recorded in 'outname'.
 * Returns 0; most failures abort via fatal_error().
 */
int compile_elag_rules(char* rulesname,char* outname, const VersatileEncodingConfig* vec,language_t* language) { u_printf("Compilation of %s\n",rulesname); U_FILE* f=NULL; U_FILE* frules=u_fopen(ASCII,rulesname,U_READ); if (frules==NULL) { fatal_error("Cannot open file '%s'\n",rulesname); } U_FILE* out=u_fopen(ASCII,outname,U_WRITE); if (out==NULL) { fatal_error("cannot open file '%s'\n",outname); } /* Name of the file that contains the result automaton */ char fstoutname[FILENAME_MAX]; int nbRules=0; char buf[FILENAME_MAX]; time_t start_time=time(0); Fst2Automaton* res=NULL; Fst2Automaton* A; int fst_number=0; Ustring* ustr=new_Ustring(); char buf2[FILENAME_MAX]; char directory[FILENAME_MAX]; get_path(rulesname,directory); while (af_fgets(buf,FILENAME_MAX,frules->f)) { /* We read one by one the Elag grammar names in the .lst file */ chomp(buf); if (*buf=='\0') { /* If we have an empty line */ continue; } if (!is_absolute_path(buf)) { strcpy(buf2,buf); sprintf(buf,"%s%s",directory,buf2); } u_printf("\n%s...\n",buf); remove_extension(buf); strcat(buf,".elg"); if ((f=u_fopen(ASCII,buf,U_READ))==NULL) { /* If the .elg file doesn't exist, we create one */ remove_extension(buf); u_printf("Precompiling %s.fst2\n",buf); strcat(buf,".fst2"); elRule* rule=new_elRule(buf,vec,language); if (rule==NULL) { fatal_error("Unable to read grammar '%s'\n",buf); } if ((A=compile_elag_rule(rule,language))==NULL) { fatal_error("Unable to compile rule '%s'\n",buf); } free_elRule(rule); } else { /* If there is already .elg, we use it */ u_fclose(f); A=load_elag_grammar_automaton(vec,buf,language); if (A==NULL) { fatal_error("Unable to load '%s'\n",buf); } } if (A->automaton->number_of_states==0) { error("Grammar %s forbids everything!\n",buf); } if
(res!=NULL) { /* If there is already an automaton, we intersect it with the new one */ SingleGraph tmp=res->automaton; res->automaton=elag_intersection(language,tmp,A->automaton,GRAMMAR_GRAMMAR); free_SingleGraph(tmp,NULL); free_Fst2Automaton(A,NULL); trim(res->automaton,NULL); } else { res=A; } nbRules++; if (res->automaton->number_of_states>MAX_GRAM_SIZE) { /* If the automaton is too large, we will split the grammar * into several automata */ elag_minimize(res->automaton,1); sprintf(fstoutname,"%s-%d.elg",outname,fst_number++); u_fprintf(out,"<%s>\n",fstoutname); u_printf("Splitting big grammar in '%s' (%d states)\n",fstoutname,res->automaton->number_of_states); u_sprintf(ustr,"%s: compiled elag grammar",fstoutname); free(res->name); res->name=u_strdup(ustr->str); save_automaton(res,fstoutname,vec,FST_GRAMMAR); free_Fst2Automaton(res,NULL); res=NULL; } } if (res!=NULL) { /* We save the last automaton, if any */ sprintf(fstoutname,"%s-%d.elg",outname,fst_number++); u_fprintf(out,"<%s>\n",fstoutname); u_printf("Saving grammar in '%s'(%d states)\n",fstoutname,res->automaton->number_of_states); elag_minimize(res->automaton,1); u_sprintf(ustr,"%s: compiled elag grammar",fstoutname); free(res->name); res->name=u_strdup(ustr->str); save_automaton(res,fstoutname,vec,FST_GRAMMAR); free_Fst2Automaton(res,free_symbol); } time_t end_time=time(0); u_fclose(frules); u_fclose(out); free_Ustring(ustr); u_printf("\nDone.\nElapsed time: %.0f s\n",difftime(end_time,start_time)); u_printf("\n%d rule%s from %s compiled in %s (%d automat%s)\n", nbRules,(nbRules>1)?"s":"",rulesname,outname,fst_number, (fst_number>1)?"a":"on"); return 0; }
/**
 * Entry point of the Uncompress program: rebuilds a text .dic dictionary
 * from a compressed .bin/.inf pair.
 * Options: -o output name, -k/-q input/output encodings, -V verify only,
 * -h help. The single positional argument is the .bin file; the output
 * defaults to the same name with a .dic extension.
 * Returns SUCCESS_RETURN_CODE, USAGE_ERROR_CODE or DEFAULT_ERROR_CODE.
 */
int main_Uncompress(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; char output[FILENAME_MAX]=""; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Uncompress,lopts_Uncompress,&index))) { switch(val) { case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Uncompress[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ?
error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } if (output[0]=='\0') { remove_extension(argv[options.vars()->optind],output); strcat(output,".dic"); } U_FILE* f=u_fopen(&vec,output,U_WRITE); if (f==NULL) { error("Cannot open file %s\n",output); return DEFAULT_ERROR_CODE; } char inf_file[FILENAME_MAX]; remove_extension(argv[options.vars()->optind],inf_file); strcat(inf_file,".inf"); u_printf("Uncompressing %s...\n",argv[options.vars()->optind]); Dictionary* d=new_Dictionary(&vec,argv[options.vars()->optind],inf_file); if (d!=NULL) { rebuild_dictionary(d,f); } u_fclose(f); free_Dictionary(d); u_printf("Done.\n"); return SUCCESS_RETURN_CODE; }
/**
 * Entry point of the PolyLex program: analyses the unknown words of a text
 * as potential compounds, for Dutch, German, Norwegian or Russian.
 * Options: -D/-G/-N/-R language, -a alphabet, -d .bin dictionary,
 * -o output dictionary, -i info (debug) file, -k/-q encodings, -V verify.
 * The positional argument is the unknown-word list; recognized words are
 * appended to the output dictionary and the remaining ones replace the
 * input list. Returns SUCCESS_RETURN_CODE, USAGE_ERROR_CODE or
 * DEFAULT_ERROR_CODE.
 */
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int language=-1; char alphabet[FILENAME_MAX]=""; char name_bin[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty dictionary file name\n"); return USAGE_ERROR_CODE; } strcpy(name_bin,options.vars()->optarg); break; case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'i': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty information file name\n"); return USAGE_ERROR_CODE; } strcpy(info,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ?
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_PolyLex[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (name_bin[0]=='\0') { error("You must specify the .bin dictionary to use\n"); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output dictionary file name\n"); return USAGE_ERROR_CODE; } if (language==-1) { error("You must specify the language\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return USAGE_ERROR_CODE; } } char name_inf[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(name_bin,name_inf); strcat(name_inf,"ForbiddenWords.txt"); forbiddenWords=load_key_list(&vec,name_inf); if (forbiddenWords==NULL) { /* If there was no file, we don't want to block the process */ forbiddenWords=new_string_hash(DONT_USE_VALUES); } } strcpy(name_inf,name_bin); name_inf[strlen(name_bin)-3]='\0'; strcat(name_inf,"inf"); Dictionary* d=new_Dictionary(&vec,name_bin,name_inf); if (d==NULL) { error("Cannot load dictionary %s\n",name_bin); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[options.vars()->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[options.vars()->optind]); free_Dictionary(d);
free_string_hash(forbiddenWords); free_alphabet(alph); // here we return 0 in order to do not block the preprocessing // in the Unitex/GramLab IDE interface, if no dictionary was applied // so that there is no "err" file return SUCCESS_RETURN_CODE; } U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* res=u_fopen(&vec,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); u_fclose(new_unknown_words); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); u_fclose(words); return DEFAULT_ERROR_CODE; } U_FILE* debug=NULL; if ((*info)!='\0') { debug=u_fopen(&vec,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case GERMAN: analyse_german_compounds(alph, d, words, res, debug, new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph, d, words, res, debug, new_unknown_words, UTAG); break; } free_alphabet(alph); free_Dictionary(d); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[options.vars()->optind]); af_rename(tmp,argv[options.vars()->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } return SUCCESS_RETURN_CODE; }
/**
 * Entry point of the SortTxt program: sorts the lines of a text file in
 * place (via a ".new" temporary file that replaces the input on success).
 * Options: -n remove duplicates / -d keep them, -r reverse order,
 * -o char order file, -l line-count output file, -t Thai mode,
 * -f factorize inflectional codes, -k/-q encodings, -V verify only.
 * The positional argument is the file to sort. Returns SUCCESS_RETURN_CODE,
 * USAGE_ERROR_CODE, ALLOC_ERROR_CODE or DEFAULT_ERROR_CODE.
 */
int main_SortTxt(int argc, char* const argv[]) { if (argc == 1) { usage(); return SUCCESS_RETURN_CODE; } struct sort_infos* inf = new_sort_infos(); if(!inf) { return ALLOC_ERROR_CODE; } int mode = DEFAULT; char line_info[FILENAME_MAX] = ""; char sort_order[FILENAME_MAX] = ""; VersatileEncodingConfig vec = { DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT, DEFAULT_ENCODING_OUTPUT, DEFAULT_BOM_OUTPUT }; int val, index = -1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF != (val = options.parse_long(argc, argv, optstring_SortTxt, lopts_SortTxt, &index))) { switch (val) { case 'n': inf->REMOVE_DUPLICATES = 1; break; case 'd': inf->REMOVE_DUPLICATES = 0; break; case 'r': inf->REVERSE = -1; break; case 'o': if (options.vars()->optarg[0] == '\0') { error("You must specify a non empty sort order file name\n"); free_sort_infos(inf); return USAGE_ERROR_CODE; } strcpy(sort_order, options.vars()->optarg); break; case 'l': if (options.vars()->optarg[0] == '\0') { error("You must specify a non empty information file name\n"); free_sort_infos(inf); return USAGE_ERROR_CODE; } strcpy(line_info, options.vars()->optarg); break; case 't': mode = THAI; break; case 'f': inf->factorize_inflectional_codes = 1; break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free_sort_infos(inf); return SUCCESS_RETURN_CODE; case 'k': if (options.vars()->optarg[0] == '\0') { error("Empty input_encoding argument\n"); free_sort_infos(inf); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter( &(vec.mask_encoding_compatibility_input), options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0] == '\0') { error("Empty output_encoding argument\n"); free_sort_infos(inf); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output), &(vec.bom_output), options.vars()->optarg); break; case ':': index == -1 ?
error("Missing argument for option -%c\n", options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_SortTxt[index].name); free_sort_infos(inf); return USAGE_ERROR_CODE; case '?': index == -1 ? error("Invalid option -%c\n", options.vars()->optopt) : error("Invalid option --%s\n", options.vars()->optarg); free_sort_infos(inf); return USAGE_ERROR_CODE; } index = -1; } if (options.vars()->optind != argc - 1) { error("Invalid arguments: rerun with --help\n"); free_sort_infos(inf); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free_sort_infos(inf); return SUCCESS_RETURN_CODE; } if (sort_order[0] != '\0') { read_char_order(&vec, sort_order, inf); } char new_name[FILENAME_MAX]; strcpy(new_name, argv[options.vars()->optind]); strcat(new_name, ".new"); inf->f = u_fopen(&vec, argv[options.vars()->optind], U_READ); if (inf->f == NULL) { error("Cannot open file %s\n", argv[options.vars()->optind]); free_sort_infos(inf); return DEFAULT_ERROR_CODE; } inf->f_out = u_fopen(&vec, new_name, U_WRITE); if (inf->f_out == NULL) { error("Cannot open temporary file %s\n", new_name); u_fclose(inf->f); free_sort_infos(inf); return DEFAULT_ERROR_CODE; } switch (mode) { case DEFAULT: sort(inf); break; case THAI: sort_thai(inf); break; } if (line_info[0] != '\0') { U_FILE* F = u_fopen(&vec, line_info, U_WRITE); if (F == NULL) { error("Cannot write %s\n", line_info); } else { u_fprintf(F, "%d\n", inf->resulting_line_number); u_fclose(F); } } u_fclose(inf->f_out); u_fclose(inf->f); af_remove(argv[options.vars()->optind]); af_rename(new_name, argv[options.vars()->optind]); free_sort_infos(inf); u_printf("Done.\n"); return SUCCESS_RETURN_CODE; }