/*
 * Searches s for the last occurrence of the substring sub.
 *
 * length and subLength are counts of UChar units; -1 means the
 * corresponding string is NUL-terminated and is measured with u_strlen().
 *
 * Returns:
 *   - s itself when sub is NULL, subLength < -1, or sub is empty;
 *   - NULL when s is NULL, length < -1, or no acceptable match exists;
 *   - otherwise a pointer to the first unit of the last match for which
 *     isMatchAtCPBoundary() reports success.
 *
 * No special code path exists for NUL-terminated input: both lengths are
 * obtained up front and the text is scanned backward from its end.
 */
U_CAPI UChar* U_EXPORT2
u_strFindLast(const UChar* s, int32_t length, const UChar* sub, int32_t subLength) {
    const UChar* textStart;
    const UChar* scan;
    const UChar* earliestEnd;
    const UChar* subLast;
    const UChar* textPos;
    const UChar* subPos;
    UChar lastUnit;
    int32_t prefixLength;

    if (sub == NULL || subLength < -1) {
        return (UChar*) s;
    }
    if (s == NULL || length < -1) {
        return NULL;
    }

    if (subLength < 0) {
        subLength = u_strlen(sub);
    }
    if (subLength == 0) {
        return (UChar*) s;
    }

    /* Split sub into its final unit (searched for first) and the units before it. */
    prefixLength = subLength - 1;
    subLast = sub + prefixLength;
    lastUnit = *subLast;

    if (prefixLength == 0 && !U16_IS_SURROGATE(lastUnit)) {
        /* sub is one non-surrogate unit: delegate to the single-unit searches */
        if (length < 0) {
            return u_strrchr(s, lastUnit);
        }
        return u_memrchr(s, lastUnit, length);
    }

    if (length < 0) {
        length = u_strlen(s);
    }
    if (length <= prefixLength) {
        return NULL; /* s is shorter than sub */
    }

    textStart = s;
    /* The last unit of a match cannot lie before s + prefixLength. */
    earliestEnd = s + prefixLength;

    for (scan = s + length; scan != earliestEnd;) {
        --scan;
        if (*scan != lastUnit) {
            continue;
        }

        /* Last unit matches here; walk both strings backward over the rest. */
        textPos = scan;
        subPos = subLast;
        while (subPos != sub && textPos[-1] == subPos[-1]) {
            --textPos;
            --subPos;
        }

        /* All of sub matched: accept only if the match is not splitting a surrogate pair. */
        if (subPos == sub &&
                isMatchAtCPBoundary(textStart, textPos, scan + 1, textStart + length)) {
            return (UChar*) textPos;
        }
    }

    /* not found */
    return NULL;
}
/** * Computes training by extracting statistics from a tagged corpus file. */ void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){ /* these two hash tables are respectively for simple and compound entries */ struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL; if(rforms_file != NULL){ rforms_table = new_string_hash_ptr(200000); } if(iforms_file != NULL){ iforms_table = new_string_hash_ptr(200000); } /* we initialize a contextual matrix */ struct corpus_entry** context = new_context_matrix(); initialize_context_matrix(context); unichar line[MAX_TAGGED_CORPUS_LINE]; /* check the format of the corpus */ long previous_file_position = ftell(input_text); if(u_fgets(line,input_text) == EOF){ fatal_error("File is empty"); } fseek(input_text,previous_file_position,SEEK_SET); int format_corpus = check_corpus_entry(line); if(format_corpus == 0){ // the corpus is in the Tagger format, one word per line where line=word/tag while(u_fgets(line,input_text) !=EOF){ if(u_strlen(line) == 0){ initialize_context_matrix(context); } else{ corpus_entry* entry = new_corpus_entry(line); if(u_strchr(line,'_')!=NULL && line[0]!='_'){ corpus_entry** entries = extract_simple_words(entry); free_corpus_entry(entry); for(int i=0;entries[i]!=NULL;i++){ push_corpus_entry(entries[i],context); add_statistics(context,rforms_table,iforms_table); } free(entries); } else { push_corpus_entry(entry,context); add_statistics(context,rforms_table,iforms_table); } } } } else { // the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag} unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE)); int current_len,len; unsigned int i; while(u_fgets(line,input_text) != EOF){ current_len = 0, len = 0; /* extract each token of the sentence */ for (;;) { len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}')); tmp = u_strcpy_sized(s,len-1,line+current_len+1); u_strcat(tmp,"\0"); if(u_strcmp(s,"S") == 0) 
break; //particular case: '\},\}.PONCT' if(line[current_len+2] == '}'){ int start = current_len+3; do{ tmp = u_strchr(line+start,'}'); start += 1+u_strlen(line+start)-u_strlen(tmp); } while(*(tmp+1) != ' '); tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1); u_strcat(tmp,"\0"); len += start-current_len-3; } /* format the {XX.YY} into standard tagger format, XX/YY */ unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096)); if(u_strchr(s,',')[1] == ','){ u_strcpy(newline,","); } else u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s); u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1); for(i=0;i<u_strlen(newline);i++){ if(newline[i] == ' ') newline[i] = '_'; } //create corpus entry corpus_entry* entry = new_corpus_entry(newline); if(u_strchr(newline,'_') != NULL && newline[0] != '_'){ corpus_entry** entries = extract_simple_words(entry); free_corpus_entry(entry); for(int j=0;entries[j]!=NULL;j++){ push_corpus_entry(entries[j],context); add_statistics(context,rforms_table,iforms_table); } free(entries); } else { push_corpus_entry(entry,context); add_statistics(context,rforms_table,iforms_table); } free(newline); current_len += len+1; } initialize_context_matrix(context); } free(s); } free_context_matrix(context); /* we fill dictionary files with pairs (tuple,value) and then * we add a special line "CODE\tFEATURES,.value" in order to * specify whether the dictionary contains inflected or raw form tuples*/ unichar* str = u_strdup(""); if(rforms_table != NULL){ write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file); u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0); free_string_hash_ptr(rforms_table,NULL); } if(iforms_table != NULL){ write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file); u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1); free_string_hash_ptr(iforms_table,NULL); } free(str); }
unichar_t *GIOguessMimeType(const unichar_t *path,int isdir) { unichar_t *pt; if ( isdir ) return( dir ); path = u_GFileNameTail(path); pt = u_strrchr(path,'.'); if ( pt==NULL ) { if ( uc_strmatch(path,"makefile")==0 || uc_strmatch(path,"makefile~")==0 ) return( textmake ); else if ( uc_strmatch(path,"core")==0 ) return( core ); } else if ( uc_strmatch(pt,".text")==0 || uc_strmatch(pt,".txt")==0 || uc_strmatch(pt,".text~")==0 || uc_strmatch(pt,".txt~")==0 ) return( textplain ); else if ( uc_strmatch(pt,".c")==0 || uc_strmatch(pt,".h")==0 || uc_strmatch(pt,".c~")==0 || uc_strmatch(pt,".h~")==0 ) return( textc ); else if ( uc_strmatch(pt,".java")==0 || uc_strmatch(pt,".java~")==0 ) return( textjava ); else if ( uc_strmatch(pt,".css")==0 || uc_strmatch(pt,".css~")==0 ) return( textcss ); else if ( uc_strmatch(pt,".html")==0 || uc_strmatch(pt,".htm")==0 || uc_strmatch(pt,".html~")==0 || uc_strmatch(pt,".htm~")==0 ) return( texthtml ); else if ( uc_strmatch(pt,".xml")==0 || uc_strmatch(pt,".xml~")==0 ) return( textxml ); else if ( uc_strmatch(pt,".pfa")==0 || uc_strmatch(pt,".pfb")==0 || uc_strmatch(pt,".pt3")==0 || uc_strmatch(pt,".cff")==0 ) return( textpsfont ); else if ( uc_strmatch(pt,".sfd")==0 ) return( sfdfont ); else if ( uc_strmatch(pt,".ttf")==0 ) return( fontttf ); else if ( uc_strmatch(pt,".otf")==0 || uc_strmatch(pt,".otb")==0 || uc_strmatch(pt,".gai")==0 ) return( fontotf ); else if ( uc_strmatch(pt,".cid")==0 ) return( fontcid ); else if ( uc_strmatch(pt,".ps")==0 || uc_strmatch(pt,".eps")==0 ) return( textps ); else if ( uc_strmatch(pt,".bdf")==0 ) return( textbdffont ); else if ( uc_strmatch(pt,".pdf")==0 ) return( pdf ); else if ( uc_strmatch(pt,".gif")==0 ) return( imagegif ); else if ( uc_strmatch(pt,".png")==0 ) return( imagepng ); else if ( uc_strmatch(pt,".svg")==0 ) return( imagesvg ); else if ( uc_strmatch(pt,".jpeg")==0 || uc_strmatch(pt,".jpg")==0 ) return( imagejpeg ); else if ( uc_strmatch(pt,".mov")==0 || uc_strmatch(pt,".movie")==0 ) 
return( videoquick ); else if ( uc_strmatch(pt,".wav")==0 ) return( audiowav ); else if ( uc_strmatch(pt,".o")==0 || uc_strmatch(pt,".obj")==0 ) return( object ); else if ( uc_strmatch(pt,".bin")==0 ) return( macbin ); else if ( uc_strmatch(pt,".hqx")==0 ) return( machqx ); else if ( uc_strmatch(pt,".dfont")==0 ) return( macdfont ); else if ( uc_strmatch(pt,".gz")==0 || uc_strmatch(pt,".tgz")==0 || uc_strmatch(pt,".Z")==0 || uc_strmatch(pt,".zip")==0 || uc_strmatch(pt,".bz2")==0 || uc_strmatch(pt,".tbz")==0 || uc_strmatch(pt,".rpm")==0 ) return( compressed ); else if ( uc_strmatch(pt,".tar")==0 ) return( tar ); else if ( uc_strmatch(pt,".pcf")==0 ) return( fontpcf ); else if ( uc_strmatch(pt,".snf")==0 ) return( fontsnf ); return( unknown ); }
/**
 * Inspects one corpus line (the caller passes the first line of the corpus)
 * to decide which format the corpus uses.
 *
 * Returns 1 when u_strrchr() reports no '/' in the line (result -1) or when
 * the line starts with '{'; returns 0 otherwise.
 */
int check_corpus_entry(const unichar* line){
    const int has_no_slash = (u_strrchr(line,'/') == -1);
    if(has_no_slash){
        return 1;
    }
    return (line[0] == '{') ? 1 : 0;
}