Beispiel #1
0
/**
 * Finds the last occurrence of the substring sub in the string s.
 *
 * A length of -1 (for either string) means "NUL-terminated"; the length is
 * then obtained with u_strlen().
 *
 * @param s         string to search; may be NULL
 * @param length    number of UChars in s, or -1
 * @param sub       substring to look for; may be NULL
 * @param subLength number of UChars in sub, or -1
 * @return s itself when sub is NULL, subLength is below -1, or sub is empty;
 *         NULL when s is NULL, length is below -1, or no match is found;
 *         otherwise a pointer to the start of the last match that
 *         isMatchAtCPBoundary() accepts.
 */
U_CAPI UChar* U_EXPORT2
u_strFindLast(const UChar* s, int32_t length,
              const UChar* sub, int32_t subLength) {
    const UChar *textStart, *textLimit, *stopAt, *scan;
    const UChar *subLast, *tp, *sp;
    UChar lastUnit;
    int32_t leadLength; /* number of UChars in sub that precede its last one */

    if (sub == NULL || subLength < -1) {
        return (UChar*) s;
    }
    if (s == NULL || length < -1) {
        return NULL;
    }

    /*
     * No dedicated code path for NUL-terminated input: both lengths are
     * resolved up front and the text is scanned backward from its end.
     */
    if (subLength < 0) {
        subLength = u_strlen(sub);
    }
    if (subLength == 0) {
        return (UChar*) s;
    }

    /* the scan looks for the final UChar of sub, then verifies what precedes it */
    subLast = sub + (subLength - 1);
    lastUnit = *subLast;
    leadLength = subLength - 1;

    if (leadLength == 0 && !U16_IS_SURROGATE(lastUnit)) {
        /* sub is one non-surrogate code unit: a plain reverse character search suffices */
        if (length < 0) {
            return u_strrchr(s, lastUnit);
        }
        return u_memrchr(s, lastUnit, length);
    }

    if (length < 0) {
        length = u_strlen(s);
    }

    /* leadLength is subLength-1, so this rejects every s shorter than sub */
    if (length <= leadLength) {
        return NULL;
    }

    textStart = s;
    textLimit = s + length;

    /* a match cannot end before s+leadLength, so the scan stops there */
    stopAt = s + leadLength;

    for (scan = textLimit; scan != stopAt;) {
        --scan;
        if (*scan != lastUnit) {
            continue;
        }

        /* last UChar of sub found at scan: compare the preceding UChars backward */
        tp = scan;
        sp = subLast;
        while (sp != sub && *(tp - 1) == *(sp - 1)) {
            --tp;
            --sp;
        }

        /*
         * sp reaching sub means every UChar matched; the candidate is still
         * rejected when the boundary check fails (a surrogate pair would be split).
         */
        if (sp == sub && isMatchAtCPBoundary(textStart, tp, scan + 1, textLimit)) {
            return (UChar*) tp;
        }
    }

    /* not found */
    return NULL;
}
/**
 * Computes training by extracting statistics from a tagged corpus file.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
	rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
	iforms_table = new_string_hash_ptr(200000);
}


/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);


unichar line[MAX_TAGGED_CORPUS_LINE];

/* check the format of the corpus */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
	fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);

int format_corpus = check_corpus_entry(line);

if(format_corpus == 0){
	// the corpus is in the Tagger format, one word per line where line=word/tag
	while(u_fgets(line,input_text) !=EOF){
		if(u_strlen(line) == 0){
			initialize_context_matrix(context);
		}
		else{
			corpus_entry* entry = new_corpus_entry(line);
			if(u_strchr(line,'_')!=NULL && line[0]!='_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int i=0;entries[i]!=NULL;i++){
					push_corpus_entry(entries[i],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}
		}
	}
}
else {
	// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
	unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
	int current_len,len;
	unsigned int i;
	while(u_fgets(line,input_text) != EOF){
		current_len = 0, len = 0;
		/* extract each token of the sentence */
		for (;;) {
			len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
			tmp = u_strcpy_sized(s,len-1,line+current_len+1);
			u_strcat(tmp,"\0");
			if(u_strcmp(s,"S") == 0)
				break;

			//particular case: '\},\}.PONCT'
			if(line[current_len+2] == '}'){
				int start = current_len+3;
				do{
					tmp = u_strchr(line+start,'}');
					start += 1+u_strlen(line+start)-u_strlen(tmp);
				}
				while(*(tmp+1) != ' ');
				tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
				u_strcat(tmp,"\0");
				len += start-current_len-3;
			}

			/* format the {XX.YY} into standard tagger format, XX/YY */
			unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
			if(u_strchr(s,',')[1] == ','){
				u_strcpy(newline,",");
			}
			else
				u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
			u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
			for(i=0;i<u_strlen(newline);i++){
				if(newline[i] == ' ')
					newline[i] = '_';
			}

			//create corpus entry
			corpus_entry* entry = new_corpus_entry(newline);
			if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int j=0;entries[j]!=NULL;j++){
					push_corpus_entry(entries[j],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}

			free(newline);
			current_len += len+1;
		}
		initialize_context_matrix(context);
	}
	free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
	write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
	u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
	free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
	write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
	u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
	free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
Beispiel #3
0
unichar_t *GIOguessMimeType(const unichar_t *path,int isdir) {
    unichar_t *pt;

    if ( isdir )
return( dir );
    path = u_GFileNameTail(path);
    pt = u_strrchr(path,'.');

    if ( pt==NULL ) {
	if ( uc_strmatch(path,"makefile")==0 || uc_strmatch(path,"makefile~")==0 )
return( textmake );
	else if ( uc_strmatch(path,"core")==0 )
return( core );
    } else if ( uc_strmatch(pt,".text")==0 || uc_strmatch(pt,".txt")==0 ||
	    uc_strmatch(pt,".text~")==0 || uc_strmatch(pt,".txt~")==0 )
return( textplain );
    else if ( uc_strmatch(pt,".c")==0 || uc_strmatch(pt,".h")==0 ||
	    uc_strmatch(pt,".c~")==0 || uc_strmatch(pt,".h~")==0 )
return( textc );
    else if ( uc_strmatch(pt,".java")==0 || uc_strmatch(pt,".java~")==0 )
return( textjava );
    else if ( uc_strmatch(pt,".css")==0 || uc_strmatch(pt,".css~")==0 )
return( textcss );
    else if ( uc_strmatch(pt,".html")==0 || uc_strmatch(pt,".htm")==0 ||
	    uc_strmatch(pt,".html~")==0 || uc_strmatch(pt,".htm~")==0 )
return( texthtml );
    else if ( uc_strmatch(pt,".xml")==0 || uc_strmatch(pt,".xml~")==0 )
return( textxml );
    else if ( uc_strmatch(pt,".pfa")==0 || uc_strmatch(pt,".pfb")==0 ||
	    uc_strmatch(pt,".pt3")==0 || uc_strmatch(pt,".cff")==0 )
return( textpsfont );
    else if ( uc_strmatch(pt,".sfd")==0 )
return( sfdfont );
    else if ( uc_strmatch(pt,".ttf")==0 )
return( fontttf );
    else if ( uc_strmatch(pt,".otf")==0 || uc_strmatch(pt,".otb")==0 ||
	    uc_strmatch(pt,".gai")==0 )
return( fontotf );
    else if ( uc_strmatch(pt,".cid")==0 )
return( fontcid );
    else if ( uc_strmatch(pt,".ps")==0 || uc_strmatch(pt,".eps")==0 )
return( textps );
    else if ( uc_strmatch(pt,".bdf")==0 )
return( textbdffont );
    else if ( uc_strmatch(pt,".pdf")==0 )
return( pdf );
    else if ( uc_strmatch(pt,".gif")==0 )
return( imagegif );
    else if ( uc_strmatch(pt,".png")==0 )
return( imagepng );
    else if ( uc_strmatch(pt,".svg")==0 )
return( imagesvg );
    else if ( uc_strmatch(pt,".jpeg")==0 || uc_strmatch(pt,".jpg")==0 )
return( imagejpeg );
    else if ( uc_strmatch(pt,".mov")==0 || uc_strmatch(pt,".movie")==0 )
return( videoquick );
    else if ( uc_strmatch(pt,".wav")==0 )
return( audiowav );
    else if ( uc_strmatch(pt,".o")==0 || uc_strmatch(pt,".obj")==0 )
return( object );
    else if ( uc_strmatch(pt,".bin")==0 )
return( macbin );
    else if ( uc_strmatch(pt,".hqx")==0 )
return( machqx );
    else if ( uc_strmatch(pt,".dfont")==0 )
return( macdfont );
    else if ( uc_strmatch(pt,".gz")==0 || uc_strmatch(pt,".tgz")==0 ||
	    uc_strmatch(pt,".Z")==0 || uc_strmatch(pt,".zip")==0 ||
	    uc_strmatch(pt,".bz2")==0 || uc_strmatch(pt,".tbz")==0 ||
	    uc_strmatch(pt,".rpm")==0 )
return( compressed );
    else if ( uc_strmatch(pt,".tar")==0 )
return( tar );
    else if ( uc_strmatch(pt,".pcf")==0 )
return( fontpcf );
    else if ( uc_strmatch(pt,".snf")==0 )
return( fontsnf );

return( unknown );
}
/**
 * Looks at the first line of the corpus to decide which format the whole
 * corpus is in.
 *
 * @param line first line of the corpus
 * @return 1 when u_strrchr() reports no '/' in the line (result -1) or when
 *         the line starts with '{'; 0 otherwise
 */
int check_corpus_entry(const unichar* line){
const int has_no_slash = (u_strrchr(line,'/') == -1);
const int opens_with_brace = (line[0] == '{');
return (has_no_slash || opens_with_brace) ? 1 : 0;
}