Пример #1
0
/**
 * Serializes fieldc field names into buf as a comma-separated UTF-8 list.
 *
 * Each field is converted from UTF-8 to UTF-16; when usequotes is set, a
 * field containing any non-ASCII-digit character is wrapped in single
 * quotes. The assembled UTF-16 buffer is then converted back to UTF-8
 * into buf.
 *
 * Returns 0 on success, 1 if fieldc is 0, a negative value on allocation
 * failure and a positive value (>1) on an ICU conversion error. On any
 * error, buf may contain a partial result.
 */
int cq_fields_to_utf8(char *buf, size_t buflen, size_t fieldc,
        char **fieldnames, bool usequotes)
{
    UChar *buf16;
    UErrorCode status = U_ZERO_ERROR;
    size_t num_left = fieldc;
    int rc = 0;

    if (num_left == 0)
        return 1;

    buf16 = calloc(buflen, sizeof(UChar));
    if (buf16 == NULL)
        return -1;

    for (size_t i = 0; i < fieldc; ++i) {
        UChar *temp = calloc(buflen, sizeof(UChar));
        if (temp == NULL) {
            rc = -2;
            break;
        }

        u_strFromUTF8(temp, buflen, NULL, fieldnames[i], strlen(fieldnames[i]),
                &status);
        if (!U_SUCCESS(status)) {
            rc = 2;
            free(temp);
            break;
        }

        bool isstr = false;
        if (usequotes) {
            for (int32_t j = 0; j < u_strlen(temp); ++j) {
                /* Test for ASCII digits explicitly: passing a UChar whose
                 * value is outside unsigned char range to isdigit() is
                 * undefined behavior (CERT STR37-C). */
                if (temp[j] < 0x30 || temp[j] > 0x39) {
                    isstr = true;
                    break;
                }
            }
        }

        if (isstr) u_strcat(buf16, u"'");
        u_strcat(buf16, temp);
        if (isstr) u_strcat(buf16, u"'");
        free(temp);
        if (--num_left > 0) {
            u_strcat(buf16, u",");
        }
    }

    u_strToUTF8(buf, buflen, NULL, buf16, u_strlen(buf16), &status);
    if (!U_SUCCESS(status))
        rc = 3;

    free(buf16);
    return rc;
}
//
// this function explores a sub-graph, considering tokens as strings
//
// Recursively walks state 'n' of 'automate'. The outputs accumulated so
// far ('output') are inserted into the normalization tree rooted at
// 'noeud_normalization' whenever a final state is reached. Intermediate
// results for sub-graph calls are collected in *TEMP_LIST (caller-owned).
//
void explorer_sub_automate_normalization_string(Fst2* automate,int n,
                                     struct normalization_tree* noeud_normalization,
                                     unichar* output,struct norm_info** TEMP_LIST) {
Fst2State etat;
etat=automate->states[n];
if (is_final_state(etat)) {
   // if we are in a final state
   (*TEMP_LIST)=insert_in_norm_info_list(output,noeud_normalization,(*TEMP_LIST));
}
Transition* trans;
trans=etat->transitions;
// NOTE(review): fixed-size buffer; an accumulated output longer than
// 1000 unichars would overflow tmp -- presumably bounded by the grammars,
// to confirm
unichar tmp[1000];
while (trans!=NULL) {
   if (trans->tag_number<0) {
      // case of a sub-graph: a negative tag number encodes the sub-graph index
      struct norm_info* TMP=NULL;
      explorer_sub_automate_normalization_string(automate,automate->initial_states[-(trans->tag_number)],noeud_normalization,
                                        output,&TMP);
      while (TMP!=NULL) {
         // we continue to explore the current automaton from every result
         // produced by the sub-graph, then free the result node
         explorer_sub_automate_normalization_string(automate,trans->state_number,TMP->node,
                                        TMP->output,TEMP_LIST);
         struct norm_info* z=TMP;
         TMP=TMP->next;
         free_norm_info(z);
      }
   }
   else {
      // normal transition
      Fst2Tag etiq;
      etiq=automate->tags[trans->tag_number];
      // build the new output: previous output + space + tag output (if any)
      u_strcpy(tmp,output);
      u_strcat(tmp," ");
      if (etiq->output!=NULL && u_strcmp(etiq->output,"")
          && u_strcmp(etiq->output,"<E>") && !only_spaces(etiq->output)) {
         // we append the output if it exists and is not epsilon
         u_strcat(tmp,etiq->output);
      }
      struct normalization_tree_transition* trans_norm;
      trans_norm=get_trans_arbre_normalization_string(etiq->input,noeud_normalization->trans);
      if (trans_norm==NULL) {
         // if the transition does not exist in the tree, we create it
         trans_norm=new_trans_arbre_normalization_string(etiq->input);
         // we also create the destination node
         trans_norm->node=new_normalization_tree();
         // and insert the new transition at the head of the node's list
         trans_norm->next=noeud_normalization->trans;
         noeud_normalization->trans=trans_norm;
      }
      explorer_sub_automate_normalization_string(automate,trans->state_number,trans_norm->node,
                                        tmp,TEMP_LIST);
   }
   trans=trans->next;
}
}
Пример #3
0
/**
 * Allocates, initializes and returns a new corpus_entry structure.
 *
 * 'line' must contain a '/' separator ("word/codes" form, e.g.
 * "word/POS:infl"). entry->word receives the part before the last '/',
 * entry->pos_code the part between the last '/' and the last ':' (or
 * everything after '/' when there is no ':'), and entry->overall_codes
 * everything after the last '/'. An empty code part yields "UNK" for
 * both code fields. Exits via fatal_error()/fatal_alloc_error() on bad
 * input or allocation failure. The returned entry is owned by the caller.
 */
struct corpus_entry* new_corpus_entry(const unichar* line){
struct corpus_entry* entry = (corpus_entry*)malloc(sizeof(corpus_entry));
if(entry == NULL){
	fatal_alloc_error("compute_corpus_entry");
}
/* we fill corpus entry with information extracted from the corpus line*/
/* NOTE(review): u_strrchr appears to return an index (or -1 when absent),
 * not a pointer -- confirm against its declaration */
int pos = u_strrchr(line,'/');
if(pos == -1){
	fatal_error("Wrong format for line %S\n",line);
}
entry->word = (unichar*)malloc(sizeof(unichar)*(pos+1));
if(entry->word == NULL){
	fatal_alloc_error("compute_corpus_entry");
}
unichar* tmp = u_strcpy_sized(entry->word,pos+1,line);
/* concatenating "\0" is effectively a no-op (the literal reads as an
 * empty string); kept as-is */
u_strcat(tmp,"\0");

int code_pos = u_strrchr(line,':');
/* there are no morphological codes associated to this entry */
if(code_pos == -1){
	entry->pos_code = (unichar*)malloc(sizeof(unichar)*(u_strlen(line)-pos));
	if(entry->pos_code == NULL){
		fatal_alloc_error("new_corpus_entry");
	}
	u_strcpy(entry->pos_code,&line[pos+1]);
	entry->overall_codes = u_strdup(entry->pos_code);
}
else{
	entry->pos_code = (unichar*)malloc(sizeof(unichar)*(code_pos-pos));
	if(entry->pos_code == NULL){
		fatal_alloc_error("new_corpus_entry");
	}
	entry->overall_codes = (unichar*)malloc(sizeof(unichar)*(u_strlen(line)-pos));
	if(entry->overall_codes == NULL){
		fatal_alloc_error("new_corpus_entry");
	}
	unichar* tmp2 = u_strcpy_sized(entry->pos_code,code_pos-pos,&line[pos+1]);
	u_strcat(tmp2,"\0");
	u_strcpy(entry->overall_codes,&line[pos+1]);
}
/* if the token is not annotated in the corpus, we put "UNK" */
if(u_strlen(entry->pos_code) == 0){
	free(entry->pos_code);
	free(entry->overall_codes);
	entry->pos_code = u_strdup("UNK");
	entry->overall_codes = u_strdup("UNK");
}
return entry;
}
Пример #4
0
/* Demonstrates the ANSI-C-style ICU string functions on UTF-16 strings. */
static void demo_C_Unicode_strings() {
    printf("\n* demo_C_Unicode_strings() --------- ***\n\n");

    static const UChar text[]={ 0x41, 0x42, 0x43, 0 };          /* "ABC" */
    static const UChar appendText[]={ 0x61, 0x62, 0x63, 0 };    /* "abc" */
    static const UChar cmpText[]={ 0x61, 0x53, 0x73, 0x43, 0 }; /* "aSsC" */
    UChar work[32];
    int32_t cmpResult;
    int32_t len=u_strlen(text); /* 3 characters in "ABC" */

    /* Simple C-style building: start empty, take 'A', append "abc". */
    work[0]=0;                    /* empty, NUL-terminated string */
    u_strncat(work, text, 1);     /* copy just the first character: 'A' */
    u_strcat(work, appendText);   /* now work=="Aabc" */
    len=u_strlen(work);           /* 4 characters */
    printUString("should be \"Aabc\": ", work, -1);

    /* Bitwise (code-unit) comparison of work against text. */
    cmpResult=u_strcmp(work, text);
    if(cmpResult<=0) {
        printf("String comparison error, expected \"Aabc\" > \"ABC\"\n");
    }

    /* Build "A<sharp s>C" in the buffer... */
    u_strcpy(work, text);
    work[1]=0xdf; /* sharp s, case-compares equal to "ss" */
    printUString("should be \"A<sharp s>C\": ", work, -1);

    /* Case-insensitive comparison using full case folding. */
    cmpResult=u_strcasecmp(work, cmpText, U_FOLD_CASE_DEFAULT);
    if(cmpResult!=0) {
        printf("String case insensitive comparison error, expected \"AbC\" to be equal to \"ABC\"\n");
    }
}
Пример #5
0
/**
 * Reads the whole content of 'f' into a freshly allocated unichar string.
 *
 * The file is read in chunks of READ_FILE_BUFFER_SIZE characters; the
 * result buffer is grown with realloc() after each chunk. The returned
 * string is NUL-terminated and owned by the caller. Allocation failures
 * abort via fatal_alloc_error().
 *
 * NOTE(review): the loop stops when u_fread returns fewer characters than
 * requested, and the accumulated length uses u_strlen(buffer) rather than
 * the returned count -- this assumes chunks contain no embedded '\0';
 * confirm u_fread semantics.
 */
unichar* read_file(U_FILE *f){

	unichar *text = NULL;

	text = (unichar *)malloc(sizeof(unichar));
	if(text==NULL){
		fatal_alloc_error("malloc");
	}
	text[0]='\0';

	int total_read = 0;
	int read;
	do {
		unichar buffer[READ_FILE_BUFFER_SIZE+1];
		/* zero-fill so the chunk is always NUL-terminated for u_strlen/u_strcat */
		memset(buffer,0,sizeof(unichar)*(READ_FILE_BUFFER_SIZE+1));

		int ok=1;

		read = u_fread(buffer,READ_FILE_BUFFER_SIZE,f,&ok);

		total_read += u_strlen(buffer);
		/* direct realloc-into-self is safe here: on failure we abort at once */
		text = (unichar *)realloc(text,sizeof(unichar)*(total_read+1));
		if(text==NULL){
				fatal_alloc_error("realloc");
		}
		u_strcat(text,buffer);

	} while (read == READ_FILE_BUFFER_SIZE);

	text[total_read]='\0';

	return text;
}
Пример #6
0
/**
 * Allocates and returns a corpus_entry for a simple word: copies 'word',
 * copies the POS and overall codes from 'entry', and appends "+I"
 * (start == 0) or "+B" (start != 0) to both code strings.
 *
 * Exits via fatal_alloc_error() on allocation failure, consistently with
 * new_corpus_entry(). The returned entry is owned by the caller.
 */
corpus_entry* new_simple_word_entry(const unichar* word,corpus_entry* entry,int start){
	corpus_entry* wentry = (corpus_entry*)malloc(sizeof(corpus_entry));
	if(wentry == NULL){
		fatal_alloc_error("new_simple_word_entry");
	}
	wentry->word = u_strdup(word);
	/* +3 leaves room for the 2-character suffix plus the terminator */
	wentry->pos_code = (unichar*)malloc(sizeof(unichar)*(u_strlen(entry->pos_code)+3));
	if(wentry->pos_code == NULL){
		fatal_alloc_error("new_simple_word_entry");
	}
	wentry->overall_codes = (unichar*)malloc(sizeof(unichar)*(u_strlen(entry->overall_codes)+3));
	if(wentry->overall_codes == NULL){
		fatal_alloc_error("new_simple_word_entry");
	}
	unichar* tmp = u_strcpy_sized(wentry->pos_code,u_strlen(entry->pos_code)+1,entry->pos_code);
	unichar* tmp2 = u_strcpy_sized(wentry->overall_codes,u_strlen(entry->overall_codes)+1,entry->overall_codes);
	if(start == 0){
		u_strcat(tmp,"+I\0");
		u_strcat(tmp2,"+I\0");
	}
	else {
		u_strcat(tmp,"+B\0");
		u_strcat(tmp2,"+B\0");
	}
	return wentry;
}
Пример #7
0
//
// returns 1 if the INF code refers to a valid right component, 0 else
//
char check_valid_right_component_for_one_INF_code_german(const unichar* s) {
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,s);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
char res=check_N_not_FF(d);
free_dela_entry(d);
return res;
}
Пример #8
0
int check_is_valid_for_one_INF_code(const unichar* t, const unichar* s)
{
  unichar temp[MAX_DICT_LINE_LENGTH];
  u_strcpy(temp,"x,");
  u_strcat(temp,s);
  struct dela_entry* d = tokenize_DELAF_line(temp,0);
  int res = check_is_valid(t, d);
  free_dela_entry(d);
  return res;
}
Пример #9
0
/**
 * Returns 1 if the given INF code is a ":a" one.
 */
char check_a(unichar* INF_code) {
/* Build an artificial DELAF line "x,<INF code>" and tokenize it so that the
 * grammatical and inflectional codes become available in a structured way. */
unichar line[2000];
u_strcpy(line,"x,");
u_strcat(line,INF_code);
struct dela_entry* entry=tokenize_DELAF_line(line,0);
/* Delegate to the dela_entry overload of check_a */
char result=check_a(entry);
/* Free the artificial dictionary entry */
free_dela_entry(entry);
return result;
}
Пример #10
0
/**
 * Returns 1 if the INF code refers to a valid right component, 0 otherwise.
 * (The original comment said "left component"; the test below accepts N or
 * A entries -- the V case is currently disabled -- that are not "Nsie" ones.)
 */
char check_valid_right_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
char res=(check_N(d)||check_A(d)/*||check_V_but_not_Y(d)*/)&&(!check_Nsie(d));
/* We free the artifical dictionary entry */
free_dela_entry(d);
return res;
}
Пример #11
0
/**
 * Returns 1 if the INF code refers to a valid left component, 0 otherwise.
 */
char check_valid_left_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
/* Now, we can use this structured representation to check if the INF code
 * corresponds to a valid left component. */
char res=check_Nsia(d)||check_Nsie(d)||check_Nsig(d)||check_Asio(d)||check_Asie(d)||check_VW(d)||check_ADV(d);
/* Finally, we free the artificial dictionary entry */
free_dela_entry(d);
return res;
}
Пример #12
0
/**
 * Returns 1 if the line is a valid right "A" component.
 */
char check_A_right_component(unichar* s) {
/* Build the artificial DELAF line "x,<INF code>" and tokenize it to get a
 * structured view of the grammatical and inflectional codes. */
unichar line[2000];
u_strcpy(line,"x,");
u_strcat(line,s);
struct dela_entry* entry=tokenize_DELAF_line(line,0);
unichar gram[2];
u_strcpy(gram,"A");
unichar infl[4];
u_strcpy(infl,"sie");
/* The entry must carry the "A" grammatical code without the "sie"
 * inflectional code */
char ok=dic_entry_contain_gram_code(entry,gram) && !dic_entry_contain_inflectional_code(entry,infl);
/* Free the artificial dictionary entry */
free_dela_entry(entry);
return ok;
}
Пример #13
0
/* Returns the absolute form of 'name' in 'result' (at most rsiz chars).
 * Relative names are resolved against the cached working directory and
 * "." / ".." / "//" components are normalized away in place. */
unichar_t *u_GFileGetAbsoluteName(unichar_t *name, unichar_t *result, int rsiz) {
    /* result may be the same as name */
    unichar_t buffer[1000];

    if ( ! u_GFileIsAbsolute(name) ) {
	unichar_t *pt, *spt, *rpt, *bpt;

	/* lazily cache the current working directory in dirname_ */
	if ( dirname_[0]=='\0' ) {
	    getcwd(dirname_,sizeof(dirname_));
	}
	uc_strcpy(buffer,dirname_);
	if ( buffer[u_strlen(buffer)-1]!='/' )
	    uc_strcat(buffer,"/");
	u_strcat(buffer,name);
	_u_backslash_to_slash(buffer);

	/* Normalize out any .. */
	spt = rpt = buffer;
	while ( *spt!='\0' ) {
	    if ( *spt=='/' ) ++spt;
	    for ( pt = spt; *pt!='\0' && *pt!='/'; ++pt );
	    if ( pt==spt )	/* Found // in a path spec, reduce to / (we've*/
		u_strcpy(spt,pt); /*  skipped past the :// of the machine name) */
	    else if ( pt==spt+1 && spt[0]=='.' && *pt=='/' )	/* Noop */
		u_strcpy(spt,spt+2);
	    else if ( pt==spt+2 && spt[0]=='.' && spt[1]=='.' ) {
		/* ".." : back up over the previous component; rpt marks how
		 * far back ".." is allowed to eat */
		for ( bpt=spt-2 ; bpt>rpt && *bpt!='/'; --bpt );
		if ( bpt>=rpt && *bpt=='/' ) {
		    u_strcpy(bpt,pt);
		    spt = bpt;
		} else {
		    rpt = pt;
		    spt = pt;
		}
	    } else
		spt = pt;
	}
	name = buffer;
    }
    if (result!=name) {
	/* bounded copy with forced NUL termination */
	u_strncpy(result,name,rsiz);
	result[rsiz-1]='\0';
	_u_backslash_to_slash(result);
    }
return(result);
}
Пример #14
0
/**
 * Reflected addition for ustrings: builds obj2 + obj1 (obj2 is the left
 * operand of the original "+"). Returns a new ustring, or a type error
 * when obj2 is not a ustring.
 *
 * NOTE(review): EcMakeUString is given 'len' (the combined length) but
 * only obj2's data -- this assumes it allocates len characters without
 * reading len characters from the source; confirm against EcMakeUString.
 * Also note the sibling ustring_add passes tc_none to EcTypeError while
 * this one passes tc_string -- verify which is intended.
 */
static EC_OBJ ustring_radd( EC_OBJ obj1, EC_OBJ obj2 )
{
	/* ec_string str; */
	EC_OBJ    res;
	EcInt     len;

	if (! EC_USTRINGP(obj2))
		return EcTypeError( EC_NIL, EC_NIL, 2, tc_string, obj2, TRUE, "string radd" );

	EC_ASSERT( EC_USTRINGP(obj1) );
	EC_ASSERT( EC_USTRINGP(obj2) );

	len = EC_USTRLEN(obj1) + EC_USTRLEN(obj2);
	/* seed the result with obj2, then append obj1 */
	res = EcMakeUString( EC_USTRDATA(obj2), len , EcTrue );
	u_strcat(EC_USTRDATA(res), EC_USTRDATA(obj1) );
	EC_USTRLEN(res) = len;

	return res;
}
Пример #15
0
/**
 * Addition (concatenation) for ustrings: builds obj1 + obj2 and returns
 * a new ustring. Returns a type error when obj2 is not a ustring
 * (character operands are not supported yet).
 *
 * NOTE(review): EcMakeUString is given 'len' (the combined length) but
 * only obj1's data -- this assumes it allocates len characters without
 * reading len characters from the source; confirm against EcMakeUString.
 */
static EC_OBJ ustring_add( EC_OBJ obj1, EC_OBJ obj2 )
{
	/* ec_string str; */
	EC_OBJ    res;
	EcInt     len;

	/* add chars later */
	if (/* (! EC_CHARP(obj2)) && */(! EC_USTRINGP(obj2)))
		return EcTypeError( EC_NIL, EC_NIL, 2, tc_none, obj2, TRUE, "string add" );

	EC_ASSERT( EC_USTRINGP(obj1) );

	len = EC_USTRLEN(obj1) + EC_USTRLEN(obj2);
	/* seed the result with obj1, then append obj2 */
	res = EcMakeUString( EC_USTRDATA(obj1), len , EcTrue );
	u_strcat(EC_USTRDATA(res), EC_USTRDATA(obj2) );
	EC_USTRLEN(res) = len;

	return res;
}
Пример #16
0
/**
 * This function analyzes an INF code and returns a value that indicates
 * if it is a valid left component or not.
 */
int get_valid_left_component_type_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
int res;
/* Now we can test if the INF code corresponds to a valid left component */
if (check_Nsia(d)) res=N_SIA;
else if (check_Nsie(d)) res=N_SIE;
else if (check_Nsig(d)) res=N_SIG;
else if (check_Asio(d)) res=A_SIO;
else if (check_Asie(d)) res=A_SIE;
else if (check_VW(d)) res=V_W;
else if (check_ADV(d)) res=ADV;
else res=INVALID_LEFT_COMPONENT;
/* Finally we free the artifical dictionary entry */
free_dela_entry(d);
return res;
}
Пример #17
0
/**
 * Exercises uidna_compare via testCompareWithSrc: for every entry of
 * unicodeIn, builds "WWW.<label>.COM" variants (Unicode and ASCII forms)
 * and checks equivalence / non-equivalence as expected.
 */
static void
TestCompare(){
    int32_t i;

    const char* testName ="uidna_compare";
    CompareFunc func = uidna_compare;

    UChar www[] = {0x0057, 0x0057, 0x0057, 0x002E, 0x0000}; /* "WWW." */
    UChar com[] = {0x002E, 0x0043, 0x004F, 0x004D, 0x0000}; /* ".COM" */
    /* buf starts with "WWW."; the ASCII label is appended at offset 4 */
    UChar buf[MAX_DEST_SIZE]={0x0057, 0x0057, 0x0057, 0x002E, 0x0000};
    UChar source[MAX_DEST_SIZE]={0},
          uni0[MAX_DEST_SIZE]={0},
          uni1[MAX_DEST_SIZE]={0},
          ascii0[MAX_DEST_SIZE]={0},
          ascii1[MAX_DEST_SIZE]={0},
          temp[MAX_DEST_SIZE] ={0};


    /* uni0/uni1 = unicodeIn[0/1] + ".COM" */
    u_strcat(uni0,unicodeIn[0]);
    u_strcat(uni0,com);

    u_strcat(uni1,unicodeIn[1]);
    u_strcat(uni1,com);

    /* ascii0/ascii1 = asciiIn[0/1] + ".COM" */
    u_charsToUChars(asciiIn[0], temp, (int32_t)strlen(asciiIn[0]));
    u_strcat(ascii0,temp);
    u_strcat(ascii0,com);

    memset(temp, 0, U_SIZEOF_UCHAR * MAX_DEST_SIZE);

    u_charsToUChars(asciiIn[1], temp, (int32_t)strlen(asciiIn[1]));
    u_strcat(ascii1,temp);
    u_strcat(ascii1,com);

    /* prepend www. */
    u_strcat(source, www);

    for(i=0;i< (int32_t)(sizeof(unicodeIn)/sizeof(unicodeIn[0])); i++){
        UChar* src;
        int32_t srcLen;

        /* reset buf after the "WWW." prefix before appending a new label */
        memset(buf+4, 0, (MAX_DEST_SIZE-4) * U_SIZEOF_UCHAR);

        u_charsToUChars(asciiIn[i],buf+4, (int32_t)strlen(asciiIn[i]));
        u_strcat(buf,com);


        /* for every entry in unicodeIn array
           prepend www. and append .com*/
        source[4]=0;
        u_strncat(source,unicodeIn[i], u_strlen(unicodeIn[i]));
        u_strcat(source,com);

        /* a) compare it with itself*/
        src = source;
        srcLen = u_strlen(src);

        testCompareWithSrc(src,srcLen,src,srcLen,testName, func, TRUE);

        /* b) compare it with asciiIn equivalent */
        testCompareWithSrc(src,srcLen,buf,u_strlen(buf),testName, func,TRUE);

        /* c) compare it with unicodeIn not equivalent*/
        if(i==0){
            testCompareWithSrc(src,srcLen,uni1,u_strlen(uni1),testName, func,FALSE);
        }else{
            testCompareWithSrc(src,srcLen,uni0,u_strlen(uni0),testName, func,FALSE);
        }
        /* d) compare it with asciiIn not equivalent */
        if(i==0){
            testCompareWithSrc(src,srcLen,ascii1,u_strlen(ascii1),testName, func,FALSE);
        }else{
            testCompareWithSrc(src,srcLen,ascii0,u_strlen(ascii0),testName, func,FALSE);
        }

    }
}
Пример #18
0
/**
 * Explores the given dictionary to match the given word.
 *
 * Recursive spell-checking exploration: 'word' is matched from position
 * 'pos_word' against the dictionary automaton 'd' starting at 'offset',
 * while 'inflected' accumulates the dictionary-side characters actually
 * followed. The error operations (SP_SWAP, SP_CHANGE_XXX, SP_SUPPR,
 * SP_INSERT) are bounded by the counters in 'cfg'; each applied operation
 * pushes a (position,kind) pair on cfg->pairs, undone on backtrack.
 * Complete matches are collected into *list by deal_with_matches().
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
		Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
/* Snapshot of the output as it was when entering this call */
int z=save_output(output);
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
	if (word[pos_word]=='\0') {
		/* If we have a match */
		deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
	}
	base=output->len;
}
/* If we are at the end of the token, then we stop (nothing in this call
 * has modified 'output' yet, so there is nothing to restore) */
if (word[pos_word]=='\0') {
	return;
}
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
	restore_output(z,output);
	offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
	/* For backup_output, see comment below */
	int backup_output=save_output(output);
	if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
		/* Exact match (modulo case) of the current character */
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
	} else {
		/* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
		if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
				&& is_letter_swap(cfg,word,pos_word,inflected,c)) {
			/* We don't modify the number of errors since we override an existing
			 * SP_CHANGE_XXX one */
			cfg->current_SP_SWAP++;
			/* We override the previous change */
			int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
			int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
			cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
			u_strcat(inflected,c);
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
			cfg->current_SP_SWAP--;
		} else /* We deal with the SP_CHANGE case */
		       if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
				/* We want letters, not spaces or anything else */
				&& is_letter(c,NULL)
		        /* We do not allow the replacement of a lowercase letter by an uppercase
		         * letter at the beginning of the word like Niserable, unless the whole word
		         * is in uppercase or the letter is the same, module the case */
		        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
			cfg->current_errors++;
			cfg->current_SP_CHANGE++;
			/* Now we test all possible kinds of change */
			vector_int_add(cfg->pairs,pos_word);
			u_strcat(inflected,c);
			/* We always add the default case */
			vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
			int n_elem=cfg->pairs->nbelems;
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* Then we test the accent case */
			if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
				/* After a call to explore_dic, we must restore the output.
				 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
				 * output including the output associated to the current transition,
				 * which is why we don't use z (output before the current transition)
				 * but backup_output */
				restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
			    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
			    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And the case variations */
			if (u_tolower(c)==u_tolower(word[pos_word])) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And finally the position on keyboard */
			if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			cfg->pairs->nbelems=size_pairs;
			cfg->current_errors--;
			cfg->current_SP_CHANGE--;
			/* End of the SP_CHANGE case */
		}
	}
    restore_output(backup_output,output);
	truncate(inflected,l2);
	/* Now we deal with the SP_SUPPR case */
	if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
		/* We want letters, not spaces or anything else */
		&& is_letter(c,NULL)) {
		cfg->current_errors++;
		cfg->current_SP_SUPPR++;
		vector_int_add(cfg->pairs,pos_word);
		if (pos_word>=1 && c==word[pos_word-1]) {
			vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
		} else {
			vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
		}
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
		truncate(inflected,l2);
		cfg->pairs->nbelems=size_pairs;
		cfg->current_errors--;
		cfg->current_SP_SUPPR--;
	}
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
	/* We want letters, not spaces or anything else */
	&& is_letter(word[pos_word],NULL)
	/* We do not allow the insertion of a capital letter at the beginning of
	 * the word like Astreet, unless the whole word is in uppercase like ASTREET */
    && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
	cfg->current_errors++;
	cfg->current_SP_INSERT++;
	vector_int_add(cfg->pairs,pos_word);
	if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
		vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
	} else {
		vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
	}
	explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
	truncate(inflected,l2);
	cfg->pairs->nbelems=size_pairs;
	cfg->current_errors--;
	cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 * 
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 *
 * 'items' holds the tfst_match items of the current match and 's' is the
 * output built so far; 's' is restored to its entry length between the
 * alternatives explored here. When 'current_item' reaches items->nbelems,
 * the content of 's' becomes the output of 'element', which is saved into
 * infos->matches.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                  struct tfst_simple_match_list* element,
                                  vector_ptr* items,int current_item,Ustring* s,
                                  int last_text_dependent_tfst_tag,
                                  struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
if (item->debug_output!=NULL) {
	/* If we have a debug output, we deal it */
	u_strcat(s,item->debug_output);
	explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts);
	s->len=len;
	s->str[len]='\0';
	return;
}


unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;

unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
	/* If we have a capture variable $:X$, we must save the previous value
	 * for this dictionary variable */
	old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}

Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
	   /* We only have to process the output only once,
	    * since it will have the same effect on all tfst tags.
	    *
	    * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
	    * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
	   if (capture) {
		   /* If we have a capture variable, then we have to check whether the tfst tag
	   	    * is a tagged token or not */
	   	   int tfst_tag_number=text_tags->n;
	   	   int fst2_tag_number=item->fst2_transition->tag_number;
	   	   if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
	   		   goto restore_dic_variable;
	   	   }
	   } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* We do not take into account matches with variable errors if the
          * process_output_for_tfst_match function has decided that backtracking
          * was necessary, either because of a variable error of because of a
          * $a.SET$ or $a.UNSET$ test */
		  goto restore_dic_variable;
      }
   }
   int last_tag=last_text_dependent_tfst_tag;
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
          /* If we an output variable start $|a( */
          int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index);

		  Ustring* old_value = new_Ustring();
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  // now old_value contain the backup

          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);

		  // restore the good content from backup
		  swap_output_variable_content(infos->output_variables, var_index, old_value);
		  free_Ustring(old_value);

          goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
          /* If we an output variable end $|a) */
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our 
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* We add the address of the start field to our list */
         (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         /* After the exploration, there are 2 cases:
          * 1) *var_starts is NULL: nothing to do
          * 2) *var_starts is not NULL: we reached the end of the items without findind any
          *                             text dependent match, so we can free the list */
         free_list_pointer(*var_starts);
         (*var_starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag' */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the 
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because is not, we will have an 
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
         
         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) {
    	  fatal_error("problem $[\n");
      }
   } else {
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      /* If the current text tag is not a text independent one */
      
      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      int previous_start_token,previous_start_char; 
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just inscrease
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         //error("current item=%d\n",text_tags->n);
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* Here we have to insert the text that is between current_start and current_end,
       * and then, the ouput of the fst2 transition */
      if (infos->output_policy==MERGE_OUTPUTS) {
    	  insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                 current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
         ,&ptr2 /* We have encountered a text dependent tag, so there is no
                 * more pending start tag like $a( */
         );
   element->m=saved_element;
   /* If there was a $* tag pending */
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */ 
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
	/* If we have a capture variable $:X$, we must restore the previous value
	 * for this dictionary variable */
	set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}
Пример #20
0
/**
 * Builds a UTF-8 "field=value,field=value,..." assignment list (as used in an
 * SQL UPDATE statement) from the field names of 'list' and the values of
 * 'row', skipping the primary key column. Values that are not purely numeric
 * are wrapped in single quotes.
 *
 * buf/buflen: caller-supplied output buffer for the UTF-8 result.
 * Returns 0 on success, 1 if the list has no fields, a negative value on
 * allocation failure, and a value > 1 on an ICU conversion failure.
 */
int cq_dlist_to_update_utf8(char *buf, size_t buflen, struct dlist list,
        struct drow row)
{
    UChar *buf16;
    UErrorCode status = U_ZERO_ERROR;
    size_t num_left = list.fieldc;
    int rc = 0;

    if (num_left == 0)
        return 1;

    /* Intermediate UTF-16 buffer; ICU string APIs operate on UChar. */
    buf16 = calloc(buflen, sizeof(UChar));
    if (buf16 == NULL)
        return -2;

    for (size_t i = 0; i < list.fieldc; ++i) {
        /* The primary key is never part of the SET list. */
        if (!strcmp(list.fieldnames[i], list.primkey)) {
            --num_left;
            continue;
        }

        UChar *ftemp = calloc(buflen, sizeof(UChar));
        if (ftemp == NULL) {
            rc = -3;
            break;
        }

        UChar *vtemp = calloc(buflen, sizeof(UChar));
        if (vtemp == NULL) {
            rc = -4;
            free(ftemp);
            break;
        }

        u_strFromUTF8(ftemp, buflen, NULL, list.fieldnames[i],
                strlen(list.fieldnames[i]), &status);
        if (!U_SUCCESS(status)) {
            rc = 2;
            free(ftemp);
            free(vtemp);
            break;
        }

        u_strFromUTF8(vtemp, buflen, NULL, row.values[i], strlen(row.values[i]),
                &status);
        if (!U_SUCCESS(status)) {
            rc = 3;
            free(ftemp);
            free(vtemp);
            break;
        }

        /* Quote the value unless it consists solely of digits.
         * Use u_isdigit(): passing a UChar (a UTF-16 code unit that can
         * exceed UCHAR_MAX) to isdigit() is undefined behavior. We can
         * also stop at the first non-digit. */
        bool isstr = false;
        for (int32_t j = 0; j < u_strlen(vtemp); ++j) {
            if (!u_isdigit(vtemp[j])) {
                isstr = true;
                break;
            }
        }

        u_strcat(buf16, ftemp);
        u_strcat(buf16, u"=");
        if (isstr) u_strcat(buf16, u"'");
        u_strcat(buf16, vtemp);
        if (isstr) u_strcat(buf16, u"'");

        free(ftemp);
        free(vtemp);

        /* Separate assignments with commas, but not after the last one. */
        if (--num_left > 0)
            u_strcat(buf16, u",");
    }

    /* Convert the accumulated UTF-16 back to UTF-8 into the caller's buffer. */
    u_strToUTF8(buf, buflen, NULL, buf16, u_strlen(buf16), &status);
    if (!U_SUCCESS(status))
        rc = 4;

    free(buf16);
    return rc;
}
Пример #21
0
//
// This function recursively explores the compressed dictionary automaton
// 'tableau_bin' in order to decompose the word 'original_word' into a
// sequence of affix components, validating each component boundary against
// composition rules.
//
// 'adresse': offset of the current node in the binary array 'tableau_bin'
// 'current_component': buffer receiving the component currently being read
// 'pos_in_current_component': write position in 'current_component'
// 'original_word': the complete word being analyzed
// 'remaining_word'/'pos_in_remaining_word': the part of the word that is
//     still to be analyzed
// 'decomposition': textual log of the components matched so far
// 'lemma_prefix': lemma string built from the components matched so far
// 'L': output list receiving every decomposition found
// 'n_decomp': index (1-based) of the component being matched
// 'rule_list_called'/'dic_entr_called': composition rules and dictionary
//     entry of the previously matched component (0 for the first component)
// 'inf_codes': INF codes of the dictionary
// 'prefix'/'suffix': per-INF-index flags telling whether an entry is a
//     valid prefix/suffix component
//
void explore_state (int adresse,
		    unichar* current_component,
		    int pos_in_current_component,
		    const unichar* original_word,
		    const unichar* remaining_word,
		    int pos_in_remaining_word,
		    const unichar* decomposition,
		    const unichar* lemma_prefix,
		    struct decomposed_word_list** L,
		    int n_decomp,
		    struct rule_list* rule_list_called,
		    const struct dela_entry* dic_entr_called,
		    const unsigned char* tableau_bin,
		    const struct INF_codes* inf_codes,
		    const bool* prefix,const bool* suffix,const Alphabet* alphabet,
		    U_FILE* debug_file,struct utags UTAG,
		    vector_ptr* rules,vector_ptr* entries)
{

  // Node header: the high bit marks a non-final state, the remaining bits
  // hold the number of outgoing transitions
  int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
  int index;
  int t = 0;

  if ( !(c&32768) ) { // if we are in a terminal state

    // index of the INF line associated with this final state
    index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
    current_component[pos_in_current_component] = '\0';

    if (pos_in_current_component >= 1) {
      // go on if word length equals zero

#if DDEBUG > 0
      {
         u_fprintf(debug_file,". %S\n",current_component);
      }
#endif

      // loop over every INF code of the current component
      struct list_ustring* l = inf_codes->codes[index];
      while ( l != 0 ) {

//	int one_rule_already_matched = 0; // one rule matched each entry is enough

	unichar entry[MAX_DICT_LINE_LENGTH];
	uncompress_entry(current_component, l->string, entry);

#if DDEBUG > 0
	{
	  u_fprintf(debug_file,": %S\n",entry);
	}
#endif

	struct dela_entry* dic_entr = new_dic_entry(entry,entries);

	unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
	struct rule_list* rule_list_new = 0;
	unichar next_remaining_word[MAX_WORD_LENGTH];

	struct rule_list* rule_list = 0;
	if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
	  rule_list = parse_rules(entry,UTAG,rules);
	else {
	  rule_list = new_rule_list(rules);
	  rule_list->rule = new_composition_rule();
	}
	// entry is now cleaned from rules for composition and derivation

	// log decomposition of word
	// ("cleaned" entries for better overview)
	unichar decomposition_new[MAX_DICT_LINE_LENGTH];
	u_strcpy(decomposition_new, decomposition);
	if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
	u_strcat(decomposition_new, entry);


	// loop on all composition_rules called
	struct rule_list* called = rule_list_called;
	do { // while ( rule_list* called != 0 )

// 	  if (one_rule_already_matched)
// 	    break;

 	  struct composition_rule* rule_called
	    = ( called != 0 ) ? called->rule : 0; // may be undefined

	  // loop on all actual composition_rules
	  struct rule_list* r_list = rule_list;
 	  while ( r_list != 0 ) {

// 	    if (one_rule_already_matched)
// 	      break;

	    struct composition_rule* rule = r_list->rule; // ever defined, see upwards

	    if (remaining_word[pos_in_remaining_word]=='\0' &&
		// we have explored the entire original word
		((((dic_entr_called != 0) &&
		   composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))  &&
		  ((rule_called != 0) &&
		   composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) ||
		 // and we have a valid right component, i.e. rules match
		 ((dic_entr_called == 0) &&  // or a simple entry (i.e. no prefix),
		  (! affix_is_valid(index,prefix,suffix))) // but no affix
		 )
		)  {

//	      one_rule_already_matched = 1;

	      unichar inflected[MAX_WORD_LENGTH];
	      unichar lemma[MAX_WORD_LENGTH];
	      unichar codes[MAX_DICT_LINE_LENGTH];
	      tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);

	      /* generating new lexicon entry */
	      unichar new_dela_line[MAX_DICT_LINE_LENGTH];

	      /* word form */
	      u_strcpy(new_dela_line, original_word);
	      u_strcat(new_dela_line, ",");

	      /* lemma */                           // lemmatize word
	      if (rule->then.repl[0] == '\0'	    // if there are no replace codes
		  && (rule_called != 0              // either in actual nor in preceeding rule
		      && rule_called->then.repl[0] == '\0')) {
		u_strcat(new_dela_line, lemma_prefix);
		unichar affix[MAX_WORD_LENGTH];
		u_strcpy(affix, lemma);
		substring_operation(affix, rule->then.substr_act);
		if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
		  substring_operation(affix, rule_called->then.undo_substr_next);
		u_strcat(new_dela_line, affix);
	      } else {
		u_strcat(new_dela_line, original_word);
	      }

	      /* codes */
	      u_strcat(new_dela_line,".");
	      if (rule->then.repl[0] != '\0') {            // replacing codes by
		u_strcat(new_dela_line,rule->then.repl);   // suffix' ones
	      }
	      else if (rule_called == 0) { // prohibit SGV
		u_strcat(new_dela_line,codes);
	      }
	      else if (rule_called->then.repl[0] != '\0') {
		u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
	      }
	      // replace replaces all and blocks adding and deleting
	      // maybe this is not optimal ???
	      else {
		if (rule_called->then.add[0] != '\0') {        // add codes
		  if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
		    // insert the added code right before the first ':' of the
		    // code string, or append it if there is no ':'
		    bool done = 0;
		    unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
		    int j = 0;
		    for (int i = 0; codes[i] != '\0'; i++) {
		      if (codes[i] == ':' && (!done)) {
			tmp[j++] = '+';
			tmp[j] = '\0';
			u_strcat(new_dela_line,tmp);
			u_strcat(new_dela_line,rule_called->then.add);
			done = 1;
			j = 0;
		      }
		      tmp[j++] = codes[i];
		    }
		    tmp[j] = '\0';
		    u_strcat(new_dela_line,tmp);
		    if (!done) {
		      u_strcat(new_dela_line,"+");
		      u_strcat(new_dela_line,rule_called->then.add);
		    }
		  } else {
		    u_strcat(new_dela_line,codes);
		  }
		} else if (rule_called->then.del[0] != '\0') { // delete codes

		} else {
		  u_strcat(new_dela_line,codes);
		}
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"= %S\n",new_dela_line);
	      }
#endif

	      // unshift the new analysis onto the result list L
	      struct decomposed_word* wd = new_decomposed_word();
	      wd->n_parts = n_decomp;
	      u_strcpy(wd->decomposition,decomposition_new);
	      u_strcpy(wd->dela_line,new_dela_line);
	      struct decomposed_word_list* wdl=new_decomposed_word_list();
	      // unshift actual decomposition to decomposition list L
	      wdl->element = wd;
	      wdl->suivant = (*L);
	      (*L) = wdl;

	    } // end if end of word and valid right component
	    else if
	      // beginning or middle of word: explore the rest of the original word
	      (prefix_is_valid(index,prefix) &&
	       check_is_valid(UTAG.PREFIX, dic_entr) &&
	       // but only if the current component was a valid left one
	       // we go on with the next component
	       (
		(n_decomp == 1) // prefix as first part of a word: no rule matching
		||
		(               // prefix in the middle of a word
		 (rule_called &&
		  composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
		 (dic_entr_called &&
		  composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
		)
	       )) {

//	      one_rule_already_matched = 1;

	      u_strcpy(lemma_prefix_new, lemma_prefix);
	      unichar affix[MAX_WORD_LENGTH];
	      u_strcpy(affix, current_component);
	      if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
            substring_operation(affix, rule_called->then.undo_substr_next);
            u_fprintf(debug_file,"yes\n");
	      }
	      substring_operation(affix, rule->then.substr_act);
	      u_strcat(lemma_prefix_new, affix);
	      int j = 0;
	      for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
            next_remaining_word[j++] = remaining_word[i];
         }
	      next_remaining_word[j] = '\0';
	      if (rule->then.substr_next[0] != '\0') {
            substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
            {
               u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word);
            }
#endif
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"- %S\n",entry);
	      }
#endif
	      // record the rule that matched, appending it to rule_list_new
	      struct rule_list* tmp = new_rule_list(rules);
	      tmp->rule = new_composition_rule();
	      copy_composition_rule(tmp->rule, rule);
	      tmp->next = 0;
	      if ( rule_list_new == 0 ) {
            rule_list_new = tmp;
	      }
	      else {
            struct rule_list* trl = rule_list_new;
            while ( trl->next != 0 ) {
               trl=trl->next;
            }
            trl->next = tmp;
	      }

	    }
	    else {
	      // no valid suffix nor prefix
	    }

	    r_list = r_list->next;
	  } // while ( rule_list* r_list != 0 )

	  if ( called != 0 )
	    called = called->next;
	} while ( called != 0 );

	// prefix found, try to decomposite rest of word
	// NOTE(review): next_remaining_word is written only in the prefix
	// branch above; rule_list_new != 0 implies that branch executed,
	// so the reads below are initialized
	if ( rule_list_new != 0 && dic_entr != 0 ) {
	  unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
	  {
	    u_fprintf(debug_file,"> %S\n",next_remaining_word);
	  }
#endif
	  // restart at the automaton root (offset 4) for the next component
	  explore_state(4,
			next_component,
			0,
			original_word,
			next_remaining_word,
			0,
			decomposition_new,
			lemma_prefix_new,
			L,
			n_decomp+1,
			rule_list_new,
			dic_entr,
			tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
	}
	else {
	  // NOTE(review): dic_entr and rule_list are not freed here; they
	  // appear to be tracked by the 'entries'/'rules' vectors passed to
	  // new_dic_entry()/new_rule_list() — confirm there is no leak
// 	  free_dic_entry(dic_entr);
// 	  free_rule_list(rule_list);
	}

	l = l->next;

      } // end of while (token_list* l != 0)

      t = adresse+5;

    } // end of word length >= 1
  }
  else { // not a final state
    c = c-32768;
    t = adresse+2;
  }
  if (remaining_word[pos_in_remaining_word]=='\0') {
    // if we have finished, we return
//     free_dic_entry(dic_entr_called);
//     free_rule_list(rule_list_called);
    return;
  }
  // if not, we go on with the next letter
  for (int i=0;i<c;i++) {
    // follow the transition if its letter is case-compatible (either way)
    // with the current letter of the remaining word
    if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       remaining_word[pos_in_remaining_word],
			       alphabet)
	||
	is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
			       (unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       alphabet)) {
      index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      current_component[pos_in_current_component] =
	(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state(index,
		    current_component,
		    pos_in_current_component+1,
		    original_word,
		    remaining_word,
		    pos_in_remaining_word+1,
		    decomposition,
		    lemma_prefix,
		    L,
		    n_decomp,
		    rule_list_called,
		    dic_entr_called,
		    tableau_bin,
		    inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
    }
    t += 5;
  }
}
Пример #22
0
/**
 * This explores the dictionary in order decompose the given word into a valid sequence
 * of simple words. For instance, if we have the word "Sommervarmt", we will first
 * explore the dictionary and find that "sommer" is a valid left component that
 * corresponds to the dictionary entry "sommer,.N:msia". Then we will
 * look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word to
 * analyze and as "varmt" is a valid rightmost component, we will generate an entry
 * according to the following things:
 *
 * 'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 * 'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analysis for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
                   const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
                   const unichar* output_dela_line,struct word_decomposition_list** L,
                   int number_of_components,struct norwegian_infos* infos) {
/* NOTE(review): the fixed 2000-unichar buffers used below are filled with
 * unchecked u_strcat/u_strcpy calls; this assumes dictionary entries and
 * analyses stay well under that size — confirm for pathological inputs */
int c;
int index,t;
/* Node header: the high bit marks a non-final node, the remaining bits
 * hold the number of outgoing transitions */
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
	/* If we are in a final state, we compute the index of the
	 * corresponding INF line */
	index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
	/* We can set the end of our current component */
	current_component[pos_in_current_component]='\0';
	/* We do not consider words of length 1 */
	if (pos_in_current_component>1) {
		/* We don't consider components with a length of 1 */
		if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
			/* If we have explored the entire original word */
			if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
				/* And if we do not have forbidden word in last position */
				struct list_ustring* l=infos->inf->codes[index];
				/* We will look at all the INF codes of the last component in order
				 * to produce analysis */
				while (l!=NULL) {
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						/* If we have already something in the analysis (i.e. if
						 * we have not a simple word), we insert the concatenation
						 * mark before the entry to come */
						u_strcat(dec," +++ ");
					}
					unichar entry[2000];
					/* We get the dictionary line that corresponds to the current INF code */
					uncompress_entry(current_component,l->string,entry);
					/* And we add it to the analysis */
					u_strcat(dec,entry);
					unichar new_dela_line[2000];
					/* We copy the current output DELA line that contains
					 * the concatenation of the previous components */
					u_strcpy(new_dela_line,output_dela_line);
					/* Then we tokenize the DELA line that corresponds the current INF
					 * code in order to obtain its lemma and grammatical/inflectional
					 * information */
					struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
					/* We concatenate the inflected form of the last component to
					 * the output DELA line */
					u_strcat(new_dela_line,tmp_entry->inflected);
					/* We put the comma that separates the inflected form and the lemma */
					u_strcat(new_dela_line,",");
					/* And we build the lemma in the same way than the inflected form */
					u_strcat(new_dela_line,output_dela_line);
					u_strcat(new_dela_line,tmp_entry->lemma);
					/* We put the dot that separates the the lemma and the grammatical/inflectional
					 * information */
					u_strcat(new_dela_line,".");
					/* And finally we put the grammatical/inflectional information */
					u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               int k;
               /* Remaining semantic codes are appended with '+' separators */
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               /* Inflectional codes are appended with ':' separators */
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
					free_dela_entry(tmp_entry);
					/*
					 * Now we can build an analysis in the form of a word decomposition
					 * structure, but only if the last component is a valid
					 * right one or if it is a verb long enough, or if we find out
					 * that the word to analyze was in fact a simple word
					 * in the dictionary */
					if (verb_of_more_than_4_letters(entry)
						|| check_valid_right_component_for_one_INF_code(l->string)
						|| number_of_components==1) {
						/*
						 * We set the number of components, the analysis, the actual
						 * DELA line and information about
						 */
						struct word_decomposition* wd=new_word_decomposition();
						wd->n_parts=number_of_components;
						u_strcpy(wd->decomposition,dec);
						u_strcpy(wd->dela_line,new_dela_line);
						wd->is_a_valid_right_N=check_N_right_component(l->string);
						wd->is_a_valid_right_A=check_A_right_component(l->string);
						/* Then we add the decomposition word structure to the list that
						 * contains all the analysis for the word to analyze */
						struct word_decomposition_list* wdl=new_word_decomposition_list();
						wdl->element=wd;
						wdl->next=(*L);
						(*L)=wdl;
					}
					/* We go on with the next INF code of the last component */
					l=l->next;
				}
			}
			/* If are at the end of the word to analyze, we have nothing more to do */
			return;
		} else {
			/* If we are not at the end of the word to analyze, we must
			 * 1) look if the current component is a valid left one
			 * 2) look if it is not a forbidden component and
			 * 3) explore the rest of the original word
			 */
			if (infos->valid_left_component[index] &&
				(get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
				/* If we have a valid component, we look first if we are
				 * in the case of a word ending by a double letter like "kupp" */
				if (pos_in_current_component>2 &&
					(current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
					/* If we have such a word, we add it to the current analysis,
					 * putting "+++" if the current component is not the first one */
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						u_strcat(dec," +++ ");
					}
					/* In order to print the component in the analysis, we arbitrary
					 * take a valid left component among all those that are available
					 * for the current component */
					unichar sia_code[2000];
					unichar entry[2000];
					unichar line[2000];
					get_first_valid_left_component(infos->inf->codes[index],sia_code);
					uncompress_entry(current_component,sia_code,entry);
					u_strcat(dec,entry);
					u_strcpy(line,output_dela_line);
					u_strcat(line,current_component);
					/* As we have a double letter at the end of the word,
					 * we must remove a character */
					line[u_strlen(line)-1]='\0';
					unichar temp[2000];
					unichar dec_temp[2000];
					u_strcpy(dec_temp,dec);
					/* Then, we explore the dictionary in order to analyze the
					 * next component. We start at the root of the dictionary
					 * (offset=4) and we go back one position in the word to analyze.
					 * For instance, if we have "kupplaner", we read "kupp" and then
					 * we try to analyze "planner". */
					explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
						dec_temp,line,L,number_of_components+1,infos);
				}
				/* Now, we try to analyze the component normally, even if
				 * it was ended by double letter, because we can have things
				 * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
				unichar dec[2000];
				unichar line[2000];
				u_strcpy(dec,analysis);
				if (dec[0]!='\0') {
					/* We add the "+++" mark if the current component is not the first one */
					u_strcat(dec," +++ ");
				}
				unichar sia_code[2000];
				unichar entry[2000];
				/* In order to print the component in the analysis, we arbitrary
				 * take a valid left component among all those that are available
				 * for the current component */
				get_first_valid_left_component(infos->inf->codes[index],sia_code);
				uncompress_entry(current_component,sia_code,entry);
				u_strcat(dec,entry);
				u_strcpy(line,output_dela_line);
				u_strcat(line,current_component);
				unichar temp[2000];
				unichar dec_temp[2000];
				u_strcpy(dec_temp,dec);
				/* Then, we explore the dictionary in order to analyze the
				 * next component. We start at the root of the dictionary
				 * (offset=4). */
				explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
					dec_temp,line,L,number_of_components+1,infos);
			}
		}
	}
	/* Once we have finished to deal with the current final dictionary node,
	 * we go on because we may match a longer word */
	t=offset+5;
}
else {
	/* If the node is not a final one, we get compute the number of transitions by
	 * removing the highest bit */
	c=c-32768;
	t=offset+2;
}
/* We examine each transition that goes out from the node */
for (int i=0;i<c;i++) {
	if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
		/* If the transition's letter is case compatible with the current letter of the
		 * word to analyze, we follow it */
		index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
		current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
		explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
			analysis,output_dela_line,L,number_of_components,infos);
	}
	/* We move the offset to the next transition */
	t=t+5;
}
}
Пример #23
0
//
// This function explores the dictionary automaton 'tableau_bin' in order to
// decompose the German word 'original_word' into a valid left component
// (prefix) followed by a valid right component, producing one DELA line per
// decomposition found.
//
// 'adresse': offset of the current node in the binary array 'tableau_bin'
// 'current_component'/'pos_in_current_component': component being read
// 'original_word'/'pos_in_original_word': word under analysis and position
// 'decomposition': analysis log like "a,.X +++ b,.Y"
// 'dela_line': concatenation of the inflected forms matched so far.
//              Note: this buffer is modified in place (its first letter is
//              capitalized when a prefix is present, see below)
// 'L': output list receiving the decompositions
// 'n_decomp': number of components in the current analysis
// 'left'/'right': per-INF-index flags for valid left/right components
//
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
                   const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
                   unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
                   const char* left,const char* right,
                   const struct INF_codes* inf_codes,const Alphabet* alphabet,
                   const unsigned char* tableau_bin) {
int c;
int index,t;
// Node header: high bit marks a non-final state, remaining bits hold the
// number of outgoing transitions
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
  // if we are in a terminal state
  index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
  current_component[pos_in_current_component]='\0';
  if (pos_in_current_component>1) {
    // we don't consider words with a length of 1
    if (original_word[pos_in_original_word]=='\0') {
      // if we have explored the entire original word
      if (right[index]) {
         // and if we have a valid right component
         // loop over all INF codes of this final component
         struct list_ustring* l=inf_codes->codes[index];
         while (l!=NULL) {
            unichar dec[500];
            u_strcpy(dec,decomposition);
            if (dec[0]!='\0') {u_strcat(dec," +++ ");}
            unichar entry[500];
            uncompress_entry(current_component,l->string,entry);
            u_strcat(dec,entry);
            unichar new_dela_line[500];
            struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
            if (tmp_entry==NULL) {
               /* If there was an error in the dictionary, we skip the entry */
               l=l->next;
               continue;
            }
            // change case if there is a prefix
            // prefixes are downcase, nouns (=suffixes) uppercase:
            // "investitionsObjekte" -> "Investitionsobjekte"
            if ( u_strlen(dela_line) != 0 ) {
              // capitalize dela_line
              dela_line[0] = u_toupper((unichar) dela_line[0]);
              // downcase lemma and inflected
              tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
              tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
            }
            // build "<prefix><inflected>,<prefix><lemma>.<codes>"
            u_strcpy(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->inflected);
            u_strcat(new_dela_line,",");
            u_strcat(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->lemma);
            u_strcat(new_dela_line,".");
            u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
            int k;
            for (k=1;k<tmp_entry->n_semantic_codes;k++) {
               u_strcat(new_dela_line,"+");
               u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
            }
            for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
               u_strcat(new_dela_line,":");
               u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
            }
            free_dela_entry(tmp_entry);
            struct german_word_decomposition* wd=new_german_word_decomposition();
            wd->n_parts=n_decomp;
            u_strcpy(wd->decomposition,dec);
            u_strcpy(wd->dela_line,new_dela_line);
            if (check_valid_right_component_for_one_INF_code_german(l->string)) {
               // if we got a correct right component (N-FF)
               struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
               wdl->element=wd;
               wdl->suivant=(*L);
               (*L)=wdl;
            } else {
               free_german_word_decomposition(wd);
            }
            l=l->next;
         }
      }
    }
    else {
      // else, we must explore the rest of the original word
      if (left[index]) {
         // but only if the current component was a valid left one
         // we go on with the next component
         unichar dec[2000];
         unichar line[500];
         u_strcpy(dec,decomposition);
         if (dec[0]!='\0') {u_strcat(dec," +++ ");}
         unichar sia_code[500];
         unichar entry[500];
         get_first_sia_code_german(index,sia_code,inf_codes);
         uncompress_entry(current_component,sia_code,entry);
         u_strcat(dec,entry);
         u_strcpy(line,dela_line);
         u_strcat(line,current_component);
         unichar temp[500];
         // restart at the automaton root (offset 4) for the next component
         explore_state_german(4,temp,0,original_word,pos_in_original_word,
                  dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
      }
    }
  }
  t=adresse+5;
}
else {
  c=c-32768;
  t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
   // if we have finished, we return
   return;
}
// if not, we go on with the next letter
for (int i=0;i<c;i++) {
  // follow the transition if its letter is case-compatible (either way)
  // with the current letter of the original word
  if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
      || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
    index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
    current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
    explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
                  decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
  }
  t=t+5;
}
}
Пример #24
0
/**
 * Computes training by extracting statistics from a tagged corpus file.
 *
 * 'input_text':  the tagged corpus; either one word per line in the Tagger
 *                format ("word/tag") or one sentence per line in the Unitex
 *                tagged format ("{word,lemma.tag} ..."). The format is
 *                auto-detected from the first line.
 * 'rforms_file': if non-NULL, receives raw-form tuple statistics.
 * 'iforms_file': if non-NULL, receives inflected-form tuple statistics.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
	rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
	iforms_table = new_string_hash_ptr(200000);
}


/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);


unichar line[MAX_TAGGED_CORPUS_LINE];

/* check the format of the corpus: read the first line, then rewind */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
	fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);

int format_corpus = check_corpus_entry(line);

if(format_corpus == 0){
	// the corpus is in the Tagger format, one word per line where line=word/tag
	while(u_fgets(line,input_text) !=EOF){
		if(u_strlen(line) == 0){
			/* an empty line ends the sentence: reset the context */
			initialize_context_matrix(context);
		}
		else{
			corpus_entry* entry = new_corpus_entry(line);
			if(u_strchr(line,'_')!=NULL && line[0]!='_'){
				/* compound entry: split it into its simple words */
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int i=0;entries[i]!=NULL;i++){
					push_corpus_entry(entries[i],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}
		}
	}
}
else {
	// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
	unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
	if(s == NULL){
		/* allocation was previously unchecked */
		fatal_alloc_error("do_training");
	}
	int current_len,len;
	unsigned int i;
	while(u_fgets(line,input_text) != EOF){
		current_len = 0, len = 0;
		/* extract each token of the sentence */
		for (;;) {
			/* copy the content of the next {...} token into s */
			len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
			tmp = u_strcpy_sized(s,len-1,line+current_len+1);
			u_strcat(tmp,"\0");
			if(u_strcmp(s,"S") == 0)
				break;

			//particular case: '\},\}.PONCT'
			if(line[current_len+2] == '}'){
				int start = current_len+3;
				do{
					tmp = u_strchr(line+start,'}');
					start += 1+u_strlen(line+start)-u_strlen(tmp);
				}
				while(*(tmp+1) != ' ');
				tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
				u_strcat(tmp,"\0");
				len += start-current_len-3;
			}

			/* format the {XX.YY} into standard tagger format, XX/YY */
			unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
			if(newline == NULL){
				fatal_alloc_error("do_training");
			}
			if(u_strchr(s,',')[1] == ','){
				u_strcpy(newline,",");
			}
			else {
				u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
			}
			/* Append the tag part. The original code did this with
			 * u_sprintf(newline,"%S/%S",newline,...), passing 'newline' both
			 * as destination and as source argument, which is undefined
			 * behavior for sprintf-like functions; plain concatenation is
			 * equivalent and safe. */
			u_strcat(newline,"/");
			u_strcat(newline,s+u_strrchr(s,'.')+1);
			/* spaces inside a token are encoded as underscores */
			for(i=0;i<u_strlen(newline);i++){
				if(newline[i] == ' ')
					newline[i] = '_';
			}

			//create corpus entry
			corpus_entry* entry = new_corpus_entry(newline);
			if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
				corpus_entry** entries = extract_simple_words(entry);
				free_corpus_entry(entry);
				for(int j=0;entries[j]!=NULL;j++){
					push_corpus_entry(entries[j],context);
					add_statistics(context,rforms_table,iforms_table);
				}
				free(entries);
			}
			else {
				push_corpus_entry(entry,context);
				add_statistics(context,rforms_table,iforms_table);
			}

			free(newline);
			current_len += len+1;
		}
		/* end of sentence: reset the context */
		initialize_context_matrix(context);
	}
	free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
	write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
	u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
	free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
	write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
	u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
	free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
Пример #25
0
/**
 * This function produces a normalized version of 'input' and stores it into 'output'.
 * The following rules are applied in the given order:
 *
 * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or
 *    a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any,
 *    according to the replacement rules. Otherwise, we let the token unchanged.
 * 2) If there is one or more replacement rules that can apply to the current
 *    position in 'input', then we apply the longest one.
 * 3) If we find a separator (space, tab, new line) sequence, we replace it:
 *    - by a new line if the sequence contains one and if 'carriage_return_policy' is
 *      set to KEEP_CARRIAGE_RETURN;
 *    - by a space otherwise.
 * 4) We copy the character that was read to the output.
 *
 * Note that 'replacements' is supposed to contain replacement rules for { and }
 *
 * Returns 0 on success, or 1 if 'fin' cannot be opened or 'fout' cannot be created.
 */
int normalize(const char *fin, const char *fout, 
              Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
              int carriage_return_policy, const char *rules) {
	U_FILE* input;
	input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ);
	if (input == NULL) {
		error("Cannot open file %s\n", fin);
		return 1;
	}

	U_FILE* output;
	output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE);
	if (output == NULL) {
		error("Cannot create file %s\n", fout);
		u_fclose(input);
		return 1;
	}

	struct string_hash* replacements=NULL;
	if(rules != NULL && rules[0]!='\0') {
		replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t');
		if (replacements==NULL) {
		   error("Cannot load replacement rules file %s\n", rules);
		   replacements=new_string_hash();
		}
	}
	/* If there is no replacement rules file, we simulate one */
	else {
	   replacements=new_string_hash();
	}

	/* If there is a replacement rule file, we ensure that there are replacement
	 * rules for { and }. If not, we add our default ones, so that in any case,
	 * we are sure to have rules for { and } */
	unichar key[2];
	unichar value[2];
	u_strcpy(key,"{");
	u_strcpy(value,"[");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);
	u_strcpy(key,"}");
	u_strcpy(value,"]");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);

    struct OUTBUF OutBuf;
    OutBuf.pos=0;
	unichar tmp[MAX_TAG_LENGTH];
	//struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input);

    /* We measure the input file size so that the line buffer can be sized
     * to read small files in one shot, capped at MAX_LINE_BUFFER_SIZE */
    long save_pos=ftell(input);
    fseek(input,0,SEEK_END);
    long file_size_input=ftell(input);
    fseek(input,save_pos,SEEK_SET);

    int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE);

    unichar *line_read;
    line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar));
    if (line_read==NULL) {
        fatal_alloc_error("normalize");
    }

	/* We define some things that will be used for parsing the buffer */


    static const unichar stop_chars[]= { '{', '}', 0 };
    static const unichar forbidden_chars[]= { '\n', 0 };
    static const unichar open_bracket[]= { '{', 0 };
    static const unichar close_bracket[]= { '}', 0 };
    static const unichar empty_string[]= { 0 };

   int corrupted_file=0;
   int eof_found=0;
   /* First, we fill the buffer */
	
    int lastline_was_terminated=0;

    while (eof_found==0) {
        int current_start_pos=0;
        int found_null=0;
        /* 'buff' aliases 'line_read'; the refill code below writes through
         * 'line_read' and reads back through 'buff' */
        const unichar*buff=line_read;
        int result_read = 0;

        /* u_fgets_treat_cr_as_lf reads one line, reporting bare CRs as 0x0d
         * and flagging embedded NUL characters through 'found_null' */
        result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null);
        if ((found_null != 0) && (corrupted_file==0)) {
          corrupted_file=1;
          error("Corrupted text file containing NULL characters!\n");
          error("They have been ignored by Normalize, but you should clean your text\n");
        }

        /* A trailing CR is normalized to \n so the logic below only has to
         * deal with one kind of terminator at the end of the buffer */
        if (result_read>0)
            if (line_read[result_read-1]==0x0d)
                line_read[result_read-1]='\n';
        
        if (result_read==EOF)
            break;

        /* If the previous chunk ended on a terminated line, its separator
         * sequence has already been emitted, so we skip any leading
         * separators that belong to the same run */
        if (lastline_was_terminated != 0)
            while (current_start_pos<result_read) {
                if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t'
							    && buff[current_start_pos]!=0x0d
                                && buff[current_start_pos]!='\n')
                                break;
                current_start_pos++;
            }

        lastline_was_terminated = 0;
        if (result_read > 0)
            if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d))
                lastline_was_terminated = 1;


        while (current_start_pos<result_read) {
            /* When we are near the end of an unterminated line, we shift the
             * unread tail to the front of the buffer and read more characters,
             * so that a tag like {today,.ADV} is never split across two reads */
            if ((lastline_was_terminated == 0) && (eof_found == 0) && 
                (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read))
            {
                int i;
                int nb_to_keep = result_read-current_start_pos;
                for (i=0;i<nb_to_keep;i++)
                    line_read[i]=line_read[current_start_pos+i];
                int found_null_read=0;
                int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read);

                if ((found_null_read != 0) && (corrupted_file==0)) {
                    corrupted_file=1;
                    error("Corrupted text file containing NULL characters!\n");
                    error("They have been ignored by Normalize, but you should clean your text\n");
                }

                /* Same trailing-CR normalization as above, on the refilled part */
                if (result_read_continue>0)
                    if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d)
                        line_read[(result_read_continue+nb_to_keep)-1]='\n';
                lastline_was_terminated = 0;
                if (result_read_continue==EOF)
                    eof_found = lastline_was_terminated = 1;

                if (result_read_continue > 0)
                    if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d))
                        lastline_was_terminated = 1;

                result_read = nb_to_keep;
                current_start_pos = 0;

                if (result_read_continue > 0)
                    result_read += result_read_continue;
            }

		if (buff[current_start_pos]=='{') {
			/* If we have a {, we try to find a sequence like {....}, that does not contain
			 * new lines. If the sequence contains protected character, we want to keep them
			 * protected. */
			int old_position=current_start_pos;
			/* If we don't increase the position, the parse will stop on the initial { */
			current_start_pos++;
			tmp[0]='{';
			int code=parse_string(buff,&current_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL);
			if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') {
				/* If we have found a new line or a {, or if there is
				 * a backslash at the end of the buffer, or if we have reached the end
				 * of the buffer, we assume that the initial
				 * { was not a tag beginning, so we print the substitute of { */
				WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
				/* And we rewind the current position after the { */
				current_start_pos=old_position+1;
			}
			else {
				/* If we have read a sequence like {....}, we assume that there won't be
				 * a buffer overflow if we add the } */
				u_strcat(tmp,close_bracket);
				if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) {
					/* If this is a special tag or a valid tag token, we just print
					 * it to the output */
					WriteOufBuf(&OutBuf,tmp,output, 0);
					current_start_pos++;
				}
				else {
					/* If we have a non valid tag token, we print the equivalent of {
					 * and we rewind the current position after the { */
					WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
					current_start_pos=old_position+1;
				}
			}
		}
		else {
			/* If we have a character that is not {, first we try to look if there
			 * is a replacement to do */
			int key_length;
			int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements);
			if (index!=NO_VALUE_INDEX) {
				/* If there is something to replace */
				WriteOufBuf(&OutBuf,replacements->value[index],output, 0);
				current_start_pos=current_start_pos+key_length;
			}
			else {
				if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
					/* If we have a separator, we try to read the longest separator sequence
					 * that we can read. By the way, we note if it contains a new line */
					int new_line=0;
					while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t'
							|| buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
						/* Note 1: no bound check is needed, since an unichar buffer is always
						 *        ended by a \0
						 *
						 * Note 2: we don't take into account the case of a buffer ended by
						 *         separator while it's not the end of file: that would mean
						 *         that the text contains something like MARGIN_BEFORE_BUFFER_END
						 *         contiguous separators. Such a text would not be a reasonable one.
						 */
						if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
							new_line=1;
						}
						current_start_pos++;
					}
					if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) {
						/* We print a new line if the sequence contains one and if we are
						 * allowed to; otherwise, we print a space. */
						WriteOufBuf(&OutBuf,'\n',output, 0);
					}
					else {
						WriteOufBuf(&OutBuf,' ',output, 0);
					}
				}
				else {
					/* If, finally, we have a normal character to normalize, we just print it */
                    WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0);
				}
			}
		}
	    }
    }


    /* We flush whatever is still pending in the output buffer */
    WriteOufBuf(&OutBuf,empty_string,output, 1);

	free(line_read);
	free_string_hash(replacements);

	u_fclose(input);
	u_fclose(output);
	return 0;
}
/**
 * This function explores the normalization grammar to construct
 * the normalization tree. If the 'list' parameter is NULL, then we
 * are in the main call to the main graph; otherwise, we are within
 * a subgraph.
 *
 * fst2:          the normalization grammar being explored
 * current_state: index of the fst2 state to explore
 * node:          current position in the normalization tree being built
 * tokens:        the token set used to match fst2 inputs
 * output:        the normalization output accumulated along the current path
 * alph:          alphabet used for token/sequence matching
 * list:          in a subgraph call, receives the (output,node) pairs reached
 *                in final states; NULL in the main call
 */
void explore_normalization_fst2(Fst2* fst2,int current_state,
                                struct normalization_tree* node,
                                struct string_hash* tokens,const unichar* output,
                                const Alphabet* alph,struct norm_info** list) {
Fst2State state=fst2->states[current_state];
if (is_final_state(state)) {
   /* If we are in a final state, we behave differently if we are in a subgraph
    * or in the main call to the main graph. */
   if (list!=NULL) {
      (*list)=insert_in_norm_info_list(output,node,(*list));
   }
   else {
      node->outputs=sorted_insert(output,node->outputs);
   }
}
Transition* trans=state->transitions;
/* NOTE(review): fixed-size scratch buffer; the u_strcpy/u_strcat calls below
 * do no bound checking, so an accumulated output longer than 1024 chars would
 * overflow it — confirm grammar outputs are guaranteed to stay shorter */
unichar tmp[1024];
while (trans!=NULL) {
   if (trans->tag_number<0) {
      /* Case of a subgraph call: a negative tag number encodes the subgraph index */
      struct norm_info* tmp_list=NULL;
      explore_normalization_fst2(fst2,fst2->initial_states[-(trans->tag_number)],node,
                                        tokens,output,alph,&tmp_list);
      while (tmp_list!=NULL) {
         /* We continue to explore the current graph from each (output,node)
          * pair produced by the subgraph */
         explore_normalization_fst2(fst2,trans->state_number,tmp_list->node,
                                        tokens,tmp_list->output,alph,list);
         struct norm_info* z=tmp_list;
         tmp_list=tmp_list->next;
         free_norm_info(z);
      }
   }
   else {
      /* If we have a normal transition */
      Fst2Tag tag=fst2->tags[trans->tag_number];
      u_strcpy(tmp,output);
      u_strcat(tmp," ");
      if (tag->output!=NULL && tag->output[0]!='\0' && u_strcmp(tag->output,"<E>") && !only_spaces(tag->output)) {
         /* We append the output if it exists and is not epsilon */
         u_strcat(tmp,tag->output);
      }
      if (!u_strcmp(tag->input,"<E>")) {
         /* If we have an epsilon transition, we go on in the fst2, but
          * we don't move in the normalization tree */
         explore_normalization_fst2(fst2,trans->state_number,node,tokens,tmp,alph,list);
      } else {
         /* If we have a normal transition, we explore all the tokens that match it */
         struct list_int* l=get_token_list_for_sequence(tag->input,alph,tokens);
         while (l!=NULL) {
            /* Then, we add a branch in the normalization tree for
             * each token. Note that it may introduce combinatory explosions
             * if the fst2 matches large sequences */
            struct normalization_tree_transition* trans_norm;
            trans_norm=get_transition(l->n,node->trans);
            if (trans_norm==NULL) {
               /* If the transition does not exist in the tree, we create it */
               trans_norm=new_normalization_tree_transition(l->n,new_normalization_tree(),node->trans);
               node->trans=trans_norm;
            }
            explore_normalization_fst2(fst2,trans->state_number,trans_norm->node,
                                           tokens,tmp,alph,list);
            struct list_int* L=l;
            l=l->next;
            free(L);
         }
      }
   }
   trans=trans->next;
}
}
Пример #27
0
/**
 * Tests whether the composition rule 'rule' matches the dictionary entry 'd'.
 * A rule is an array of patterns terminated by an empty string; each pattern
 * either requires (YesNo != 0, printed as '+') or forbids (YesNo == 0,
 * printed as '-') a grammatical code (type 'g') or an inflectional code
 * (type 'f') of the entry. For required inflectional codes, it is enough
 * that at least one of them matches ('flex_code_already_matched').
 *
 * Returns non-zero if the rule matches the entry, 0 otherwise.
 * 'debug_file' is only used (and only named) when DDEBUG > 1.
 */
int composition_rule_matches_entry (const struct pattern* rule,
				     const struct dela_entry* d,U_FILE* 
#if DDEBUG > 1                         
				     debug_file
#endif
                     ) {
  int ok = 1;
  // "ok = 0;"  may be replaced by "return 0;"
  int flex_code_already_matched = 1;
#if DDEBUG > 1
  /* FIX: 'tmp' was previously used without ever being declared, so debug
   * builds (DDEBUG > 1) did not compile, and the accumulated trace was
   * never printed. We declare it here and emit it in the trace below. */
  unichar tmp[4096];
  u_strcpy(tmp, "   trying ");
#endif
  for (int i = 0; i < MAX_NUMBER_OF_COMPOSITION_RULES; i++) {
    if (rule[i].string[0] == '\0')
      break; // last rule reached: return 1
#if DDEBUG > 1
    {
      /* Append a textual form of the current pattern to the trace */
      if (rule[i].type == 'f')
	u_strcat(tmp, ":");
      else if (rule[i].YesNo)
	u_strcat(tmp, "+");
      else
	u_strcat(tmp, "-");
      u_strcat(tmp, rule[i].string);
    }
#endif
    if (rule[i].YesNo) { // rule '+' => pattern must be in entry, too
      if (rule[i].type == 'g') {
	if (dic_entry_contain_gram_code(d,rule[i].string))
	  continue; // rule matched, try next one
	ok = 0;
      }
      else if (rule[i].type == 'f') {
	if (dic_entry_contain_inflectional_code(d,rule[i].string)) {
	  // rule matched, try next one, but mark flex codes as matched
	  flex_code_already_matched = 2;
	  continue;
	}
	else if (flex_code_already_matched == 2) {
	  // no matter if any flex code already matched
	  continue;
	}
	else {
	  // no-matches before first match
	  flex_code_already_matched = 0;
	}
      }
    }
    else { // rule '-' => pattern must not be in entry
      if (rule[i].type == 'g') {
	if (dic_entry_contain_gram_code(d,rule[i].string))
	  ok = 0;
      }
      else if (rule[i].type == 'f') {
	// implemented although not possible in rule syntax
	if (dic_entry_contain_inflectional_code(d,rule[i].string))
	  ok = 0;
      }
    }
  }
#if DDEBUG > 1
  {
    u_fprintf(debug_file,"%S",tmp); // the "   trying ..." trace built above
    if (ok && flex_code_already_matched) u_fprintf(debug_file,"\n   === matched ");
    else u_fprintf(debug_file,"\n   === not matched ");
    if ( d->semantic_codes != 0 ) {
      for (int i = 0; i < d->n_semantic_codes; i++) {
         u_fprintf(debug_file,"+%S",d->semantic_codes[i]);
      }
    }
    if ( d->inflectional_codes != 0 ) {
      for (int i = 0; i < d->n_inflectional_codes; i++) {
         u_fprintf(debug_file,":%S",d->inflectional_codes[i]);
      }
    }
    u_fprintf(debug_file,"\n");
  }
#endif
  return (ok && flex_code_already_matched);
}
Пример #28
0
/////////////////////////////////////////////////////////////////////////////////
// Puts an inflected multi-word form 'f' corresponding to the DELAC entry 'dlc_entry' into the DELACF format ('entry').
// The resulting entry may take up to 'max' characters.
// 'entry' already has its space allocated by the caller.
// Returns 1 on error (the formatted entry would not fit in 'max' characters), 0 otherwise.
int DLC_format_form(struct l_morpho_t* pL_MORPHO,unichar* entry, int max, MU_f_T f, DLC_entry_T* dlc_entry,
		d_class_equiv_T* D_CLASS_EQUIV) {
	int l; // running length of the entry, NOT counting the terminating '\0';
	       // the invariant "l < max" must hold before each write so that the
	       // characters plus the '\0' fit into 'entry'

	//Inflected form
	l = u_strlen(f.form);
	if (l >= max)
		return 1;
	u_strcpy(entry, f.form);

	//Comma separating the form from the lemma
	l++;
	if (l >= max)
		return 1;
	u_strcat(entry, ",");

	//Lemma: the concatenation of all units of the multi-word lemma
	int u; //index of the current unit in the lemma of the MW form
	for (u = 0; u < dlc_entry->lemma->no_units; u++)
		l = l + u_strlen(dlc_entry->lemma->units[u]->form);
	if (l >= max)
		return 1;
	for (u = 0; u < dlc_entry->lemma->no_units; u++)
		u_strcat(entry, dlc_entry->lemma->units[u]->form);

	//Full stop separating the lemma from the codes
	l++;
	if (l >= max)
		return 1;
	u_strcat(entry, ".");

	//Inflection class
	l = l + u_strlen(d_get_str_class(dlc_entry->lemma->cl, D_CLASS_EQUIV));
	if (l >= max)
		return 1;
	u_strcat(entry, d_get_str_class(dlc_entry->lemma->cl, D_CLASS_EQUIV));

	//Semantic codes, each prefixed by '+' (the +1 below accounts for it)
	int c; //index of the current semantic code
	for (c = 0; dlc_entry->codes[c]; c++)
		l = l + u_strlen(dlc_entry->codes[c]) + 1;
	if (l >= max)
		return 1;
	for (c = 0; dlc_entry->codes[c]; c++) {
		u_strcat(entry, "+");
		u_strcat(entry, dlc_entry->codes[c]);
	}

	//Inflection features
	unichar* feat; //sequence of single-letter inflection features, e.g. 'sIf'
	if (f.features && f.features->no_cats > 0) {
		feat = d_get_str_feat(pL_MORPHO, f.features);
		l = l + u_strlen(feat) + 1; //Place for a ':' and all features
		if (l >= max) {
			free(feat); // FIX: 'feat' used to leak on this early return
			return 1;
		}
		u_strcat(entry, ":");
		u_strcat(entry, feat);
		free(feat);
	}

	//Comment
	if (dlc_entry->comment && u_strlen(dlc_entry->comment)) {
		//FIX: the '/' was not counted before, allowing a one-character
		//overflow of 'entry' when the comment exactly filled the buffer
		l = l + u_strlen(dlc_entry->comment) + 1; //Place for a '/' and the comment
		if (l >= max)
			return 1;
		u_strcat(entry, "/");
		u_strcat(entry, dlc_entry->comment);
	}
	return 0;
}