struct rule_list* parse_rules (unichar* entry,struct utags UTAG,vector_ptr* rules) { // parses dictionary entry to extract rules for derivation and composition struct rule_list* rule_list = new_rule_list(rules); struct rule_list* actual_list_pos = rule_list; unichar cleaned_entry[MAX_DICT_LINE_LENGTH]; // rules will be stripped off unichar beforcond[MAX_COMPOSITION_RULE_LENGTH]; unichar aftercond[MAX_COMPOSITION_RULE_LENGTH]; unichar then_code[MAX_COMPOSITION_RULE_LENGTH]; int bcpos, acpos, tpos; bcpos = acpos = tpos = 0; enum { BEGIN, BEFORE_COND, AFTER_COND, THEN }; int state = 0; int k = 0; for (int i = 0; entry[i] != '\0'; i++) { if ( state != BEGIN ) { // inside a rule if (entry[i] == '\\') i++; // unescaping escaped chars in rule if (entry[i] == ')') { // end of rule struct composition_rule* rule = new_composition_rule(); beforcond[bcpos] = '\0'; aftercond[acpos] = '\0'; then_code[tpos] = '\0'; parse_condition(beforcond, rule->before); parse_condition(aftercond, rule->after); parse_then_code(then_code, &rule->then); bcpos = acpos = tpos = 0; if (actual_list_pos->rule != 0) { // not first rule struct rule_list* tmp = new_rule_list(rules); actual_list_pos->next = tmp; actual_list_pos = tmp; } actual_list_pos->rule = rule; state = BEGIN; } else if (state == BEFORE_COND) { // condition before if (entry[i] == '#') state = AFTER_COND; else beforcond[bcpos++] = entry[i]; } else if (state == AFTER_COND) { // condition after if (entry[i] == '=') state = THEN; else aftercond[acpos++] = entry[i]; } else if (state == THEN) // then-code then_code[tpos++] = entry[i]; } else { // not inside a rule if (entry[i] == '+') { unichar tmp[MAX_DICT_LINE_LENGTH]; int j; for (j = i+1; ((entry[j] != '+') && (entry[j] != ':') && (entry[j] != '(') && (entry[j] != '\0')); j++) tmp[j-(i+1)] = entry[j]; tmp[j-(i+1)] = '\0'; if ((!u_strcmp(tmp, UTAG.PREFIX)) || (!u_strcmp(tmp, UTAG.SUFFIX))) { i = j-1; } else if (!u_strcmp(tmp, UTAG.RULE)) { i = j; // including '(' state = BEFORE_COND; } else { cleaned_entry[k++] = entry[i]; } } else { cleaned_entry[k++] = entry[i]; } } } cleaned_entry[k] = '\0'; u_strcpy(entry, cleaned_entry); if (rule_list->rule == 0) rule_list->rule = new_composition_rule(); return rule_list; }
// // this function explores the dictionary to decompose the word mot // void explore_state (int adresse, unichar* current_component, int pos_in_current_component, const unichar* original_word, const unichar* remaining_word, int pos_in_remaining_word, const unichar* decomposition, const unichar* lemma_prefix, struct decomposed_word_list** L, int n_decomp, struct rule_list* rule_list_called, const struct dela_entry* dic_entr_called, const unsigned char* tableau_bin, const struct INF_codes* inf_codes, const bool* prefix,const bool* suffix,const Alphabet* alphabet, U_FILE* debug_file,struct utags UTAG, vector_ptr* rules,vector_ptr* entries) { int c = tableau_bin[adresse]*256+tableau_bin[adresse+1]; int index; int t = 0; if ( !(c&32768) ) { // if we are in a terminal state index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4]; current_component[pos_in_current_component] = '\0'; if (pos_in_current_component >= 1) { // go on if word length equals zero #if DDEBUG > 0 { u_fprintf(debug_file,". %S\n",current_component); } #endif struct list_ustring* l = inf_codes->codes[index]; while ( l != 0 ) { // int one_rule_already_matched = 0; // one rule matched each entry is enough unichar entry[MAX_DICT_LINE_LENGTH]; uncompress_entry(current_component, l->string, entry); #if DDEBUG > 0 { u_fprintf(debug_file,": %S\n",entry); } #endif struct dela_entry* dic_entr = new_dic_entry(entry,entries); unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH]; struct rule_list* rule_list_new = 0; unichar next_remaining_word[MAX_WORD_LENGTH]; struct rule_list* rule_list = 0; if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix)) rule_list = parse_rules(entry,UTAG,rules); else { rule_list = new_rule_list(rules); rule_list->rule = new_composition_rule(); } // entry is now cleaned from rules for composition and derivation // log decomposition of word // ("cleaned" entries for better overview) unichar decomposition_new[MAX_DICT_LINE_LENGTH]; u_strcpy(decomposition_new, decomposition); if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ "); u_strcat(decomposition_new, entry); // loop on all composition_rules called struct rule_list* called = rule_list_called; do { // while ( rule_list* called != 0 ) // if (one_rule_already_matched) // break; struct composition_rule* rule_called = ( called != 0 ) ? called->rule : 0; // may be undefined // loop on all actual composition_rules struct rule_list* r_list = rule_list; while ( r_list != 0 ) { // if (one_rule_already_matched) // break; struct composition_rule* rule = r_list->rule; // ever defined, see upwards if (remaining_word[pos_in_remaining_word]=='\0' && // we have explored the entire original word ((((dic_entr_called != 0) && composition_rule_matches_entry(rule->before, dic_entr_called,debug_file)) && ((rule_called != 0) && composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) || // and we have a valid right component, i.e. rules match ((dic_entr_called == 0) && // or a simple entry (i.e. no prefix), (! affix_is_valid(index,prefix,suffix))) // but no affix ) ) { // one_rule_already_matched = 1; unichar inflected[MAX_WORD_LENGTH]; unichar lemma[MAX_WORD_LENGTH]; unichar codes[MAX_DICT_LINE_LENGTH]; tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes); /* generating new lexicon entry */ unichar new_dela_line[MAX_DICT_LINE_LENGTH]; /* word form */ u_strcpy(new_dela_line, original_word); u_strcat(new_dela_line, ","); /* lemma */ // lemmatize word if (rule->then.repl[0] == '\0' // if there are no replace codes && (rule_called != 0 // either in actual nor in preceeding rule && rule_called->then.repl[0] == '\0')) { u_strcat(new_dela_line, lemma_prefix); unichar affix[MAX_WORD_LENGTH]; u_strcpy(affix, lemma); substring_operation(affix, rule->then.substr_act); if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') substring_operation(affix, rule_called->then.undo_substr_next); u_strcat(new_dela_line, affix); } else { u_strcat(new_dela_line, original_word); } /* codes */ u_strcat(new_dela_line,"."); if (rule->then.repl[0] != '\0') { // replacing codes by u_strcat(new_dela_line,rule->then.repl); // suffix' ones } else if (rule_called == 0) { // prohibit SGV u_strcat(new_dela_line,codes); } else if (rule_called->then.repl[0] != '\0') { u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones } // replace replaces all and blocks adding and deleting // maybe this is not optimal ??? else { if (rule_called->then.add[0] != '\0') { // add codes if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) { bool done = 0; unichar tmp[MAX_COMPOSITION_RULE_LENGTH]; int j = 0; for (int i = 0; codes[i] != '\0'; i++) { if (codes[i] == ':' && (!done)) { tmp[j++] = '+'; tmp[j] = '\0'; u_strcat(new_dela_line,tmp); u_strcat(new_dela_line,rule_called->then.add); done = 1; j = 0; } tmp[j++] = codes[i]; } tmp[j] = '\0'; u_strcat(new_dela_line,tmp); if (!done) { u_strcat(new_dela_line,"+"); u_strcat(new_dela_line,rule_called->then.add); } } else { u_strcat(new_dela_line,codes); } } else if (rule_called->then.del[0] != '\0') { // delete codes } else { u_strcat(new_dela_line,codes); } } #if DDEBUG > 0 { u_fprintf(debug_file,"= %S\n",new_dela_line); } #endif struct decomposed_word* wd = new_decomposed_word(); wd->n_parts = n_decomp; u_strcpy(wd->decomposition,decomposition_new); u_strcpy(wd->dela_line,new_dela_line); struct decomposed_word_list* wdl=new_decomposed_word_list(); // unshift actual decomposition to decomposition list L wdl->element = wd; wdl->suivant = (*L); (*L) = wdl; } // end if end of word and valid right component else if // beginning or middle of word: explore the rest of the original word (prefix_is_valid(index,prefix) && check_is_valid(UTAG.PREFIX, dic_entr) && // but only if the current component was a valid left one // we go on with the next component ( (n_decomp == 1) // prefix as first part of a word: no rule matching || ( // prefix in the middle of a word (rule_called && composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) && (dic_entr_called && composition_rule_matches_entry(rule->before, dic_entr_called,debug_file)) ) )) { // one_rule_already_matched = 1; u_strcpy(lemma_prefix_new, lemma_prefix); unichar affix[MAX_WORD_LENGTH]; u_strcpy(affix, current_component); if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') { substring_operation(affix, rule_called->then.undo_substr_next); u_fprintf(debug_file,"yes\n"); } substring_operation(affix, rule->then.substr_act); u_strcat(lemma_prefix_new, affix); int j = 0; for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) { next_remaining_word[j++] = remaining_word[i]; } next_remaining_word[j] = '\0'; if (rule->then.substr_next[0] != '\0') { substring_operation(next_remaining_word, rule->then.substr_next); #if DDEBUG > 0 { u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word); } #endif } #if DDEBUG > 0 { u_fprintf(debug_file,"- %S\n",entry); } #endif struct rule_list* tmp = new_rule_list(rules); tmp->rule = new_composition_rule(); copy_composition_rule(tmp->rule, rule); tmp->next = 0; if ( rule_list_new == 0 ) { rule_list_new = tmp; } else { struct rule_list* trl = rule_list_new; while ( trl->next != 0 ) { trl=trl->next; } trl->next = tmp; } } else { // no valid suffix nor prefix } r_list = r_list->next; } // while ( rule_list* r_list != 0 ) if ( called != 0 ) called = called->next; } while ( called != 0 ); // prefix found, try to decomposite rest of word if ( rule_list_new != 0 && dic_entr != 0 ) { unichar next_component[MAX_WORD_LENGTH]; #if DDEBUG > 0 { u_fprintf(debug_file,"> %S\n",next_remaining_word); } #endif explore_state(4, next_component, 0, original_word, next_remaining_word, 0, decomposition_new, lemma_prefix_new, L, n_decomp+1, rule_list_new, dic_entr, tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries); } else { // free_dic_entry(dic_entr); // free_rule_list(rule_list); } l = l->next; } // end of while (token_list* l != 0) t = adresse+5; } // end of word length >= 1 } else { // not a final state c = c-32768; t = adresse+2; } if (remaining_word[pos_in_remaining_word]=='\0') { // if we have finished, we return // free_dic_entry(dic_entr_called); // free_rule_list(rule_list_called); return; } // if not, we go on with the next letter for (int i=0;i<c;i++) { if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]), remaining_word[pos_in_remaining_word], alphabet) || is_equal_or_uppercase(remaining_word[pos_in_remaining_word], (unichar)(tableau_bin[t]*256+tableau_bin[t+1]), alphabet)) { index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4]; current_component[pos_in_current_component] = (unichar)(tableau_bin[t]*256+tableau_bin[t+1]); explore_state(index, current_component, pos_in_current_component+1, original_word, remaining_word, pos_in_remaining_word+1, decomposition, lemma_prefix, L, n_decomp, rule_list_called, dic_entr_called, tableau_bin, inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries); } t += 5; } }