Exemplo n.º 1
0
struct rule_list* parse_rules (unichar* entry,struct utags UTAG,vector_ptr* rules)
{
  // parses dictionary entry to extract rules for derivation and composition

  struct rule_list* rule_list = new_rule_list(rules);
  struct rule_list* actual_list_pos = rule_list;

  unichar cleaned_entry[MAX_DICT_LINE_LENGTH]; // rules will be stripped off
  unichar beforcond[MAX_COMPOSITION_RULE_LENGTH];
  unichar aftercond[MAX_COMPOSITION_RULE_LENGTH];
  unichar then_code[MAX_COMPOSITION_RULE_LENGTH];

  int bcpos, acpos, tpos;
  bcpos = acpos = tpos = 0;
  enum { BEGIN, BEFORE_COND, AFTER_COND, THEN };
  int state = 0;
  int k = 0;
  for (int i = 0; entry[i] != '\0'; i++) {
    if ( state != BEGIN ) { // inside a rule
      if (entry[i] == '\\')
	i++; // unescaping escaped chars in rule
      if (entry[i] == ')') {
	// end of rule
 	struct composition_rule* rule = new_composition_rule();
	beforcond[bcpos] = '\0';
	aftercond[acpos] = '\0';
	then_code[tpos]  = '\0';
	parse_condition(beforcond, rule->before);
	parse_condition(aftercond, rule->after);
	parse_then_code(then_code, &rule->then);
	bcpos = acpos = tpos = 0;
	if (actual_list_pos->rule != 0) { // not first rule
	  struct rule_list* tmp = new_rule_list(rules);
	  actual_list_pos->next = tmp;
	  actual_list_pos = tmp;
	}
	actual_list_pos->rule = rule;
	state = BEGIN;
      }
      else if (state == BEFORE_COND) {
	// condition before
	if (entry[i] == '#')
	  state = AFTER_COND;
	else
	  beforcond[bcpos++] = entry[i];
      }
      else if (state == AFTER_COND) {
	// condition after
	if (entry[i] == '=')
	  state = THEN;
	else
	  aftercond[acpos++] = entry[i];
      }
      else if (state == THEN)
	// then-code
	then_code[tpos++] = entry[i];
    }
    else { // not inside a rule
      if (entry[i] == '+') {
	unichar tmp[MAX_DICT_LINE_LENGTH];
	int j;
	for (j = i+1; ((entry[j] != '+') &&
		       (entry[j] != ':') &&
		       (entry[j] != '(') &&
		       (entry[j] != '\0')); j++)
	  tmp[j-(i+1)] = entry[j];
	tmp[j-(i+1)] = '\0';
	if ((!u_strcmp(tmp, UTAG.PREFIX)) ||
	    (!u_strcmp(tmp, UTAG.SUFFIX))) {
	  i = j-1;
	}
	else if (!u_strcmp(tmp, UTAG.RULE)) {
	  i = j; // including '('
	  state = BEFORE_COND;
	}
	else {
	  cleaned_entry[k++] = entry[i];
	}
      } else {
	cleaned_entry[k++] = entry[i];
      }
    }
  }
  cleaned_entry[k] = '\0';
  u_strcpy(entry, cleaned_entry);
  if (rule_list->rule == 0)
    rule_list->rule = new_composition_rule();
  return rule_list;
}
Exemplo n.º 2
0
//
// this function explores the dictionary to decompose the word mot
//
void explore_state (int adresse,
		    unichar* current_component,
		    int pos_in_current_component,
		    const unichar* original_word,
		    const unichar* remaining_word,
		    int pos_in_remaining_word,
		    const unichar* decomposition,
		    const unichar* lemma_prefix,
		    struct decomposed_word_list** L,
		    int n_decomp,
		    struct rule_list* rule_list_called,
		    const struct dela_entry* dic_entr_called,
		    const unsigned char* tableau_bin,
		    const struct INF_codes* inf_codes,
		    const bool* prefix,const bool* suffix,const Alphabet* alphabet,
		    U_FILE* debug_file,struct utags UTAG,
		    vector_ptr* rules,vector_ptr* entries)
{

  int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
  int index;
  int t = 0;

  if ( !(c&32768) ) { // if we are in a terminal state

    index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
    current_component[pos_in_current_component] = '\0';

    if (pos_in_current_component >= 1) {
      // go on if word length equals zero

#if DDEBUG > 0
      {
         u_fprintf(debug_file,". %S\n",current_component);
      }
#endif

      struct list_ustring* l = inf_codes->codes[index];
      while ( l != 0 ) {

//	int one_rule_already_matched = 0; // one rule matched each entry is enough

	unichar entry[MAX_DICT_LINE_LENGTH];
	uncompress_entry(current_component, l->string, entry);

#if DDEBUG > 0
	{
	  u_fprintf(debug_file,": %S\n",entry);
	}
#endif

	struct dela_entry* dic_entr = new_dic_entry(entry,entries);

	unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
	struct rule_list* rule_list_new = 0;
	unichar next_remaining_word[MAX_WORD_LENGTH];

	struct rule_list* rule_list = 0;
	if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
	  rule_list = parse_rules(entry,UTAG,rules);
	else {
	  rule_list = new_rule_list(rules);
	  rule_list->rule = new_composition_rule();
	}
	// entry is now cleaned from rules for composition and derivation

	// log decomposition of word
	// ("cleaned" entries for better overview)
	unichar decomposition_new[MAX_DICT_LINE_LENGTH];
	u_strcpy(decomposition_new, decomposition);
	if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
	u_strcat(decomposition_new, entry);


	// loop on all composition_rules called
	struct rule_list* called = rule_list_called;
	do { // while ( rule_list* called != 0 )

// 	  if (one_rule_already_matched)
// 	    break;

 	  struct composition_rule* rule_called
	    = ( called != 0 ) ? called->rule : 0; // may be undefined

	  // loop on all actual composition_rules
	  struct rule_list* r_list = rule_list;
 	  while ( r_list != 0 ) {

// 	    if (one_rule_already_matched)
// 	      break;

	    struct composition_rule* rule = r_list->rule; // ever defined, see upwards

	    if (remaining_word[pos_in_remaining_word]=='\0' &&
		// we have explored the entire original word
		((((dic_entr_called != 0) &&
		   composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))  &&
		  ((rule_called != 0) &&
		   composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) ||
		 // and we have a valid right component, i.e. rules match
		 ((dic_entr_called == 0) &&  // or a simple entry (i.e. no prefix),
		  (! affix_is_valid(index,prefix,suffix))) // but no affix
		 )
		)  {

//	      one_rule_already_matched = 1;

	      unichar inflected[MAX_WORD_LENGTH];
	      unichar lemma[MAX_WORD_LENGTH];
	      unichar codes[MAX_DICT_LINE_LENGTH];
	      tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);

	      /* generating new lexicon entry */
	      unichar new_dela_line[MAX_DICT_LINE_LENGTH];

	      /* word form */
	      u_strcpy(new_dela_line, original_word);
	      u_strcat(new_dela_line, ",");

	      /* lemma */                           // lemmatize word
	      if (rule->then.repl[0] == '\0'	    // if there are no replace codes
		  && (rule_called != 0              // either in actual nor in preceeding rule
		      && rule_called->then.repl[0] == '\0')) {
		u_strcat(new_dela_line, lemma_prefix);
		unichar affix[MAX_WORD_LENGTH];
		u_strcpy(affix, lemma);
		substring_operation(affix, rule->then.substr_act);
		if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
		  substring_operation(affix, rule_called->then.undo_substr_next);
		u_strcat(new_dela_line, affix);
	      } else {
		u_strcat(new_dela_line, original_word);
	      }

	      /* codes */
	      u_strcat(new_dela_line,".");
	      if (rule->then.repl[0] != '\0') {            // replacing codes by
		u_strcat(new_dela_line,rule->then.repl);   // suffix' ones
	      }
	      else if (rule_called == 0) { // prohibit SGV
		u_strcat(new_dela_line,codes);
	      }
	      else if (rule_called->then.repl[0] != '\0') {
		u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
	      }
	      // replace replaces all and blocks adding and deleting
	      // maybe this is not optimal ???
	      else {
		if (rule_called->then.add[0] != '\0') {        // add codes
		  if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
		    bool done = 0;
		    unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
		    int j = 0;
		    for (int i = 0; codes[i] != '\0'; i++) {
		      if (codes[i] == ':' && (!done)) {
			tmp[j++] = '+';
			tmp[j] = '\0';
			u_strcat(new_dela_line,tmp);
			u_strcat(new_dela_line,rule_called->then.add);
			done = 1;
			j = 0;
		      }
		      tmp[j++] = codes[i];
		    }
		    tmp[j] = '\0';
		    u_strcat(new_dela_line,tmp);
		    if (!done) {
		      u_strcat(new_dela_line,"+");
		      u_strcat(new_dela_line,rule_called->then.add);
		    }
		  } else {
		    u_strcat(new_dela_line,codes);
		  }
		} else if (rule_called->then.del[0] != '\0') { // delete codes

		} else {
		  u_strcat(new_dela_line,codes);
		}
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"= %S\n",new_dela_line);
	      }
#endif

	      struct decomposed_word* wd = new_decomposed_word();
	      wd->n_parts = n_decomp;
	      u_strcpy(wd->decomposition,decomposition_new);
	      u_strcpy(wd->dela_line,new_dela_line);
	      struct decomposed_word_list* wdl=new_decomposed_word_list();
	      // unshift actual decomposition to decomposition list L
	      wdl->element = wd;
	      wdl->suivant = (*L);
	      (*L) = wdl;

	    } // end if end of word and valid right component
	    else if
	      // beginning or middle of word: explore the rest of the original word
	      (prefix_is_valid(index,prefix) &&
	       check_is_valid(UTAG.PREFIX, dic_entr) &&
	       // but only if the current component was a valid left one
	       // we go on with the next component
	       (
		(n_decomp == 1) // prefix as first part of a word: no rule matching
		||
		(               // prefix in the middle of a word
		 (rule_called &&
		  composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
		 (dic_entr_called &&
		  composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
		)
	       )) {

//	      one_rule_already_matched = 1;

	      u_strcpy(lemma_prefix_new, lemma_prefix);
	      unichar affix[MAX_WORD_LENGTH];
	      u_strcpy(affix, current_component);
	      if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
            substring_operation(affix, rule_called->then.undo_substr_next);
            u_fprintf(debug_file,"yes\n");
	      }
	      substring_operation(affix, rule->then.substr_act);
	      u_strcat(lemma_prefix_new, affix);
	      int j = 0;
	      for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
            next_remaining_word[j++] = remaining_word[i];
         }
	      next_remaining_word[j] = '\0';
	      if (rule->then.substr_next[0] != '\0') {
            substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
            {
               u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word);
            }
#endif
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"- %S\n",entry);
	      }
#endif
	      struct rule_list* tmp = new_rule_list(rules);
	      tmp->rule = new_composition_rule();
	      copy_composition_rule(tmp->rule, rule);
	      tmp->next = 0;
	      if ( rule_list_new == 0 ) {
            rule_list_new = tmp;
	      }
	      else {
            struct rule_list* trl = rule_list_new;
            while ( trl->next != 0 ) {
               trl=trl->next;
            }
            trl->next = tmp;
	      }

	    }
	    else {
	      // no valid suffix nor prefix
	    }

	    r_list = r_list->next;
	  } // while ( rule_list* r_list != 0 )

	  if ( called != 0 )
	    called = called->next;
	} while ( called != 0 );

	// prefix found, try to decomposite rest of word
	if ( rule_list_new != 0 && dic_entr != 0 ) {
	  unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
	  {
	    u_fprintf(debug_file,"> %S\n",next_remaining_word);
	  }
#endif
	  explore_state(4,
			next_component,
			0,
			original_word,
			next_remaining_word,
			0,
			decomposition_new,
			lemma_prefix_new,
			L,
			n_decomp+1,
			rule_list_new,
			dic_entr,
			tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
	}
	else {
// 	  free_dic_entry(dic_entr);
// 	  free_rule_list(rule_list);
	}

	l = l->next;

      } // end of while (token_list* l != 0)

      t = adresse+5;

    } // end of word length >= 1
  }
  else { // not a final state
    c = c-32768;
    t = adresse+2;
  }
  if (remaining_word[pos_in_remaining_word]=='\0') {
    // if we have finished, we return
//     free_dic_entry(dic_entr_called);
//     free_rule_list(rule_list_called);
    return;
  }
  // if not, we go on with the next letter
  for (int i=0;i<c;i++) {
    if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       remaining_word[pos_in_remaining_word],
			       alphabet)
	||
	is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
			       (unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       alphabet)) {
      index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      current_component[pos_in_current_component] =
	(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state(index,
		    current_component,
		    pos_in_current_component+1,
		    original_word,
		    remaining_word,
		    pos_in_remaining_word+1,
		    decomposition,
		    lemma_prefix,
		    L,
		    n_decomp,
		    rule_list_called,
		    dic_entr_called,
		    tableau_bin,
		    inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
    }
    t += 5;
  }
}