Beispiel #1
0
//
// Tells whether the given dictionary entry has the grammatical code "N"
// without also having the grammatical code "FF".
// Returns 1 in that case, 0 otherwise.
//
char check_N_not_FF(const struct dela_entry* d) {
unichar noun_code[2];
unichar ff_code[3];
u_strcpy(noun_code,"N");
u_strcpy(ff_code,"FF");
/* Without "N" there is nothing more to check */
if (!dic_entry_contain_gram_code(d,noun_code)) {
   return (char)0;
}
/* "N" is present: the answer only depends on the absence of "FF" */
return (char)(!dic_entry_contain_gram_code(d,ff_code));
}
Beispiel #2
0
/**
 * Checks the code constraints of the given pattern against the given DELAF entry.
 * The entry is accepted (1 is returned) when:
 *  - it contains every grammatical code listed in pattern->grammatical_codes,
 *  - it contains none of the codes listed in pattern->forbidden_codes,
 *  - it contains every inflectional code listed in pattern->inflectional_codes.
 * As soon as one of these conditions is violated, 0 is returned.
 */
int is_compatible_code_pattern(const struct dela_entry* entry,const struct pattern* pattern) {
struct list_ustring* node;
/* Mandatory grammatical codes: all of them must be found */
for (node=pattern->grammatical_codes;node!=NULL;node=node->next) {
   if (!dic_entry_contain_gram_code(entry,node->string)) {
      return 0;
   }
}
/* Forbidden codes: none of them may be found */
for (node=pattern->forbidden_codes;node!=NULL;node=node->next) {
   if (dic_entry_contain_gram_code(entry,node->string)) {
      return 0;
   }
}
/* Mandatory inflectional codes: all of them must be found */
for (node=pattern->inflectional_codes;node!=NULL;node=node->next) {
   if (!dic_entry_contain_inflectional_code(entry,node->string)) {
      return 0;
   }
}
return 1;
}
Beispiel #3
0
/**
 * Tells whether the given dictionary entry has the grammatical code "V"
 * and lacks the inflectional code "Y".
 * Returns 1 in that case, 0 otherwise.
 */
char check_V_but_not_Y(struct dela_entry* d) {
unichar verb_code[2];
unichar y_code[2];
u_strcpy(verb_code,"V");
u_strcpy(y_code,"Y");
if (!dic_entry_contain_gram_code(d,verb_code)) {
   /* Not a "V" entry */
   return 0;
}
if (dic_entry_contain_inflectional_code(d,y_code)) {
   /* A "V" entry, but with the unwanted "Y" inflectional code */
   return 0;
}
return 1;
}
Beispiel #4
0
/**
 * Tells whether the given dictionary entry has both the grammatical
 * code "V" and the inflectional code "W".
 * Returns 1 in that case, 0 otherwise.
 */
char check_VW(const struct dela_entry* d) {
unichar verb_code[2];
unichar w_code[2];
u_strcpy(verb_code,"V");
u_strcpy(w_code,"W");
if (!dic_entry_contain_gram_code(d,verb_code)) {
   return 0;
}
/* The inflectional code is only looked up for "V" entries */
return dic_entry_contain_inflectional_code(d,w_code) ? 1 : 0;
}
Beispiel #5
0
/**
 * Dispatches on pattern->type and tests the given DELAF entry accordingly:
 *  - LEMMA_PATTERN: same lemma;
 *  - CODE_PATTERN: code constraints checked by is_compatible_code_pattern;
 *  - LEMMA_AND_CODE_PATTERN: same lemma and compatible codes;
 *  - FULL_PATTERN: same inflected form, same lemma and compatible codes;
 *  - AMBIGUOUS_PATTERN: pattern->lemma is either the lemma of the entry
 *    or one of its grammatical codes;
 *  - INFLECTED_AND_LEMMA_PATTERN: same inflected form and same lemma.
 * Any other type is reported through fatal_error.
 * Returns a non-zero value if the entry is compatible; 0 otherwise.
 */
int is_entry_compatible_with_pattern(const struct dela_entry* entry,const struct pattern* pattern) {
int compatible=0;
switch(pattern->type) {
   case LEMMA_PATTERN:
      compatible=(!u_strcmp(entry->lemma,pattern->lemma));
      break;
   case CODE_PATTERN:
      compatible=is_compatible_code_pattern(entry,pattern);
      break;
   case LEMMA_AND_CODE_PATTERN:
      compatible=(!u_strcmp(entry->lemma,pattern->lemma))
                 && is_compatible_code_pattern(entry,pattern);
      break;
   case FULL_PATTERN:
      compatible=(!u_strcmp(entry->inflected,pattern->inflected))
                 && (!u_strcmp(entry->lemma,pattern->lemma))
                 && is_compatible_code_pattern(entry,pattern);
      break;
   case AMBIGUOUS_PATTERN:
      compatible=!u_strcmp(entry->lemma,pattern->lemma)
                 || dic_entry_contain_gram_code(entry,pattern->lemma);
      break;
   case INFLECTED_AND_LEMMA_PATTERN:
      compatible=(!u_strcmp(entry->inflected,pattern->inflected))
                 && (!u_strcmp(entry->lemma,pattern->lemma));
      break;
   default:
      fatal_error("Unexpected case in is_entry_compatible_with_pattern\n");
}
return compatible;
}
Beispiel #6
0
/**
 * Tells whether the given INF code describes a valid right "A" component,
 * i.e. an entry that has the grammatical code "A" and does not have the
 * inflectional code "sie".
 * Returns 1 in that case, 0 otherwise.
 */
char check_A_right_component(unichar* s) {
unichar line[2000];
unichar adjective_code[2];
unichar excluded_code[4];
/* The INF code is turned into an artificial dictionary line "x,<s>",
 * so that the DELAF tokenizer can give us its grammatical and
 * inflectional codes in a structured way. */
u_strcpy(line,"x,");
u_strcat(line,s);
u_strcpy(adjective_code,"A");
u_strcpy(excluded_code,"sie");
struct dela_entry* entry=tokenize_DELAF_line(line,0);
char res=0;
if (dic_entry_contain_gram_code(entry,adjective_code)
    && !dic_entry_contain_inflectional_code(entry,excluded_code)) {
   res=1;
}
/* The artificial dictionary entry is no longer needed */
free_dela_entry(entry);
return res;
}
/**
 * Tests the dictionary entry 'd' against the conditions stored in the
 * array 'rule'. The array is read until an element with an empty string
 * is found, or until MAX_NUMBER_OF_COMPOSITION_RULES elements were read.
 *
 * Each condition has a type ('g': grammatical code, 'f': inflectional code)
 * and a polarity (YesNo):
 *  - positive 'g': the entry must contain the code;
 *  - negative 'g' or 'f': the entry must not contain the code;
 *  - positive 'f': these conditions work as alternatives. The test fails
 *    only if a positive 'f' condition is not satisfied before any positive
 *    'f' condition has been satisfied.
 *
 * Returns 1 if the entry satisfies the conditions; 0 otherwise.
 * 'debug_file' is only named (and used) when DDEBUG > 1.
 */
int composition_rule_matches_entry (const struct pattern* rule,
				     const struct dela_entry* d,U_FILE* 
#if DDEBUG > 1                         
				     debug_file
#endif
                     ) {
  /* Becomes 0 as soon as a 'g' condition or a negative 'f' condition fails */
  int codes_ok = 1;
  /* 1: no positive 'f' condition failed or matched yet
   * 2: at least one positive 'f' condition matched
   * 0: a positive 'f' condition failed before any of them matched */
  int flex_state = 1;
#if DDEBUG > 1
    u_strcat(tmp, "   trying ");
#endif
  for (int k = 0;
       k < MAX_NUMBER_OF_COMPOSITION_RULES && rule[k].string[0] != '\0';
       k++) {
#if DDEBUG > 1
    {
      if (rule[k].type == 'f')
	u_strcat(tmp, ":");
      else if (rule[k].YesNo)
	u_strcat(tmp, "+");
      else
	u_strcat(tmp, "-");
      u_strcat(tmp, rule[k].string);
    }
#endif
    switch (rule[k].type) {
      case 'g': {
	int present = dic_entry_contain_gram_code(d,rule[k].string);
	/* '+' requires the code, '-' forbids it */
	if (rule[k].YesNo ? !present : present) {
	  codes_ok = 0;
	}
	break;
      }
      case 'f': {
	int present = dic_entry_contain_inflectional_code(d,rule[k].string);
	if (!rule[k].YesNo) {
	  /* negative inflectional condition: handled although the rule
	   * syntax does not allow to express it */
	  if (present) {
	    codes_ok = 0;
	  }
	}
	else if (present) {
	  /* from now on, failing positive 'f' conditions do not matter */
	  flex_state = 2;
	}
	else if (flex_state != 2) {
	  /* failure before the first positive 'f' match */
	  flex_state = 0;
	}
	break;
      }
      default:
	/* other condition types are ignored */
	break;
    }
  }
#if DDEBUG > 1
  {
    if (codes_ok && flex_state) u_fprintf(debug_file,"\n   === matched ");
    else u_fprintf(debug_file,"\n   === not matched ");
    if ( d->semantic_codes != 0 ) {
      for (int i = 0; i < d->n_semantic_codes; i++) {
         u_fprintf(debug_file,"+%S",d->semantic_codes[i]);
      }
    }
    if ( d->inflectional_codes != 0 ) {
      for (int i = 0; i < d->n_inflectional_codes; i++) {
         u_fprintf(debug_file,":%S",d->inflectional_codes[i]);
      }
    }
    u_fprintf(debug_file,"\n");
  }
#endif
  return (codes_ok && flex_state);
}
/**
 * Tells whether the dictionary entry 'd' contains the grammatical code 't'.
 * The result is the value returned by dic_entry_contain_gram_code.
 */
int check_is_valid(const unichar* t, const struct dela_entry* d)
{
  const int has_code = dic_entry_contain_gram_code(d, t);
  return has_code;
}
Beispiel #9
0
//
// Recursively explores the compressed dictionary stored in 'tableau_bin' in
// order to decompose 'original_word' into a sequence of dictionary components.
//
// State layout as read by this function:
//  - 2 bytes at 'adresse': a 16-bit value 'c'. When bit 15 (32768) is clear,
//    the state is terminal and the next 3 bytes give an index into
//    inf_codes->codes; otherwise bit 15 is removed from 'c'.
//  - 'c' transitions of 5 bytes each: 2 bytes for the character and 3 bytes
//    for the address of the destination state.
//
// On a terminal state reached with a non-empty current component, every INF
// code of that state is uncompressed into a dictionary line and tested:
//  - if the whole remaining word has been consumed and the composition rules
//    match (or if this is a single non-affix entry), a new DELA line is built
//    and a decomposed_word is pushed at the head of (*L);
//  - otherwise, if the component is a valid prefix, the exploration of the rest
//    of the word restarts from address 4 with n_decomp+1.
// After that, unless the remaining word is exhausted, the outgoing transitions
// matching the next character are followed recursively.
//
// Parameters:
//  adresse                  offset of the state to explore in tableau_bin
//  current_component        buffer in which the current component is built
//  pos_in_current_component number of characters already stored in it
//  original_word            the word being analysed; used as the inflected form
//                           of the produced DELA lines
//  remaining_word           the part of the word currently being matched
//  pos_in_remaining_word    current position in remaining_word
//  decomposition            textual log of the entries matched so far
//  lemma_prefix             lemma text produced by the previous components
//  L                        in/out: head of the list of results
//  n_decomp                 value stored in n_parts of the produced results
//  rule_list_called         rules attached to the previous component (may be 0)
//  dic_entr_called          dictionary entry of the previous component (0 if none)
//  tableau_bin, inf_codes   compressed dictionary and its INF codes
//  prefix, suffix           tables queried with the INF index through
//                           prefix_is_valid/suffix_is_valid/affix_is_valid
//  alphabet                 used by is_equal_or_uppercase for case matching
//  debug_file               destination of the debug traces
//  UTAG                     passed to parse_rules; UTAG.PREFIX is the code used
//                           to recognise prefix entries
//  rules, entries           passed to new_rule_list/parse_rules and
//                           new_dic_entry (presumably they keep track of the
//                           allocated objects - TODO confirm)
//
void explore_state (int adresse,
		    unichar* current_component,
		    int pos_in_current_component,
		    const unichar* original_word,
		    const unichar* remaining_word,
		    int pos_in_remaining_word,
		    const unichar* decomposition,
		    const unichar* lemma_prefix,
		    struct decomposed_word_list** L,
		    int n_decomp,
		    struct rule_list* rule_list_called,
		    const struct dela_entry* dic_entr_called,
		    const unsigned char* tableau_bin,
		    const struct INF_codes* inf_codes,
		    const bool* prefix,const bool* suffix,const Alphabet* alphabet,
		    U_FILE* debug_file,struct utags UTAG,
		    vector_ptr* rules,vector_ptr* entries)
{

  // 16-bit header of the state: terminal flag (bit 15) + number of transitions
  int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
  int index;
  // offset of the first transition of this state
  int t = 0;

  if ( !(c&32768) ) { // if we are in a terminal state

    // 3-byte index of the INF codes attached to this terminal state
    index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
    current_component[pos_in_current_component] = '\0';

    if (pos_in_current_component >= 1) {
      // we only go on if the current component is not empty

#if DDEBUG > 0
      {
         u_fprintf(debug_file,". %S\n",current_component);
      }
#endif

      // loop on all the INF codes of the current component
      struct list_ustring* l = inf_codes->codes[index];
      while ( l != 0 ) {

//	int one_rule_already_matched = 0; // one rule matched each entry is enough

	// rebuild the full dictionary line from the component and its INF code
	unichar entry[MAX_DICT_LINE_LENGTH];
	uncompress_entry(current_component, l->string, entry);

#if DDEBUG > 0
	{
	  u_fprintf(debug_file,": %S\n",entry);
	}
#endif

	struct dela_entry* dic_entr = new_dic_entry(entry,entries);

	// values handed over to the recursive call for the next component
	unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
	struct rule_list* rule_list_new = 0;
	unichar next_remaining_word[MAX_WORD_LENGTH];

	// affixes get their rules parsed from the entry; any other entry
	// gets a list made of one freshly created composition rule
	struct rule_list* rule_list = 0;
	if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
	  rule_list = parse_rules(entry,UTAG,rules);
	else {
	  rule_list = new_rule_list(rules);
	  rule_list->rule = new_composition_rule();
	}
	// entry is now cleaned from rules for composition and derivation

	// log decomposition of word
	// ("cleaned" entries for better overview)
	unichar decomposition_new[MAX_DICT_LINE_LENGTH];
	u_strcpy(decomposition_new, decomposition);
	if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
	u_strcat(decomposition_new, entry);


	// loop on all composition_rules called
	// (do/while: the body is executed once even if rule_list_called is 0)
	struct rule_list* called = rule_list_called;
	do { // while ( rule_list* called != 0 )

// 	  if (one_rule_already_matched)
// 	    break;

 	  struct composition_rule* rule_called
	    = ( called != 0 ) ? called->rule : 0; // may be undefined

	  // loop on all actual composition_rules
	  struct rule_list* r_list = rule_list;
 	  while ( r_list != 0 ) {

// 	    if (one_rule_already_matched)
// 	      break;

	    struct composition_rule* rule = r_list->rule; // ever defined, see upwards

	    if (remaining_word[pos_in_remaining_word]=='\0' &&
		// we have explored the entire original word
		((((dic_entr_called != 0) &&
		   composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))  &&
		  ((rule_called != 0) &&
		   composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) ||
		 // and we have a valid right component, i.e. rules match
		 ((dic_entr_called == 0) &&  // or a simple entry (i.e. no prefix),
		  (! affix_is_valid(index,prefix,suffix))) // but no affix
		 )
		)  {

//	      one_rule_already_matched = 1;

	      unichar inflected[MAX_WORD_LENGTH];
	      unichar lemma[MAX_WORD_LENGTH];
	      unichar codes[MAX_DICT_LINE_LENGTH];
	      tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);

	      /* generating new lexicon entry */
	      unichar new_dela_line[MAX_DICT_LINE_LENGTH];

	      /* word form */
	      u_strcpy(new_dela_line, original_word);
	      u_strcat(new_dela_line, ",");

	      /* lemma */                           // lemmatize word
	      // NOTE(review): when rule_called is 0, this condition is false and
	      // the original word is used as lemma - confirm this is intended
	      if (rule->then.repl[0] == '\0'	    // if there are no replace codes
		  && (rule_called != 0              // neither in the current nor in the preceding rule
		      && rule_called->then.repl[0] == '\0')) {
		u_strcat(new_dela_line, lemma_prefix);
		unichar affix[MAX_WORD_LENGTH];
		u_strcpy(affix, lemma);
		substring_operation(affix, rule->then.substr_act);
		if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
		  substring_operation(affix, rule_called->then.undo_substr_next);
		u_strcat(new_dela_line, affix);
	      } else {
		u_strcat(new_dela_line, original_word);
	      }

	      /* codes */
	      u_strcat(new_dela_line,".");
	      if (rule->then.repl[0] != '\0') {            // replacing codes by
		u_strcat(new_dela_line,rule->then.repl);   // suffix' ones
	      }
	      else if (rule_called == 0) { // prohibit SGV
		u_strcat(new_dela_line,codes);
	      }
	      else if (rule_called->then.repl[0] != '\0') {
		u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
	      }
	      // replace replaces all and blocks adding and deleting
	      // maybe this is not optimal ???
	      else {
		if (rule_called->then.add[0] != '\0') {        // add codes
		  if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
		    // the code to add is inserted as "+<add>" just before the
		    // first ':' of 'codes', or appended if there is no ':'
		    // NOTE(review): tmp holds MAX_COMPOSITION_RULE_LENGTH
		    // characters while it is filled from 'codes', which is
		    // declared with MAX_DICT_LINE_LENGTH - verify the sizes
		    bool done = 0;
		    unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
		    int j = 0;
		    for (int i = 0; codes[i] != '\0'; i++) {
		      if (codes[i] == ':' && (!done)) {
			tmp[j++] = '+';
			tmp[j] = '\0';
			u_strcat(new_dela_line,tmp);
			u_strcat(new_dela_line,rule_called->then.add);
			done = 1;
			j = 0;
		      }
		      tmp[j++] = codes[i];
		    }
		    tmp[j] = '\0';
		    u_strcat(new_dela_line,tmp);
		    if (!done) {
		      u_strcat(new_dela_line,"+");
		      u_strcat(new_dela_line,rule_called->then.add);
		    }
		  } else {
		    u_strcat(new_dela_line,codes);
		  }
		} else if (rule_called->then.del[0] != '\0') { // delete codes
		  // NOTE(review): this branch is empty, so no code at all is
		  // appended to the line in that case - looks unimplemented
		} else {
		  u_strcat(new_dela_line,codes);
		}
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"= %S\n",new_dela_line);
	      }
#endif

	      // store the result
	      struct decomposed_word* wd = new_decomposed_word();
	      wd->n_parts = n_decomp;
	      u_strcpy(wd->decomposition,decomposition_new);
	      u_strcpy(wd->dela_line,new_dela_line);
	      struct decomposed_word_list* wdl=new_decomposed_word_list();
	      // unshift actual decomposition to decomposition list L
	      wdl->element = wd;
	      wdl->suivant = (*L);
	      (*L) = wdl;

	    } // end if end of word and valid right component
	    else if
	      // beginning or middle of word: explore the rest of the original word
	      (prefix_is_valid(index,prefix) &&
	       check_is_valid(UTAG.PREFIX, dic_entr) &&
	       // but only if the current component was a valid left one
	       // we go on with the next component
	       (
		(n_decomp == 1) // prefix as first part of a word: no rule matching
		||
		(               // prefix in the middle of a word
		 (rule_called &&
		  composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
		 (dic_entr_called &&
		  composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
		)
	       )) {

//	      one_rule_already_matched = 1;

	      // extend the lemma prefix with the (possibly modified) component
	      u_strcpy(lemma_prefix_new, lemma_prefix);
	      unichar affix[MAX_WORD_LENGTH];
	      u_strcpy(affix, current_component);
	      if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
            substring_operation(affix, rule_called->then.undo_substr_next);
            // NOTE(review): this trace is not protected by a DDEBUG test,
            // unlike the other ones - confirm it is not a debug leftover
            u_fprintf(debug_file,"yes\n");
	      }
	      substring_operation(affix, rule->then.substr_act);
	      u_strcat(lemma_prefix_new, affix);
	      // copy what is left of the word for the next component
	      int j = 0;
	      for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
            next_remaining_word[j++] = remaining_word[i];
         }
	      next_remaining_word[j] = '\0';
	      if (rule->then.substr_next[0] != '\0') {
            substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
            {
               u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word);
            }
#endif
	      }

#if DDEBUG > 0
	      {
            u_fprintf(debug_file,"- %S\n",entry);
	      }
#endif
	      // append a copy of the matching rule at the end of rule_list_new
	      struct rule_list* tmp = new_rule_list(rules);
	      tmp->rule = new_composition_rule();
	      copy_composition_rule(tmp->rule, rule);
	      tmp->next = 0;
	      if ( rule_list_new == 0 ) {
            rule_list_new = tmp;
	      }
	      else {
            struct rule_list* trl = rule_list_new;
            while ( trl->next != 0 ) {
               trl=trl->next;
            }
            trl->next = tmp;
	      }

	    }
	    else {
	      // no valid suffix nor prefix
	    }

	    r_list = r_list->next;
	  } // while ( rule_list* r_list != 0 )

	  if ( called != 0 )
	    called = called->next;
	} while ( called != 0 );

	// prefix found, try to decompose the rest of the word
	// NOTE(review): lemma_prefix_new and next_remaining_word hold the values
	// written by the last matching rule only, whereas rule_list_new contains
	// all the matching rules
	if ( rule_list_new != 0 && dic_entr != 0 ) {
	  unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
	  {
	    u_fprintf(debug_file,"> %S\n",next_remaining_word);
	  }
#endif
	  // 4 is the address from which the exploration of a new component starts
	  explore_state(4,
			next_component,
			0,
			original_word,
			next_remaining_word,
			0,
			decomposition_new,
			lemma_prefix_new,
			L,
			n_decomp+1,
			rule_list_new,
			dic_entr,
			tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
	}
	else {
// 	  free_dic_entry(dic_entr);
// 	  free_rule_list(rule_list);
	}

	l = l->next;

      } // end of while (token_list* l != 0)

      // the transitions start after the 2-byte header and the 3-byte INF index
      // NOTE(review): t is only set here, i.e. when the component is not empty;
      // for a terminal state reached with an empty component, t keeps its
      // initial value 0 - check whether this assignment should be done for
      // every terminal state
      t = adresse+5;

    } // end of word length >= 1
  }
  else { // not a final state
    // remove the flag bit to get the number of transitions
    c = c-32768;
    t = adresse+2;
  }
  if (remaining_word[pos_in_remaining_word]=='\0') {
    // if we have finished, we return
//     free_dic_entry(dic_entr_called);
//     free_rule_list(rule_list_called);
    return;
  }
  // if not, we go on with the next letter
  for (int i=0;i<c;i++) {
    // the comparison is done in both directions, so that case variants are
    // accepted whichever side is the uppercase one
    if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       remaining_word[pos_in_remaining_word],
			       alphabet)
	||
	is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
			       (unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
			       alphabet)) {
      // address of the destination state of this transition
      index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      // the character stored is the one of the dictionary, not the one of the word
      current_component[pos_in_current_component] =
	(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state(index,
		    current_component,
		    pos_in_current_component+1,
		    original_word,
		    remaining_word,
		    pos_in_remaining_word+1,
		    decomposition,
		    lemma_prefix,
		    L,
		    n_decomp,
		    rule_list_called,
		    dic_entr_called,
		    tableau_bin,
		    inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
    }
    // next transition
    t += 5;
  }
}
Beispiel #10
0
/**
 * Tells whether the given dictionary entry has the grammatical code "A".
 * The result is the value of dic_entry_contain_gram_code converted to char.
 */
char check_A(struct dela_entry* d) {
unichar adjective_code[2];
u_strcpy(adjective_code,"A");
char found=(char)dic_entry_contain_gram_code(d,adjective_code);
return found;
}
Beispiel #11
0
/**
 * Tells whether the given dictionary entry has the grammatical code "ADV".
 * The result is the value of dic_entry_contain_gram_code converted to char.
 */
char check_ADV(const struct dela_entry* d) {
unichar adverb_code[4];
u_strcpy(adverb_code,"ADV");
char found=(char)dic_entry_contain_gram_code(d,adverb_code);
return found;
}