Example #1
0
/**
 * Explores the transitions that outgo from the given state.
 * Returns 1 if a recursion is found; 0 otherwise.
 */
int explore_state(int state_number,struct list_int* l,Fst2* fst2,int* graphs_matching_E,U_FILE*ferr) {
Fst2State s=fst2->states[state_number];
int ret=0;
if (s==NULL) return 0;
if (is_bit_mask_set(s->control,TMP_LOOP_MARK|VISITED_MARK)) {
   /* If this state has already been processed */
   return 0;
}
set_bit_mask(&(s->control),TMP_LOOP_MARK|VISITED_MARK);
Transition* list=s->transitions;
while (list!=NULL) {
   if (list->tag_number<0) {
      /* If we have a subgraph call */
      if (look_for_recursion(-(list->tag_number),l,fst2,graphs_matching_E,ferr)) {
         /* If there is a recursion */
         return 1;
      }
      if (graphs_matching_E[-list->tag_number] && explore_state(list->state_number,l,fst2,graphs_matching_E,ferr)) {
         /* If the graph matches <E> */
         return 1;
      }
   } else if (fst2->tags[list->tag_number]->control==1) {
      /* If we have a transition that can match <E> */
      if (explore_state(list->state_number,l,fst2,graphs_matching_E,ferr)) {
         return 1;
      }
   }
   list=list->next;
}
unset_bit_mask(&(s->control),TMP_LOOP_MARK);
return ret;
}
Example #2
0
//
// this function try to analyse an unknown russian word
//
int analyse_word(const unichar* mot,const unsigned char* tableau_bin,U_FILE* debug,U_FILE* result_file,
                 const struct INF_codes* inf_codes,const bool* prefix,const bool* suffix,const Alphabet* alphabet,
                 struct utags UTAG,vector_ptr* rules,vector_ptr* entries)
{
#if DDEBUG > 0
  {
    u_fprintf(debug,"\n  %S\n",mot);
  }
#endif

  unichar decomposition[MAX_DICT_LINE_LENGTH];
  unichar dela_line[MAX_DICT_LINE_LENGTH];
  unichar correct_word[MAX_DICT_LINE_LENGTH];
  decomposition[0]='\0';
  dela_line[0]='\0';
  correct_word[0]='\0';
  struct decomposed_word_list* l = 0;
  explore_state(4,correct_word,0,mot,mot,0,decomposition,dela_line,&l,1,0,0,tableau_bin,
        inf_codes,prefix,suffix,alphabet,debug,UTAG,rules,entries);
  free_all_dic_entries(entries);
  free_all_rule_lists(rules);
  if ( l == 0 ) {
    return 0;
  }
  struct decomposed_word_list* tmp = l;
  while ( tmp != NULL ) {
	  if (debug!=NULL) {
	     u_fprintf(debug,"%S = %S\n",mot,tmp->element->decomposition);
	  }
	  u_fprintf(result_file,"%S\n",tmp->element->dela_line);
     tmp=tmp->suivant;
  }
  free_decomposed_word_list(l);
  return 1;
}
Example #3
0
/**
 * Looks for a recursion involving the graph #n, 'l' being the list of
 * the graphs that are currently being explored.
 *
 * If #n is already in 'l', the list of graphs is printed together with
 * an error message (also written to 'ferr' when it is not NULL) and
 * 1 is returned. Otherwise, #n is put at the head of the list, the graph
 * is explored from its initial state, the head is removed, and the
 * result of the exploration is returned.
 */
int look_for_recursion(int n,struct list_int* l,Fst2* fst2,int* graphs_matching_E,U_FILE*ferr) {
if (!is_in_list(n,l)) {
   /* First time we meet this graph on the current path */
   struct list_int* path=new_list_int(n,l);
   int found=explore_state(fst2->initial_states[n],path,fst2,graphs_matching_E,ferr);
   delete_head(&path);
   return found;
}
/* The graph has already been visited: we report the recursion */
print_reversed_list(l,n,fst2->graph_names,ferr);
error(" recalls the graph %S\n",fst2->graph_names[n]);
if (ferr != NULL) {
   u_fprintf(ferr," recalls the graph %S\n",fst2->graph_names[n]);
}
return 1;
}
Example #4
0
//
// Explores the binary dictionary in order to decompose a word into a
// sequence of dictionary components linked by composition rules.
//
// Node layout, as read by this function:
//   - 2 bytes: number of outgoing transitions; the highest bit is clear
//     when the node is a terminal state and set otherwise
//   - terminal states only: 3 bytes giving the index of the INF codes
//   - for each transition: 2 bytes for the letter followed by 3 bytes
//     for the offset of the destination node
//
// Parameters:
//   adresse                  offset of the current node in 'tableau_bin'
//   current_component        buffer receiving the letters of the component
//                            being read
//   pos_in_current_component number of letters already stored in
//                            'current_component'
//   original_word            the whole word; used as inflected form of the
//                            generated DELA lines
//   remaining_word           the text that remains to be analysed
//   pos_in_remaining_word    current position in 'remaining_word'
//   decomposition            analysis of the previous components
//                            (components are separated by " +++ ")
//   lemma_prefix             lemma part built from the previous components
//   L                        list that receives every decomposition found
//                            (new elements are put at its head)
//   n_decomp                 number of the current component (1 = first one)
//   rule_list_called         rules selected for the previous component,
//                            0 for the first component
//   dic_entr_called          dictionary entry of the previous component,
//                            0 for the first component
//   tableau_bin, inf_codes   the compressed dictionary and its INF codes
//   prefix, suffix           tables given to prefix_is_valid(),
//                            suffix_is_valid() and affix_is_valid() together
//                            with an INF index
//   alphabet                 used for the case-tolerant letter comparison
//   debug_file               debug output; it may be NULL (analyse_word()
//                            tests it before writing)
//   UTAG, rules, entries     given to parse_rules(), new_rule_list() and
//                            new_dic_entry(); presumably 'rules' and
//                            'entries' keep track of the allocated objects
//                            so that the caller can free them all at once
//                            -- TODO confirm
//
void explore_state (int adresse,
                    unichar* current_component,
                    int pos_in_current_component,
                    const unichar* original_word,
                    const unichar* remaining_word,
                    int pos_in_remaining_word,
                    const unichar* decomposition,
                    const unichar* lemma_prefix,
                    struct decomposed_word_list** L,
                    int n_decomp,
                    struct rule_list* rule_list_called,
                    const struct dela_entry* dic_entr_called,
                    const unsigned char* tableau_bin,
                    const struct INF_codes* inf_codes,
                    const bool* prefix,const bool* suffix,const Alphabet* alphabet,
                    U_FILE* debug_file,struct utags UTAG,
                    vector_ptr* rules,vector_ptr* entries)
{

  int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
  int index;
  int t = 0;

  if ( !(c&32768) ) { // if we are in a terminal state

    index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
    current_component[pos_in_current_component] = '\0';

    if (pos_in_current_component >= 1) {
      // we only consider components that contain at least one letter

#if DDEBUG > 0
      {
         u_fprintf(debug_file,". %S\n",current_component);
      }
#endif

      // loop on all the INF codes of the current component
      struct list_ustring* l = inf_codes->codes[index];
      while ( l != 0 ) {

        unichar entry[MAX_DICT_LINE_LENGTH];
        uncompress_entry(current_component, l->string, entry);

#if DDEBUG > 0
        {
          u_fprintf(debug_file,": %S\n",entry);
        }
#endif

        struct dela_entry* dic_entr = new_dic_entry(entry,entries);

        unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
        struct rule_list* rule_list_new = 0;
        unichar next_remaining_word[MAX_WORD_LENGTH];

        // affixes carry their own rules; any other entry gets a list made
        // of a single freshly created rule, so that the list is never empty
        struct rule_list* rule_list = 0;
        if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
          rule_list = parse_rules(entry,UTAG,rules);
        else {
          rule_list = new_rule_list(rules);
          rule_list->rule = new_composition_rule();
        }
        // entry is now cleaned from rules for composition and derivation

        // log decomposition of word
        // ("cleaned" entries for better overview)
        unichar decomposition_new[MAX_DICT_LINE_LENGTH];
        u_strcpy(decomposition_new, decomposition);
        if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
        u_strcat(decomposition_new, entry);


        // loop on all composition rules called by the previous component;
        // the body is executed once with no called rule when the list is empty
        struct rule_list* called = rule_list_called;
        do {

          struct composition_rule* rule_called
            = ( called != 0 ) ? called->rule : 0; // may be undefined

          // loop on all actual composition_rules
          struct rule_list* r_list = rule_list;
          while ( r_list != 0 ) {

            struct composition_rule* rule = r_list->rule; // always defined, see above

            if (remaining_word[pos_in_remaining_word]=='\0' &&
                // we have explored the entire original word
                ((((dic_entr_called != 0) &&
                   composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))  &&
                  ((rule_called != 0) &&
                   composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) ||
                 // and we have a valid right component, i.e. rules match
                 ((dic_entr_called == 0) &&  // or a simple entry (i.e. no prefix),
                  (! affix_is_valid(index,prefix,suffix))) // but no affix
                 )
                )  {

              unichar inflected[MAX_WORD_LENGTH];
              unichar lemma[MAX_WORD_LENGTH];
              unichar codes[MAX_DICT_LINE_LENGTH];
              tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);

              /* generating new lexicon entry */
              unichar new_dela_line[MAX_DICT_LINE_LENGTH];

              /* word form */
              u_strcpy(new_dela_line, original_word);
              u_strcat(new_dela_line, ",");

              /* lemma */                           // lemmatize word
              if (rule->then.repl[0] == '\0'        // if there are no replace codes
                  && (rule_called != 0              // neither in actual nor in preceding rule
                      && rule_called->then.repl[0] == '\0')) {
                u_strcat(new_dela_line, lemma_prefix);
                unichar affix[MAX_WORD_LENGTH];
                u_strcpy(affix, lemma);
                substring_operation(affix, rule->then.substr_act);
                if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
                  substring_operation(affix, rule_called->then.undo_substr_next);
                u_strcat(new_dela_line, affix);
              } else {
                // otherwise the word is its own lemma
                u_strcat(new_dela_line, original_word);
              }

              /* codes */
              u_strcat(new_dela_line,".");
              if (rule->then.repl[0] != '\0') {            // replacing codes by
                u_strcat(new_dela_line,rule->then.repl);   // suffix' ones
              }
              else if (rule_called == 0) { // prohibit SGV
                u_strcat(new_dela_line,codes);
              }
              else if (rule_called->then.repl[0] != '\0') {
                u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
              }
              // replace replaces all and blocks adding and deleting
              // maybe this is not optimal ???
              else {
                if (rule_called->then.add[0] != '\0') {        // add codes
                  if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
                    // the code to add is inserted, preceded by '+', just
                    // before the first ':' of the codes, or appended at
                    // the end if there is no ':'
                    bool done = 0;
                    unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
                    int j = 0;
                    for (int i = 0; codes[i] != '\0'; i++) {
                      if (codes[i] == ':' && (!done)) {
                        tmp[j++] = '+';
                        tmp[j] = '\0';
                        u_strcat(new_dela_line,tmp);
                        u_strcat(new_dela_line,rule_called->then.add);
                        done = 1;
                        j = 0;
                      }
                      tmp[j++] = codes[i];
                    }
                    tmp[j] = '\0';
                    u_strcat(new_dela_line,tmp);
                    if (!done) {
                      u_strcat(new_dela_line,"+");
                      u_strcat(new_dela_line,rule_called->then.add);
                    }
                  } else {
                    u_strcat(new_dela_line,codes);
                  }
                } else if (rule_called->then.del[0] != '\0') { // delete codes
                  // nothing is appended in this case: the deletion of
                  // codes is not implemented
                } else {
                  u_strcat(new_dela_line,codes);
                }
              }

#if DDEBUG > 0
              {
                u_fprintf(debug_file,"= %S\n",new_dela_line);
              }
#endif

              struct decomposed_word* wd = new_decomposed_word();
              wd->n_parts = n_decomp;
              u_strcpy(wd->decomposition,decomposition_new);
              u_strcpy(wd->dela_line,new_dela_line);
              struct decomposed_word_list* wdl=new_decomposed_word_list();
              // unshift actual decomposition to decomposition list L
              wdl->element = wd;
              wdl->suivant = (*L);
              (*L) = wdl;

            } // end if end of word and valid right component
            else if
              // beginning or middle of word: explore the rest of the original word
              (prefix_is_valid(index,prefix) &&
               check_is_valid(UTAG.PREFIX, dic_entr) &&
               // but only if the current component was a valid left one
               // we go on with the next component
               (
                (n_decomp == 1) // prefix as first part of a word: no rule matching
                ||
                (               // prefix in the middle of a word
                 (rule_called &&
                  composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
                 (dic_entr_called &&
                  composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
                )
               )) {

              u_strcpy(lemma_prefix_new, lemma_prefix);
              unichar affix[MAX_WORD_LENGTH];
              u_strcpy(affix, current_component);
              if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
                substring_operation(affix, rule_called->then.undo_substr_next);
                // the debug file is optional: never write to a NULL file
                if (debug_file != NULL) {
                  u_fprintf(debug_file,"yes\n");
                }
              }
              substring_operation(affix, rule->then.substr_act);
              u_strcat(lemma_prefix_new, affix);
              // the next component starts where the current one ends
              int j = 0;
              for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
                next_remaining_word[j++] = remaining_word[i];
              }
              next_remaining_word[j] = '\0';
              if (rule->then.substr_next[0] != '\0') {
                substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
                {
                  u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word);
                }
#endif
              }

#if DDEBUG > 0
              {
                u_fprintf(debug_file,"- %S\n",entry);
              }
#endif
              // a copy of the matching rule is appended to the list of
              // the rules that will be "called" for the next component
              struct rule_list* tmp = new_rule_list(rules);
              tmp->rule = new_composition_rule();
              copy_composition_rule(tmp->rule, rule);
              tmp->next = 0;
              if ( rule_list_new == 0 ) {
                rule_list_new = tmp;
              }
              else {
                struct rule_list* trl = rule_list_new;
                while ( trl->next != 0 ) {
                  trl=trl->next;
                }
                trl->next = tmp;
              }

            }
            // otherwise: no valid suffix nor prefix, nothing to do

            r_list = r_list->next;
          } // while ( r_list != 0 )

          if ( called != 0 )
            called = called->next;
        } while ( called != 0 );

        // prefix found, try to decompose the rest of the word, starting
        // again from the root of the dictionary (offset 4)
        if ( rule_list_new != 0 && dic_entr != 0 ) {
          unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
          {
            u_fprintf(debug_file,"> %S\n",next_remaining_word);
          }
#endif
          explore_state(4,
                        next_component,
                        0,
                        original_word,
                        next_remaining_word,
                        0,
                        decomposition_new,
                        lemma_prefix_new,
                        L,
                        n_decomp+1,
                        rule_list_new,
                        dic_entr,
                        tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
        }
        // 'dic_entr' and 'rule_list' are deliberately not freed here

        l = l->next;

      } // end of while ( l != 0 )

    } // end of word length >= 1

    // The transitions of a terminal state start after the 2 bytes of the
    // header and the 3 bytes of the INF index. This must be set even if the
    // current component is empty; otherwise the loop below would read the
    // transitions from offset 0.
    t = adresse+5;
  }
  else { // not a final state
    c = c-32768;
    t = adresse+2;
  }
  if (remaining_word[pos_in_remaining_word]=='\0') {
    // if we have finished, we return
    return;
  }
  // if not, we go on with the next letter; the comparison is done in both
  // directions so that the case of the word or of the dictionary is tolerated
  for (int i=0;i<c;i++) {
    if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
                               remaining_word[pos_in_remaining_word],
                               alphabet)
        ||
        is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
                               (unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
                               alphabet)) {
      index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      current_component[pos_in_current_component] =
        (unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state(index,
                    current_component,
                    pos_in_current_component+1,
                    original_word,
                    remaining_word,
                    pos_in_remaining_word+1,
                    decomposition,
                    lemma_prefix,
                    L,
                    n_decomp,
                    rule_list_called,
                    dic_entr_called,
                    tableau_bin,
                    inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
    }
    t += 5;
  }
}
Example #5
0
/**
 * Main loop of the Tarjan SCC search for one worker.
 *
 * After tarjan_init(), the state on top of the search stack is handled
 * repeatedly until the run is stopped or until we backtrack with no
 * stackframe left:
 *   - an empty stackframe makes the search backtrack to the parent state,
 *     which either is reported as the root of an SCC (index == lowlink)
 *     or is moved to the tarjan stack;
 *   - a state with the SCC_STATE color is simply popped;
 *   - a state already in 'visited_states' updates its parent and is popped;
 *   - any other state gets a fresh index/lowlink and is explored.
 *
 * With SEARCH_COMPLETE_GRAPH, the whole search is restarted from new
 * initial states until dlopen_get_new_initial_state() returns -1.
 */
void
tarjan_run (run_t *run, wctx_t *ctx)
{
    alg_local_t        *loc        = ctx->local;
    raw_data_t         *slot;
    raw_data_t          top;
    bool                seen;
    hash32_t            hash;

#ifdef HAVE_PROFILER
    Warning (info, "Using the profiler");
    ProfilerStart ("tarjan.perf");
#endif

#ifdef SEARCH_COMPLETE_GRAPH
    int              init_state = dlopen_get_worker_initial_state (ctx->id, W);
    int              inits      = 0;

    // restart the search until every state of the graph has been visited
    while ( 1 )
    {
        inits ++;
        // loc->target only serves as a dummy holding the initial state
        loc->target->ref = init_state;
#endif

    tarjan_init (ctx);

    // go on until the exploration of the graph is over
    while ( !run_is_stopped (run) ) {

        top = dfs_stack_top (loc->search_stack);

        if (top == NULL) {
            // the current stackframe is empty ==> backtrack

            // backtracking from the initial state means that we are done
            if (0 == dfs_stack_nframes (loc->search_stack))
                break;

            // leave the stackframe
            dfs_stack_leave (loc->search_stack);
            ctx->counters->level_cur--;

            // the parent state is now on top of search_stack (to be removed)
            top = dfs_stack_top (loc->search_stack);
            state_info_deserialize (ctx->state, top);

            Debug ("Backtracking %zu (%d, %d)", ctx->state->ref,
                   loc->state_tarjan.index, loc->state_tarjan.lowlink);

            if (loc->state_tarjan.index != loc->state_tarjan.lowlink) {
                // lowlink < index ==> LIVE SCC ==> move to tarjan_stack
                move_tarjan (ctx, ctx->state, top);
                update_parent (ctx, loc->state_tarjan.lowlink);
            } else {
                // index == lowlink ==> root of the SCC ==> report the SCC
                pop_scc (ctx, ctx->state->ref, loc->state_tarjan.lowlink);
            }

            dfs_stack_pop (loc->search_stack);
            continue;
        }

        // there is a state on the current stackframe ==> handle it
        state_info_deserialize (ctx->state, top);

        // a state that belongs to a completed SCC is just dropped
        if (state_store_has_color (ctx->state->ref, SCC_STATE, 0)) {
            dfs_stack_pop (loc->search_stack);
            continue;
        }

        hash = ref_hash (ctx->state->ref);
        seen = fset_find (loc->visited_states, &hash,
                          &ctx->state->ref, (void **) &slot, true);

        if (seen) {
            // previously visited state ==> update parent
            // NB: state is on tarjan_stack

            state_info_deserialize (ctx->state, *slot);
            update_parent (ctx, loc->state_tarjan.lowlink);
            dfs_stack_pop (loc->search_stack);
            continue;
        }

        // unseen state ==> initialize and explore

        HREassert (loc->cnt.tarjan_counter != UINT32_MAX);
        loc->cnt.tarjan_counter ++;
        loc->state_tarjan.index   = loc->cnt.tarjan_counter;
        loc->state_tarjan.lowlink = loc->cnt.tarjan_counter;

        // make the visited_states data point to the stack
        *slot = top;

        explore_state (ctx);

        state_info_serialize (ctx->state, top);
    }

#ifdef SEARCH_COMPLETE_GRAPH
        init_state = dlopen_get_new_initial_state (init_state);
        if (init_state == -1) {
            Warning(info, "Number of inits : %d", inits);
            break;
        }
    }
#endif

#ifdef HAVE_PROFILER
    Warning(info, "Done profiling");
    ProfilerStop();
#endif

    // sanity checks, only meaningful when the search was not interrupted
    if (!run_is_stopped(run) && dfs_stack_size(loc->tarjan_stack) != 0) {
        Warning (info, "Tarjan stack not empty: %zu (stack %zu)",
                 dfs_stack_size(loc->tarjan_stack),
                 dfs_stack_size(loc->search_stack));
    }
    if (!run_is_stopped(run) && fset_count(loc->visited_states) != 0) {
        Warning (info, "Stack-set not empty: %zu",
                 fset_count(loc->visited_states));
    }
}
Example #6
0
/**
 * This explores the dictionary in order decompose the given word into a valid sequence
 * of simple words. For instance, if we have the word "Sommervarmt", we will first
 * explore the dictionary and find that "sommer" is a valid left component that
 * corresponds to the dictionary entry "sommer,.N:msia". Then we will
 * look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word to
 * analyze and as "varmt" is a valid rightmost component, we will generate an entry
 * according to the following things:
 *
 * 'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 * 'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analysis for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
                   const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
                   const unichar* output_dela_line,struct word_decomposition_list** L,
                   int number_of_components,struct norwegian_infos* infos) {
int c;
int index,t;
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
	/* If we are in a final state, we compute the index of the
	 * corresponding INF line */
	index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
	/* We can set the end of our current component */
	current_component[pos_in_current_component]='\0';
	/* We do not consider words of length 1 */
	if (pos_in_current_component>1) {
		/* We don't consider components with a length of 1 */
		if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
			/* If we have explored the entire original word */
			if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
				/* And if we do not have forbidden word in last position */
				struct list_ustring* l=infos->inf->codes[index];
				/* We will look at all the INF codes of the last component in order
				 * to produce analysis */
				while (l!=NULL) {
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						/* If we have already something in the analysis (i.e. if
						 * we have not a simple word), we insert the concatenation
						 * mark before the entry to come */
						u_strcat(dec," +++ ");
					}
					unichar entry[2000];
					/* We get the dictionary line that corresponds to the current INF code */
					uncompress_entry(current_component,l->string,entry);
					/* And we add it to the analysis */
					u_strcat(dec,entry);
					unichar new_dela_line[2000];
					/* We copy the current output DELA line that contains
					 * the concatenation of the previous components */
					u_strcpy(new_dela_line,output_dela_line);
					/* Then we tokenize the DELA line that corresponds the current INF
					 * code in order to obtain its lemma and grammatical/inflectional
					 * information */
					struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
					/* We concatenate the inflected form of the last component to
					 * the output DELA line */
					u_strcat(new_dela_line,tmp_entry->inflected);
					/* We put the comma that separates the inflected form and the lemma */
					u_strcat(new_dela_line,",");
					/* And we build the lemma in the same way than the inflected form */
					u_strcat(new_dela_line,output_dela_line);
					u_strcat(new_dela_line,tmp_entry->lemma);
					/* We put the dot that separates the the lemma and the grammatical/inflectional
					 * information */
					u_strcat(new_dela_line,".");
					/* And finally we put the grammatical/inflectional information */
					u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               int k;
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
					free_dela_entry(tmp_entry);
					/*
					 * Now we can build an analysis in the form of a word decomposition
					 * structure, but only if the last component is a valid
					 * right one or if it is a verb long enough, or if we find out
					 * that the word to analyze was in fact a simple word
					 * in the dictionary */
					if (verb_of_more_than_4_letters(entry)
						|| check_valid_right_component_for_one_INF_code(l->string)
						|| number_of_components==1) {
						/*
						 * We set the number of components, the analysis, the actual
						 * DELA line and information about
						 */
						struct word_decomposition* wd=new_word_decomposition();
						wd->n_parts=number_of_components;
						u_strcpy(wd->decomposition,dec);
						u_strcpy(wd->dela_line,new_dela_line);
						wd->is_a_valid_right_N=check_N_right_component(l->string);
						wd->is_a_valid_right_A=check_A_right_component(l->string);
						/* Then we add the decomposition word structure to the list that
						 * contains all the analysis for the word to analyze */
						struct word_decomposition_list* wdl=new_word_decomposition_list();
						wdl->element=wd;
						wdl->next=(*L);
						(*L)=wdl;
					}
					/* We go on with the next INF code of the last component */
					l=l->next;
				}
			}
			/* If are at the end of the word to analyze, we have nothing more to do */
			return;
		} else {
			/* If we are not at the end of the word to analyze, we must
			 * 1) look if the current component is a valid left one
			 * 2) look if it is not a forbidden component and
			 * 3) explore the rest of the original word
			 */
			if (infos->valid_left_component[index] &&
				(get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
				/* If we have a valid component, we look first if we are
				 * in the case of a word ending by a double letter like "kupp" */
				if (pos_in_current_component>2 &&
					(current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
					/* If we have such a word, we add it to the current analysis,
					 * putting "+++" if the current component is not the first one */
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						u_strcat(dec," +++ ");
					}
					/* In order to print the component in the analysis, we arbitrary
					 * take a valid left component among all those that are available
					 * for the current component */
					unichar sia_code[2000];
					unichar entry[2000];
					unichar line[2000];
					get_first_valid_left_component(infos->inf->codes[index],sia_code);
					uncompress_entry(current_component,sia_code,entry);
					u_strcat(dec,entry);
					u_strcpy(line,output_dela_line);
					u_strcat(line,current_component);
					/* As we have a double letter at the end of the word,
					 * we must remove a character */
					line[u_strlen(line)-1]='\0';
					unichar temp[2000];
					unichar dec_temp[2000];
					u_strcpy(dec_temp,dec);
					/* Then, we explore the dictionary in order to analyze the
					 * next component. We start at the root of the dictionary
					 * (offset=4) and we go back one position in the word to analyze.
					 * For instance, if we have "kupplaner", we read "kupp" and then
					 * we try to analyze "planner". */
					explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
						dec_temp,line,L,number_of_components+1,infos);
				}
				/* Now, we try to analyze the component normally, even if
				 * it was ended by double letter, because we can have things
				 * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
				unichar dec[2000];
				unichar line[2000];
				u_strcpy(dec,analysis);
				if (dec[0]!='\0') {
					/* We add the "+++" mark if the current component is not the first one */
					u_strcat(dec," +++ ");
				}
				unichar sia_code[2000];
				unichar entry[2000];
				/* In order to print the component in the analysis, we arbitrary
				 * take a valid left component among all those that are available
				 * for the current component */
				get_first_valid_left_component(infos->inf->codes[index],sia_code);
				uncompress_entry(current_component,sia_code,entry);
				u_strcat(dec,entry);
				u_strcpy(line,output_dela_line);
				u_strcat(line,current_component);
				unichar temp[2000];
				unichar dec_temp[2000];
				u_strcpy(dec_temp,dec);
				/* Then, we explore the dictionary in order to analyze the
				 * next component. We start at the root of the dictionary
				 * (offset=4). */
				explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
					dec_temp,line,L,number_of_components+1,infos);
			}
		}
	}
	/* Once we have finished to deal with the current final dictionary node,
	 * we go on because we may match a longer word */
	t=offset+5;
}
else {
	/* If the node is not a final one, we get compute the number of transitions by
	 * removing the highest bit */
	c=c-32768;
	t=offset+2;
}
/* We examine each transition that goes out from the node */
for (int i=0;i<c;i++) {
	if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
		/* If the transition's letter is case compatible with the current letter of the
		 * word to analyze, we follow it */
		index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
		current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
		explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
			analysis,output_dela_line,L,number_of_components,infos);
	}
	/* We move the offset to the next transition */
	t=t+5;
}
}
Example #7
0
/**
 * Tries to analyse an unknown norwegian word as a sequence of dictionary
 * components. On success, the selected DELA lines are printed to
 * 'infos->output' (and the corresponding decompositions to
 * 'infos->info_output' if it is not NULL) and 1 is returned;
 * 0 is returned if the word has no decomposition.
 *
 * Only the analyses with the smallest number of components are kept.
 * Among them, when there are several components, those ending with a noun
 * are preferred, then those ending with an adjective.
 */
int analyse_norwegian_word(const unichar* word,struct norwegian_infos* infos) {
unichar decomposition[4096];
unichar dela_line[4096];
unichar correct_word[4096];
decomposition[0]='\0';
dela_line[0]='\0';
correct_word[0]='\0';
struct word_decomposition_list* analyses=NULL;
/* We collect all the decompositions of the word */
explore_state(4,correct_word,0,word,0,decomposition,dela_line,&analyses,1,infos);
if (analyses==NULL) {
	/* No decomposition: the word remains unknown */
	return 0;
}
struct word_decomposition_list* cur;
/* First pass: we compute the minimal number of components and we note
 * whether one of the minimal analyses ends with a noun or an adjective */
int min_parts=1000;
int some_right_N=0;
int some_right_A=0;
for (cur=analyses;cur!=NULL;cur=cur->next) {
	if (cur->element->n_parts>min_parts) {
		continue;
	}
	if (cur->element->n_parts<min_parts) {
		/* New minimum: what was noted for longer analyses
		 * is not relevant anymore */
		min_parts=cur->element->n_parts;
		some_right_N=0;
		some_right_A=0;
	}
	if (cur->element->is_a_valid_right_N) {
		some_right_N=1;
	}
	if (cur->element->is_a_valid_right_A) {
		some_right_A=1;
	}
}
/* Second pass: we print the analyses to keep */
for (cur=analyses;cur!=NULL;cur=cur->next) {
	if (cur->element->n_parts!=min_parts) {
		/* Not one of the shortest decompositions */
		continue;
	}
	int keep;
	if (cur->element->n_parts==1) {
		/* Simple words that would have been wrongly considered
		 * as unknown words must always be matched */
		keep=1;
	}
	else if (some_right_N) {
		/* First priority: analyses that end with a noun. We don't
		 * produce a x<A> or x<V> analysis when a x<N> exists */
		keep=(cur->element->is_a_valid_right_N)?1:0;
	}
	else if (some_right_A) {
		/* Second priority: analyses that end with an adjective */
		keep=(cur->element->is_a_valid_right_A)?1:0;
	}
	else {
		keep=1;
	}
	if (!keep) {
		continue;
	}
	if (infos->info_output!=NULL) {
		u_fprintf(infos->info_output,"%S = %S\n",word,cur->element->decomposition);
	}
	u_fprintf(infos->output,"%S\n",cur->element->dela_line);
}
free_word_decomposition_list(analyses);
return 1;
}