/**
 * Explores the transitions that outgo from the given state.
 * Returns 1 if a recursion is found; 0 otherwise.
 */
int explore_state(int state_number,struct list_int* l,Fst2* fst2,int* graphs_matching_E,U_FILE* ferr) {
Fst2State current=fst2->states[state_number];
if (current==NULL) {
   return 0;
}
if (is_bit_mask_set(current->control,TMP_LOOP_MARK|VISITED_MARK)) {
   /* Nothing to do if this state has already been processed */
   return 0;
}
set_bit_mask(&(current->control),TMP_LOOP_MARK|VISITED_MARK);
for (Transition* t=current->transitions;t!=NULL;t=t->next) {
   if (t->tag_number<0) {
      /* A negative tag number stands for a subgraph call */
      if (look_for_recursion(-(t->tag_number),l,fst2,graphs_matching_E,ferr)) {
         /* A recursion was found through the called subgraph */
         return 1;
      }
      if (graphs_matching_E[-t->tag_number]
          && explore_state(t->state_number,l,fst2,graphs_matching_E,ferr)) {
         /* The called graph matches <E>, so we must also follow the transition */
         return 1;
      }
   } else if (fst2->tags[t->tag_number]->control==1
              && explore_state(t->state_number,l,fst2,graphs_matching_E,ferr)) {
      /* The transition can match the empty word <E> */
      return 1;
   }
}
unset_bit_mask(&(current->control),TMP_LOOP_MARK);
return 0;
}
// // this function try to analyse an unknown russian word // int analyse_word(const unichar* mot,const unsigned char* tableau_bin,U_FILE* debug,U_FILE* result_file, const struct INF_codes* inf_codes,const bool* prefix,const bool* suffix,const Alphabet* alphabet, struct utags UTAG,vector_ptr* rules,vector_ptr* entries) { #if DDEBUG > 0 { u_fprintf(debug,"\n %S\n",mot); } #endif unichar decomposition[MAX_DICT_LINE_LENGTH]; unichar dela_line[MAX_DICT_LINE_LENGTH]; unichar correct_word[MAX_DICT_LINE_LENGTH]; decomposition[0]='\0'; dela_line[0]='\0'; correct_word[0]='\0'; struct decomposed_word_list* l = 0; explore_state(4,correct_word,0,mot,mot,0,decomposition,dela_line,&l,1,0,0,tableau_bin, inf_codes,prefix,suffix,alphabet,debug,UTAG,rules,entries); free_all_dic_entries(entries); free_all_rule_lists(rules); if ( l == 0 ) { return 0; } struct decomposed_word_list* tmp = l; while ( tmp != NULL ) { if (debug!=NULL) { u_fprintf(debug,"%S = %S\n",mot,tmp->element->decomposition); } u_fprintf(result_file,"%S\n",tmp->element->dela_line); tmp=tmp->suivant; } free_decomposed_word_list(l); return 1; }
/**
 * Returns 1 and prints an error message if a recursion is found in graph #n;
 * returns 0 otherwise.
 */
int look_for_recursion(int n,struct list_int* l,Fst2* fst2,int* graphs_matching_E,U_FILE* ferr) {
if (is_in_list(n,l)) {
   /* Graph #n is already on the current call chain: we report the cycle */
   print_reversed_list(l,n,fst2->graph_names,ferr);
   error(" recalls the graph %S\n",fst2->graph_names[n]);
   if (ferr!=NULL) {
      u_fprintf(ferr," recalls the graph %S\n",fst2->graph_names[n]);
   }
   return 1;
}
/* We push graph #n on the call chain, explore it from its initial state,
 * and pop it before returning */
l=new_list_int(n,l);
int result=explore_state(fst2->initial_states[n],l,fst2,graphs_matching_E,ferr);
delete_head(&l);
return result;
}
//
// this function explores the dictionary to decompose the word mot
//
// It walks the .bin dictionary automaton from node 'adresse', growing
// 'current_component' letter by letter. Whenever a final node is reached,
// every INF code of the component is tried against the composition rules of
// the previously read component; complete analyses are prepended to '*L'.
//
// 'adresse': offset of the current node in the binary array 'tableau_bin'
// 'current_component'/'pos_in_current_component': simple word being read
// 'original_word': the whole word to analyze (never modified)
// 'remaining_word'/'pos_in_remaining_word': part of the word still to match
// 'decomposition': textual analysis built so far (" +++ " separated)
// 'lemma_prefix': lemma prefix accumulated from the previous components
// 'L': output list receiving every complete decomposition found
// 'n_decomp': number of components read so far, starting at 1
// 'rule_list_called'/'dic_entr_called': composition rules and dictionary
//     entry of the previous component (both null for the first component)
// remaining parameters: shared resources (INF codes, affix tables, alphabet,
//     debug stream, tag set, rule/entry pools)
//
void explore_state (int adresse, unichar* current_component, int pos_in_current_component,
                    const unichar* original_word, const unichar* remaining_word, int pos_in_remaining_word,
                    const unichar* decomposition, const unichar* lemma_prefix, struct decomposed_word_list** L,
                    int n_decomp, struct rule_list* rule_list_called, const struct dela_entry* dic_entr_called,
                    const unsigned char* tableau_bin, const struct INF_codes* inf_codes,
                    const bool* prefix,const bool* suffix,const Alphabet* alphabet,
                    U_FILE* debug_file,struct utags UTAG,
                    vector_ptr* rules,vector_ptr* entries) {
/* First cell of a node: final-state flag (high bit) + transition count */
int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
int index;
int t = 0; /* NOTE(review): stays 0 when a final node is reached with an empty
            * component; the transition loop below would then read from offset
            * 0 -- presumably unreachable in practice, to confirm */
if ( !(c&32768) ) { // if we are in a terminal state
   /* The next 3 bytes hold the index of the INF codes of this entry */
   index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
   current_component[pos_in_current_component] = '\0';
   if (pos_in_current_component >= 1) {
      // go on if word length equals zero
#if DDEBUG > 0
      {
         u_fprintf(debug_file,". %S\n",current_component);
      }
#endif
      /* We try every INF code associated with the current component */
      struct list_ustring* l = inf_codes->codes[index];
      while ( l != 0 ) {
         // int one_rule_already_matched = 0; // one rule matched each entry is enough
         unichar entry[MAX_DICT_LINE_LENGTH];
         uncompress_entry(current_component, l->string, entry);
#if DDEBUG > 0
         {
            u_fprintf(debug_file,": %S\n",entry);
         }
#endif
         struct dela_entry* dic_entr = new_dic_entry(entry,entries);
         unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
         struct rule_list* rule_list_new = 0;
         unichar next_remaining_word[MAX_WORD_LENGTH];
         struct rule_list* rule_list = 0;
         /* If the entry is a declared affix we parse its composition rules;
          * otherwise a single default rule is used */
         if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
            rule_list = parse_rules(entry,UTAG,rules);
         else {
            rule_list = new_rule_list(rules);
            rule_list->rule = new_composition_rule();
         }
         // entry is now cleaned from rules for composition and derivation
         // log decomposition of word ("cleaned" entries for better overview)
         unichar decomposition_new[MAX_DICT_LINE_LENGTH];
         u_strcpy(decomposition_new, decomposition);
         if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
         u_strcat(decomposition_new, entry);
         // loop on all composition_rules called
         struct rule_list* called = rule_list_called;
         do { // while ( rule_list* called != 0 )
            // if (one_rule_already_matched) break;
            struct composition_rule* rule_called =
               ( called != 0 ) ? called->rule : 0; // may be undefined
            // loop on all actual composition_rules
            struct rule_list* r_list = rule_list;
            while ( r_list != 0 ) {
               // if (one_rule_already_matched) break;
               struct composition_rule* rule = r_list->rule; // ever defined, see upwards
               if (remaining_word[pos_in_remaining_word]=='\0' && // we have explored the entire original word
                   ((((dic_entr_called != 0) &&
                      composition_rule_matches_entry(rule->before, dic_entr_called,debug_file)) &&
                     ((rule_called != 0) &&
                      composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)))
                    || // and we have a valid right component, i.e. rules match
                    ((dic_entr_called == 0) && // or a simple entry (i.e. no prefix),
                     (! affix_is_valid(index,prefix,suffix))) // but no affix
                   ) ) {
                  // one_rule_already_matched = 1;
                  unichar inflected[MAX_WORD_LENGTH];
                  unichar lemma[MAX_WORD_LENGTH];
                  unichar codes[MAX_DICT_LINE_LENGTH];
                  tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);
                  /* generating new lexicon entry */
                  unichar new_dela_line[MAX_DICT_LINE_LENGTH];
                  /* word form */
                  u_strcpy(new_dela_line, original_word);
                  u_strcat(new_dela_line, ",");
                  /* lemma */
                  // lemmatize word
                  if (rule->then.repl[0] == '\0' // if there are no replace codes
                      && (rule_called != 0 // either in actual nor in preceeding rule
                          && rule_called->then.repl[0] == '\0')) {
                     u_strcat(new_dela_line, lemma_prefix);
                     unichar affix[MAX_WORD_LENGTH];
                     u_strcpy(affix, lemma);
                     substring_operation(affix, rule->then.substr_act);
                     if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
                        substring_operation(affix, rule_called->then.undo_substr_next);
                     u_strcat(new_dela_line, affix);
                  } else {
                     u_strcat(new_dela_line, original_word);
                  }
                  /* codes */
                  u_strcat(new_dela_line,".");
                  if (rule->then.repl[0] != '\0') {          // replacing codes by
                     u_strcat(new_dela_line,rule->then.repl); // suffix' ones
                  }
                  else if (rule_called == 0) { // prohibit SGV
                     u_strcat(new_dela_line,codes);
                  }
                  else if (rule_called->then.repl[0] != '\0') {
                     u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
                  }
                  // replace replaces all and blocks adding and deleting
                  // maybe this is not optimal ???
                  else {
                     if (rule_called->then.add[0] != '\0') { // add codes
                        if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
                           /* The added code is inserted just before the first ':'
                            * of the inflectional part, or appended if there is none */
                           bool done = 0;
                           unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
                           int j = 0;
                           for (int i = 0; codes[i] != '\0'; i++) {
                              if (codes[i] == ':' && (!done)) {
                                 tmp[j++] = '+';
                                 tmp[j] = '\0';
                                 u_strcat(new_dela_line,tmp);
                                 u_strcat(new_dela_line,rule_called->then.add);
                                 done = 1;
                                 j = 0;
                              }
                              tmp[j++] = codes[i];
                           }
                           tmp[j] = '\0';
                           u_strcat(new_dela_line,tmp);
                           if (!done) {
                              u_strcat(new_dela_line,"+");
                              u_strcat(new_dela_line,rule_called->then.add);
                           }
                        } else {
                           u_strcat(new_dela_line,codes);
                        }
                     }
                     else if (rule_called->then.del[0] != '\0') { // delete codes
                        /* NOTE(review): deletion is not implemented; the codes
                         * are silently dropped here */
                     }
                     else {
                        u_strcat(new_dela_line,codes);
                     }
                  }
#if DDEBUG > 0
                  {
                     u_fprintf(debug_file,"= %S\n",new_dela_line);
                  }
#endif
                  struct decomposed_word* wd = new_decomposed_word();
                  wd->n_parts = n_decomp;
                  u_strcpy(wd->decomposition,decomposition_new);
                  u_strcpy(wd->dela_line,new_dela_line);
                  struct decomposed_word_list* wdl=new_decomposed_word_list();
                  // unshift actual decomposition to decomposition list L
                  wdl->element = wd;
                  wdl->suivant = (*L);
                  (*L) = wdl;
               } // end if end of word and valid right component
               else if
               // beginning or middle of word: explore the rest of the original word
                  (prefix_is_valid(index,prefix) &&
                   check_is_valid(UTAG.PREFIX, dic_entr) &&
                   // but only if the current component was a valid left one
                   // we go on with the next component
                   ( (n_decomp == 1) // prefix as first part of a word: no rule matching
                     ||
                     ( // prefix in the middle of a word
                      (rule_called &&
                       composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
                      (dic_entr_called &&
                       composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
                     )
                   )) {
                  // one_rule_already_matched = 1;
                  u_strcpy(lemma_prefix_new, lemma_prefix);
                  unichar affix[MAX_WORD_LENGTH];
                  u_strcpy(affix, current_component);
                  if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
                     substring_operation(affix, rule_called->then.undo_substr_next);
                     /* NOTE(review): leftover debug print, emitted even outside
                      * DDEBUG builds */
                     u_fprintf(debug_file,"yes\n");
                  }
                  substring_operation(affix, rule->then.substr_act);
                  u_strcat(lemma_prefix_new, affix);
                  /* The rest of the word becomes the new remaining word */
                  int j = 0;
                  for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
                     next_remaining_word[j++] = remaining_word[i];
                  }
                  next_remaining_word[j] = '\0';
                  if (rule->then.substr_next[0] != '\0') {
                     substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
                     {
                        u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word);
                     }
#endif
                  }
#if DDEBUG > 0
                  {
                     u_fprintf(debug_file,"- %S\n",entry);
                  }
#endif
                  /* The matching rule is appended to the list of rules to try
                   * on the next component */
                  struct rule_list* tmp = new_rule_list(rules);
                  tmp->rule = new_composition_rule();
                  copy_composition_rule(tmp->rule, rule);
                  tmp->next = 0;
                  if ( rule_list_new == 0 ) {
                     rule_list_new = tmp;
                  }
                  else {
                     struct rule_list* trl = rule_list_new;
                     while ( trl->next != 0 ) {
                        trl=trl->next;
                     }
                     trl->next = tmp;
                  }
               }
               else {
                  // no valid suffix nor prefix
               }
               r_list = r_list->next;
            } // while ( rule_list* r_list != 0 )
            if ( called != 0 )
               called = called->next;
         } while ( called != 0 );
         // prefix found, try to decomposite rest of word
         if ( rule_list_new != 0 && dic_entr != 0 ) {
            unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
            {
               u_fprintf(debug_file,"> %S\n",next_remaining_word);
            }
#endif
            /* Recursive call from the dictionary root (offset 4) on the rest
             * of the word, with the current component as left context */
            explore_state(4, next_component, 0, original_word,
                          next_remaining_word, 0, decomposition_new, lemma_prefix_new, L,
                          n_decomp+1, rule_list_new, dic_entr,
                          tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
         }
         else {
            // free_dic_entry(dic_entr);
            // free_rule_list(rule_list);
         }
         l = l->next;
      } // end of while (token_list* l != 0)
      t = adresse+5;
   } // end of word length >= 1
}
else { // not a final state
   /* The 15 low bits hold the number of outgoing transitions */
   c = c-32768;
   t = adresse+2;
}
if (remaining_word[pos_in_remaining_word]=='\0') {
   // if we have finished, we return
   // free_dic_entry(dic_entr_called);
   // free_rule_list(rule_list_called);
   return;
}
// if not, we go on with the next letter
for (int i=0;i<c;i++) {
   /* We follow a transition when its letter is case-compatible (in either
    * direction) with the current letter of the word */
   if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
                             remaining_word[pos_in_remaining_word],
                             alphabet)
       || is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
                                (unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
                                alphabet)) {
      index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      current_component[pos_in_current_component] = (unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state(index, current_component, pos_in_current_component+1, original_word,
                    remaining_word, pos_in_remaining_word+1, decomposition, lemma_prefix,
                    L, n_decomp, rule_list_called, dic_entr_called, tableau_bin,
                    inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
   }
   t += 5;
}
}
/**
 * Sequential Tarjan SCC search loop for one worker context.
 *
 * Repeatedly inspects the top of the DFS search stack:
 *  - a fresh state gets a new index/lowlink and is expanded (explore_state);
 *  - a state already in the visited stack-set only propagates its lowlink
 *    to the parent;
 *  - an empty stackframe triggers backtracking: the parent either is the
 *    root of a completed SCC (index == lowlink, reported via pop_scc) or
 *    is moved to the tarjan_stack as part of a still-live SCC.
 *
 * With SEARCH_COMPLETE_GRAPH the whole procedure is restarted from every
 * initial state the model provides, so the entire graph gets covered.
 */
void
tarjan_run (run_t *run, wctx_t *ctx)
{
    alg_local_t        *loc = ctx->local;
    raw_data_t         *addr;
    raw_data_t          state_data;
    bool                on_stack;
    hash32_t            hash;

#ifdef HAVE_PROFILER
    Warning (info, "Using the profiler");
    ProfilerStart ("tarjan.perf");
#endif

#ifdef SEARCH_COMPLETE_GRAPH
    int                 init_state = dlopen_get_worker_initial_state (ctx->id, W);
    int                 inits = 0;

    // loop until every state of the graph has been visited
    while ( 1 ) {
        inits ++;
        // use loc->target as a dummy for the initial state
        loc->target->ref = init_state;
#endif

    tarjan_init (ctx);

    // continue until we are done exploring the graph
    while ( !run_is_stopped (run) ) {

        state_data = dfs_stack_top (loc->search_stack);

        if (state_data != NULL) {
            // there is a state on the current stackframe ==> explore it
            state_info_deserialize (ctx->state, state_data);

            // pop the state and continue if it is part of a completed SCC
            if (state_store_has_color (ctx->state->ref, SCC_STATE, 0)) {
                dfs_stack_pop (loc->search_stack);
                continue;
            }

            hash     = ref_hash (ctx->state->ref);
            on_stack = fset_find (loc->visited_states, &hash,
                                  &ctx->state->ref, (void **) &addr, true);

            if (!on_stack) {
                // unseen state ==> initialize and explore
                HREassert (loc->cnt.tarjan_counter != UINT32_MAX);
                loc->cnt.tarjan_counter ++;
                loc->state_tarjan.index   = loc->cnt.tarjan_counter;
                loc->state_tarjan.lowlink = loc->cnt.tarjan_counter;

                // point visited_states data to stack
                *addr = state_data;

                explore_state (ctx);

                state_info_serialize (ctx->state, state_data);
            } else {
                // previously visited state ==> update parent
                // NB: state is on tarjan_stack
                state_info_deserialize (ctx->state, *addr);
                update_parent (ctx, loc->state_tarjan.lowlink);
                dfs_stack_pop (loc->search_stack);
            }
        } else {
            // there is no state on the current stackframe ==> backtrack

            // we are done if we backtrack from the initial state
            if (0 == dfs_stack_nframes (loc->search_stack))
                break;

            // leave the stackframe
            dfs_stack_leave (loc->search_stack);
            ctx->counters->level_cur--;

            // retrieve the parent state from search_stack (to be removed)
            state_data = dfs_stack_top (loc->search_stack);
            state_info_deserialize (ctx->state, state_data);

            Debug ("Backtracking %zu (%d, %d)", ctx->state->ref,
                   loc->state_tarjan.index, loc->state_tarjan.lowlink);

            if (loc->state_tarjan.index == loc->state_tarjan.lowlink) {
                // index == lowlink ==> root of the SCC ==> report the SCC
                pop_scc (ctx, ctx->state->ref, loc->state_tarjan.lowlink);
            } else {
                // lowlink < index ==> LIVE SCC ==> move to tarjan_stack
                move_tarjan (ctx, ctx->state, state_data);
                update_parent (ctx, loc->state_tarjan.lowlink);
            }

            dfs_stack_pop (loc->search_stack);
        }
    }

#ifdef SEARCH_COMPLETE_GRAPH
        // fetch the next unvisited initial state, if any
        init_state = dlopen_get_new_initial_state (init_state);
        if (init_state == -1) {
            Warning(info, "Number of inits : %d", inits);
            break;
        }
    }
#endif

#ifdef HAVE_PROFILER
    Warning(info, "Done profiling");
    ProfilerStop();
#endif

    // sanity checks: both auxiliary structures must be empty after a
    // completed (non-interrupted) run
    if (!run_is_stopped(run) && dfs_stack_size(loc->tarjan_stack) != 0)
        Warning (info, "Tarjan stack not empty: %zu (stack %zu)",
                 dfs_stack_size(loc->tarjan_stack),
                 dfs_stack_size(loc->search_stack));
    if (!run_is_stopped(run) && fset_count(loc->visited_states) != 0)
        Warning (info, "Stack-set not empty: %zu",
                 fset_count(loc->visited_states));
}
/**
 * This explores the dictionary in order to decompose the given word into a valid sequence
 * of simple words. For instance, if we have the word "Sommervarmt", we will first
 * explore the dictionary and find that "sommer" is a valid left component that
 * corresponds to the dictionary entry "sommer,.N:msia". Then we will
 * look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word to
 * analyze and as "varmt" is a valid rightmost component, we will generate an entry
 * according to the following things:
 *
 *    'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 *    'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 *    'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analyses for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
                   const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
                   const unichar* output_dela_line,struct word_decomposition_list** L,
                   int number_of_components,struct norwegian_infos* infos) {
int c;
int index,t;
/* First cell of a node: final-state flag (high bit) + transition count */
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
   /* If we are in a final state, we compute the index of the
    * corresponding INF line */
   index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
   /* We can set the end of our current component */
   current_component[pos_in_current_component]='\0';
   /* We do not consider components with a length of 1 */
   if (pos_in_current_component>1) {
      if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
         /* If we have explored the entire original word */
         if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
            /* And if we do not have a forbidden word in last position */
            struct list_ustring* l=infos->inf->codes[index];
            /* We will look at all the INF codes of the last component in order
             * to produce analyses */
            while (l!=NULL) {
               unichar dec[2000];
               u_strcpy(dec,analysis);
               if (dec[0]!='\0') {
                  /* If we already have something in the analysis (i.e. if
                   * we do not have a simple word), we insert the concatenation
                   * mark before the entry to come */
                  u_strcat(dec," +++ ");
               }
               unichar entry[2000];
               /* We get the dictionary line that corresponds to the current INF code */
               uncompress_entry(current_component,l->string,entry);
               /* And we add it to the analysis */
               u_strcat(dec,entry);
               unichar new_dela_line[2000];
               /* We copy the current output DELA line that contains
                * the concatenation of the previous components */
               u_strcpy(new_dela_line,output_dela_line);
               /* Then we tokenize the DELA line that corresponds to the current INF
                * code in order to obtain its lemma and grammatical/inflectional
                * information */
               struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
               /* We concatenate the inflected form of the last component to
                * the output DELA line */
               u_strcat(new_dela_line,tmp_entry->inflected);
               /* We put the comma that separates the inflected form and the lemma */
               u_strcat(new_dela_line,",");
               /* And we build the lemma in the same way as the inflected form */
               u_strcat(new_dela_line,output_dela_line);
               u_strcat(new_dela_line,tmp_entry->lemma);
               /* We put the dot that separates the lemma and the grammatical/inflectional
                * information */
               u_strcat(new_dela_line,".");
               /* And finally we put the grammatical/inflectional information */
               u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               int k;
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
               free_dela_entry(tmp_entry);
               /*
                * Now we can build an analysis in the form of a word decomposition
                * structure, but only if the last component is a valid
                * right one or if it is a verb long enough, or if we find out
                * that the word to analyze was in fact a simple word
                * in the dictionary */
               if (verb_of_more_than_4_letters(entry)
                   || check_valid_right_component_for_one_INF_code(l->string)
                   || number_of_components==1) {
                  /*
                   * We set the number of components, the analysis, the actual
                   * DELA line and information about the rightmost component */
                  struct word_decomposition* wd=new_word_decomposition();
                  wd->n_parts=number_of_components;
                  u_strcpy(wd->decomposition,dec);
                  u_strcpy(wd->dela_line,new_dela_line);
                  wd->is_a_valid_right_N=check_N_right_component(l->string);
                  wd->is_a_valid_right_A=check_A_right_component(l->string);
                  /* Then we add the decomposed word structure to the list that
                   * contains all the analyses for the word to analyze */
                  struct word_decomposition_list* wdl=new_word_decomposition_list();
                  wdl->element=wd;
                  wdl->next=(*L);
                  (*L)=wdl;
               }
               /* We go on with the next INF code of the last component */
               l=l->next;
            }
         }
         /* If we are at the end of the word to analyze, we have nothing more to do */
         return;
      } else {
         /* If we are not at the end of the word to analyze, we must
          * 1) look if the current component is a valid left one
          * 2) look if it is not a forbidden component and
          * 3) explore the rest of the original word */
         if (infos->valid_left_component[index] &&
             (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
            /* If we have a valid component, we look first if we are
             * in the case of a word ending by a double letter like "kupp" */
            if (pos_in_current_component>2 &&
                (current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
               /* If we have such a word, we add it to the current analysis,
                * putting "+++" if the current component is not the first one */
               unichar dec[2000];
               u_strcpy(dec,analysis);
               if (dec[0]!='\0') {
                  u_strcat(dec," +++ ");
               }
               /* In order to print the component in the analysis, we arbitrarily
                * take a valid left component among all those that are available
                * for the current component */
               unichar sia_code[2000];
               unichar entry[2000];
               unichar line[2000];
               get_first_valid_left_component(infos->inf->codes[index],sia_code);
               uncompress_entry(current_component,sia_code,entry);
               u_strcat(dec,entry);
               u_strcpy(line,output_dela_line);
               u_strcat(line,current_component);
               /* As we have a double letter at the end of the word,
                * we must remove a character */
               line[u_strlen(line)-1]='\0';
               unichar temp[2000];
               unichar dec_temp[2000];
               u_strcpy(dec_temp,dec);
               /* Then, we explore the dictionary in order to analyze the
                * next component. We start at the root of the dictionary
                * (offset=4) and we go back one position in the word to analyze.
                * For instance, if we have "kupplaner", we read "kupp" and then
                * we try to analyze "planer". */
               explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
                  dec_temp,line,L,number_of_components+1,infos);
            }
            /* Now, we try to analyze the component normally, even if
             * it was ended by a double letter, because we can have things
             * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
            unichar dec[2000];
            unichar line[2000];
            u_strcpy(dec,analysis);
            if (dec[0]!='\0') {
               /* We add the "+++" mark if the current component is not the first one */
               u_strcat(dec," +++ ");
            }
            unichar sia_code[2000];
            unichar entry[2000];
            /* In order to print the component in the analysis, we arbitrarily
             * take a valid left component among all those that are available
             * for the current component */
            get_first_valid_left_component(infos->inf->codes[index],sia_code);
            uncompress_entry(current_component,sia_code,entry);
            u_strcat(dec,entry);
            u_strcpy(line,output_dela_line);
            u_strcat(line,current_component);
            unichar temp[2000];
            unichar dec_temp[2000];
            u_strcpy(dec_temp,dec);
            /* Then, we explore the dictionary in order to analyze the
             * next component. We start at the root of the dictionary
             * (offset=4). */
            explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
               dec_temp,line,L,number_of_components+1,infos);
         }
      }
   }
   /* Once we have finished dealing with the current final dictionary node,
    * we go on because we may match a longer word */
   t=offset+5;
} else {
   /* If the node is not a final one, we compute the number of transitions by
    * removing the highest bit */
   c=c-32768;
   t=offset+2;
}
/* We examine each transition that goes out from the node */
for (int i=0;i<c;i++) {
   if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
      /* If the transition's letter is case compatible with the current letter of the
       * word to analyze, we follow it */
      index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
      current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
      explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
         analysis,output_dela_line,L,number_of_components,infos);
   }
   /* We move the offset to the next transition */
   t=t+5;
}
}
/** * This function tries to analyse an unknown norwegian word. If OK, * it returns 1 and print the dictionary entry to the output (and * information if an information file has been specified in 'infos'); * returns 0 otherwise. */ int analyse_norwegian_word(const unichar* word,struct norwegian_infos* infos) { unichar decomposition[4096]; unichar dela_line[4096]; unichar correct_word[4096]; decomposition[0]='\0'; dela_line[0]='\0'; correct_word[0]='\0'; struct word_decomposition_list* l=NULL; /* We look if there are decompositions for this word */ explore_state(4,correct_word,0,word,0,decomposition,dela_line,&l,1,infos); if (l==NULL) { /* If there is no decomposition, we return */ return 0; } /* Otherwise, we will choose the one to keep */ struct word_decomposition_list* tmp=l; int n=1000; int is_a_valid_right_N=0; int is_a_valid_right_A=0; /* First, we count the minimal number of components, because * we want to give priority to analysis with smallest number * of components. By the way, we note if there is a minimal * analysis ending by a noun or an adjective. */ while (tmp!=NULL) { if (tmp->element->n_parts<=n) { if (tmp->element->n_parts<n) { /* If we change of component number, we reset the * 'is_a_valid_right_N' and 'is_a_valid_right_A' fields, * because they only concern the head word. */ is_a_valid_right_N=0; is_a_valid_right_A=0; } n=tmp->element->n_parts; if (tmp->element->is_a_valid_right_N) { is_a_valid_right_N=1; } if (tmp->element->is_a_valid_right_A) { is_a_valid_right_A=1; } } tmp=tmp->next; } tmp=l; while (tmp!=NULL) { if (n==tmp->element->n_parts) { /* We only consider the words that have shortest decompositions. * The test (tmp->element->n_parts==1) is used to * match simple words that would have been wrongly considered * as unknown words. 
*/ int OK=0; if (tmp->element->n_parts==1) { /* Simple words must be matched */ OK=1; } else if (is_a_valid_right_N) { if (tmp->element->is_a_valid_right_N) { /* We give priority to analysis that ends with a noun */ OK=1; } } else if (is_a_valid_right_A) { if (tmp->element->is_a_valid_right_A) { /* Our second priority goes to analysis that ends with an adjective */ OK=1; } } else OK=1; /* We put a restriction on the grammatical code: * we don't produce a x<A> or x<V> analysis when a x<N> exists */ if (OK) { if (infos->info_output!=NULL) { u_fprintf(infos->info_output,"%S = %S\n",word,tmp->element->decomposition); } u_fprintf(infos->output,"%S\n",tmp->element->dela_line); } } tmp=tmp->next; } free_word_decomposition_list(l); return 1; }