/*
 * compdic entry point.
 *
 * Usage: compdic <alphabet> <text_dic> <comp_dic>
 *
 *   argv[1]  alphabet file: every byte read up to (not including) the first
 *            byte below ' ' or EOF is assigned the next code, starting at 1,
 *            in codes[].
 *   argv[2]  input word list, read through get_word_info().
 *   argv[3]  output file: receives the cells array followed by the strings
 *            area.
 *
 * Each word is compared against the previous one (owf) to find the first
 * differing position; states belonging only to the previous word are saved
 * and states for the new suffix are cleared.
 * NOTE(review): this prefix sharing presumably expects the input to be
 * sorted -- confirm against the dictionary format documentation.
 *
 * error() is assumed to report the message and terminate the program, as the
 * existing checks already rely on that -- TODO confirm.
 *
 * Returns 0 on success.
 */
int main (int argc, uchar **argv)
{
  int i, j, c;
  size_t n_cells;
  struct stat f_stat;

  printf("compdic: Utility for compilation of word dictionaries\n");
  if (argc != 4)
    error("Usage:\ncompdic <alphabet> <text_dic> <comp_dic>");

  /* Build the letter -> code table from the alphabet file. */
  if ((fi = fopen(argv[1], "rb")) == NULL)
    error("Error opening alphabet file");
  memset(codes, 0, 256);
  for (letter_count = 1; (c = getc(fi)) >= ' '; codes[c] = letter_count++)
    ;
  fclose(fi);

  /* Size the working memory from the input file size; without the check a
     failed stat() would leave f_stat.st_size uninitialised. */
  if (stat(argv[2], &f_stat) != 0)
    error("Error opening input file");
  init_mem(letter_count, f_stat.st_size + 50000);

  if ((fi = fopen(argv[2], "rb")) == NULL)
    error("Error opening input file");

  clear_state(0);
  owf[0] = 0;
  wf1[0] = 0;
  get_word_info(wf);  /* init of get_word_info(); result discarded */

  while (get_word_info(wf)) {
    /* Find the first position where the new word differs from the previous
       one.  The NUL test stops the scan when both words are identical;
       otherwise it would run past both terminators. */
    for (i = 0; wf[i] != 0 && wf[i] == owf[i]; i++)
      ;

    /* Save the states that belonged only to the previous word, deepest first. */
    for (j = strlen(owf) - 1; j >= i; j--)
      state(j, codes[owf[j]]) = save_state(j + 1);

    /* Start fresh states for the new suffix, then set slot 0 of the state at
       the end of the word. */
    for (j = i + 1; j <= strlen(wf); j++)
      clear_state(j);
    state(--j, 0) = 1;

    strcpy(owf, wf);
  }
  fclose(fi);

  /* Flush the states still pending for the last word. */
  for (j = strlen(owf) - 1; j >= 0; j--)
    state(j, codes[owf[j]]) = save_state(j + 1);

  save_cell(0, 'S', save_state(0));
  save_cell(1, 'T', last_full_cell + 1);

  /* Write the compiled dictionary; every step is checked so a truncated
     output file is never reported as success. */
  fo = fopen(argv[3], "wb");
  if (fo == NULL)
    error("Error opening output file");
  n_cells = (size_t)(last_full_cell + 1);
  if (fwrite(cells, sizeof(tcell), n_cells, fo) != n_cells
      || fwrite(strings, 1, last_string, fo) != (size_t)last_string)
    error("Error writing output file");
  if (fclose(fo) != 0)
    error("Error writing output file");

  print_statistics();
  return 0;
}
/*
 * Advance the parser to the next word that passes every filter.
 *
 * On success, stores the processed word and its length in parser->word and
 * parser->word_length, writes the word's byte range in the input text to
 * *byte_offset_start / *byte_offset_end, moves the cursor past the word and
 * returns TRUE.  Returns FALSE when no further word is produced; both offsets
 * are then left at 0.  stop_word is passed straight to process_word_utf8().
 */
static gboolean
parser_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end,
             gboolean      *stop_word)
{
	gsize  word_length = 0;
	gchar *result = NULL;

	*byte_offset_start = 0;
	*byte_offset_end = 0;

	g_return_val_if_fail (parser, FALSE);

	/* Scan forward until a word is accepted or the text is exhausted */
	while (parser->cursor < parser->txt_size) {
		TrackerParserWordType word_type;
		gboolean              allowed;
		gboolean              skip;

		if (!get_word_info (parser, &word_length, &allowed, &word_type)) {
			/* No word information available: force the end of the scan */
			parser->cursor = parser->txt_size;
			break;
		}

		/* Reject, in this order: disallowed word starts, words at or
		 * above the maximum length, and (optionally) reserved words */
		skip = !allowed ||
		       word_length >= parser->max_word_length ||
		       (parser->ignore_reserved_words &&
		        tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
		                                              word_length));

		if (!skip) {
			gsize clipped_length;

			/* Clip the length handed to the word processor so that
			 * extremely long words stay within the word buffer */
			if (word_length < WORD_BUFFER_LENGTH) {
				clipped_length = word_length;
			} else {
				clipped_length = WORD_BUFFER_LENGTH - 1;
			}

			/* A failure here is not fatal: just try the next word.
			 * Per the original note, the returned string is always
			 * newly allocated. */
			result = process_word_utf8 (parser,
			                            &(parser->txt[parser->cursor]),
			                            clipped_length,
			                            word_type,
			                            stop_word);
			if (result) {
				/* Leave the cursor on the word; it is advanced below */
				break;
			}
		}

		/* Word rejected or not processable: step over it */
		parser->cursor += word_length;
	}

	if (!result) {
		/* No more words... */
		return FALSE;
	}

	/* Report the byte range of the accepted word, then move past it */
	*byte_offset_start = parser->cursor;
	*byte_offset_end = parser->cursor + word_length;
	parser->cursor += word_length;

	parser->word_length = strlen (result);
	parser->word = result;

	return TRUE;
}