Example #1
0
/*
 * compdic entry point: compiles a text word list into a binary dictionary.
 *
 * Usage: compdic <alphabet> <text_dic> <comp_dic>
 *   argv[1]  alphabet file; each leading byte >= ' ' gets the next code 1..N
 *   argv[2]  input word list, read through get_word_info()
 *   argv[3]  output file; receives the cells array followed by the strings
 *
 * Every failure is reported through error(), which is assumed not to return
 * (the argc check below already relies on that).
 */
void main (int argc, uchar **argv) {
    int i, j, c;
    int prev_len, word_len;
    struct stat f_stat;

    printf("compdic: Utility for compilation of word dictionaries\n");
    if (argc!=4) error("Usage:\ncompdic <alphabet> <text_dic> <comp_dic>");

    // Build the character -> code table. Reading stops at the first byte
    // below ' ' (control character) or at EOF, which getc reports as a
    // negative value.
    if ((fi=fopen(argv[1],"rb"))==NULL) error("Error opening alphabet file");
    memset(codes, 0, 256);
    for (letter_count=1; (c=getc(fi))>=' '; letter_count++) codes[c] = letter_count;
    fclose(fi);

    // The working memory is sized from the input file length, so a failed
    // stat() must not be allowed to leave st_size indeterminate.
    if (stat(argv[2],&f_stat)!=0) error("Error opening input file");
    init_mem(letter_count, f_stat.st_size + 50000);

    if ((fi=fopen(argv[2],"rb"))==NULL) error("Error opening input file");

    clear_state(0);
    owf[0]=0;

    wf1[0]=0;
    get_word_info(wf); // init of get_word_inf()

    while (get_word_info(wf)) {

        // Find the first position where the new word differs from the
        // previous one. The wf[i] test stops at the terminator, so a word
        // identical to the previous one no longer scans past both buffers.
        for (i=0; wf[i] && wf[i]==owf[i]; i++) ;

        // Store the states of the previous word's unshared tail, deepest
        // first, linking each one into its parent state.
        prev_len = (int)strlen(owf);
        for (j=prev_len-1; j>=i; j--) state(j, codes[owf[j]]) = save_state(j+1);

        // Reset the states that the new word's tail is going to use.
        word_len = (int)strlen(wf);
        for (j=i+1; j<=word_len; j++) clear_state(j);

        // j is one past the last state handled, so --j addresses the state
        // at depth strlen(wf); slot 0 presumably flags "a word ends here".
        state(--j,0) = 1;

        strcpy(owf, wf);
    }
    fclose(fi);

    // Flush the states still pending for the last word read.
    prev_len = (int)strlen(owf);
    for (j=prev_len-1; j>=0; j--) state(j, codes[owf[j]]) = save_state(j+1);

    // Cells 0 and 1 are written last: 'S' takes the saved root state and
    // 'T' takes the number of cells in use (presumably header entries).
    save_cell(0, 'S', save_state(0));
    save_cell(1, 'T', last_full_cell+1);

    if ((fo=fopen(argv[3], "wb"))==NULL) error("Error opening output file");
    if (fwrite (cells, sizeof(tcell), last_full_cell+1, fo) != (size_t)(last_full_cell+1))
        error("Error writing output file");
    if (fwrite (strings, 1, last_string, fo) != (size_t)last_string)
        error("Error writing output file");
    // fclose flushes buffered data, so a failure here also means a bad file.
    if (fclose(fo)!=0) error("Error writing output file");

    print_statistics();
}
/*
 * Advances the parser to the next word that passes all filters.
 *
 * Both byte offsets are reset to 0 on entry. On success the offsets are set
 * to the byte range of the raw word in parser->txt, the cursor is moved past
 * it, parser->word / parser->word_length receive the processed word, and
 * TRUE is returned. FALSE is returned when the text is exhausted or
 * get_word_info() fails; in the latter case the cursor is moved to the end
 * of the text.
 *
 * stop_word is only forwarded to process_word_utf8().
 */
static gboolean
parser_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end,
             gboolean      *stop_word)
{
	gsize word_length = 0;

	*byte_offset_start = 0;
	*byte_offset_end = 0;

	g_return_val_if_fail (parser, FALSE);

	/* Scan candidates until one is accepted or the text runs out */
	while (parser->cursor < parser->txt_size) {
		TrackerParserWordType type;
		gboolean is_allowed;
		gboolean skip;
		gsize truncated_length;
		gchar *result;

		/* Fetch length, validity and type of the candidate at the cursor */
		if (!get_word_info (parser, &word_length, &is_allowed, &type)) {
			/* Give up: move the cursor to the end so nothing else is parsed */
			parser->cursor = parser->txt_size;
			return FALSE;
		}

		/* A candidate is dropped when its start is not allowed, when it
		 * reaches the maximum word length, or when reserved words are
		 * ignored and it is one. Evaluation order matters: the reserved
		 * word lookup only runs once the cheaper checks have passed. */
		skip = !is_allowed ||
		       word_length >= parser->max_word_length ||
		       (parser->ignore_reserved_words &&
		        tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
		                                              word_length));

		if (!skip) {
			/* Cap the number of bytes handed to the processing step */
			truncated_length = word_length;
			if (!(word_length < WORD_BUFFER_LENGTH)) {
				truncated_length = WORD_BUFFER_LENGTH - 1;
			}

			/* Processing may fail; in that case the candidate is
			 * skipped like any other rejected word */
			result = process_word_utf8 (parser,
			                            &(parser->txt[parser->cursor]),
			                            truncated_length,
			                            type,
			                            stop_word);
			if (result) {
				/* Report the byte range of the raw word */
				*byte_offset_start = parser->cursor;
				*byte_offset_end = parser->cursor + word_length;

				/* Step over the word that was just consumed */
				parser->cursor += word_length;

				parser->word_length = strlen (result);
				parser->word = result;

				return TRUE;
			}
		}

		/* Rejected candidate: step over it and try the next one */
		parser->cursor += word_length;
	}

	/* No more words... */
	return FALSE;
}