static int load_lex(LEXICON *lex, char *tab) { int ret; SPIPlanPtr SPIplan; Portal SPIportal; bool moredata = TRUE; #ifdef DEBUG struct timeval t1, t2; double elapsed; #endif char *sql; int ntuples; int total_tuples = 0; lex_columns_t lex_columns = {seq: -1, word: -1, stdword: -1, token: -1}; int seq; char *word; char *stdword; int token; DBG("start load_lex\n"); SET_TIME(t1); if (!tab || !strlen(tab)) { elog(NOTICE, "load_lex: rules table is not usable"); return -1; } if (!tableNameOk(tab)) { elog(NOTICE, "load_lex: lex and gaz table names may only be alphanum and '.\"_' characters (%s)", tab); return -1; } sql = SPI_palloc(strlen(tab)+65); strcpy(sql, "select seq, word, stdword, token from "); strcat(sql, tab); strcat(sql, " order by id "); /* get the sql for the lexicon records and prepare the query */ SPIplan = SPI_prepare(sql, 0, NULL); if (SPIplan == NULL) { elog(NOTICE, "load_lex: couldn't create query plan for the lex data via SPI (%s)", sql); return -1; } /* get the sql for the lexicon records and prepare the query */ SPIplan = SPI_prepare(sql, 0, NULL); if (SPIplan == NULL) { elog(NOTICE, "load_lex: couldn't create query plan for the lexicon data via SPI"); return -1; } if ((SPIportal = SPI_cursor_open(NULL, SPIplan, NULL, NULL, true)) == NULL) { elog(NOTICE, "load_lex: SPI_cursor_open('%s') returns NULL", sql); return -1; } while (moredata == TRUE) { //DBG("calling SPI_cursor_fetch"); SPI_cursor_fetch(SPIportal, TRUE, TUPLIMIT); if (SPI_tuptable == NULL) { elog(NOTICE, "load_lex: SPI_tuptable is NULL"); return -1; } if (lex_columns.seq == -1) { ret = fetch_lex_columns(SPI_tuptable, &lex_columns); if (ret) return ret; } ntuples = SPI_processed; //DBG("Reading edges: %i - %i", total_tuples, total_tuples+ntuples); total_tuples += ntuples; if (ntuples > 0) { int t; Datum binval; bool isnull; SPITupleTable *tuptable = SPI_tuptable; TupleDesc tupdesc = SPI_tuptable->tupdesc; for (t = 0; t < ntuples; t++) { //if (t%100 == 0) { DBG(" t: %i", t); } HeapTuple tuple = tuptable->vals[t]; GET_INT_FROM_TUPLE(seq,lex_columns.seq,"load_lex: seq contains a null value"); GET_TEXT_FROM_TUPLE(word,lex_columns.word); GET_TEXT_FROM_TUPLE(stdword,lex_columns.stdword); GET_INT_FROM_TUPLE(token,lex_columns.token,"load_lex: token contains a null value"); lex_add_entry(lex, seq, word, stdword, token); } //DBG("calling SPI_freetuptable"); SPI_freetuptable(tuptable); //DBG("back from SPI_freetuptable"); } else moredata = FALSE; } SET_TIME(t2); ELAPSED_T(t1, t2); DBG("Time to read %i lexicon records: %.1f ms.", total_tuples, elapsed); return 0; } static int fetch_rules_columns(SPITupleTable *tuptable, rules_columns_t *rules_cols) { int err = 0; FETCH_COL(rules_cols,rule,"rule"); if (err) { elog(NOTICE, "rules queries must return column 'rule'"); return -1; } CHECK_TYP(rules_cols,rule,TEXTOID); if (err) { elog(NOTICE, "rules column type must be: 'rule' text"); return -1; } return 0; }
int main(int argc, char *argv[]) { STANDARDIZER *std; LEXICON *lex; LEXICON *gaz; RULES *rules; char buf[1024]; int seq; char input_str[ 4096 ] ; char word[512]; char stdword[512]; int token; int nr; int rule[RULESIZE]; int err; int cnt; int option = 0; FILE *in; if (argc == 3 && !strcmp(argv[1], "-o")) { option = strtol(argv[2], NULL, 10); argc -= 2; argv += 2; } else if (argc != 1) Usage(); std = std_init(); assert(std); lex = lex_init(std->err_p); assert(lex); in = fopen(LEXIN, "rb"); assert(in); cnt = 0; while (!feof(in) && fgets(buf, 1024, in)) { cnt++; /* parse into fields */ if (parse_csv(buf, &seq, word, stdword, &token)) { /* add the record to the lexicon */ err = lex_add_entry(lex, seq, word, stdword, token); if (err != 1) printf("lex: Failed: %d: %s", cnt, buf); } else { printf("lex: Skipping: %d: %s", cnt, buf); } } fclose(in); if (option & 1) { printf("------------ address lexicon --------------\n"); print_lexicon(lex->hash_table); printf("\n"); } gaz = lex_init(std->err_p); assert(gaz); in = fopen(GAZIN, "rb"); assert(in); cnt = 0; while (!feof(in) && fgets(buf, 1024, in)) { cnt++; /* parse into fields */ if (parse_csv(buf, &seq, word, stdword, &token)) { /* add the record to the lexicon */ err = lex_add_entry(gaz, seq, word, stdword, token); if (err != 1) printf("gaz: Failed: %d: %s", cnt, buf); } else { printf("gaz: Skipping: %d: %s", cnt, buf); } } fclose(in); if (option & 2) { printf("------------ gazeteer lexicon --------------\n"); print_lexicon(gaz->hash_table); printf("\n"); } rules = rules_init(std->err_p); assert(rules); rules -> r_p -> collect_statistics = TRUE ; /* ************ RULES **************** */ in = fopen(RULESIN, "rb"); assert(in); cnt = 0; while (!feof(in) && fgets(buf, 1024, in)) { cnt++; /* parse into fields */ nr = parse_rule(buf, rule); /* add the record to the rules */ err = rules_add_rule(rules, nr, rule); if (err != 0) printf("rules: Failed: %d (%d): %s", cnt, err, buf); } err = rules_ready(rules); if (err != 0) printf("rules: Failed: err=%d\n", err); fclose(in); std_use_lex(std, lex); std_use_gaz(std, gaz); std_use_rules(std, rules); std_ready_standardizer(std); printf( "Standardization test. Type \"exit\" to quit:\n" ) ; fflush( stdout ) ; while ( TRUE ) { err = standardize_command_line( std, input_str, option ) ; if ( err == FAIL ) { break ; } } printf( "OK\n" ) ; fflush( stdout ) ; std_free(std); /* these were freed when we bound them with std_use_*() rules_free(rules); lex_free(gaz); lex_free(lex); */ return 0; }