/* //////////////////////////////////////////////////////////////////////////////////////
 * interfaces
 */

/* create a timer
 *
 * allocates the timer object with tb_malloc0_type(), then sets up its spinlock,
 * a fixed pool with items of sizeof(tb_timer_task_impl_t) and a heap that
 * compares its items with tb_timer_comp_by_when.
 *
 * if any step fails, the partially built timer is passed to tb_timer_exit().
 *
 * @param maxn      the task count hint: the stored limit is tb_max(maxn, 16),
 *                  the pool is created with (maxn >> 4) + 16 and
 *                  the heap with (maxn >> 2) + 16
 * @param ctime     stored unchanged in the ctime field of the timer
 *
 * @return          the timer, or null if it could not be created
 */
tb_timer_ref_t tb_timer_init(tb_size_t maxn, tb_bool_t ctime)
{
    tb_timer_impl_t*    timer = tb_null;
    tb_bool_t           ready = tb_false;
    do
    {
        // allocate the timer object
        timer = tb_malloc0_type(tb_timer_impl_t);
        tb_assert_and_check_break(timer);

        // the heap item func: only the comparer is customized
        tb_item_func_t heap_func = tb_item_func_ptr(tb_null, tb_null);
        heap_func.comp = tb_timer_comp_by_when;

        // save the settings
        timer->maxn     = tb_max(maxn, 16);
        timer->ctime    = ctime;

        // the lock must be ready before the containers are created
        if (!tb_spinlock_init(&timer->lock)) break;

        // the pool of the task objects
        timer->pool = tb_fixed_pool_init(tb_null, (maxn >> 4) + 16, sizeof(tb_timer_task_impl_t), tb_null, tb_null, tb_null);
        tb_assert_and_check_break(timer->pool);

        // the heap, ordered by heap_func.comp
        timer->heap = tb_heap_init((maxn >> 2) + 16, heap_func);
        tb_assert_and_check_break(timer->heap);

        // register the lock to the profiler only after everything else succeeded
#ifdef TB_LOCK_PROFILER_ENABLE
        tb_lock_profiler_register(tb_lock_profiler(), (tb_pointer_t)&timer->lock, TB_TRACE_MODULE_NAME);
#endif

        // everything is in place
        ready = tb_true;

    } while (0);

    // created? return it
    if (ready) return (tb_timer_ref_t)timer;

    // failed: release whatever has been built so far
    if (timer) tb_timer_exit((tb_timer_ref_t)timer);
    return (tb_timer_ref_t)tb_null;
}
/* replace the matches of the compiled pattern in a string
 *
 * the work is done by pcre2_substitute() into a buffer that is owned by the regex
 * object (regex->buffer_data). the buffer is allocated on first use and doubled
 * whenever pcre2 reports PCRE2_ERROR_NOMEMORY, so the returned pointer is only
 * valid until the next call that writes to this buffer.
 *
 * @param self              the regex, its code must have been compiled
 * @param cstr              the subject string
 * @param size              the subject size
 * @param start             the start offset in the subject, must be < size
 * @param replace_cstr      the replacement string
 * @param replace_size      the replacement size
 * @param plength           receives the result length (without the terminating '\0'),
 *                          it is reset to 0 first and may be null
 *
 * @return                  the '\0'-terminated result inside the regex buffer, or null
 *                          if start >= size, if pcre2_substitute() did not return
 *                          a positive value, or if the buffer could not be allocated
 */
tb_char_t const* tb_regex_replace(tb_regex_ref_t self, tb_char_t const* cstr, tb_size_t size, tb_size_t start, tb_char_t const* replace_cstr, tb_size_t replace_size, tb_size_t* plength)
{
    // check
    tb_regex_t* regex = (tb_regex_t*)self;
    tb_assert_and_check_return_val(regex && regex->code && cstr && replace_cstr, tb_null);

    // done
    tb_char_t const* result = tb_null;
    do
    {
        // clear length first
        if (plength) *plength = 0;

        // end?
        tb_check_break(start < size);

        // init options: the utf check is only kept for debug builds
#ifdef __tb_debug__
        tb_uint32_t options = 0;
#else
        tb_uint32_t options = PCRE2_NO_UTF_CHECK;
#endif
        if (regex->mode & TB_REGEX_MODE_GLOBAL) options |= PCRE2_SUBSTITUTE_GLOBAL;

        // init buffer on first use
        if (!regex->buffer_data)
        {
            regex->buffer_maxn = tb_max(size + replace_size + 64, 256);
            regex->buffer_data = (PCRE2_UCHAR*)tb_malloc_bytes(regex->buffer_maxn);
        }
        tb_assert_and_check_break(regex->buffer_data);

        // done
        tb_long_t   ok = -1;
        PCRE2_SIZE  length = 0;
        while (1)
        {
            // replace it, length is in/out: the buffer capacity on input
            length = (PCRE2_SIZE)regex->buffer_maxn;
            ok = pcre2_substitute(regex->code, (PCRE2_SPTR)cstr, (PCRE2_SIZE)size, (PCRE2_SIZE)start, options, tb_null, tb_null, (PCRE2_SPTR)replace_cstr, (PCRE2_SIZE)replace_size, regex->buffer_data, &length);

            // no space?
            if (ok == PCRE2_ERROR_NOMEMORY)
            {
                /* grow buffer
                 *
                 * the new block is only committed after the reallocation succeeded,
                 * so a failure neither loses the old buffer nor leaves buffer_maxn
                 * out of step with it.
                 *
                 * NOTE(review): assumes tb_ralloc_bytes() keeps the old block intact
                 * when it fails, like realloc() - confirm
                 */
                PCRE2_UCHAR* grown = (PCRE2_UCHAR*)tb_ralloc_bytes(regex->buffer_data, regex->buffer_maxn << 1);
                tb_assert_and_check_break(grown);

                // commit the grown buffer and retry
                regex->buffer_data = grown;
                regex->buffer_maxn <<= 1;
            }
            // failed
            else if (ok < 0)
            {
#if defined(__tb_debug__) && !defined(TB_CONFIG_OS_WINDOWS)
                // get error info
                PCRE2_UCHAR info[256];
                pcre2_get_error_message(ok, info, sizeof(info));

                // trace
                tb_trace_d("replace failed at offset %lu: error: %ld, %s\n", start, ok, info);
#endif

                // end
                break;
            }
            else break;
        }

        // check: ok is still PCRE2_ERROR_NOMEMORY here if growing the buffer failed
        tb_check_break(ok > 0);
        tb_assert_and_check_break(length < regex->buffer_maxn);

        // end
        regex->buffer_data[length] = '\0';

        // trace
        tb_trace_d(" replace: [%lu]: %s", length, regex->buffer_data);

        // save length
        if (plength) *plength = (tb_size_t)length;

        // ok
        result = (tb_char_t const*)regex->buffer_data;

    } while (0);

    // ok?
    return result;
}
/* feed input data to the filter and fetch the filtered output data
 *
 * input that the spak callback does not consume is kept in the filter input cache
 * (filter->idata), the produced output is queued in filter->odata.
 *
 * @param self      the filter, its spak callback must be set
 * @param data      the input data, may be null if sync is non-zero
 * @param size      the input size, it is always added to filter->offset
 * @param pdata     receives the output data pointer (inside the output queue),
 *                  it is set to null if no output is returned
 * @param need      the wanted output size, 0: tb_max(size, maxn of the output queue)
 * @param sync      zero: only return output once at least need bytes are queued,
 *                  non-zero: return the queued output directly (at most need bytes),
 *                  it is forced to -1 once the filter has reached the end
 *
 * @return          > 0: the returned output size, 0: no output yet,
 *                  < 0: failed or no more data
 */
tb_long_t tb_filter_spak(tb_filter_ref_t self, tb_byte_t const* data, tb_size_t size, tb_byte_t const** pdata, tb_size_t need, tb_long_t sync)
{
    // check
    tb_filter_t* filter = (tb_filter_t*)self;
    tb_assert_and_check_return_val(filter && filter->spak && pdata, -1);

    // nothing is returned until proven otherwise
    *pdata = tb_null;

    // account the input, reaching the limit means the end of the input
    filter->offset += size;
    if (filter->limit >= 0 && filter->offset == filter->limit) filter->beof = tb_true;

    // the end forces a sync
    if (filter->beof) sync = -1;

    // start from the cached input
    tb_byte_t const*    input = tb_buffer_data(&filter->idata);
    tb_size_t           input_size = tb_buffer_size(&filter->idata);
    if (!(data && size))
    {
        // no new input: this is only meaningful as a sync request
        tb_assert_and_check_return_val(sync, 0);
    }
    else if (input && input_size)
    {
        // trace
        tb_trace_d("[%p]: append idata: %lu", self, size);

        // there is cached input already: the new data goes behind it
        input = tb_buffer_memncat(&filter->idata, data, size);
        input_size = tb_buffer_size(&filter->idata);
    }
    else
    {
        // trace
        tb_trace_d("[%p]: using idata directly: %lu", self, size);

        // the cache is empty: read from the caller's data without copying
        input = data;
        input_size = size;
    }

    // the default need
    if (!need) need = tb_max(size, tb_queue_buffer_maxn(&filter->odata));
    tb_assert_and_check_return_val(need, -1);

    // is enough output queued already?
    tb_size_t   output_maxn = 0;
    tb_byte_t*  output = tb_queue_buffer_pull_init(&filter->odata, &output_maxn);
    if (output)
    {
        // the size that can be served from the queue
        tb_long_t queued = 0;
        if (output_maxn >= need) queued = (tb_long_t)need;

        // exit pull
        tb_queue_buffer_pull_exit(&filter->odata, queued > 0? queued : 0);

        // enough? serve it without running the filter
        if (queued > 0)
        {
            // the caller's data was not consumed: keep it in the cache
            if (size && input == data) tb_buffer_memncat(&filter->idata, data, size);

            // return it directly
            *pdata = output;
            return queued;
        }
    }

    // make sure that the output queue can hold the needed size
    if (need > tb_queue_buffer_maxn(&filter->odata)) tb_queue_buffer_resize(&filter->odata, need);

    // the space for the new output
    output_maxn = 0;
    output = tb_queue_buffer_push_init(&filter->odata, &output_maxn);
    tb_assert_and_check_return_val(output && output_maxn, -1);

    // wrap the input and the output as streams
    tb_static_stream_t istream = {0};
    tb_static_stream_t ostream = {0};
    if (input && input_size)
    {
        // @note istream stays zeroed if there is no input, e.g. for syncing the end data
        if (!tb_static_stream_init(&istream, (tb_byte_t*)input, input_size)) return -1;
    }
    if (!tb_static_stream_init(&ostream, (tb_byte_t*)output, output_maxn)) return -1;

    // trace
    tb_trace_d("[%p]: spak: ileft: %lu, oleft: %lu, offset: %llu, limit: %lld, beof: %d: ..", self, tb_buffer_size(&filter->idata), tb_queue_buffer_size(&filter->odata), filter->offset, filter->limit, filter->beof);

    // run the filter
    tb_long_t produced = filter->spak(filter, &istream, &ostream, sync);

    // a negative result marks the end
    if (produced < 0) filter->beof = tb_true;

    // nothing produced, nothing left to read and at the end? report the end
    if (!produced && !tb_static_stream_left(&istream) && filter->beof) produced = -1;

    // the end forces a sync
    if (filter->beof) sync = -1;

    // commit the produced output
    tb_queue_buffer_push_exit(&filter->odata, produced > 0? produced : 0);

    // keep the unread input for the next call
    tb_size_t unread = tb_static_stream_left(&istream);
    if (!unread)
    {
        // all the input was consumed
        tb_buffer_clear(&filter->idata);
    }
    else if (input != data)
    {
        // trace
        tb_trace_d("[%p]: move to the cache head: %lu", self, unread);

        // the input came from the cache: move the rest to the cache head
        tb_buffer_memnmov(&filter->idata, tb_static_stream_offset(&istream), unread);
    }
    else
    {
        // trace
        tb_trace_d("[%p]: append to the cache: %lu", self, unread);

        // the input was the caller's data: append the rest to the cache
        tb_buffer_memncat(&filter->idata, tb_static_stream_pos(&istream), unread);
    }

    // look at the queued output again
    output_maxn = 0;
    output = tb_queue_buffer_pull_init(&filter->odata, &output_maxn);
    if (sync)
    {
        // sync and has data? return it directly, otherwise keep the spak result
        if (output_maxn) produced = tb_min(output_maxn, need);
    }
    // no sync: the output stays queued until the needed size is reached
    else if (output_maxn >= need) produced = (tb_long_t)need;
    else produced = 0;

    // exit pull
    if (output) tb_queue_buffer_pull_exit(&filter->odata, produced > 0? produced : 0);

    // return it if have the output
    if (produced > 0) *pdata = output;

    // trace
    tb_trace_d("[%p]: spak: ileft: %lu, oleft: %lu, offset: %llu, limit: %lld, beof: %d: %ld", self, tb_buffer_size(&filter->idata), tb_queue_buffer_size(&filter->odata), filter->offset, filter->limit, filter->beof, produced);

    // ok?
    return produced;
}
tb_void_t tb_pool_data_dump(tb_cpointer_t data, tb_bool_t verbose, tb_char_t const* prefix) { // done tb_pool_data_head_t* data_head = tb_null; do { // no data? tb_assert_and_check_break(data); // the data head data_head = &(((tb_pool_data_head_t*)data)[-1]); // dump the head info tb_size_t data_limit = 256; if (data_head->debug.magic == TB_POOL_DATA_MAGIC) { // the data size tb_size_t data_size = (tb_size_t)data_head->size; // format the backtrace prefix tb_char_t backtrace_prefix[256] = {0}; tb_snprintf(backtrace_prefix, sizeof(backtrace_prefix), "%s ", prefix? prefix : ""); // dump backtrace tb_size_t nframe = 0; while (nframe < tb_arrayn(data_head->debug.backtrace) && data_head->debug.backtrace[nframe]) nframe++; tb_trace_i("%sdata: from: %s(): %u, %s", prefix? prefix : "", data_head->debug.func, data_head->debug.line, data_head->debug.file); tb_backtrace_dump(backtrace_prefix, data_head->debug.backtrace, nframe); // dump the data info tb_trace_i("%sdata: %p, size: %lu, patch: %x", prefix? prefix : "", data, data_size, ((tb_byte_t const*)data)[data_size]); // dump the first 256-bytes data if (data_size && verbose) { // the dump size tb_size_t dump_size = tb_min(data_size, data_limit); // dump it tb_trace_i("%sdata: first %lu-bytes:", prefix? prefix : "", dump_size); tb_pool_data_dump_data((tb_byte_t const*)data, dump_size); // dump the last 256-bytes data if (data_size > dump_size) { // the last data tb_byte_t const* data_last = tb_max((tb_byte_t const*)data + data_size - data_limit, (tb_byte_t const*)data + dump_size); // update the dump size dump_size = (tb_byte_t const*)data + data_size - data_last; // dump it tb_trace_i("%sdata: last %lu-bytes:", prefix? 
prefix : "", dump_size); tb_pool_data_dump_data(data_last, dump_size); } } } // for the public fixed_pool else if (data_head->debug.magic == TB_POOL_DATA_EMPTY_MAGIC) { // format the backtrace prefix tb_char_t backtrace_prefix[256] = {0}; tb_snprintf(backtrace_prefix, sizeof(backtrace_prefix), "%s ", prefix? prefix : ""); // dump backtrace tb_size_t nframe = 0; while (nframe < tb_arrayn(data_head->debug.backtrace) && data_head->debug.backtrace[nframe]) nframe++; tb_trace_i("%sdata: from: %s(): %u, %s", prefix? prefix : "", data_head->debug.func, data_head->debug.line, data_head->debug.file); tb_backtrace_dump(backtrace_prefix, data_head->debug.backtrace, nframe); // dump the data info tb_trace_i("%sdata: %p, size: fixed", prefix? prefix : "", data); } else { // dump the data head tb_trace_i("%sdata: invalid head:", prefix? prefix : ""); tb_pool_data_dump_data((tb_byte_t const*)data_head, sizeof(tb_pool_data_head_t)); // dump the first 256-bytes data tb_trace_i("%sdata: first %lu-bytes:", prefix? prefix : "", data_limit); tb_pool_data_dump_data((tb_byte_t const*)data, data_limit); } } while (0); }
int main (int argc, char *argv[]) { int i, k = 0, n; char *s; char **optarg = NULL; char *fn_in = NULL; char *fn_out = NULL; char *fn_app = NULL; char *blanks = NULL; char *fldseps = NULL; char *recseps = NULL; char *comment = NULL; char *isep = " "; char *impl = " <- "; char *dflt = " (%1S)"; char *format = dflt; int target = 's'; int min = 1; int max = INT_MAX; double supp = 0.1; double smax = 1.0; double conf = 0.8; int dir = 0; int eval = 0; int aggm = 0; double minval = 0.1; int prune = 0; double filter = 0.1; int sort = 2; int tree = 1; int heap = 1; int post = 0; int report = 0; int mode = APP_BODY|IST_PERFECT; int size; int wgt; int frq, body, head; int *items; clock_t t, tt, tc, x; #ifndef QUIET prgname = argv[0]; if (argc > 1) { fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } else { printf("usage: %s [options] infile outfile\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent item sets, c: closed item sets,\n" " m: maximal item sets, r: association rules)\n"); printf("-m# minimum number of items per set/rule " "(default: %d)\n", min); printf("-n# maximum number of items per set/rule " "(default: no limit)\n"); printf("-s# minimum support of a set/rule " "(default: %g%%)\n", supp *100); printf("-S# maximum support of a set/rule " "(default: %g%%)\n", smax *100); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-c# minimum confidence of a rule " "(default: %g%%)\n", conf *100); printf("infile file to read transactions from\n"); printf("outfile file to write item sets to\n"); return 0; } #endif for (i = 1; i < argc; i++) { s = argv[i]; if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { while (*s) { switch (*s++) { case '!': help(); break; case 't': target = (*s) ? 
*s++ : 's'; break; case 'm': min = (int)strtol(s, &s, 0); break; case 'n': max = (int)strtol(s, &s, 0); break; case 's': supp = 0.01*strtod(s, &s); break; case 'S': smax = 0.01*strtod(s, &s); break; case 'c': conf = 0.01*strtod(s, &s); break; case 'o': mode |= APP_BOTH; break; case 'e': eval = (*s) ? *s++ : 0; break; case 'a': aggm = (*s) ? *s++ : 0; break; case 'd': minval = 0.01*strtod(s, &s); break; case 'p': prune = (int)strtol(s, &s, 0); break; case 'g': report = ISR_SCAN; break; case 'k': optarg = &isep; break; case 'i': optarg = &impl; break; case 'v': optarg = &format; break; case 'l': dir = (int)strtol(s, &s, 0); break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'x': mode &= ~IST_PERFECT; break; case 'y': post = 1; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } else { switch (k++) { case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } } } if (optarg) error(E_OPTARG); if ((k < 2) || (k > 3)) error(E_ARGCNT); if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); switch (target) { case 's': target = TT_SET; break; case 'c': target = TT_CLOSED; break; case 'm': target = TT_MAXIMAL; break; case 'r': target = TT_RULE; break; default : error(E_TARGET, (char)target); break; } if (min < 0) error(E_SIZE, min); if (max < 0) error(E_SIZE, max); if (supp > 1) error(E_SUPP, supp); if ((conf < 0) || (conf > 1)) error(E_CONF, conf); switch (eval) { case 'x': case 0: eval = IST_NONE; break; case 'c': eval = IST_CONF; break; case 'd': eval = IST_DIFF; break; case 'l': eval = IST_LIFT; break; case 'a': eval = IST_LD21; break; case 'q': eval = IST_QUOT; break; case 'n': eval = IST_CHI2; break; case 
'p': eval = IST_PVAL; break; case 'i': eval = IST_INFO; break; case 'g': eval = IST_PGST; break; case 'b': eval = IST_LOGQ; break; default : error(E_MEASURE, (char)eval); break; } switch (aggm) { case 'x': case 0: aggm = IST_NONE; break; case 'm': aggm = IST_MIN; break; case 'n': aggm = IST_MAX; break; case 'a': aggm = IST_AVG; break; default : error(E_MEASURE, (char)aggm); break; } if ((target > TT_SET) || ((eval > IST_NONE) && (eval < IST_LOGQ))) mode &= ~IST_PERFECT; if (target <= TT_MAXIMAL) { mode |= APP_BOTH; conf = 1;} if ((filter <= -1) || (filter >= 1)) filter = 0; ibase = ib_create(-1); if (!ibase) error(E_NOMEM); ib_chars(ibase, blanks, fldseps, recseps, comment); MSG(stderr, "\n"); if (fn_app) { t = clock(); if (*fn_app) in = fopen(fn_app, "r"); else { in = stdin; fn_app = "<stdin>"; } MSG(stderr, "reading %s ... ", fn_app); if (!in) error(E_FOPEN, fn_app); k = ib_readapp(ibase, in); if (k != 0) error(k, fn_app, RECCNT(ibase), BUFFER(ibase)); if (in != stdin) fclose(in); in = NULL; MSG(stderr, "[%d item(s)]", ib_cnt(ibase)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } t = clock(); if (fn_in && *fn_in) in = fopen(fn_in, "r"); else { in = stdin; fn_in = "<stdin>"; } MSG(stderr, "reading %s ... ", fn_in); if (!in) error(E_FOPEN, fn_in); tabag = tb_create(ibase); if (!tabag) error(E_NOMEM); while (1) { k = ib_read(ibase, in); if (k) { if (k > 0) break; error(k, fn_in, RECCNT(ibase), BUFFER(ibase)); } if (tb_add(tabag, NULL) != 0) error(E_NOMEM); } if (in != stdin) fclose(in); in = NULL; n = ib_cnt(ibase); k = tb_cnt(tabag); wgt = tb_wgt(tabag); MSG(stderr, "[%d item(s), ", n); if (k == wgt) MSG(stderr, "%d transaction(s)]", k); else MSG(stderr, "%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].", SEC_SINCE(t)); if ((n <= 0) || (wgt <= 0)) error(E_NOTRANS); MSG(stderr, "\n"); if (format == dflt) { if (target != TT_RULE) format = (supp < 0) ? " (%a)" : " (%1S)"; else format = (supp < 0) ? 
" (%b, %1C)" : " (%1X, %1C)"; } supp = ceil (((supp < 0) ? -100 : wgt) *supp); smax = floor(((smax < 0) ? -100 : wgt) *smax); t = clock(); MSG(stderr, "filtering, sorting and recoding items ... "); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); k = (int)((mode & APP_HEAD) ? supp : ceil(supp *conf)); n = ib_recode(ibase, k, sort, map); tb_recode(tabag, map); tb_itsort(tabag, 1, heap); free(map); map = NULL; MSG(stderr, "[%d item(s)] done [%.2fs].", n, SEC_SINCE(t)); if (n <= 0) error(E_NOFREQ); MSG(stderr, "\n"); k = tb_max(tabag); if (max > k) max = k; t = clock(); MSG(stderr, "reducing transactions ... "); tb_filter(tabag, min, NULL); tb_sort(tabag, 1, heap); k = tb_reduce(tabag); if (k == wgt) MSG(stderr, "[%d transaction(s)]", k); else MSG(stderr, "[%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = 0; if (tree) { t = clock(); MSG(stderr, "building transaction tree ... "); tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); if (filter == 0) { tb_delete(tabag, 0); tabag = NULL; } MSG(stderr, "[%d node(s)]", tt_nodecnt(tatree)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = clock() -t; } t = clock(); tc = 0; istree = ist_create(ibase, mode, (int)supp, (int)smax, conf); if (!istree) error(E_NOMEM); ist_seteval(istree, eval, aggm, minval, prune); /* --- check item subsets --- */ MSG(stderr, "checking subsets of size 1"); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); while (1) { size = ist_height(istree); if (size >= max) break; if ((filter != 0) && (ist_check(istree, map) <= size)) break; if (post) ist_prune(istree); k = ist_addlvl(istree); if (k) { if (k > 0) break; error(E_NOMEM); } if (((filter < 0) && (i < -filter *n)) || ((filter > 0) && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; x = clock(); tb_filter(tabag, size+1, map); tb_sort(tabag, 0, heap); tb_reduce(tabag); if (tatree) { tt_delete(tatree, 0); tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); } tt = clock() -x; } 
MSG(stderr, " %d", ++size); x = clock(); if (tatree) ist_countx(istree, tatree); else ist_countb(istree, tabag); tc = clock() -x; } free(map); map = NULL; MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) { t = clock(); MSG(stderr, "filtering for %s item sets ... ", (target == TT_MAXIMAL) ? "maximal" : "closed"); k = target | ((prune < 0) ? IST_EVAL : 0); ist_mark(istree, k); MSG(stderr, "done [%.2fs].\n", SEC_SINCE(t)); } t = clock(); if (fn_out && *fn_out) out = fopen(fn_out, "w"); else { out = stdout; fn_out = "<stdout>"; } MSG(stderr, "writing %s ... ", fn_out); if (!out) error(E_FOPEN, fn_out); if (eval == IST_LOGQ) report |= ISR_LOGS; if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) report |= ISR_CLOSED; isrep = isr_create(ibase, out, report, isep, impl); if (!isrep) error(E_NOMEM); isr_setfmt (isrep, format); isr_setsize(isrep, min, max); ist_setsize(istree, min, max, dir); ist_init (istree); items = t_items(ib_tract(ibase)); if ((target <= TT_MAXIMAL) && (dir == 0)) { if (eval == IST_LOGQ) isr_seteval(isrep, isr_logq, NULL, minval); else if (eval > IST_NONE) isr_seteval(isrep, ist_evalx, istree, minval); n = ist_report(istree, isrep); } else if (target <= TT_MAXIMAL) { for (n = 0; 1; ) { k = ist_set(istree, items, &frq, &minval); if (k < 0) break; if (k > 0) fputs(isr_name(isrep, items[0]), out); for (i = 0; ++i < k; ) { fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) isr_sinfo(isrep, frq, minval); fputc('\n', out); n++; } } else if (target == TT_RULE) { for (n = 0; 1; ) { k = ist_rule(istree, items, &frq, &body, &head, &minval); if (k < 0) break; fputs(isr_name(isrep, items[0]), out); fputs(impl, out); if (k > 1) fputs(isr_name(isrep, items[1]), out); for (i = 1; ++i < k; ) { fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) isr_rinfo(isrep, frq, body, head, minval); fputc('\n', out); n++; } } / if (fflush(out) != 0) error(E_FWRITE, fn_out);
int main (int argc, char *argv[]) { /* --- main function */ int i, k = 0, n; /* loop variables, counters */ char *s; /* to traverse the options */ char **optarg = NULL; /* option argument */ char *fn_in = NULL; /* name of input file */ char *fn_out = NULL; /* name of output file */ char *fn_app = NULL; /* name of item appearances file */ char *blanks = NULL; /* blanks */ char *fldseps = NULL; /* field separators */ char *recseps = NULL; /* record separators */ char *comment = NULL; /* comment indicators */ char *isep = " "; /* item separator for output */ char *impl = " <- "; /* implication sign for ass. rules */ char *dflt = " (%1S)"; /* default format for check */ char *format = dflt; /* format for information output */ int target = 's'; /* target type (sets/rules/h.edges) */ int min = 1; /* minimum rule/item set size */ int max = INT_MAX; /* maximum rule/item set size */ double supp = 10; /* minimum support (in percent) */ double smax = 100; /* maximum support (in percent) */ double conf = 80; /* minimum confidence (in percent) */ int dir = 0; /* direction for size sorting */ int eval = 0; /* additional evaluation measure */ int zero = 0; /* flag for zero eval. below expect. */ int aggm = 0; /* aggregation mode for eval. measure */ double minval = 10; /* minimum evaluation measure value */ int prune = 0; /* (min. size for) evaluation pruning */ double filter = 0.1; /* item usage filtering parameter */ int sort = 2; /* flag for item sorting and recoding */ int tree = 1; /* flag for transaction tree */ int heap = 1; /* flag for heap sort vs. 
quick sort */ int post = 0; /* flag for a-posteriori pruning */ int report = 0; /* other flags for reporting */ int mode = APP_BODY|IST_PERFECT; /* search mode */ int size; /* current item set size */ int wgt; /* total transaction weight */ int frq, body, head; /* frequency of an item set */ int *items; /* item set (for reporting) */ clock_t t, tt, tc, x; /* timers for measurements */ #ifndef QUIET /* if not quiet version */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc > 1) { /* if arguments are given */ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } /* print a startup message */ else { /* if no arguments are given */ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent item sets, c: closed item sets,\n" " m: maximal item sets, r: association rules)\n"); printf("-m# minimum number of items per set/rule " "(default: %d)\n", min); printf("-n# maximum number of items per set/rule " "(default: no limit)\n"); printf("-s# minimum support of a set/rule " "(default: %g%%)\n", supp); printf("-S# maximum support of a set/rule " "(default: %g%%)\n", smax); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-c# minimum confidence of a rule " "(default: %g%%)\n", conf); printf("-o use the original rule support definition " "(body & head)\n"); printf("-e# additional evaluation measure " "(default: none)\n"); printf("-a# aggregation mode for evaluation measure " "(default: none)\n"); printf("-z zero evaluation below expected support " "(default: evaluate all)\n"); printf("-d# minimum value of add. evaluation measure " "(default: %g%%)\n", minval); printf("-p# (min. 
size for) pruning with evaluation " "(default: no pruning)\n"); printf(" (< 0: backward, > 0: forward)\n"); printf("-l# sort item sets in output by their size " "(default: no sorting)\n"); printf(" (< 0: descending, > 0: ascending)\n"); printf("-g write item names in scanable form " "(quote certain characters)\n"); printf("-k# item separator for output " "(default: \"%s\")\n", isep); printf("-i# implication sign for association rules " "(default: \"%s\")\n", impl); printf("-v# output format for set/rule information " "(default: \"%s\")\n", format); printf("-q# sort items w.r.t. their frequency " "(default: %d)\n", sort); printf(" (1: ascending, -1: descending, 0: do not sort,\n" " 2: ascending, -2: descending w.r.t. " "transaction size sum)\n"); printf("-u# filter unused items from transactions " "(default: %g)\n", filter); printf(" (0: do not filter items w.r.t. usage in sets,\n" " <0: fraction of removed items for filtering,\n" " >0: take execution times ratio into account)\n"); printf("-j use quicksort to sort the transactions " "(default: heapsort)\n"); printf("-x do not prune the search " "with perfect extensions\n"); printf("-y a-posteriori pruning of infrequent item sets\n"); printf("-h do not organize transactions as a prefix tree\n"); printf("-b# blank characters " "(default: \" \\t\\r\")\n"); printf("-f# field separators " "(default: \" \\t,\")\n"); printf("-r# record separators " "(default: \"\\n\")\n"); printf("-C# comment characters " "(default: \"#\")\n"); printf("-! 
print additional option information\n"); printf("infile file to read transactions from\n"); printf("outfile file to write item sets/association rules" "/hyperedges to\n"); printf("appfile file stating item appearances (optional)\n"); return 0; /* print a usage message */ } /* and abort the program */ #endif /* #ifndef QUIET */ /* free option characters: w [A-Z]\[SC] */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse the arguments */ s = argv[i]; /* get an option argument */ if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { /* -- if argument is an option */ while (*s) { /* traverse the options */ switch (*s++) { /* evaluate the options */ case '!': help(); break; case 't': target = (*s) ? *s++ : 's'; break; case 'm': min = (int)strtol(s, &s, 0); break; case 'n': max = (int)strtol(s, &s, 0); break; case 's': supp = strtod(s, &s); break; case 'S': smax = strtod(s, &s); break; case 'c': conf = strtod(s, &s); break; case 'o': mode |= APP_BOTH; break; case 'e': eval = (*s) ? *s++ : 0; break; case 'z': zero = IST_ZERO; break; case 'a': aggm = (*s) ? 
*s++ : 0; break; case 'd': minval = strtod(s, &s); break; case 'p': prune = (int)strtol(s, &s, 0); break; case 'g': report = ISR_SCAN; break; case 'k': optarg = &isep; break; case 'i': optarg = &impl; break; case 'v': optarg = &format; break; case 'l': dir = (int)strtol(s, &s, 0); break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'x': mode &= ~IST_PERFECT; break; case 'y': post = 1; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } /* set the option variables */ if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } /* get an option argument */ else { /* -- if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } /* note filenames */ } } if (optarg) error(E_OPTARG); /* check the option argument */ if ((k < 2) || (k > 3)) /* and the number of arguments */ error(E_ARGCNT); /* (either in/out or in/out/app) */ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); /* stdin must not be used twice */ switch (target) { /* check and translate target type */ case 's': target = TT_ALL; break; case 'c': target = TT_CLOSED; break; case 'm': target = TT_MAXIMAL; break; case 'r': target = TT_RULE; break; default : error(E_TARGET, (char)target); break; } if (min < 0) error(E_SIZE, min); /* check the limits */ if (max < 0) error(E_SIZE, max); /* for the set size */ if (supp > 100) /* check the minimum support */ error(E_SUPP, supp); /* (< 0: absolute support) */ if ((conf < 0) || (conf > 100)) error(E_CONF, conf); /* check the minimum confidence */ switch (eval) { /* check and translate measure */ case 'x': case 0: eval = IST_NONE; break; case 'c': eval = IST_CONF; break; case 'd': eval = 
IST_CONF_DIFF; break; case 'l': eval = IST_LIFT; break; case 'a': eval = IST_LIFT_DIFF; break; case 'q': eval = IST_LIFT_QUOT; break; case 'v': eval = IST_CVCT; break; case 'e': eval = IST_CVCT_DIFF; break; case 'r': eval = IST_CVCT_QUOT; break; case 'f': eval = IST_CERT; break; case 'n': eval = IST_CHI2; break; case 'p': eval = IST_CHI2_PVAL; break; case 'i': eval = IST_INFO; break; case 'g': eval = IST_INFO_PVAL; break; case 'b': eval = IST_LOGQ; break; default : error(E_MEASURE, (char)eval); break; } switch (aggm) { /* check and translate agg. mode */ case 'x': case 0: aggm = IST_NONE; break; case 'm': aggm = IST_MIN; break; case 'n': aggm = IST_MAX; break; case 'a': aggm = IST_AVG; break; default : error(E_MEASURE, (char)aggm); break; } if ((target > TT_ALL) /* if individual set counters needed */ || ((eval > IST_NONE) && (eval < IST_LOGQ))) mode &= ~IST_PERFECT; /* remove perfect extension pruning */ if (target <= TT_MAXIMAL) { /* remove rule specific settings */ mode |= APP_BOTH; conf = 100; } if ((filter <= -1) || (filter >= 1)) filter = 0; /* check and adapt the filter option */ if (dir) /* if to sort output by size, */ mode &= ~IST_PERFECT; /* do not use perfect ext. pruning */ /* --- create item base --- */ ibase = ib_create(0, 0); /* create an item base and */ if (!ibase) error(E_NOMEM); /* set the special characters */ ib_chars(ibase, blanks, fldseps, recseps, "", comment); MSG(stderr, "\n"); /* terminate the startup message */ /* --- read item appearance indicators --- */ if (fn_app) { /* if item appearances are given */ t = clock(); /* start the timer for the reading */ if (*fn_app) /* if an app. file name is given, */ in = fopen(fn_app, "r"); /* open the item appearances file */ else { /* if no app. file name is given, */ in = stdin; fn_app = "<stdin>"; } /* read from std. input */ MSG(stderr, "reading %s ... 
", fn_app); if (!in) error(E_FOPEN, fn_app); k = ib_readapp(ibase, in); /* read the item appearances */ if (k != 0) error(k, fn_app, RECCNT(ibase), BUFFER(ibase)); if (in != stdin) fclose(in);/* if not read from standard input, */ in = NULL; /* close the input file */ MSG(stderr, "[%d item(s)]", ib_cnt(ibase)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- read transactions --- */ t = clock(); /* start the timer for the reading */ if (fn_in && *fn_in) /* if an input file name is given, */ in = fopen(fn_in, "r"); /* open input file for reading */ else { /* if no input file name is given, */ in = stdin; fn_in = "<stdin>"; } /* read from standard input */ MSG(stderr, "reading %s ... ", fn_in); if (!in) error(E_FOPEN, fn_in); tabag = tb_create(ibase, 0); /* create a transaction bag/multiset */ if (!tabag) error(E_NOMEM); /* to store the transactions */ while (1) { /* transaction read loop */ k = ib_read(ibase, in); /* read the next transaction */ if (k) { if (k > 0) break; /* check for error and end of file */ error(k, fn_in, RECCNT(ibase), BUFFER(ibase)); } if (tb_add(tabag, NULL) != 0) error(E_NOMEM); } /* add transaction to bag/multiset */ if (in != stdin) fclose(in); /* if not read from standard input, */ in = NULL; /* close the input file */ n = ib_cnt(ibase); /* get the number of items */ k = tb_cnt(tabag); /* get the number of transactions */ wgt = tb_wgt(tabag); /* the total transaction weight */ MSG(stderr, "[%d item(s), ", n); if (k == wgt) MSG(stderr, "%d transaction(s)]", k); else MSG(stderr, "%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].", SEC_SINCE(t)); if ((n <= 0) || (wgt <= 0)) /* check for at least one item */ error(E_NOTRANS); /* and at least one transaction */ MSG(stderr, "\n"); /* terminate the log message */ if (format == dflt) { /* if default info. format is used */ if (target != TT_RULE) format = (supp < 0) ? " (%a)" : " (%1S)"; else format = (supp < 0) ? 
" (%b, %1C)" : " (%1X, %1C)"; } /* set default according to target */ supp = ceil ((supp >= 0) ? 0.01 *supp *wgt : -supp); smax = floor((smax >= 0) ? 0.01 *smax *wgt : -smax); conf *= 0.01; /* transform support and confidence */ /* --- sort and recode items --- */ t = clock(); /* compute absolute support values */ MSG(stderr, "filtering, sorting and recoding items ... "); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create an item identifier map */ k = (int)((mode & APP_HEAD) ? supp : ceil(supp *conf)); n = ib_recode(ibase, k, sort, map); tb_recode(tabag, map); /* recode the items and transactions */ tb_itsort(tabag, 1, heap); /* and sort items in transactions */ free(map); map = NULL; /* delete the item identifier map */ MSG(stderr, "[%d item(s)] done [%.2fs].", n, SEC_SINCE(t)); if (n <= 0) error(E_NOFREQ); /* print a log message and */ MSG(stderr, "\n"); /* check the number of items */ k = tb_max(tabag); /* clamp the set/rule length to */ if (max > k) max = k; /* the maximum transaction size */ /* --- reduce transactions --- */ t = clock(); /* start the timer for the reduction */ MSG(stderr, "reducing transactions ... "); tb_filter(tabag, min, NULL); /* remove items of short transactions */ tb_sort(tabag, 1, heap); /* sort the trans. lexicographically */ k = tb_reduce(tabag); /* reduce transactions to unique ones */ if (k == wgt) MSG(stderr, "[%d transaction(s)]", k); else MSG(stderr, "[%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- create transaction tree --- */ tt = 0; /* init. the tree construction time */ if (tree) { /* if to use a transaction tree */ t = clock(); /* start the timer for construction */ MSG(stderr, "building transaction tree ... 
"); tatree = tt_create(tabag); /* create a transaction tree */ if (!tatree) error(E_NOMEM); if (filter == 0) { /* if not to filter items, */ tb_delete(tabag, 0); /* delete the transaction bag */ tabag = NULL; /* (redundant data storage) */ } MSG(stderr, "[%d node(s)]", tt_nodecnt(tatree)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = clock() -t; /* note the time for the construction */ } /* of the transaction tree */ /* --- create item set tree --- */ t = clock(); tc = 0; /* start the timer for the search */ istree = ist_create(ibase, mode, (int)supp, (int)smax, conf); if (!istree) error(E_NOMEM); /* create an item set tree */ ist_seteval(istree, eval|zero, aggm, 0.01*minval, prune); /* --- check item subsets --- */ MSG(stderr, "checking subsets of size 1"); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create a filter map */ while (1) { /* traverse the item set sizes */ size = ist_height(istree); /* get the current item set size and */ if (size >= max) break; /* abort if maximal size is reached */ if ((filter != 0) /* if to filter w.r.t. item usage */ && (ist_check(istree, map) <= size)) break; /* check which items are still used */ if (post) /* if a-posteriori pruning requested, */ ist_prune(istree); /* prune infrequent item sets */ k = ist_addlvl(istree); /* while max. height is not reached, */ if (k) { if (k > 0) break; /* add a level to the item set tree */ error(E_NOMEM); } /* if no level was added, abort */ if (((filter < 0) /* if to filter w.r.t. item usage */ && (i < -filter *n)) /* and enough items were removed */ || ((filter > 0) /* or counting time is long enough */ && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; /* note the new number of items */ x = clock(); /* start the timer for filtering */ tb_filter(tabag, size+1, map); tb_sort(tabag, 0, heap); /* remove unnec. items and trans. */ tb_reduce(tabag); /* and reduce trans. 
to unique ones */ if (tatree) { /* if a transaction tree was created */ tt_delete(tatree, 0); /* delete the transaction tree */ tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); } /* rebuild the transaction tree */ tt = clock() -x; /* note the filter/rebuild time */ } MSG(stderr, " %d", ++size); /* print the current item set size */ x = clock(); /* start the timer for counting */ if (tatree) ist_countx(istree, tatree); else ist_countb(istree, tabag); tc = clock() -x; /* count the transaction tree/bag */ } /* and compute the new counting time */ free(map); map = NULL; /* delete the filter map */ MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- filter found item sets --- */ if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) { t = clock(); /* start the timer for filtering */ MSG(stderr, "filtering for %s item sets ... ", (target == TT_MAXIMAL) ? "maximal" : "closed"); k = target | ((prune < 0) ? IST_EVAL : 0); ist_mark(istree, k); /* filter closed/maximal item sets */ MSG(stderr, "done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- print item sets/rules/hyperedges --- */ t = clock(); /* start the timer for the output */ if (fn_out && *fn_out) /* if an output file name is given, */ out = fopen(fn_out, "w"); /* open the output file */ else { /* if no output file name is given, */ out = stdout; fn_out = "<stdout>"; } /* write to std. output */ MSG(stderr, "writing %s ... 
", fn_out); if (!out) error(E_FOPEN, fn_out); if (eval == IST_LOGQ) report |= ISR_LOGS; if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) report |= ISR_NOEXP; /* combine the report mode flags */ isrep = isr_create(ibase, out, report, isep, impl); if (!isrep) error(E_NOMEM); /* create an item set reporter */ isr_setfmt (isrep, format); /* and configure it */ isr_setsize(isrep, min, max); ist_setsize(istree, min, max, dir); ist_init (istree); /* initialize the extraction */ items = t_items(ib_tract(ibase)); if ((target <= TT_MAXIMAL) /* if to find frequent item sets */ && (dir == 0)) { /* and not to sort them by size */ if (eval == IST_LOGQ) /* if to compute an add. evaluation */ isr_seteval(isrep, isr_logq, NULL, 0.01*minval); else if (eval > IST_NONE) /* set the add. evaluation function */ isr_seteval(isrep, ist_evalx, istree, 0.01*minval); n = ist_report(istree, isrep); } /* report the item sets */ else if (target <= TT_MAXIMAL) { /* if to find frequent item sets */ for (n = 0; 1; ) { /* extract item sets from the tree */ k = ist_set(istree, items, &frq, &minval); if (k < 0) break; /* get the next frequent item set */ if (k > 0) fputs(isr_name(isrep, items[0]), out); for (i = 0; ++i < k; ) { /* print the item names */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if requested, print information */ isr_sinfo(isrep, frq, minval); fputc('\n', out); n++; /* terminate the output line and */ } } /* count the reported item set */ else if (target == TT_RULE) { /* if to find association rules, */ for (n = 0; 1; ) { /* extract rules from tree */ k = ist_rule(istree, items, &frq, &body, &head, &minval); if (k < 0) break; /* get the next association rule */ fputs(isr_name(isrep, items[0]), out); fputs(impl, out); /* print name of rule head item */ if (k > 1) fputs(isr_name(isrep, items[1]), out); for (i = 1; ++i < k; ) { /* print names of items in rule body */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if 
requested, print information */ isr_rinfo(isrep, frq, body, head, minval); fputc('\n', out); n++; /* terminate the output line and */ } /* count the reported ass. rule */ } /* if (target <= TT_MAXIMAL) .. else .. */ if (fflush(out) != 0) error(E_FWRITE, fn_out); if (out != stdout) fclose(out); out = NULL; /* close the output file */ MSG(stderr, "[%d %s(s)] done ", n, (target == TT_RULE) ? "rule" : "set"); MSG(stderr, "[%.2fs].\n", SEC_SINCE(t)); #ifdef BENCH /* if benchmark version, */ ist_stats(istree); /* show the search statistics */ #endif /* (especially memory usage) */ /* --- clean up --- */ #ifndef NDEBUG /* if this is a debug version */ isr_delete(isrep, 0); /* the item set reporter, */ ist_delete(istree); /* the item set tree, */ if (tatree) tt_delete(tatree, 0); /* the transaction tree, */ if (tabag) tb_delete(tabag, 0); /* the transaction bag, */ ib_delete(ibase); /* and the item base */ #endif #ifdef STORAGE /* if storage debugging */ showmem("at end of program"); /* check memory usage */ #endif return 0; /* return 'ok' */ } /* main() */