int main (int argc, char *argv[]) { int i, k = 0, n; char *s; char **optarg = NULL; char *fn_in = NULL; char *fn_out = NULL; char *fn_app = NULL; char *blanks = NULL; char *fldseps = NULL; char *recseps = NULL; char *comment = NULL; char *isep = " "; char *impl = " <- "; char *dflt = " (%1S)"; char *format = dflt; int target = 's'; int min = 1; int max = INT_MAX; double supp = 0.1; double smax = 1.0; double conf = 0.8; int dir = 0; int eval = 0; int aggm = 0; double minval = 0.1; int prune = 0; double filter = 0.1; int sort = 2; int tree = 1; int heap = 1; int post = 0; int report = 0; int mode = APP_BODY|IST_PERFECT; int size; int wgt; int frq, body, head; int *items; clock_t t, tt, tc, x; #ifndef QUIET prgname = argv[0]; if (argc > 1) { fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } else { printf("usage: %s [options] infile outfile\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent item sets, c: closed item sets,\n" " m: maximal item sets, r: association rules)\n"); printf("-m# minimum number of items per set/rule " "(default: %d)\n", min); printf("-n# maximum number of items per set/rule " "(default: no limit)\n"); printf("-s# minimum support of a set/rule " "(default: %g%%)\n", supp *100); printf("-S# maximum support of a set/rule " "(default: %g%%)\n", smax *100); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-c# minimum confidence of a rule " "(default: %g%%)\n", conf *100); printf("infile file to read transactions from\n"); printf("outfile file to write item sets to\n"); return 0; } #endif for (i = 1; i < argc; i++) { s = argv[i]; if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { while (*s) { switch (*s++) { case '!': help(); break; case 't': target = (*s) ? *s++ : 's'; break; case 'm': min = (int)strtol(s, &s, 0); break; case 'n': max = (int)strtol(s, &s, 0); break; case 's': supp = 0.01*strtod(s, &s); break; case 'S': smax = 0.01*strtod(s, &s); break; case 'c': conf = 0.01*strtod(s, &s); break; case 'o': mode |= APP_BOTH; break; case 'e': eval = (*s) ? *s++ : 0; break; case 'a': aggm = (*s) ? *s++ : 0; break; case 'd': minval = 0.01*strtod(s, &s); break; case 'p': prune = (int)strtol(s, &s, 0); break; case 'g': report = ISR_SCAN; break; case 'k': optarg = &isep; break; case 'i': optarg = &impl; break; case 'v': optarg = &format; break; case 'l': dir = (int)strtol(s, &s, 0); break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'x': mode &= ~IST_PERFECT; break; case 'y': post = 1; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } else { switch (k++) { case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } } } if (optarg) error(E_OPTARG); if ((k < 2) || (k > 3)) error(E_ARGCNT); if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); switch (target) { case 's': target = TT_SET; break; case 'c': target = TT_CLOSED; break; case 'm': target = TT_MAXIMAL; break; case 'r': target = TT_RULE; break; default : error(E_TARGET, (char)target); break; } if (min < 0) error(E_SIZE, min); if (max < 0) error(E_SIZE, max); if (supp > 1) error(E_SUPP, supp); if ((conf < 0) || (conf > 1)) error(E_CONF, conf); switch (eval) { case 'x': case 0: eval = IST_NONE; break; case 'c': eval = IST_CONF; break; case 'd': eval = IST_DIFF; break; case 'l': eval = IST_LIFT; break; case 'a': eval = IST_LD21; break; case 'q': eval = IST_QUOT; break; case 'n': eval = IST_CHI2; break; case 'p': eval = IST_PVAL; break; case 'i': eval = IST_INFO; break; case 'g': eval = IST_PGST; break; case 'b': eval = IST_LOGQ; break; default : error(E_MEASURE, (char)eval); break; } switch (aggm) { case 'x': case 0: aggm = IST_NONE; break; case 'm': aggm = IST_MIN; break; case 'n': aggm = IST_MAX; break; case 'a': aggm = IST_AVG; break; default : error(E_MEASURE, (char)aggm); break; } if ((target > TT_SET) || ((eval > IST_NONE) && (eval < IST_LOGQ))) mode &= ~IST_PERFECT; if (target <= TT_MAXIMAL) { mode |= APP_BOTH; conf = 1;} if ((filter <= -1) || (filter >= 1)) filter = 0; ibase = ib_create(-1); if (!ibase) error(E_NOMEM); ib_chars(ibase, blanks, fldseps, recseps, comment); MSG(stderr, "\n"); if (fn_app) { t = clock(); if (*fn_app) in = fopen(fn_app, "r"); else { in = stdin; fn_app = "<stdin>"; } MSG(stderr, "reading %s ... ", fn_app); if (!in) error(E_FOPEN, fn_app); k = ib_readapp(ibase, in); if (k != 0) error(k, fn_app, RECCNT(ibase), BUFFER(ibase)); if (in != stdin) fclose(in); in = NULL; MSG(stderr, "[%d item(s)]", ib_cnt(ibase)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } t = clock(); if (fn_in && *fn_in) in = fopen(fn_in, "r"); else { in = stdin; fn_in = "<stdin>"; } MSG(stderr, "reading %s ... ", fn_in); if (!in) error(E_FOPEN, fn_in); tabag = tb_create(ibase); if (!tabag) error(E_NOMEM); while (1) { k = ib_read(ibase, in); if (k) { if (k > 0) break; error(k, fn_in, RECCNT(ibase), BUFFER(ibase)); } if (tb_add(tabag, NULL) != 0) error(E_NOMEM); } if (in != stdin) fclose(in); in = NULL; n = ib_cnt(ibase); k = tb_cnt(tabag); wgt = tb_wgt(tabag); MSG(stderr, "[%d item(s), ", n); if (k == wgt) MSG(stderr, "%d transaction(s)]", k); else MSG(stderr, "%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].", SEC_SINCE(t)); if ((n <= 0) || (wgt <= 0)) error(E_NOTRANS); MSG(stderr, "\n"); if (format == dflt) { if (target != TT_RULE) format = (supp < 0) ? " (%a)" : " (%1S)"; else format = (supp < 0) ? " (%b, %1C)" : " (%1X, %1C)"; } supp = ceil (((supp < 0) ? -100 : wgt) *supp); smax = floor(((smax < 0) ? -100 : wgt) *smax); t = clock(); MSG(stderr, "filtering, sorting and recoding items ... "); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); k = (int)((mode & APP_HEAD) ? supp : ceil(supp *conf)); n = ib_recode(ibase, k, sort, map); tb_recode(tabag, map); tb_itsort(tabag, 1, heap); free(map); map = NULL; MSG(stderr, "[%d item(s)] done [%.2fs].", n, SEC_SINCE(t)); if (n <= 0) error(E_NOFREQ); MSG(stderr, "\n"); k = tb_max(tabag); if (max > k) max = k; t = clock(); MSG(stderr, "reducing transactions ... "); tb_filter(tabag, min, NULL); tb_sort(tabag, 1, heap); k = tb_reduce(tabag); if (k == wgt) MSG(stderr, "[%d transaction(s)]", k); else MSG(stderr, "[%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = 0; if (tree) { t = clock(); MSG(stderr, "building transaction tree ... "); tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); if (filter == 0) { tb_delete(tabag, 0); tabag = NULL; } MSG(stderr, "[%d node(s)]", tt_nodecnt(tatree)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = clock() -t; } t = clock(); tc = 0; istree = ist_create(ibase, mode, (int)supp, (int)smax, conf); if (!istree) error(E_NOMEM); ist_seteval(istree, eval, aggm, minval, prune); /* --- check item subsets --- */ MSG(stderr, "checking subsets of size 1"); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); while (1) { size = ist_height(istree); if (size >= max) break; if ((filter != 0) && (ist_check(istree, map) <= size)) break; if (post) ist_prune(istree); k = ist_addlvl(istree); if (k) { if (k > 0) break; error(E_NOMEM); } if (((filter < 0) && (i < -filter *n)) || ((filter > 0) && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; x = clock(); tb_filter(tabag, size+1, map); tb_sort(tabag, 0, heap); tb_reduce(tabag); if (tatree) { tt_delete(tatree, 0); tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); } tt = clock() -x; } MSG(stderr, " %d", ++size); x = clock(); if (tatree) ist_countx(istree, tatree); else ist_countb(istree, tabag); tc = clock() -x; } free(map); map = NULL; MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) { t = clock(); MSG(stderr, "filtering for %s item sets ... ", (target == TT_MAXIMAL) ? "maximal" : "closed"); k = target | ((prune < 0) ? IST_EVAL : 0); ist_mark(istree, k); MSG(stderr, "done [%.2fs].\n", SEC_SINCE(t)); } t = clock(); if (fn_out && *fn_out) out = fopen(fn_out, "w"); else { out = stdout; fn_out = "<stdout>"; } MSG(stderr, "writing %s ... ", fn_out); if (!out) error(E_FOPEN, fn_out); if (eval == IST_LOGQ) report |= ISR_LOGS; if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) report |= ISR_CLOSED; isrep = isr_create(ibase, out, report, isep, impl); if (!isrep) error(E_NOMEM); isr_setfmt (isrep, format); isr_setsize(isrep, min, max); ist_setsize(istree, min, max, dir); ist_init (istree); items = t_items(ib_tract(ibase)); if ((target <= TT_MAXIMAL) && (dir == 0)) { if (eval == IST_LOGQ) isr_seteval(isrep, isr_logq, NULL, minval); else if (eval > IST_NONE) isr_seteval(isrep, ist_evalx, istree, minval); n = ist_report(istree, isrep); } else if (target <= TT_MAXIMAL) { for (n = 0; 1; ) { k = ist_set(istree, items, &frq, &minval); if (k < 0) break; if (k > 0) fputs(isr_name(isrep, items[0]), out); for (i = 0; ++i < k; ) { fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) isr_sinfo(isrep, frq, minval); fputc('\n', out); n++; } } else if (target == TT_RULE) { for (n = 0; 1; ) { k = ist_rule(istree, items, &frq, &body, &head, &minval); if (k < 0) break; fputs(isr_name(isrep, items[0]), out); fputs(impl, out); if (k > 1) fputs(isr_name(isrep, items[1]), out); for (i = 1; ++i < k; ) { fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) isr_rinfo(isrep, frq, body, head, minval); fputc('\n', out); n++; } } / if (fflush(out) != 0) error(E_FWRITE, fn_out);
int main (int argc, char *argv[]) { /* --- main function */ int i, k = 0; /* loop variables */ char *s; /* to traverse the options */ CCHAR **optarg = NULL; /* option argument */ CCHAR *fn_inp = NULL; /* name of input file */ CCHAR *fn_out = NULL; /* name of output file */ CCHAR *fn_sel = NULL; /* name of item selection file */ #ifdef ISR_PATSPEC /* if to allow a pattern spectrum */ CCHAR *fn_psp = NULL; /* name of pattern spectrum file */ #endif CCHAR *recseps = NULL; /* record separators */ CCHAR *fldseps = NULL; /* field separators */ CCHAR *blanks = NULL; /* blank characters */ CCHAR *comment = NULL; /* comment characters */ CCHAR *hdr = ""; /* record header for output */ CCHAR *sep = " "; /* item separator for output */ CCHAR *dflt = " (%1S)"; /* default format for check */ CCHAR *format = dflt; /* format for information output */ int target = 's'; /* target type (closed/maximal) */ ITEM min = 1; /* minimum size of an item set */ ITEM max = 16; /* maximum size of an item set */ double supp = 10; /* minimum support (in percent) */ int eval = 'x'; /* additional evaluation measure */ double minval = 10; /* minimum evaluation measure value */ int sort = 2; /* flag for item sorting and recoding */ int dir = +1; /* item processing order */ long repeat = 1; /* number of repetitions */ int mtar = 0; /* mode for transaction reading */ int mrep = 0; /* mode for item set reporting */ int stats = 0; /* flag for item set statistics */ ITEM m; /* number of items */ TID n; /* number of transactions */ SUPP w; /* total transaction weight */ clock_t t; /* timer for measurements */ ISEVALFN *evalfn = (ISEVALFN*)0; /* evaluation function */ #ifndef QUIET /* if not quiet version */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc > 1) { /* if arguments are given */ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } /* print a startup message */ else { /* if no arguments given */ printf("usage: %s [options] infile [outfile]\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent, c: closed, m: maximal item sets, " "g: generators)\n"); printf("-m# minimum number of items per item set " "(default: %"ITEM_FMT")\n", min); printf("-n# maximum number of items per item set " "(default: %"ITEM_FMT")\n", max); printf("-s# minimum support of an item set " "(default: %g%%)\n", supp); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-e# additional evaluation measure " "(default: none)\n"); printf("-d# minimum value of add. evaluation measure " "(default: %g%%)\n", minval); printf("-q# sort items w.r.t. their frequency " "(default: %d)\n", sort); printf(" (1: ascending, -1: descending, 0: do not sort,\n" " 2: ascending, -2: descending w.r.t. " "transaction size sum)\n"); printf("-u# item processing order/search direction " "(default: %d)\n", dir); printf(" (fixed to -1 for closed/maximal item sets\n" " fixed to +1 for generators, free otherwise)\n"); printf("-x# number of repetitions (for benchmarking) " "(default: 1)\n"); printf("-R# read an item selection from a file\n"); #ifdef ISR_PATSPEC printf("-P# write a pattern spectrum to a file\n"); #endif printf("-Z print item set statistics (counts per size)\n"); printf("-g write output in scanable form " "(quote certain characters)\n"); printf("-h# record header for output " "(default: \"%s\")\n", hdr); printf("-k# item separator for output " "(default: \"%s\")\n", sep); printf("-v# output format for item set information " "(default: \"%s\")\n", format); printf("-w transaction weight in last field " "(default: only items)\n"); printf("-r# record/transaction separators " "(default: \"\\n\")\n"); printf("-f# field /item separators " "(default: \" \\t,\")\n"); printf("-b# blank characters " "(default: \" \\t\\r\")\n"); printf("-C# comment characters " "(default: \"#\")\n"); printf("-! print additional option information\n"); printf("infile file to read transactions from " "[required]\n"); printf("outfile file to write frequent item sets to " "[optional]\n"); return 0; /* print a usage message */ } /* and abort the program */ #endif /* #ifndef QUIET */ /* free option characters: acilopy [A-Z]\[CT] */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse arguments */ s = argv[i]; /* get option argument */ if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { /* -- if argument is an option */ while (*s) { /* traverse options */ switch (*s++) { /* evaluate switches */ case '!': help(); break; case 't': target = (*s) ? *s++ : 's'; break; case 'm': min = (ITEM)strtol(s, &s, 0); break; case 'n': max = (ITEM)strtol(s, &s, 0); break; case 's': supp = strtod(s, &s); break; case 'e': eval = (*s) ? *s++ : 0; break; case 'd': minval = strtod(s, &s); break; case 'q': sort = (int) strtol(s, &s, 0); break; case 'u': dir = (int) strtol(s, &s, 0); break; case 'x': repeat = strtol(s, &s, 0); break; case 'R': optarg = &fn_sel; break; #ifdef ISR_PATSPEC case 'P': optarg = &fn_psp; break; #endif case 'Z': stats = 1; break; case 'g': mrep = ISR_SCAN; break; case 'h': optarg = &hdr; break; case 'k': optarg = &sep; break; case 'v': optarg = &format; break; case 'w': mtar |= TA_WEIGHT; break; case 'r': optarg = &recseps; break; case 'f': optarg = &fldseps; break; case 'b': optarg = &blanks; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } /* set option variables */ if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } /* get option argument */ else { /* -- if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: fn_inp = s; break; case 1: fn_out = s; break; default: error(E_ARGCNT); break; } /* note filenames */ } } if (optarg) error(E_OPTARG); /* check (option) arguments */ if (k < 1) error(E_ARGCNT); /* and number of arguments */ if (min < 0) error(E_SIZE, min); /* check the size limits */ if (max < 0) error(E_SIZE, max); /* and the minimum support */ if (max > 16) error(E_SIZE, max); /* and the minimum support */ if (supp > 100) error(E_SUPPORT, supp); if (repeat < 1) error(E_REPEAT, repeat); if ((!fn_inp || !*fn_inp) && (fn_sel && !*fn_sel)) error(E_STDIN); /* stdin must not be used twice */ switch (target) { /* check and translate target type */ case 's': target = ISR_ALL; break; case 'c': target = ISR_CLOSED; break; case 'm': target = ISR_MAXIMAL; break; case 'g': target = ISR_GENERA; break; default : error(E_TARGET, (char)target); break; } /* (get target type code) */ switch (eval) { /* check and translate measure */ case 'x': evalfn = (ISEVALFN*)0; break; case 'b': evalfn = isr_logrto; break; default : error(E_MEASURE, (char)eval); break; } /* (get evaluation measure code) */ if ((format == dflt) && (supp < 0)) format = " (%a)"; /* adapt the default info. format */ MSG(stderr, "\n"); /* terminate the startup message */ /* --- read item selection --- */ ibase = ib_create(0, 0); /* create an item base */ if (!ibase) error(E_NOMEM); /* to manage the items */ tread = trd_create(); /* create a transaction reader */ if (!tread) error(E_NOMEM); /* and configure the characters */ trd_allchs(tread, recseps, fldseps, blanks, "", comment); if (fn_sel) { /* if item appearances are given */ t = clock(); /* start timer, open input file */ if (trd_open(tread, NULL, fn_sel) != 0) error(E_FOPEN, trd_name(tread)); MSG(stderr, "reading %s ... ", trd_name(tread)); m = ib_readsel(ibase,tread);/* read the given item selection */ if (m < 0) error((int)-m, ib_errmsg(ibase, NULL, 0)); trd_close(tread); /* close the input file */ MSG(stderr, "[%"ITEM_FMT" item(s)]", m); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- read transaction database --- */ tabag = tbg_create(ibase); /* create a transaction bag */ if (!tabag) error(E_NOMEM); /* to store the transactions */ t = clock(); /* start timer, open input file */ if (trd_open(tread, NULL, fn_inp) != 0) error(E_FOPEN, trd_name(tread)); MSG(stderr, "reading %s ... ", trd_name(tread)); k = tbg_read(tabag, tread, mtar); if (k < 0) /* read the transaction database */ error(-k, tbg_errmsg(tabag, NULL, 0)); trd_delete(tread, 1); /* close the input file and */ tread = NULL; /* delete the table reader */ m = ib_cnt(ibase); /* get the number of items, */ n = tbg_cnt(tabag); /* the number of transactions, */ w = tbg_wgt(tabag); /* the total transaction weight */ MSG(stderr, "[%"ITEM_FMT" item(s), %"TID_FMT, m, n); if (w != (SUPP)n) MSG(stderr, "/%"SUPP_FMT, w); MSG(stderr, " transaction(s)] done [%.2fs].", SEC_SINCE(t)); if ((m <= 0) || (n <= 0)) /* check for at least one item */ error(E_NOITEMS); /* and at least one transaction */ MSG(stderr, "\n"); /* compute absolute support value */ supp = ceilsupp((supp >= 0) ? 0.01 *supp *(double)w : -supp); /* --- sort and recode items --- */ t = clock(); /* start timer, print log message */ MSG(stderr, "filtering, sorting and recoding items ... "); m = tbg_recode(tabag, (SUPP)supp, -1, 16, -sort); if (m < 0) error(E_NOMEM); /* recode items and transactions */ if (m <= 0) error(E_NOITEMS); /* and check the number of items */ MSG(stderr, "[%"ITEM_FMT" item(s)]", m); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- sort and reduce transactions --- */ t = clock(); /* start timer, print log message */ MSG(stderr, "sorting and reducing transactions ... "); tbg_filter(tabag,min,NULL,0); /* remove items of short transactions */ tbg_itsort(tabag, +1, 1); /* sort items in transactions and */ tbg_sort (tabag, +1, 1); /* sort the trans. lexicographically */ n = tbg_reduce(tabag, 0); /* reduce transactions to unique ones */ tbg_pack(tabag, 16); /* pack items with codes < 16 */ MSG(stderr, "[%"TID_FMT, n); /* print number of transactions */ if (w != (SUPP)n) MSG(stderr, "/%"SUPP_FMT, w); MSG(stderr, " transaction(s)] done [%.2fs].\n", SEC_SINCE(t)); /* --- find frequent item sets --- */ t = clock(); /* start the timer */ if (eval == 'b') mrep |= ISR_LOGS; report = isr_create(ibase, target|mrep, -1, hdr, sep, NULL); if (!report) error(E_NOMEM); /* create an item set reporter */ isr_setfmt (report, format); /* and configure it: set flags, */ isr_setsize(report, min, max);/* info. format and size range, */ if (evalfn) /* and the evaluation function */ isr_seteval(report, evalfn, NULL, +1, 0.01*minval); #ifdef ISR_PATSPEC /* if to allow a pattern spectrum */ if (fn_psp && (isr_addpsp(report, NULL) < 0)) error(E_NOMEM); /* add a pattern spectrum if req. */ #endif if (isr_open(report, NULL, fn_out) != 0) error(E_FOPEN, isr_name(report)); /* open the output file */ MSG(stderr, "writing %s ... ", isr_name(report)); if (target == ISR_GENERA) dir = +1; else if (target & (ISR_CLOSED|ISR_MAXIMAL)) dir = -1; fim16 = m16_create(dir, (SUPP)supp, report); if (!fim16) error(E_NOMEM); /* create a 16 items machine */ for (i = 0; i < repeat; i++){ /* repeated mining loop */ isr_reset(report); /* (re)init. the output counters */ m16_addtbg(fim16, tabag); /* add trans. bag to 16 items machine */ k = m16_mine(fim16); /* find frequent item sets */ if (k < 0) error(E_NOMEM); /* with 16 items machine */ } if (isr_report(report) < 0) /* report the empty set (if needed) */ error(E_NOMEM); if (isr_close(report) != 0) /* close the output file */ error(E_FWRITE, isr_name(report)); MSG(stderr, "[%"SIZE_FMT" set(s)]", isr_repcnt(report)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); if (stats) isr_prstats(report, stdout, 0); /* --- write pattern spectrum --- */ #ifdef ISR_PATSPEC /* if to allow a pattern spectrum */ if (fn_psp) { /* if to write a pattern spectrum */ psp = isr_getpsp(report);/* get the pattern spectrum */ twrite = twr_create(); /* create a table writer and */ if (!twrite) error(E_NOMEM);/* open the output file */ if (twr_open(twrite, NULL, fn_psp) != 0) error(E_FOPEN, twr_name(twrite)); if (psp_report(psp, twrite) != 0) error(E_FWRITE, twr_name(twrite)); twr_delete(twrite, 1); /* write the pattern spectrum, */ twrite = NULL; /* delete the table writer, and */ } /* clear the writer variable */ #endif /* --- clean up --- */ CLEANUP; /* clean up memory and close files */ SHOWMEM; /* show (final) memory usage */ return 0; /* return 'ok' */ } /* main() */
int main (int argc, char *argv[]) { /* --- main function */ int i, k = 0, n; /* loop variables, counters */ char *s; /* to traverse the options */ char **optarg = NULL; /* option argument */ char *fn_in = NULL; /* name of input file */ char *fn_out = NULL; /* name of output file */ char *fn_app = NULL; /* name of item appearances file */ char *blanks = NULL; /* blanks */ char *fldseps = NULL; /* field separators */ char *recseps = NULL; /* record separators */ char *comment = NULL; /* comment indicators */ char *isep = " "; /* item separator for output */ char *impl = " <- "; /* implication sign for ass. rules */ char *dflt = " (%1S)"; /* default format for check */ char *format = dflt; /* format for information output */ int target = 's'; /* target type (sets/rules/h.edges) */ int min = 1; /* minimum rule/item set size */ int max = INT_MAX; /* maximum rule/item set size */ double supp = 10; /* minimum support (in percent) */ double smax = 100; /* maximum support (in percent) */ double conf = 80; /* minimum confidence (in percent) */ int dir = 0; /* direction for size sorting */ int eval = 0; /* additional evaluation measure */ int zero = 0; /* flag for zero eval. below expect. */ int aggm = 0; /* aggregation mode for eval. measure */ double minval = 10; /* minimum evaluation measure value */ int prune = 0; /* (min. size for) evaluation pruning */ double filter = 0.1; /* item usage filtering parameter */ int sort = 2; /* flag for item sorting and recoding */ int tree = 1; /* flag for transaction tree */ int heap = 1; /* flag for heap sort vs. quick sort */ int post = 0; /* flag for a-posteriori pruning */ int report = 0; /* other flags for reporting */ int mode = APP_BODY|IST_PERFECT; /* search mode */ int size; /* current item set size */ int wgt; /* total transaction weight */ int frq, body, head; /* frequency of an item set */ int *items; /* item set (for reporting) */ clock_t t, tt, tc, x; /* timers for measurements */ #ifndef QUIET /* if not quiet version */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc > 1) { /* if arguments are given */ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } /* print a startup message */ else { /* if no arguments are given */ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent item sets, c: closed item sets,\n" " m: maximal item sets, r: association rules)\n"); printf("-m# minimum number of items per set/rule " "(default: %d)\n", min); printf("-n# maximum number of items per set/rule " "(default: no limit)\n"); printf("-s# minimum support of a set/rule " "(default: %g%%)\n", supp); printf("-S# maximum support of a set/rule " "(default: %g%%)\n", smax); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-c# minimum confidence of a rule " "(default: %g%%)\n", conf); printf("-o use the original rule support definition " "(body & head)\n"); printf("-e# additional evaluation measure " "(default: none)\n"); printf("-a# aggregation mode for evaluation measure " "(default: none)\n"); printf("-z zero evaluation below expected support " "(default: evaluate all)\n"); printf("-d# minimum value of add. evaluation measure " "(default: %g%%)\n", minval); printf("-p# (min. size for) pruning with evaluation " "(default: no pruning)\n"); printf(" (< 0: backward, > 0: forward)\n"); printf("-l# sort item sets in output by their size " "(default: no sorting)\n"); printf(" (< 0: descending, > 0: ascending)\n"); printf("-g write item names in scanable form " "(quote certain characters)\n"); printf("-k# item separator for output " "(default: \"%s\")\n", isep); printf("-i# implication sign for association rules " "(default: \"%s\")\n", impl); printf("-v# output format for set/rule information " "(default: \"%s\")\n", format); printf("-q# sort items w.r.t. their frequency " "(default: %d)\n", sort); printf(" (1: ascending, -1: descending, 0: do not sort,\n" " 2: ascending, -2: descending w.r.t. " "transaction size sum)\n"); printf("-u# filter unused items from transactions " "(default: %g)\n", filter); printf(" (0: do not filter items w.r.t. usage in sets,\n" " <0: fraction of removed items for filtering,\n" " >0: take execution times ratio into account)\n"); printf("-j use quicksort to sort the transactions " "(default: heapsort)\n"); printf("-x do not prune the search " "with perfect extensions\n"); printf("-y a-posteriori pruning of infrequent item sets\n"); printf("-h do not organize transactions as a prefix tree\n"); printf("-b# blank characters " "(default: \" \\t\\r\")\n"); printf("-f# field separators " "(default: \" \\t,\")\n"); printf("-r# record separators " "(default: \"\\n\")\n"); printf("-C# comment characters " "(default: \"#\")\n"); printf("-! print additional option information\n"); printf("infile file to read transactions from\n"); printf("outfile file to write item sets/association rules" "/hyperedges to\n"); printf("appfile file stating item appearances (optional)\n"); return 0; /* print a usage message */ } /* and abort the program */ #endif /* #ifndef QUIET */ /* free option characters: w [A-Z]\[SC] */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse the arguments */ s = argv[i]; /* get an option argument */ if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { /* -- if argument is an option */ while (*s) { /* traverse the options */ switch (*s++) { /* evaluate the options */ case '!': help(); break; case 't': target = (*s) ? *s++ : 's'; break; case 'm': min = (int)strtol(s, &s, 0); break; case 'n': max = (int)strtol(s, &s, 0); break; case 's': supp = strtod(s, &s); break; case 'S': smax = strtod(s, &s); break; case 'c': conf = strtod(s, &s); break; case 'o': mode |= APP_BOTH; break; case 'e': eval = (*s) ? *s++ : 0; break; case 'z': zero = IST_ZERO; break; case 'a': aggm = (*s) ? *s++ : 0; break; case 'd': minval = strtod(s, &s); break; case 'p': prune = (int)strtol(s, &s, 0); break; case 'g': report = ISR_SCAN; break; case 'k': optarg = &isep; break; case 'i': optarg = &impl; break; case 'v': optarg = &format; break; case 'l': dir = (int)strtol(s, &s, 0); break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'x': mode &= ~IST_PERFECT; break; case 'y': post = 1; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } /* set the option variables */ if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } /* get an option argument */ else { /* -- if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } /* note filenames */ } } if (optarg) error(E_OPTARG); /* check the option argument */ if ((k < 2) || (k > 3)) /* and the number of arguments */ error(E_ARGCNT); /* (either in/out or in/out/app) */ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); /* stdin must not be used twice */ switch (target) { /* check and translate target type */ case 's': target = TT_ALL; break; case 'c': target = TT_CLOSED; break; case 'm': target = TT_MAXIMAL; break; case 'r': target = TT_RULE; break; default : error(E_TARGET, (char)target); break; } if (min < 0) error(E_SIZE, min); /* check the limits */ if (max < 0) error(E_SIZE, max); /* for the set size */ if (supp > 100) /* check the minimum support */ error(E_SUPP, supp); /* (< 0: absolute support) */ if ((conf < 0) || (conf > 100)) error(E_CONF, conf); /* check the minimum confidence */ switch (eval) { /* check and translate measure */ case 'x': case 0: eval = IST_NONE; break; case 'c': eval = IST_CONF; break; case 'd': eval = IST_CONF_DIFF; break; case 'l': eval = IST_LIFT; break; case 'a': eval = IST_LIFT_DIFF; break; case 'q': eval = IST_LIFT_QUOT; break; case 'v': eval = IST_CVCT; break; case 'e': eval = IST_CVCT_DIFF; break; case 'r': eval = IST_CVCT_QUOT; break; case 'f': eval = IST_CERT; break; case 'n': eval = IST_CHI2; break; case 'p': eval = IST_CHI2_PVAL; break; case 'i': eval = IST_INFO; break; case 'g': eval = IST_INFO_PVAL; break; case 'b': eval = IST_LOGQ; break; default : error(E_MEASURE, (char)eval); break; } switch (aggm) { /* check and translate agg. mode */ case 'x': case 0: aggm = IST_NONE; break; case 'm': aggm = IST_MIN; break; case 'n': aggm = IST_MAX; break; case 'a': aggm = IST_AVG; break; default : error(E_MEASURE, (char)aggm); break; } if ((target > TT_ALL) /* if individual set counters needed */ || ((eval > IST_NONE) && (eval < IST_LOGQ))) mode &= ~IST_PERFECT; /* remove perfect extension pruning */ if (target <= TT_MAXIMAL) { /* remove rule specific settings */ mode |= APP_BOTH; conf = 100; } if ((filter <= -1) || (filter >= 1)) filter = 0; /* check and adapt the filter option */ if (dir) /* if to sort output by size, */ mode &= ~IST_PERFECT; /* do not use perfect ext. pruning */ /* --- create item base --- */ ibase = ib_create(0, 0); /* create an item base and */ if (!ibase) error(E_NOMEM); /* set the special characters */ ib_chars(ibase, blanks, fldseps, recseps, "", comment); MSG(stderr, "\n"); /* terminate the startup message */ /* --- read item appearance indicators --- */ if (fn_app) { /* if item appearances are given */ t = clock(); /* start the timer for the reading */ if (*fn_app) /* if an app. file name is given, */ in = fopen(fn_app, "r"); /* open the item appearances file */ else { /* if no app. file name is given, */ in = stdin; fn_app = "<stdin>"; } /* read from std. input */ MSG(stderr, "reading %s ... ", fn_app); if (!in) error(E_FOPEN, fn_app); k = ib_readapp(ibase, in); /* read the item appearances */ if (k != 0) error(k, fn_app, RECCNT(ibase), BUFFER(ibase)); if (in != stdin) fclose(in);/* if not read from standard input, */ in = NULL; /* close the input file */ MSG(stderr, "[%d item(s)]", ib_cnt(ibase)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- read transactions --- */ t = clock(); /* start the timer for the reading */ if (fn_in && *fn_in) /* if an input file name is given, */ in = fopen(fn_in, "r"); /* open input file for reading */ else { /* if no input file name is given, */ in = stdin; fn_in = "<stdin>"; } /* read from standard input */ MSG(stderr, "reading %s ... ", fn_in); if (!in) error(E_FOPEN, fn_in); tabag = tb_create(ibase, 0); /* create a transaction bag/multiset */ if (!tabag) error(E_NOMEM); /* to store the transactions */ while (1) { /* transaction read loop */ k = ib_read(ibase, in); /* read the next transaction */ if (k) { if (k > 0) break; /* check for error and end of file */ error(k, fn_in, RECCNT(ibase), BUFFER(ibase)); } if (tb_add(tabag, NULL) != 0) error(E_NOMEM); } /* add transaction to bag/multiset */ if (in != stdin) fclose(in); /* if not read from standard input, */ in = NULL; /* close the input file */ n = ib_cnt(ibase); /* get the number of items */ k = tb_cnt(tabag); /* get the number of transactions */ wgt = tb_wgt(tabag); /* the total transaction weight */ MSG(stderr, "[%d item(s), ", n); if (k == wgt) MSG(stderr, "%d transaction(s)]", k); else MSG(stderr, "%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].", SEC_SINCE(t)); if ((n <= 0) || (wgt <= 0)) /* check for at least one item */ error(E_NOTRANS); /* and at least one transaction */ MSG(stderr, "\n"); /* terminate the log message */ if (format == dflt) { /* if default info. format is used */ if (target != TT_RULE) format = (supp < 0) ? " (%a)" : " (%1S)"; else format = (supp < 0) ? " (%b, %1C)" : " (%1X, %1C)"; } /* set default according to target */ supp = ceil ((supp >= 0) ? 0.01 *supp *wgt : -supp); smax = floor((smax >= 0) ? 0.01 *smax *wgt : -smax); conf *= 0.01; /* transform support and confidence */ /* --- sort and recode items --- */ t = clock(); /* compute absolute support values */ MSG(stderr, "filtering, sorting and recoding items ... "); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create an item identifier map */ k = (int)((mode & APP_HEAD) ? supp : ceil(supp *conf)); n = ib_recode(ibase, k, sort, map); tb_recode(tabag, map); /* recode the items and transactions */ tb_itsort(tabag, 1, heap); /* and sort items in transactions */ free(map); map = NULL; /* delete the item identifier map */ MSG(stderr, "[%d item(s)] done [%.2fs].", n, SEC_SINCE(t)); if (n <= 0) error(E_NOFREQ); /* print a log message and */ MSG(stderr, "\n"); /* check the number of items */ k = tb_max(tabag); /* clamp the set/rule length to */ if (max > k) max = k; /* the maximum transaction size */ /* --- reduce transactions --- */ t = clock(); /* start the timer for the reduction */ MSG(stderr, "reducing transactions ... "); tb_filter(tabag, min, NULL); /* remove items of short transactions */ tb_sort(tabag, 1, heap); /* sort the trans. lexicographically */ k = tb_reduce(tabag); /* reduce transactions to unique ones */ if (k == wgt) MSG(stderr, "[%d transaction(s)]", k); else MSG(stderr, "[%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- create transaction tree --- */ tt = 0; /* init. the tree construction time */ if (tree) { /* if to use a transaction tree */ t = clock(); /* start the timer for construction */ MSG(stderr, "building transaction tree ... "); tatree = tt_create(tabag); /* create a transaction tree */ if (!tatree) error(E_NOMEM); if (filter == 0) { /* if not to filter items, */ tb_delete(tabag, 0); /* delete the transaction bag */ tabag = NULL; /* (redundant data storage) */ } MSG(stderr, "[%d node(s)]", tt_nodecnt(tatree)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = clock() -t; /* note the time for the construction */ } /* of the transaction tree */ /* --- create item set tree --- */ t = clock(); tc = 0; /* start the timer for the search */ istree = ist_create(ibase, mode, (int)supp, (int)smax, conf); if (!istree) error(E_NOMEM); /* create an item set tree */ ist_seteval(istree, eval|zero, aggm, 0.01*minval, prune); /* --- check item subsets --- */ MSG(stderr, "checking subsets of size 1"); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create a filter map */ while (1) { /* traverse the item set sizes */ size = ist_height(istree); /* get the current item set size and */ if (size >= max) break; /* abort if maximal size is reached */ if ((filter != 0) /* if to filter w.r.t. item usage */ && (ist_check(istree, map) <= size)) break; /* check which items are still used */ if (post) /* if a-posteriori pruning requested, */ ist_prune(istree); /* prune infrequent item sets */ k = ist_addlvl(istree); /* while max. height is not reached, */ if (k) { if (k > 0) break; /* add a level to the item set tree */ error(E_NOMEM); } /* if no level was added, abort */ if (((filter < 0) /* if to filter w.r.t. item usage */ && (i < -filter *n)) /* and enough items were removed */ || ((filter > 0) /* or counting time is long enough */ && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; /* note the new number of items */ x = clock(); /* start the timer for filtering */ tb_filter(tabag, size+1, map); tb_sort(tabag, 0, heap); /* remove unnec. items and trans. */ tb_reduce(tabag); /* and reduce trans. to unique ones */ if (tatree) { /* if a transaction tree was created */ tt_delete(tatree, 0); /* delete the transaction tree */ tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); } /* rebuild the transaction tree */ tt = clock() -x; /* note the filter/rebuild time */ } MSG(stderr, " %d", ++size); /* print the current item set size */ x = clock(); /* start the timer for counting */ if (tatree) ist_countx(istree, tatree); else ist_countb(istree, tabag); tc = clock() -x; /* count the transaction tree/bag */ } /* and compute the new counting time */ free(map); map = NULL; /* delete the filter map */ MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- filter found item sets --- */ if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) { t = clock(); /* start the timer for filtering */ MSG(stderr, "filtering for %s item sets ... ", (target == TT_MAXIMAL) ? "maximal" : "closed"); k = target | ((prune < 0) ? IST_EVAL : 0); ist_mark(istree, k); /* filter closed/maximal item sets */ MSG(stderr, "done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- print item sets/rules/hyperedges --- */ t = clock(); /* start the timer for the output */ if (fn_out && *fn_out) /* if an output file name is given, */ out = fopen(fn_out, "w"); /* open the output file */ else { /* if no output file name is given, */ out = stdout; fn_out = "<stdout>"; } /* write to std. output */ MSG(stderr, "writing %s ... ", fn_out); if (!out) error(E_FOPEN, fn_out); if (eval == IST_LOGQ) report |= ISR_LOGS; if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) report |= ISR_NOEXP; /* combine the report mode flags */ isrep = isr_create(ibase, out, report, isep, impl); if (!isrep) error(E_NOMEM); /* create an item set reporter */ isr_setfmt (isrep, format); /* and configure it */ isr_setsize(isrep, min, max); ist_setsize(istree, min, max, dir); ist_init (istree); /* initialize the extraction */ items = t_items(ib_tract(ibase)); if ((target <= TT_MAXIMAL) /* if to find frequent item sets */ && (dir == 0)) { /* and not to sort them by size */ if (eval == IST_LOGQ) /* if to compute an add. evaluation */ isr_seteval(isrep, isr_logq, NULL, 0.01*minval); else if (eval > IST_NONE) /* set the add. evaluation function */ isr_seteval(isrep, ist_evalx, istree, 0.01*minval); n = ist_report(istree, isrep); } /* report the item sets */ else if (target <= TT_MAXIMAL) { /* if to find frequent item sets */ for (n = 0; 1; ) { /* extract item sets from the tree */ k = ist_set(istree, items, &frq, &minval); if (k < 0) break; /* get the next frequent item set */ if (k > 0) fputs(isr_name(isrep, items[0]), out); for (i = 0; ++i < k; ) { /* print the item names */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if requested, print information */ isr_sinfo(isrep, frq, minval); fputc('\n', out); n++; /* terminate the output line and */ } } /* count the reported item set */ else if (target == TT_RULE) { /* if to find association rules, */ for (n = 0; 1; ) { /* extract rules from tree */ k = ist_rule(istree, items, &frq, &body, &head, &minval); if (k < 0) break; /* get the next association rule */ fputs(isr_name(isrep, items[0]), out); fputs(impl, out); /* print name of rule head item */ if (k > 1) fputs(isr_name(isrep, items[1]), out); for (i = 1; ++i < k; ) { /* print names of items in rule body */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if requested, print information */ isr_rinfo(isrep, frq, body, head, minval); fputc('\n', out); n++; /* terminate the output line and */ } /* count the reported ass. rule */ } /* if (target <= TT_MAXIMAL) .. else .. */ if (fflush(out) != 0) error(E_FWRITE, fn_out); if (out != stdout) fclose(out); out = NULL; /* close the output file */ MSG(stderr, "[%d %s(s)] done ", n, (target == TT_RULE) ? "rule" : "set"); MSG(stderr, "[%.2fs].\n", SEC_SINCE(t)); #ifdef BENCH /* if benchmark version, */ ist_stats(istree); /* show the search statistics */ #endif /* (especially memory usage) */ /* --- clean up --- */ #ifndef NDEBUG /* if this is a debug version */ isr_delete(isrep, 0); /* the item set reporter, */ ist_delete(istree); /* the item set tree, */ if (tatree) tt_delete(tatree, 0); /* the transaction tree, */ if (tabag) tb_delete(tabag, 0); /* the transaction bag, */ ib_delete(ibase); /* and the item base */ #endif #ifdef STORAGE /* if storage debugging */ showmem("at end of program"); /* check memory usage */ #endif return 0; /* return 'ok' */ } /* main() */