static void error (int code, ...) { #ifndef QUIET va_list args; const char *msg; assert(prgname); if (code < E_UNKNOWN) code = E_UNKNOWN; if (code < 0) { msg = errmsgs[-code]; if (!msg) msg = errmsgs[-E_UNKNOWN]; fprintf(stderr, "\n%s: ", prgname); va_start(args, code); vfprintf(stderr, msg, args); va_end(args); } #endif #ifndef NDEBUG if (map) free(map); if (isrep) isr_delete(isrep, 0); if (istree) ist_delete(istree); if (tatree) tt_delete(tatree, 0); if (tabag) tb_delete(tabag, 0); if (ibase) ib_delete(ibase); if (in && (in != stdin)) fclose(in); if (out && (out != stdout)) fclose(out); #endif #ifdef STORAGE showmem("at end of program"); #endif exit(code); } /* error() */
static void error (int code, ...) { /* --- print an error message */ #ifndef QUIET /* if not quiet version */ va_list args; /* list of variable arguments */ const char *msg; /* error message */ assert(prgname); /* check the program name */ if (code < E_UNKNOWN) code = E_UNKNOWN; if (code < 0) { /* if to report an error, */ msg = errmsgs[-code]; /* get the error message */ if (!msg) msg = errmsgs[-E_UNKNOWN]; fprintf(stderr, "\n%s: ", prgname); va_start(args, code); /* get variable arguments */ vfprintf(stderr, msg, args);/* print error message */ va_end(args); /* end argument evaluation */ } #endif #ifndef NDEBUG /* if debug version */ if (istree) ist_delete(istree); /* clean up memory */ if (tatree) tat_delete(tatree); /* and close files */ if (taset) tas_delete(taset, 0); if (itemset) is_delete(itemset); if (in && (in != stdin)) fclose(in); if (out && (out != stdout)) fclose(out); #endif #ifdef STORAGE /* if storage debugging */ showmem("at end of program"); /* check memory usage */ #endif exit(code); /* abort the program */ } /* error() */
int do_apriori (int argc, char *argv[]) { /* --- main function */ int i, k = 0, n; /* loop variables, counters */ char *s; /* to traverse the options */ char **optarg = NULL; /* option argument */ char *fn_in = NULL; /* name of input file */ char *fn_out = NULL; /* name of output file */ char *fn_app = NULL; /* name of item appearances file */ char *blanks = NULL; /* blanks */ char *fldseps = NULL; /* field separators */ char *recseps = NULL; /* record separators */ char *comment = NULL; /* comment indicators */ char *used = NULL; /* item usage vector */ double supp = 0.1; /* minimal support (in percent) */ double smax = 1.0; /* maximal support (in percent) */ double conf = 0.8; /* minimal confidence (in percent) */ int mode = IST_BODY; /* search mode (rule support def.) */ int target = 'r'; /* target type (sets/rules/h.edges) */ int arem = 0; /* additional rule evaluation measure */ int lift = 0; /* flag for printing the lift */ double minval = 0.1; /* minimal evaluation measure value */ double lftval = 0; /* lift value (confidence/prior) */ int minlen = 1; /* minimal rule length */ int maxlen = INT_MAX; /* maximal rule length */ int load = 1; /* flag for loading transactions */ int sort = 2; /* flag for item sorting and recoding */ double filter = 0.1; /* item usage filtering parameter */ int tree = 1; /* flag for transaction tree */ int heap = 1; /* flag for heap sort vs. quick sort */ int c2scf = 0; /* flag for conv. to scanable form */ char *sep = " "; /* item separator for output */ char *fmt = "%.1f"; /* output format for support/conf. */ int sout = 1; /* flag for abs./rel. support output */ int ext = 0; /* flag for extended support output */ int aval = 0; /* flag for add. eval. measure value */ int maxcnt = 0; /* maximal number of items per set */ int tacnt; /* number of transactions */ int frq; /* frequency of an item set */ int *map, *set; /* identifier map, item set */ int verbose = 0; /* flag for verboseness */ const char *name; /* buffer for item names */ static char buf[4*TS_SIZE+4]; /* buffer for formatting */ clock_t t, tt, tc, x; /* timer for measurements */ #ifndef QUIET /* if not quiet version */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc > 1) { /* if arguments are given */ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } /* print a startup message */ else { /* if no arguments given */ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type (default: association rules)\n" " (s: item sets, c: closed item sets," " m: maximal item sets,\n" " r: association rules," " h: association hyperedges)\n"); printf("-m# minimal number of items per set/rule/hyperedge " "(default: %d)\n", minlen); printf("-n# maximal number of items per set/rule/hyperedge " "(default: no limit)\n"); printf("-s# minimal support of a set/rule/hyperedge " "(default: %g%%)\n", supp *100); printf("-S# maximal support of a set/rule/hyperedge " "(default: %g%%)\n", smax *100); printf("-c# minimal confidence of a rule/hyperedge " "(default: %g%%)\n", conf *100); printf("-o use original definition of the support of a rule " "(body & head)\n"); printf("-k# item separator for output " "(default: \"%s\")\n", sep); printf("-p# output format for support/confidence " "(default: \"%s\")\n", fmt); printf("-x extended support output " "(print both rule support types)\n"); printf("-a print absolute support " "(number of transactions)\n"); printf("-y print lift value (confidence divided by prior)\n"); printf("-e# additional evaluation measure (default: none)\n"); printf("-! print a list of additional evaluation measures\n"); printf("-d# minimal value of additional evaluation measure " "(default: %g%%)\n", minval *100); printf("-v print value of additional " "rule evaluation measure\n"); printf("-g write output in scanable form " "(quote certain characters)\n"); printf("-l do not load transactions into memory " "(work on input file)\n"); printf("-q# sort items w.r.t. their frequency (default: %d)\n" " (1: ascending, -1: descending, 0: do not sort,\n" " 2: ascending, -2: descending w.r.t. " "transaction size sum)\n", sort); printf("-u# filter unused items from transactions " "(default: %g)\n", filter); printf(" (0: do not filter items w.r.t. usage in sets,\n" " <0: fraction of removed items for filtering,\n" " >0: take execution times ratio into account)\n"); printf("-h do not organize transactions as a prefix tree\n"); printf("-j use quicksort to sort the transactions " "(default: heapsort)\n"); printf("-z minimize memory usage " "(default: maximize speed)\n"); printf("-b/f/r# blank characters, field and record separators\n" " (default: \" \\t\\r\", \" \\t\", \"\\n\")\n"); printf("-C# comment characters (default: \"#\")\n"); printf("-V verbose\n"); printf("infile file to read transactions from\n"); printf("outfile file to write item sets/association rules" "/hyperedges to\n"); printf("appfile file stating item appearances (optional)\n"); return 0; /* print a usage message */ } /* and abort the program */ #endif /* #ifndef QUIET */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse arguments */ s = argv[i]; /* get option argument */ if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { /* -- if argument is an option */ while (*s) { /* traverse options */ switch (*s++) { /* evaluate switches */ case '!': help(); break; case 't': target = (*s) ? *s++ : 'r'; break; case 'm': minlen = (int)strtol(s, &s, 0); break; case 'n': maxlen = (int)strtol(s, &s, 0); break; case 's': supp = 0.01*strtod(s, &s); break; case 'S': smax = 0.01*strtod(s, &s); break; case 'c': conf = 0.01*strtod(s, &s); break; case 'o': mode |= IST_BOTH; break; case 'k': optarg = &sep; break; case 'p': optarg = &fmt; break; case 'x': ext = 1; break; case 'a': sout |= 2; break; case 'y': lift = 1; break; case 'e': arem = (*s) ? *s++ : 0; break; case 'd': minval = 0.01*strtod(s, &s); break; case 'v': aval = 1; break; case 'g': c2scf = 1; break; case 'l': load = 0; break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'z': mode |= IST_MEMOPT; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; case 'V': verbose = 1; break; default : error(E_OPTION, *--s); break; } /* set option variables */ if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } /* get option argument */ else { /* -- if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } /* note filenames */ } } if (optarg) error(E_OPTARG); /* check option argument */ if ((k < 2) || (k > 3)) /* and the number of arguments */ error(E_ARGCNT); /* (either in/out or in/out/app) */ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); /* stdin must not be used twice */ switch (target) { /* check and translate target type */ case 's': target = TT_SET; break; case 'c': target = TT_CLSET; break; case 'm': target = TT_MFSET; break; case 'r': target = TT_RULE; break; case 'h': target = TT_HEDGE; break; case 'g': target = TT_GROUP; break; default : error(E_TARGET, (char)target); break; } if (supp > 1) /* check the minimal support */ error(E_SUPP, supp); /* (< 0: absolute number) */ if ((conf < 0) || (conf > 1)) error(E_CONF, conf); /* check the minimal confidence */ if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */ if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */ switch (arem) { /* check and translate measure */ case 0 : case '0': arem = EM_NONE; break; case 'd': case '1': arem = EM_DIFF; break; case 'q': case '2': arem = EM_QUOT; break; case 'a': case '3': arem = EM_AIMP; break; case 'i': case '4': arem = EM_INFO; break; case 'c': case '5': arem = EM_CHI2; break; case 'p': case '6': arem = EM_PVAL; break; default : error(E_MEASURE, (char)arem); break; } if (target <= TT_MFSET) { /* in item set mode neutralize */ mode |= IST_BOTH; conf = 1;}/* rule specific settings */ if (arem == EM_NONE) /* if no add. rule eval. measure, */ aval = 0; /* clear the corresp. output flag */ if ((filter <= -1) || (filter >= 1)) filter = 0; /* --- create item set and transaction set --- */ itemset = is_create(-1); /* create an item set and */ if (!itemset) error(E_NOMEM); /* set the special characters */ is_chars(itemset, blanks, fldseps, recseps, comment); if (load) { /* if to load the transactions */ taset = tas_create(itemset); if (!taset) error(E_NOMEM); /* create a transaction set */ } /* to store the transactions */ MSG(fprintf(stderr, "\n")); /* terminate the startup message */ /* --- read item appearances --- */ if (fn_app) { /* if item appearances are given */ t = clock(); /* start the timer */ if (*fn_app) /* if an app. file name is given, */ in = fopen(fn_app, "r"); /* open the item appearances file */ else { /* if no app. file name is given, */ in = stdin; fn_app = "<stdin>"; } /* read from std. input */ MSG(fprintf(stderr, "reading %s ... ", fn_app)); if (!in) error(E_FOPEN, fn_app); k = is_readapp(itemset,in); /* read the item appearances */ if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset)); if (in != stdin) /* if not read from standard input, */ fclose(in); /* close the input file */ MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset))); MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t))); } /* print a log message */ /* --- read transactions --- */ t = clock(); /* start the timer */ if (fn_in && *fn_in) /* if an input file name is given, */ in = fopen(fn_in, "r"); /* open input file for reading */ else { /* if no input file name is given, */ in = stdin; fn_in = "<stdin>"; } /* read from standard input */ MSG(fprintf(stderr, "reading %s ... \n", fn_in)); if (!in) error(E_FOPEN, fn_in); while (1) { /* transaction read loop */ k = is_read(itemset, in); /* read the next transaction */ if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset)); if (k > 0) break; /* check for error and end of file */ k = is_tsize(itemset); /* update the maximal */ if (k > maxcnt) maxcnt = k; /* transaction size */ if (taset && (tas_add(taset, NULL, 0) != 0)) error(E_NOMEM); /* add the loaded transaction */ } /* to the transaction set */ if (taset) { /* if transactions have been loaded */ if (in != stdin) fclose(in);/* if not read from standard input, */ in = NULL; /* close the input file */ } /* clear the file variable */ n = is_cnt(itemset); /* get the number of items */ tacnt = is_gettac(itemset); /* and the number of transactions */ MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt)); MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t))); if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS); MSG(fprintf(stderr, "\n")); /* check for at least one transaction */ if (supp >= 0) /* if relative support is given */ supp = ceil(tacnt *supp); /* compute absolute support */ else { /* if absolute support is given, */ supp = ceil(-100 *supp); /* make the support value positive */ if (!(sout & 2)) sout = 2; /* switch to absolute support output */ } /* do the same with the max. support */ smax = floor(((smax >= 0) ? tacnt : -100) *smax); /* --- sort and recode items --- */ MSG(fprintf(stderr, "filtering, sorting and recoding items ... ")); t = clock(); /* start the timer */ map = (int*)malloc(is_cnt(itemset) *sizeof(int)); if (!map) error(E_NOMEM); /* create an item identifier map */ k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf)); n = is_recode(itemset, k, sort, map); if (taset) { /* sort and recode the items and */ tas_recode(taset, map,n); /* recode the loaded transactions */ maxcnt = tas_max(taset); /* get the new maximal t.a. size */ } /* (may be smaller than before) */ free(map); /* delete the item identifier map */ MSG(fprintf(stderr, "[%d item(s)] ", n)); MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t))); if (n <= 0) error(E_NOFREQ); /* print a log message and */ MSG(fprintf(stderr, "\n")); /* check the number of items */ if (maxlen > maxcnt) /* clamp the set/rule length */ maxlen = maxcnt; /* to the maximum set size */ /* --- create a transaction tree --- */ tt = 0; /* init. the tree construction time */ if (tree && taset) { /* if transactions were loaded */ MSG(fprintf(stderr, "creating transaction tree ... ")); t = clock(); /* start the timer */ tatree = tat_create(taset, heap); if (!tatree) error(E_NOMEM);/* create a transaction tree */ if (filter == 0) { /* if a tree rebuild is not needed, */ tas_delete(taset, 0); taset = NULL; } /* delete transactions */ tt = clock() -t; /* note the time for the construction */ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t))); } /* print a log message */ /* --- create an item set tree --- */ t = clock(); tc = 0; /* start the timer */ istree = ist_create(itemset, mode, (int)supp, conf); if (!istree) error(E_NOMEM); /* create an item set tree */ /* --- check item subsets --- */ if (filter) { /* if to filter unused items */ used = (char*)malloc(is_cnt(itemset) *sizeof(char)); if (!used) error(E_NOMEM); /* create a flag vector */ } /* for the items */ MSG(fprintf(stderr, "checking subsets of size 1")); while (ist_height(istree) < maxlen) { if (filter != 0) { /* if to filter w.r.t. item usage, */ i = ist_check(istree, used); /* check current item usage */ if (i < maxlen) maxlen = i; /* update the maximum size */ if (ist_height(istree) >= i) break; } /* check the tree height */ k = ist_addlvl(istree); /* while max. height is not reached, */ if (k < 0) error(E_NOMEM); /* add a level to the item set tree */ if (k != 0) break; /* if no level was added, abort */ MSG(fprintf(stderr, " %d", ist_height(istree))); if (tatree) { /* if a transaction tree was created */ if (((filter < 0) /* if to filter w.r.t. item usage */ && (i < -filter *n)) /* and enough items were removed */ || ((filter > 0) /* or counting time is long enough */ && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; x = clock(); /* note the new number of items */ tas_filter(taset, used);/* and remove unnecessary items */ tat_delete(tatree); /* delete the transaction tree */ tatree = tat_create(taset, heap); if (!tatree) error(E_NOMEM); tt = clock() -x; /* rebuild the transaction tree and */ } /* note the new construction time */ x = clock(); /* count the transaction tree */ ist_countx(istree, tatree); tc = clock() -x; } /* note the new count time */ else if (taset) { /* if transactions were loaded */ if (((filter < 0) /* if to filter w.r.t. item usage */ && (i <= -filter *n)) /* and enough items were removed */ || ((filter > 0) /* or counting time is long enough */ && (i *(double)tt <= filter *n *tc))) { n = i; x = clock(); /* note the new number of items */ tas_filter(taset, used);/* and remove unnecessary items */ tt = clock() -t; /* from the transactions */ } /* note the filtering time */ for (i = tacnt; --i >= 0;)/* traverse and count transactions */ ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i)); tc = clock() -t; } /* note the new count time */ else { /* if to work on the input file, */ rewind(in); /* reset the file position */ for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) { if (filter != 0) /* (re)read the transactions and */ is_filter(itemset, used); /* remove unnecessary items */ k = is_tsize(itemset); /* update the maximum size */ if (k > maxcnt) maxcnt = k; /* of a transaction */ ist_count(istree, is_tract(itemset), k); } /* count the transaction in the tree */ if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset)); if (maxcnt < maxlen) /* update the maximal rule length */ maxlen = maxcnt; /* according to the max. t.a. size */ } /* (may be smaller than before) */ } if (!taset && !tatree) { /* if transactions were not loaded */ if (in != stdin) fclose(in);/* if not read from standard input, */ in = NULL; /* close the input file */ } /* clear the file variable */ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t))); /* --- filter found item sets --- */ if ((target == TT_CLSET) || (target == TT_MFSET)) { MSG(fprintf(stderr, "filtering %s item sets ... ", (target == TT_MFSET) ? "maximal" : "closed")); t = clock(); /* filter the item sets */ ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED); MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t))); } /* (filter takes longer than print) */ /* --- sort transactions --- */ if (target <= TT_MFSET) { /* if to find frequent item sets */ if (!taset) /* transactions must be loaded */ ext = 0; /* for extended support output */ else if (ext) { /* if extended output is requested */ MSG(fprintf(stderr, "sorting transactions ... ")); t = clock(); /* start the timer */ tas_sort(taset, heap); /* sort the transactions */ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t))); } /* (sorting is necessary to find the */ } /* number of identical transactions) */ /* --- print item sets/rules/hyperedges --- */ t = clock(); /* start the timer */ if (fn_out && *fn_out) /* if an output file name is given, */ out = fopen(fn_out, "w"); /* open the output file */ else { /* if no output file name is given, */ out = stdout; fn_out = "<stdout>"; } /* write to std. output */ MSG(fprintf(stderr, "writing %s ... ", fn_out)); if (!out) error(E_FOPEN, fn_out); ist_init(istree, minlen, arem, minval); set = is_tract(itemset); /* get the transaction buffer */ if (target <= TT_MFSET) { /* if to find frequent item sets */ for (n = 0; 1; ) { /* extract item sets from the tree */ k = ist_set(istree, set, &frq, &conf); if (k <= 0) break; /* get the next frequent item set */ if (frq > smax) continue; /* check against maximal support */ for (i = 0; i < k; i++) { /* traverse the set's items */ name = is_name(itemset, set[i]); if (c2scf) { sc_format(buf, name, 0); name = buf; } fputs(name, out); /* print the name of the next item */ fputs((i < k-1) ? sep : " ", out); } /* print a separator */ fputs(" (", out); /* print the item set's support */ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", frq); } if (ext) { /* if to print the extended support */ frq = tas_occur(taset, set, k); fputs(", ", out); /* get the number of occurrences */ fprintf(out, fmt, (frq/(double)tacnt) *100); if (sout & 2) fprintf(out, "/%d", frq); } /* print the extended support data */ if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); } fputs(")\n", out); /* print the add. eval. measure, */ n++; /* terminate the support output, */ } } /* and count the item set */ else if (target == TT_RULE) { /* if to find association rules, */ for (n = 0; 1; ) { /* extract rules from tree */ k = ist_rule(istree, set, &frq, &conf, &lftval, &minval); if (k <= 0) break; /* get the next association rule */ if (frq > smax) continue; /* check against maximal support */ for (i = 0; i < k; i++) { /* traverse the rule's items */ name = is_name(itemset, set[i]); if (c2scf) { sc_format(buf, name, 0); name = buf; } fputs(name, out); /* print the next item */ fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out); } /* print a separator */ fputs(" (", out); /* print the rule evaluation */ if (sout & 1) supp = frq/(double)tacnt; if (ext && !(mode & IST_HEAD)) { if (sout & 1) { fprintf(out, fmt, supp *conf *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));} fputs(", ", out); /* print the support of the rule */ } /* from the support of the body */ if (sout & 1) { fprintf(out, fmt, supp *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", frq); } fputs(", ", out); /* print the rule support */ if (ext && (mode & IST_HEAD)) { if (sout & 1) { fprintf(out, fmt, (supp/conf) *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));} fputs(", ", out); /* print the support of the body */ } /* from the support of the rule */ fprintf(out, fmt, conf *100); /* print the rule confidence */ if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); } if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); } fputs(")\n", out); /* print the value of the additional */ n++; /* rule evaluation measure and */ } } /* count the association rule */ else if (target == TT_HEDGE){ /* if to find association hyperedges */ for (n = 0; 1; ) { /* extract hyperedges from tree */ k = ist_hedge(istree, set, &frq, &conf, &minval); if (k <= 0) break; /* get the next hyperedge */ if (frq > smax) continue; /* check against maximal support */ for (i = 0; i < k; i++) { /* traverse the edge's items */ name = is_name(itemset, set[i]); if (c2scf) { sc_format(buf, name, 0); name = buf; } fputs(name, out); /* print the name of the next item */ fputs((i < k-1) ? sep : " ", out); } /* print a separator */ fputs(" (", out); /* print the hyperedge evaluation */ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", frq); } fputs(", ", out); fprintf(out, fmt, conf *100); if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); } fputs(")\n", out); /* print support and confidence */ n++; /* of the hyperedge and */ } } /* count the hyperedge */ else { /* if to find association groups */ for (n = 0; 1; ) { /* extract groups from tree */ k = ist_group(istree, set, &frq, &minval); if (k <= 0) break; /* get the next group */ if (frq > smax) continue; /* check against maximal support */ for (i = 0; i < k; i++) { /* traverse the group's items */ name = is_name(itemset, set[i]); if (c2scf) { sc_format(buf, name, 0); name = buf; } fputs(name, out); /* print the name of the next item */ fputs((i < k-1) ? sep : " ", out); } /* print a separator */ fputs(" (", out); /* print the group evaluation */ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100); if (sout & 2) fputc('/', out); } if (sout & 2) { fprintf(out, "%d", frq); } if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); } fputs(")\n", out); /* print support and add. measure */ n++; /* and count the group */ } } /* if (target <= TT_MFSET) .. else .. */ if (fflush(out) != 0) error(E_FWRITE, fn_out); if (out != stdout) fclose(out); out = NULL; /* close the output file */ MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target])); MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t))); #ifdef BENCH printf("number of support counters: %d\n", istree->sccnt); printf("necessary support counters: %d\n", istree->scnec); printf("number of child pointers : %d\n", istree->cpcnt); printf("necessary child pointers : %d\n", istree->cpnec); printf("allocated memory (bytes) : %d\n", istree->bytes); #endif /* --- clean up --- */ #ifndef NDEBUG /* if this is a debug version */ free(used); /* delete the item app. vector */ ist_delete(istree); /* delete the item set tree, */ if (tatree) tat_delete(tatree); /* the transaction tree, */ if (taset) tas_delete(taset, 0); /* the transaction set, */ is_delete(itemset); /* and the item set */ #endif #ifdef STORAGE /* if storage debugging */ showmem("at end of program"); /* check memory usage */ #endif return 0; /* return 'ok' */ } /* main() */
int main (int argc, char *argv[]) { /* --- main function */ int i, k = 0, n; /* loop variables, counters */ char *s; /* to traverse the options */ char **optarg = NULL; /* option argument */ char *fn_in = NULL; /* name of input file */ char *fn_out = NULL; /* name of output file */ char *fn_app = NULL; /* name of item appearances file */ char *blanks = NULL; /* blanks */ char *fldseps = NULL; /* field separators */ char *recseps = NULL; /* record separators */ char *comment = NULL; /* comment indicators */ char *isep = " "; /* item separator for output */ char *impl = " <- "; /* implication sign for ass. rules */ char *dflt = " (%1S)"; /* default format for check */ char *format = dflt; /* format for information output */ int target = 's'; /* target type (sets/rules/h.edges) */ int min = 1; /* minimum rule/item set size */ int max = INT_MAX; /* maximum rule/item set size */ double supp = 10; /* minimum support (in percent) */ double smax = 100; /* maximum support (in percent) */ double conf = 80; /* minimum confidence (in percent) */ int dir = 0; /* direction for size sorting */ int eval = 0; /* additional evaluation measure */ int zero = 0; /* flag for zero eval. below expect. */ int aggm = 0; /* aggregation mode for eval. measure */ double minval = 10; /* minimum evaluation measure value */ int prune = 0; /* (min. size for) evaluation pruning */ double filter = 0.1; /* item usage filtering parameter */ int sort = 2; /* flag for item sorting and recoding */ int tree = 1; /* flag for transaction tree */ int heap = 1; /* flag for heap sort vs. quick sort */ int post = 0; /* flag for a-posteriori pruning */ int report = 0; /* other flags for reporting */ int mode = APP_BODY|IST_PERFECT; /* search mode */ int size; /* current item set size */ int wgt; /* total transaction weight */ int frq, body, head; /* frequency of an item set */ int *items; /* item set (for reporting) */ clock_t t, tt, tc, x; /* timers for measurements */ #ifndef QUIET /* if not quiet version */ prgname = argv[0]; /* get program name for error msgs. */ /* --- print usage message --- */ if (argc > 1) { /* if arguments are given */ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION); fprintf(stderr, VERSION); } /* print a startup message */ else { /* if no arguments are given */ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]); printf("%s\n", DESCRIPTION); printf("%s\n", VERSION); printf("-t# target type " "(default: %c)\n", target); printf(" (s: frequent item sets, c: closed item sets,\n" " m: maximal item sets, r: association rules)\n"); printf("-m# minimum number of items per set/rule " "(default: %d)\n", min); printf("-n# maximum number of items per set/rule " "(default: no limit)\n"); printf("-s# minimum support of a set/rule " "(default: %g%%)\n", supp); printf("-S# maximum support of a set/rule " "(default: %g%%)\n", smax); printf(" (positive: percentage, " "negative: absolute number)\n"); printf("-c# minimum confidence of a rule " "(default: %g%%)\n", conf); printf("-o use the original rule support definition " "(body & head)\n"); printf("-e# additional evaluation measure " "(default: none)\n"); printf("-a# aggregation mode for evaluation measure " "(default: none)\n"); printf("-z zero evaluation below expected support " "(default: evaluate all)\n"); printf("-d# minimum value of add. evaluation measure " "(default: %g%%)\n", minval); printf("-p# (min. size for) pruning with evaluation " "(default: no pruning)\n"); printf(" (< 0: backward, > 0: forward)\n"); printf("-l# sort item sets in output by their size " "(default: no sorting)\n"); printf(" (< 0: descending, > 0: ascending)\n"); printf("-g write item names in scanable form " "(quote certain characters)\n"); printf("-k# item separator for output " "(default: \"%s\")\n", isep); printf("-i# implication sign for association rules " "(default: \"%s\")\n", impl); printf("-v# output format for set/rule information " "(default: \"%s\")\n", format); printf("-q# sort items w.r.t. their frequency " "(default: %d)\n", sort); printf(" (1: ascending, -1: descending, 0: do not sort,\n" " 2: ascending, -2: descending w.r.t. " "transaction size sum)\n"); printf("-u# filter unused items from transactions " "(default: %g)\n", filter); printf(" (0: do not filter items w.r.t. usage in sets,\n" " <0: fraction of removed items for filtering,\n" " >0: take execution times ratio into account)\n"); printf("-j use quicksort to sort the transactions " "(default: heapsort)\n"); printf("-x do not prune the search " "with perfect extensions\n"); printf("-y a-posteriori pruning of infrequent item sets\n"); printf("-h do not organize transactions as a prefix tree\n"); printf("-b# blank characters " "(default: \" \\t\\r\")\n"); printf("-f# field separators " "(default: \" \\t,\")\n"); printf("-r# record separators " "(default: \"\\n\")\n"); printf("-C# comment characters " "(default: \"#\")\n"); printf("-! print additional option information\n"); printf("infile file to read transactions from\n"); printf("outfile file to write item sets/association rules" "/hyperedges to\n"); printf("appfile file stating item appearances (optional)\n"); return 0; /* print a usage message */ } /* and abort the program */ #endif /* #ifndef QUIET */ /* free option characters: w [A-Z]\[SC] */ /* --- evaluate arguments --- */ for (i = 1; i < argc; i++) { /* traverse the arguments */ s = argv[i]; /* get an option argument */ if (optarg) { *optarg = s; optarg = NULL; continue; } if ((*s == '-') && *++s) { /* -- if argument is an option */ while (*s) { /* traverse the options */ switch (*s++) { /* evaluate the options */ case '!': help(); break; case 't': target = (*s) ? *s++ : 's'; break; case 'm': min = (int)strtol(s, &s, 0); break; case 'n': max = (int)strtol(s, &s, 0); break; case 's': supp = strtod(s, &s); break; case 'S': smax = strtod(s, &s); break; case 'c': conf = strtod(s, &s); break; case 'o': mode |= APP_BOTH; break; case 'e': eval = (*s) ? *s++ : 0; break; case 'z': zero = IST_ZERO; break; case 'a': aggm = (*s) ? *s++ : 0; break; case 'd': minval = strtod(s, &s); break; case 'p': prune = (int)strtol(s, &s, 0); break; case 'g': report = ISR_SCAN; break; case 'k': optarg = &isep; break; case 'i': optarg = &impl; break; case 'v': optarg = &format; break; case 'l': dir = (int)strtol(s, &s, 0); break; case 'q': sort = (int)strtol(s, &s, 0); break; case 'u': filter = strtod(s, &s); break; case 'h': tree = 0; break; case 'j': heap = 0; break; case 'x': mode &= ~IST_PERFECT; break; case 'y': post = 1; break; case 'b': optarg = &blanks; break; case 'f': optarg = &fldseps; break; case 'r': optarg = &recseps; break; case 'C': optarg = &comment; break; default : error(E_OPTION, *--s); break; } /* set the option variables */ if (optarg && *s) { *optarg = s; optarg = NULL; break; } } } /* get an option argument */ else { /* -- if argument is no option */ switch (k++) { /* evaluate non-options */ case 0: fn_in = s; break; case 1: fn_out = s; break; case 2: fn_app = s; break; default: error(E_ARGCNT); break; } /* note filenames */ } } if (optarg) error(E_OPTARG); /* check the option argument */ if ((k < 2) || (k > 3)) /* and the number of arguments */ error(E_ARGCNT); /* (either in/out or in/out/app) */ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app)) error(E_STDIN); /* stdin must not be used twice */ switch (target) { /* check and translate target type */ case 's': target = TT_ALL; break; case 'c': target = TT_CLOSED; break; case 'm': target = TT_MAXIMAL; break; case 'r': target = TT_RULE; break; default : error(E_TARGET, (char)target); break; } if (min < 0) error(E_SIZE, min); /* check the limits */ if (max < 0) error(E_SIZE, max); /* for the set size */ if (supp > 100) /* check the minimum support */ error(E_SUPP, supp); /* (< 0: absolute support) */ if ((conf < 0) || (conf > 100)) error(E_CONF, conf); /* check the minimum confidence */ switch (eval) { /* check and translate measure */ case 'x': case 0: eval = IST_NONE; break; case 'c': eval = IST_CONF; break; case 'd': eval = IST_CONF_DIFF; break; case 'l': eval = IST_LIFT; break; case 'a': eval = IST_LIFT_DIFF; break; case 'q': eval = IST_LIFT_QUOT; break; case 'v': eval = IST_CVCT; break; case 'e': eval = IST_CVCT_DIFF; break; case 'r': eval = IST_CVCT_QUOT; break; case 'f': eval = IST_CERT; break; case 'n': eval = IST_CHI2; break; case 'p': eval = IST_CHI2_PVAL; break; case 'i': eval = IST_INFO; break; case 'g': eval = IST_INFO_PVAL; break; case 'b': eval = IST_LOGQ; break; default : error(E_MEASURE, (char)eval); break; } switch (aggm) { /* check and translate agg. mode */ case 'x': case 0: aggm = IST_NONE; break; case 'm': aggm = IST_MIN; break; case 'n': aggm = IST_MAX; break; case 'a': aggm = IST_AVG; break; default : error(E_MEASURE, (char)aggm); break; } if ((target > TT_ALL) /* if individual set counters needed */ || ((eval > IST_NONE) && (eval < IST_LOGQ))) mode &= ~IST_PERFECT; /* remove perfect extension pruning */ if (target <= TT_MAXIMAL) { /* remove rule specific settings */ mode |= APP_BOTH; conf = 100; } if ((filter <= -1) || (filter >= 1)) filter = 0; /* check and adapt the filter option */ if (dir) /* if to sort output by size, */ mode &= ~IST_PERFECT; /* do not use perfect ext. pruning */ /* --- create item base --- */ ibase = ib_create(0, 0); /* create an item base and */ if (!ibase) error(E_NOMEM); /* set the special characters */ ib_chars(ibase, blanks, fldseps, recseps, "", comment); MSG(stderr, "\n"); /* terminate the startup message */ /* --- read item appearance indicators --- */ if (fn_app) { /* if item appearances are given */ t = clock(); /* start the timer for the reading */ if (*fn_app) /* if an app. file name is given, */ in = fopen(fn_app, "r"); /* open the item appearances file */ else { /* if no app. file name is given, */ in = stdin; fn_app = "<stdin>"; } /* read from std. input */ MSG(stderr, "reading %s ... ", fn_app); if (!in) error(E_FOPEN, fn_app); k = ib_readapp(ibase, in); /* read the item appearances */ if (k != 0) error(k, fn_app, RECCNT(ibase), BUFFER(ibase)); if (in != stdin) fclose(in);/* if not read from standard input, */ in = NULL; /* close the input file */ MSG(stderr, "[%d item(s)]", ib_cnt(ibase)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- read transactions --- */ t = clock(); /* start the timer for the reading */ if (fn_in && *fn_in) /* if an input file name is given, */ in = fopen(fn_in, "r"); /* open input file for reading */ else { /* if no input file name is given, */ in = stdin; fn_in = "<stdin>"; } /* read from standard input */ MSG(stderr, "reading %s ... ", fn_in); if (!in) error(E_FOPEN, fn_in); tabag = tb_create(ibase, 0); /* create a transaction bag/multiset */ if (!tabag) error(E_NOMEM); /* to store the transactions */ while (1) { /* transaction read loop */ k = ib_read(ibase, in); /* read the next transaction */ if (k) { if (k > 0) break; /* check for error and end of file */ error(k, fn_in, RECCNT(ibase), BUFFER(ibase)); } if (tb_add(tabag, NULL) != 0) error(E_NOMEM); } /* add transaction to bag/multiset */ if (in != stdin) fclose(in); /* if not read from standard input, */ in = NULL; /* close the input file */ n = ib_cnt(ibase); /* get the number of items */ k = tb_cnt(tabag); /* get the number of transactions */ wgt = tb_wgt(tabag); /* the total transaction weight */ MSG(stderr, "[%d item(s), ", n); if (k == wgt) MSG(stderr, "%d transaction(s)]", k); else MSG(stderr, "%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].", SEC_SINCE(t)); if ((n <= 0) || (wgt <= 0)) /* check for at least one item */ error(E_NOTRANS); /* and at least one transaction */ MSG(stderr, "\n"); /* terminate the log message */ if (format == dflt) { /* if default info. format is used */ if (target != TT_RULE) format = (supp < 0) ? " (%a)" : " (%1S)"; else format = (supp < 0) ? " (%b, %1C)" : " (%1X, %1C)"; } /* set default according to target */ supp = ceil ((supp >= 0) ? 0.01 *supp *wgt : -supp); smax = floor((smax >= 0) ? 0.01 *smax *wgt : -smax); conf *= 0.01; /* transform support and confidence */ /* --- sort and recode items --- */ t = clock(); /* compute absolute support values */ MSG(stderr, "filtering, sorting and recoding items ... "); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create an item identifier map */ k = (int)((mode & APP_HEAD) ? supp : ceil(supp *conf)); n = ib_recode(ibase, k, sort, map); tb_recode(tabag, map); /* recode the items and transactions */ tb_itsort(tabag, 1, heap); /* and sort items in transactions */ free(map); map = NULL; /* delete the item identifier map */ MSG(stderr, "[%d item(s)] done [%.2fs].", n, SEC_SINCE(t)); if (n <= 0) error(E_NOFREQ); /* print a log message and */ MSG(stderr, "\n"); /* check the number of items */ k = tb_max(tabag); /* clamp the set/rule length to */ if (max > k) max = k; /* the maximum transaction size */ /* --- reduce transactions --- */ t = clock(); /* start the timer for the reduction */ MSG(stderr, "reducing transactions ... "); tb_filter(tabag, min, NULL); /* remove items of short transactions */ tb_sort(tabag, 1, heap); /* sort the trans. lexicographically */ k = tb_reduce(tabag); /* reduce transactions to unique ones */ if (k == wgt) MSG(stderr, "[%d transaction(s)]", k); else MSG(stderr, "[%d/%d transaction(s)]", k, wgt); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- create transaction tree --- */ tt = 0; /* init. the tree construction time */ if (tree) { /* if to use a transaction tree */ t = clock(); /* start the timer for construction */ MSG(stderr, "building transaction tree ... "); tatree = tt_create(tabag); /* create a transaction tree */ if (!tatree) error(E_NOMEM); if (filter == 0) { /* if not to filter items, */ tb_delete(tabag, 0); /* delete the transaction bag */ tabag = NULL; /* (redundant data storage) */ } MSG(stderr, "[%d node(s)]", tt_nodecnt(tatree)); MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); tt = clock() -t; /* note the time for the construction */ } /* of the transaction tree */ /* --- create item set tree --- */ t = clock(); tc = 0; /* start the timer for the search */ istree = ist_create(ibase, mode, (int)supp, (int)smax, conf); if (!istree) error(E_NOMEM); /* create an item set tree */ ist_seteval(istree, eval|zero, aggm, 0.01*minval, prune); /* --- check item subsets --- */ MSG(stderr, "checking subsets of size 1"); map = (int*)malloc(n *sizeof(int)); if (!map) error(E_NOMEM); /* create a filter map */ while (1) { /* traverse the item set sizes */ size = ist_height(istree); /* get the current item set size and */ if (size >= max) break; /* abort if maximal size is reached */ if ((filter != 0) /* if to filter w.r.t. item usage */ && (ist_check(istree, map) <= size)) break; /* check which items are still used */ if (post) /* if a-posteriori pruning requested, */ ist_prune(istree); /* prune infrequent item sets */ k = ist_addlvl(istree); /* while max. height is not reached, */ if (k) { if (k > 0) break; /* add a level to the item set tree */ error(E_NOMEM); } /* if no level was added, abort */ if (((filter < 0) /* if to filter w.r.t. item usage */ && (i < -filter *n)) /* and enough items were removed */ || ((filter > 0) /* or counting time is long enough */ && (i < n) && (i *(double)tt < filter *n *tc))) { n = i; /* note the new number of items */ x = clock(); /* start the timer for filtering */ tb_filter(tabag, size+1, map); tb_sort(tabag, 0, heap); /* remove unnec. items and trans. */ tb_reduce(tabag); /* and reduce trans. to unique ones */ if (tatree) { /* if a transaction tree was created */ tt_delete(tatree, 0); /* delete the transaction tree */ tatree = tt_create(tabag); if (!tatree) error(E_NOMEM); } /* rebuild the transaction tree */ tt = clock() -x; /* note the filter/rebuild time */ } MSG(stderr, " %d", ++size); /* print the current item set size */ x = clock(); /* start the timer for counting */ if (tatree) ist_countx(istree, tatree); else ist_countb(istree, tabag); tc = clock() -x; /* count the transaction tree/bag */ } /* and compute the new counting time */ free(map); map = NULL; /* delete the filter map */ MSG(stderr, " done [%.2fs].\n", SEC_SINCE(t)); /* --- filter found item sets --- */ if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) { t = clock(); /* start the timer for filtering */ MSG(stderr, "filtering for %s item sets ... ", (target == TT_MAXIMAL) ? "maximal" : "closed"); k = target | ((prune < 0) ? IST_EVAL : 0); ist_mark(istree, k); /* filter closed/maximal item sets */ MSG(stderr, "done [%.2fs].\n", SEC_SINCE(t)); } /* print a log message */ /* --- print item sets/rules/hyperedges --- */ t = clock(); /* start the timer for the output */ if (fn_out && *fn_out) /* if an output file name is given, */ out = fopen(fn_out, "w"); /* open the output file */ else { /* if no output file name is given, */ out = stdout; fn_out = "<stdout>"; } /* write to std. output */ MSG(stderr, "writing %s ... ", fn_out); if (!out) error(E_FOPEN, fn_out); if (eval == IST_LOGQ) report |= ISR_LOGS; if ((target == TT_CLOSED) || (target == TT_MAXIMAL)) report |= ISR_NOEXP; /* combine the report mode flags */ isrep = isr_create(ibase, out, report, isep, impl); if (!isrep) error(E_NOMEM); /* create an item set reporter */ isr_setfmt (isrep, format); /* and configure it */ isr_setsize(isrep, min, max); ist_setsize(istree, min, max, dir); ist_init (istree); /* initialize the extraction */ items = t_items(ib_tract(ibase)); if ((target <= TT_MAXIMAL) /* if to find frequent item sets */ && (dir == 0)) { /* and not to sort them by size */ if (eval == IST_LOGQ) /* if to compute an add. evaluation */ isr_seteval(isrep, isr_logq, NULL, 0.01*minval); else if (eval > IST_NONE) /* set the add. evaluation function */ isr_seteval(isrep, ist_evalx, istree, 0.01*minval); n = ist_report(istree, isrep); } /* report the item sets */ else if (target <= TT_MAXIMAL) { /* if to find frequent item sets */ for (n = 0; 1; ) { /* extract item sets from the tree */ k = ist_set(istree, items, &frq, &minval); if (k < 0) break; /* get the next frequent item set */ if (k > 0) fputs(isr_name(isrep, items[0]), out); for (i = 0; ++i < k; ) { /* print the item names */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if requested, print information */ isr_sinfo(isrep, frq, minval); fputc('\n', out); n++; /* terminate the output line and */ } } /* count the reported item set */ else if (target == TT_RULE) { /* if to find association rules, */ for (n = 0; 1; ) { /* extract rules from tree */ k = ist_rule(istree, items, &frq, &body, &head, &minval); if (k < 0) break; /* get the next association rule */ fputs(isr_name(isrep, items[0]), out); fputs(impl, out); /* print name of rule head item */ if (k > 1) fputs(isr_name(isrep, items[1]), out); for (i = 1; ++i < k; ) { /* print names of items in rule body */ fputs(isep, out); fputs(isr_name(isrep, items[i]), out); } if (format) /* if requested, print information */ isr_rinfo(isrep, frq, body, head, minval); fputc('\n', out); n++; /* terminate the output line and */ } /* count the reported ass. rule */ } /* if (target <= TT_MAXIMAL) .. else .. */ if (fflush(out) != 0) error(E_FWRITE, fn_out); if (out != stdout) fclose(out); out = NULL; /* close the output file */ MSG(stderr, "[%d %s(s)] done ", n, (target == TT_RULE) ? "rule" : "set"); MSG(stderr, "[%.2fs].\n", SEC_SINCE(t)); #ifdef BENCH /* if benchmark version, */ ist_stats(istree); /* show the search statistics */ #endif /* (especially memory usage) */ /* --- clean up --- */ #ifndef NDEBUG /* if this is a debug version */ isr_delete(isrep, 0); /* the item set reporter, */ ist_delete(istree); /* the item set tree, */ if (tatree) tt_delete(tatree, 0); /* the transaction tree, */ if (tabag) tb_delete(tabag, 0); /* the transaction bag, */ ib_delete(ibase); /* and the item base */ #endif #ifdef STORAGE /* if storage debugging */ showmem("at end of program"); /* check memory usage */ #endif return 0; /* return 'ok' */ } /* main() */