// Announces a substructure as the "New best" one.
//
// Nothing is printed unless the list is empty or sub's value is strictly
// greater than the value of the substructure at the head of the list.
// While printing, parameters->outputLevel is temporarily forced to 1 and
// then restored to the value it had on entry.
//
// sub        - candidate substructure to report
// subList    - list whose head is compared against the candidate
// parameters - run parameters; outputLevel is modified and restored
void PrintNewBestSub(Substructure *sub, SubList *subList, Parameters *parameters)
{
   ULONG savedLevel;

   // Bail out when a head exists and the candidate does not beat it.
   if ((subList->head != NULL) && !(sub->value > subList->head->sub->value))
      return;

   savedLevel = parameters->outputLevel;
   parameters->outputLevel = 1;   // turn off instance printing

   printf("\nNew best ");
   PrintSub(sub, parameters);
   printf("\n");

   parameters->outputLevel = savedLevel;
}
// Writes the subtree rooted at node `n` to `str`, one node per line, and
// recurses into every non-NIL child.
//
// str   - destination stream
// flags - bit mask selecting the per-node fields that are printed:
//           1 : the node's substring, via PrintSub(str, pos, pos + len)
//           2 : the node's length (0 is used for ROOT)
//           8 : node.count
//         (bit 4 has no effect here; its output is disabled.)
// n     - node to print; NIL means "start at ROOT"
// prev  - node from which `n` was reached; it is only forwarded/reset here
// ind   - indentation handed to PrintInd; children get ind + len
//
// Children are visited for symbols 1, 2, ..., NSymb-1 and finally symbol 0.
// A separate leaf-only branch (using GetLeafPos(prev, n) and node.dep) was
// commented out in the previous version and is not reproduced; every node
// takes the single path below.
void SuffixTree<Symb,NSymb>::Print(std::ostream& str, uint flags, PNode n, PNode prev, int ind)
{
    if(n == NIL) {
        n = ROOT;
        prev = NIL;
    }

    Node& node = GetNode(n);
    const int len = (n == ROOT) ? 0 : node.len;

    const bool showSubstring = (flags & 1) != 0;
    const bool showLength    = (flags & 2) != 0;
    const bool showCount     = (flags & 8) != 0;

    PrintInd(str, ind);

    if(showSubstring) {
        PrintSub(str, node.pos, node.pos + len);
        str << ' ';
    }
    if(showLength) {
        str << len << ' ';
    }
    if(showCount) {
        str << node.count << ' ';
    }

    str << " [" << n << "]->[" << node.suf << "]" << std::endl;

    const int childInd = ind + len;
    for(int k = 0; k < NSymb; ++k) {
        // (k + 1) % NSymb yields 1, 2, ..., NSymb-1, 0 in that order.
        PNode child = GetChild(n, (k + 1) % NSymb);
        if(child == NIL)
            continue;
        Print(str, flags, child, n, childInd);
    }
}
// Prints every substructure in the list, in list order.
//
// Each entry is written as "(k) " followed by the output of PrintSub and a
// newline, where k counts from 1. A NULL list prints nothing.
//
// subList    - list to print (may be NULL)
// parameters - passed through to PrintSub
void PrintSubList(SubList *subList, Parameters *parameters)
{
   SubListNode *node;
   ULONG position = 0;

   if (subList == NULL)
      return;

   for (node = subList->head; node != NULL; node = node->next)
   {
      position++;
      printf("(%lu) ", position);
      PrintSub(node->sub, parameters);
      printf("\n");
   }
}
// Discovers substructures and returns them in a newly allocated list.
//
// Starting from the list returned by GetInitialSubs, each pass walks the
// current parent list. A parent is extended (ExtendSub) only while the
// remaining `limit` is positive and either it has more than one instance
// with an evaluation method other than EVAL_SETCOVER, or it has at least
// one negative instance. Each extension with at most maxVertices vertices
// is evaluated (EvaluateSub) and inserted into the child list with
// SubListInsert(beamWidth, valueBased); with pruning enabled, extensions
// whose value is below the parent's are freed instead. The child list then
// becomes the next parent list.
//
// Every parent with at least minVertices vertices for which
// SinglePreviousSub() is false is inserted into the discovered list (bounded
// by numBestSubs), together with its recursive variant when recursion is
// enabled and RecursifySub() returns non-NULL. After the loop ends, the
// substructures still in the parent list are offered to the discovered list
// in the same way.
//
// Ownership: each substructure is detached from its list node before being
// processed, so this function must either hand it to SubListInsert or free
// it. Parents rejected by SinglePreviousSub() are now freed as well; the
// previous version dropped them without freeing.
// NOTE(review): this assumes SinglePreviousSub() only inspects the
// substructure and does not retain the pointer - confirm.
//
// parameters - run parameters; outputLevel is temporarily changed and
//              restored around diagnostic printing
// Returns the list of discovered substructures (allocated here with
// AllocateSubList).
SubList *DiscoverSubs(Parameters *parameters)
{
   SubList *parentSubList;
   SubList *childSubList;
   SubList *extendedSubList;
   SubList *discoveredSubList;
   SubListNode *parentSubListNode;
   SubListNode *extendedSubListNode;
   Substructure *parentSub;
   Substructure *extendedSub;
   Substructure *recursiveSub = NULL;

   // parameters used
   ULONG limit = parameters->limit;
   ULONG numBestSubs = parameters->numBestSubs;
   ULONG beamWidth = parameters->beamWidth;
   BOOLEAN valueBased = parameters->valueBased;
   LabelList *labelList = parameters->labelList;
   BOOLEAN prune = parameters->prune;
   ULONG maxVertices = parameters->maxVertices;
   ULONG minVertices = parameters->minVertices;
   ULONG outputLevel = parameters->outputLevel;
   BOOLEAN recursion = parameters->recursion;
   ULONG evalMethod = parameters->evalMethod;

   // get initial one-vertex substructures
   parentSubList = GetInitialSubs(parameters);

   discoveredSubList = AllocateSubList();
   while ((limit > 0) && (parentSubList->head != NULL))
   {
      parentSubListNode = parentSubList->head;
      childSubList = AllocateSubList();

      // extend each substructure in parent list
      while (parentSubListNode != NULL)
      {
         // detach the substructure so FreeSubList(parentSubList) below
         // does not touch it
         parentSub = parentSubListNode->sub;
         parentSubListNode->sub = NULL;

         if (outputLevel > 4)
         {
            parameters->outputLevel = 1; // turn off instance printing
            printf("\nConsidering ");
            PrintSub(parentSub, parameters);
            printf("\n");
            parameters->outputLevel = outputLevel;
         }

         if ((((parentSub->numInstances > 1) && (evalMethod != EVAL_SETCOVER)) ||
              (parentSub->numNegInstances > 0)) &&
             (limit > 0))
         {
            limit--;
            if (outputLevel > 3)
               printf("%lu substructures left to be considered\n", limit);
            fflush(stdout);   // flushed regardless of outputLevel

            extendedSubList = ExtendSub(parentSub, parameters);
            extendedSubListNode = extendedSubList->head;
            while (extendedSubListNode != NULL)
            {
               extendedSub = extendedSubListNode->sub;
               extendedSubListNode->sub = NULL;
               if (extendedSub->definition->numVertices <= maxVertices)
               {
                  // evaluate each extension and add to child list
                  EvaluateSub(extendedSub, parameters);
                  if (prune && (extendedSub->value < parentSub->value))
                  {
                     FreeSub(extendedSub);
                  }
                  else
                  {
                     SubListInsert(extendedSub, childSubList, beamWidth,
                                   valueBased, labelList);
                  }
               }
               else
               {
                  FreeSub(extendedSub);
               }
               extendedSubListNode = extendedSubListNode->next;
            }
            FreeSubList(extendedSubList);
         }

         // add parent substructure to final discovered list
         if (parentSub->definition->numVertices >= minVertices)
         {
            if (! SinglePreviousSub(parentSub, parameters))
            {
               // consider recursive substructure, if requested
               if (recursion)
                  recursiveSub = RecursifySub(parentSub, parameters);
               if (outputLevel > 3)
                  PrintNewBestSub(parentSub, discoveredSubList, parameters);
               SubListInsert(parentSub, discoveredSubList, numBestSubs, FALSE,
                             labelList);
               if (recursion && (recursiveSub != NULL))
               {
                  if (outputLevel > 4)
                  {
                     parameters->outputLevel = 1; // turn off instance printing
                     printf("\nConsidering Recursive ");
                     PrintSub(recursiveSub, parameters);
                     printf ("\n");
                     parameters->outputLevel = outputLevel;
                  }
                  if (outputLevel > 3)
                     PrintNewBestSub(recursiveSub, discoveredSubList, parameters);
                  SubListInsert(recursiveSub, discoveredSubList, numBestSubs,
                                FALSE, labelList);
               }
            }
            else
            {
               // rejected and already detached from its node: free it here,
               // otherwise nothing ever would
               FreeSub(parentSub);
            }
         }
         else
         {
            FreeSub (parentSub);
         }
         parentSubListNode = parentSubListNode->next;
      }
      FreeSubList(parentSubList);
      parentSubList = childSubList;
   }

   if ((limit > 0) && (outputLevel > 2))
      printf ("\nSubstructure queue empty.\n");

   // try to insert any remaining subs in parent list on to discovered list
   parentSubListNode = parentSubList->head;
   while (parentSubListNode != NULL)
   {
      parentSub = parentSubListNode->sub;
      parentSubListNode->sub = NULL;
      if (parentSub->definition->numVertices >= minVertices)
      {
         if (! SinglePreviousSub(parentSub, parameters))
         {
            if (outputLevel > 3)
               PrintNewBestSub(parentSub, discoveredSubList, parameters);
            SubListInsert(parentSub, discoveredSubList, numBestSubs, FALSE,
                          labelList);
         }
         else
         {
            // same ownership rule as in the main loop above
            FreeSub(parentSub);
         }
      }
      else
      {
         FreeSub(parentSub);
      }
      parentSubListNode = parentSubListNode->next;
   }
   FreeSubList(parentSubList);

   return discoveredSubList;
}
int main(int argc, char **argv) { struct tms tmsstart, tmsend; clock_t startTime, endTime; static long clktck = 0; time_t iterationStartTime; time_t iterationEndTime; SubList *subList; Substructure *normSub = NULL; Parameters *parameters; FILE *outputFile; ULONG iteration; BOOLEAN done; clktck = sysconf(_SC_CLK_TCK); startTime = times(&tmsstart); printf("GBAD %s\n\n", GBAD_VERSION); parameters = GetParameters(argc, argv); // compress positive graphs with predefined subs, if given if (parameters->numPreSubs > 0) CompressWithPredefinedSubs(parameters); PrintParameters(parameters); if (parameters->iterations > 1) printf("----- Iteration 1 -----\n\n"); iteration = 1; parameters->currentIteration = iteration; done = FALSE; while ((iteration <= parameters->iterations) && (!done)) { iterationStartTime = time(NULL); if (iteration > 1) printf("----- Iteration %lu -----\n\n", iteration); printf("%lu positive graphs: %lu vertices, %lu edges", parameters->numPosEgs, parameters->posGraph->numVertices, parameters->posGraph->numEdges); if (parameters->evalMethod == EVAL_MDL) printf(", %.0f bits\n", parameters->posGraphDL); else printf("\n"); printf("%lu unique labels\n", parameters->labelList->numLabels); printf("\n"); if ((parameters->prob) && (iteration > 1)) { // // If GBAD-P option chosen, after the first iteration, we no longer // care about minsize of maxsize after the first iteration (if the // user specified these parameters), as we are just dealing with // single extensions from the normative - so set it to where we // just look at substructures that are composed of the normative // pattern (SUB_) and the single vertex extension. // parameters->minVertices = 1; parameters->maxVertices = 2; } // // If the user has specified a normative pattern, on the first iteration // need to save the top-N substructures, where N is what the user // specified with the -norm parameter. 
// ULONG saveNumBestSubs = parameters->numBestSubs; if ((iteration == 1) && (!parameters->noAnomalyDetection) && (parameters->norm > parameters->numBestSubs)) parameters->numBestSubs = parameters->norm; // // -prune is useful to get to the initial normative pattern, but // possibly detremental to discovering anomalies... so, turn off // pruning (in case it was turned on), so that it is not used in // future iterations. // if ((parameters->prob) && (iteration > 1)) { parameters->prune = FALSE; } subList = DiscoverSubs(parameters, iteration); // // Now that we have the best substructure(s), return the user // specified number of best substructures to its original value. // if (iteration == 1) parameters->numBestSubs = saveNumBestSubs; if (subList->head == NULL) { done = TRUE; printf("No substructures found.\n\n"); } else { // // GBAD-MDL // if (parameters->mdl) GBAD_MDL(subList,parameters); // // GBAD-MPS // if (parameters->mps) { GBAD_MPS(subList,parameters); } // // GBAD-P // if (parameters->prob) { normSub = GBAD_P(subList,iteration,parameters); } // write output to stdout if (parameters->outputLevel > 1) { printf("\nBest %lu substructures:\n\n", CountSubs (subList)); PrintSubList(subList, parameters); } else { printf("\nBest substructure: "); if ((CountSubs(subList) > 0) && (subList->head->sub != NULL)) PrintSub(subList->head->sub, parameters); else printf("None."); printf("\n\n"); } // write machine-readable output to file, if given if (parameters->outputToFile) { outputFile = fopen(parameters->outFileName, "a"); if (outputFile == NULL) { printf("WARNING: unable to write to output file %s,", parameters->outFileName); printf("disabling\n"); parameters->outputToFile = FALSE; } WriteGraphToFile(outputFile, subList->head->sub->definition, parameters->labelList, 0, 0, subList->head->sub->definition->numVertices, TRUE); fclose(outputFile); } if (iteration < parameters->iterations) { // Another iteration? 
if (parameters->evalMethod == EVAL_SETCOVER) { printf("Removing positive examples covered by"); printf(" best substructure.\n\n"); RemovePosEgsCovered(subList->head->sub, parameters); } else { // // For the GBAD-P algorithm, multiple iterations will need // to be performed, and if it is the first iteration // AND the user has specified a different normative // pattern (other than the best one), we need to // use the substructure that was set above. // if ((iteration == 1) && (parameters->prob)) { printf("Compressing graph by best substructure (%lu):\n", parameters->norm); PrintSub(normSub,parameters); printf("\n"); CompressFinalGraphs(normSub, parameters, iteration, FALSE); } else CompressFinalGraphs(subList->head->sub, parameters, iteration, FALSE); } // check for stopping condition // if set-covering, then no more positive examples // if MDL or size, then positive graph contains no edges if (parameters->evalMethod == EVAL_SETCOVER) { if (parameters->numPosEgs == 0) { done = TRUE; printf("Ending iterations - "); printf("all positive examples covered.\n\n"); } } else { if (parameters->posGraph->numEdges == 0) { done = TRUE; printf("Ending iterations - graph fully compressed.\n\n"); } } } if ((iteration == parameters->iterations) && (parameters->compress)) { if (parameters->evalMethod == EVAL_SETCOVER) WriteUpdatedGraphToFile(subList->head->sub, parameters); else WriteCompressedGraphToFile(subList->head->sub, parameters, iteration); } } // // Need to store information regarding initial best substructure, for use // in future GBAD-P calculations // if ((parameters->prob) && (iteration == 1) && (subList->head != NULL)) { parameters->numPreviousInstances = subList->head->sub->numInstances; } if ((parameters->prob) && (iteration > 1) && (subList->head != NULL)) parameters->numPreviousInstances = subList->head->sub->numInstances; FreeSubList(subList); if (parameters->iterations > 1) { iterationEndTime = time(NULL); printf("Elapsed time for iteration %lu = %lu 
seconds.\n\n", iteration, (iterationEndTime - iterationStartTime)); } iteration++; parameters->currentIteration = iteration; } FreeParameters(parameters); endTime = times(&tmsend); printf("\nGBAD done (elapsed CPU time = %7.2f seconds).\n", (endTime - startTime) / (double) clktck); return 0; }