/*
 * Count one occurrence of the byte stored at position `offset` of `nbytes`
 * in the frequency table bd->value_frequency.
 *
 * The table maps a heap-allocated single-char key to a heap-allocated int
 * counter.  If the byte is already present its counter is incremented in
 * place; otherwise a new (key, counter = 1) pair is inserted.
 *
 * The call is a silent no-op when `bd` or `nbytes` is NULL, when `offset`
 * is outside [0, lst_string_get_length(nbytes)), when the item cannot be
 * fetched, or when memory for a new entry cannot be obtained.
 *
 * NOTE(review): the lookup uses a stack-local probe key, so this relies on
 * the table's hash/equality callbacks comparing keys by the pointed-to
 * byte rather than by pointer identity.  The previous code looked up a
 * freshly malloc'd key, so it already depended on the same property.
 */
void byte_distribution_add(byte_distribution_t *bd, LST_String *nbytes, int offset)
{
    if (!bd || !nbytes) {
        return;
    }
    if (offset < 0 || offset >= lst_string_get_length(nbytes)) {
        return;
    }

    char *value = (char *) lst_string_get_item(nbytes, offset);
    if (!value) {
        return;
    }

    /* Probe with a stack copy: no allocation is needed for the common
     * "already seen" case, and nothing can leak on that path. */
    char probe = value[0];
    HashTableValue frequency = hash_table_lookup(bd->value_frequency, &probe);
    if (frequency != HASH_TABLE_NULL) {
        (*(int *) frequency)++;
        return;
    }

    /* First sighting of this byte: the table takes ownership of both the
     * key and the counter once the insert succeeds. */
    char *key = malloc(sizeof *key);
    int *freq = malloc(sizeof *freq);
    if (!key || !freq) {
        free(key);
        free(freq);
        return;
    }
    *key = probe;
    *freq = 1;

    /* hash_table_insert reports failure with zero; in that case the table
     * did not take ownership, so release the pair here. */
    if (!hash_table_insert(bd->value_frequency, key, freq)) {
        free(key);
        free(freq);
    }
}
/*
 * Store the mapping `pos -> string_index` in `dict`, both as Python ints.
 *
 * Returns 0 on success and -1 on failure; on failure the Python exception
 * set by the failing C-API call is left in place.
 */
static int
stree_record_token_match(PyObject *dict, long pos, long string_index)
{
  int rc = -1;
  PyObject *pyindex = PyInt_FromLong(pos);
  PyObject *pystring = PyInt_FromLong(string_index);

  if (pyindex != NULL && pystring != NULL)
    rc = PyDict_SetItem(dict, pyindex, pystring);

  /* PyDict_SetItem does not steal references, so drop ours either way. */
  Py_XDECREF(pyindex);
  Py_XDECREF(pystring);
  return rc;
}

/*
 * Here, tree is made up of tokens, and we're returning
 * which tokens are contained in the search string.
 *
 * For every start position `pos` in `string` the tree is walked from the
 * root; if a complete token is matched starting at `pos`, the returned
 * dict gets an entry { pos : lst_stree_get_string_index(tree, token) }.
 * When several tokens match at the same position the one found last during
 * the descent (the deepest one) is kept.
 *
 * Returns a new reference to the dict, or NULL with a Python exception set
 * if a Python object could not be created or stored.
 *
 * `tree` and `string` are dereferenced without checks and must be non-NULL.
 *
 * Naive implementation for now - can make this linear time.
 */
static PyObject*
stree_find_tokens(LST_STree *tree, LST_String *string)
{
  int pos = 0;
  int offset = 0;   /* items matched on fully traversed edges (depth of node) */
  int common = 0;   /* items matched on the edge where the descent stopped */
  int length = lst_string_get_length(string);
  int skip = 0;     /* items already known to match from the previous pos */
  LST_Node *node = tree->root_node;
  LST_Edge *edge = NULL;
  LST_Edge *c_edge = NULL;
  LST_String *longest_match = NULL;
  PyObject *pydict = PyDict_New();

  if (pydict == NULL)
    return NULL;

  for (pos = 0; pos < length; pos++)
    {
      longest_match = NULL;

      while (1)
        {
          /* If any out-edges are the end of a string, mark this as our
           * best match so far.  `offset` stands in for the string depth
           * of `node`. */
          for (c_edge = node->kids.lh_first; c_edge; c_edge = c_edge->siblings.le_next)
            {
              if (c_edge->range.start_index == lst_string_get_length(c_edge->range.string) &&
                  offset == lst_string_get_length(c_edge->range.string))
                longest_match = c_edge->range.string;
            }

          edge = node_find_edge_with_startitem(node, string, pos + offset);

          if (!edge)
            {
              /* Mismatch: no child edge starts with the next item. */
              common = 0;
              break;
            }

          if (skip >= lst_edge_get_length(edge))
            {
              /* The whole edge is already known to match: skip over it. */
              common = lst_edge_get_length(edge);
              skip -= common;
            }
          else
            {
              /* Compare only the part of the edge not covered by `skip`. */
              common = skip + lst_string_items_common(edge->range.string,
                                                      edge->range.start_index + skip,
                                                      string,
                                                      pos + offset + skip,
                                                      lst_edge_get_length(edge) - skip);
              skip = 0;
            }

          if (common < (u_int) lst_edge_get_length(edge))
            {
              /* Mismatch inside the edge; it still counts as a match when
               * everything up to the end of the edge's string agreed. */
              if (edge->range.start_index + common == lst_string_get_length(edge->range.string) &&
                  offset + common == lst_string_get_length(edge->range.string))
                {
                  longest_match = edge->range.string;
                }
              break;
            }

          node = edge->dst_node;
          offset += lst_edge_get_length(edge);
        }

      if (longest_match != NULL)
        {
          if (stree_record_token_match(pydict, pos,
                                       lst_stree_get_string_index(tree, longest_match)) < 0)
            {
              Py_DECREF(pydict);
              return NULL;
            }
        }

      /* TODO: follow node->suffix_link_node (continuing at that node's
       * string depth) instead of restarting at the root, and combine it
       * with skip/count, to get a linear time bound. */
      skip = offset + common - 1;
      skip = (skip < 0 ? 0 : skip);
      node = tree->root_node;
      offset = 0;
    }

  return pydict;
}