Esempio n. 1
0
/*
 * Count one occurrence of the byte found at position `offset` of `nbytes`
 * in the frequency table bd->value_frequency.
 *
 * The table maps a heap-allocated single-char key to a heap-allocated int
 * counter. If the byte is already present its counter is incremented in
 * place; otherwise a new key/counter pair (counter == 1) is inserted and
 * handed over to the table.
 *
 * The call is a silent no-op when `bd` or `nbytes` is NULL, when `offset`
 * is outside [0, length of nbytes), when the item pointer is NULL, or when
 * memory for a new entry cannot be allocated.
 *
 * NOTE(review): this assumes the table's hash/equality callbacks compare
 * the char the key points at (not the pointer itself) -- the lookup could
 * never succeed otherwise. Confirm where bd->value_frequency is created.
 */
void byte_distribution_add(byte_distribution_t *bd, LST_String *nbytes, int offset){

	if(!bd || !nbytes) {
		return;
	}

	if(offset < 0 || offset >= lst_string_get_length(nbytes)) {
		return;
	}

	char *value = (char *) lst_string_get_item(nbytes, offset);
	if(!value) {
		return;
	}

	/*
	 * Probe with a stack copy of the byte: nothing has to be allocated
	 * (or freed) when the byte is already known to the table.
	 */
	char probe = value[0];
	HashTableValue frequency = hash_table_lookup(bd->value_frequency, &probe);

	if(frequency != HASH_TABLE_NULL) {
		(*(int *)frequency)++;
		return;
	}

	/* First occurrence: create the key and its counter for the table. */
	char *key = malloc(sizeof *key);
	int *freq = malloc(sizeof *freq);
	if(!key || !freq) {
		/* free(NULL) is a no-op, so both can be released blindly. */
		free(key);
		free(freq);
		return;
	}

	*key = probe;
	*freq = 1;
	hash_table_insert(bd->value_frequency, key, freq);
}
Esempio n. 2
0
/*
 * Here, tree is made up of tokens, and we're returning
 * which tokens are contained in search string.
 * Naive implementation for now- can make this linear time.
 *
 * For every start position `pos` of `string`, the tree is walked from the
 * root along the edges that match string[pos...]. While walking, the last
 * candidate seen is remembered in `longest_match`: a string whose edge range
 * starts exactly at that string's end and whose length equals the depth
 * reached so far (presumably: a whole token, not just a suffix of one, ends
 * here -- TODO confirm against the libstree edge layout).
 *
 * Returns a new dict mapping pos (int) to the value of
 * lst_stree_get_string_index() for the token recorded at that position.
 * Positions without a recorded token have no entry. Returns NULL if
 * creating the dict, creating an int object, or storing an entry fails;
 * the partially built dict is released in that case.
 */
static PyObject*
stree_find_tokens(LST_STree *tree, LST_String *string)
{
	int pos = 0;
	int offset = 0;		/* depth (in items) of `node` below the root */
	int common = 0;		/* items matched on the most recent edge */
	int length = lst_string_get_length(string);
	int skip = 0;		/* items already known to match; not re-compared */

	LST_Node *node = tree->root_node;
	LST_Edge   *edge = NULL;
	LST_Edge   *c_edge = NULL;
	LST_String *longest_match = NULL;
	PyObject *pydict = PyDict_New();

	if (pydict == NULL) {
		return NULL;
	}

	for(pos=0; pos < length; pos++) {
		longest_match = NULL;
		while(1) {
			/* if any out-edges are the end of a string, mark this as our best
			 * match so far.
			 */
			for (c_edge = node->kids.lh_first; c_edge; c_edge = c_edge->siblings.le_next) {
				if(c_edge->range.start_index ==
				   lst_string_get_length(c_edge->range.string) &&
				   offset ==
				   lst_string_get_length(c_edge->range.string))
					longest_match = c_edge->range.string;
			}

			edge = node_find_edge_with_startitem(node, string, pos+offset);
			if (!edge) {
				/* mismatch: no child edge starts with the next item */
				common = 0;
				break;
			}

			if (skip >= lst_edge_get_length(edge)) {
				/* whole edge is covered by the previous position's match */
				common = lst_edge_get_length(edge);
				skip -= common;
			} else {
				/* compare only the part of the edge beyond `skip` */
				common = skip + lst_string_items_common(edge->range.string,
								 edge->range.start_index + skip,
								 string, pos+offset+skip,
								 lst_edge_get_length(edge)-skip);
				skip = 0;
			}

			if (common < (u_int) lst_edge_get_length(edge)) {
				/* mismatch in edge: the match may still end exactly
				 * at the end of the edge's string.
				 */
				if (edge->range.start_index + common ==
				    lst_string_get_length(edge->range.string) &&
				    offset + common ==
				    lst_string_get_length(edge->range.string)) {
					longest_match = edge->range.string;
				}

				break;
			}

			node = edge->dst_node;
			offset += lst_edge_get_length(edge);
		}

		if (longest_match != NULL) {
			PyObject *pyindex = PyInt_FromLong(pos);
			PyObject *pystring = PyInt_FromLong(lst_stree_get_string_index(tree, longest_match));
			/* PyDict_SetItem is only attempted when both objects exist. */
			int failed = (pyindex == NULL || pystring == NULL ||
				      PyDict_SetItem(pydict, pyindex, pystring) < 0);

			/* The dict holds its own references; drop ours either way. */
			if (pyindex != NULL) {
				Py_DECREF(pyindex);
			}
			if (pystring != NULL) {
				Py_DECREF(pystring);
			}
			if (failed) {
				Py_DECREF(pydict);
				return NULL;
			}
		}

		/* change this to do follow suffix links and
		 * do skip/count for linear time bound
		 */

		/* Everything matched from `pos` except its first item is carried
		 * over as `skip` for pos+1; the walk then restarts at the root.
		 */
		skip = offset + common - 1;
		skip = (skip < 0 ? 0 : skip);
		node = tree->root_node;
		offset = 0;
	}
	return pydict;
}