//
// _yr_atoms_tree_node_destroy
//
// Releases an atom tree node. For ATOM_TREE_OR and ATOM_TREE_AND nodes every
// child in the children_head/next_sibling chain is destroyed recursively
// before the node itself is freed. Nodes of any other type are freed without
// looking at their children. A NULL node is silently ignored.
//

static void _yr_atoms_tree_node_destroy(
    ATOM_TREE_NODE* node)
{
  ATOM_TREE_NODE* cursor;
  ATOM_TREE_NODE* following;

  int has_children;

  if (node == NULL)
    return;

  has_children = (node->type == ATOM_TREE_OR || node->type == ATOM_TREE_AND);

  if (has_children)
  {
    for (cursor = node->children_head; cursor != NULL; cursor = following)
    {
      // Grab the sibling link first, the recursive call frees the cursor.
      following = cursor->next_sibling;
      _yr_atoms_tree_node_destroy(cursor);
    }
  }

  yr_free(node);
}
//
// _yr_atoms_extract_from_re_node
//
// Recursively walks the regular expression sub-tree rooted at re_node and
// adds the atoms found to the atom tree.
//
// Args:
//    re_node       - Regular expression node being visited.
//    atom_tree     - Atom tree under construction. Its current_leaf field
//                    holds the leaf being filled with literals, or NULL if
//                    no leaf is in progress.
//    current_node  - Node that receives finished leaves and sub-trees.
//
// Returns:
//    current_node on success, NULL if a node could not be created or the
//    node type is not handled. Every path in this function returns either the
//    node it was given or NULL.
//
// NOTE(review): append_current_leaf_to_node is a macro defined elsewhere in
// this file; it is assumed to append atom_tree->current_leaf (if any) to the
// given node and reset atom_tree->current_leaf to NULL - confirm.
//

static ATOM_TREE_NODE* _yr_atoms_extract_from_re_node(
    RE_NODE* re_node,
    ATOM_TREE* atom_tree,
    ATOM_TREE_NODE* current_node)
{
  ATOM_TREE_NODE* left_node;
  ATOM_TREE_NODE* right_node;
  ATOM_TREE_NODE* and_node;
  ATOM_TREE_NODE* current_leaf;
  ATOM_TREE_NODE* temp;

  int quality;
  int new_quality;
  int i;

  uint8_t new_atom[MAX_ATOM_LENGTH];

  switch(re_node->type)
  {
    case RE_NODE_LITERAL:

      // Start a new leaf if there is none in progress. The leaf initially
      // points to the code of the first literal it contains.
      if (atom_tree->current_leaf == NULL)
      {
        atom_tree->current_leaf = _yr_atoms_tree_node_create(ATOM_TREE_LEAF);

        if (atom_tree->current_leaf == NULL)
          return NULL;

        atom_tree->current_leaf->forward_code = re_node->forward_code;
        atom_tree->current_leaf->backward_code = re_node->backward_code;

        assert(atom_tree->current_leaf->forward_code != NULL);
        assert(atom_tree->current_leaf->backward_code != NULL);
      }

      current_leaf = atom_tree->current_leaf;

      if (current_leaf->atom_length < MAX_ATOM_LENGTH)
      {
        // There is still room in the atom, just add the literal.
        current_leaf->atom[current_leaf->atom_length] =
            (uint8_t) re_node->value;
        current_leaf->recent_nodes[current_leaf->atom_length] = re_node;
        current_leaf->atom_length++;
      }
      else
      {
        // The atom is full. Slide the window of recent nodes one position
        // and keep the new window only if it scores better than the atom
        // currently stored in the leaf.
        for (i = 1; i < MAX_ATOM_LENGTH; i++)
          current_leaf->recent_nodes[i - 1] = current_leaf->recent_nodes[i];

        current_leaf->recent_nodes[MAX_ATOM_LENGTH - 1] = re_node;

        for (i = 0; i < MAX_ATOM_LENGTH; i++)
          new_atom[i] = (uint8_t) current_leaf->recent_nodes[i]->value;

        quality = _yr_atoms_quality(
            current_leaf->atom, MAX_ATOM_LENGTH);

        new_quality = _yr_atoms_quality(
            new_atom, MAX_ATOM_LENGTH);

        if (new_quality > quality)
        {
          for (i = 0; i < MAX_ATOM_LENGTH; i++)
            current_leaf->atom[i] = new_atom[i];

          // The atom now starts at a different literal, so the code
          // pointers must follow it.
          current_leaf->forward_code =
              current_leaf->recent_nodes[0]->forward_code;

          current_leaf->backward_code =
              current_leaf->recent_nodes[0]->backward_code;

          assert(current_leaf->forward_code != NULL);
          assert(current_leaf->backward_code != NULL);
        }
      }

      return current_node;

    case RE_NODE_CONCAT:

      current_node = _yr_atoms_extract_from_re_node(
          re_node->left, atom_tree, current_node);

      if (current_node == NULL)
        return NULL;

      current_node = _yr_atoms_extract_from_re_node(
          re_node->right, atom_tree, current_node);

      return current_node;

    case RE_NODE_ALT:

      append_current_leaf_to_node(current_node);

      left_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);

      if (left_node == NULL)
        return NULL;

      temp = _yr_atoms_extract_from_re_node(
          re_node->left, atom_tree, left_node);

      if (temp == NULL)
      {
        // left_node is not attached to the tree yet, nobody else would
        // release it.
        _yr_atoms_tree_node_destroy(left_node);
        return NULL;
      }

      left_node = temp;

      append_current_leaf_to_node(left_node);

      if (left_node->children_head == NULL)
      {
        // The left alternative produced no atoms, so the alternation as a
        // whole contributes nothing.
        _yr_atoms_tree_node_destroy(left_node);
        return current_node;
      }

      if (left_node->children_head == left_node->children_tail)
      {
        // An OR node with a single child is replaced by the child itself.
        temp = left_node;
        left_node = left_node->children_head;
        yr_free(temp);
      }

      right_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);

      if (right_node == NULL)
      {
        _yr_atoms_tree_node_destroy(left_node);
        return NULL;
      }

      temp = _yr_atoms_extract_from_re_node(
          re_node->right, atom_tree, right_node);

      if (temp == NULL)
      {
        // Neither sub-tree has been attached to the tree at this point.
        _yr_atoms_tree_node_destroy(left_node);
        _yr_atoms_tree_node_destroy(right_node);
        return NULL;
      }

      right_node = temp;

      append_current_leaf_to_node(right_node);

      if (right_node->children_head == NULL)
      {
        _yr_atoms_tree_node_destroy(left_node);
        _yr_atoms_tree_node_destroy(right_node);
        return current_node;
      }

      if (right_node->children_head == right_node->children_tail)
      {
        temp = right_node;
        right_node = right_node->children_head;
        yr_free(temp);
      }

      and_node = _yr_atoms_tree_node_create(ATOM_TREE_AND);

      if (and_node == NULL)
      {
        _yr_atoms_tree_node_destroy(left_node);
        _yr_atoms_tree_node_destroy(right_node);
        return NULL;
      }

      // Both alternatives hang from a single AND node.
      and_node->children_head = left_node;
      and_node->children_tail = right_node;
      left_node->next_sibling = right_node;

      _yr_atoms_tree_node_append(current_node, and_node);

      return current_node;

    case RE_NODE_RANGE:

      // With a lower bound of zero the sub-expression is never visited, so
      // the leaf in progress is closed here.
      if (re_node->start == 0)
      {
        append_current_leaf_to_node(current_node);
      }

      // Visit the repeated sub-expression once per mandatory repetition.
      for (i = 0; i < re_node->start; i++)
      {
        current_node = _yr_atoms_extract_from_re_node(
            re_node->left, atom_tree, current_node);

        if (current_node == NULL)
          return NULL;
      }

      // When the bounds differ the leaf in progress is closed as well.
      if (re_node->start != re_node->end)
      {
        append_current_leaf_to_node(current_node);
      }

      return current_node;

    case RE_NODE_PLUS:

      current_node = _yr_atoms_extract_from_re_node(
          re_node->left, atom_tree, current_node);

      if (current_node == NULL)
        return NULL;

      append_current_leaf_to_node(current_node);

      return current_node;

    case RE_NODE_ANY:
    case RE_NODE_RANGE_ANY:
    case RE_NODE_STAR:
    case RE_NODE_CLASS:
    case RE_NODE_MASKED_LITERAL:
    case RE_NODE_WORD_CHAR:
    case RE_NODE_NON_WORD_CHAR:
    case RE_NODE_SPACE:
    case RE_NODE_NON_SPACE:
    case RE_NODE_DIGIT:
    case RE_NODE_NON_DIGIT:
    case RE_NODE_EMPTY:
    case RE_NODE_ANCHOR_START:
    case RE_NODE_ANCHOR_END:
    case RE_NODE_WORD_BOUNDARY:
    case RE_NODE_NON_WORD_BOUNDARY:

      // None of these nodes adds bytes to the atom; they only close the
      // leaf in progress.
      append_current_leaf_to_node(current_node);
      return current_node;

    default:
      assert(FALSE);
  }

  return NULL;
}
//
// _yr_atoms_tree_destroy
//
// Releases an atom tree: first every node reachable from root_node, then the
// ATOM_TREE structure itself. A NULL atom_tree is ignored, mirroring what
// _yr_atoms_tree_node_destroy does for NULL nodes.
//

static void _yr_atoms_tree_destroy(
    ATOM_TREE* atom_tree)
{
  if (atom_tree == NULL)
    return;

  _yr_atoms_tree_node_destroy(atom_tree->root_node);
  yr_free(atom_tree);
}
//
// _yr_atoms_extract_from_re
//
// Walks the regular expression AST iteratively, using an explicit stack
// instead of recursion, and appends the atoms found to the atom tree hanging
// from appending_node.
//
// Args:
//    config          - Atoms configuration; its get_atom_quality callback is
//                      used for scoring candidate atoms.
//    re_ast          - Regular expression AST, traversal starts at root_node.
//    appending_node  - Node where leaves and sub-trees are initially
//                      appended.
//
// Returns:
//    ERROR_SUCCESS when the whole AST has been processed, or
//    ERROR_INSUFFICIENT_MEMORY if the nodes for an alternation cannot be
//    created. Failures detected by the FAIL_ON_* macros (defined elsewhere)
//    are presumably returned as error codes after destroying the stack.
//

static int _yr_atoms_extract_from_re(
    YR_ATOMS_CONFIG* config,
    RE_AST* re_ast,
    YR_ATOM_TREE_NODE* appending_node)
{
  YR_STACK* stack;
  RE_NODE* re_node;

  YR_ATOM atom;
  YR_ATOM best_atom;

  struct STACK_ITEM si;

  int i, shift;
  int quality;
  int best_quality = -1;
  int n = 0;

  YR_ATOM_TREE_NODE* and_node;
  YR_ATOM_TREE_NODE* left_node;
  YR_ATOM_TREE_NODE* right_node;

  // The RE_NODEs most recently visited that can conform an atom (ie:
  // RE_NODE_LITERAL, RE_NODE_MASKED_LITERAL and RE_NODE_ANY). The number of
  // items in this array is n.
  RE_NODE* recent_re_nodes[YR_MAX_ATOM_LENGTH];

  // The RE_NODEs corresponding to the best atom found so far for the current
  // appending node.
  RE_NODE* best_atom_re_nodes[YR_MAX_ATOM_LENGTH];

  // This holds the ATOM_TREE_OR node where leaves (ATOM_TREE_LEAF) are
  // currently being appended.
  YR_ATOM_TREE_NODE* current_appending_node = NULL;

  // This holds the ATOM_TREE_LEAF node whose atom is currently being updated.
  YR_ATOM_TREE_NODE* leaf = NULL;

  FAIL_ON_ERROR(yr_stack_create(1024, sizeof(si), &stack));

  // This first item pushed in the stack is the last one to be popped out, its
  // sole purpose is forcing that any leaf still pending when the traversal
  // ends gets appended to the tree.
  si.re_node = NULL;
  si.new_appending_node = appending_node;

  FAIL_ON_ERROR_WITH_CLEANUP(
      yr_stack_push(stack, (void*) &si),
      yr_stack_destroy(stack));

  // Start processing the root node.
  si.re_node = re_ast->root_node;

  // Leaf nodes are initially appended to the node passed in the
  // appending_node argument, which is the root ATOM_TREE_OR node that is
  // empty at this point.
  si.new_appending_node = appending_node;

  FAIL_ON_ERROR_WITH_CLEANUP(
      yr_stack_push(stack, (void*) &si),
      yr_stack_destroy(stack));

  while (yr_stack_pop(stack, (void*) &si))
  {
    // Change the appending node if the item popped from the stack says so.
    if (si.new_appending_node != NULL)
    {
      // Before changing the appending node let's append any pending leaf to
      // the current appending node.
      if (n > 0)
      {
        make_atom_from_re_nodes(atom, n, recent_re_nodes);
        shift = _yr_atoms_trim(&atom);
        quality = config->get_atom_quality(config, &atom);

        FAIL_ON_NULL_WITH_CLEANUP(
            leaf = _yr_atoms_tree_node_create(ATOM_TREE_LEAF),
            yr_stack_destroy(stack));

        if (quality > best_quality)
        {
          // The atom made from the most recent nodes wins. Trimming skipped
          // the first "shift" nodes, so they are skipped here as well.
          memcpy(&leaf->atom, &atom, sizeof(atom));
          memcpy(
              &leaf->re_nodes,
              &recent_re_nodes[shift],
              sizeof(recent_re_nodes) - shift * sizeof(recent_re_nodes[0]));
        }
        else
        {
          memcpy(&leaf->atom, &best_atom, sizeof(best_atom));
          memcpy(
              &leaf->re_nodes,
              &best_atom_re_nodes,
              sizeof(best_atom_re_nodes));
        }

        _yr_atoms_tree_node_append(current_appending_node, leaf);
        n = 0;
      }

      current_appending_node = si.new_appending_node;

      // The best atom is tracked per appending node. Without this reset the
      // score of a previous leaf would be compared against the candidates of
      // the next one.
      best_quality = -1;
    }

    if (si.re_node != NULL)
    {
      switch(si.re_node->type)
      {
        case RE_NODE_LITERAL:
        case RE_NODE_MASKED_LITERAL:
        case RE_NODE_ANY:

          if (n < YR_MAX_ATOM_LENGTH)
          {
            // While the window is not full the best atom is simply the
            // sequence of nodes seen so far.
            recent_re_nodes[n] = si.re_node;
            best_atom_re_nodes[n] = si.re_node;
            best_atom.bytes[n] = (uint8_t) si.re_node->value;
            best_atom.mask[n] = (uint8_t) si.re_node->mask;
            best_atom.length = ++n;
          }
          else if (best_quality < YR_MAX_ATOM_QUALITY)
          {
            // The window is full. Score it before sliding, and remember it
            // if it is the best one seen so far.
            make_atom_from_re_nodes(atom, n, recent_re_nodes);
            shift = _yr_atoms_trim(&atom);
            quality = config->get_atom_quality(config, &atom);

            if (quality > best_quality)
            {
              for (i = 0; i < atom.length; i++)
              {
                best_atom.bytes[i] = atom.bytes[i];
                best_atom.mask[i] = atom.mask[i];
                best_atom_re_nodes[i] = recent_re_nodes[i + shift];
              }

              best_atom.length = atom.length;
              best_quality = quality;
            }

            // Slide the window one position and put the new node at the end.
            for (i = 1; i < YR_MAX_ATOM_LENGTH; i++)
              recent_re_nodes[i - 1] = recent_re_nodes[i];

            recent_re_nodes[YR_MAX_ATOM_LENGTH - 1] = si.re_node;
          }

          break;

        case RE_NODE_CONCAT:

          re_node = si.re_node->children_tail;

          // Push children right to left, they are popped left to right.
          while (re_node != NULL)
          {
            si.new_appending_node = NULL;
            si.re_node = re_node;

            FAIL_ON_ERROR_WITH_CLEANUP(
                yr_stack_push(stack, &si),
                yr_stack_destroy(stack));

            re_node = re_node->prev_sibling;
          }

          break;

        case RE_NODE_ALT:

          // Create ATOM_TREE_AND node with two ATOM_TREE_OR children nodes.
          and_node = _yr_atoms_tree_node_create(ATOM_TREE_AND);
          left_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
          right_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);

          if (and_node == NULL || left_node == NULL || right_node == NULL)
          {
            _yr_atoms_tree_node_destroy(and_node);
            _yr_atoms_tree_node_destroy(left_node);
            _yr_atoms_tree_node_destroy(right_node);

            yr_stack_destroy(stack);

            return ERROR_INSUFFICIENT_MEMORY;
          }

          and_node->children_head = left_node;
          and_node->children_tail = right_node;
          left_node->next_sibling = right_node;

          // Add the ATOM_TREE_AND as children of the current node.
          _yr_atoms_tree_node_append(current_appending_node, and_node);

          re_node = si.re_node;

          // This item restores the current appending node once both
          // alternatives have been processed.
          si.new_appending_node = current_appending_node;
          si.re_node = NULL;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          // RE_NODE_ALT nodes has only two children, so children_head is the
          // left one, and children_tail is right one.
          si.new_appending_node = right_node;
          si.re_node = re_node->children_tail;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          si.new_appending_node = left_node;
          si.re_node = re_node->children_head;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          break;

        case RE_NODE_PLUS:

          re_node = si.re_node;

          // Pushed first so it is popped after the child, closing the leaf
          // in progress at that point.
          si.new_appending_node = current_appending_node;
          si.re_node = NULL;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          si.new_appending_node = NULL;

          // RE_NODE_PLUS nodes has a single child, which is children_head.
          si.re_node = re_node->children_head;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          break;

        case RE_NODE_RANGE:

          re_node = si.re_node;

          si.new_appending_node = current_appending_node;
          si.re_node = NULL;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          si.new_appending_node = NULL;

          // RE_NODE_RANGE nodes has a single child, which is children_head.
          si.re_node = re_node->children_head;

          // In a regexp like /a{10,20}/ the optimal atom is 'aaaa' (assuming
          // that YR_MAX_ATOM_LENGTH = 4) because the 'a' character must appear
          // at least 10 times in the matching string. Each call in the loop
          // will append one 'a' to the atom, so YR_MAX_ATOM_LENGTH iterations
          // are enough.
          for (i = 0; i < yr_min(re_node->start, YR_MAX_ATOM_LENGTH); i++)
          {
            FAIL_ON_ERROR_WITH_CLEANUP(
                yr_stack_push(stack, &si),
                yr_stack_destroy(stack));
          }

          break;

        case RE_NODE_RANGE_ANY:
        case RE_NODE_STAR:
        case RE_NODE_CLASS:
        case RE_NODE_WORD_CHAR:
        case RE_NODE_NON_WORD_CHAR:
        case RE_NODE_SPACE:
        case RE_NODE_NON_SPACE:
        case RE_NODE_DIGIT:
        case RE_NODE_NON_DIGIT:
        case RE_NODE_EMPTY:
        case RE_NODE_ANCHOR_START:
        case RE_NODE_ANCHOR_END:
        case RE_NODE_WORD_BOUNDARY:
        case RE_NODE_NON_WORD_BOUNDARY:

          // These nodes add nothing to the atom. Pushing an item that carries
          // the current appending node closes the leaf in progress when it is
          // popped in the next iteration.
          si.new_appending_node = current_appending_node;
          si.re_node = NULL;

          FAIL_ON_ERROR_WITH_CLEANUP(
              yr_stack_push(stack, &si),
              yr_stack_destroy(stack));

          break;

        default:
          assert(false);
      }
    }
  }

  yr_stack_destroy(stack);

  return ERROR_SUCCESS;
}