int yr_re_emit_code(
    RE* re,
    YR_ARENA* arena)
{
  int code_size;

  // Emit code for matching the regular expression forwards.

  FAIL_ON_ERROR(_yr_re_emit(
      re->root_node,
      arena,
      0,
      NULL,
      &code_size));

  FAIL_ON_ERROR(_yr_emit_inst(
      arena,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  // Emit code for matching the regular expression backwards.

  FAIL_ON_ERROR(_yr_re_emit(
      re->root_node,
      arena,
      EMIT_FLAGS_BACKWARDS,
      NULL,
      &code_size));

  FAIL_ON_ERROR(_yr_emit_inst(
      arena,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  return ERROR_SUCCESS;
}
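/*
 * Illustrative sketch (hypothetical helper, not YARA's actual scanner): why
 * two code streams are emitted above. The scanner locates a short atom of
 * the regexp somewhere in the data, then extends the match in both
 * directions: the forward stream consumes bytes to the right of the atom,
 * and the backward stream consumes bytes to its left, walking right-to-left.
 * Here each "stream" is reduced to a literal string checked byte by byte,
 * for the regexp /abc/ with 'b' playing the role of the atom.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static int extend_match(const char* data, size_t atom_pos)
{
  const char* fwd = "c";  /* stands in for the code emitted with flags 0 */
  const char* bwd = "a";  /* stands in for the EMIT_FLAGS_BACKWARDS code */
  size_t i;

  for (i = 0; fwd[i] != '\0'; i++)        /* forwards, from atom_pos + 1 */
    if (data[atom_pos + 1 + i] != fwd[i])
      return 0;

  for (i = 0; bwd[i] != '\0'; i++)        /* backwards, from atom_pos - 1 */
    if (atom_pos < i + 1 || data[atom_pos - 1 - i] != bwd[i])
      return 0;

  return 1;
}

int main(void)
{
  const char* data = "xxabcxx";
  const char* atom = strchr(data, 'b');   /* pretend the atom was found here */

  printf("match around atom: %s\n",
      atom != NULL && extend_match(data, (size_t) (atom - data)) ?
          "yes" : "no");
  return 0;
}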
int _yr_re_emit(
    RE_NODE* re_node,
    YR_ARENA* arena,
    int flags,
    uint8_t** code_addr,
    int* code_size)
{
  int i;
  int branch_size;
  int split_size;
  int inst_size;
  int jmp_size;

  RE_NODE* left;
  RE_NODE* right;

  int16_t* split_offset_addr = NULL;
  int16_t* jmp_offset_addr = NULL;
  uint8_t* instruction_addr = NULL;

  *code_size = 0;

  switch(re_node->type)
  {
    case RE_NODE_LITERAL:
      FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
          arena,
          RE_OPCODE_LITERAL,
          re_node->value,
          &instruction_addr,
          NULL,
          code_size));
      break;

    case RE_NODE_MASKED_LITERAL:
      FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
          arena,
          RE_OPCODE_MASKED_LITERAL,
          re_node->mask << 8 | re_node->value,
          &instruction_addr,
          NULL,
          code_size));
      break;

    case RE_NODE_WORD_CHAR:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_WORD_CHAR,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_WORD_CHAR:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_NON_WORD_CHAR,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_SPACE:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_SPACE,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_SPACE:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_NON_SPACE,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_DIGIT:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_DIGIT,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_DIGIT:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_NON_DIGIT,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_ANY:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_ANY,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_CLASS:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_CLASS,
          &instruction_addr,
          code_size));

      FAIL_ON_ERROR(yr_arena_write_data(
          arena,
          re_node->class_vector,
          32,
          NULL));

      *code_size += 32;
      break;

    case RE_NODE_ANCHOR_START:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_MATCH_AT_START,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_ANCHOR_END:
      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_MATCH_AT_END,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_CONCAT:

      if (flags & EMIT_FLAGS_BACKWARDS)
      {
        left = re_node->right;
        right = re_node->left;
      }
      else
      {
        left = re_node->left;
        right = re_node->right;
      }

      FAIL_ON_ERROR(_yr_re_emit(
          left,
          arena,
          flags,
          &instruction_addr,
          &branch_size));

      *code_size += branch_size;

      FAIL_ON_ERROR(_yr_re_emit(
          right,
          arena,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;
      break;

    case RE_NODE_PLUS:

      // Code for e+ looks like:
      //
      //          L1: code for e
      //              split L1, L2
      //          L2:

      FAIL_ON_ERROR(_yr_re_emit(
          re_node->left,
          arena,
          flags,
          &instruction_addr,
          &branch_size));

      *code_size += branch_size;

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
          -branch_size,
          NULL,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;
      break;

    case RE_NODE_STAR:

      // Code for e* looks like:
      //
      //          L1: split L1, L2
      //              code for e
      //              jmp L1
      //          L2:

      // Emit the split with offset set to 0 temporarily. The offset is
      // patched below, once the size of the loop body is known.

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
          0,
          &instruction_addr,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;

      FAIL_ON_ERROR(_yr_re_emit(
          re_node->left,
          arena,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Emit a jump back to the split instruction.

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          RE_OPCODE_JUMP,
          -(branch_size + split_size),
          NULL,
          &jmp_offset_addr,
          &jmp_size));

      *code_size += jmp_size;

      // Update split offset.
      *split_offset_addr = split_size + branch_size + jmp_size;
      break;

    case RE_NODE_ALT:

      // Code for e1|e2 looks like:
      //
      //              split L1, L2
      //          L1: code for e1
      //              jmp L3
      //          L2: code for e2
      //          L3:

      // Emit a split instruction with offset set to 0 temporarily. Offset
      // will be updated after we know the size of the code generated for
      // the left node (e1).

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          RE_OPCODE_SPLIT_A,
          0,
          &instruction_addr,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;

      FAIL_ON_ERROR(_yr_re_emit(
          re_node->left,
          arena,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Emit jump with offset set to 0.

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          RE_OPCODE_JUMP,
          0,
          NULL,
          &jmp_offset_addr,
          &jmp_size));

      *code_size += jmp_size;

      // Update split offset.
      *split_offset_addr = split_size + branch_size + jmp_size;

      FAIL_ON_ERROR(_yr_re_emit(
          re_node->right,
          arena,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Update offset for jmp instruction.
      *jmp_offset_addr = branch_size + jmp_size;
      break;

    case RE_NODE_RANGE:

      // Code for e1{n,m} looks like:
      //
      //              code for e1 (n times)
      //              push m-n
      //          L0: split L1, L2
      //          L1: code for e1
      //              jnz L0
      //          L2: pop

      if (re_node->start > 0)
      {
        FAIL_ON_ERROR(_yr_re_emit(
            re_node->left,
            arena,
            flags,
            &instruction_addr,
            &branch_size));

        *code_size += branch_size;

        for (i = 0; i < re_node->start - 1; i++)
        {
          FAIL_ON_ERROR(_yr_re_emit(
              re_node->left,
              arena,
              flags | EMIT_FLAGS_DONT_ANNOTATE_RE,
              NULL,
              &branch_size));

          *code_size += branch_size;
        }
      }

      // m == n, no more code needed.
      if (re_node->end == re_node->start)
        break;

      FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
          arena,
          RE_OPCODE_PUSH,
          re_node->end - re_node->start,
          re_node->start == 0 ? &instruction_addr : NULL,
          NULL,
          &inst_size));

      *code_size += inst_size;

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
          0,
          NULL,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;

      FAIL_ON_ERROR(_yr_re_emit(
          re_node->left,
          arena,
          flags | EMIT_FLAGS_DONT_ANNOTATE_RE,
          NULL,
          &branch_size));

      *code_size += branch_size;

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          arena,
          RE_OPCODE_JNZ,
          -(branch_size + split_size),
          NULL,
          &jmp_offset_addr,
          &jmp_size));

      *code_size += jmp_size;

      *split_offset_addr = split_size + branch_size + jmp_size;

      FAIL_ON_ERROR(_yr_emit_inst(
          arena,
          RE_OPCODE_POP,
          NULL,
          &inst_size));

      *code_size += inst_size;
      break;
  }

  if (!(flags & EMIT_FLAGS_DONT_ANNOTATE_RE))
  {
    if (flags & EMIT_FLAGS_BACKWARDS)
      re_node->backward_code = instruction_addr;
    else
      re_node->forward_code = instruction_addr;
  }

  if (code_addr != NULL)
    *code_addr = instruction_addr;

  return ERROR_SUCCESS;
}
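/*
 * Illustrative sketch (hypothetical opcodes, not YARA's actual VM): a tiny
 * recursive backtracking executor showing the split semantics the emitter
 * relies on. SPLIT_A prefers falling through to the next instruction and
 * SPLIT_B prefers the branch target; the other alternative is explored on
 * backtrack. This is consistent with the "greedy ? RE_OPCODE_SPLIT_B :
 * RE_OPCODE_SPLIT_A" choice in the e+ case above: the greedy form jumps
 * back into the loop body first. Offsets are int16, relative to the split
 * instruction, stored little-endian for this demo.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OP_LIT     0x01  /* + 1-byte literal            */
#define OP_SPLIT_A 0x02  /* + int16 offset, prefer next */
#define OP_SPLIT_B 0x03  /* + int16 offset, prefer jump */
#define OP_MATCH   0x04

/* Returns the number of input bytes consumed on match, or -1. */
static int run(const uint8_t* code, int ip, const char* in, int pos)
{
  int16_t off;
  int r;

  switch (code[ip])
  {
    case OP_LIT:
      if (in[pos] != (char) code[ip + 1])
        return -1;
      return run(code, ip + 2, in, pos + 1);

    case OP_SPLIT_A:                        /* try next, then target */
      memcpy(&off, code + ip + 1, 2);
      r = run(code, ip + 3, in, pos);
      return r >= 0 ? r : run(code, ip + off, in, pos);

    case OP_SPLIT_B:                        /* try target, then next */
      memcpy(&off, code + ip + 1, 2);
      r = run(code, ip + off, in, pos);
      return r >= 0 ? r : run(code, ip + 3, in, pos);

    case OP_MATCH:
      return pos;
  }
  return -1;
}

int main(void)
{
  /* a+ as laid out above: L1: lit 'a'; split L1, L2; L2: match */
  uint8_t greedy[] = { OP_LIT, 'a', OP_SPLIT_B, 0xFE, 0xFF, OP_MATCH };
  uint8_t lazy[]   = { OP_LIT, 'a', OP_SPLIT_A, 0xFE, 0xFF, OP_MATCH };

  printf("greedy a+ on \"aaa\": %d chars\n", run(greedy, 0, "aaa", 0));
  printf("lazy   a+ on \"aaa\": %d chars\n", run(lazy, 0, "aaa", 0));
  return 0;
}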
int _yr_re_emit(
    RE_EMIT_CONTEXT* emit_context,
    RE_NODE* re_node,
    int flags,
    uint8_t** code_addr,
    int* code_size)
{
  int i;
  int branch_size;
  int split_size;
  int inst_size;
  int jmp_size;

  RE_NODE* left;
  RE_NODE* right;

  int16_t* split_offset_addr = NULL;
  int16_t* jmp_offset_addr = NULL;
  uint8_t* instruction_addr = NULL;

  *code_size = 0;

  switch(re_node->type)
  {
    case RE_NODE_LITERAL:
      FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
          emit_context,
          flags & EMIT_NO_CASE ?
              RE_OPCODE_LITERAL_NO_CASE :
              RE_OPCODE_LITERAL,
          re_node->value,
          &instruction_addr,
          NULL,
          code_size));
      break;

    case RE_NODE_MASKED_LITERAL:
      FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
          emit_context,
          RE_OPCODE_MASKED_LITERAL,
          re_node->mask << 8 | re_node->value,
          &instruction_addr,
          NULL,
          code_size));
      break;

    case RE_NODE_WORD_CHAR:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_WORD_CHAR,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_WORD_CHAR:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_NON_WORD_CHAR,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_WORD_BOUNDARY:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_WORD_BOUNDARY,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_WORD_BOUNDARY:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_NON_WORD_BOUNDARY,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_SPACE:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_SPACE,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_SPACE:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_NON_SPACE,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_DIGIT:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_DIGIT,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_NON_DIGIT:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_NON_DIGIT,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_ANY:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          flags & EMIT_DOT_ALL ?
              RE_OPCODE_ANY :
              RE_OPCODE_ANY_EXCEPT_NEW_LINE,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_CLASS:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          (flags & EMIT_NO_CASE) ?
              RE_OPCODE_CLASS_NO_CASE :
              RE_OPCODE_CLASS,
          &instruction_addr,
          code_size));

      FAIL_ON_ERROR(yr_arena_write_data(
          emit_context->arena,
          re_node->class_vector,
          32,
          NULL));

      *code_size += 32;
      break;

    case RE_NODE_ANCHOR_START:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_MATCH_AT_START,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_ANCHOR_END:
      FAIL_ON_ERROR(_yr_emit_inst(
          emit_context,
          RE_OPCODE_MATCH_AT_END,
          &instruction_addr,
          code_size));
      break;

    case RE_NODE_CONCAT:

      if (flags & EMIT_BACKWARDS)
      {
        left = re_node->right;
        right = re_node->left;
      }
      else
      {
        left = re_node->left;
        right = re_node->right;
      }

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          left,
          flags,
          &instruction_addr,
          &branch_size));

      *code_size += branch_size;

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          right,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;
      break;

    case RE_NODE_PLUS:

      // Code for e+ looks like:
      //
      //          L1: code for e
      //              split L1, L2
      //          L2:

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          re_node->left,
          flags,
          &instruction_addr,
          &branch_size));

      *code_size += branch_size;

      FAIL_ON_ERROR(_yr_emit_split(
          emit_context,
          re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
          -branch_size,
          NULL,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;
      break;

    case RE_NODE_STAR:

      // Code for e* looks like:
      //
      //          L1: split L1, L2
      //              code for e
      //              jmp L1
      //          L2:

      // Emit the split with offset set to 0 temporarily. The offset is
      // patched below, once the size of the loop body is known.

      FAIL_ON_ERROR(_yr_emit_split(
          emit_context,
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
          0,
          &instruction_addr,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          re_node->left,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Emit a jump back to the split instruction.

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          emit_context,
          RE_OPCODE_JUMP,
          -(branch_size + split_size),
          NULL,
          &jmp_offset_addr,
          &jmp_size));

      *code_size += jmp_size;

      // Update split offset.
      *split_offset_addr = split_size + branch_size + jmp_size;
      break;

    case RE_NODE_ALT:

      // Code for e1|e2 looks like:
      //
      //              split L1, L2
      //          L1: code for e1
      //              jmp L3
      //          L2: code for e2
      //          L3:

      // Emit a split instruction with offset set to 0 temporarily. Offset
      // will be updated after we know the size of the code generated for
      // the left node (e1).

      FAIL_ON_ERROR(_yr_emit_split(
          emit_context,
          RE_OPCODE_SPLIT_A,
          0,
          &instruction_addr,
          &split_offset_addr,
          &split_size));

      *code_size += split_size;

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          re_node->left,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Emit jump with offset set to 0.

      FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
          emit_context,
          RE_OPCODE_JUMP,
          0,
          NULL,
          &jmp_offset_addr,
          &jmp_size));

      *code_size += jmp_size;

      // Update split offset.
      *split_offset_addr = split_size + branch_size + jmp_size;

      FAIL_ON_ERROR(_yr_re_emit(
          emit_context,
          re_node->right,
          flags,
          NULL,
          &branch_size));

      *code_size += branch_size;

      // Update offset for jmp instruction.
      *jmp_offset_addr = branch_size + jmp_size;
      break;

    case RE_NODE_RANGE:

      // Code for e{n,m} looks like:
      //
      //              code for e (repeated n times)
      //              push m-n-1
      //          L0: split L1, L2
      //          L1: code for e
      //              jnz L0
      //          L2: pop
      //              split L3, L4
      //          L3: code for e
      //          L4:
      //
      // Instead of generating a loop with m-n iterations, we generate a
      // loop with m-n-1 iterations and unroll the last one outside the
      // loop. This is because re_node->backward_code pointers *must* point
      // to code past the loop. If they pointed to code before the loop,
      // then when some atom contained inside "e" is found the loop would
      // be executed by both the forward and the backward code. This causes
      // an overlap between forward and backward matches, and the reported
      // matching string would be longer than expected.

      if (re_node->start > 0)
      {
        FAIL_ON_ERROR(_yr_re_emit(
            emit_context,
            re_node->left,
            flags,
            &instruction_addr,
            &branch_size));

        *code_size += branch_size;

        for (i = 0; i < re_node->start - 1; i++)
        {
          // We don't want re_node->forward_code updated by this call;
          // forward_code must keep pointing to the code generated by the
          // _yr_re_emit above. We do want re_node->backward_code updated,
          // though.
          FAIL_ON_ERROR(_yr_re_emit(
              emit_context,
              re_node->left,
              flags | EMIT_DONT_SET_FORWARDS_CODE,
              NULL,
              &branch_size));

          *code_size += branch_size;
        }
      }

      if (re_node->end > re_node->start + 1)
      {
        FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
            emit_context,
            RE_OPCODE_PUSH,
            re_node->end - re_node->start - 1,
            re_node->start == 0 ? &instruction_addr : NULL,
            NULL,
            &inst_size));

        *code_size += inst_size;

        FAIL_ON_ERROR(_yr_emit_split(
            emit_context,
            re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
            0,
            NULL,
            &split_offset_addr,
            &split_size));

        *code_size += split_size;

        FAIL_ON_ERROR(_yr_re_emit(
            emit_context,
            re_node->left,
            flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
            NULL,
            &branch_size));

        *code_size += branch_size;

        FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
            emit_context,
            RE_OPCODE_JNZ,
            -(branch_size + split_size),
            NULL,
            &jmp_offset_addr,
            &jmp_size));

        *code_size += jmp_size;

        *split_offset_addr = split_size + branch_size + jmp_size;

        FAIL_ON_ERROR(_yr_emit_inst(
            emit_context,
            RE_OPCODE_POP,
            NULL,
            &inst_size));

        *code_size += inst_size;
      }

      if (re_node->end > re_node->start)
      {
        FAIL_ON_ERROR(_yr_emit_split(
            emit_context,
            re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
            0,
            NULL,
            &split_offset_addr,
            &split_size));

        *code_size += split_size;

        FAIL_ON_ERROR(_yr_re_emit(
            emit_context,
            re_node->left,
            flags | EMIT_DONT_SET_FORWARDS_CODE,
            re_node->start == 0 && re_node->end == 1 ?
                &instruction_addr : NULL,
            &branch_size));

        *code_size += branch_size;

        *split_offset_addr = split_size + branch_size;
      }

      break;
  }

  if (flags & EMIT_BACKWARDS)
  {
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
      re_node->backward_code = instruction_addr + *code_size;
  }
  else
  {
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
      re_node->forward_code = instruction_addr;
  }

  if (code_addr != NULL)
    *code_addr = instruction_addr;

  return ERROR_SUCCESS;
}
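/*
 * Illustrative sketch (hypothetical opcodes, not YARA's actual VM): the
 * counted loop behind the e{n,m} encoding above. PUSH places an iteration
 * count on a stack, JNZ decrements it and jumps back while it stays
 * non-zero, and POP discards it -- the push/jnz/pop skeleton emitted by the
 * RE_NODE_RANGE case, with the split left out so the loop always runs to
 * completion. Offsets are int16, relative to the JNZ instruction, stored
 * little-endian for this demo.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OP_LIT   0x01  /* + 1-byte literal                        */
#define OP_PUSH  0x10  /* + uint16 iteration count                */
#define OP_JNZ   0x11  /* + int16 offset: decrement, jump if != 0 */
#define OP_POP   0x12
#define OP_MATCH 0x04

int main(void)
{
  /* push 3; L1: lit 'a'; jnz L1; pop; match -- the body runs 3 times */
  uint8_t code[] = {
      OP_PUSH, 3, 0,
      OP_LIT, 'a',
      OP_JNZ, 0xFE, 0xFF,  /* -2: back to the OP_LIT */
      OP_POP,
      OP_MATCH };

  const char* in = "aaaaa";
  uint16_t counter = 0;  /* a one-deep "stack" is enough here */
  int16_t off;
  int ip = 0;
  int pos = 0;

  for (;;)
  {
    switch (code[ip])
    {
      case OP_PUSH:
        memcpy(&counter, code + ip + 1, 2);
        ip += 3;
        break;
      case OP_LIT:
        if (in[pos] != (char) code[ip + 1])
        {
          printf("no match\n");
          return 1;
        }
        pos++;
        ip += 2;
        break;
      case OP_JNZ:
        memcpy(&off, code + ip + 1, 2);
        counter--;
        ip = counter != 0 ? ip + off : ip + 3;
        break;
      case OP_POP:
        counter = 0;
        ip += 1;
        break;
      case OP_MATCH:
        printf("matched %d chars of \"%s\"\n", pos, in);
        return 0;
    }
  }
}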
int yr_re_emit_code(
    RE* re,
    YR_ARENA* arena)
{
  int code_size;
  int total_size;
  int emit_flags = 0;

  if (re->flags & RE_FLAGS_NO_CASE)
    emit_flags |= EMIT_NO_CASE;

  if (re->flags & RE_FLAGS_DOT_ALL)
    emit_flags |= EMIT_DOT_ALL;

  // Ensure that we have enough contiguous memory space in the arena to
  // contain the regular expression code. The code can't span over multiple
  // non-contiguous pages.

  yr_arena_reserve_memory(arena, RE_MAX_CODE_SIZE);

  // Emit code for matching the regular expression forwards.

  total_size = 0;

  FAIL_ON_ERROR(_yr_re_emit(
      re->root_node,
      arena,
      emit_flags,
      &re->code,
      &code_size));

  total_size += code_size;

  FAIL_ON_ERROR(_yr_emit_inst(
      arena,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  total_size += code_size;

  assert(total_size < RE_MAX_CODE_SIZE);

  yr_arena_reserve_memory(arena, RE_MAX_CODE_SIZE);

  // Emit code for matching the regular expression backwards.

  total_size = 0;

  FAIL_ON_ERROR(_yr_re_emit(
      re->root_node,
      arena,
      emit_flags | EMIT_BACKWARDS,
      NULL,
      &code_size));

  total_size += code_size;

  FAIL_ON_ERROR(_yr_emit_inst(
      arena,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  total_size += code_size;

  assert(total_size < RE_MAX_CODE_SIZE);

  return ERROR_SUCCESS;
}
int yr_re_emit_code(
    RE* re,
    YR_ARENA* arena)
{
  RE_EMIT_CONTEXT emit_context;

  int code_size;
  int total_size;
  int emit_flags = 0;

  if (re->flags & RE_FLAGS_NO_CASE)
    emit_flags |= EMIT_NO_CASE;

  if (re->flags & RE_FLAGS_DOT_ALL)
    emit_flags |= EMIT_DOT_ALL;

  emit_context.arena = arena;
  emit_context.next_split_id = 0;

  // Ensure that we have enough contiguous memory space in the arena to
  // contain the regular expression code. The code can't span over multiple
  // non-contiguous pages.

  yr_arena_reserve_memory(arena, RE_MAX_CODE_SIZE);

  // Emit code for matching the regular expression forwards.

  total_size = 0;

  FAIL_ON_ERROR(_yr_re_emit(
      &emit_context,
      re->root_node,
      emit_flags,
      &re->code,
      &code_size));

  total_size += code_size;

  FAIL_ON_ERROR(_yr_emit_inst(
      &emit_context,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  total_size += code_size;

  if (total_size > RE_MAX_CODE_SIZE)
    return ERROR_REGULAR_EXPRESSION_TOO_LARGE;

  yr_arena_reserve_memory(arena, RE_MAX_CODE_SIZE);

  // Emit code for matching the regular expression backwards.

  total_size = 0;

  FAIL_ON_ERROR(_yr_re_emit(
      &emit_context,
      re->root_node,
      emit_flags | EMIT_BACKWARDS,
      NULL,
      &code_size));

  total_size += code_size;

  FAIL_ON_ERROR(_yr_emit_inst(
      &emit_context,
      RE_OPCODE_MATCH,
      NULL,
      &code_size));

  total_size += code_size;

  if (total_size > RE_MAX_CODE_SIZE)
    return ERROR_REGULAR_EXPRESSION_TOO_LARGE;

  return ERROR_SUCCESS;
}
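/*
 * Illustrative sketch (hypothetical arena, not YARA's yr_arena_* API): the
 * reserve-then-check pattern used in yr_re_emit_code above. A contiguous
 * region of MAX_CODE_SIZE bytes is reserved before emission starts, so the
 * emitted instructions can never straddle two non-contiguous pages, and the
 * total emitted size is validated against the same limit afterwards.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_CODE_SIZE 4096

typedef struct
{
  uint8_t* base;  /* start of the reserved contiguous block */
  size_t used;
} ARENA;

static int arena_reserve(ARENA* a, size_t size)
{
  a->base = (uint8_t*) malloc(size);  /* one contiguous block, no paging */
  a->used = 0;
  return a->base != NULL;
}

static uint8_t* arena_emit(ARENA* a, const uint8_t* inst, size_t size)
{
  uint8_t* addr = a->base + a->used;  /* never crosses a page boundary */
  memcpy(addr, inst, size);
  a->used += size;
  return addr;
}

int main(void)
{
  ARENA arena;
  const uint8_t match_inst[] = { 0x04 };  /* stand-in for RE_OPCODE_MATCH */

  if (!arena_reserve(&arena, MAX_CODE_SIZE))
    return 1;

  arena_emit(&arena, match_inst, sizeof(match_inst));

  if (arena.used > MAX_CODE_SIZE)
  {
    printf("regular expression too large\n");
    free(arena.base);
    return 1;
  }

  printf("emitted %zu bytes into a contiguous block\n", arena.used);
  free(arena.base);
  return 0;
}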