std::string normalizeInstructionsToHTML(std::vector<SgAsmx86Instruction*>::iterator beg, std::vector<SgAsmx86Instruction*>::iterator end) { string normalizedUnparsedInstructions; map<SgAsmExpression*, size_t> valueNumbers[3]; numberOperands( beg,end, valueNumbers); // Unparse the normalized forms of the instructions for (; beg != end; ++beg ) { SgAsmx86Instruction* insn = *beg; string mne = insn->get_mnemonic(); boost::to_lower(mne); mne = "<font color=\"red\">" + htmlEscape(mne)+"</font>"; normalizedUnparsedInstructions += mne; const SgAsmExpressionPtrList& operands = getOperands(insn); // Add to total for this variant // Add to total for each kind of operand size_t operandCount = operands.size(); normalizedUnparsedInstructions += "<font color=\"blue\">"; for (size_t i = 0; i < operandCount; ++i) { SgAsmExpression* operand = operands[i]; ExpressionCategory cat = getCategory(operand); map<SgAsmExpression*, size_t>::const_iterator numIter = valueNumbers[(int)cat].find(operand); assert (numIter != valueNumbers[(int)cat].end()); size_t num = numIter->second; normalizedUnparsedInstructions += (cat == ec_reg ? " R" : cat == ec_mem ? " M" : " V") + boost::lexical_cast<string>(num); } normalizedUnparsedInstructions += "; </font> <br> "; } return normalizedUnparsedInstructions; };
bool createVectorsForAllInstructions(SgNode* top, const std::string& filename, const std::string& functionName, int functionId, size_t windowSize, size_t stride, sqlite3_connection& con) { // Ignores function boundaries bool retVal = false; vector<SgAsmx86Instruction*> insns; FindInstructionsVisitor vis; AstQueryNamespace::querySubTree(top, std::bind2nd( vis, &insns )); std::cout << "Number of instructions: " << insns.size() << std::endl; size_t insnCount = insns.size(); for (size_t windowStart = 0; windowStart + windowSize <= insnCount; windowStart += stride) { static SignatureVector vec; vec.clear(); hash_map<SgAsmExpression*, size_t> valueNumbers[3]; numberOperands(&insns[windowStart], windowSize, valueNumbers); string normalizedUnparsedInstructions; // Unparse the normalized forms of the instructions for (size_t insnNumber = 0; insnNumber < windowSize; ++insnNumber) { SgAsmx86Instruction* insn = insns[windowStart + insnNumber]; size_t var = getInstructionKind(insn); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS string mne = insn->get_mnemonic(); boost::to_lower(mne); normalizedUnparsedInstructions += mne; #endif const SgAsmExpressionPtrList& operands = getOperands(insn); size_t operandCount = operands.size(); // Add to total for this variant ++vec.totalForVariant(var); // Add to total for each kind of operand for (size_t i = 0; i < operandCount; ++i) { SgAsmExpression* operand = operands[i]; ExpressionCategory cat = getCategory(operand); ++vec.opsForVariant(cat, var); // Add to total for this unique operand number (for this window) hash_map<SgAsmExpression*, size_t>::const_iterator numIter = valueNumbers[(int)cat].find(operand); assert (numIter != valueNumbers[(int)cat].end()); size_t num = numIter->second; ++vec.specificOp(cat, num); // Add to total for this kind of operand ++vec.operandTotal(cat); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS normalizedUnparsedInstructions += (cat == ec_reg ? "R" : cat == ec_mem ? "M" : "V") + boost::lexical_cast<string>(num); #endif } //Try to see what the effect is of jumps on the false positive rate //uint64_t addr =0; /* if( x86GetKnownBranchTarget(insn, addr) == true ) { uint64_t insn_addr = insn->get_address(); if( addr < insn_addr ) normalizedUnparsedInstructions += " UP "; else normalizedUnparsedInstructions += " DOWN "; }*/ // Add to total for this pair of operand kinds if (operandCount >= 2) { ExpressionCategory cat1 = getCategory(operands[0]); ExpressionCategory cat2 = getCategory(operands[1]); ++vec.operandPair(cat1, cat2); } #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS if (insnNumber + 1 < windowSize) { normalizedUnparsedInstructions += ";"; } #endif } #if 0 // Print out this vector cout << "{"; for (size_t i = 0; i < SignatureVector::Size; ++i) { if (i != 0) cout << ", "; cout << vec[i]; } cout << "}\n"; #endif // cout << "Normalized instruction stream: " << normalizedUnparsedInstructions << endl; // Add vector to database addVectorToDatabase(con, vec, functionName, functionId, windowStart/stride, normalizedUnparsedInstructions, &insns[windowStart], filename, windowSize, stride); retVal = true; } addFunctionStatistics(con, filename, functionName, functionId, insnCount); return retVal; }
// Ignores function boundaries bool createVectorsForAllInstructions(SgNode* top, const std::string& filename, const std::string& functionName, int functionId, size_t windowSize, size_t stride, const SqlDatabase::TransactionPtr &tx) { bool retVal = false; vector<SgAsmx86Instruction*> insns; FindInstructionsVisitor vis; AstQueryNamespace::querySubTree(top, std::bind2nd( vis, &insns )); size_t insnCount = insns.size(); for (size_t windowStart = 0; windowStart + windowSize <= insnCount; windowStart += stride) { static SignatureVector vec; vec.clear(); hash_map<SgAsmExpression*, size_t> valueNumbers[3]; numberOperands(&insns[windowStart], windowSize, valueNumbers); string normalizedUnparsedInstructions; // Unparse the normalized forms of the instructions for (size_t insnNumber = 0; insnNumber < windowSize; ++insnNumber) { SgAsmx86Instruction* insn = insns[windowStart + insnNumber]; size_t var = getInstructionKind(insn); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS string mne = insn->get_mnemonic(); boost::to_lower(mne); normalizedUnparsedInstructions += mne; #endif const SgAsmExpressionPtrList& operands = getOperands(insn); size_t operandCount = operands.size(); // Add to total for this variant ++vec.totalForVariant(var); // Add to total for each kind of operand for (size_t i = 0; i < operandCount; ++i) { SgAsmExpression* operand = operands[i]; ExpressionCategory cat = getCategory(operand); ++vec.opsForVariant(cat, var); // Add to total for this unique operand number (for this window) hash_map<SgAsmExpression*, size_t>::const_iterator numIter = valueNumbers[(int)cat].find(operand); assert (numIter != valueNumbers[(int)cat].end()); size_t num = numIter->second; ++vec.specificOp(cat, num); // Add to total for this kind of operand ++vec.operandTotal(cat); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS normalizedUnparsedInstructions += (cat == ec_reg ? "R" : cat == ec_mem ? "M" : "V") + boost::lexical_cast<string>(num); #endif } // Add to total for this pair of operand kinds if (operandCount >= 2) { ExpressionCategory cat1 = getCategory(operands[0]); ExpressionCategory cat2 = getCategory(operands[1]); ++vec.operandPair(cat1, cat2); } #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS if (insnNumber + 1 < windowSize) { normalizedUnparsedInstructions += ";"; } #endif } // Add vector to database addVectorToDatabase(tx, vec, functionName, functionId, windowStart/stride, normalizedUnparsedInstructions, &insns[windowStart], filename, windowSize, stride); retVal = true; } addFunctionStatistics(tx, filename, functionName, functionId, insnCount); return retVal; }
void RoseBin_FlowAnalysis::process_jumps() { if (RoseBin_support::DEBUG_MODE()) cerr << "\n >>>>>>>>> processing jumps ... " << endl; rose_hash::unordered_map <uint64_t, SgAsmInstruction* >::iterator it; for (it=rememberInstructions.begin();it!=rememberInstructions.end();++it) { SgAsmx86Instruction* inst = isSgAsmx86Instruction(it->second); if (inst->get_kind() == x86_call) { //cerr << "Found call at " << std::hex << inst->get_address() << endl; SgAsmx86Instruction* target = isSgAsmx86Instruction(process_jumps_get_target(inst)); if (target) { //cerr << "Target is " << std::hex << target->get_address() << endl; // inst->get_targets().push_back(target); // we set the sources (for each node) ROSE_ASSERT(g_algo->info); g_algo->info->incomingEdges[target].insert(inst->get_address()); // tps: changed this algorithm so that it runs in // linear time! ROSE_ASSERT (target->get_parent()); if (target->get_parent()) { // ROSE_ASSERT(target->get_parent()); SgAsmNode* b_b = target; if (!db) b_b = isSgAsmNode(target->get_parent()); ROSE_ASSERT(b_b); SgAsmFunction* b_func = isSgAsmFunction(b_b->get_parent()); if (b_func) { // (16/Oct/07) tps: this is tricky, it appears that sometimes the target can // be just a jmp to a new location, so we should forward this information to the correct // function. // Therefore we need to check if the current function has a return statement. // If not, we want to forward this information. if (target->get_kind() == x86_jmp) { //cerr << " >>>>>>>> found a jmp target - number of children: " << b_func->get_traversalSuccessorContainer().size() << endl; if (b_func->get_numberOfTraversalSuccessors()==1) { SgAsmx86Instruction* target2 = isSgAsmx86Instruction(process_jumps_get_target(inst)); if (target2) { b_b = target2; if (!db) b_b = isSgAsmNode(target2->get_parent()); b_func = isSgAsmFunction(b_b->get_parent()); } } } if (inst->get_parent()) { //cerr << "Inst has a parent" << endl; if (inst->get_comment()=="") inst->set_comment(""+b_func->get_name()); ROSE_ASSERT(g_algo->info); SgAsmInstruction* inst_after = g_algo->info->getInstructionAtAddress(inst->get_address() + inst->get_raw_bytes().size()); // inst->cfgBinFlowOutEdge(info); if (inst_after) { //cerr << "Added dest " << std::hex << isSgAsmStatement(inst_after)->get_address() << " for function" << endl; b_func->append_dest(isSgAsmStatement(inst_after)); } } } else { if (RoseBin_support::DEBUG_MODE()) cerr << " NO FUNCTION DETECTED ABOVE BLOCK . " << endl; } } else { if (RoseBin_support::DEBUG_MODE()) cerr << " WARNING :: process_jumps: target has no parent ... i.e. no FunctionDeclaration to it " << target->class_name() << endl; } } else { if (inst) if (RoseBin_support::DEBUG_MODE()) cerr << " WARNING :: process_jumps: No target found for node " << RoseBin_support::HexToString(inst->get_address()) << " " << inst->get_mnemonic() << endl; } } else { // might be a jmp SgAsmx86Instruction* target = isSgAsmx86Instruction(process_jumps_get_target(inst)); if (target) { // inst->get_targets().push_back(target); // we set the sources (for each node) ROSE_ASSERT(g_algo->info); g_algo->info->incomingEdges[target].insert(inst->get_address()); } } } //cerr << "\n >>>>>>>>> processing jumps ... done. " << endl; // cerr << "\n >>>>>>>>> resolving RET jumps ... " << endl; rose_hash::unordered_map <uint64_t, SgAsmInstruction* >::iterator it2; for (it2=rememberInstructions.begin();it2!=rememberInstructions.end();++it2) { //int id = it2->first; SgAsmx86Instruction* target = isSgAsmx86Instruction(it2->second); ROSE_ASSERT (target); #if 1 if (target->get_kind() == x86_ret) { SgAsmNode* b_b = target; if (!db) b_b = isSgAsmNode(target->get_parent()); SgAsmFunction* parent = isSgAsmFunction(b_b->get_parent()); if (parent) { //ROSE_ASSERT(parent); std::vector <SgAsmStatement*> dest_list = parent->get_dest(); for (size_t i = 0; i < dest_list.size(); ++i) { ROSE_ASSERT (isSgAsmInstruction(dest_list[i])); //cerr << "Adding ret target " << std::hex << dest_list[i]->get_address() << " to " << std::hex << target->get_address() << endl; //info->indirectJumpAndReturnTargets[target].insert(dest_list[i]->get_address()); ROSE_ASSERT(g_algo->info); g_algo->info->incomingEdges[isSgAsmInstruction(dest_list[i])].insert(target->get_address()); } std::vector <SgAsmStatement*>::iterator it3 = dest_list.begin(); for (; it3!=dest_list.end();++it3) { SgAsmInstruction* dest = isSgAsmInstruction(*it3); if (dest) { dest->append_sources(target); //cerr << " appending source to " << dest->get_address() << " target: " << target->get_address() << endl; } } // for } else { // if parent if (RoseBin_support::DEBUG_MODE()) cerr << " ERROR :: RET jumps :: no parent found for ret : " << target->class_name() << endl; //exit (0); } } // if ret #endif } if (RoseBin_support::DEBUG_MODE()) cerr << " >>>>>>>>> resolving RET jumps ... done." << endl; }