// Ignores function boundaries bool createVectorsForAllInstructions(SgNode* top, const std::string& filename, const std::string& functionName, int functionId, size_t windowSize, size_t stride, const SqlDatabase::TransactionPtr &tx) { bool retVal = false; vector<SgAsmx86Instruction*> insns; FindInstructionsVisitor vis; AstQueryNamespace::querySubTree(top, std::bind2nd( vis, &insns )); size_t insnCount = insns.size(); for (size_t windowStart = 0; windowStart + windowSize <= insnCount; windowStart += stride) { static SignatureVector vec; vec.clear(); hash_map<SgAsmExpression*, size_t> valueNumbers[3]; numberOperands(&insns[windowStart], windowSize, valueNumbers); string normalizedUnparsedInstructions; // Unparse the normalized forms of the instructions for (size_t insnNumber = 0; insnNumber < windowSize; ++insnNumber) { SgAsmx86Instruction* insn = insns[windowStart + insnNumber]; size_t var = getInstructionKind(insn); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS string mne = insn->get_mnemonic(); boost::to_lower(mne); normalizedUnparsedInstructions += mne; #endif const SgAsmExpressionPtrList& operands = getOperands(insn); size_t operandCount = operands.size(); // Add to total for this variant ++vec.totalForVariant(var); // Add to total for each kind of operand for (size_t i = 0; i < operandCount; ++i) { SgAsmExpression* operand = operands[i]; ExpressionCategory cat = getCategory(operand); ++vec.opsForVariant(cat, var); // Add to total for this unique operand number (for this window) hash_map<SgAsmExpression*, size_t>::const_iterator numIter = valueNumbers[(int)cat].find(operand); assert (numIter != valueNumbers[(int)cat].end()); size_t num = numIter->second; ++vec.specificOp(cat, num); // Add to total for this kind of operand ++vec.operandTotal(cat); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS normalizedUnparsedInstructions += (cat == ec_reg ? "R" : cat == ec_mem ? "M" : "V") + boost::lexical_cast<string>(num); #endif } // Add to total for this pair of operand kinds if (operandCount >= 2) { ExpressionCategory cat1 = getCategory(operands[0]); ExpressionCategory cat2 = getCategory(operands[1]); ++vec.operandPair(cat1, cat2); } #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS if (insnNumber + 1 < windowSize) { normalizedUnparsedInstructions += ";"; } #endif } // Add vector to database addVectorToDatabase(tx, vec, functionName, functionId, windowStart/stride, normalizedUnparsedInstructions, &insns[windowStart], filename, windowSize, stride); retVal = true; } addFunctionStatistics(tx, filename, functionName, functionId, insnCount); return retVal; }
bool createVectorsForAllInstructions(SgNode* top, const std::string& filename, const std::string& functionName, int functionId, size_t windowSize, size_t stride, sqlite3_connection& con) { // Ignores function boundaries bool retVal = false; vector<SgAsmx86Instruction*> insns; FindInstructionsVisitor vis; AstQueryNamespace::querySubTree(top, std::bind2nd( vis, &insns )); std::cout << "Number of instructions: " << insns.size() << std::endl; size_t insnCount = insns.size(); for (size_t windowStart = 0; windowStart + windowSize <= insnCount; windowStart += stride) { static SignatureVector vec; vec.clear(); hash_map<SgAsmExpression*, size_t> valueNumbers[3]; numberOperands(&insns[windowStart], windowSize, valueNumbers); string normalizedUnparsedInstructions; // Unparse the normalized forms of the instructions for (size_t insnNumber = 0; insnNumber < windowSize; ++insnNumber) { SgAsmx86Instruction* insn = insns[windowStart + insnNumber]; size_t var = getInstructionKind(insn); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS string mne = insn->get_mnemonic(); boost::to_lower(mne); normalizedUnparsedInstructions += mne; #endif const SgAsmExpressionPtrList& operands = getOperands(insn); size_t operandCount = operands.size(); // Add to total for this variant ++vec.totalForVariant(var); // Add to total for each kind of operand for (size_t i = 0; i < operandCount; ++i) { SgAsmExpression* operand = operands[i]; ExpressionCategory cat = getCategory(operand); ++vec.opsForVariant(cat, var); // Add to total for this unique operand number (for this window) hash_map<SgAsmExpression*, size_t>::const_iterator numIter = valueNumbers[(int)cat].find(operand); assert (numIter != valueNumbers[(int)cat].end()); size_t num = numIter->second; ++vec.specificOp(cat, num); // Add to total for this kind of operand ++vec.operandTotal(cat); #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS normalizedUnparsedInstructions += (cat == ec_reg ? "R" : cat == ec_mem ? "M" : "V") + boost::lexical_cast<string>(num); #endif } //Try to see what the effect is of jumps on the false positive rate //uint64_t addr =0; /* if( x86GetKnownBranchTarget(insn, addr) == true ) { uint64_t insn_addr = insn->get_address(); if( addr < insn_addr ) normalizedUnparsedInstructions += " UP "; else normalizedUnparsedInstructions += " DOWN "; }*/ // Add to total for this pair of operand kinds if (operandCount >= 2) { ExpressionCategory cat1 = getCategory(operands[0]); ExpressionCategory cat2 = getCategory(operands[1]); ++vec.operandPair(cat1, cat2); } #ifdef NORMALIZED_UNPARSED_INSTRUCTIONS if (insnNumber + 1 < windowSize) { normalizedUnparsedInstructions += ";"; } #endif } #if 0 // Print out this vector cout << "{"; for (size_t i = 0; i < SignatureVector::Size; ++i) { if (i != 0) cout << ", "; cout << vec[i]; } cout << "}\n"; #endif // cout << "Normalized instruction stream: " << normalizedUnparsedInstructions << endl; // Add vector to database addVectorToDatabase(con, vec, functionName, functionId, windowStart/stride, normalizedUnparsedInstructions, &insns[windowStart], filename, windowSize, stride); retVal = true; } addFunctionStatistics(con, filename, functionName, functionId, insnCount); return retVal; }