/** Loads the callee IDs recorded in semantic_fio_calls for one function and one input group.
 *
 *  Rows are joined against tmp_interesting_funcs and visited in order of fio.pos.
 *
 *  @param func_id            Function whose recorded calls are selected (fio.func_id).
 *  @param igroup_id          Input group whose recorded calls are selected (fio.igroup_id).
 *  @param ignore_no_compares Not consulted by this function.
 *  @param call_depth         When non-negative, only rows whose caller_id equals @p func_id are selected.
 *  @param expand_ncalls      When true each callee is appended ncalls times, otherwise exactly once per row.
 *  @return A CallVec allocated with new; this function keeps no reference to it. */
CallVec*
load_api_calls_for(int func_id, int igroup_id, bool ignore_no_compares, int call_depth, bool expand_ncalls)
{
    const bool direct_calls_only = call_depth >= 0;

    std::string sql = "select distinct fio.pos, fio.callee_id, fio.ncalls"
                      " from semantic_fio_calls as fio"
                      " join tmp_interesting_funcs as f1"
                      // filter out functions with no compares
                      " on f1.func_id = fio.callee_id"
                      // filter on current parameters
                      " where fio.func_id = ? and fio.igroup_id = ?";
    // filter out functions that were not called directly
    if (direct_calls_only)
        sql += " and fio.caller_id = ?";
    sql += " order by fio.pos";

    SqlDatabase::StatementPtr query = transaction->statement(sql);
    query->bind(0, func_id);
    query->bind(1, igroup_id);
    if (direct_calls_only)
        query->bind(2, func_id);

    CallVec* result = new CallVec;
    for (SqlDatabase::Statement::iterator rec = query->begin(); rec != query->end(); ++rec) {
        const int callee = rec.get<int>(1);
        const int times_called = rec.get<int>(2);
        // One entry per row unless the caller asked for one entry per recorded call.
        const int copies = expand_ncalls ? times_called : 1;
        for (int n = 0; n < copies; ++n)
            result->push_back(callee);
    }
    return result;
}
/** Records one row in the "timing" table.
 *
 *  total_wallclock is always written as zero.  total_usertime/total_systime are the values found in @p ru_after,
 *  while wallclock/usertime/systime are the differences between the "after" and "before" samples.
 *
 *  @param tx            Transaction used to run the insert.
 *  @param property_name Value stored in the property_name column.
 *  @param before,after  Wall-clock samples bracketing the measured work.
 *  @param ru_before,ru_after Resource-usage samples bracketing the measured work. */
void
insert_timing(const SqlDatabase::TransactionPtr &tx, std::string property_name, const timeval& before,
              const timeval& after, const rusage& ru_before, const rusage& ru_after)
{
    const double total_user = tvToDouble(ru_after.ru_utime);
    const double total_sys = tvToDouble(ru_after.ru_stime);
    const double elapsed_wall = (tvToDouble(after) - tvToDouble(before));
    const double elapsed_user = (tvToDouble(ru_after.ru_utime) - tvToDouble(ru_before.ru_utime));
    const double elapsed_sys = (tvToDouble(ru_after.ru_stime) - tvToDouble(ru_before.ru_stime));

    SqlDatabase::StatementPtr insert = tx->statement("insert into timing"
                                                     // 0              1                2               3
                                                     " (property_name, total_wallclock, total_usertime, total_systime, wallclock,"
                                                     // 5         6
                                                     " usertime, systime)"
                                                     " values (?,?,?,?,?,?,?)");
    insert->bind(0, property_name);
    insert->bind(1, 0);
    insert->bind(2, total_user);
    insert->bind(3, total_sys);
    insert->bind(4, elapsed_wall);
    insert->bind(5, elapsed_user);
    insert->bind(6, elapsed_sys);
    insert->execute();
}
static void postprocess(const SqlDatabase::TransactionPtr &tx) { int windowSize = tx->statement("select window_size from run_parameters limit 1")->execute_int(); int stride = tx->statement("select stride from run_parameters limit 1")->execute_int(); assert(windowSize != 0); assert(stride != 0); cerr << "About to delete from postprocessed_clusters" << endl; tx->execute("delete from postprocessed_clusters"); cerr << "... done" << endl; cerr << "About to postprocess" << endl; SqlDatabase::StatementPtr cmd = tx->statement("select cluster, function_id, index_within_function, vectors_row" " from clusters order by cluster, function_id, index_within_function"); SqlDatabase::StatementPtr insertCmd = tx->statement("insert into postprocessed_clusters" " select * from clusters where row_number = ?"); const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2); string last_cluster = ""; string last_func_id = ""; size_t last_index_within_function = 0; vector<string> rows_in_this_cluster; bool first = true; for (SqlDatabase::Statement::iterator postproc_reader=cmd->begin(); postproc_reader!=cmd->end(); ++postproc_reader) { string cluster = postproc_reader.get<std::string>(0); string function_id = postproc_reader.get<std::string>(1); size_t index_within_function = postproc_reader.get<size_t>(2); string cluster_row_number = postproc_reader.get<std::string>(3); bool differentFunction = cluster != last_cluster || function_id != last_func_id; bool endingCluster = differentFunction; bool beginningNewCluster = first || differentFunction; first = false; if (endingCluster) { if (rows_in_this_cluster.size() > 1) { // Skip clusters that have only one element left for (size_t i = 0; i < rows_in_this_cluster.size(); ++i) { insertCmd->bind(0, rows_in_this_cluster[i]); insertCmd->execute(); } } } if (beginningNewCluster) { last_cluster = cluster; last_func_id = function_id; last_index_within_function = index_within_function; rows_in_this_cluster.clear(); } bool keep = 
beginningNewCluster || (index_within_function >= last_index_within_function + numStridesThatMustBeDifferent); if (keep) { last_index_within_function = index_within_function; rows_in_this_cluster.push_back(cluster_row_number); } } cerr << "... done" << endl; }
/** Loads the distinct callees of a function from either the reachability graph or the call graph table.
 *
 *  @param func_id            Caller whose callees are selected (scg.caller).
 *  @param reachability_graph When true the semantic_rg table is queried, otherwise semantic_cg.
 *  @return A CallVec allocated with new, holding callee IDs in ascending order; this function keeps no reference
 *          to it. */
CallVec*
load_function_api_calls_for(int func_id, bool reachability_graph)
{
    const char *graph_table = reachability_graph ? "semantic_rg" : "semantic_cg ";

    std::string sql = "select distinct scg.callee from ";
    sql += graph_table;
    // A join against tmp_interesting_funcs (tif.func_id = scg.callee) is intentionally left disabled here.
    sql += " as scg "
           " where scg.caller=? ORDER BY scg.callee";

    SqlDatabase::StatementPtr query = transaction->statement(sql);
    query->bind(0, func_id);

    CallVec* callees = new CallVec;
    for (SqlDatabase::Statement::iterator rec = query->begin(); rec != query->end(); ++rec)
        callees->push_back(rec.get<int>(0));
    return callees;
}
/** Stores one signature vector as a new row of the "vectors" table.
 *
 *  The row ID is one more than the largest ID currently in the table (IDs are 1-origin).  The counts are stored as the
 *  base64 encoding of the compressed vector, and the instruction sequence as the base64 encoding of the MD5 digest of
 *  @p normalizedUnparsedInstructions.  Also increments numVectorsGenerated.
 *
 *  @param tx                  Transaction in which the statements run.
 *  @param vec                 Signature vector to store.
 *  @param functionName        Not used by this function.
 *  @param functionId          Stored in the function_id column.
 *  @param indexWithinFunction Stored in the index_within_function column.
 *  @param normalizedUnparsedInstructions Text that is hashed with MD5.
 *  @param firstInsn           Array of at least @p windowSize instructions; element 0 and element windowSize-1 are
 *                             dereferenced unconditionally.
 *  @param filename            Not used by this function.
 *  @param windowSize          Number of instructions of @p firstInsn that belong to this vector.
 *  @param stride              Not used by this function. */
void
addVectorToDatabase(const SqlDatabase::TransactionPtr &tx, const SignatureVector& vec, const std::string& functionName,
                    size_t functionId, size_t indexWithinFunction, const std::string& normalizedUnparsedInstructions,
                    SgAsmx86Instruction* firstInsn[], const std::string& filename, size_t windowSize, size_t stride)
{
    ++numVectorsGenerated;

    std::vector<uint8_t> packedCounts = compressVector(vec.getBase(), SignatureVector::Size);

    // Total of all counters in the vector.
    size_t countTotal = 0;
    for (size_t slot = 0; slot < SignatureVector::Size; ++slot)
        countTotal += vec[slot];

    // Address ranges covered by the instructions of this window.
    ExtentMap covered;
    for (size_t n = 0; n < windowSize; ++n) {
        SgAsmx86Instruction *insn = firstInsn[n];
        covered.insert(Extent(insn->get_address(), insn->get_size()));
    }

    unsigned char digest[16];
    MD5(reinterpret_cast<const unsigned char*>(normalizedUnparsedInstructions.data()),
        normalizedUnparsedInstructions.size(), digest);

    SqlDatabase::StatementPtr insert = tx->statement("insert into vectors"
                                                     // 0   1            2                      3     4             5
                                                     " (id, function_id, index_within_function, line, last_insn_va, size,"
                                                     // 6              7           8
                                                     "sum_of_counts, counts_b64, instr_seq_b64)"
                                                     " values (?,?,?,?,?,?,?,?,?)");
    int next_id = tx->statement("select coalesce(max(id),0)+1 from vectors")->execute_int(); // 1-origin

    insert->bind(0, next_id);
    insert->bind(1, functionId);
    insert->bind(2, indexWithinFunction);
    insert->bind(3, firstInsn[0]->get_address());
    insert->bind(4, firstInsn[windowSize-1]->get_address());
    insert->bind(5, covered.size());
    insert->bind(6, countTotal);
    insert->bind(7, StringUtility::encode_base64(&packedCounts[0], packedCounts.size()));
    insert->bind(8, StringUtility::encode_base64(digest, 16));
    insert->execute();
}
/** Records one row in the "group_timing" table.
 *
 *  total_wallclock is always written as zero.  total_usertime/total_systime are the values found in @p ru_after,
 *  while wallclock/usertime/systime are the differences between the "after" and "before" samples.
 *
 *  @param tx            Transaction used to run the insert.
 *  @param property_name Not used by this overload.
 *  @param groupLow,groupHigh Stored in the groupLow and groupHigh columns.
 *  @param num_elements  Stored in the num_elements column.
 *  @param k,l           Stored in the K and L columns.
 *  @param before,after  Wall-clock samples bracketing the measured work.
 *  @param ru_before,ru_after Resource-usage samples bracketing the measured work. */
void
insert_timing(const SqlDatabase::TransactionPtr &tx, std::string property_name, const int groupLow, const int groupHigh,
              const int num_elements, const int k, const int l, const timeval& before, const timeval& after,
              const rusage& ru_before, const rusage& ru_after)
{
    const double total_user = tvToDouble(ru_after.ru_utime);
    const double total_sys = tvToDouble(ru_after.ru_stime);
    const double elapsed_wall = (tvToDouble(after) - tvToDouble(before));
    const double elapsed_user = (tvToDouble(ru_after.ru_utime) - tvToDouble(ru_before.ru_utime));
    const double elapsed_sys = (tvToDouble(ru_after.ru_stime) - tvToDouble(ru_before.ru_stime));

    SqlDatabase::StatementPtr insert = tx->statement("insert into group_timing"
                                                     // 0         1          2             3  4  5
                                                     " (groupLow, groupHigh, num_elements, K, L, total_wallclock,"
                                                     // 6               7              8          9         10
                                                     " total_usertime, total_systime, wallclock, usertime, systime)"
                                                     " values (?,?,?,?,?,?,?,?,?,?,?)");
    insert->bind(0, groupLow);
    insert->bind(1, groupHigh);
    insert->bind(2, num_elements);
    insert->bind(3, k);
    insert->bind(4, l);
    insert->bind(5, 0);
    insert->bind(6, total_user);
    insert->bind(7, total_sys);
    insert->bind(8, elapsed_wall);
    insert->bind(9, elapsed_user);
    insert->bind(10, elapsed_sys);
    insert->execute();
}
// Bind arguments to a statement static void sqlBindArgs(const SqlDatabase::StatementPtr &stmt, const std::vector<std::string> &args) { for (size_t i=0; i<args.size(); ++i) stmt->bind(i, args[i]); }
// Load all events into memory. Events are emitted for a particular function ID being analyzed, but if the 25-run-test // --follow-calls was specified, then events for that function ID might be at instructions that are outside that function. // We need to make note of those functions so that we can load all their instructions. static void load_events(const SqlDatabase::TransactionPtr &tx, int func_id, Events &events/*in,out*/) { int specimen_id = tx->statement("select specimen_id from semantic_functions where id = ?") ->bind(0, func_id)->execute_int(); SqlDatabase::StatementPtr stmt = tx->statement("select" // 0 1 2 3 4 " event.addr, event.event_id, event.minor, event.val, func.id," // 5 6 " event.igroup_id, event.pos" " from tmp_events as event" " join semantic_instructions as insn on event.addr = insn.address" " join semantic_functions as func on insn.func_id = func.id" " where func.specimen_id = ?" " order by igroup_id, pos"); stmt->bind(0, specimen_id); for (SqlDatabase::Statement::iterator row=stmt->begin(); row!=stmt->end(); ++row) { rose_addr_t addr = row.get<rose_addr_t>(0); int event_id = row.get<int>(1); int minor = row.get<int>(2); int64_t val = row.get<int64_t>(3); events[addr].func_id = row.get<int>(4); // the hard-to-get ID, not the one stored in the events func_id column. 
int igroup_id = row.get<int>(5); int pos = row.get<int>(6); switch (event_id) { case CloneDetection::EV_REACHED: { ++events[addr].nexecuted; break; } case CloneDetection::EV_BRANCHED: { ++events[addr].nbranches; ++events[addr].branches[val]; break; } case CloneDetection::EV_RETURNED: { ++events[addr].nreturns; break; } case CloneDetection::EV_CONSUME_INPUT: { ++events[addr].ninputs; assert(minor>=0); if ((size_t)minor>=events[addr].inputs.size()) events[addr].inputs.resize(minor+1); ++events[addr].inputs[minor][val]; break; } case CloneDetection::EV_FAULT: { CloneDetection::AnalysisFault::Fault fault = (CloneDetection::AnalysisFault::Fault)minor; ++events[addr].nfaults; ++events[addr].faults[fault]; break; } case CloneDetection::EV_MEM_WRITE: { OutputEventKey output_key(igroup_id, val); OutputEventValue output_val(pos, minor); // Track final writes to each address final_output_events[output_key] = output_val; // Append event to the appropriate instruction events[addr].outputs.push_back(std::make_pair(output_key, output_val)); } default: /*void*/ break; } } }
/* Remove the functions from the compilation unit that is only available in one of the traces. * - criteria complement of the functions from the files of the caller functions in the call trace is removed. */ std::pair<CallVec*, CallVec*> remove_compilation_unit_complement(int func1_id, int func2_id, int igroup_id, int similarity, CallVec* func1_vec, CallVec* func2_vec) { CallVec* new_func1_vec = new CallVec; CallVec* new_func2_vec = new CallVec; if (func1_vec->size() > 0 || func2_vec->size() > 0) { // Find the set complement of functions called by the two functions // - we are not interested in functions called by both std::set<int> func1_vec_set; std::set<int> func2_vec_set; for (CallVec::iterator it = func1_vec->begin(); it != func1_vec->end(); ++it) func1_vec_set.insert(*it); for (CallVec::iterator it = func2_vec->begin(); it != func2_vec->end(); ++it) func2_vec_set.insert(*it); std::set<int> func1_func2_complement; std::set_difference(func1_vec_set.begin(), func1_vec_set.end(), func2_vec_set.begin(), func2_vec_set.end(), std::inserter(func1_func2_complement, func1_func2_complement.end())); // Find the compilation units in question. A compilation unit is in our case a file. SqlDatabase::StatementPtr func1_file_stmt = transaction->statement("select file_id from semantic_functions" " where id = ?"); func1_file_stmt->bind(0, func1_id); int func1_file_id = func1_file_stmt->execute_int(); SqlDatabase::StatementPtr func2_file_stmt = transaction->statement("select file_id from semantic_functions" " where id = ?"); func2_file_stmt->bind(0, func2_id); int func2_file_id = func2_file_stmt->execute_int(); // Find the functions that needs to be removed // - all functions that has a clone in between the files SqlDatabase::StatementPtr stmt = transaction->statement("select sem.func1_id, sem.func2_id from semantic_funcsim as sem" " join semantic_functions as sf1 on sem.func1_id = sf1.id" " join semantic_functions as sf2 on sem.func2_id = sf2.id" " where similarity >= ? 
and sf1.file_id in (?,?)" " and sf2.file_id in (?, ?) and sf1.file_id != sf2.file_id"); stmt->bind(0, similarity); stmt->bind(1, func1_file_id); stmt->bind(2, func2_file_id); stmt->bind(3, func1_file_id); stmt->bind(4, func2_file_id); std::set<int> complement_functions; for (SqlDatabase::Statement::iterator row=stmt->begin(); row!=stmt->end(); ++row) { int clone_func1 = row.get<int>(0); int clone_func2 = row.get<int>(1); complement_functions.insert(clone_func1); complement_functions.insert(clone_func2); } // Find the functions we want to remove // - functions present with clones in between the files that is not part of both traces std::set<int> remove_these; std::set_intersection(complement_functions.begin(), complement_functions.end(), func1_func2_complement.begin(), func1_func2_complement.end(), std::inserter(remove_these, remove_these.end())); //prune functions to remove away from the call trace into new vectors for (CallVec::iterator it = func1_vec->begin(); it != func1_vec->end(); ++it) { if (remove_these.find(*it) == remove_these.end()) new_func1_vec->push_back(*it); } for (CallVec::iterator it = func2_vec->begin(); it != func2_vec->end(); ++it) { if (remove_these.find(*it) == remove_these.end()) new_func2_vec->push_back(*it); } } return std::pair<CallVec*, CallVec*>(new_func1_vec, new_func2_vec); }
/** Partitions the vectors into groups by sum_of_counts and runs the LSH code on every group that has more than one
 *  element.
 *
 *  The detection_parameters table is first replaced with a single row holding @p similarity_threshold and the false
 *  negative rate (0.01, or 0 when the threshold is exactly 1.0).  The groups come from computeranges(); the largest
 *  group is processed first and the remaining groups follow in index order.  A parameter file path inside a fresh
 *  mkdtemp() directory is handed to the LSH code, and both the file and the directory are removed afterwards.
 *
 *  Two defects of the earlier version are fixed here: the parameter file path is no longer overwritten with a
 *  hard-coded developer path (which bypassed the temporary directory), and an empty list of ranges no longer leads to
 *  indexing the list at position -1.
 *
 *  @param tx                   Transaction in which all SQL is executed.
 *  @param databaseName         Passed through to the LSH code.
 *  @param similarity_threshold Similarity threshold; also determines the distance given to computeranges().
 *  @param Exec                 Passed through to the LSH code.
 *  @param norm                 When 3, executeLSHCode() is used; otherwise executeLSHCodeLLNL().
 *  @param hash_function_size   Not used by this function.
 *  @param hash_table_count     Not used by this function. */
static void
callLSH(const SqlDatabase::TransactionPtr &tx, const std::string databaseName, double similarity_threshold,
        const std::string& Exec, int norm, size_t hash_function_size, size_t hash_table_count)
{
    double distance = sqrt((1. - similarity_threshold) * 50.);
    double false_negative_rate = (similarity_threshold != 1.0) ? 0.0100 : 0;
    std::vector<CloneRange> ranges = computeranges(distance, 50, 100000);
    int maxNumElementsInGroup = -1;
    int maxNumElementIdx = -1;

    // FIXME: We can't pass parameters to the exec'd process this way because the parent's SQL statements are
    // being executed in a transaction -- they won't be visible in the child. [Robb P. Matzke 2013-08-12]
    tx->execute("delete from detection_parameters");
    tx->statement("insert into detection_parameters (similarity_threshold, false_negative_rate) values (?, ?)")
        ->bind(0, similarity_threshold)
        ->bind(1, false_negative_rate)
        ->execute();

    std::map<size_t, int> groupSizes;
    std::cout << "Looking for the biggest group" << std::endl;
    for (size_t i = 0; i < ranges.size(); ++i) {
        // A high bound of -1 means the range has no upper limit.
        std::string sql = std::string("select count(*) from vectors where sum_of_counts >= ?") +
                          (ranges[i].high != -1 ? " and sum_of_counts <= ?" : "");
        SqlDatabase::StatementPtr cmd = tx->statement(sql);
        cmd->bind(0, ranges[i].low);
        if (ranges[i].high != -1)
            cmd->bind(1, ranges[i].high);
        int numElementsInGroup = cmd->execute_int();
        groupSizes[i] = numElementsInGroup;
        std::cerr << "The current group from " << ranges[i].low << " to " << ranges[i].high
                  << " is of size " << numElementsInGroup << std::endl;
        if (numElementsInGroup > maxNumElementsInGroup) {
            maxNumElementsInGroup = numElementsInGroup;
            maxNumElementIdx = i;
        }
    }

    // With no ranges there is no biggest group and nothing to run; indexing ranges[-1] below would be undefined.
    if (maxNumElementIdx < 0) {
        std::cerr << "No groups to process" << std::endl;
        return;
    }

    std::cout << "Biggest group found " << ranges[maxNumElementIdx].low << " "
              << ranges[maxNumElementIdx].high << std::endl;

    char tempDirName[] = "/tmp/paramdirXXXXXX";
    char* mkdtempResult = mkdtemp(tempDirName);
    if (!mkdtempResult) {
        perror("mkdtemp: ");
        exit (1);
    }
    // The parameter file lives inside the directory that mkdtemp() just created.
    std::string paramFileName = std::string(tempDirName) + "/params";

    std::cout << "Number of groups :" << ranges.size() << std::endl;

    for (int i = 0; i < (int)ranges.size(); ++i) {
        // Visit the biggest group first, then every other group in index order.
        size_t group = (i == 0) ? maxNumElementIdx : (i <= maxNumElementIdx) ? i - 1 : i;
        if (groupSizes[group] > 1) {
            std::cout << "Executing LSH code low " << ranges[group].low
                      << " high " << ranges[group].high
                      << " group " << group << " size " << groupSizes[group] << std::endl;
            if (norm == 3) {
                executeLSHCode(tx, databaseName, Exec, paramFileName, ranges[group]);
            } else {
                executeLSHCodeLLNL(tx, databaseName, Exec, paramFileName, ranges[group], norm,
                                   similarity_threshold, false_negative_rate, groupSizes[group]);
            }
        }
    }

    unlink(paramFileName.c_str());
    rmdir(tempDirName);
}
void add_calls_to_syscalls_to_db(SqlDatabase::TransactionPtr tx, DirectedGraph* G, std::vector<SgAsmFunction*> all_functions) { // load the functions in db into memory std::map<std::string, std::set<int> > symbolToId; SqlDatabase::StatementPtr cmd3 = tx->statement("select id, name from semantic_functions"); for (SqlDatabase::Statement::iterator r=cmd3->begin(); r!=cmd3->end(); ++r) { int func_id = r.get<int>(0); std::string func_name = r.get<std::string>(1); if (func_name.size() == 0) continue; std::map<std::string, std::set<int> >::iterator fit = symbolToId.find(func_name); if (fit == symbolToId.end()) { std::set<int> function_ids; function_ids.insert(func_id); symbolToId[func_name] = function_ids; } else { fit->second.insert(func_id); } } DirectedGraph& graph = *G; SqlDatabase::StatementPtr stmt = tx->statement("insert into syscalls_made(caller, syscall_id, syscall_name) values(?,?,?)"); // Iterate over all components of the reachability graph typedef graph_traits<DirectedGraph>::vertex_descriptor Vertex; graph_traits<DirectedGraph>::vertex_iterator i, end; for (tie(i, end) = vertices(graph); i != end; ++i) { if (*i < ids_reserved_for_syscalls) continue; std::set<int> syscalls; // Iterate through the child vertex indices for [current_index] std::vector<Vertex> reachable; boost::breadth_first_search(graph, *i, boost::visitor(boost::make_bfs_visitor(boost::write_property(boost::identity_property_map(), std::back_inserter(reachable), boost::on_discover_vertex())))); for (std::vector<Vertex>::iterator it = reachable.begin(); it != reachable.end(); ++it) { if (*it < ids_reserved_for_syscalls) syscalls.insert(*it); } int caller_id = *i - ids_reserved_for_syscalls; ROSE_ASSERT(caller_id >= 0); SgAsmFunction* caller = all_functions[caller_id]; ROSE_ASSERT(isSgAsmFunction(caller) != NULL); std::string func_name = caller->get_name(); if (func_name.length() == 0) continue; std::map<std::string, std::set<int> >::iterator equivalent_ids = symbolToId.find(func_name); if 
(equivalent_ids == symbolToId.end()) equivalent_ids = symbolToId.find(func_name+"@plt"); if (syscalls.size() > 0 && equivalent_ids != symbolToId.end()) { for (std::set<int>::iterator sit = syscalls.begin(); sit != syscalls.end(); ++sit) { int syscall_callee_id = *sit; extern std::map<int, std::string> linux32_syscalls; // defined in linux_syscalls.C const std::string &syscall_name = linux32_syscalls[syscall_callee_id]; for (std::set<int>::iterator equivalent_id = equivalent_ids->second.begin(); equivalent_id != equivalent_ids->second.end(); ++ equivalent_id) { stmt->bind(0, *equivalent_id); stmt->bind(1, syscall_callee_id); stmt->bind(2, syscall_name); stmt->execute(); } } } } }
// Runs every item of `work` in this process: opens a fresh database transaction, runs one test per work item
// (reloading the input group and the per-interpretation state only when they change), checkpoints periodically, and
// finally writes the per-function partial results and performs a last checkpoint.
void operator()() {
    // Database connections don't survive over fork() according to SqLite and PostgreSQL documentation, so open it again
    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(databaseUrl)->transaction();

    // Use zero for the number of tests ran so that this child process doesn't try to update the semantic_history table.
    // If two or more processes try to change the same row (which they will if there's a non-zero number of tests) then
    // they will deadlock with each other.
    static const size_t NO_TESTS_RAN = 0;

    NameSet builtin_names;
    add_builtin_functions(builtin_names/*out*/);

    // State carried from one work item to the next
    InputGroup igroup;
    WorkItem previous_item;
    SgAsmInterpretation *previous_interp = NULL;
    MemoryMap ro_map;
    Disassembler::AddressSet whitelist_exports;         // dynamic functions that should be called
    PointerDetectors pointers;
    InsnCoverage insn_coverage;
    DynamicCallGraph dynamic_cg;
    Tracer tracer;
    ConsumedInputs consumed_inputs;
    FuncAnalyses funcinfo;
    OutputGroups ogroups;                               // do not load from database (that might take a very long time)
    time_t last_checkpoint = time(NULL);

    for (size_t item_idx=0; item_idx<work.size(); ++item_idx) {
        WorkItem &item = work[item_idx];

        // Load the input group from the database if necessary.
        if (item.igroup_id!=previous_item.igroup_id && !igroup.load(tx, item.igroup_id)) {
            std::cerr <<argv0 <<": input group " <<item.igroup_id <<" is empty or does not exist\n";
            exit(1);
        }

        // Find the function to test
        IdFunctionMap::iterator found = functions.find(item.func_id);
        assert(found!=functions.end());
        SgAsmFunction *func = found->second;
        if (opt.verbosity>=LACONIC) {
            if (opt.verbosity>=EFFUSIVE) {
                std::cerr <<argv0 <<": " <<std::string(100, '=') <<"\n";
            }
            std::cerr <<argv0 <<": processing function " <<function_to_str(func, function_ids) <<"\n";
        }
        SgAsmInterpretation *interp = SageInterface::getEnclosingNode<SgAsmInterpretation>(func);
        assert(interp!=NULL);

        // Do per-interpretation stuff
        if (interp!=previous_interp) {
            previous_interp = interp;
            assert(interp->get_map()!=NULL);
            ro_map = *interp->get_map();
            ro_map.require(MemoryMap::READABLE).prohibit(MemoryMap::WRITABLE).keep();

            // imports are addresses of import table slots; exports are functions
            Disassembler::AddressSet whitelist_imports = get_import_addresses(interp, builtin_names);
            whitelist_exports.clear();
            overmap_dynlink_addresses(interp, *insns, opt.params.follow_calls, &ro_map, GOTPLT_VALUE,
                                      whitelist_imports, whitelist_exports/*out*/);

            if (opt.verbosity>=EFFUSIVE) {
                std::cerr <<argv0 <<": memory map for SgAsmInterpretation:\n";
                interp->get_map()->dump(std::cerr, argv0+": ");
            }
        }

        // Run the test
        assert(insns!=NULL);
        assert(entry2id!=NULL);
        std::cerr <<"process " <<getpid() <<" about to run test " <<item_idx <<"/" <<work.size() <<" " <<item <<"\n";
        runOneTest(tx, item, pointers, func, function_ids, insn_coverage, dynamic_cg, tracer, consumed_inputs,
                   interp, whitelist_exports, cmd_id, igroup, funcinfo, *insns, &ro_map, *entry2id, ogroups);
        ++ntests_ran;

        // Checkpoint.  The timer is restarted even for a dry run, which skips the checkpoint itself.
        if (opt.checkpoint>0 && time(NULL)-last_checkpoint > opt.checkpoint) {
            if (!opt.dry_run) {
                tx = checkpoint(tx, ogroups, tracer, insn_coverage, dynamic_cg, consumed_inputs, NULL, NO_TESTS_RAN,
                                cmd_id);
            }
            last_checkpoint = time(NULL);
        }

        previous_item = item;
    }
    std::cerr <<"process " <<getpid() <<" is done testing; now finishing up...\n";

    // Save the per-function partial results
    if (!tx->is_terminated()) {
        SqlDatabase::StatementPtr insert = tx->statement("insert into semantic_funcpartials"
                                                         " (func_id, ncalls, nretused, ntests, nvoids) values"
                                                         " (?, ?, ?, ?, ?)");
        for (FuncAnalyses::iterator info=funcinfo.begin(); info!=funcinfo.end(); ++info) {
            insert->bind(0, info->first);
            insert->bind(1, info->second.ncalls);
            insert->bind(2, info->second.nretused);
            insert->bind(3, info->second.ntests);
            insert->bind(4, info->second.nvoids);
            insert->execute();
        }
    }

    // Cleanup
    if (!tx->is_terminated() && !opt.dry_run) {
        std::cerr <<"process " <<getpid() <<" is doing the final checkpoint\n";
        checkpoint(tx, ogroups, tracer, insn_coverage, dynamic_cg, consumed_inputs, NULL, NO_TESTS_RAN, cmd_id);
    }
    tx.reset();

    std::cerr <<"process " <<getpid() <<" finished\n";
}