void compute_percent_similarity_statistics(double bucket_size, double increment, SqlDatabase::TransactionPtr transaction) { int num_pairs = transaction->statement("select count(*) from semantic_funcsim")->execute_int(); transaction->execute("drop table IF EXISTS fr_percent_similar"); transaction->execute("create table fr_percent_similar(similarity_low double precision, similarity_middle double precision," " similarity_high double precision, percent double precision, num_matches integer);"); SqlDatabase::StatementPtr pecent_similar_stmt = transaction->statement("insert into fr_percent_similar" // 0 1 2 "(similarity_low, similarity_middle, similarity_high," // 3 4 " percent, num_matches) " " values (?, ?, ?, ?, ?)"); for (double cur_bucket = 0.0; cur_bucket <= 1.0+bucket_size; cur_bucket+=increment) { int num_matches = transaction->statement("select count(*) from semantic_funcsim where " " similarity >= " + boost::lexical_cast<std::string>(cur_bucket - bucket_size) + " and similarity < " + boost::lexical_cast<std::string>(cur_bucket + bucket_size))->execute_int(); pecent_similar_stmt->bind(0, cur_bucket - bucket_size < 0 ? 0 : cur_bucket - bucket_size); pecent_similar_stmt->bind(1, cur_bucket); pecent_similar_stmt->bind(2, cur_bucket + bucket_size >= 1.0 ? 1.0 : cur_bucket + bucket_size); pecent_similar_stmt->bind(3, num_pairs > 0 ? ((double) num_matches*100.0)/num_pairs : 0); pecent_similar_stmt->bind(4, num_matches); pecent_similar_stmt->execute(); } }
int main(int argc, char *argv[]) { std::ios::sync_with_stdio(); argv0 = argv[0]; { size_t slash = argv0.rfind('/'); argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1); if (0==argv0.substr(0, 3).compare("lt-")) argv0 = argv0.substr(3); } int argno = 1; bool link = false; std::vector<std::string> signature_components; for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) { std::cout << argv[argno] << std::endl; if (!strcmp(argv[argno], "--")) { ++argno; break; } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) { ::usage(0); } else if (!strcmp(argv[argno], "--link")) { link = true; } else if (!strcmp(argv[argno], "--no-link")) { link = false; } else { std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n" <<"see \"" <<argv0 <<" --help\" for usage info.\n"; exit(1); } } if (argno+2!=argc) ::usage(1); std::string db_name(argv[argno++]); std::cout << "Connecting to db:" << db_name << std::endl; SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(db_name); transaction = conn->transaction(); transaction->execute("drop table if exists syscalls_made;"); transaction->execute("create table syscalls_made (caller integer references semantic_functions(id)," " syscall_id integer, syscall_name text)"); std::cout << "database name is : " << std::string(argv[argno]) << std::endl; std::string specimen_name = argv[argno++]; // Parse the binary specimen SgAsmInterpretation *interp = open_specimen(specimen_name, argv0, link); assert(interp!=NULL); // Figure out what functions need to be added to the database. std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(interp); DirectedGraph* G = create_reachability_graph(all_functions, interp); add_calls_to_syscalls_to_db(transaction, G, all_functions); analyze_data(transaction); transaction->commit(); return 0; }
// Create the tmp_src table: the distinct rows of semantic_sources that belong to the same source file as some
// instruction in tmp_insns and whose line number is within ten lines (inclusive) of that instruction's source line.
static void gather_source_code(const SqlDatabase::TransactionPtr &tx)
{
    static const char create_tmp_src[] =
        "create temporary table tmp_src as"
        " select distinct src.*"
        " from tmp_insns as insn"
        " join semantic_sources as src"
        " on insn.src_file_id=src.file_id"
        " and src.linenum >= insn.src_line-10"
        " and src.linenum <= insn.src_line+10";

    tx->execute(create_tmp_src);
}
static void postprocess(const SqlDatabase::TransactionPtr &tx) { int windowSize = tx->statement("select window_size from run_parameters limit 1")->execute_int(); int stride = tx->statement("select stride from run_parameters limit 1")->execute_int(); assert(windowSize != 0); assert(stride != 0); cerr << "About to delete from postprocessed_clusters" << endl; tx->execute("delete from postprocessed_clusters"); cerr << "... done" << endl; cerr << "About to postprocess" << endl; SqlDatabase::StatementPtr cmd = tx->statement("select cluster, function_id, index_within_function, vectors_row" " from clusters order by cluster, function_id, index_within_function"); SqlDatabase::StatementPtr insertCmd = tx->statement("insert into postprocessed_clusters" " select * from clusters where row_number = ?"); const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2); string last_cluster = ""; string last_func_id = ""; size_t last_index_within_function = 0; vector<string> rows_in_this_cluster; bool first = true; for (SqlDatabase::Statement::iterator postproc_reader=cmd->begin(); postproc_reader!=cmd->end(); ++postproc_reader) { string cluster = postproc_reader.get<std::string>(0); string function_id = postproc_reader.get<std::string>(1); size_t index_within_function = postproc_reader.get<size_t>(2); string cluster_row_number = postproc_reader.get<std::string>(3); bool differentFunction = cluster != last_cluster || function_id != last_func_id; bool endingCluster = differentFunction; bool beginningNewCluster = first || differentFunction; first = false; if (endingCluster) { if (rows_in_this_cluster.size() > 1) { // Skip clusters that have only one element left for (size_t i = 0; i < rows_in_this_cluster.size(); ++i) { insertCmd->bind(0, rows_in_this_cluster[i]); insertCmd->execute(); } } } if (beginningNewCluster) { last_cluster = cluster; last_func_id = function_id; last_index_within_function = index_within_function; rows_in_this_cluster.clear(); } bool keep = 
beginningNewCluster || (index_within_function >= last_index_within_function + numStridesThatMustBeDifferent); if (keep) { last_index_within_function = index_within_function; rows_in_this_cluster.push_back(cluster_row_number); } } cerr << "... done" << endl; }
// Create the tmp_insns table to hold all the instructions for the function-to-be-listed and all the instructions of all // the functions that are mentioned in events. static void gather_instructions(const SqlDatabase::TransactionPtr tx, int func_id, const Events &events) { std::set<std::string> func_ids; func_ids.insert(StringUtility::numberToString(func_id)); for (Events::const_iterator ei=events.begin(); ei!=events.end(); ++ei) func_ids.insert(StringUtility::numberToString(ei->second.func_id)); std::string sql = "create temporary table tmp_insns as" " select * from semantic_instructions" " where func_id in ("+StringUtility::join_range(", ", func_ids.begin(), func_ids.end())+")"; tx->execute(sql); }
// Create and populate the tmp_events table. static void gather_events(const SqlDatabase::TransactionPtr &tx, int func_id) { tx->execute("create temporary table tmp_events as select * from semantic_fio_trace limit 0"); if (opt.show_trace) { std::string sql = "insert into tmp_events select * from semantic_fio_trace where func_id = ?"; std::vector<std::string> igroups; for (std::set<int>::const_iterator i=opt.traces.begin(); i!=opt.traces.end(); ++i) igroups.push_back(StringUtility::numberToString(*i)); if (!igroups.empty()) sql += " and igroup_id in (" + StringUtility::join(", ", igroups) + ")"; tx->statement(sql)->bind(0, func_id)->execute(); } }
/**************************************************************************************** * * * Compute how mean similar functions are to all other functions. * * The result is inserted into fr_mean_similarity on the test db, and fr_mean_similar on * the global db. * */ void compute_mean_similarity_statistics(double bucket_size, double increment, SqlDatabase::TransactionPtr transaction) { int num_pairs = transaction->statement("select count(*) from semantic_funcsim")->execute_int(); transaction->execute("drop table IF EXISTS fr_mean_similarity;"); transaction->execute("create table fr_mean_similarity as select coalesce(sum(sf.similarity)/"+ boost::lexical_cast<std::string>(num_pairs)+ " ,0) as similarity, ttf.id as func_id from semantic_funcsim as sf"+ " join semantic_functions as ttf on ttf.id = sf.func1_id OR ttf.id = sf.func2_id GROUP BY ttf.id"); transaction->execute("drop table IF EXISTS fr_mean_similar"); transaction->execute("create table fr_mean_similar(similarity_low double precision, similarity_middle double precision," " similarity_high double precision, percent double precision);"); SqlDatabase::StatementPtr mean_similar_stmt = transaction->statement("insert into fr_mean_similar" // 0 1 2 "(similarity_low, similarity_middle, similarity_high," // 3 " percent) " " values (?, ?, ?, ?)"); for (double cur_bucket = 0.0; cur_bucket <= 1.0+bucket_size; cur_bucket+=increment) { int num_matches = transaction->statement("select count(*) from fr_mean_similarity where " " similarity >= " + boost::lexical_cast<std::string>(cur_bucket - bucket_size) + " and similarity < " + boost::lexical_cast<std::string>(cur_bucket + bucket_size))->execute_int(); mean_similar_stmt->bind(0, cur_bucket - bucket_size < 0 ? 0 : cur_bucket - bucket_size); mean_similar_stmt->bind(1, cur_bucket); mean_similar_stmt->bind(2, cur_bucket + bucket_size >= 1.0 ? 1.0 : cur_bucket + bucket_size); mean_similar_stmt->bind(3, num_pairs > 0 ? 
((double) num_matches*100.0)/num_pairs : 0); mean_similar_stmt->execute(); } }
// Runs the exact clone detection program `Exec` as a child process ("Exec --database databaseName") and waits for it.
//
// Before forking, detection_parameters is reset to a single row with similarity_threshold 1.0 and false_negative_rate 0.
// The process exits with status 1 if fork() fails and aborts if waitpid() fails.  The child's raw wait status is
// printed to standard error; it is not otherwise interpreted.
static void callExact(const SqlDatabase::TransactionPtr &tx, const std::string databaseName, const string& Exec)
{
    // FIXME: We can't pass parameters to the exec'd process this way because the parent's SQL statements are
    //        being executed in a transaction -- they won't be visible in the child. [Robb P. Matzke 2013-08-12]
    tx->execute("delete from detection_parameters");
    tx->statement("insert into detection_parameters (similarity_threshold, false_negative_rate) values (?, ?)")
        ->bind(0, 1.0)
        ->bind(1, 0)
        ->execute();

    std::cout << "Start running exact clone detection" << std::endl;

    pid_t p = fork();
    if (p == -1) { // Error
        perror("fork: ");
        exit (1);
    }

    if (p == 0) { // Child
        std::vector<char*> args;
        args.push_back(strdup(Exec.c_str()));
        args.push_back(strdup("--database"));
        args.push_back(strdup(databaseName.c_str()));
        args.push_back(0);                              // execv() requires a null-terminated argument vector

        std::ostringstream outStr;
        for (std::vector<char*>::iterator iItr = args.begin(); iItr != args.end(); ++iItr)
            outStr << *iItr << " ";
        std::cout << "Calling " << outStr.str() << std::endl;

        execv(Exec.c_str(), &args[0]);

        // Only reached when execv() failed.  Leave with _exit() rather than exit(): exit() would run the atexit
        // handlers and static destructors inherited from the parent (and flush its stdio buffers a second time)
        // inside the child.
        perror("execv: ");
        _exit (1);
    } else { // Parent
        int status;
        if (waitpid(p, &status, 0) == -1) {
            perror("waitpid");
            abort();
        }
        std::cerr << "Status: " << status << std::endl;
        std::cerr << "Done waiting for Exact Clone Detection" << std::endl;
    }
}
// Executes the SQL text held in syntactic_schema_create on the given transaction.
void createDatabases(const SqlDatabase::TransactionPtr &tx)
{
    // defined in machine-generated SyntacticSchema.C
    extern const char *syntactic_schema_create;

    const char *const schema_sql = syntactic_schema_create;
    tx->execute(schema_sql);
}
// Runs LSH-based clone detection over groups of vectors partitioned by their sum_of_counts.
//
//   tx                    transaction used for the parameter table and the group-size queries
//   databaseName          forwarded to the LSH executor
//   similarity_threshold  stored in detection_parameters; also determines the distance given to computeranges()
//   Exec                  path of the LSH program, forwarded to the executor
//   norm                  3 selects executeLSHCode(); any other value selects executeLSHCodeLLNL()
//   hash_function_size, hash_table_count
//                         accepted for interface compatibility; not used by this function
//
// The ranges come from computeranges().  The group with the most vectors is processed first, followed by the remaining
// groups in their original order; groups with fewer than two vectors are skipped.  A temporary directory is created
// for the parameter file and removed again at the end.
static void callLSH(const SqlDatabase::TransactionPtr &tx, const std::string databaseName, double similarity_threshold,
                    const string& Exec, int norm, size_t hash_function_size, size_t hash_table_count)
{
    double distance = sqrt((1. - similarity_threshold) * 50.);
    double false_negative_rate = ( similarity_threshold != 1.0) ? 0.0100 : 0;
    std::vector<CloneRange> ranges = computeranges(distance, 50, 100000);
    int maxNumElementsInGroup = -1;
    int maxNumElementIdx = -1;

    // FIXME: We can't pass parameters to the exec'd process this way because the parent's SQL statements are
    //        being executed in a transaction -- they won't be visible in the child. [Robb P. Matzke 2013-08-12]
    tx->execute("delete from detection_parameters");
    tx->statement("insert into detection_parameters (similarity_threshold, false_negative_rate) values (?, ?)")
        ->bind(0, similarity_threshold)
        ->bind(1, false_negative_rate)
        ->execute();

    std::map<size_t, int> groupSizes;
    std::cout << "Looking for the biggest group" << std::endl;
    for (size_t i = 0; i < ranges.size(); ++i) {
        // A high bound of -1 means the range has no upper limit.
        std::string sql = std::string("select count(*) from vectors where sum_of_counts >= ?") +
                          (ranges[i].high != -1 ? " and sum_of_counts <= ?" : "");
        SqlDatabase::StatementPtr cmd = tx->statement(sql);
        cmd->bind(0, ranges[i].low);
        if (ranges[i].high != -1)
            cmd->bind(1, ranges[i].high);
        int numElementsInGroup = cmd->execute_int();
        groupSizes[i] = numElementsInGroup;
        std::cerr << "The current group from " << ranges[i].low << " to " << ranges[i].high
                  << " is of size " << numElementsInGroup << std::endl;
        if (numElementsInGroup > maxNumElementsInGroup) {
            maxNumElementsInGroup = numElementsInGroup;
            maxNumElementIdx = i;
        }
    }

    // With no ranges there is no biggest group; indexing ranges[-1] below would be undefined behavior.
    if (maxNumElementIdx < 0) {
        std::cerr << "No groups found; nothing to do" << std::endl;
        return;
    }

    std::cout << "Biggest group found " << ranges[maxNumElementIdx].low << " "
              << ranges[maxNumElementIdx].high << std::endl;

    char tempDirName[] = "/tmp/paramdirXXXXXX";
    char* mkdtempResult = mkdtemp(tempDirName);
    if (!mkdtempResult) {
        perror("mkdtemp: ");
        exit (1);
    }
    // The parameter file lives inside the directory just created.  (A hard-coded debugging path used to overwrite this
    // name, pointing the executor at a directory that mkdtemp() never created.)
    std::string paramFileName = std::string(tempDirName) + "/params";

    std::cout << "Number of groups :" << ranges.size() << std::endl;

    for (int i = 0; i < (int)ranges.size(); ++i) {
        // Visit the biggest group first, then all other groups in their original order.
        size_t group = (i == 0) ? maxNumElementIdx : (i <= maxNumElementIdx) ? i - 1 : i;
        if (groupSizes[group] > 1) {
            std::cout << "Executing LSH code low " << ranges[group].low
                      << " high " << ranges[group].high << " group " << group << " size " << groupSizes[group]
                      << std::endl;
            if(norm == 3) {
                executeLSHCode(tx, databaseName, Exec, paramFileName, ranges[group]);
            } else {
                executeLSHCodeLLNL(tx, databaseName, Exec, paramFileName, ranges[group], norm, similarity_threshold,
                                   false_negative_rate, groupSizes[group]);
            }
        }
    }

    unlink(paramFileName.c_str());
    rmdir(tempDirName);
}
void analyze_data(SqlDatabase::TransactionPtr tx) { transaction->execute("drop table IF EXISTS functions_cg_accumulate;"); transaction->execute("create table functions_cg_accumulate as select sm.caller, count(sm.callee) as num_calls," " sf.ninsns, sf.name from semantic_rg as sm join semantic_functions as sf on sf.id=sm.caller " " group by sm.caller, sf.ninsns, sf.name;"); transaction->execute("drop table IF EXISTS functions_rg_accumulate;"); transaction->execute("create table functions_rg_accumulate as select sm.caller, count(sm.callee) as num_calls," " sf.ninsns, sf.name from semantic_cg as sm join semantic_functions as sf on sf.id=sm.caller " " group by sm.caller, sf.ninsns, sf.name;"); transaction->execute("drop table IF EXISTS syscalls_cg_accumulate;"); transaction->execute("create table syscalls_cg_accumulate as select sm.caller, count(sm.callee) as num_calls," " sf.ninsns, sf.name from semantic_cg as sm join semantic_functions as sf on sf.id=sm.caller " " join syscalls_made as sysm on sysm.caller=sm.callee " " group by sm.caller, sf.ninsns, sf.name;"); transaction->execute("drop table IF EXISTS syscalls_rg_accumulate;"); transaction->execute("create table syscalls_rg_accumulate as select sm.caller, count(sm.callee) as num_calls," " sf.ninsns, sf.name from semantic_rg as sm join semantic_functions as sf on sf.id=sm.caller " " join syscalls_made as sysm on sysm.caller=sm.callee " " group by sm.caller, sf.ninsns, sf.name;"); transaction->execute("drop table IF EXISTS syscalls_fio_accumulate"); transaction->execute("create table syscalls_fio_accumulate as " " select fio.caller_id, count(fio.callee_id) as num_calls, sf.ninsns, sf.name from syscalls_made as sm " " join semantic_fio_calls as fio on fio.callee_id = sm.caller " " join semantic_functions as sf on sf.id=fio.caller_id " " group by fio.caller_id, sf.ninsns, sf.name"); { //all functions that is not a stub function for a dynamic library call std::cout << "\n\n\n################# COMPARING ALL FUNCTIONS 
\n\n"; int num_functions = tx->statement("select count(*) from functions_rg_accumulate where name NOT LIKE '%@plt'") ->execute_int(); int num_cg_syscalls = tx->statement("select count(*) from syscalls_cg_accumulate where name NOT LIKE '%@plt'; ") ->execute_int(); int num_rg_syscalls = tx->statement("select count(*) from syscalls_rg_accumulate where name NOT LIKE '%@plt'; ") ->execute_int(); int path_calls = tx->statement("select count(distinct fio.caller_id) from syscalls_made as sm" " join semantic_fio_calls as fio on fio.callee_id = sm.caller" " join semantic_functions as sf on fio.caller_id=sf.id" " where sf.name not like '%@plt'") ->execute_int(); int total_num_functions = tx->statement("select count(*) from semantic_functions where name not like '%@plt'") ->execute_int(); std::cout << std::fixed << std::setprecision(2); std::cout << "num functions with calls: " << num_functions << std::endl; std::cout << "num functions: " << total_num_functions << std::endl; std::cout << "num callgraph syscalls: " << num_cg_syscalls << " fraction " << 100*((double) num_cg_syscalls/num_functions) << std::endl; std::cout << "num reachability graph syscalls: " << num_rg_syscalls << " fraction " << 100*((double) num_rg_syscalls/num_functions) << std::endl; std::cout << "path calls: " << path_calls << " fraction " << 100*((double) path_calls/num_functions) << std::endl; } { std::cout << "\n\n\n################# COMPARING FUNCTIONS WITH CALLS\n\n"; int num_functions = tx->statement("select count(distinct caller)" " from functions_rg_accumulate where name not like '%@plt'")->execute_int(); int num_cg_syscalls = tx->statement("select count(distinct caller) from syscalls_cg_accumulate" " where name not like '%@plt'; ")->execute_int(); int num_rg_syscalls = tx->statement("select count(distinct caller) from syscalls_rg_accumulate" " where name not like '%@plt'; ")->execute_int(); int path_calls = tx->statement("select count(distinct fio.caller_id) from semantic_fio_calls as fio" " join 
syscalls_made as sm on fio.callee_id = sm.caller" " join semantic_functions as sf on sf.id=fio.caller_id" " where sf.name not like '%@plt'")->execute_int(); int total_num_functions = tx->statement("select count(*) from semantic_functions" " where name not like '%@plt'")->execute_int(); std::cout << std::fixed << std::setprecision(2); std::cout << "num functions with calls: " << num_functions << std::endl; std::cout << "num functions: " << total_num_functions << std::endl; std::cout << "fraction of functions with calls: " << 100.0*num_functions/total_num_functions << std::endl; std::cout << "num callgraph syscalls: " << num_cg_syscalls << " fraction " << 100*((double) num_cg_syscalls/num_functions) << std::endl; std::cout << "num reachability graph syscalls: " << num_rg_syscalls << " fraction " << 100*((double) num_rg_syscalls/num_functions) << std::endl; std::cout << "path calls: " << path_calls << " fraction " << 100*((double) path_calls/num_functions) << std::endl; } { std::cout << "\n\n\n################# COMPARING FUNCTIONS WITH CALLS WITH MORE THAN 100 INSTRUCTIONS\n\n"; int num_functions = tx->statement("select count(distinct fr.caller) from functions_rg_accumulate as fr" " join semantic_functions as sf on sf.id=fr.caller" " where sf.name not like '%@plt' and sf.ninsns >=100")->execute_int(); int num_cg_syscalls = tx->statement("select count(distinct sc.caller) from syscalls_cg_accumulate as sc" " join semantic_functions as sf on sf.id=sc.caller" " where sf.name not like '%@plt' and sf.ninsns >=100")->execute_int(); int num_rg_syscalls = tx->statement("select count(distinct sr.caller) from syscalls_rg_accumulate as sr" " join semantic_functions as sf on sf.id=sr.caller" " where sf.name NOT LIKE '%@plt' and sf.ninsns >= 100")->execute_int(); int path_calls = tx->statement("select count(distinct fio.caller_id) from semantic_fio_calls as fio" " join syscalls_made as sm on fio.callee_id = sm.caller" " join semantic_functions as sf on sf.id=fio.caller_id" " where 
sf.name not like '%@plt' and sf.ninsns >= 100")->execute_int(); int total_num_functions = tx->statement("select count(*) from semantic_functions" " where ninsns >= 100 and name not like '%@plt'")->execute_int(); std::cout << std::fixed << std::setprecision(2); std::cout << "num functions with calls: " << num_functions << std::endl; std::cout << "num functions: " << total_num_functions << std::endl; std::cout << "fraction of functions with calls: " << 100.0*num_functions/total_num_functions << std::endl; std::cout << "num callgraph syscalls: " << num_cg_syscalls << " fraction " << 100*((double) num_cg_syscalls/num_functions) << std::endl; std::cout << "num reachability graph syscalls: " << num_rg_syscalls << " fraction " << 100*((double) num_rg_syscalls/num_functions) << std::endl; std::cout << "path calls: " << path_calls << " fraction " << 100*((double) path_calls/num_functions) << std::endl; } tx->statement("drop table IF EXISTS syscall_statistics;"); tx->statement("create table syscall_statistics as select distinct rg.caller, sm.syscall_id, sm.syscall_name" " from syscalls_made as sm" " join semantic_rg as rg on rg.callee = sm.caller"); }
int main(int argc, char *argv[]) { std::ios::sync_with_stdio(); argv0 = argv[0]; { size_t slash = argv0.rfind('/'); argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1); if (0==argv0.substr(0, 3).compare("lt-")) argv0 = argv0.substr(3); } Switches opt; int argno = 1; for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) { if (!strcmp(argv[argno], "--")) { ++argno; break; } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) { ::usage(0); } else if (!strncmp(argv[argno], "--entry=", 8)) { opt.entry_vas.insert(strtoull(argv[argno]+8, NULL, 0)); } else if (!strcmp(argv[argno], "--file=list") || !strcmp(argv[argno], "--files=list")) { opt.list_files = true; } else if (!strncmp(argv[argno], "--file=", 7) || !strncmp(argv[argno], "--files=", 8)) { std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true); for (size_t i=0; i<ids.size(); ++i) { const char *s = ids[i].c_str(); char *rest; errno = 0; int id = strtoul(s, &rest, 0); if (errno || rest==s || *rest) { std::cerr <<argv0 <<": invalid file ID: " <<ids[i] <<"\n"; exit(1); } opt.files.insert(id); } } else if (!strncmp(argv[argno], "--function=", 11) || !strncmp(argv[argno], "--functions=", 12)) { std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true); if (ids.size()==1 && isalpha(ids[0][0]) && ids[0].find_first_of('.')!=std::string::npos) { std::vector<std::string> words = StringUtility::split(".", ids[0]); if (words.size()!=2 || !SqlDatabase::is_valid_table_name(words[0]) || !SqlDatabase::is_valid_table_name(words[1])) { std::cerr <<argv0 <<": --function switch needs either IDs or a database TABLE.COLUMN\n"; exit(1); } opt.function_table = words[0]; opt.function_column = words[1]; } else { for (size_t i=0; i<ids.size(); ++i) { const char *s = ids[i].c_str(); char *rest; errno = 0; int id = strtoul(s, &rest, 0); if (errno || rest==s || *rest) { std::cerr <<argv0 <<": invalid function ID: " 
<<ids[i] <<"\n"; exit(1); } opt.functions.insert(id); } } } else if (!strncmp(argv[argno], "--first-fuzz=", 13)) { opt.first_fuzz = strtoul(argv[argno]+13, NULL, 0); } else if (!strncmp(argv[argno], "--name=", 7)) { opt.names.insert(argv[argno]+7); } else if (!strncmp(argv[argno], "--nfuzz=", 8)) { opt.nfuzz = strtoul(argv[argno]+8, NULL, 0); opt.nfuzz_set = true; } else if (!strncmp(argv[argno], "--size=", 7)) { opt.ninsns = strtoul(argv[argno]+7, NULL, 0); } else if (!strcmp(argv[argno], "--specimen=list") || !strcmp(argv[argno], "--specimens=list")) { opt.list_specimens = true; } else if (!strncmp(argv[argno], "--specimen=", 11) || !strncmp(argv[argno], "--specimens=", 12)) { std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true); for (size_t i=0; i<ids.size(); ++i) { const char *s = ids[i].c_str(); char *rest; errno = 0; int id = strtoul(s, &rest, 0); if (errno || rest==s || *rest) { std::cerr <<argv0 <<": invalid specimen ID: " <<ids[i] <<"\n"; exit(1); } opt.specimens.insert(id); } } else { std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n" <<"see \"" <<argv0 <<" --help\" for usage info.\n"; exit(1); } } if (argno+1!=argc) ::usage(1); SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(argv[argno++])->transaction(); // List the ID numbers and names for all specimen files if (opt.list_specimens) { SqlDatabase::Table<int, std::string> specimens; specimens.insert(tx->statement("select file.id, file.name" " from (select distinct specimen_id as id from semantic_functions) as specimen" " join semantic_files as file on specimen.id = file.id" " order by file.name")); specimens.headers("File ID", "Specimen Name"); specimens.print(std::cout); return 0; } // List the ID numbers and names for all files containing functions if (opt.list_files) { SqlDatabase::Table<int, std::string> files; files.insert(tx->statement("select file.id, file.name" " from (select distinct file_id as id from 
semantic_functions) as used" " join semantic_files as file on used.id = file.id" " order by file.name")); files.headers("File ID", "Binary File Name"); files.print(std::cout); return 0; } // Sanity checks if (!opt.functions.empty() && !opt.function_table.empty()) { std::cerr <<argv0 <<": --function=ID and --function=TABLE are mutually exclusive\n"; exit(1); } if (0==tx->statement("select count(*) from semantic_functions")->execute_int()) { std::cerr <<argv0 <<": database has no functions; nothing to test\n"; return 0; } if (0==tx->statement("select count(*) from semantic_inputvalues")->execute_int()) { std::cerr <<argv0 <<": database has no input groups; nothing to test\n"; return 0; } // Create table tmp_functions containing IDs for selected functions and their specimen IDs std::vector<std::string> constraints; if (!opt.entry_vas.empty()) constraints.push_back("func.entry_va " + SqlDatabase::in(opt.entry_vas)); if (!opt.names.empty()) constraints.push_back("func.name " + SqlDatabase::in_strings(opt.names, tx->driver())); if (!opt.specimens.empty()) constraints.push_back("func.specimen_id " + SqlDatabase::in(opt.specimens)); if (!opt.files.empty()) constraints.push_back("func.file_id " + SqlDatabase::in(opt.files)); if (!opt.functions.empty()) constraints.push_back("func.id " + SqlDatabase::in(opt.functions)); if (opt.ninsns>0) constraints.push_back("func.ninsns >= " + StringUtility::numberToString(opt.ninsns)); std::string sql1 = "select func.id, func.specimen_id from semantic_functions as func"; if (!opt.function_table.empty()) sql1 += " join "+opt.function_table+" as flist on func.id = flist."+opt.function_column; if (!constraints.empty()) sql1 += " where " + StringUtility::join(" and ", constraints); tx->execute("create temporary table tmp_functions as " + sql1); // Create table tmp_inputgroups containing IDs for selected input groups std::string sql2 = "select distinct igroup_id from semantic_inputvalues where igroup_id >= " + 
StringUtility::numberToString(opt.first_fuzz); if (opt.nfuzz_set) sql2 += " and igroup_id < " + StringUtility::numberToString(opt.first_fuzz+opt.nfuzz); tx->execute("create temporary table tmp_inputgroups as " + sql2); // Create tmp_pending as the cross product of functions and inputgroups except for those already tested tx->execute("create temporary table tmp_pending as" " select func.specimen_id as specimen_id, func.id as func_id, igroup.igroup_id as igroup_id" " from tmp_functions as func" " join tmp_inputgroups as igroup" " on igroup.igroup_id is not null" // "on" clause and "is not null" (rather than "true") for portability " except" " select func.specimen_id, func.id, fio.igroup_id" " from semantic_fio as fio" " join semantic_functions as func on fio.func_id=func.id"); SqlDatabase::StatementPtr stmt = tx->statement("select distinct specimen_id, func_id, igroup_id" " from tmp_pending" " order by specimen_id, igroup_id, func_id"); for (SqlDatabase::Statement::iterator row=stmt->begin(); row!=stmt->end(); ++row) std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\t" <<row.get<int>(2) <<"\n"; // no need to commit, but if we change this in the future, be sure to add begin_command()/finish_command() return 0; }
int main(int argc, char* argv[]) { std::string database; size_t l = 4, k = 700; size_t hashTableNumBuckets = 13000000, hashTableElementsPerBucket = 20; double distBound = 1.; double similarity=1.; double r = 4.; int norm = 1; int groupLow=-1; int groupHigh=-1; //Timing struct timeval before, after; struct rusage ru_before, ru_after; gettimeofday(&before, NULL); getrusage(RUSAGE_SELF, &ru_before); bool nodelete = false; try { options_description desc("Allowed options"); desc.add_options() ("help", "Produce a help message") ("nodelete", "Do not delete from vectors") ("groupLow,g", value< int >(&groupLow), "The lowest count of elements") ("groupHigh,G", value< int >(&groupHigh), "The highest count of elements") ("database", value< string >(&database), "The sqlite database that we are to use") ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function") ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create") ("buckets,b", value< size_t >(&hashTableNumBuckets), "The number of buckets in each hash table (buckets may store multiple elements)") ("bucket-size,s", value< size_t >(&hashTableElementsPerBucket), "The number of elements that can be stored in each hash table bucket") ("similarity,t", value< double >(&similarity), "The similarity threshold that is allowed in a clone pair") ("distance,d", value< double >(&distBound), "The maximum distance that is allowed in a clone pair") ("interval-size,r", value< double >(&r), "The divisor for the l_2 hash function family") ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2)") ; variables_map vm; store(parse_command_line(argc, argv, desc), vm); notify(vm); distBound = similarity==1 ? 
0.0 : sqrt(2*groupLow*(1.-similarity)); std::cerr << "similarity " << similarity << " distBound " << distBound << std::endl; if (vm.count("help")) { cout << desc << endl; exit(0); } if (vm.count("nodelete")) { nodelete = true; } if (vm.count("groupLow") == 0) { groupLow = -1; } if (vm.count("groupHigh") == 0) { groupHigh = -1; } if (database == "") { std::cerr << "Missing options. Call as: " << argv[0] << " --database <database-name> [other parameters]" << std::endl; exit(1); } if (hashTableNumBuckets >= (1ULL << 32)) { cerr << "Number of buckets must be less than 2**32" << endl; exit (1); } if (norm != 1 && norm != 2) { cerr << "Norm must be either 1 or 2" << endl; exit (1); } if (nodelete == false) { cerr << "groupLow: " << groupLow << std::endl; cerr << "groupHigh: " << groupHigh << std::endl; cerr << "norm: l_" << norm << std::endl; cerr << "database: " << database << std::endl; cerr << "k: " << k << std::endl; cerr << "l: " << l << std::endl; cerr << "buckets: " << hashTableNumBuckets << std::endl; cerr << "bucket size: " << hashTableElementsPerBucket << std::endl; cerr << "distance: " << distBound << std::endl; cerr << "r: " << r << std::endl; } } catch(exception& e) { cout << e.what() << "\n"; exit (1); } SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction(); scoped_array_with_size<VectorEntry> vectors; scoped_array_with_size<scoped_array_with_size<VectorEntry> > duplicateVectors; //Step to pass to LSH only the vectors that are not part of an exact clone pass { std::vector<int> functionsThatWeAreInterestedIn; scoped_array_with_size<VectorEntry> allVectors; map<string, std::vector<int> > internTable; read_vector_data(tx, allVectors, functionsThatWeAreInterestedIn, internTable, groupLow, groupHigh, false); //Assign to vectors the first element of each hash bucket int numberOfBuckets = 0; for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++) numberOfBuckets++; 
vectors.allocate(numberOfBuckets); duplicateVectors.allocate(numberOfBuckets); int indexInVectors=0; std::cout << "All is size: " << allVectors.size() << " reduced size is " << vectors.size() << std::endl; for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++) { if (iItr->second.size()>1) duplicateVectors[indexInVectors].allocate(iItr->second.size()-1); for (unsigned int i = 0 ; i < iItr->second.size() ; i++) { VectorEntry& allVe = allVectors[iItr->second[i]]; VectorEntry& ve = i == 0 ? vectors[indexInVectors] : duplicateVectors[indexInVectors][i-1]; ve.rowNumber = allVe.rowNumber; ve.functionId = allVe.functionId; ve.indexWithinFunction = allVe.indexWithinFunction; ve.line = allVe.line; ve.offset = allVe.offset; ve.compressedCounts.allocate(allVe.compressedCounts.size()); memcpy(ve.compressedCounts.get(), allVe.compressedCounts.get(), allVe.compressedCounts.size()); } indexInVectors++; } } if (vectors[0].compressedCounts.size() == 0) { cerr << "Vector slot 0 is empty" << endl; abort(); } size_t numVectorElements = getUncompressedSizeOfVector(vectors[0].compressedCounts.get(), vectors[0].compressedCounts.size()); if (debug_messages) { cout << "Vectors have " << numVectorElements << " elements" << endl; cout << "Number of vectors fetched is " << vectors.size() << std::endl; } LSHTableBase* table = NULL; switch (norm) { case 1: table = new LSHTable<HammingHashFunctionSet, L1DistanceObject>(vectors, L1DistanceObject(), k, l, r, numVectorElements, hashTableNumBuckets, hashTableElementsPerBucket, distBound); break; case 2: table = new LSHTable<StableDistributionHashFunctionSet, L2DistanceObject>(vectors, L2DistanceObject(), k, l, r, numVectorElements, hashTableNumBuckets, hashTableElementsPerBucket, distBound); break; default: cerr << "Bad value for --norm" << endl; abort(); // Should have been caught earlier } assert(table); // Setup stuff for postprocessing int windowSize = 0; int stride = 0; 
get_run_parameters(tx, windowSize, stride); if (nodelete == false) { cerr << "About to delete from clusters" << endl; tx->execute("delete from clusters"); cerr << "... done" << endl; cerr << "About to delete from postprocessed_clusters" << endl; tx->execute("delete from postprocessed_clusters"); cerr << "... done" << endl; } const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2); // Get clusters and postprocess them vector<bool> liveVectors(vectors.size(), true); size_t clusterNum = 0, postprocessedClusterNum = 0; for (size_t i = 0; i < vectors.size(); ++i) { //Loop over vectors //Creating potential clusters if (!liveVectors[i]) continue; liveVectors[i] = false; vector<pair<size_t, double> > clusterElementsRaw = table->query(i); // Pairs are vector number, distance vector<pair<uint64_t, double> > clusterElements; vector<uint64_t > postprocessedClusterElements; clusterElements.push_back(make_pair(i, 0)); //const VectorEntry& ve = vectors[i]; for (size_t j = 0; j < clusterElementsRaw.size(); ++j) { size_t entry = clusterElementsRaw[j].first; //double dist = clusterElementsRaw[j].second; // All entries less than i were in previous clusters, so we save an array lookup if (entry <= i || !liveVectors[entry]) continue; clusterElements.push_back(clusterElementsRaw[j]); liveVectors[entry] = false; } if (clusterElements.size() < 2 && duplicateVectors[i].size() == 0 ) continue; //Insert raw cluster data for (vector<pair<uint64_t, double> >::const_iterator j = clusterElements.begin(); j != clusterElements.end(); ++j) { for(size_t k = 0; k < duplicateVectors[j->first].size(); k++) { const VectorEntry& ve = duplicateVectors[j->first][k]; insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second); } const VectorEntry& ve = vectors[j->first]; insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second); } if (clusterNum % 10000 == 0 && debug_messages) cerr << "cluster " << clusterNum 
<< " has " << clusterElements.size() << " elements" << endl; ++clusterNum; //Postprocessing does not make sense for inexact clones if (similarity != 1.0 ) continue; // This implicitly groups elements in the same function together and order by index_within_function in each function // Not needed because of the sort in LSHTable::query() which is on the cluster number: // std::sort(clusterElements.begin(), clusterElements.end()); //The next two variables will we initialized in first run size_t lastFunctionId=0; size_t lastIndexWithinFunction=0; bool first = true; std::vector<VectorEntry*> clusterElemPtr; for (size_t j = 0; j < clusterElements.size(); ++j) { clusterElemPtr.push_back( &vectors[ clusterElements[j].first ] ); for (size_t k = 0; k < duplicateVectors[clusterElements[j].first].size(); k++) clusterElemPtr.push_back(&duplicateVectors[ clusterElements[j].first ][k]); } std::sort(clusterElemPtr.begin(), clusterElemPtr.end(), compare_rows ); for (size_t j = 0; j < clusterElemPtr.size(); ++j) { const VectorEntry& ve = *clusterElemPtr[j]; if (first || ve.functionId != lastFunctionId || ve.indexWithinFunction >= lastIndexWithinFunction + numStridesThatMustBeDifferent) { lastFunctionId = ve.functionId; lastIndexWithinFunction = ve.indexWithinFunction; postprocessedClusterElements.push_back(j); } first = false; } if (postprocessedClusterElements.size() >= 2) { //insert post processed data for (vector<uint64_t >::const_iterator j = postprocessedClusterElements.begin(); j != postprocessedClusterElements.end(); ++j) { const VectorEntry& ve = *clusterElemPtr[*j]; insert_into_postprocessed_clusters(tx, postprocessedClusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, 0); } if (postprocessedClusterNum % 1000 == 0) { cerr << "postprocessed cluster " << postprocessedClusterNum << " has " << postprocessedClusterElements.size() << " elements" << endl; } ++postprocessedClusterNum; } } cerr << clusterNum << " total cluster(s), " << postprocessedClusterNum << " after 
postprocessing" << endl; gettimeofday(&after, NULL); getrusage(RUSAGE_SELF, &ru_after); insert_timing(tx, "lsh", groupLow, groupHigh, vectors.size(), k, l, before,after, ru_before, ru_after); tx->commit(); return 0; }
int main(int argc, char *argv[]) { std::ios::sync_with_stdio(); argv0 = argv[0]; { size_t slash = argv0.rfind('/'); argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1); if (0==argv0.substr(0, 3).compare("lt-")) argv0 = argv0.substr(3); } int argno = 1; for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) { if (!strcmp(argv[argno], "--")) { ++argno; break; } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) { ::usage(0); } else if (!strcmp(argv[argno], "--delete")) { opt.delete_old_data = true; } else if (!strncmp(argv[argno], "--exclude-functions=", 20)) { opt.exclude_functions_table = argv[argno]+20; } else if (!strcmp(argv[argno], "--no-delete")) { opt.delete_old_data = false; } else if (!strncmp(argv[argno], "--relation=", 11)) { opt.relation_id = strtol(argv[argno]+11, NULL, 0); } else { std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n" <<argv0 <<": see --help for more info\n"; exit(1); } }; if (argno+1!=argc) ::usage(1); time_t start_time = time(NULL); SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]); SqlDatabase::TransactionPtr tx = conn->transaction(); // Save ourself in the history if we're modifying the database. int64_t cmd_id=-1; if (opt.delete_old_data) cmd_id = CloneDetection::start_command(tx, argc, argv, "clearing funcsim data for relation #"+ StringUtility::numberToString(opt.relation_id), start_time); // The 32-func-similarity tool needs this index, so we might as well create it here when we're running serially. The // semantic_outputvalues table can be HUGE depending on how the analysis is configured (i.e., whether it saves output // values as a vector or set, whether it saves function calls and system calls, etc.). Since creating the index could take // a few minutes, we'd rather not create it if it alread exists, but PostgreSQL v8 doesn't have a "CREATE INDEX IF NOT // EXISTS" ability. 
Therefore, try to create the index right away before we make any other changes, and if creation fails // then start a new transaction (because the current one is hosed). std::cerr <<argv0 <<": creating output group index (could take a while)\n"; try { SqlDatabase::TransactionPtr tx = conn->transaction(); tx->execute("create index idx_ogroups_hashkey on semantic_outputvalues(hashkey)"); tx->commit(); } catch (const SqlDatabase::Exception&) { std::cerr <<argv0 <<": idx_ogroups_hashkey index already exists; NOT dropping and recreating\n"; } // Delete old data. if (opt.delete_old_data) tx->statement("delete from semantic_funcsim where relation_id = ?")->bind(0, opt.relation_id)->execute(); // Get the list of functions that should appear in the worklist. std::cerr <<argv0 <<": obtaining function list\n"; std::string stmt1 = "create temporary table tmp_tested_funcs as" " select distinct fio.func_id as func_id" " from semantic_fio as fio"; if (!opt.exclude_functions_table.empty()) { std::vector<std::string> parts = StringUtility::split('.', opt.exclude_functions_table, 2, true); if (parts.size()<2) parts.push_back("func_id"); stmt1 += " left join " + parts.front() + " as exclude" " on fio.func_id = exclude." + parts.back() + " where exclude." + parts.back() + " is null"; } tx->execute(stmt1); // Create pairs of function IDs for those functions which have been tested and for which no similarity measurement has been // computed. (FIXME: We should probably recompute similarity that might have changed due to rerunning tests or running the // same function but with more input groups. [Robb P. 
Matzke 2013-06-19]) std::cerr <<argv0 <<": creating work list\n"; SqlDatabase::StatementPtr stmt2 = tx->statement("select distinct f1.func_id as func1_id, f2.func_id as func2_id" " from tmp_tested_funcs as f1" " join tmp_tested_funcs as f2 on f1.func_id < f2.func_id" " except" " select func1_id, func2_id from semantic_funcsim as sim" " where sim.relation_id = ?"); stmt2->bind(0, opt.relation_id); for (SqlDatabase::Statement::iterator row=stmt2->begin(); row!=stmt2->end(); ++row) std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\n"; if (cmd_id>=0) CloneDetection::finish_command(tx, cmd_id, "cleared funcsim table for relation #"+ StringUtility::numberToString(opt.relation_id)); tx->commit(); return 0; }