int main(int argc, char *argv[]) { std::ios::sync_with_stdio(); argv0 = argv[0]; { size_t slash = argv0.rfind('/'); argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1); if (0==argv0.substr(0, 3).compare("lt-")) argv0 = argv0.substr(3); } int argno = 1; bool link = false; std::vector<std::string> signature_components; for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) { std::cout << argv[argno] << std::endl; if (!strcmp(argv[argno], "--")) { ++argno; break; } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) { ::usage(0); } else if (!strcmp(argv[argno], "--link")) { link = true; } else if (!strcmp(argv[argno], "--no-link")) { link = false; } else { std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n" <<"see \"" <<argv0 <<" --help\" for usage info.\n"; exit(1); } } if (argno+2!=argc) ::usage(1); std::string db_name(argv[argno++]); std::cout << "Connecting to db:" << db_name << std::endl; SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(db_name); transaction = conn->transaction(); transaction->execute("drop table if exists syscalls_made;"); transaction->execute("create table syscalls_made (caller integer references semantic_functions(id)," " syscall_id integer, syscall_name text)"); std::cout << "database name is : " << std::string(argv[argno]) << std::endl; std::string specimen_name = argv[argno++]; // Parse the binary specimen SgAsmInterpretation *interp = open_specimen(specimen_name, argv0, link); assert(interp!=NULL); // Figure out what functions need to be added to the database. std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(interp); DirectedGraph* G = create_reachability_graph(all_functions, interp); add_calls_to_syscalls_to_db(transaction, G, all_functions); analyze_data(transaction); transaction->commit(); return 0; }
// Process all queued test work for a single specimen.  Reopens the database
// (connections do not survive fork()), loads or parses the specimen's AST,
// builds the function/instruction lookup tables, then partitions the work into
// fixed-size chunks that are executed in parallel child processes.
// Exits the process (exit(1)) if the specimen cannot be loaded or any child fails.
void operator()() {
    if (work.empty())
        return;
    // Every item in this work list refers to the same specimen.
    int specimen_id = work.front().specimen_id;

    // Database connections don't survive over fork() according to SqLite and PostgreSQL documentation, so open it again
    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(databaseUrl)->transaction();
    OutputGroups ogroups; // do not load from database (that might take a very long time)
    if (opt.verbosity>=LACONIC) {
        if (opt.verbosity>=EFFUSIVE)
            std::cerr <<argv0 <<": " <<std::string(100, '#') <<"\n";
        std::cerr <<argv0 <<": processing binary specimen \"" <<files.name(specimen_id) <<"\"\n";
    }

    // Parse the specimen: prefer a cached AST from the database, fall back to
    // parsing the specimen from scratch.
    SgProject *project = files.load_ast(tx, specimen_id);
    if (!project)
        project = open_specimen(tx, files, specimen_id, argv0);
    if (!project) {
        std::cerr <<argv0 <<": problems loading specimen\n";
        exit(1);
    }

    // Get list of specimen functions and initialize the instruction cache
    std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(project);
    IdFunctionMap functions = existing_functions(tx, files, all_functions);
    FunctionIdMap function_ids;
    AddressIdMap entry2id;              // maps function entry address to function ID
    for (IdFunctionMap::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
        function_ids[fi->second] = fi->first;
        entry2id[fi->second->get_entry_va()] = fi->first;
    }
    InstructionProvidor insns = InstructionProvidor(all_functions);

    // Split the work list into chunks, each containing testsPerChunk except the last, which may contain fewer.
    static const size_t testsPerChunk = 25;
    size_t nChunks = (work.size() + testsPerChunk - 1) / testsPerChunk;     // ceiling division
    std::vector<SomeTests> jobs;
    for (size_t i=0; i<nChunks; ++i) {
        size_t beginWorkIdx = i * testsPerChunk;
        size_t endWorkIdx = std::min((i+1)*testsPerChunk, work.size());
        Work partWork(work.begin()+beginWorkIdx, work.begin()+endWorkIdx);
        jobs.push_back(SomeTests(partWork, databaseUrl, functions, function_ids, &insns, cmd_id, &entry2id));
    }

    // Run the parts in parallel using the maximum parallelism specified on the command-line.  We must commit our
    // transaction before forking, otherwise the children won't see the rows we've added to various tables.
    tx->commit();
    tx.reset();
    size_t nfailed = runInParallel(jobs, opt.nprocs);
    if (nfailed!=0) {
        std::cerr <<"SpecimenProcessor: " <<StringUtility::plural(nfailed, "jobs") <<" failed\n";
        exit(1);
    }
}
int main(int argc, char* argv[]) { std::string database; int norm = 1; double similarity_threshold=1.; size_t k; size_t l; try { options_description desc("Allowed options"); desc.add_options() ("help", "produce a help message") ("database,q", value< string >()->composing(), "the sqlite database that we are to use") ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2 or 3 (MIT implementation) )") ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function") ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create") ("similarity,t", value< double >(&similarity_threshold), "The similarity threshold that is allowed in a clone pair"); variables_map vm; store(parse_command_line(argc, argv, desc), vm); notify(vm); if (vm.count("help")) { cout << desc; exit(0); } if (vm.count("database")!=1) { std::cerr << "Missing options. Call as: findClones --database <database-name>" << std::endl; exit(1); } database = vm["database"].as<string >(); similarity_threshold = vm["similarity"].as<double>(); cout << "database: " << database << std::endl; } catch(exception& e) { cout << e.what() << "\n"; } std::cout << "The similarity threshold is " << similarity_threshold << std::endl; SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction(); tx->statement("update run_parameters set similarity_threshold = ?") ->bind(0, similarity_threshold) ->execute(); OperateOnClusters op(database, norm, similarity_threshold, k , l); op.analyzeClusters(); //op.calculate_false_positives(); tx->commit(); return 0; };
int main(int argc, char *argv[]) { // Parse command-line opt.nprocs = nProcessors(); int argno = parse_commandline(argc, argv); if (argno+1!=argc) usage(1); std::string databaseUrl = argv[argno++]; SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(databaseUrl)->transaction(); int64_t cmd_id = start_command(tx, argc, argv, "running tests"); // Load worklist MultiWork work; load_sorted_work(work/*out*/); if (work.empty()) return 0; // Load information about files. The transaction is not saved anywhere. FilesTable files(tx); // We must commit our transaction before we fork, otherwise the child processes won't be able to see the rows we've // inserted. Specifically, the row in the semantic_history table that says who we are. Destroy the smart pointer so that // the connection is even closed. tx->commit(); tx.reset(); // Process work items for each specimen sequentially BOOST_FOREACH (const Work &workForSpecimen, work) if (forkAndWait(SpecimenProcessor(workForSpecimen, files, databaseUrl, cmd_id))) exit(1); // Indicate that this command is finished tx = SqlDatabase::Connection::create(databaseUrl)->transaction(); finish_command(tx, cmd_id, "ran tests"); tx->commit(); return 0; }
int main(int argc, char *argv[]) { Sawyer::initializeLibrary(); mlog = Sawyer::Message::Facility("tool"); Sawyer::Message::mfacilities.insertAndAdjust(mlog); // Parse the command-line Settings settings; std::vector<std::string> args = parseCommandLine(argc, argv, settings); SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(settings.databaseUri)->transaction(); if (args.size() != 1) { mlog[FATAL] <<"incorrect usage; see --help\n"; exit(1); } if (args[0] == "clear") { clearErrors(tx, settings); } else if (args[0] == "update") { updateDatabase(tx, settings); } else if (args[0] == "missing") { listMissingErrors(tx, settings); } else if (args[0] == "count-missing") { countMissingErrors(tx, settings); } else if (args[0] == "list") { listErrors(tx, settings); } else { mlog[FATAL] <<"unknown command \"" <<StringUtility::cEscape(args[0]) <<"\"; see --help\n"; exit(1); } if (settings.dryRun) { mlog[WARN] <<"database was not modified (running with --dry-run)\n"; } else { tx->commit(); } }
int main(int argc, char *argv[]) { std::ios::sync_with_stdio(); argv0 = argv[0]; { size_t slash = argv0.rfind('/'); argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1); if (0==argv0.substr(0, 3).compare("lt-")) argv0 = argv0.substr(3); } bool ignore_inline_candidates = false; bool ignore_no_compares = false; int call_depth = -1; bool ignore_faults = true; double semantic_similarity_threshold = 0.70; bool expand_ncalls = false; bool reachability_graph = true; bool show_progress = false; bool verbose = false; std::string input_file_name; int argno = 1; for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) { if (!strcmp(argv[argno], "--")) { ++argno; break; } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) { usage(0); } else if (!strcmp(argv[argno], "--ignore-inline-candidates")) { ignore_inline_candidates = true; } else if (!strcmp(argv[argno], "--ignore-no-compares")) { ignore_no_compares = false; } else if (!strcmp(argv[argno], "--progress")) { show_progress = true; } else if (!strcmp(argv[argno], "--no-expand-ncalls")) { expand_ncalls = false; } else if (!strncmp(argv[argno], "--file=", 7)) { input_file_name = argv[argno]+7; } else if (!strcmp(argv[argno], "--verbose")) { verbose = true; } else if (!strncmp(argv[argno], "--call-depth=",13)) { call_depth = strtol(argv[argno]+13, NULL, 0); } else { std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n" <<argv0 <<": see --help for more info\n"; exit(1); } } if (argno+1!=argc) usage(1); SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]); transaction = conn->transaction(); int64_t cmd_id = CloneDetection::start_command(transaction, argc, argv, "calculating api similarity"); // Read function pairs from standard input or the file FunctionPairs worklist; if (input_file_name.empty()) { std::cerr <<argv0 <<": reading function pairs worklist from stdin...\n"; worklist = load_worklist("stdin", stdin); } else { FILE *in = 
fopen(input_file_name.c_str(), "r"); if (NULL==in) { std::cerr <<argv0 <<": " <<strerror(errno) <<": " << input_file_name <<"\n"; exit(1); } worklist = load_worklist(input_file_name, in); fclose(in); } size_t npairs = worklist.size(); std::cerr <<argv0 <<": work list has " <<npairs <<" function pair" <<(1==npairs?"":"s") <<"\n"; // Process each function pair CloneDetection::Progress progress(npairs); progress.force_output(show_progress); // Load the computational equivalence classes std::map<int,int> norm_map; computational_equivalent_classes(norm_map); // Create list of functions and igroups to analyze SqlDatabase::StatementPtr insert_stmt = transaction->statement("insert into api_call_similarity" "(func1_id, func2_id, max_similarity, min_similarity," " ave_similarity, cg_similarity)" " values (?, ?, ?, ?, ?, ?)"); while (!worklist.empty()) { ++progress; int func1_id, func2_id; boost::tie(func1_id, func2_id) = worklist.shift(); if (verbose) std::cerr <<argv0 <<": func1_id=" <<func1_id <<" func2_id=" <<func2_id <<"\n"; SqlDatabase::StatementPtr igroup_stmt = transaction->statement("select distinct sem1.igroup_id" " from semantic_fio as sem1 " " join semantic_fio as sem2" " on sem2.igroup_id = sem1.igroup_id" " and sem2.func_id = ?" " where sem1.func_id = ? " + std::string(ignore_faults ? 
" and sem1.status = 0 and sem2.status = 0" : "") + " order by sem1.igroup_id"); igroup_stmt->bind(0, func2_id); igroup_stmt->bind(1, func1_id); int ncompares = 0; double max_api_similarity = 0; double min_api_similarity = INT_MAX; double ave_api_similarity = 0; for (SqlDatabase::Statement::iterator row=igroup_stmt->begin(); row!=igroup_stmt->end(); ++row) { int igroup_id = row.get<int>(0); double api_similarity = similarity(func1_id, func2_id, igroup_id, semantic_similarity_threshold, ignore_inline_candidates, ignore_no_compares, call_depth, expand_ncalls, norm_map); if (api_similarity < 0) continue; max_api_similarity = std::max(api_similarity, max_api_similarity); min_api_similarity = std::min(api_similarity, min_api_similarity); ave_api_similarity += api_similarity; ncompares++; } if (ncompares == 0) { ave_api_similarity = 1.0; max_api_similarity = 1.0; min_api_similarity = 1.0; } else { ave_api_similarity = ave_api_similarity/ncompares; } // Find call similarity between functions double cg_similarity = whole_function_similarity(func1_id, func2_id, norm_map, reachability_graph); insert_stmt->bind(0, func1_id); insert_stmt->bind(1, func2_id); insert_stmt->bind(2, max_api_similarity); insert_stmt->bind(3, min_api_similarity); insert_stmt->bind(4, ave_api_similarity); insert_stmt->bind(5, cg_similarity); insert_stmt->execute(); } progress.message("committing changes"); std::string mesg = "calculated api similarity for "+ StringUtility::numberToString(npairs)+" function pair"+(1==npairs?"":"s"); CloneDetection::finish_command(transaction, cmd_id, mesg); transaction->commit(); progress.clear(); return 0; }
// Generate input groups for the clone-detection test harness.  Command-line
// arguments describe generators/filters to attach to named input queues; the
// tool then creates --ngroups input groups and saves them to the database.
int main(int argc, char *argv[]) {
    std::ios::sync_with_stdio();

    // Strip directory components (and any libtool "lt-" prefix) from argv[0].
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    // Parse switches
    Switches opt;
    int argno = 1, ngenerators = 0;     // ngenerators is counted but not otherwise consulted here
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            usage(0);
        } else if (!strncmp(argv[argno], "--ngroups=", 10)) {
            opt.ngroups = strtoul(argv[argno]+10, NULL, 0);
            opt.ngroups_set = true;
        } else if (!strcmp(argv[argno], "--collection")) {
            opt.single_collection = true;
        } else if (!strncmp(argv[argno], "--collection=", 13)) {
            opt.collection_id = strtoul(argv[argno]+13, NULL, 0);
            opt.collection_id_set = opt.single_collection = true;
        } else if (!strncmp(argv[argno], "--default=", 10)) {
            opt.default_queue = parse_queuename(argv[argno]+10);
        } else if (!strcmp(argv[argno], "--memhash") || !strncmp(argv[argno], "--memhash=", 10)) {
            // --memhash[=A,B,C] shorthand: splits the optional argument into three
            // comma-separated fields (first two default to "0" and "255") and
            // installs a ValuesGenerator plus a RandomGenerator on the IQ_MEMHASH
            // queue.  Exact argument semantics are defined by the generator
            // classes elsewhere — verify there before changing defaults.
            std::string s;
            if (char *equal = strchr(argv[argno], '='))
                s = equal+1;
            std::vector<std::string> valargs = StringUtility::split(',', s, 3);
            valargs.resize(3);
            if (valargs[0].empty())
                valargs[0] = "0";
            if (valargs[1].empty())
                valargs[1] = "255";
            std::vector<std::string> randargs(1, "1");
            randargs.push_back(valargs[2]);
            valargs.pop_back();
            opt.queue_modifiers.push_back(new ValuesGenerator(IQ_MEMHASH, valargs));
            opt.queue_modifiers.push_back(new RandomGenerator(IQ_MEMHASH, randargs));
            ++ngenerators;
        } else {
            std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n"
                      <<"see \"" <<argv0 <<" --help\" for usage info.\n";
            exit(1);
        }
    }
    if (!opt.ngroups_set) {
        std::cerr <<argv0 <<": missing --ngroups switch\n"
                  <<argv0 <<": see --help for more info\n";
        exit(1);
    }

    // Parse non-switch arguments (allow the database connection string to appear anywhere in this list since simplifies
    // the run-analysis.sh script.
    for (/*void*/; argno<argc; argno++) {
        // Arguments of the form QUEUE:NAME[=ARG,ARG,...] attach a generator or
        // filter to the named queue; anything else is taken as the database URL.
        std::vector<std::string> colon_parts = StringUtility::split(':', argv[argno], 2);
        InputQueueName qn = parse_queuename(colon_parts[0]);
        if (IQ_NONE!=qn && 2==colon_parts.size()) {
            std::vector<std::string> equal_parts = StringUtility::split('=', colon_parts[1], 2);
            std::string gname = equal_parts[0];
            std::vector<std::string> args;
            try {
                if (2==equal_parts.size())
                    args = StringUtility::split(',', equal_parts[1], (size_t)-1, true);
                if (0==gname.compare("values")) {
                    opt.queue_modifiers.push_back(new ValuesGenerator(qn, args));
                } else if (0==gname.compare("set") || 0==gname.compare("reset")) {
                    opt.queue_modifiers.push_back(new ResetGenerator(qn, args));
                } else if (0==gname.compare("pad")) {
                    opt.queue_modifiers.push_back(new PaddingGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("random")) {
                    opt.queue_modifiers.push_back(new RandomGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("copy")) {
                    opt.queue_modifiers.push_back(new CopyGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("permute")) {
                    opt.queue_modifiers.push_back(new PermuteFilter(qn, args));
                } else if (0==gname.compare("shuffle")) {
                    opt.queue_modifiers.push_back(new ShuffleFilter(qn, args));
                } else if (0==gname.compare("redirect")) {
                    opt.queue_modifiers.push_back(new RedirectFilter(qn, args));
                } else {
                    std::cerr <<argv0 <<": unknown generator or filter name: " <<gname <<"\n";
                    exit(1);
                }
            } catch (const Exception &e) {
                std::cerr <<argv0 <<": " <<argv[argno] <<": " <<e <<"\n";
                exit(1);
            }
        } else if (transaction==NULL) {
            // First non-generator argument is the database URL.
            transaction = SqlDatabase::Connection::create(argv[argno])->transaction();
        } else {
            std::cerr <<argv0 <<": unknown generator or filter: " <<argv[argno] <<"\n";
            exit(1);
        }
    }
    if (transaction==NULL) {
        std::cerr <<argv0 <<": missing database URL\n"
                  <<argv0 <<": see --help for more info\n";
        exit(1);
    }
    if (opt.queue_modifiers.empty()) {
        if (opt.ngroups>0) {
            std::cerr <<argv0 <<": no generators specified; all input groups would be empty\n";
            exit(1);
        } else {
            exit(0);
        }
    }

    // Get the list of queues that are affected.
    std::vector<int> queue_modified(IQ_NQUEUES, 0);
    for (size_t i=0; i<opt.queue_modifiers.size(); ++i)
        queue_modified[opt.queue_modifiers[i]->queuename] = 1;

    // Generate the inputs.  Group IDs continue from the largest existing ID.
    int64_t cmd_id = start_command(transaction, argc, argv, "generating input groups");
    int first_id = transaction->statement("select coalesce(max(igroup_id),-1)+1 from semantic_inputvalues")->execute_int();
    if (!opt.collection_id_set)
        opt.collection_id = first_id;
    Progress progress(opt.ngroups);
    for (size_t gi=0; gi<opt.ngroups; ++gi) {
        ++progress;
        int igroup_id = first_id + gi;
        InputGroup igroup;
        igroup.set_collection_id(opt.single_collection ? opt.collection_id : igroup_id);

        // All queues initialize redirect to the default queue. If any generator or filter is applied, then
        // we don't do the default redirect.
        for (size_t qi=0; qi<IQ_NQUEUES; ++qi) {
            if (!queue_modified[qi])
                igroup.queue((InputQueueName)qi).redirect(opt.default_queue);
        }

        // Build the queues; each modifier is reseeded per group for reproducibility.
        for (size_t qmi=0; qmi<opt.queue_modifiers.size(); ++qmi) {
            QueueModifier *qm = opt.queue_modifiers[qmi];
            qm->reseed(igroup_id);
            InputQueue &q = igroup.queue(qm->queuename);
            qm->operator()(q, igroup_id);
        }

        // Save all queues
        igroup.save(transaction, igroup_id, cmd_id);
    }
    progress.clear();
    std::string desc = "generated "+StringUtility::numberToString(opt.ngroups)+" input group"+(1==opt.ngroups?"":"s")+
                       " starting at "+StringUtility::numberToString(first_id);
    if (opt.ngroups>0) {
        finish_command(transaction, cmd_id, desc);
        transaction->commit();
    }
    std::cerr <<argv0 <<": " <<desc <<"\n";
    return 0;
}
// LSH-based clone clustering: read feature vectors from the database, dedupe
// exact duplicates, build a locality-sensitive hash table (L1 or L2 norm),
// query it to form clusters, optionally postprocess exact clones, and record
// the results plus timing back into the database.
int main(int argc, char* argv[]) {
    std::string database;
    size_t l = 4, k = 700;                          // LSH parameters: l hash tables, k elements per hash function
    size_t hashTableNumBuckets = 13000000, hashTableElementsPerBucket = 20;
    double distBound = 1.;
    double similarity=1.;
    double r = 4.;
    int norm = 1;
    int groupLow=-1;
    int groupHigh=-1;

    //Timing
    struct timeval before, after;
    struct rusage ru_before, ru_after;
    gettimeofday(&before, NULL);
    getrusage(RUSAGE_SELF, &ru_before);

    bool nodelete = false;
    try {
        options_description desc("Allowed options");
        desc.add_options()
            ("help", "Produce a help message")
            ("nodelete", "Do not delete from vectors")
            ("groupLow,g", value< int >(&groupLow), "The lowest count of elements")
            ("groupHigh,G", value< int >(&groupHigh), "The highest count of elements")
            ("database", value< string >(&database), "The sqlite database that we are to use")
            ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function")
            ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create")
            ("buckets,b", value< size_t >(&hashTableNumBuckets), "The number of buckets in each hash table (buckets may store multiple elements)")
            ("bucket-size,s", value< size_t >(&hashTableElementsPerBucket), "The number of elements that can be stored in each hash table bucket")
            ("similarity,t", value< double >(&similarity), "The similarity threshold that is allowed in a clone pair")
            ("distance,d", value< double >(&distBound), "The maximum distance that is allowed in a clone pair")
            ("interval-size,r", value< double >(&r), "The divisor for the l_2 hash function family")
            ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2)")
            ;
        variables_map vm;
        store(parse_command_line(argc, argv, desc), vm);
        notify(vm);
        // Derive the distance bound from the similarity threshold.
        // NOTE(review): this uses groupLow before the count(...) checks below;
        // if --groupLow is absent (groupLow == -1) and similarity < 1 the sqrt
        // argument is negative (NaN) — confirm callers always pass --groupLow.
        distBound = similarity==1 ? 0.0 : sqrt(2*groupLow*(1.-similarity));
        std::cerr << "similarity " << similarity << " distBound " << distBound << std::endl;
        if (vm.count("help")) {
            cout << desc << endl;
            exit(0);
        }
        if (vm.count("nodelete")) { nodelete = true; }
        if (vm.count("groupLow") == 0) { groupLow = -1; }
        if (vm.count("groupHigh") == 0) { groupHigh = -1; }
        if (database == "") {
            std::cerr << "Missing options. Call as: " << argv[0]
                      << " --database <database-name> [other parameters]" << std::endl;
            exit(1);
        }
        if (hashTableNumBuckets >= (1ULL << 32)) {
            cerr << "Number of buckets must be less than 2**32" << endl;
            exit (1);
        }
        if (norm != 1 && norm != 2) {
            cerr << "Norm must be either 1 or 2" << endl;
            exit (1);
        }
        // Echo the effective parameters (only when we will also be deleting old results).
        if (nodelete == false) {
            cerr << "groupLow: " << groupLow << std::endl;
            cerr << "groupHigh: " << groupHigh << std::endl;
            cerr << "norm: l_" << norm << std::endl;
            cerr << "database: " << database << std::endl;
            cerr << "k: " << k << std::endl;
            cerr << "l: " << l << std::endl;
            cerr << "buckets: " << hashTableNumBuckets << std::endl;
            cerr << "bucket size: " << hashTableElementsPerBucket << std::endl;
            cerr << "distance: " << distBound << std::endl;
            cerr << "r: " << r << std::endl;
        }
    } catch(exception& e) {
        cout << e.what() << "\n";
        exit (1);
    }

    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction();

    // vectors[i] holds one representative per equivalence class of identical
    // vectors; duplicateVectors[i] holds the remaining members of class i.
    scoped_array_with_size<VectorEntry> vectors;
    scoped_array_with_size<scoped_array_with_size<VectorEntry> > duplicateVectors;

    //Step to pass to LSH only the vectors that are not part of an exact clone pass
    {
        std::vector<int> functionsThatWeAreInterestedIn;
        scoped_array_with_size<VectorEntry> allVectors;
        map<string, std::vector<int> > internTable;     // vector bytes -> indexes of identical vectors
        read_vector_data(tx, allVectors, functionsThatWeAreInterestedIn, internTable, groupLow, groupHigh, false);

        //Assign to vectors the first element of each hash bucket
        int numberOfBuckets = 0;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++)
            numberOfBuckets++;
        vectors.allocate(numberOfBuckets);
        duplicateVectors.allocate(numberOfBuckets);
        int indexInVectors=0;
        std::cout << "All is size: " << allVectors.size() << " reduced size is " << vectors.size() << std::endl;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++) {
            // First member becomes the representative; the rest become duplicates (deep-copied).
            if (iItr->second.size()>1)
                duplicateVectors[indexInVectors].allocate(iItr->second.size()-1);
            for (unsigned int i = 0 ; i < iItr->second.size() ; i++) {
                VectorEntry& allVe = allVectors[iItr->second[i]];
                VectorEntry& ve = i == 0 ? vectors[indexInVectors] : duplicateVectors[indexInVectors][i-1];
                ve.rowNumber = allVe.rowNumber;
                ve.functionId = allVe.functionId;
                ve.indexWithinFunction = allVe.indexWithinFunction;
                ve.line = allVe.line;
                ve.offset = allVe.offset;
                ve.compressedCounts.allocate(allVe.compressedCounts.size());
                memcpy(ve.compressedCounts.get(), allVe.compressedCounts.get(), allVe.compressedCounts.size());
            }
            indexInVectors++;
        }
    }

    if (vectors[0].compressedCounts.size() == 0) {
        cerr << "Vector slot 0 is empty" << endl;
        abort();
    }
    // All vectors have the same element count; take it from the first one.
    size_t numVectorElements = getUncompressedSizeOfVector(vectors[0].compressedCounts.get(),
                                                           vectors[0].compressedCounts.size());
    if (debug_messages) {
        cout << "Vectors have " << numVectorElements << " elements" << endl;
        cout << "Number of vectors fetched is " << vectors.size() << std::endl;
    }

    // Build the LSH table for the requested norm.
    LSHTableBase* table = NULL;
    switch (norm) {
        case 1:
            table = new LSHTable<HammingHashFunctionSet, L1DistanceObject>(vectors, L1DistanceObject(), k, l, r,
                                                                           numVectorElements, hashTableNumBuckets,
                                                                           hashTableElementsPerBucket, distBound);
            break;
        case 2:
            table = new LSHTable<StableDistributionHashFunctionSet, L2DistanceObject>(vectors, L2DistanceObject(), k, l, r,
                                                                                      numVectorElements, hashTableNumBuckets,
                                                                                      hashTableElementsPerBucket, distBound);
            break;
        default:
            cerr << "Bad value for --norm" << endl;
            abort(); // Should have been caught earlier
    }
    assert(table);

    // Setup stuff for postprocessing
    int windowSize = 0;
    int stride = 0;
    get_run_parameters(tx, windowSize, stride);
    if (nodelete == false) {
        cerr << "About to delete from clusters" << endl;
        tx->execute("delete from clusters");
        cerr << "... done" << endl;
        cerr << "About to delete from postprocessed_clusters" << endl;
        tx->execute("delete from postprocessed_clusters");
        cerr << "... done" << endl;
    }
    const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2);

    // Get clusters and postprocess them.  liveVectors marks representatives not
    // yet absorbed into an earlier cluster.
    vector<bool> liveVectors(vectors.size(), true);
    size_t clusterNum = 0, postprocessedClusterNum = 0;
    for (size_t i = 0; i < vectors.size(); ++i) { //Loop over vectors
        //Creating potential clusters
        if (!liveVectors[i])
            continue;
        liveVectors[i] = false;
        vector<pair<size_t, double> > clusterElementsRaw = table->query(i); // Pairs are vector number, distance
        vector<pair<uint64_t, double> > clusterElements;
        vector<uint64_t > postprocessedClusterElements;
        clusterElements.push_back(make_pair(i, 0));
        //const VectorEntry& ve = vectors[i];
        for (size_t j = 0; j < clusterElementsRaw.size(); ++j) {
            size_t entry = clusterElementsRaw[j].first;
            //double dist = clusterElementsRaw[j].second;
            // All entries less than i were in previous clusters, so we save an array lookup
            if (entry <= i || !liveVectors[entry])
                continue;
            clusterElements.push_back(clusterElementsRaw[j]);
            liveVectors[entry] = false;
        }
        // Skip singleton clusters with no exact duplicates.
        if (clusterElements.size() < 2 && duplicateVectors[i].size() == 0 )
            continue;

        //Insert raw cluster data
        for (vector<pair<uint64_t, double> >::const_iterator j = clusterElements.begin(); j != clusterElements.end(); ++j) {
            // Each representative's exact duplicates are emitted with the representative's distance.
            // (note: this loop's k shadows the LSH parameter k above)
            for(size_t k = 0; k < duplicateVectors[j->first].size(); k++) {
                const VectorEntry& ve = duplicateVectors[j->first][k];
                insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
            }
            const VectorEntry& ve = vectors[j->first];
            insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
        }
        if (clusterNum % 10000 == 0 && debug_messages)
            cerr << "cluster " << clusterNum << " has " << clusterElements.size() << " elements" << endl;
        ++clusterNum;

        //Postprocessing does not make sense for inexact clones
        if (similarity != 1.0 )
            continue;
        // This implicitly groups elements in the same function together and order by index_within_function in each function
        // Not needed because of the sort in LSHTable::query() which is on the cluster number:
        // std::sort(clusterElements.begin(), clusterElements.end());
        //The next two variables will we initialized in first run
        size_t lastFunctionId=0;
        size_t lastIndexWithinFunction=0;
        bool first = true;
        // Flatten representatives plus duplicates, then sort so overlapping
        // windows within one function can be suppressed.
        std::vector<VectorEntry*> clusterElemPtr;
        for (size_t j = 0; j < clusterElements.size(); ++j) {
            clusterElemPtr.push_back( &vectors[ clusterElements[j].first ] );
            for (size_t k = 0; k < duplicateVectors[clusterElements[j].first].size(); k++)
                clusterElemPtr.push_back(&duplicateVectors[ clusterElements[j].first ][k]);
        }
        std::sort(clusterElemPtr.begin(), clusterElemPtr.end(), compare_rows );
        for (size_t j = 0; j < clusterElemPtr.size(); ++j) {
            const VectorEntry& ve = *clusterElemPtr[j];
            // Keep an element only if it is far enough (in strides) from the previous kept one in the same function.
            if (first || ve.functionId != lastFunctionId ||
                ve.indexWithinFunction >= lastIndexWithinFunction + numStridesThatMustBeDifferent) {
                lastFunctionId = ve.functionId;
                lastIndexWithinFunction = ve.indexWithinFunction;
                postprocessedClusterElements.push_back(j);
            }
            first = false;
        }
        if (postprocessedClusterElements.size() >= 2) {
            //insert post processed data
            for (vector<uint64_t >::const_iterator j = postprocessedClusterElements.begin();
                 j != postprocessedClusterElements.end(); ++j) {
                const VectorEntry& ve = *clusterElemPtr[*j];
                insert_into_postprocessed_clusters(tx, postprocessedClusterNum, ve.functionId, ve.indexWithinFunction,
                                                   ve.rowNumber, 0);
            }
            if (postprocessedClusterNum % 1000 == 0) {
                cerr << "postprocessed cluster " << postprocessedClusterNum
                     << " has " << postprocessedClusterElements.size() << " elements" << endl;
            }
            ++postprocessedClusterNum;
        }
    }
    cerr << clusterNum << " total cluster(s), " << postprocessedClusterNum << " after postprocessing" << endl;

    // Record wall-clock and resource usage for this run.
    gettimeofday(&after, NULL);
    getrusage(RUSAGE_SELF, &ru_after);
    insert_timing(tx, "lsh", groupLow, groupHigh, vectors.size(), k, l, before,after, ru_before, ru_after);
    tx->commit();
    return 0;
}
// Build the work list for function-similarity computation: ensure the output
// group index exists, optionally clear old semantic_funcsim rows for the given
// relation, then print (to stdout) all not-yet-compared function ID pairs.
int main(int argc, char *argv[]) {
    std::ios::sync_with_stdio();

    // Strip directory components (and any libtool "lt-" prefix) from argv[0].
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    int argno = 1;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strcmp(argv[argno], "--delete")) {
            opt.delete_old_data = true;
        } else if (!strncmp(argv[argno], "--exclude-functions=", 20)) {
            opt.exclude_functions_table = argv[argno]+20;
        } else if (!strcmp(argv[argno], "--no-delete")) {
            opt.delete_old_data = false;
        } else if (!strncmp(argv[argno], "--relation=", 11)) {
            opt.relation_id = strtol(argv[argno]+11, NULL, 0);
        } else {
            std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n"
                      <<argv0 <<": see --help for more info\n";
            exit(1);
        }
    };
    if (argno+1!=argc)
        ::usage(1);
    time_t start_time = time(NULL);
    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]);
    SqlDatabase::TransactionPtr tx = conn->transaction();

    // Save ourself in the history if we're modifying the database.
    int64_t cmd_id=-1;
    if (opt.delete_old_data)
        cmd_id = CloneDetection::start_command(tx, argc, argv,
                                               "clearing funcsim data for relation #"+
                                               StringUtility::numberToString(opt.relation_id), start_time);

    // The 32-func-similarity tool needs this index, so we might as well create it here when we're running serially.  The
    // semantic_outputvalues table can be HUGE depending on how the analysis is configured (i.e., whether it saves output
    // values as a vector or set, whether it saves function calls and system calls, etc.).  Since creating the index could
    // take a few minutes, we'd rather not create it if it already exists, but PostgreSQL v8 doesn't have a "CREATE INDEX
    // IF NOT EXISTS" ability.  Therefore, try to create the index right away before we make any other changes, and if
    // creation fails then start a new transaction (because the current one is hosed).
    std::cerr <<argv0 <<": creating output group index (could take a while)\n";
    try {
        // Deliberately shadows the outer tx: a failed CREATE INDEX poisons only
        // this inner transaction, leaving the outer one usable.
        SqlDatabase::TransactionPtr tx = conn->transaction();
        tx->execute("create index idx_ogroups_hashkey on semantic_outputvalues(hashkey)");
        tx->commit();
    } catch (const SqlDatabase::Exception&) {
        std::cerr <<argv0 <<": idx_ogroups_hashkey index already exists; NOT dropping and recreating\n";
    }

    // Delete old data.
    if (opt.delete_old_data)
        tx->statement("delete from semantic_funcsim where relation_id = ?")->bind(0, opt.relation_id)->execute();

    // Get the list of functions that should appear in the worklist.
    std::cerr <<argv0 <<": obtaining function list\n";
    std::string stmt1 = "create temporary table tmp_tested_funcs as"
                        " select distinct fio.func_id as func_id"
                        " from semantic_fio as fio";
    if (!opt.exclude_functions_table.empty()) {
        // --exclude-functions may be "table" or "table.column"; default column is func_id.
        std::vector<std::string> parts = StringUtility::split('.', opt.exclude_functions_table, 2, true);
        if (parts.size()<2)
            parts.push_back("func_id");
        stmt1 += " left join " + parts.front() + " as exclude"
                 " on fio.func_id = exclude." + parts.back() +
                 " where exclude." + parts.back() + " is null";
    }
    tx->execute(stmt1);

    // Create pairs of function IDs for those functions which have been tested and for which no similarity measurement has
    // been computed.  (FIXME: We should probably recompute similarity that might have changed due to rerunning tests or
    // running the same function but with more input groups. [Robb P. Matzke 2013-06-19])
    std::cerr <<argv0 <<": creating work list\n";
    SqlDatabase::StatementPtr stmt2 = tx->statement("select distinct f1.func_id as func1_id, f2.func_id as func2_id"
                                                    " from tmp_tested_funcs as f1"
                                                    " join tmp_tested_funcs as f2 on f1.func_id < f2.func_id"
                                                    " except"
                                                    " select func1_id, func2_id from semantic_funcsim as sim"
                                                    " where sim.relation_id = ?");
    stmt2->bind(0, opt.relation_id);
    for (SqlDatabase::Statement::iterator row=stmt2->begin(); row!=stmt2->end(); ++row)
        std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\n";

    if (cmd_id>=0)
        CloneDetection::finish_command(tx, cmd_id, "cleared funcsim table for relation #"+
                                       StringUtility::numberToString(opt.relation_id));
    tx->commit();
    return 0;
}