DataGenerator::DataGenerator()
{
    srand(0);  // fixed seed for reproducible point clouds
    t = 0;
    current_id = 0;

    // Scatter NUM_POINTS landmarks uniformly in [-3 * MAX_BOX, 3 * MAX_BOX).
    for (int i = 0; i < NUM_POINTS; i++)
    {
        pts[i * 3 + 0] = rand() % (6 * MAX_BOX) - 3 * MAX_BOX;
        pts[i * 3 + 1] = rand() % (6 * MAX_BOX) - 3 * MAX_BOX;
        pts[i * 3 + 2] = rand() % (6 * MAX_BOX) - 3 * MAX_BOX;
    }

    // Camera-to-IMU extrinsics.
    Ric << 0, 0, -1,
          -1, 0, 0,
           0, 1, 0;
    Tic << 0.02, -0.14, 0.0;

    // Isotropic noise covariances for the accelerometer and gyroscope.
    // Calibrated accelerometer covariance, kept for reference:
    // acc_cov << 1.3967e-04, 1.4357e-06, 2.1468e-06,
    //            1.4357e-06, 1.4352e-04, 5.7168e-05,
    //            2.1468e-06, 5.7168e-05, 1.5757e-04;
    acc_cov = 1e-2 * Matrix3d::Identity();
    gyr_cov = 1e-4 * Matrix3d::Identity();

    // Feature covariance: 0.1 px measurement noise divided by the focal
    // lengths fx = 3.6349576068362910e+02, fy = 3.6356654972681025e+02.
    pts_cov << .1 * .1 / 3.6349576068362910e+02 / 3.6349576068362910e+02, 0,
               0, .1 * .1 / 3.6356654972681025e+02 / 3.6356654972681025e+02;

    generator = default_random_engine(0);
    distribution = normal_distribution<double>(0.0, 1.0);
}
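// A minimal sketch (not part of the original class) of how the members set up
// above are typically consumed: draw standard-normal samples from
// distribution(generator) and scale them by the covariance square roots.
// addAccNoise and true_acc are illustrative names; Eigen types are assumed.
Vector3d DataGenerator::addAccNoise(const Vector3d& true_acc)
{
    Vector3d n(distribution(generator), distribution(generator), distribution(generator));
    return true_acc + acc_cov.cwiseSqrt() * n;  // per-axis std = sqrt of the diagonal
}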
/*******************************************************************************
Function: Randomize
Author: Stephanie Athow
Description:
    Randomizes the order of the vectors in a vector of vectors, but does not
    randomize the contents of each inner vector.
Parameters:
    in/out: data - a vector of vectors, where each inner vector contains the
    year, burned acres, and 12 months of PDSI data
Returns: none
*******************************************************************************/
void randomize( vector< vector<double> > & data )
{
    // seed the random number generator from the current time
    unsigned seed = chrono::system_clock::now().time_since_epoch().count();

    // shuffle the rows (one row per year) without touching the values inside each row
    shuffle( data.begin(), data.end(), default_random_engine( seed ) );
}
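// A short usage sketch under the assumptions above: each row is one year of
// data (year, burned acres, 12 monthly PDSI values; abbreviated here).
vector< vector<double> > records = {
    { 1990.0, 1200.5 /* , 12 monthly PDSI values */ },
    { 1991.0,  980.0 /* , ... */ },
};
randomize( records );  // row order is now random; each row's contents are unchanged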
int main(int argc, char* argv[])
{
    if (argc < 4) {
        cout << "usage: ./bayes train-set-file test-set-file mode:n|t [size-of-train-set] [debug-output:f|t]" << endl;
    } else {
        string trainSetFile = argv[1];
        string testSetFile = argv[2];
        bool treeAugmented = argv[3][0] == 't';
        int sizeOfTrainSet = argc >= 5 ? atoi(argv[4]) : 0;
        bool debugOutput = argc >= 6 && argv[5][0] == 't';

        shared_ptr<Dataset> dataset(Dataset::loadDataset(trainSetFile, testSetFile));
        const DatasetMetadata* metadata = dataset->getMetadata();

        vector<Instance*> trainSet(dataset->getTrainSet().begin(), dataset->getTrainSet().end());
        if (sizeOfTrainSet > 0 && sizeOfTrainSet < (int)trainSet.size()) {
            // keep a random subsample of the training set
            unsigned int seed = (unsigned int)chrono::system_clock::now().time_since_epoch().count();
            shuffle(trainSet.begin(), trainSet.end(), default_random_engine(seed));
            trainSet.resize(sizeOfTrainSet);
        }

        BayesNet bayesNet(metadata, trainSet, treeAugmented);
        if (debugOutput) {
            cout << bayesNet.getMutualInfoTable() << endl;
            cout << bayesNet.getMaximalSpanningTree() << endl;
            cout << bayesNet.getProbabilityTables() << endl;
        }
        cout << bayesNet.getBayesNet() << endl;

        const vector<Instance*>& testSet = dataset->getTestSet();
        int correctCount = 0;
        cout << "<Predictions for Test-set Instances>" << endl;
        cout << "Predicted" << DELIMITER << "Actual" << DELIMITER << "Probability" << endl;
        cout.setf(ios::fixed, ios::floatfield);
        cout.precision(PRECISION);
        for (size_t i = 0; i < testSet.size(); ++i) {
            Instance* inst = testSet[i];
            double prob = 0.0;
            string predicted = bayesNet.predict(inst, &prob);
            string actual = inst->toString(metadata, true);
            if (predicted == actual) correctCount++;
            cout << predicted << DELIMITER << actual << DELIMITER << prob << endl;
        }
        cout << correctCount << " out of " << testSet.size()
             << " test instances were correctly classified" << endl;
    }
}
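// Both this driver and the decision-tree driver below subsample the training
// set with the same shuffle-then-resize idiom. A minimal, reusable sketch of
// that pattern (names are illustrative, not from the original code):
template <typename T>
void randomSubsample(vector<T>& items, size_t keep)
{
    if (keep >= items.size()) return;  // nothing to do
    unsigned seed = (unsigned)chrono::system_clock::now().time_since_epoch().count();
    shuffle(items.begin(), items.end(), default_random_engine(seed));
    items.resize(keep);  // keep the first `keep` elements of the random order
}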
int main(int argc, const char * argv[])
{
    if (argc < 4) {
        cout << "usage: ./dt-learn train-set-file test-set-file m [percentage-of-train-set]" << endl;
    } else {
        string trainSetFile = argv[1];
        string testSetFile = argv[2];
        int stopThreshold = atoi(argv[3]);
        int percentageOfTrainSet = argc >= 5 ? atoi(argv[4]) : 100;

        shared_ptr<Dataset> dataset(Dataset::loadDataset(trainSetFile, testSetFile));
        const DatasetMetadata* metadata = dataset->getMetadata();

        vector<Instance*> trainSet(dataset->getTrainSet().begin(), dataset->getTrainSet().end());
        if (percentageOfTrainSet < 100) {
            // keep a random percentage of the training set
            unsigned int seed = (unsigned int)chrono::system_clock::now().time_since_epoch().count();
            shuffle(trainSet.begin(), trainSet.end(), default_random_engine(seed));
            int newSize = (int)(trainSet.size() * percentageOfTrainSet / 100);
            trainSet.resize(newSize);
        }

        DecisionTree tree(metadata, trainSet, stopThreshold);
        cout << tree.toString();

        const vector<Instance*>& testSet = dataset->getTestSet();
        int correctCount = 0;
        cout << "<Predictions for the Test Set Instances>" << endl;
        for (size_t i = 0; i < testSet.size(); ++i) {
            Instance* inst = testSet[i];
            string predicted = tree.predict(inst);
            string actual = inst->toString(metadata, true);
            if (predicted == actual) correctCount++;
            cout << setfill(' ') << setw(3) << (i + 1) << ": ";
            cout << "Actual: " << actual << "  Predicted: " << predicted << endl;
        }
        cout << "Number of correctly classified: " << correctCount
             << "  Total number of test instances: " << testSet.size() << endl;
    }
}
/*
Finalizes a chord progression graph and sets up the random integer generator.
Note: any chords added after calling this function are inaccessible to the
getRandomChord method.
*/
void ProgressionGraph::finalize()
{
    ChordRNG = uniform_int_distribution<>(0, chords.size() - 1);
    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
    generator = default_random_engine(seed);
}
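// A plausible sketch of the getRandomChord method referenced above; the
// original body is not shown, and the Chord return type is assumed.
Chord ProgressionGraph::getRandomChord()
{
    // ChordRNG was bound to [0, chords.size() - 1] in finalize()
    return chords[ChordRNG(generator)];
}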
void generate_data_q2()
{
    long compressed_bytes_docs = 0;
    long compressed_bytes_terms = 0;
    long compressed_bytes_freqs = 0;

    a_exact_docs_oo = input::docs_bench_items(); // check at index 5

    show_info("[P1] Generating random data...");
    a_p1_terms = new unsigned short[input::NUM_TUPLES];
    int index = 0;
    for (int t = 0; t < input::T_PM; ++t) {
        int cnt = pubmed::get_group_by_term(t);
        for (int d = 0; d < cnt; ++d) {
            a_p1_terms[index++] = t;
        }
    }

    // shuffle
    show_info("[P1] Shuffle...");
    shuffle(a_p1_terms, a_p1_terms + input::NUM_TUPLES, default_random_engine(42));

#ifdef HUFFMAN
    // compress
    show_info("[P1] Generating Huffman tree...");
    generate_array_tree_representation(a_p1_terms, input::NUM_TUPLES,
                                       a_p1_huffman_array, a_p1_terminator_array, a_p1_tree);
    encoding_dict<unsigned short> encoding_dict_terms;
    build_inverse_mapping(a_p1_tree, encoding_dict_terms);
    delete a_p1_tree;
#endif

    // single lists
    show_info("[P1] Generating single lists...");
    a_p1_terms_fragments = new unsigned short*[input::D_PM];
    index = 0;
    for (int d = 0; d < input::D_PM; ++d) {
        int num_terms = pubmed::get_group_by_doc(d);
        a_p1_terms_fragments[d] = new unsigned short[num_terms];
        for (int t = 0; t < num_terms; ++t) {
            a_p1_terms_fragments[d][t] = a_p1_terms[index++];
        }
    }
    delete[] a_p1_terms;

#ifdef HUFFMAN
    // compress
    show_info("[P1] Compressing...");
    a_p1_terms_fragments_compressed = new char*[input::D_PM];
    a_p1_terms_fragments_compressed_bytes = new int[input::D_PM];
    for (int d = 0; d < input::D_PM; ++d) {
        a_p1_terms_fragments_compressed_bytes[d] =
            encode(a_p1_terms_fragments[d], pubmed::get_group_by_doc(d),
                   a_p1_terms_fragments_compressed[d], encoding_dict_terms);
        compressed_bytes_terms += a_p1_terms_fragments_compressed_bytes[d];
        delete[] a_p1_terms_fragments[d];
    }
    delete[] a_p1_terms_fragments;
#endif

    show_info("[P2] Generating random data...");
    a_p2_docs = new int[input::NUM_TUPLES];
    a_p2_freqs = new unsigned char[input::NUM_TUPLES];
    index = 0;
    for (int d = 0; d < input::D_PM; ++d) {
        int cnt = pubmed::get_group_by_doc(d);
        for (int t = 0; t < cnt; ++t) {
            // draw a frequency from a rough hand-tuned distribution
            int r = rand() % 100;
            if (r < 25) a_p2_freqs[index] = 1;
            else if (r < 45) a_p2_freqs[index] = 2;
            else if (r < 60) a_p2_freqs[index] = 3;
            else if (r < 70) a_p2_freqs[index] = 4;
            else if (r < 75) a_p2_freqs[index] = 5;
            else if (r < 77) a_p2_freqs[index] = 6;
            else a_p2_freqs[index] = rand() % 40;
            a_p2_docs[index++] = d;
        }
    }

    // shuffle: identically seeded engines apply the same permutation to both
    // equal-length arrays, so the (doc, freq) pairs stay aligned
    show_info("[P2] Shuffle...");
    shuffle(a_p2_docs, a_p2_docs + input::NUM_TUPLES, default_random_engine(42));
    shuffle(a_p2_freqs, a_p2_freqs + input::NUM_TUPLES, default_random_engine(42));

#ifdef HUFFMAN
    // generate Huffman trees
    show_info("[P2] Generating Huffman tree...");
    generate_array_tree_representation(a_p2_docs, input::NUM_TUPLES,
                                       a_p2_huffman_array, a_p2_terminator_array, a_p2_tree);
    encoding_dict<int> encoding_dict_docs;
    build_inverse_mapping(a_p2_tree, encoding_dict_docs);

    generate_array_tree_representation(a_p2_freqs, input::NUM_TUPLES,
                                       a_p2_f_huffman_array, a_p2_f_terminator_array, a_p2_f_tree);
    encoding_dict<unsigned char> encoding_dict_freqs;
    build_inverse_mapping(a_p2_f_tree, encoding_dict_freqs);

    delete a_p2_tree;
    delete a_p2_f_tree;
#endif

    // single lists
    show_info("[P2] Generating single lists...");
    a_p2_docs_fragments = new int*[input::T_PM];
    a_p2_freqs_fragments = new unsigned char*[input::T_PM];
    index = 0;
    for (int t = 0; t < input::T_PM; ++t) {
        int num_docs = pubmed::get_group_by_term(t);
        a_p2_docs_fragments[t] = new int[num_docs];
        a_p2_freqs_fragments[t] = new unsigned char[num_docs];
        for (int d = 0; d < num_docs; ++d) {
            a_p2_freqs_fragments[t][d] = a_p2_freqs[index];
            a_p2_docs_fragments[t][d] = a_p2_docs[index++];
        }
    }
    delete[] a_p2_docs;
    delete[] a_p2_freqs;

#ifdef HUFFMAN
    // compress
    show_info("[P2] Compressing...");
    a_p2_docs_fragments_compressed = new char*[input::T_PM];
    a_p2_freqs_fragments_compressed = new char*[input::T_PM];
    a_p2_docs_fragments_compressed_bytes = new int[input::T_PM];
    a_p2_freqs_fragments_compressed_bytes = new int[input::T_PM];
    for (int t = 0; t < input::T_PM; ++t) {
        a_p2_docs_fragments_compressed_bytes[t] =
            encode(a_p2_docs_fragments[t], pubmed::get_group_by_term(t),
                   a_p2_docs_fragments_compressed[t], encoding_dict_docs);
        delete[] a_p2_docs_fragments[t];
        a_p2_freqs_fragments_compressed_bytes[t] =
            encode(a_p2_freqs_fragments[t], pubmed::get_group_by_term(t),
                   a_p2_freqs_fragments_compressed[t], encoding_dict_freqs);
        delete[] a_p2_freqs_fragments[t];
        compressed_bytes_freqs += a_p2_freqs_fragments_compressed_bytes[t];
        compressed_bytes_docs += a_p2_docs_fragments_compressed_bytes[t];
    }
    delete[] a_p2_docs_fragments;
    delete[] a_p2_freqs_fragments;
#endif

    // touch every generated byte so the data is resident before the benchmark runs
    show_info("Swapping prevention...");
#ifdef HUFFMAN
    int tmp = 0;
    for (int d = 0; d < input::D_PM; ++d) {
        for (int t = 0; t < a_p1_terms_fragments_compressed_bytes[d]; ++t) {
            tmp = (tmp + a_p1_terms_fragments_compressed[d][t]) % 256;
        }
    }
    for (int t = 0; t < input::T_PM; ++t) {
        for (int d = 0; d < a_p2_docs_fragments_compressed_bytes[t]; ++d) {
            tmp = (tmp + a_p2_docs_fragments_compressed[t][d]) % 256;
        }
        for (int d = 0; d < a_p2_freqs_fragments_compressed_bytes[t]; ++d) {
            tmp = (tmp + a_p2_freqs_fragments_compressed[t][d]) % 256;
        }
    }
#else
#ifndef FASTBIT
    int tmp = 0;
    for (int d = 0; d < input::D_PM; ++d) {
        for (int t = 0; t < pubmed::get_group_by_doc(d); ++t) {
            tmp = (tmp + a_p1_terms_fragments[d][t]) % 256;
        }
    }
    for (int t = 0; t < input::T_PM; ++t) {
        // the fragment for term t holds get_group_by_term(t) entries
        // (the original read get_group_by_doc(t) here, which over- or
        // under-runs the fragment)
        for (int d = 0; d < pubmed::get_group_by_term(t); ++d) {
            tmp = (tmp + a_p2_docs_fragments[t][d] + a_p2_freqs_fragments[t][d]) % 256;
        }
    }
#endif
#endif

    show_info("Compressed bytes terms: " << compressed_bytes_terms);
    show_info("Compressed bytes docs: " << compressed_bytes_docs);
    show_info("Compressed bytes freqs: " << compressed_bytes_freqs);
    show_info("DONE.");
}
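// The same-seed double shuffle above relies on std::shuffle applying an
// identical permutation to equal-length ranges when the engines start from
// the same state (this holds within any single standard-library
// implementation, since the algorithm consumes the engine identically for
// equal lengths). A minimal standalone sketch of the idiom:
#include <algorithm>
#include <cassert>
#include <random>

void shuffle_pairs_demo()
{
    int  keys[5]   = {0, 1, 2, 3, 4};
    char values[5] = {'a', 'b', 'c', 'd', 'e'};
    std::shuffle(keys, keys + 5, std::default_random_engine(42));
    std::shuffle(values, values + 5, std::default_random_engine(42));
    for (int i = 0; i < 5; ++i)
        assert(values[i] == 'a' + keys[i]);  // the pairs stayed aligned
}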
void huffman_query_generate_lists()
{
    output::start_timer("run/bench_huffman_query_generate");

    // generate tuples
    show_info("[1] Generating tuples...");
    show_info("Allocating " << input::NUM_TUPLES << " shorts/chars (array).");
    tuples = new unsigned short[input::NUM_TUPLES];
    tuples_freq = new unsigned char[input::NUM_TUPLES];
    show_info("Alloc success");

    long next_index = 0;
    for (short t = 0; t < input::T_PM; ++t) {
        for (long c = 0; c < pubmed::get_group_by_term(t); ++c) {
            if (rand() % 100 < 20) {
                // 20%: arbitrary byte value (rand() truncated to unsigned char)
                tuples_freq[next_index] = rand();
            } else {
                // 80%: small frequency below 15
                tuples_freq[next_index] = rand() % 15;
            }
            tuples[next_index++] = t;
        }
        if (t % (input::T_PM / 1000) == 0)
            debug_n(" " << t * 100.0 / input::T_PM << " % complete. ");
    }
    debug_n(" " << 100 << " % complete. \n");

    // shuffle
    show_info("[2] Shuffling...");
    shuffle(tuples, tuples + input::NUM_TUPLES, default_random_engine(42));

    // generate terms per doc
    show_info("[3] Generating separate lists...");
    terms_per_doc = new unsigned short*[input::D_PM];
    terms_per_doc_size = new unsigned short[input::D_PM];
    freqs_per_doc = new unsigned char*[input::D_PM];
    next_index = 0;
    for (long d = 0; d < input::D_PM; ++d) {
        unsigned short* list = new unsigned short[pubmed::get_group_by_doc(d)];
        memcpy(list, tuples + next_index, pubmed::get_group_by_doc(d) * sizeof(unsigned short));
        terms_per_doc[d] = list;

        unsigned char* freqs_list = new unsigned char[pubmed::get_group_by_doc(d)];
        memcpy(freqs_list, tuples_freq + next_index, pubmed::get_group_by_doc(d) * sizeof(unsigned char));
        freqs_per_doc[d] = freqs_list;

        terms_per_doc_size[d] = pubmed::get_group_by_doc(d);
        // advance past this document's slice (missing in the original,
        // which copied every list from offset 0)
        next_index += pubmed::get_group_by_doc(d);
        if (d % (input::D_PM / 1000) == 0)
            debug_n(" " << d * 100.0 / input::D_PM << " % complete. ");
    }
    debug_n(" " << 100 << " % complete. \n");

#ifdef s_compress
#ifndef use_fastbit
    // compress with Huffman
    show_info("[4] Generating Huffman tree for terms...");
    terms_per_doc_compressed = new char*[input::D_PM];
    generate_array_tree_representation(tuples, input::NUM_TUPLES,
                                       huffman_array_terms, terminator_array_terms, tree);
    encoding_dict<unsigned short> encoding_dict_terms;
    build_inverse_mapping(tree, encoding_dict_terms);
    delete[] tuples;  // array delete (the original used scalar delete)

    show_info("[5] Compressing terms...");
    for (long d = 0; d < input::D_PM; ++d) {
        char* terms_compressed;
        terms_bytes_uncompressed += terms_per_doc_size[d] * sizeof(unsigned short);
        terms_bytes_compressed += encode(terms_per_doc[d], terms_per_doc_size[d],
                                         terms_compressed, encoding_dict_terms);
        terms_per_doc_compressed[d] = terms_compressed;
        delete[] terms_per_doc[d];
        if (d % (input::D_PM / 1000) == 0)
            debug_n(" " << d * 100.0 / input::D_PM << " % complete. ");
    }
    debug_n(" " << 100 << " % complete. \n");
#else
    // compress with FastBit
    show_info("[4] Compressing terms with bit vector...");
    terms_per_doc_bitvector = new ibis::bitvector*[input::D_PM];
    delete[] tuples;
    for (long d = 0; d < input::D_PM; ++d) {
        ibis::bitvector* terms_compressed = new ibis::bitvector();
        terms_bytes_uncompressed += terms_per_doc_size[d] * sizeof(unsigned short);
        for (long term_index = 0; term_index < terms_per_doc_size[d]; ++term_index) {
            terms_compressed->setBit(terms_per_doc[d][term_index], 1);
        }
        terms_compressed->compress();

        // force a fresh allocation of exactly the compressed size
        ibis::array_t<uint32_t>* arr = new ibis::array_t<uint32_t>();
        terms_compressed->write(*arr);
        delete terms_compressed;
        terms_compressed = new ibis::bitvector(*arr);
        delete arr;

        delete[] terms_per_doc[d];
        terms_per_doc_bitvector[d] = terms_compressed;
        terms_bytes_compressed += terms_compressed->bytes();
        if (d % (input::D_PM / 1000) == 0) {
            debug_n(" " << d * 100.0 / input::D_PM << " % complete. Using "
                    << terms_bytes_compressed << " / " << terms_bytes_uncompressed << " bytes. ");
        }
    }
    debug_n(" " << 100 << " % complete. \n");
    show_info("[5] n/a");
#endif

    // compress frequencies
    show_info("[6] Generating Huffman tree for frequencies...");
    freqs_per_doc_compressed = new char*[input::D_PM];
    generate_array_tree_representation(tuples_freq, input::NUM_TUPLES,
                                       huffman_array_freqs, terminator_array_freqs, tree_freqs);
    encoding_dict<unsigned char> encoding_dict_freqs;
    build_inverse_mapping(tree_freqs, encoding_dict_freqs);
    delete[] tuples_freq;

    show_info("[7] Compressing frequencies...");
    for (long d = 0; d < input::D_PM; ++d) {
        char* freqs_compressed;
        freqs_bytes_uncompressed += terms_per_doc_size[d] * sizeof(unsigned char);
        freqs_bytes_compressed += encode(freqs_per_doc[d], terms_per_doc_size[d],
                                         freqs_compressed, encoding_dict_freqs);
        freqs_per_doc_compressed[d] = freqs_compressed;
        delete[] freqs_per_doc[d];
        if (d % (input::D_PM / 1000) == 0)
            debug_n(" " << d * 100.0 / input::D_PM << " % complete. ");
    }
    debug_n(" " << 100 << " % complete. \n");

    delete[] freqs_per_doc;
    delete[] terms_per_doc;

    show_info("terms bytes uncompressed: " << terms_bytes_uncompressed);
    show_info("terms bytes compressed: " << terms_bytes_compressed);
    show_info("freqs bytes uncompressed: " << freqs_bytes_uncompressed);
    show_info("freqs bytes compressed: " << freqs_bytes_compressed);
#else
    delete[] tuples;
    delete[] tuples_freq;
    show_info("No compression.");
#endif

    exact_docs_a = input::docs_bench_items();
    output::stop_timer("run/bench_huffman_query_generate");
    show_info("Done.");
}
/**
 * @param head The linked list's head.
 *        Note that the head is guaranteed to be not null, so it contains at least one node.
 */
Solution(ListNode* head)
{
    this->head = head;
    generator = default_random_engine();
}
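// A plausible companion getRandom() for the constructor above (the original
// method body is not shown). This is classic reservoir sampling: node k
// (1-indexed) replaces the current pick with probability 1/k, so every node
// is returned with equal probability without knowing the list length.
int getRandom()
{
    int result = 0;
    int k = 0;
    for (ListNode* node = head; node != nullptr; node = node->next) {
        ++k;
        uniform_int_distribution<int> dist(1, k);
        if (dist(generator) == 1) result = node->val;
    }
    return result;
}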
default_random_engine randomizer::chance()
{
    // seed a fresh engine from the hardware entropy source
    random_device rd;
    return default_random_engine(rd());
}
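// Brief usage sketch of the factory above (the bounds are illustrative).
// Because each call to chance() constructs and seeds a new engine, obtain
// the engine once and reuse it for repeated draws.
void roll_dice_example()
{
    default_random_engine engine = randomizer::chance();
    uniform_int_distribution<int> die(1, 6);
    for (int i = 0; i < 3; ++i)
        cout << die(engine) << endl;
}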
namespace Diehard
{
    default_random_engine ProblemTweet::random_engine =
        default_random_engine(random_device()());

    ProblemTweet::ProblemTweet() : ProblemDot(*GetRandomCapacities()) {}

    shared_ptr<list<string> > ProblemTweet::GetTweets() const
    {
        list<const Node*> nodes_route;
        auto node_goal = GetGoalNodeRandomly();
        if (!node_goal) return nullptr;

        // walk backwards from the goal node to the empty starting state
        auto node_ptr = node_goal;
        while (node_ptr) {
            nodes_route.push_front(node_ptr);
            if (node_ptr->GetSum() == 0) break;
            node_ptr = GetFromNodeRandomly(node_ptr);
        }

        auto tweets = shared_ptr<list<string> >(new list<string>());
        {
            stringstream ss;
            ss << "Capacities: " << GetName();
            ss << " -> ";
            ss << "Request: " << goal_sum;
            tweets->push_back(ss.str());
        }
        for_each(nodes_route.begin(), nodes_route.end(), [&](const Node* node) {
            stringstream ss;
            ss << "Bucket: " << node->GetName(false);
            tweets->push_back(ss.str());
        });
        {
            stringstream ss;
            ss << "Final result: ";
            for (Dimention d = 0; d < capacities.size(); d++) {
                if (d != 0) ss << "+";
                ss << node_goal->volumes[d];
            }
            ss << "=" << node_goal->GetSum();
            tweets->push_back(ss.str());
        }
        return tweets;
    }

#pragma mark Random generator

    const Node* ProblemTweet::GetGoalNodeRandomly() const
    {
        list<const Node*> nodes_goal;
        for (auto& node : nodes) {
            if (node.is_used && node.cost < Node::cost_max && node.GetSum() == goal_sum)
                nodes_goal.push_back(&node);
        }
        return GetLowestCostRandomly(nodes_goal);
    }

    const Node* ProblemTweet::GetFromNodeRandomly(const Node* node)
    {
        list<const Node*> nodes_from(node->from.begin(), node->from.end());
        return GetLowestCostRandomly(nodes_from);
    }

    const Node* ProblemTweet::GetLowestCostRandomly(const std::list<const Node*>& nodes)
    {
        // find the minimum cost among the candidates
        Node::Cost cost_min = Node::cost_max;
        for_each(nodes.begin(), nodes.end(), [&](const Node* node) {
            if (node->cost < cost_min) cost_min = node->cost;
        });

        // collect all candidates at that cost and pick one uniformly at random
        vector<const Node*> nodes_lowcost;
        for_each(nodes.begin(), nodes.end(), [&](const Node* node) {
            if (node->cost == cost_min) nodes_lowcost.push_back(node);
        });
        if (nodes_lowcost.empty()) {
            return nullptr;
        } else {
            uniform_int_distribution<size_t> dist_idx(0, nodes_lowcost.size() - 1);
            return nodes_lowcost[dist_idx(random_engine)];
        }
    }

    shared_ptr<vector<Volume> > ProblemTweet::GetRandomCapacities()
    {
        uniform_int_distribution<Volume> dist_capacity(2, 200);
        uniform_int_distribution<Dimention> dist_dimention(2, 4);
        auto rand_capacity = bind(dist_capacity, random_engine);

        auto dimention = dist_dimention(random_engine);
        shared_ptr<vector<Volume> > capacities(new vector<Volume>(dimention));
        for (Dimention i = 0; i < dimention; i++) {
            (*capacities)[i] = rand_capacity();
        }
        return capacities;
    }

    Volume ProblemTweet::GetRandomGoal()
    {
        Volume sum = 0;
        for_each(capacities.begin(), capacities.end(), [&](Volume v) { sum += v; });
        uniform_int_distribution<Volume> dist_goal(1, sum);
        return dist_goal(random_engine);
    }
}
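// Brief usage sketch for the class above. It assumes the ProblemDot base
// class has already explored and solved the node graph by the time
// GetTweets() is called; that wiring is outside this fragment.
void tweet_demo()
{
    Diehard::ProblemTweet problem;
    auto tweets = problem.GetTweets();
    if (tweets) {
        for (const auto& line : *tweets)
            cout << line << endl;
    }
}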
void generate_tuples_q5_omc()
{
    int index = 0;

    show_info("[1] Generating terms per doc fragments...");
    t_terms_per_doc = new int[input::NUM_TUPLES];
    for (int term = 0; term < input::T_PM; ++term) {
        for (int i = 0; i < pubmed::get_group_by_term(term); ++i) {
            t_terms_per_doc[index++] = term;
        }
    }
    shuffle(t_terms_per_doc, t_terms_per_doc + input::NUM_TUPLES, default_random_engine(42));

    // split: record where each document's fragment starts (RLE-style directory)
    index = 0;
    t_terms_per_doc_docs = new rle_tuple[input::D_PM];
    for (int doc = 0; doc < input::D_PM; ++doc) {
        t_terms_per_doc_docs[doc].row_id = index;
        t_terms_per_doc_docs[doc].length = pubmed::get_group_by_doc(doc);
        t_terms_per_doc_docs[doc].id = doc;
        index += pubmed::get_group_by_doc(doc);
    }

    show_info("[2] Generating docs per term fragments...");
    t_docs_per_term = new int[input::NUM_TUPLES];
    index = 0;
    for (int doc = 0; doc < input::D_PM; ++doc) {
        for (int i = 0; i < pubmed::get_group_by_doc(doc); ++i) {
            t_docs_per_term[index++] = doc;
        }
    }
    shuffle(t_docs_per_term, t_docs_per_term + input::NUM_TUPLES, default_random_engine(42));

    // split
    index = 0;
    t_docs_per_term_terms = new rle_tuple[input::T_PM];
    for (int term = 0; term < input::T_PM; ++term) {
        t_docs_per_term_terms[term].row_id = index;
        t_docs_per_term_terms[term].length = pubmed::get_group_by_term(term);
        t_docs_per_term_terms[term].id = term;
        index += pubmed::get_group_by_term(term);
    }

    show_info("[3] Generate authors per doc fragments...");
    t_authors_per_doc = new int[input::NUM_TUPLES_DA];
    index = 0;
    for (int author = 0; author < input::A_PM; ++author) {
        for (int i = 0; i < pubmed::get_DA_group_by_author(author); ++i) {
            t_authors_per_doc[index++] = author;
        }
    }
    shuffle(t_authors_per_doc, t_authors_per_doc + input::NUM_TUPLES_DA, default_random_engine(42));

    // split
    index = 0;
    t_authors_per_doc_docs = new rle_tuple[input::D_PM];
    for (int doc = 0; doc < input::D_PM; ++doc) {
        t_authors_per_doc_docs[doc].row_id = index;
        t_authors_per_doc_docs[doc].length = pubmed::get_DA_group_by_doc(doc);
        t_authors_per_doc_docs[doc].id = doc;
        index += pubmed::get_DA_group_by_doc(doc);
    }

    show_info("[4] Generate docs per author fragments...");
    t_docs_per_author = new int[input::NUM_TUPLES_DA];
    index = 0;
    for (int doc = 0; doc < input::D_PM; ++doc) {
        for (int i = 0; i < pubmed::get_DA_group_by_doc(doc); ++i) {
            t_docs_per_author[index++] = doc;
        }
    }
    shuffle(t_docs_per_author, t_docs_per_author + input::NUM_TUPLES_DA, default_random_engine(42));

    // split
    index = 0;
    t_docs_per_author_authors = new rle_tuple[input::A_PM];
    for (int author = 0; author < input::A_PM; ++author) {
        t_docs_per_author_authors[author].row_id = index;
        t_docs_per_author_authors[author].length = pubmed::get_DA_group_by_author(author);
        t_docs_per_author_authors[author].id = author;
        index += pubmed::get_DA_group_by_author(author);
    }

    show_info("[5] Generating years per doc...");
    year_doc = new int[input::D_PM];
    for (int i = 0; i < input::D_PM; ++i) {
        year_doc[i] = rand() % 100 + 1915;  // uniform in [1915, 2014]
    }
}
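// Illustrative sketch (not part of the original code) of reading one fragment
// through the rle_tuple directory built above: the fragment assigned to
// document d occupies t_terms_per_doc[rt.row_id .. rt.row_id + rt.length).
void print_terms_of_doc(int d)
{
    const rle_tuple& rt = t_terms_per_doc_docs[d];
    for (int i = 0; i < rt.length; ++i)
        cout << t_terms_per_doc[rt.row_id + i] << " ";
    cout << endl;
}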
void generate_random_tuples()
{
    debug("Generating " << input::NUM_TUPLES / TUPLES_DIVIDER << " random tuples.");
    output::start_timer("run/top_k_column_db_tf_in_documents_generate_random");

    if (TUPLES_DIVIDER > 1) {
        show_info("Running benchmark with " << input::NUM_TUPLES / TUPLES_DIVIDER
                  << " instead of " << input::NUM_TUPLES << " tuples.");
    }

    c_term = new unsigned short[input::NUM_TUPLES / TUPLES_DIVIDER];
    debug("c_term alloc success");
    c_doc = new unsigned int[input::NUM_TUPLES / TUPLES_DIVIDER];
    debug("c_doc alloc success");
    c_freq = new unsigned char[input::NUM_TUPLES / TUPLES_DIVIDER];
    debug("c_freq alloc success");

    // offsets define where each cluster of identical items begins;
    // initialize with sentinel values well past any valid index
    term_offsets = new long[input::T_PM + 1];
    for (DOMAIN_TYPE i = 0; i < input::T_PM + 1; ++i)
        term_offsets[i] = 20000000000L;
    doc_offsets = new long[input::D_PM + 1];
    for (DOMAIN_TYPE i = 0; i < input::D_PM + 1; ++i)
        doc_offsets[i] = 200000000000L;

    int next_index = 0;
    for (long term = 0; term < input::T_PM; ++term) {
        long times = MAX(1, pubmed::get_group_by_term(term) / TUPLES_DIVIDER);
        term_offsets[term] = next_index;
        for (int i = 0; i < times; ++i) {
            if (next_index < input::NUM_TUPLES / TUPLES_DIVIDER) {
                c_freq[next_index] = rand() % input::b_MAX_FREQUENCY;
                c_term[next_index++] = term;
            }
        }
    }
    if (sorted_by_term == S_UNOPTIMIZED) {
        shuffle(c_term, c_term + input::NUM_TUPLES / TUPLES_DIVIDER, default_random_engine(42));
    }
    debug("Done generating terms and frequencies.");

    next_index = 0;
    for (long doc = 0; doc < input::D_PM; ++doc) {
        long times = MAX(1, pubmed::get_group_by_doc(doc) / TUPLES_DIVIDER);
        doc_offsets[doc] = next_index;
        for (int i = 0; i < times; ++i) {
            if (next_index < input::NUM_TUPLES / TUPLES_DIVIDER) {
                c_doc[next_index++] = doc;
            }
        }
    }
    if (sorted_by_doc == S_UNOPTIMIZED) {
        shuffle(c_doc, c_doc + input::NUM_TUPLES / TUPLES_DIVIDER, default_random_engine(42));
    }
    debug("Done generating documents.");

    output::stop_timer("run/top_k_column_db_tf_in_documents_generate_random");
}
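// A short, illustrative sketch (not in the original) of consuming the offset
// directory when the term column is left in generated (sorted) order: term
// t's cluster starts at term_offsets[t] and holds roughly
// get_group_by_term(t) / TUPLES_DIVIDER entries (at least 1). A robust
// reader would also clamp against the truncated tail of the array.
long sum_freqs_for_term(long t)
{
    long begin = term_offsets[t];
    long count = MAX(1, pubmed::get_group_by_term(t) / TUPLES_DIVIDER);
    long sum = 0;
    for (long i = begin; i < begin + count; ++i)
        sum += c_freq[i];
    return sum;
}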