std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase) {
  bool found;
  std::vector<target_text> translation_entries;
  const Entry * entry;

  //Convert the source phrase to vocab IDs
  std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);

  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
  uint64_t key = 0;
  for (size_t i = 0; i < source_phrase_vid.size(); i++) {
    key += source_phrase_vid[i];
  }

  found = table.Find(key, entry);

  if (found) {
    //The phrase that was searched for was found! We need to get the translation entries.
    //We read exactly bytes_toread bytes starting at the entry's offset and decode them
    //with functions from line_splitter. Reading the exact byte count also avoids running
    //past the end of the file (and segfaulting) the way a fixed largest_entry read could.
    uint64_t initial_index = entry->GetValue();
    unsigned int bytes_toread = entry->bytes_toread;
    std::cerr << "Entry size in bytes is: " << bytes_toread << std::endl;

    //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
    //Copy the relevant portion of the mmapped array into the vector.
    std::vector<unsigned char> encoded_text;
    encoded_text.reserve(bytes_toread);
    for (unsigned int i = 0; i < bytes_toread; i++) {
      encoded_text.push_back(binary_mmaped[i + initial_index]);
    }

    //Get only the translation entries necessary
    translation_entries = decoder.full_decode_line(encoded_text);
  }

  std::pair<bool, std::vector<target_text> > output(found, translation_entries);
  return output;
}
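//A minimal standalone sketch (not part of the original source) illustrating why the
//sum-of-word-hashes key above is flagged as "probably not entirely correct": summation
//is order-independent, so reordered phrases produce the same key and collide in the
//table. std::hash<std::string> stands in here for the real per-word vocab ID hashing.

#include <cstdint>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>

static uint64_t summed_key(const std::string &phrase) {
  std::istringstream words(phrase);
  std::string word;
  uint64_t key = 0;
  while (words >> word) {
    key += std::hash<std::string>()(word); //stand-in for hashing a vocab ID
  }
  return key;
}

int main() {
  //Both phrases map to the same key, so the hash lookup alone cannot tell them apart.
  std::cout << summed_key("der kleine Hund") << '\n';
  std::cout << summed_key("Hund kleine der") << '\n';
  return 0;
}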
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase) {
  bool found;
  std::vector<target_text> translation_entries;
  const Entry * entry;

  //Convert the source phrase to vocab IDs
  std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);

  //TOO SLOW
  //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
  uint64_t key = 0;
  for (size_t i = 0; i < source_phrase_vid.size(); i++) {
    key += source_phrase_vid[i];
  }

  found = table.Find(key, entry);

  if (found) {
    //The phrase that was searched for was found! We need to get the translation entries.
    //We read the largest entry size in bytes and then filter out the unnecessary parts
    //with functions from line_splitter.
    uint64_t initial_index = entry->GetValue();
    uint64_t end_index = initial_index + largest_entry;

    //At the end of the file we can't read initial_index + largest_entry bytes, because
    //we would run past the mapping and segfault. Instead, read until the end of the file.
    if (end_index > binary_filesize) {
      end_index = binary_filesize;
    }

    std::string text_entry(&binary_mmaped[initial_index], &binary_mmaped[end_index]);
    StringPiece raw_string = StringPiece(text_entry);

    //Get only the translation entries necessary
    translation_entries = splitTargetLine(raw_string);
  }

  std::pair<bool, std::vector<target_text> > output(found, translation_entries);
  return output;
}
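//A minimal usage sketch for the query functions above: how a caller consumes the
//returned pair. Only QueryEngine::query, target_text, and StringPiece from this
//codebase are assumed; the report_translations name is illustrative, and how the
//engine itself is constructed is left to the actual header.
void report_translations(QueryEngine &engine, StringPiece source_phrase) {
  std::pair<bool, std::vector<target_text> > result = engine.query(source_phrase);
  if (result.first) {
    //result.second holds one target_text per translation option for the phrase.
    std::cerr << "Found " << result.second.size() << " translation entries." << std::endl;
  } else {
    std::cerr << "Source phrase not found in the table." << std::endl;
  }
}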
void createProbingPT(const char * phrasetable_path, const char * target_path) {
  //Get basepath and create the directory if missing
  std::string basepath(target_path);
  mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

  //Set up the Huffman encoder and serialize the decoder maps.
  Huffman huffmanEncoder(phrasetable_path); //initialize
  huffmanEncoder.assign_values();
  huffmanEncoder.produce_lookups();
  huffmanEncoder.serialize_maps(target_path);

  //Get the number of unique lines:
  unsigned long uniq_entries = huffmanEncoder.getUniqLines();

  //Source phrase vocab IDs
  std::map<uint64_t, std::string> source_vocabids;

  //Read the file
  util::FilePiece filein(phrasetable_path);

  //Init the probing hash table
  size_t size = Table::Size(uniq_entries, 1.2);
  char * mem = new char[size];
  memset(mem, 0, size);
  Table table(mem, size);

  BinaryFileWriter binfile(basepath); //Init the binary file writer.

  line_text prev_line;        //Used to check if the source phrase of the previous line is the same
  uint64_t entrystartidx = 0; //Keeps track of the size of each group of target phrases
  //uint64_t line_num = 0;

  //Read everything and process
  while (true) {
    try {
      //Process the line that was read
      line_text line = splitLine(filein.ReadLine());

      //Add source phrases to vocabulary IDs
      add_to_map(&source_vocabids, line.source_phrase);

      if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
        prev_line = line; //For the first iteration assume the previous line is
      }                   //the same as this one.

      if (line.source_phrase != prev_line.source_phrase) {
        //The source phrase has changed, so create an entry for the previous source phrase:
        Entry pesho;
        pesho.value = entrystartidx;
        //The key is the sum of hashes of individual words. Probably not entirely correct, but fast.
        pesho.key = 0;
        std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
        for (size_t i = 0; i < vocabid_source.size(); i++) {
          pesho.key += vocabid_source[i];
        }
        pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;

        //Put into table
        table.Insert(pesho);

        entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for the new entry

        //Encode the line and write it to disk.
        std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
        binfile.write(&encoded_line);

        //Set prev_line
        prev_line = line;
      } else {
        //If we still have the same source phrase, just append to the current group:
        std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
        binfile.write(&encoded_line);
      }
    } catch (const util::EndOfFileException &e) {
      std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
      binfile.flush();

      //After the final group is read we still need to add its entry to the phrase table.
      //Create an entry for the previous source phrase:
      Entry pesho;
      pesho.value = entrystartidx;
      //The key is the sum of hashes of individual words. Probably not entirely correct, but fast.
      pesho.key = 0;
      std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
      for (size_t i = 0; i < vocabid_source.size(); i++) {
        pesho.key += vocabid_source[i];
      }
      pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;

      //Put into table
      table.Insert(pesho);
      break;
    }
  }

  serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
  serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());

  delete[] mem;

  //Write the config file
  std::ofstream configfile;
  configfile.open((basepath + "/config").c_str());
  configfile << uniq_entries << '\n';
  configfile.close();
}
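//A minimal driver sketch showing how createProbingPT might be invoked; the argument
//handling here is illustrative, not part of the original source. The output directory
//receives probing_hash.dat, source_vocabids, the serialized Huffman maps, and the
//config file written above.
int main(int argc, char* argv[]) {
  if (argc != 3) {
    std::cerr << "Usage: " << argv[0] << " phrase_table_path output_dir" << std::endl;
    return 1;
  }
  createProbingPT(argv[1], argv[2]);
  return 0;
}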