Example #1
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase){
    bool found;
    std::vector<target_text> translation_entries;
    const Entry * entry;
    //Convert the source phrase to vocabulary IDs
    std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
    //Hashing the whole phrase at once proved too slow:
    //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
    //Instead, use the sum of the individual vocabulary IDs as the key.
    uint64_t key = 0;
    for (size_t i = 0; i < source_phrase_vid.size(); i++){
        key += source_phrase_vid[i];
    }

    found = table.Find(key, entry);


    if (found){
        //The phrase that was searched for was found! We need to get the translation entries.
        //The entry stores the offset of its group of target phrases and their exact
        //size in bytes, so we read exactly that many bytes and decode them.
        uint64_t initial_index = entry->GetValue();
        unsigned int bytes_toread = entry->bytes_toread;
        std::cerr << "Entry size in bytes is: " << bytes_toread << std::endl;

        //Copy the relevant portion of the mmaped array into a vector.
        //TODO: find a more efficient way to do this.
        std::vector<unsigned char> encoded_text;
        encoded_text.reserve(bytes_toread);
        for (size_t i = 0; i < bytes_toread; i++){
            encoded_text.push_back(binary_mmaped[i + initial_index]);
        }

        //Get only the translation entries necessary
        translation_entries = decoder.full_decode_line(encoded_text);

    }

    std::pair<bool, std::vector<target_text> > output (found, translation_entries);

    return output;

}
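For orientation, here is a minimal sketch of how this lookup might be driven. The QueryEngine constructor argument and its loading behaviour are assumptions for illustration; only query() and its return type come from the code above.

//Hypothetical driver for QueryEngine::query. The constructor taking the
//directory produced by createProbingPT is an assumption for this sketch.
#include <iostream>

int main(){
    QueryEngine engine("/path/to/probing_pt");

    std::pair<bool, std::vector<target_text> > result = engine.query(StringPiece("the house"));
    if (result.first){
        std::cout << "Found " << result.second.size() << " translation entries." << std::endl;
    } else {
        std::cout << "Source phrase not found." << std::endl;
    }
    return 0;
}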
Example #2
std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase){
	bool found;
	std::vector<target_text> translation_entries;
	const Entry * entry;
	//Convert the source phrase to vocabulary IDs
	std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
	//Hashing the whole phrase at once proved too slow:
	//uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
	//Instead, use the sum of the individual vocabulary IDs as the key.
	uint64_t key = 0;
	for (size_t i = 0; i < source_phrase_vid.size(); i++){
		key += source_phrase_vid[i];
	}

	found = table.Find(key, entry);


	if (found){
		//The phrase that was searched for was found! We need to get the translation entries.
		//We read largest_entry bytes and then filter out the unnecessary ones with
		//functions from line_splitter.

		uint64_t initial_index = entry->GetValue();
		uint64_t end_index = initial_index + largest_entry;
		//Near the end of the file we can't read initial_index + largest_entry bytes,
		//because that would run past the mapping and segfault.
		//Instead, read until the end of the file.
		if (end_index > binary_filesize){
			end_index = binary_filesize;
		}
		std::string text_entry(&binary_mmaped[initial_index], &binary_mmaped[end_index]);
		StringPiece raw_string(text_entry);

		//Get only the translation entries necessary
		translation_entries = splitTargetLine(raw_string);

	}

	std::pair<bool, std::vector<target_text> > output (found, translation_entries);

	return output;

}
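A note on the key used by both variants: it is just the sum of the per-word vocabulary IDs, and as the comment in createProbingPT below admits, that is "probably not entirely correct". Since addition is order-insensitive, reordered phrases collide. A small self-contained sketch of the problem (the IDs are made up; hash_key mirrors the loop in query()):

#include <cstdint>
#include <iostream>
#include <vector>

//Mirrors the key computation in query(): sum of the per-word vocabulary IDs.
uint64_t hash_key(const std::vector<uint64_t> &vids){
    uint64_t key = 0;
    for (std::size_t i = 0; i < vids.size(); i++){
        key += vids[i];
    }
    return key;
}

int main(){
    //Made-up vocabulary IDs standing in for getVocabIDs("the house")
    //and getVocabIDs("house the").
    std::vector<uint64_t> the_house;
    the_house.push_back(17);
    the_house.push_back(42);
    std::vector<uint64_t> house_the;
    house_the.push_back(42);
    house_the.push_back(17);

    //Both phrases produce the same key, so the hash table cannot tell them apart.
    std::cout << hash_key(the_house) << " == " << hash_key(house_the) << std::endl;
    return 0;
}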
Example #3
void createProbingPT(const char * phrasetable_path, const char * target_path){
    //Get basepath and create directory if missing
    std::string basepath(target_path);
    mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

    //Set up huffman and serialize decoder maps.
    Huffman huffmanEncoder(phrasetable_path); //initialize
    huffmanEncoder.assign_values();
    huffmanEncoder.produce_lookups();
    huffmanEncoder.serialize_maps(target_path);

    //Get the number of unique lines:
    unsigned long uniq_entries = huffmanEncoder.getUniqLines();

    //Source phrase vocabids
    std::map<uint64_t, std::string> source_vocabids;

    //Read the file
    util::FilePiece filein(phrasetable_path);

    //Init the probing hash table
    size_t size = Table::Size(uniq_entries, 1.2);
    char * mem = new char[size];
    memset(mem, 0, size);
    Table table(mem, size);

    BinaryFileWriter binfile(basepath); //Init the binary file writer.

    line_text prev_line; //Used to check whether the current line has the same source phrase as the previous one.

    //Keep track of the size of each group of target phrases
    uint64_t entrystartidx = 0;


    //Read everything and process
    while(true){
        try {
            //Process line read
            line_text line;
            line = splitLine(filein.ReadLine());
            //Add source phrases to vocabularyIDs
            add_to_map(&source_vocabids, line.source_phrase);

            if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
                //For the first iteration, assume the previous line is the same as this one.
                prev_line = line;
            }

            if (line.source_phrase != prev_line.source_phrase){

                //The source phrase changed, so create an entry for the previous source phrase:
                Entry pesho;
                pesho.value = entrystartidx;
                //The key is the sum of the hashes of the individual words.
                //Probably not entirely correct, but fast.
                pesho.key = 0;
                std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
                for (size_t i = 0; i < vocabid_source.size(); i++){
                    pesho.key += vocabid_source[i];
                }
                pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;

                //Put into table
                table.Insert(pesho);

                entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry

                //Encode a line and write it to disk.
                std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
                binfile.write(&encoded_line);

                //Set prevLine
                prev_line = line;

            } else{
                //If we still have the same line, just append to it:
                std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
                binfile.write(&encoded_line);
            }

        } catch (const util::EndOfFileException &e){
            std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
            binfile.flush();

            //After the final entry is constructed we need to add it to the phrase_table
            //Create an entry for the previous source phrase:
            Entry pesho;
            pesho.value = entrystartidx;
            //The key is the sum of the hashes of the individual words. Probably not entirely correct, but fast.
            pesho.key = 0;
            std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
            for (size_t i = 0; i < vocabid_source.size(); i++){
                pesho.key += vocabid_source[i];
            }
            pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
            //Put into table
            table.Insert(pesho);

            break;
        }
    }

    serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());

    serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
    
    delete[] mem;

    //Write configfile
    std::ofstream configfile;
    configfile.open((basepath + "/config").c_str());
    configfile << uniq_entries << '\n';
    configfile.close();
}
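Finally, a minimal sketch of how createProbingPT might be invoked from a small command-line tool. The argument handling is an assumption; only the function signature is taken from the code above.

//Hypothetical command-line wrapper around createProbingPT.
#include <cstdlib>
#include <iostream>

int main(int argc, char *argv[]){
    if (argc != 3){
        std::cerr << "Usage: " << argv[0] << " <phrase_table_path> <output_dir>" << std::endl;
        return EXIT_FAILURE;
    }
    //Builds the probing hash table, the encoded binary phrase file, the source
    //vocab map and the config file inside <output_dir>.
    createProbingPT(argv[1], argv[2]);
    return EXIT_SUCCESS;
}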