bool ParallelCorpus::ReadDocumentPairs(const string& source_file, const string& target_file) { vector<Document> source_docs; std::ifstream source_in(source_file.c_str()); if (source_in) { ReadDocuments(&source_in, &(source_docs), &source_vocab_, source_stemming_); } else { return false; } source_in.close(); vector<Document> target_docs; std::ifstream target_in(target_file.c_str()); if (target_in) { ReadDocuments(&target_in, &(target_docs), &target_vocab_, target_stemming_); } else { return false; } target_in.close(); if (source_docs.size() != target_docs.size()) { return false; } for (int i = 0; i < source_docs.size(); ++i) { DocumentPair doc_pair; doc_pair.first.swap(source_docs.at(i)); doc_pair.second.swap(target_docs.at(i)); doc_pairs_.push_back(doc_pair); } return true; }
int main(int argc, char *argv[]) { namespace po = boost::program_options; namespace fs = boost::filesystem; po::options_description desc("Allowed options"); desc.add_options() ("input,i", po::value<std::string>(), "input file") ("output,o", po::value<std::string>(), "output file") ("cpp,x", "produce cpp/header file on successful compilation") ("platform,p", po::value<unsigned>(), "set platform") ("device,d", po::value<unsigned>(), "set device") ("options,c", po::value<std::string>(), "compile options") ("help,h", "print this help message") ("list,l", "list platforms and devices"); po::positional_options_description pos; pos.add("input", 1); pos.add("output", 1); po::variables_map vm; try { po::store( po::command_line_parser(argc, argv).options(desc) .positional(pos).run(), vm); } catch(std::exception &e) { std::cerr << "options parsing error.\n" << desc << std::endl; return EXIT_FAILURE; } po::notify(vm); if(vm.count("help")) { std::cout << desc << std::endl; return EXIT_FAILURE; } if(vm.count("list")) { std::cout << "Accessible platforms:\n"; std::vector<cl::platform_ref> platforms; cl::platform_ref::get_platforms(platforms); for(unsigned i=0; i<platforms.size(); ++i) { cl::platform_ref p = platforms[i]; std::cout << i << ") " << p.name() << "\n"; std::cout << "\tAssociated devices:\n"; std::vector<cl::device_ref> devices; p.get_devices(devices); for(unsigned j=0; j<devices.size(); ++j) { cl::device_ref d = devices[j]; std::cout << "\t" << j << ") " << d.name() << "\n"; } } return EXIT_SUCCESS; } std::string input_path; std::string output_path; std::string options; bool do_output = false; unsigned platform_id = 0; unsigned device_id = 0; if(vm.count("input")) { input_path = vm["input"].as<std::string>(); } else { std::cout << "must provide input file.\n"; return EXIT_FAILURE; } if(vm.count("output")) { output_path = vm["output"].as<std::string>(); do_output = true; } if(vm.count("platform")) { platform_id = vm["platform"].as<unsigned>(); } if(vm.count("device")) { device_id = vm["device"].as<unsigned>(); } if(vm.count("options")) { options = vm["options"].as<std::string>(); } std::stringstream ss; std::ifstream in(input_path.c_str()); std::string line; while(std::getline(in, line)) { ss << line << "\n"; } std::vector<cl::platform_ref> platforms; std::vector<cl::device_ref> devices; cl::platform_ref::get_platforms(platforms); if(platform_id >= platforms.size()) { std::cerr << "invalid platform id\n"; return EXIT_FAILURE; } cl::platform_ref platform = platforms[platform_id]; platform.get_devices(devices); if(device_id >= devices.size()) { std::cerr << "invalid device id\n"; return EXIT_FAILURE; } cl::device_ref device = devices[device_id]; cl::context_ref context(platform, device); cl::program_ref program(context, ss.str()); std::cout << "context is " << platform.name() << "/" << device.name() << "\n"; std::cout << "OpenCL compiler options: " << options << "\n"; try { program.build(options); std::cout << "build successful!\n"; if(vm.count("cpp")) { fs::path input_path_obj(input_path); // older versions of boost don't have this method //std::string file_base = input_path_obj.stem(); std::size_t start = input_path.find_last_of('/'); if(start == std::string::npos) { start = 0; } else { ++start; } std::string file_base = input_path.substr(start, input_path.find_last_of('.')-start); std::string file_base_upper = file_base; boost::to_upper(file_base_upper); // write header std::stringstream filess; filess << input_path << ".hpp"; std::ofstream header_out(filess.str().c_str()); filess.str(""); filess << "#ifndef _" << file_base_upper << "_OPENCL_HPP_" << "\n"; filess << "#define _" << file_base_upper << "_OPENCL_HPP_" << "\n"; filess << "\n"; filess << "extern const char *" << file_base << "_opencl_source;" << "\n"; filess << "\n"; filess << "#endif\n\n"; header_out << filess.str(); header_out.close(); // write cpp file boost::regex rx; rx.assign( "(\\\\)|" "(\")"); const char *format = "(?1\\\\\\\\)" "(?2\\\\\")"; filess.str(""); filess << input_path << ".cpp"; std::ofstream source_out(filess.str().c_str()); std::ifstream source_in(input_path.c_str()); filess.str(""); filess << "const char *" << file_base << "_opencl_source = " << "\n"; while(getline(source_in, line)) { std::string sanitized_line = boost::regex_replace(line, rx, format, boost::match_default | boost::format_all); filess << "\t\"" << sanitized_line << "\\n\"\n"; } filess << ";"; source_out << filess.str(); source_out.close(); } } catch(const cl::cl_error &c) { std::cout << "error building program\n"; } std::cout << "build log:\n" << program.get_build_log(device) << "\n"; return EXIT_SUCCESS; }
bool ParallelCorpus::ReadParallelData(const string& source_file, const string& target_file) { typedef boost::tokenizer<boost::char_separator<char> > tokenizer; boost::char_separator<char> sep(" \t"); std::string line; vector<Sentence> source_sents; std::ifstream source_in(source_file.c_str()); if (source_in.good()) { Document doc; while (getline(source_in, line)) { Sentence current_sentence; tokenizer line_tokenizer(line, sep); for (tokenizer::iterator it = line_tokenizer.begin(); it != line_tokenizer.end(); ++it) { string token = *it; if (use_lowercase_) { boost::to_lower(token); } if (source_stemming_) { Stem(token); } current_sentence.push_back(source_vocab_.AddWord(token)); } source_sents.push_back(current_sentence); } source_in.close(); } else { return false; } vector<Sentence> target_sents; std::ifstream target_in(target_file.c_str()); if (target_in.good()) { Document doc; while (getline(target_in, line)) { Sentence current_sentence; tokenizer line_tokenizer(line, sep); for (tokenizer::iterator it = line_tokenizer.begin(); it != line_tokenizer.end(); ++it) { string token = *it; if (use_lowercase_) { boost::to_lower(token); } if (target_stemming_) { Stem(token); } current_sentence.push_back(target_vocab_.AddWord(token)); } target_sents.push_back(current_sentence); } target_in.close(); } else { return false; } if (source_sents.size() != target_sents.size()) { return false; } for (int i = 0; i < source_sents.size(); ++i) { //if ((source_sents.at(i).size() > 0) // && (target_sents.at(i).size() > 0)) { DocumentPair doc_pair; doc_pair.first.push_back(source_sents.at(i)); doc_pair.second.push_back(target_sents.at(i)); doc_pairs_.push_back(doc_pair); //} } return true; }