void imagenet_toolset::prepare_validating_data() { std::vector<unsigned int> classid_list; { boost::filesystem::path validating_class_labels_filepath = get_input_data_folder() / devkit_folder_name / devkit_data_folder_name / validation_ground_truth_file_name; std::cout << "Reading ground truth labels from " + validating_class_labels_filepath.string() << "..." << std::endl; boost::filesystem::ifstream file_input(validating_class_labels_filepath, std::ios_base::in); std::string str; while (true) { std::getline(file_input, str); if (str.empty()) break; unsigned int wnid = atol(str.c_str()); unsigned int classid = get_classid_by_wnid(wnid); classid_list.push_back(classid); } } std::cout << classid_list.size() << " labels read\n"; nnforge::varying_data_stream_writer_smart_ptr validating_data_writer; { boost::filesystem::path validating_file_path = get_working_data_folder() / validating_data_filename; std::cout << "Writing validating data to " << validating_file_path.string() << "..." << std::endl; nnforge_shared_ptr<std::ofstream> validating_file(new boost::filesystem::ofstream(validating_file_path, std::ios_base::out | std::ios_base::binary | std::ios_base::trunc)); validating_data_writer = nnforge::varying_data_stream_writer_smart_ptr(new nnforge::varying_data_stream_writer( validating_file, static_cast<unsigned int>(classid_list.size()))); } boost::filesystem::path validating_images_folder_path = get_input_data_folder() / validating_images_folder_name; for(int i = 0; i < classid_list.size(); ++i) { unsigned int class_id = classid_list[i]; unsigned int image_id = i + 1; boost::filesystem::path image_file_path = validating_images_folder_path / (boost::format("ILSVRC2012_val_%|1$08d|.JPEG") % image_id).str(); write_supervised_data(image_file_path, *validating_data_writer, class_id); } std::cout << classid_list.size() << " entries written" << std::endl; }
void gtsrb_toolset::write_folder( nnforge::supervised_data_stream_writer& writer, const boost::filesystem::path& relative_subfolder_path, const char * annotation_file_name, bool jitter) { boost::filesystem::path subfolder_path = get_input_data_folder() / relative_subfolder_path; boost::filesystem::path annotation_file_path = subfolder_path / annotation_file_name; std::cout << "Reading input data from " << subfolder_path.string() << std::endl; boost::filesystem::ifstream file_input(annotation_file_path, std::ios_base::in); nnforge::random_generator generator = nnforge::rnd::get_random_generator(); std::tr1::uniform_real<float> rotate_angle_distribution(-max_rotation_angle_in_degrees, max_rotation_angle_in_degrees); std::tr1::uniform_real<float> scale_distribution(1.0F / max_scale_factor, max_scale_factor); std::tr1::uniform_real<float> shift_distribution(-max_shift, max_shift); std::tr1::uniform_real<float> contrast_distribution(1.0F / max_contrast_factor, max_contrast_factor); std::tr1::uniform_real<float> brightness_shift_distribution(-max_brightness_shift, max_brightness_shift); std::string str; std::getline(file_input, str); // read the header while (true) { std::getline(file_input, str); std::vector<std::string> strs; boost::split(strs, str, boost::is_any_of(";")); if (strs.size() != 8) break; std::string file_name = strs[0]; boost::filesystem::path absolute_file_path = subfolder_path / file_name; char* end; unsigned int top_left_x = static_cast<unsigned int>(strtol(strs[3].c_str(), &end, 10)); unsigned int top_left_y = static_cast<unsigned int>(strtol(strs[4].c_str(), &end, 10)); unsigned int bottom_right_x = static_cast<unsigned int>(strtol(strs[5].c_str(), &end, 10)); unsigned int bottom_right_y = static_cast<unsigned int>(strtol(strs[6].c_str(), &end, 10)); unsigned int class_id = static_cast<unsigned int>(strtol(strs[7].c_str(), &end, 10)); if (jitter) { for(int i = 0; i < random_sample_count; ++i) { float rotation_angle = rotate_angle_distribution(generator); float scale = scale_distribution(generator); float shift_x = shift_distribution(generator); float shift_y = shift_distribution(generator); float contrast = contrast_distribution(generator); float brightness_shift = brightness_shift_distribution(generator); write_single_entry( writer, absolute_file_path, class_id, top_left_x, top_left_y, bottom_right_x, bottom_right_y, rotation_angle, scale, shift_x, shift_y, contrast, brightness_shift); } } else { write_single_entry( writer, absolute_file_path, class_id, top_left_x, top_left_y, bottom_right_x, bottom_right_y); } } }
void imagenet_toolset::prepare_randomized_training_data() { boost::filesystem::path training_images_folder_path = get_input_data_folder() / training_images_folder_name; unsigned int total_training_image_count = 0; std::cout << "Enumerating training images from " + training_images_folder_path.string() << "..." << std::endl; std::map<std::string, std::vector<unsigned int> > ilsvrc2014id_to_localid_list_map; { nnforge_regex folder_expression(ilsvrc2014id_pattern); nnforge_regex file_expression(training_image_filename_pattern); nnforge_cmatch what; for(boost::filesystem::directory_iterator it = boost::filesystem::directory_iterator(training_images_folder_path); it != boost::filesystem::directory_iterator(); ++it) { if (it->status().type() == boost::filesystem::directory_file) { boost::filesystem::path folder_path = it->path(); std::string folder_name = folder_path.filename().string(); if (nnforge_regex_match(folder_name, folder_expression)) { const std::string& ilsvrc2014id = folder_name; unsigned int class_id = get_classid_by_wnid(get_wnid_by_ilsvrc2014id(ilsvrc2014id)); std::vector<unsigned int>& localid_list = ilsvrc2014id_to_localid_list_map.insert(std::make_pair(ilsvrc2014id, std::vector<unsigned int>())).first->second; for(boost::filesystem::directory_iterator it2 = boost::filesystem::directory_iterator(folder_path); it2 != boost::filesystem::directory_iterator(); ++it2) { if (it2->status().type() == boost::filesystem::regular_file) { boost::filesystem::path file_path = it2->path(); std::string file_name = file_path.filename().string(); if (nnforge_regex_search(file_name.c_str(), what, file_expression)) { std::string ilsvrc2014id2 = std::string(what[1].first, what[1].second); int localid = atol(std::string(what[2].first, what[2].second).c_str()); localid_list.push_back(localid); ++total_training_image_count; } } } } } } } std::cout << total_training_image_count << " training images found\n"; std::map<std::string, std::pair<unsigned int, float> > ilsvrc2014id_to_localid_count_and_remaining_ratio_map; for(std::map<std::string, std::vector<unsigned int> >::iterator it = ilsvrc2014id_to_localid_list_map.begin(); it != ilsvrc2014id_to_localid_list_map.end(); ++it) ilsvrc2014id_to_localid_count_and_remaining_ratio_map.insert(std::make_pair(it->first, std::make_pair(it->second.size(), it->second.size() > 0 ? 1.0F : 0.0F))); nnforge::random_generator rnd; nnforge::varying_data_stream_writer_smart_ptr training_data_writer; { boost::filesystem::path training_file_path = get_working_data_folder() / training_randomized_data_filename; std::cout << "Writing randomized training data to " << training_file_path.string() << "..." << std::endl; nnforge_shared_ptr<std::ofstream> training_file(new boost::filesystem::ofstream(training_file_path, std::ios_base::out | std::ios_base::binary | std::ios_base::trunc)); training_data_writer = nnforge::varying_data_stream_writer_smart_ptr(new nnforge::varying_data_stream_writer( training_file, total_training_image_count)); } std::vector<std::string> best_ilsvrc2014id_list; for(unsigned int entry_to_write_count = 0; entry_to_write_count < total_training_image_count; ++entry_to_write_count) { if (best_ilsvrc2014id_list.empty()) { float best_ratio = -1.0F; for(std::map<std::string, std::pair<unsigned int, float> >::const_iterator it = ilsvrc2014id_to_localid_count_and_remaining_ratio_map.begin(); it != ilsvrc2014id_to_localid_count_and_remaining_ratio_map.end(); ++it) { float new_ratio = it->second.second; if (new_ratio > best_ratio) { best_ilsvrc2014id_list.clear(); best_ilsvrc2014id_list.push_back(it->first); best_ratio = new_ratio; } else if (new_ratio == best_ratio) best_ilsvrc2014id_list.push_back(it->first); } } std::string best_ilsvrc2014id; { nnforge_uniform_int_distribution<unsigned int> dist(0, static_cast<unsigned int>(best_ilsvrc2014id_list.size()) - 1); unsigned int index = dist(rnd); best_ilsvrc2014id = best_ilsvrc2014id_list[index]; best_ilsvrc2014id_list[index] = best_ilsvrc2014id_list.back(); best_ilsvrc2014id_list.pop_back(); } std::map<std::string, std::vector<unsigned int> >::iterator bucket_it = ilsvrc2014id_to_localid_list_map.find(best_ilsvrc2014id); std::vector<unsigned int>& localid_list = bucket_it->second; if (localid_list.empty()) throw std::runtime_error("Unexpected error in prepare_training_data: No elements left"); nnforge_uniform_int_distribution<unsigned int> dist(0, static_cast<unsigned int>(localid_list.size()) - 1); unsigned int index = dist(rnd); unsigned int local_id = localid_list[index]; unsigned int leftover_local_id = localid_list[localid_list.size() - 1]; localid_list[index] = leftover_local_id; localid_list.pop_back(); std::map<std::string, std::pair<unsigned int, float> >::iterator it = ilsvrc2014id_to_localid_count_and_remaining_ratio_map.find(best_ilsvrc2014id); it->second.second = static_cast<float>(localid_list.size()) / static_cast<float>(it->second.first); std::string filename = (boost::format("%1%_%2%.JPEG") % best_ilsvrc2014id % local_id).str(); boost::filesystem::path image_file_path = training_images_folder_path / best_ilsvrc2014id / filename; int class_id = get_classid_by_wnid(get_wnid_by_ilsvrc2014id(best_ilsvrc2014id)); write_supervised_data(image_file_path, *training_data_writer, class_id); if (((entry_to_write_count + 1) % 100000) == 0) std::cout << (entry_to_write_count + 1) << " entries written" << std::endl; } std::cout << total_training_image_count << " entries written" << std::endl; }
void imagenet_toolset::prepare_true_randomized_training_data() { boost::filesystem::path training_images_folder_path = get_input_data_folder() / training_images_folder_name; std::cout << "Enumerating training images from " + training_images_folder_path.string() << "..." << std::endl; std::vector<std::pair<std::string, unsigned int> > ilsvrc2014id_localid_pair_list; { nnforge_regex folder_expression(ilsvrc2014id_pattern); nnforge_regex file_expression(training_image_filename_pattern); nnforge_cmatch what; for(boost::filesystem::directory_iterator it = boost::filesystem::directory_iterator(training_images_folder_path); it != boost::filesystem::directory_iterator(); ++it) { if (it->status().type() == boost::filesystem::directory_file) { boost::filesystem::path folder_path = it->path(); std::string folder_name = folder_path.filename().string(); if (nnforge_regex_match(folder_name, folder_expression)) { const std::string& ilsvrc2014id = folder_name; unsigned int class_id = get_classid_by_wnid(get_wnid_by_ilsvrc2014id(ilsvrc2014id)); for(boost::filesystem::directory_iterator it2 = boost::filesystem::directory_iterator(folder_path); it2 != boost::filesystem::directory_iterator(); ++it2) { if (it2->status().type() == boost::filesystem::regular_file) { boost::filesystem::path file_path = it2->path(); std::string file_name = file_path.filename().string(); if (nnforge_regex_search(file_name.c_str(), what, file_expression)) { int localid = atol(std::string(what[2].first, what[2].second).c_str()); ilsvrc2014id_localid_pair_list.push_back(std::make_pair(ilsvrc2014id, localid)); } } } } } } } unsigned int total_training_image_count = static_cast<unsigned int>(ilsvrc2014id_localid_pair_list.size()); std::cout << "Training images found: " << total_training_image_count << std::endl; nnforge::random_generator rnd; nnforge::varying_data_stream_writer_smart_ptr training_data_writer; { boost::filesystem::path training_file_path = get_working_data_folder() / training_randomized_data_filename; std::cout << "Writing randomized training data to " << training_file_path.string() << "..." << std::endl; nnforge_shared_ptr<std::ofstream> training_file(new boost::filesystem::ofstream(training_file_path, std::ios_base::out | std::ios_base::binary | std::ios_base::trunc)); training_data_writer = nnforge::varying_data_stream_writer_smart_ptr(new nnforge::varying_data_stream_writer( training_file, static_cast<unsigned int>(ilsvrc2014id_localid_pair_list.size()))); } for(unsigned int entry_written_count = 0; entry_written_count < total_training_image_count; ++entry_written_count) { nnforge_uniform_int_distribution<unsigned int> dist(0, static_cast<unsigned int>(ilsvrc2014id_localid_pair_list.size()) - 1); unsigned int index = dist(rnd); std::pair<std::string, unsigned int> ilsvrc2014id_localid_pair = ilsvrc2014id_localid_pair_list[index]; ilsvrc2014id_localid_pair_list[index] = ilsvrc2014id_localid_pair_list[ilsvrc2014id_localid_pair_list.size() - 1]; ilsvrc2014id_localid_pair_list.pop_back(); std::string filename = (boost::format("%1%_%2%.JPEG") % ilsvrc2014id_localid_pair.first % ilsvrc2014id_localid_pair.second).str(); boost::filesystem::path image_file_path = training_images_folder_path / ilsvrc2014id_localid_pair.first / filename; int class_id = get_classid_by_wnid(get_wnid_by_ilsvrc2014id(ilsvrc2014id_localid_pair.first)); write_supervised_data(image_file_path, *training_data_writer, class_id); if (((entry_written_count + 1) % 100000) == 0) std::cout << (entry_written_count + 1) << " entries written" << std::endl; } std::cout << total_training_image_count << " entries written" << std::endl; }