Esempio n. 1
0
/**
 * Command-line entry point.
 *
 * Reads six positional arguments and forwards them to
 * compute_confusion_matrix(), which writes its result to the output file:
 *
 *   argv[1]  BAM file
 *   argv[2]  FASTA directory
 *   argv[3]  FASTA file
 *   argv[4]  "true" to skip soft-clipped reads; any other value means false
 *   argv[5]  maximum read length (parsed with atoi)
 *   argv[6]  path of the output file to create
 *
 * @return 0 on success, 1 if too few arguments were given or the output
 *         file could not be opened.
 */
int main(int argc, char* argv[]) {
  /*
 CIF* cif = read_cif(std::string(argv[1]));
 std::cout << *cif;
 delete cif;
  */

  // argv[1]..argv[6] are dereferenced below; refuse to run without them
  // instead of reading past the end of the argument vector.
  if (argc < 7) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "program")
              << " <bam_file> <fasta_dir> <fasta_file>"
              << " <skip_soft_clipped:true|false> <max_read_length> <output_file>"
              << std::endl;
    return 1;
  }

  const std::string bam_file    = argv[1];
  const std::string fasta_dir   = argv[2];
  const std::string fasta_file  = argv[3];
  const bool skip_soft_clipped  = std::string(argv[4]) == "true";
  const int32_t max_read_length = atoi(argv[5]);

  std::ofstream output(argv[6], std::ofstream::out);
  if (!output.is_open()) {
    std::cerr << "Unable to open output file " << argv[6] << std::endl;
    return 1;
  }

  // Note the argument order expected by the callee: fasta_file precedes fasta_dir.
  compute_confusion_matrix(max_read_length, bam_file, fasta_file, fasta_dir, skip_soft_clipped, output);
  output.close();
  return 0;
}
/**
 * Command-line entry point of the "Depth RF trainer".
 *
 * Steps performed:
 *   1. Parse the command line with TCLAP. Exactly one of the JSON or binary
 *      forest output files must be given (they are registered via xorAdd).
 *   2. Optionally load training and weak-learner parameters from a config
 *      file read through a cereal JSON input archive.
 *   3. Read a two-column CSV image list (data filename, label filename).
 *      Relative filenames are resolved against the directory of the list file.
 *   4. Load the samples of every listed image, train a forest on them and
 *      log the elapsed time.
 *   5. Serialize the forest to the requested JSON or binary file.
 *   6. If enabled, log per-class sample counts, per-tree prediction matches
 *      and (per-frame) confusion matrices.
 *
 * A malformed image list terminates the process via exit(-1).
 *
 * @return 0 on success, 1 if a std::runtime_error was caught.
 */
int main(int argc, const char* argv[]) {
    try {
        // Parse command line arguments.
        TCLAP::CmdLine cmd("Depth RF trainer", ' ', "0.3");
        TCLAP::ValueArg<std::string> image_list_file_arg("f", "image-list-file", "File containing the names of image files", true, "", "string", cmd);
        TCLAP::ValueArg<int> num_of_classes_arg("n", "num-of-classes", "Number of classes in the data", true, 1, "int", cmd);
        TCLAP::SwitchArg print_confusion_matrix_switch("m", "conf-matrix", "Print confusion matrix", cmd, true);
        TCLAP::ValueArg<int> background_label_arg("l", "background-label", "Lower bound of background labels to be ignored", false, -1, "int", cmd);
        // The two forest-file arguments are not passed `cmd` here because xorAdd() below registers them.
        TCLAP::ValueArg<std::string> json_forest_file_arg("j", "json-forest-file", "JSON file where the trained forest should be saved", false, "forest.json", "string");
        TCLAP::ValueArg<std::string> binary_forest_file_arg("b", "binary-forest-file", "Binary file where the trained forest should be saved", false, "forest.bin", "string");
        // The config file is read with a cereal JSON archive below, so the help text says JSON.
        TCLAP::ValueArg<std::string> config_file_arg("c", "config", "JSON file with training parameters", false, "", "string", cmd);
#if AIT_MULTI_THREADING
        TCLAP::ValueArg<int> num_of_threads_arg("t", "threads", "Number of threads to use", false, -1, "int", cmd);
#endif
        cmd.xorAdd(json_forest_file_arg, binary_forest_file_arg);
        cmd.parse(argc, argv);

        const int num_of_classes = num_of_classes_arg.getValue();
        const bool print_confusion_matrix = print_confusion_matrix_switch.getValue();
        const std::string image_list_file = image_list_file_arg.getValue();

        // Initialize training and weak-learner parameters to defaults or load from file
        ForestTrainerT::ParametersT training_parameters;
        WeakLearnerT::ParametersT weak_learner_parameters;
        if (config_file_arg.isSet()) {
            ait::log_info(false) << "Reading config file " << config_file_arg.getValue() << "... " << std::flush;
            std::ifstream ifile_config(config_file_arg.getValue());
            // Fail with a clear message, consistent with the image list check below.
            if (!ifile_config.good()) {
                throw std::runtime_error("Unable to open config file");
            }
            cereal::JSONInputArchive iarchive(ifile_config);
            iarchive(cereal::make_nvp("training_parameters", training_parameters));
            iarchive(cereal::make_nvp("weak_learner_parameters", weak_learner_parameters));
            ait::log_info(false) << " Done." << std::endl;
        }
#if AIT_MULTI_THREADING
        // A thread count given on the command line overrides the config file value.
        if (num_of_threads_arg.isSet()) {
            training_parameters.num_of_threads = num_of_threads_arg.getValue();
        }
#endif

        // Read image file list
        ait::log_info(false) << "Reading image list ... " << std::flush;
        std::vector<std::tuple<std::string, std::string>> image_list;
        std::ifstream ifile(image_list_file);
        if (!ifile.good()) {
            throw std::runtime_error("Unable to open image list file");
        }
        ait::CSVReader<std::string> csv_reader(ifile);
        for (auto it = csv_reader.begin(); it != csv_reader.end(); ++it) {
            if (it->size() != 2) {
                cmd.getOutput()->usage(cmd);
                ait::log_error() << "Image list file should contain two columns with the data and label filenames.";
                exit(-1);
            }
            const std::string& data_filename = (*it)[0];
            const std::string& label_filename = (*it)[1];

            // Relative filenames are interpreted relative to the directory of the image list file.
            boost::filesystem::path data_path = boost::filesystem::path(data_filename);
            boost::filesystem::path label_path = boost::filesystem::path(label_filename);
            if (!data_path.is_absolute()) {
                data_path = boost::filesystem::path(image_list_file).parent_path();
                data_path /= data_filename;
            }
            if (!label_path.is_absolute()) {
                label_path = boost::filesystem::path(image_list_file).parent_path();
                label_path /= label_filename;
            }

            image_list.push_back(std::make_tuple(data_path.string(), label_path.string()));
        }
        ait::log_info(false) << " Done." << std::endl;
        // Loop bound as int so the index passed to load_samples_from_image() keeps its original type.
        const int num_of_images = static_cast<int>(image_list.size());

        // TODO: Ensure that label images do not contain values > num_of_classes except for background pixels. Other approach: Test samples directly below.

        // Set lower bound for background pixel labels (defaults to num_of_classes).
        ait::label_type background_label;
        if (background_label_arg.isSet()) {
            background_label = background_label_arg.getValue();
        } else {
            background_label = num_of_classes;
        }
        weak_learner_parameters.background_label = background_label;

        // Create weak learner and trainer.
        StatisticsT::Factory statistics_factory(num_of_classes);
        WeakLearnerT iwl(weak_learner_parameters, statistics_factory);
        ForestTrainerT trainer(iwl, training_parameters);
        SampleProviderT sample_provider(image_list, weak_learner_parameters);
        BaggingWrapperT bagging_wrapper(trainer, sample_provider);

#ifdef AIT_TESTING
        // Fixed seed for testing builds.
        RandomEngineT rnd_engine(11);
#else
        // Draw the seed once so that the logged value is the seed actually used.
        std::random_device rnd_device;
        const std::random_device::result_type rnd_seed = rnd_device();
        ait::log_info() << "rnd(): " << rnd_seed;
        RandomEngineT rnd_engine(rnd_seed);
#endif

        // Train a forest and time it (the timing includes loading the samples).
        auto start_time = std::chrono::high_resolution_clock::now();
        // TODO
        //		ForestTrainerT::ForestT forest = bagging_wrapper.train_forest(rnd_engine);
        // TODO: Testing all samples for comparison with depth_trainer
        sample_provider.clear_samples();
        for (int i = 0; i < num_of_images; ++i) {
            sample_provider.load_samples_from_image(i, rnd_engine);
        }
        SampleIteratorT samples_start = sample_provider.get_samples_begin();
        SampleIteratorT samples_end = sample_provider.get_samples_end();
        ait::log_info() << "Starting training ...";
        ForestTrainerT::ForestT forest = trainer.train_forest(samples_start, samples_end, rnd_engine);
        auto stop_time = std::chrono::high_resolution_clock::now();
        const double elapsed_seconds = std::chrono::duration<double>(stop_time - start_time).count();
        ait::log_info() << "Done.";
        ait::log_info() << "Running time: " << elapsed_seconds;

        // Serialize forest to JSON file ...
        if (json_forest_file_arg.isSet()) {
            ait::log_info(false) << "Writing json forest file " << json_forest_file_arg.getValue() << "... " << std::flush;
            std::ofstream ofile(json_forest_file_arg.getValue());
            if (!ofile.good()) {
                throw std::runtime_error("Unable to open json forest file for writing");
            }
            {
                // Scoped so the archive is destroyed before "Done." is logged.
                cereal::JSONOutputArchive oarchive(ofile);
                oarchive(cereal::make_nvp("forest", forest));
            }
            ait::log_info(false) << " Done." << std::endl;
        // ... or to binary file.
        } else if (binary_forest_file_arg.isSet()) {
            ait::log_info(false) << "Writing binary forest file " << binary_forest_file_arg.getValue() << "... " << std::flush;
            std::ofstream ofile(binary_forest_file_arg.getValue(), std::ios_base::binary);
            if (!ofile.good()) {
                throw std::runtime_error("Unable to open binary forest file for writing");
            }
            {
                cereal::BinaryOutputArchive oarchive(ofile);
                oarchive(cereal::make_nvp("forest", forest));
            }
            ait::log_info(false) << " Done." << std::endl;
        } else {
            // Throw a std::runtime_error (not a string literal) so the handler below catches it.
            throw std::runtime_error("This should never happen. Either a JSON or a binary forest file have to be specified!");
        }

        // Optionally: Compute some stats and print them.
        if (print_confusion_matrix) {
            ait::log_info(false) << "Creating samples for testing ... " << std::flush;
            sample_provider.clear_samples();
            for (int i = 0; i < num_of_images; ++i) {
                sample_provider.load_samples_from_image(i, rnd_engine);
            }
            SampleIteratorT samples_start = sample_provider.get_samples_begin();
            SampleIteratorT samples_end = sample_provider.get_samples_end();
            ait::log_info(false) << " Done." << std::endl;

            // NOTE(review): assumes every sample label is < num_of_classes (i.e. the sample
            // provider yields no background samples); otherwise this indexes out of bounds. Confirm.
            std::vector<ait::size_type> sample_counts(num_of_classes, 0);
            for (auto sample_it = samples_start; sample_it != samples_end; sample_it++) {
                ++sample_counts[sample_it->get_label()];
            }
            auto logger = ait::log_info(true);
            logger << "Sample counts>> ";
            for (int c = 0; c < num_of_classes; ++c) {
                if (c > 0) {
                    logger << ", ";
                }
                logger << "class " << c << ": " << sample_counts[c];
            }
            logger.close();
            // For each tree extract leaf node indices for each sample.
            std::vector<std::vector<ait::size_type>> forest_leaf_indices = forest.evaluate(samples_start, samples_end);

            // Count prediction matches per tree and per sample: a match is counted when the
            // most frequent label in the reached leaf's histogram equals the sample's label.
            // Every (tree, sample) pair is counted individually; votes are not aggregated over the forest.
            int match = 0;
            int no_match = 0;
            for (auto tree_it = forest.cbegin(); tree_it != forest.cend(); ++tree_it) {
                for (auto sample_it = samples_start; sample_it != samples_end; sample_it++) {
                    const auto &node_it = tree_it->cbegin() + (forest_leaf_indices[tree_it - forest.cbegin()][sample_it - samples_start]);
                    const auto &statistics = node_it->get_statistics();
                    auto max_it = std::max_element(statistics.get_histogram().cbegin(), statistics.get_histogram().cend());
                    auto label = max_it - statistics.get_histogram().cbegin();
                    if (label == sample_it->get_label()) {
                        match++;
                    } else {
                        no_match++;
                    }
                }
            }
            ait::log_info() << "Match: " << match << ", no match: " << no_match;

            // Compute confusion matrix.
            auto forest_utils = ait::make_forest_utils(forest);
            auto confusion_matrix = forest_utils.compute_confusion_matrix(samples_start, samples_end);
            ait::log_info() << "Confusion matrix:" << std::endl << confusion_matrix;
            auto norm_confusion_matrix = ait::EvaluationUtils::normalize_confusion_matrix(confusion_matrix);
            ait::log_info() << "Normalized confusion matrix:" << std::endl << norm_confusion_matrix;
            ait::log_info() << "Diagonal of normalized confusion matrix:" << std::endl << norm_confusion_matrix.diagonal();

            // Computing per-frame confusion matrix
            ait::log_info() << "Computing per-frame confusion matrix.";
            using ConfusionMatrixType = typename decltype(forest_utils)::MatrixType;
            ConfusionMatrixType per_frame_confusion_matrix(num_of_classes, num_of_classes);
            per_frame_confusion_matrix.setZero();
            WeakLearnerT::ParametersT full_parameters(weak_learner_parameters);
            // Modify parameters to retrieve all pixels per sample
            full_parameters.samples_per_image_fraction = 1.0;
            SampleProviderT full_sample_provider(image_list, full_parameters);
            // Images are processed one at a time; the matrix accumulates over all of them.
            for (int i = 0; i < num_of_images; ++i) {
                full_sample_provider.clear_samples();
                full_sample_provider.load_samples_from_image(i, rnd_engine);
                samples_start = full_sample_provider.get_samples_begin();
                samples_end = full_sample_provider.get_samples_end();
                forest_utils.update_confusion_matrix(per_frame_confusion_matrix, samples_start, samples_end);
            }
            ait::log_info() << "Per-frame confusion matrix:" << std::endl << per_frame_confusion_matrix;
            ConfusionMatrixType per_frame_norm_confusion_matrix = ait::EvaluationUtils::normalize_confusion_matrix(per_frame_confusion_matrix);
            ait::log_info() << "Normalized per-frame confusion matrix:" << std::endl << per_frame_norm_confusion_matrix;
            ait::log_info() << "Diagonal of normalized per-frame confusion matrix:" << std::endl << per_frame_norm_confusion_matrix.diagonal();
            ait::log_info() << "Mean of diagonal of normalized per-frame confusion matrix:" << std::endl << per_frame_norm_confusion_matrix.diagonal().mean();
        }

    } catch (const std::runtime_error& error) {
        std::cerr << "Runtime exception occured" << std::endl;
        std::cerr << error.what() << std::endl;
        // Report the failure to the caller instead of exiting with status 0.
        return 1;
    }

    return 0;
}