// Produces the next document of the corpus.
//
// One line is pulled from corpus_stream_ and used as the document content.
// When class_stream_ tests true, one line is also pulled from it and used as
// the class label; otherwise the label stays at the placeholder "[none]".
// The document id is taken from cur_id_, which is post-incremented.
//
// NOTE(review): neither std::getline result is checked here; presumably the
// caller guards with a has-next style query before calling -- confirm.
document gz_corpus::next()
{
    class_label label{"[none]"};
    if (class_stream_)
    {
        // Read straight into the label's underlying string storage.
        auto& label_text = static_cast<std::string&>(label);
        std::getline(class_stream_, label_text);
    }

    std::string text;
    std::getline(corpus_stream_, text);

    document result{cur_id_++, label};
    result.content(text, encoding());
    result.mdata(next_metadata());

    return result;
}
// Produces the next document of the corpus.
//
// One line is pulled from corpus_stream_ and used as the document content.
// When class_stream_ tests true, one line is also pulled from it and used as
// the class label; otherwise the label stays at the placeholder "[none]".
// The document id is taken from cur_id_, which is post-incremented.
//
// The metadata returned by next_metadata() is attached to the document; if
// store_full_text() is true, the document content is placed in front of it
// as the first metadata field.
//
// NOTE(review): neither std::getline result is checked here; presumably the
// caller guards with a has-next style query before calling -- confirm.
document gz_corpus::next()
{
    class_label label{"[none]"};
    if (class_stream_)
    {
        // Read straight into the label's underlying string storage.
        auto& label_text = static_cast<std::string&>(label);
        std::getline(class_stream_, label_text);
    }

    std::string text;
    std::getline(corpus_stream_, text);

    document result{cur_id_++, label};
    result.content(text, encoding());

    auto fields = next_metadata();
    if (store_full_text())
    {
        // The full text goes at the front of the metadata fields.
        fields.insert(fields.begin(), metadata::field{result.content()});
    }
    result.mdata(std::move(fields));

    return result;
}
// Produces the document at position cur_ and then advances cur_.
//
// The entry docs_[cur_] supplies a file name (.first) and a label (.second).
// The file at prefix_ + name is read in full and becomes the document
// content. Metadata comes from next_metadata(), with extra fields placed in
// front of it: the prefixed path first, followed by the document content
// when store_full_text() is true.
//
// Throws corpus_exception if the prefixed path does not exist; in that case
// cur_ is left unchanged.
document file_corpus::next()
{
    document result{doc_id{cur_}, docs_[cur_].second};

    // Build the on-disk path once; it is needed for the existence check,
    // the read, and the "path" metadata field.
    auto path = prefix_ + docs_[cur_].first;

    if (!filesystem::file_exists(path))
    {
        // The message reports the name as listed in docs_, without prefix_.
        throw corpus_exception{"file \"" + docs_[cur_].first
                               + "\" does not exist"};
    }

    result.content(filesystem::file_text(path), encoding());

    auto fields = next_metadata();
    if (store_full_text())
    {
        fields.insert(fields.begin(), metadata::field{result.content()});
    }

    // add "path" metadata manually; inserted last so it ends up first
    fields.insert(fields.begin(), metadata::field{std::move(path)});

    result.mdata(std::move(fields));

    ++cur_;
    return result;
}