/// Reads the package document named by m_opf from the archive and fills in
/// m_title, m_author, m_manifest (item id -> href) and m_spine (itemref idrefs
/// in document order).
///
/// Any element or attribute that is absent is skipped instead of being
/// dereferenced, so a malformed package leaves the matching members untouched.
///
/// @param zip  Archive the package document is extracted from.
void Book::parseOPF(BLUnZip& zip)
{
    const std::string data = zip.ExtractToString(m_opf);

    XMLDocument doc;
    doc.Parse(data.c_str());

    XMLElement* package = doc.FirstChildElement("package");
    if (package == nullptr) {
        return;  // no <package> root: nothing to read
    }

    if (XMLElement* metadata = package->FirstChildElement("metadata")) {
        // GetText() may return nullptr; constructing a std::string from a
        // null pointer is undefined behaviour, so test before assigning.
        if (XMLElement* title = metadata->FirstChildElement("dc:title")) {
            if (const char* text = title->GetText()) {
                m_title = text;
            }
        }
        if (XMLElement* creator = metadata->FirstChildElement("dc:creator")) {
            if (const char* text = creator->GetText()) {
                m_author = text;
            }
        }
    }

    if (XMLElement* manifest = package->FirstChildElement("manifest")) {
        for (XMLElement* item = manifest->FirstChildElement("item");
             item != nullptr;
             item = item->NextSiblingElement("item")) {
            const char* id = item->Attribute("id");
            const char* href = item->Attribute("href");
            // An item lacking either attribute cannot be looked up or loaded.
            if (id != nullptr && href != nullptr) {
                m_manifest.emplace(id, href);
            }
        }
    }

    if (XMLElement* spine = package->FirstChildElement("spine")) {
        for (XMLElement* itemref = spine->FirstChildElement("itemref");
             itemref != nullptr;
             itemref = itemref->NextSiblingElement("itemref")) {
            if (const char* idref = itemref->Attribute("idref")) {
                m_spine.push_back(idref);
            }
        }
    }
}
/// Walks the spine in order, loads each referenced document from the archive
/// and passes the direct text of every immediate child of <body> to
/// shortenString() together with m_text.
///
/// Spine entries with no manifest item, and documents without an
/// <html>/<body> structure, are skipped.
///
/// @param zip  Archive the page documents are extracted from.
void Book::parsePages(BLUnZip& zip)
{
    for (const auto& idref : m_spine) {
        // find() instead of operator[]: an unknown idref must not insert an
        // empty manifest entry as a side effect of the lookup.
        const auto entry = m_manifest.find(idref);
        if (entry == m_manifest.end()) {
            continue;
        }

        const std::string data = zip.ExtractToString(m_container + entry->second);

        XMLDocument doc;
        doc.Parse(data.c_str());

        XMLElement* html = doc.FirstChildElement("html");
        if (html == nullptr) {
            continue;
        }
        XMLElement* body = html->FirstChildElement("body");
        if (body == nullptr) {
            continue;
        }

        for (XMLElement* child = body->FirstChildElement();
             child != nullptr;
             child = child->NextSiblingElement()) {
            // GetText() yields nullptr when the element has no leading text.
            if (const char* textStr = child->GetText()) {
                std::string str(textStr);
                shortenString(m_text, str);
            }
        }
    }
}
/// Reads META-INF/container.xml from the archive and stores the "full-path"
/// attribute of the first <rootfile> element in opf.
///
/// If any element on the container/rootfiles/rootfile path, or the attribute
/// itself, is missing, opf is left unchanged.
///
/// @param zipfile  Archive the container document is extracted from.
void Book::ParseContainer(BLUnZip& zipfile)
{
    const std::string unclean(zipfile.ExtractToString("META-INF/container.xml"));

    XMLDocument doc;
    doc.Parse(unclean.c_str());

    XMLElement* container = doc.FirstChildElement("container");
    if (container == nullptr) {
        return;
    }
    XMLElement* rootfiles = container->FirstChildElement("rootfiles");
    if (rootfiles == nullptr) {
        return;
    }
    XMLElement* rootfile = rootfiles->FirstChildElement("rootfile");
    if (rootfile == nullptr) {
        return;
    }

    // Attribute() returns nullptr when the attribute is absent; assigning a
    // null pointer to std::string is undefined behaviour.
    if (const char* fullPath = rootfile->Attribute("full-path")) {
        opf = fullPath;
    }
}
/// Walks the spine in order, collects the text that a TextVisitor gathers from
/// each document's <body> into alltext, then replaces every character that is
/// not listed in valid with an apostrophe.
///
/// Spine entries with no manifest item, and documents without an
/// <html>/<body> structure, are skipped.
///
/// @param zipfile  Archive the page documents are extracted from.
void Book::ParsePages(BLUnZip& zipfile)
{
    // Lookup table indexed by unsigned char: 1 = replace, 0 = keep.
    // It needs max() + 1 entries so that the value 255 is a valid index.
    std::vector<char> filter(
        static_cast<std::size_t>(std::numeric_limits<unsigned char>::max()) + 1,
        char{1});
    for (unsigned char c : valid) {
        filter[c] = 0;
    }

    for (const auto& idref : spine) {
        // find() instead of operator[]: an unknown idref must not insert an
        // empty manifest entry as a side effect of the lookup.
        const auto entry = manifest.find(idref);
        if (entry == manifest.end()) {
            continue;
        }

        const std::string page(zipfile.ExtractToString(entry->second));

        XMLDocument doc;
        doc.Parse(page.c_str());

        XMLElement* html = doc.FirstChildElement("html");
        if (html == nullptr) {
            continue;
        }
        XMLElement* body = html->FirstChildElement("body");
        if (body == nullptr) {
            continue;
        }

        TextVisitor tv;
        body->Accept(&tv);
        for (auto& v : tv.GetText()) {
            alltext.push_back(v);
        }
    }

    // Clean up text: random / corrupt characters are replaced (not erased)
    // with an apostrophe.
    // https://github.com/dietmarkuehl/cputube/blob/master/cpu/test/replace.cpp
    // The replacement is idempotent, so one pass after all pages are collected
    // gives the same result as re-scanning everything after each page.
    for (auto& text : alltext) {
        std::replace_if(text.begin(), text.end(),
                        [&](unsigned char c) { return filter[c] != 0; },
                        '\'');
    }
}
/// Reads META-INF/container.xml from the archive, stores the "full-path"
/// attribute of the first <rootfile> element in m_opf, and stores the directory
/// part of that path (including the trailing '/') in m_container.
///
/// m_container is empty when the path has no directory component. If any
/// element on the container/rootfiles/rootfile path, or the attribute itself,
/// is missing, both members are left unchanged.
///
/// @param zip  Archive the container document is extracted from.
void Book::parseContainer(BLUnZip& zip)
{
    const std::string data = zip.ExtractToString("META-INF/container.xml");

    XMLDocument doc;
    doc.Parse(data.c_str());

    XMLElement* container = doc.FirstChildElement("container");
    if (container == nullptr) {
        return;
    }
    XMLElement* rootfiles = container->FirstChildElement("rootfiles");
    if (rootfiles == nullptr) {
        return;
    }
    XMLElement* rootfile = rootfiles->FirstChildElement("rootfile");
    if (rootfile == nullptr) {
        return;
    }

    // Attribute() returns nullptr when the attribute is absent; assigning a
    // null pointer to std::string is undefined behaviour.
    const char* fullPath = rootfile->Attribute("full-path");
    if (fullPath == nullptr) {
        return;
    }
    m_opf = fullPath;

    // Split at the last '/' rather than searching for the literal
    // "content.opf": the package file may have any name, and a failed search
    // would otherwise make m_container the entire path.
    const auto slash = m_opf.rfind('/');
    if (slash == std::string::npos) {
        m_container = "";
    } else {
        m_container = m_opf.substr(0, slash + 1);
    }
}
/// Reads the package document named by opf from the archive and fills in
/// manifest (item id -> href) and spine (itemref idrefs in document order).
///
/// Any element or attribute that is absent is skipped instead of being
/// dereferenced, so a malformed package leaves the matching member untouched.
///
/// @param zipfile  Archive the package document is extracted from.
void Book::ParseOPF(BLUnZip& zipfile)
{
    const std::string unclean(zipfile.ExtractToString(opf));

    XMLDocument doc;
    doc.Parse(unclean.c_str());

    XMLElement* package = doc.FirstChildElement("package");
    if (package == nullptr) {
        return;  // no <package> root: nothing to read
    }

    if (XMLElement* manifestNode = package->FirstChildElement("manifest")) {
        for (XMLElement* item = manifestNode->FirstChildElement("item");
             item != nullptr;
             item = item->NextSiblingElement("item")) {
            const char* id = item->Attribute("id");
            const char* href = item->Attribute("href");
            // Attribute() returns nullptr when absent; a null pointer must
            // never reach a std::string constructor.
            if (id != nullptr && href != nullptr) {
                manifest.emplace(id, href);
            }
        }
    }

    if (XMLElement* spineNode = package->FirstChildElement("spine")) {
        for (XMLElement* itemref = spineNode->FirstChildElement("itemref");
             itemref != nullptr;
             itemref = itemref->NextSiblingElement("itemref")) {
            if (const char* idref = itemref->Attribute("idref")) {
                spine.push_back(idref);
            }
        }
    }
}