/**
 * Builds the article metadata for the file at `path`.
 *
 * The article id (`aid`) and `url` are `path` with the first
 * `directoryPath.size() + 1` characters removed; the MIME type is derived
 * from that id and the namespace is the first element returned by
 * getNamespaceForMimeType(). For files whose MIME type contains
 * "text/html", the content is parsed with Gumbo to read the `<title>` of
 * the `<head>` element and, optionally, to detect a redirection.
 *
 * @param path            Path of the file. `std::string::substr` throws
 *                        `std::out_of_range` if it is shorter than
 *                        `directoryPath.size() + 1` characters.
 * @param detectRedirects When true, `<head>` is inspected through
 *                        extractRedirectUrlFromHtml() for a redirect target.
 *                        If the target file does not exist under
 *                        `directoryPath`, the article is flagged `invalid`.
 */
Article::Article(const std::string& path, const bool detectRedirects) {
  /* Scope guard: destroys the Gumbo parse tree on every exit path, including
     exceptions other than the std::string handled below. */
  struct GumboOutputGuard {
    explicit GumboOutputGuard(GumboOutput* parsed) : output(parsed) {}
    ~GumboOutputGuard() { gumbo_destroy_output(&kGumboDefaultOptions, output); }
    GumboOutput* const output;
  };

  invalid = false;

  /* aid */
  aid = path.substr(directoryPath.size() + 1);

  /* url */
  url = aid;

  /* mime-type */
  mimeType = getMimeTypeForFile(aid);

  /* namespace */
  ns = getNamespaceForMimeType(mimeType)[0];

  /* Everything below is HTML specific */
  if (mimeType.find("text/html") == std::string::npos) {
    return;
  }

  const std::string html = getFileContent(path);
  const GumboOutputGuard parsed(gumbo_parse(html.c_str()));
  GumboNode* root = parsed.output->root;

  /* Title extraction (including the filename fallback) only happens when the
     root is an element with at least two children. */
  if (root->type != GUMBO_NODE_ELEMENT || root->v.element.children.length < 2) {
    return;
  }

  /* Locate the <head> element among the root children */
  const GumboVector* root_children = &root->v.element.children;
  GumboNode* head = NULL;
  for (unsigned int i = 0; i < root_children->length; ++i) {
    GumboNode* child = static_cast<GumboNode*>(root_children->data[i]);
    if (child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_HEAD) {
      head = child;
      break;
    }
  }

  if (head != NULL) {
    GumboVector* head_children = &head->v.element.children;

    /* Search the content of the <title> tag; the loop does not stop at the
       first match, so the last usable <title> wins. */
    for (unsigned int i = 0; i < head_children->length; ++i) {
      GumboNode* child = static_cast<GumboNode*>(head_children->data[i]);
      if (child->type != GUMBO_NODE_ELEMENT || child->v.element.tag != GUMBO_TAG_TITLE) {
        continue;
      }
      if (child->v.element.children.length != 1) {
        continue;
      }
      GumboNode* title_text = static_cast<GumboNode*>(child->v.element.children.data[0]);
      if (title_text->type == GUMBO_NODE_TEXT) {
        title = title_text->v.text.text;
        stripTitleInvalidChars(title);
      }
    }

    /* Detect if this is a redirection (if no redirects CSV specified) */
    std::string targetUrl;
    try {
      if (detectRedirects) {
        targetUrl = extractRedirectUrlFromHtml(head_children);
      }
    } catch (const std::string& error) {
      /* Errors reported as std::string are logged and treated as "no redirect" */
      std::cerr << error << std::endl;
    }

    if (!targetUrl.empty()) {
      redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl));
      if (!fileExists(directoryPath + "/" + redirectAid)) {
        /* Redirect target is missing: drop it and flag the article */
        redirectAid.clear();
        invalid = true;
      }
    }
  }

  /* If no title, then compute one from the filename: strip the directory
     part and the last extension, then turn underscores into spaces. */
  if (title.empty()) {
    const std::size_t slash = path.rfind('/');
    if (slash != std::string::npos) {
      title = path.substr(slash + 1);
      const std::size_t dot = title.rfind('.');
      if (dot != std::string::npos) {
        title = title.substr(0, dot);
      }
    } else {
      title = path;
    }
    std::replace(title.begin(), title.end(), '_', ' ');
  }
}
/**
 * Returns the result of computeAbsolutePath() applied to m_FilePath and
 * m_FileName. The value is recomputed on every call.
 */
std::string getAbsolutePath() {
  std::string absolutePath = computeAbsolutePath(m_FilePath, m_FileName);
  return absolutePath;
}
/**
 * Stores the given path components and precomputes the absolute path.
 *
 * @param FilePath Copied into m_FilePath.
 * @param FileName Copied into m_FileName.
 */
GeoValue::GeoValue(const std::string& FilePath, const std::string& FileName)
    : m_FilePath(FilePath),
      m_FileName(FileName) {
  /* Computed in the body, from the already-initialized members */
  this->m_AbsolutePath = computeAbsolutePath(this->m_FilePath, this->m_FileName);
}