/* Article parser methods */ void *Indexer::parseArticles(void *ptr) { pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); kiwix::Indexer *self = (kiwix::Indexer *)ptr; size_t found; indexerToken token; while (self->popFromToParseQueue(token)) { MyHtmlParser htmlParser; /* The parser generate a lot of exceptions which should be avoided */ try { htmlParser.parse_html(token.content, "UTF-8", true); } catch (...) { } /* If content does not have the noindex meta tag */ /* Seems that the parser generates an exception in such case */ found = htmlParser.dump.find("NOINDEX"); if (found == string::npos) { /* Get the accented title */ token.accentedTitle = (htmlParser.title.empty() ? token.title : htmlParser.title); /* count words */ stringstream countWordStringStream; countWordStringStream << self->countWords(htmlParser.dump); token.wordCount = countWordStringStream.str(); /* snippet */ std::string snippet = std::string(htmlParser.dump, 0, 300); std::string::size_type last = snippet.find_last_of('.'); if (last == snippet.npos) last = snippet.find_last_of(' '); if (last != snippet.npos) snippet = snippet.substr(0, last); token.snippet = snippet; /* size */ stringstream sizeStringStream; sizeStringStream << token.content.size() / 1024; token.size = sizeStringStream.str(); /* Remove accent */ token.title = kiwix::removeAccents(token.accentedTitle); token.keywords = kiwix::removeAccents(htmlParser.keywords); token.content = kiwix::removeAccents(htmlParser.dump); self->pushToIndexQueue(token); } /* Test if the thread should be cancelled */ pthread_testcancel(); } self->articleParserRunning(false); pthread_exit(NULL); return NULL; }
bool MimeHandlerHtml::next_document() { if (m_havedoc == false) return false; m_havedoc = false; // If set_doc(fn), take note of file name. string fn = m_filename; m_filename.erase(); string charset = m_dfltInputCharset; LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n", charset.c_str())); // Override default input charset if someone took care to set one: map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset); if (it != m_metaData.end() && !it->second.empty()) { charset = it->second; LOGDEB(("MHHtml: next_doc.: input charset from ext. metadata: [%s]\n", charset.c_str())); } // - We first try to convert from the supposed charset // (which may depend of the current directory) to utf-8. If this // fails, we keep the original text // - During parsing, if we find a charset parameter, and it differs from // what we started with, we abort and restart with the parameter value // instead of the configuration one. MyHtmlParser result; for (int pass = 0; pass < 2; pass++) { string transcoded; LOGDEB(("Html::mkDoc: pass %d\n", pass)); MyHtmlParser p; // Try transcoding. If it fails, use original text. int ecnt; if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) { LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for" "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str())); transcoded = m_html; // We don't know the charset, at all p.reset_charsets(); charset.clear(); } else { if (ecnt) { if (pass == 0) { LOGDEB(("textHtmlToDoc: init transcode had %d errors for " "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str())); } else { LOGERR(("textHtmlToDoc: final transcode had %d errors for " "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str())); } } // charset has the putative source charset, transcoded is now // in utf-8 p.set_charsets(charset, "utf-8"); } try { p.parse_html(transcoded); // No exception: ok? But throw true to use the same // code path as if an exception had been thrown by parse_html throw true; break; } catch (bool diag) { result = p; if (diag == true) { // Parser throws true at end of text. ok if (m_forPreview) { // Save the html text m_html = transcoded; // In many cases, we need to change the charset decl, // because the file was transcoded. It seems that just // inserting one is enough (only the 1st one seems to // be used by browsers/qtextedit). string::size_type idx = m_html.find("<head>"); if (idx == string::npos) idx = m_html.find("<HEAD>"); if (idx != string::npos) m_html.replace(idx+6, 0, "<meta http-equiv=\"content-type\" " "content=\"text/html; charset=utf-8\">"); } break; } LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n", charset.c_str(), result.get_charset().c_str())); if (!result.get_charset().empty() && !samecharset(result.get_charset(), result.fromcharset)) { LOGDEB(("textHtmlToDoc: reparse for charsets\n")); // Set the origin charset as specified in document before // transcoding again charset = result.get_charset(); } else { LOGERR(("textHtmlToDoc:: error: non charset exception\n")); return false; } } } m_metaData[cstr_dj_keyorigcharset] = result.get_charset(); m_metaData[cstr_dj_keycontent] = result.dump; m_metaData[cstr_dj_keycharset] = cstr_utf8; // Avoid setting empty values which would crush ones possibly inherited // from parent (if we're an attachment) if (!result.dmtime.empty()) m_metaData[cstr_dj_keymd] = result.dmtime; m_metaData[cstr_dj_keymt] = cstr_textplain; for (map<string,string>::const_iterator it = result.meta.begin(); it != result.meta.end(); it++) { if (!it->second.empty()) m_metaData[it->first] = it->second; } return true; }