예제 #1
0
  /* Article parser methods */
  void *Indexer::parseArticles(void *ptr) {
    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
    kiwix::Indexer *self = (kiwix::Indexer *)ptr;
    size_t found;
    indexerToken token;

    while (self->popFromToParseQueue(token)) {
      MyHtmlParser htmlParser;

      /* The parser generate a lot of exceptions which should be avoided */
      try {
	htmlParser.parse_html(token.content, "UTF-8", true);
      } catch (...) {
      }

      /* If content does not have the noindex meta tag */
      /* Seems that the parser generates an exception in such case */
      found = htmlParser.dump.find("NOINDEX");
      
      if (found == string::npos) {
	/* Get the accented title */
	token.accentedTitle = (htmlParser.title.empty() ? token.title : htmlParser.title);

	/* count words */
	stringstream countWordStringStream;
	countWordStringStream << self->countWords(htmlParser.dump);
	token.wordCount = countWordStringStream.str();
	
	/* snippet */
	std::string snippet = std::string(htmlParser.dump, 0, 300);
	std::string::size_type last = snippet.find_last_of('.');
	if (last == snippet.npos)
	  last = snippet.find_last_of(' ');
	if (last != snippet.npos)
	  snippet = snippet.substr(0, last);
	token.snippet = snippet;

	/* size */
	stringstream sizeStringStream;
	sizeStringStream << token.content.size() / 1024;
	token.size = sizeStringStream.str();

	/* Remove accent */
	token.title = kiwix::removeAccents(token.accentedTitle);
	token.keywords = kiwix::removeAccents(htmlParser.keywords);
	token.content = kiwix::removeAccents(htmlParser.dump);
	self->pushToIndexQueue(token);
      }

      /* Test if the thread should be cancelled */
      pthread_testcancel(); 
    }
    
    self->articleParserRunning(false);
    pthread_exit(NULL);
    return NULL;
  }
예제 #2
0
파일: mh_html.cpp 프로젝트: norandom/recoll
// Parse the HTML text held in m_html and publish the result in m_metaData.
//
// Returns false when no document is pending (m_havedoc is false) or when
// the parser aborts for a reason other than a charset change. Returns true
// once m_metaData has been filled with the parser's text dump, the original
// charset, the output charset (cstr_utf8), the optional modification time
// and every non-empty meta field found by the parser.
//
// When m_forPreview is set, m_html is replaced by the transcoded text with
// a utf-8 content-type <meta> inserted right after the <head> tag.
bool MimeHandlerHtml::next_document()
{
    if (m_havedoc == false)
        return false;
    m_havedoc = false;
    // If set_doc(fn), take note of file name.
    string fn = m_filename;
    m_filename.erase();

    string charset = m_dfltInputCharset;
    LOGDEB(("MHHtml::next_doc.: default supposed input charset: [%s]\n",
            charset.c_str()));
    // Override default input charset if someone took care to set one:
    map<string,string>::const_iterator it = m_metaData.find(cstr_dj_keycharset);
    if (it != m_metaData.end() && !it->second.empty()) {
        charset = it->second;
        LOGDEB(("MHHtml: next_doc.: input charset from ext. metadata: [%s]\n",
                charset.c_str()));
    }

    // - We first try to convert from the supposed charset
    //   (which may depend of the current directory) to utf-8. If this
    //   fails, we keep the original text
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.
    // NOTE(review): if the second pass also stops on a charset mismatch, the
    // loop simply ends and the partial result of that pass is used.

    MyHtmlParser result;
    for (int pass = 0; pass < 2; pass++) {
        string transcoded;
        LOGDEB(("Html::mkDoc: pass %d\n", pass));
        MyHtmlParser p;

        // Try transcoding. If it fails, use original text.
        // Initialized so that the error count test below never reads an
        // indeterminate value, whatever transcode() does with it.
        int ecnt = 0;
        if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
            LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for "
                    "[%s]\n", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
            transcoded = m_html;
            // We don't know the charset, at all
            p.reset_charsets();
            charset.clear();
        } else {
            if (ecnt) {
                if (pass == 0) {
                    LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
                            "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
                } else {
                    LOGERR(("textHtmlToDoc: final transcode had %d errors for "
                            "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
                }
            }
            // charset has the putative source charset, transcoded is now
            // in utf-8
            p.set_charsets(charset, "utf-8");
        }

        try {
            p.parse_html(transcoded);
            // No exception: ok? But throw true to use the same
            // code path as if an exception had been thrown by parse_html
            throw true;
        } catch (bool diag) {
            // Keep whatever this pass produced, complete or not
            result = p;
            if (diag == true) {
                // Parser throws true at end of text. ok

                if (m_forPreview) {
                    // Save the html text
                    m_html = transcoded;
                    // In many cases, we need to change the charset decl,
                    // because the file was transcoded. It seems that just
                    // inserting one is enough (only the 1st one seems to
                    // be used by browsers/qtextedit).
                    string::size_type idx = m_html.find("<head>");
                    if (idx == string::npos)
                        idx = m_html.find("<HEAD>");
                    // idx+6: insert right after the 6-character head tag
                    if (idx != string::npos)
                        m_html.replace(idx+6, 0,
                                       "<meta http-equiv=\"content-type\" "
                                       "content=\"text/html; charset=utf-8\">");
                }

                break;
            }

            // diag is false: the parser stopped before the end of the text
            LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
                    charset.c_str(), result.get_charset().c_str()));
            if (!result.get_charset().empty() &&
                !samecharset(result.get_charset(), result.fromcharset)) {
                LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
                // Set the origin charset as specified in document before
                // transcoding again
                charset = result.get_charset();
            } else {
                LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
                return false;
            }
        }
    }

    m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
    m_metaData[cstr_dj_keycontent] = result.dump;
    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    // Avoid setting empty values which would crush ones possibly inherited
    // from parent (if we're an attachment)
    if (!result.dmtime.empty())
        m_metaData[cstr_dj_keymd] = result.dmtime;
    m_metaData[cstr_dj_keymt] = cstr_textplain;

    // Copy the non-empty meta fields collected by the parser
    for (map<string,string>::const_iterator mit = result.meta.begin();
         mit != result.meta.end(); ++mit) {
        if (!mit->second.empty())
            m_metaData[mit->first] = mit->second;
    }
    return true;
}