//
// initialize
//
// Prepares the parser for a new document: runs the base-class
// initialization, derives `url` and `base_url` from the document's
// "dochdr" metadata entry (when present), looks up the link-related
// tag definitions, and makes sure the parsed document carries a "url"
// metadata entry.
//
// tokenized - tokenizer output; its metadata is scanned for "url" and "dochdr".
// parsed    - document under construction; may receive a "url" metadata pair.
//
void indri::parse::HTMLParser::initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ) {
  indri::parse::TaggedTextParser::initialize( tokenized, parsed );

  // start with empty URL buffers
  url[0] = 0;
  base_url[0] = 0;

  bool urlInMetadata = false;

  // look for the DOCHDR entry so the document URL can be pulled out of it
  for( size_t idx = 0; idx < tokenized->metadata.size(); idx++ ) {
    if( strcmp( tokenized->metadata[idx].key, "url" ) == 0 )
      urlInMetadata = true;

    if( strcmp( tokenized->metadata[idx].key, "dochdr" ) != 0 )
      continue;

    // the URL is the leading run of non-whitespace characters of the header
    char* headerStart = (char*) tokenized->metadata[idx].value;
    char* headerEnd = headerStart + strcspn( (char*) tokenized->metadata[idx].value, " \t\r\n" );
    int length = lemur_compat::min<int>( headerEnd - headerStart, sizeof url - 1 );

    memcpy( url, headerStart, length );
    url[length] = 0;

    // the base URL is the document URL with everything from the last '/' removed
    strncpy( base_url, url, sizeof url - 1 );
    base_url[length] = 0;

    char* finalSlash = strrchr( base_url, '/' );
    if( finalSlash )
      *finalSlash = 0;

    // only the first dochdr entry is used
    break;
  }

  // normalize both URLs in place
  normalizeURL( url );
  normalizeURL( base_url );

  // fetch the tag definitions used while handling anchors
  _absoluteUrlTag = _findTag( "absolute-url" );
  _relativeUrlTag = _findTag( "relative-url" );
  _anchorTag = _findTag( "a" );

  // publish the URL as metadata unless the document already supplied one
  if( !urlInMetadata ) {
    indri::parse::MetadataPair urlPair;
    urlPair.key = "url";
    urlPair.value = url;
    urlPair.valueLength = (int)strlen( url ) + 1;
    parsed->metadata.push_back( urlPair );
  }

  _urlBuffer.clear();
  // _urlBuffer.grow( parsed->textLength * 4 );
  // fixed-size reservation; whether this is always large enough is an open question
  _urlBuffer.grow( 1024 * 1024 * 25 );
}
/**
 * Create an interpreter from an XML document held in memory.
 *
 * The string is parsed with a Xerces DOM parser (namespaces enabled,
 * well-formedness scanner), the resulting document is adopted by the
 * interpreter implementation, and the instance is registered via
 * InterpreterImpl::addInstance.
 *
 * @param xml     The XML text to parse.
 * @param baseURL URL recorded (after normalizeURL) as the interpreter's base URL.
 * @return The interpreter wrapping the newly created implementation.
 *
 * Xerces SAX/Runtime/XML/DOM exceptions are reported through
 * ERROR_PLATFORM_THROW with the exception's message.
 */
Interpreter Interpreter::fromXML(const std::string& xml, const std::string& baseURL) {
    URL absUrl = normalizeURL(baseURL);

    std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
    Interpreter interpreter(impl);

    std::unique_ptr<XERCESC_NS::XercesDOMParser> domParser(new XERCESC_NS::XercesDOMParser());
    std::unique_ptr<XERCESC_NS::ErrorHandler> errorHandler(new XERCESC_NS::HandlerBase());

    try {
        domParser->setValidationScheme(XERCESC_NS::XercesDOMParser::Val_Always);
        domParser->setDoNamespaces(true);
        domParser->useScanner(XERCESC_NS::XMLUni::fgWFXMLScanner);
        domParser->setErrorHandler(errorHandler.get());

        // Feed the parser straight from the string's bytes; "fake" is the buffer id.
        XERCESC_NS::MemBufInputSource source(reinterpret_cast<const XMLByte*>(xml.c_str()),
                                             xml.size(),
                                             X("fake"));
        domParser->parse(source);

        // The implementation takes over the parsed document.
        impl->_document = domParser->adoptDocument();
        impl->_baseURL = absUrl;
        InterpreterImpl::addInstance(impl);

    } catch (const XERCESC_NS::SAXParseException& toCatch) {
        ERROR_PLATFORM_THROW(X(toCatch.getMessage()).str());
    } catch (const XERCESC_NS::RuntimeException& toCatch) {
        ERROR_PLATFORM_THROW(X(toCatch.getMessage()).str());
    } catch (const XERCESC_NS::XMLException& toCatch) {
        ERROR_PLATFORM_THROW(X(toCatch.getMessage()).str());
    } catch (const XERCESC_NS::DOMException& toCatch) {
        ERROR_PLATFORM_THROW(X(toCatch.getMessage()).str());
    }

    return interpreter;
}
/// Rewrites a style URL that starts with the `mapbox` prefix; any other
/// URL is returned unchanged.
///
/// The user name is the text between the prefix and the first '.' in the
/// URL (or the rest of the URL when there is no '.'), and is spliced into
/// the "/styles/v1/<user>/" path handed to normalizeURL.
std::string normalizeStyleURL(const std::string& url, const std::string& accessToken) {
    const std::string::size_type prefixLength = mapbox.length();

    const bool hasMapboxPrefix = (url.compare(0, prefixLength, mapbox) == 0);
    if (!hasMapboxPrefix) {
        return url;
    }

    const std::string::size_type firstDot = url.find('.');
    const std::string user = url.substr(prefixLength, firstDot - prefixLength);

    return normalizeURL(url, "/styles/v1/" + user + "/", accessToken);
}
std::string normalizeSourceURL(const std::string& url, const std::string& accessToken) { if (url.compare(0, mapbox.length(), mapbox) != 0) return url; std::string result = normalizeURL(url + ".json", "/v4/", accessToken); // TileJSON requests need a secure flag appended to their URLs so // that the server knows to send SSL-ified resource references. result += "&secure"; return result; }
/// Creates the request object matching the scheme of the normalized URL.
///
/// @param parent  QObject parent handed to the created request.
/// @param url     URL to fetch; it is normalized before the scheme is inspected.
/// @return A newly allocated File/HTTP/Asset request, or nullptr (after
///         logging) when the scheme is not one of file, http, https, ftp or atp.
ResourceRequest* ResourceManager::createResourceRequest(QObject* parent, const QUrl& url) {
    QUrl normalizedURL = normalizeURL(url);
    QString scheme = normalizedURL.scheme();

    if (scheme == URL_SCHEME_FILE) {
        return new FileResourceRequest(parent, normalizedURL);
    }

    // http, https and ftp are all served by the HTTP request type
    const bool isNetworkScheme = scheme == URL_SCHEME_HTTP
                              || scheme == URL_SCHEME_HTTPS
                              || scheme == URL_SCHEME_FTP;
    if (isNetworkScheme) {
        return new HTTPResourceRequest(parent, normalizedURL);
    }

    if (scheme == URL_SCHEME_ATP) {
        return new AssetResourceRequest(parent, normalizedURL);
    }

    qDebug() << "Unknown scheme (" << scheme << ") for URL: " << url.url();
    return nullptr;
}
/// Normalizes a QUrl by round-tripping it through the string overload of
/// normalizeURL, then patches up URLs whose scheme is not recognized.
///
/// @param originalUrl  URL to normalize.
/// @return The normalized URL. When its scheme is none of file, http,
///         https, ftp or atp, and prefixing it with the file scheme yields
///         a non-empty local file path, the file-scheme URL is returned instead.
QUrl ResourceManager::normalizeURL(const QUrl& originalUrl) {
    QUrl url(normalizeURL(originalUrl.toString()));
    const QString scheme = url.scheme();

    const bool isKnownScheme = scheme == URL_SCHEME_FILE
                            || scheme == URL_SCHEME_HTTP
                            || scheme == URL_SCHEME_HTTPS
                            || scheme == URL_SCHEME_FTP
                            || scheme == URL_SCHEME_ATP;
    if (isKnownScheme) {
        return url;
    }

    // Degenerate file case: on Windows, URLs often look like c:/filename.
    // Try again with an explicit file scheme and keep that version if it
    // resolves to a local file path.
    QUrl urlWithFileScheme(URL_SCHEME_FILE + ":///" + url.toString());
    if (urlWithFileScheme.toLocalFile().isEmpty()) {
        return url;
    }
    return urlWithFileScheme;
}
/// Creates the request object matching the scheme of the normalized URL
/// and moves it to the manager's thread.
///
/// @param parent  Not used by this implementation.
/// @param url     URL to fetch; it is normalized before the scheme is inspected.
/// @return A newly allocated File/HTTP/Asset request already moved to
///         `_thread`, or nullptr (after logging) when the scheme is not one
///         of file, http, https, ftp or atp.
ResourceRequest* ResourceManager::createResourceRequest(QObject* parent, const QUrl& url) {
    QUrl normalizedURL = normalizeURL(url);
    QString scheme = normalizedURL.scheme();

    const bool isFileScheme = (scheme == URL_SCHEME_FILE);
    // http, https and ftp are all served by the HTTP request type
    const bool isNetworkScheme = !isFileScheme
                              && (scheme == URL_SCHEME_HTTP
                                  || scheme == URL_SCHEME_HTTPS
                                  || scheme == URL_SCHEME_FTP);
    const bool isAssetScheme = !isFileScheme && !isNetworkScheme
                            && (scheme == URL_SCHEME_ATP);

    ResourceRequest* request = nullptr;
    if (isFileScheme) {
        request = new FileResourceRequest(normalizedURL);
    } else if (isNetworkScheme) {
        request = new HTTPResourceRequest(normalizedURL);
    } else if (isAssetScheme) {
        request = new AssetResourceRequest(normalizedURL);
    } else {
        qDebug() << "Unknown scheme (" << scheme << ") for URL: " << url.url();
        return nullptr;
    }

    Q_ASSERT(request);

    request->moveToThread(&_thread);
    return request;
}
/**
 * Create an interpreter from an existing DOM element.
 *
 * The element is deep-imported into a freshly created document, so the
 * interpreter works on its own copy rather than on the caller's node.
 *
 * @param scxml   Element to import as the root of the new document.
 * @param baseURL URL recorded (after normalizeURL) as the interpreter's base URL.
 * @return The interpreter wrapping the newly created implementation.
 */
Interpreter Interpreter::fromElement(XERCESC_NS::DOMElement* scxml, const std::string& baseURL) {
    URL absUrl = normalizeURL(baseURL);

    std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
    Interpreter interpreter(impl);

    // Work on a *copy* of the given DOM to get rid of event listeners.
    XERCESC_NS::DOMImplementation* domImpl =
        XERCESC_NS::DOMImplementationRegistry::getDOMImplementation(X("core"));
    impl->_document = domImpl->createDocument();

    // The parent has to be imported as well - needed for xpath test150.
    XERCESC_NS::DOMNode* importedRoot = impl->_document->importNode(scxml, true);
    // impl->_document->adoptNode(importedRoot);
    impl->_document->appendChild(importedRoot);

    // std::cerr << *(impl->_document);

    impl->_baseURL = absUrl;
    InterpreterImpl::addInstance(impl);

    return interpreter;
}
/**
 * Create an interpreter by parsing the document found at a URL.
 *
 * @param url Location of the document; it is normalized and also becomes
 *            the interpreter's base URL.
 * @return The interpreter wrapping the newly created implementation.
 *
 * Xerces SAX/Runtime/XML/DOM exceptions raised while parsing are logged
 * at USCXML_ERROR level and not propagated; in that case the interpreter
 * is still returned, but no document or base URL has been assigned and
 * the instance has not been registered.
 */
Interpreter Interpreter::fromURL(const std::string& url) {
    URL absUrl = normalizeURL(url);

    std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
    Interpreter interpreter(impl);

    std::unique_ptr<XERCESC_NS::XercesDOMParser> domParser(new XERCESC_NS::XercesDOMParser());
    domParser->setValidationScheme(XERCESC_NS::XercesDOMParser::Val_Always);
    domParser->setDoNamespaces(true);

    // There is no real schema anyway, so use the well-formedness scanner.
    domParser->useScanner(XERCESC_NS::XMLUni::fgWFXMLScanner);

    std::unique_ptr<XERCESC_NS::ErrorHandler> errorHandler(new XERCESC_NS::HandlerBase());
    domParser->setErrorHandler(errorHandler.get());

    try {
        // The parser wants a C string; go through the URL's string form.
        std::string location = absUrl;
        domParser->parse(location.c_str());

        impl->_document = domParser->adoptDocument();
        impl->_baseURL = absUrl;
        InterpreterImpl::addInstance(impl);

    } catch (const XERCESC_NS::SAXParseException& toCatch) {
        LOGD(USCXML_ERROR) << X(toCatch.getMessage());
    } catch (const XERCESC_NS::RuntimeException& toCatch) {
        LOGD(USCXML_ERROR) << X(toCatch.getMessage());
    } catch (const XERCESC_NS::XMLException& toCatch) {
        LOGD(USCXML_ERROR) << X(toCatch.getMessage());
    } catch (const XERCESC_NS::DOMException& toCatch) {
        LOGD(USCXML_ERROR) << X(toCatch.getMessage());
    }

    return interpreter;
}
/**
 * Create an interpreter from an existing DOM document.
 *
 * @param dom     Source document.
 * @param baseURL URL recorded (after normalizeURL) as the interpreter's base URL.
 * @param copy    When true, the document element is deep-imported into a
 *                freshly created document; when false, `dom` itself is
 *                stored in the implementation.
 * @return The interpreter wrapping the newly created implementation.
 */
Interpreter Interpreter::fromDocument(XERCESC_NS::DOMDocument* dom, const std::string& baseURL, bool copy) {
    URL absUrl = normalizeURL(baseURL);

    std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
    Interpreter interpreter(impl);

    if (!copy) {
        // Use the caller's document directly.
        impl->_document = dom;
    } else {
        // Work on a *copy* of the given DOM to get rid of event listeners.
        XERCESC_NS::DOMImplementation* domImpl =
            XERCESC_NS::DOMImplementationRegistry::getDOMImplementation(X("core"));
        impl->_document = domImpl->createDocument();

        // The parent has to be imported as well - needed for xpath test150.
        XERCESC_NS::DOMNode* importedRoot =
            impl->_document->importNode(dom->getDocumentElement(), true);
        impl->_document->appendChild(importedRoot);
    }

    impl->_baseURL = absUrl;
    InterpreterImpl::addInstance(impl);

    return interpreter;
}
/// Rewrites a glyphs URL that starts with the `mapbox` prefix by passing
/// it to normalizeURL with the "/v4/" path; any other URL is returned
/// unchanged.
std::string normalizeGlyphsURL(const std::string& url, const std::string& accessToken) {
    const bool hasMapboxPrefix = (url.compare(0, mapbox.length(), mapbox) == 0);

    if (hasMapboxPrefix) {
        return normalizeURL(url, "/v4/", accessToken);
    }

    return url;
}
//
// handleTag
//
// Special handling for two tags; everything else is forwarded to
// TaggedTextParser::handleTag.
//
//   <a href=...>    The href value is normalized and, when the matching
//                   absolute-url / relative-url tag is requested and not
//                   excluded, the URL is injected into the document's
//                   terms and positions and wrapped in that tag. The
//                   anchor tag itself is then opened after the injected
//                   terms.
//   <base href=...> The href value is normalized and stored in base_url.
//
// In both cases a tag without an href attribute is forwarded to the
// base class unchanged.
//
void indri::parse::HTMLParser::handleTag( TagEvent* te ) {
  // Tag names and attribute names have already been case folded by
  // the Tokenizer, so plain strcmp is sufficient here.
  const bool isAnchor = ( strcmp( te->name, "a" ) == 0 );
  const bool isBase = !isAnchor && ( strcmp( te->name, "base" ) == 0 );

  if ( !isAnchor && !isBase ) {
    // any other tag
    TaggedTextParser::handleTag( te );
    return;
  }

  if ( isAnchor ) {
    // <A HREF ...> tag
    bool sawHref = false;

    for ( indri::utility::greedy_vector<indri::parse::AttributeValuePair>::iterator it = te->attributes.begin();
          it != te->attributes.end();
          ++it ) {
      indri::parse::AttributeValuePair& attr = *it;

      if ( strcmp( attr.attribute, "href" ) != 0 )
        continue;

      // none of the link-related tags were requested: nothing to do at all
      if ( !_anchorTag && !_relativeUrlTag && !_absoluteUrlTag )
        return;

      // the URL has already been extracted into attr.value
      prepURL( attr.value );

      // work on a local, length-limited copy of the URL
      char normalized[MAX_URL_LENGTH*4];
      const int copyLength = lemur_compat::min<int>( strlen( attr.value ), MAX_URL_LENGTH - 1 );
      memcpy( normalized, attr.value, copyLength );
      normalized[copyLength] = '\0';

      // the result of normalizeURL is used as the "URL is relative" flag
      const bool relative = normalizeURL( normalized );

      // if special url tags are requested, the url of the anchor text
      // is stored in the document itself
      const TaggedTextParser::tag_properties* urlTag = relative ? _relativeUrlTag : _absoluteUrlTag;

      _p_conflater->conflate( te );

      // hack: number of terms injected into the document for this URL
      int injected = 0;

      if ( urlTag && !urlTag->exclude && !_exclude ) {
        // same flag check as TaggedTextParser::writeToken
        if ( !_exclude && _include ) {
          // The href value is inserted at the current position of the
          // terms vector, with one TermExtent per inserted term in the
          // positions vector.
          const int urlLength = (int)strlen( normalized );

          // hack to make the whole url available to harvest links:
          // the first copy in the parser's buffer is kept intact as a term
          char* wholeUrl = _urlBuffer.write( urlLength + 1 );
          memcpy( wholeUrl, normalized, urlLength + 1 );
          _document.terms.push_back( wholeUrl );

          // the second copy is chopped up into tokens in place
          char* tokenized = _urlBuffer.write( urlLength + 1 );
          memcpy( tokenized, normalized, urlLength + 1 );
          injected++;
          tokens_excluded--;
          // end hack -- dmf

          // skip the beginning stuff (http://): tokenizing starts right
          // after the last "//" found in the URL
          char* tokenStart = tokenized;
          for ( char* p = tokenized; *p; p++ ) {
            if ( p[0] == '/' && p[1] == '/' )
              tokenStart = p + 2;
          }

          // every run of ASCII letters/digits becomes a term; all other
          // characters are overwritten with terminators
          bool insideToken = false;
          for ( char* p = tokenStart; *p; p++ ) {
            const bool isAlnum = ( *p >= 'A' && *p <= 'Z' )
                              || ( *p >= 'a' && *p <= 'z' )
                              || ( *p >= '0' && *p <= '9' );

            if ( !isAlnum ) {
              *p = 0;
              insideToken = false;
              continue;
            }

            if ( !insideToken ) {
              insideToken = true;
              _document.terms.push_back( p );
              // decrement number of tokens removed from the stream
              // so that future field positions line up correctly.
              tokens_excluded--;
              injected++;
            }
          }

          // Give each injected term a one-wide extent, counting up from
          // the attribute's begin offset; with relative URLs there may be
          // no matching text in the document to point at.
          int position = attr.begin;
          for ( int k = 0; k < injected; k++ ) {
            TermExtent extent;
            extent.begin = position;
            extent.end = position + 1;
            _document.positions.push_back( extent );
            position++;
          }
        }

        // wrap the injected terms in the url tag
        addTag( urlTag->name, urlTag->name, te->pos );
        endTag( urlTag->name, urlTag->name, te->pos + injected );
      }

      // the anchor tag starts after whatever was injected
      if ( _anchorTag && !_anchorTag->exclude && !_exclude )
        addTag( _anchorTag->name, _anchorTag->name, te->pos + injected );

      sawHref = true;
    }

    if ( !sawHref )
      indri::parse::TaggedTextParser::handleTag( te );

    return;
  }

  // <BASE HREF ...> tag
  bool sawHref = false;

  for ( indri::utility::greedy_vector<indri::parse::AttributeValuePair,2>::iterator it = te->attributes.begin();
        it != te->attributes.end();
        ++it ) {
    indri::parse::AttributeValuePair& attr = *it;

    if ( strcmp( attr.attribute, "href" ) != 0 )
      continue;

    // the URL has already been extracted into attr.value
    prepURL( attr.value );

    // normalize a local, length-limited copy of the URL
    char normalized[MAX_URL_LENGTH*4];
    const int rawLength = lemur_compat::min<int>( (int)strlen( attr.value ), MAX_URL_LENGTH - 1 );
    memcpy( normalized, attr.value, rawLength );
    normalized[rawLength] = '\0';

    normalizeURL( normalized );

    // store the normalized result as the new base URL
    const int baseLength = lemur_compat::min<int>( (int)strlen( normalized ), MAX_URL_LENGTH - 1 );
    memcpy( base_url, normalized, baseLength );
    base_url[baseLength] = '\0';

    sawHref = true;
  }

  if ( !sawHref )
    TaggedTextParser::handleTag( te );
}