Beispiel #1
0
// Resets per-document state before parsing begins.
//
// After delegating to TaggedTextParser::initialize, this clears url and
// base_url, then looks through tokenized->metadata for a "dochdr" entry.
// The leading run of non-whitespace characters in that entry's value
// becomes url; base_url is the same text truncated at its last '/'.
// Both strings are passed through normalizeURL.  The "absolute-url",
// "relative-url" and "a" tag definitions are looked up, and a "url"
// metadata pair is appended to parsed->metadata unless a "url" key was
// seen during the scan.  Finally the URL buffer is cleared and grown.
void indri::parse::HTMLParser::initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ) {
  indri::parse::TaggedTextParser::initialize( tokenized, parsed );

  // both URL strings start out empty
  url[0] = 0;
  base_url[0] = 0;

  bool sawUrlKey = false;

  // Walk the metadata.  The walk stops at the first "dochdr" entry, so a
  // "url" key is only noticed if it appears before that entry.
  size_t index = 0;
  while( index < tokenized->metadata.size() ) {
    const char* key = tokenized->metadata[index].key;

    if( strcmp( key, "url" ) == 0 )
      sawUrlKey = true;

    if( strcmp( key, "dochdr" ) == 0 ) {
      // the URL is everything up to the first blank, tab, CR or LF
      char* headerText = (char*) tokenized->metadata[index].value;
      char* urlStop = headerText + strcspn( headerText, " \t\r\n" );
      int length = lemur_compat::min<int>( urlStop-headerText, sizeof url-1 );
      memcpy( url, headerText, length );
      url[length] = 0;

      // base_url is the same text, cut at its final slash (if there is one)
      strncpy( base_url, url, sizeof url-1 );
      base_url[length] = 0;
      char* finalSlash = strrchr( base_url, '/' );
      if( finalSlash != 0 )
        *finalSlash = 0;
      break;
    }

    index++;
  }

  // normalize the document URL, then the base URL
  normalizeURL(url);
  normalizeURL(base_url);

  // look up the tag definitions used when handling anchors
  _absoluteUrlTag = _findTag("absolute-url");
  _relativeUrlTag = _findTag("relative-url");
  _anchorTag = _findTag("a");

  // publish the URL as metadata when no "url" key was seen above
  if( !sawUrlKey ) {
    indri::parse::MetadataPair urlPair;
    urlPair.key = "url";
    urlPair.value = url;
    urlPair.valueLength = (int)strlen(url)+1;
    parsed->metadata.push_back( urlPair );
  }

  // reset the buffer that backs injected URL terms and give it a fixed size
  _urlBuffer.clear();
  _urlBuffer.grow( 1024 * 1024 * 25 );
}
Beispiel #2
0
/**
 * Build an Interpreter from an XML document held in memory.
 *
 * The string is parsed with a XercesDOMParser; on success the resulting
 * document is adopted by the implementation object, the normalized base
 * URL is stored and the implementation is registered via addInstance.
 * Any of the Xerces exception types caught below is converted with
 * ERROR_PLATFORM_THROW, carrying the exception's message.
 *
 * @param xml     The XML text to parse.
 * @param baseURL Base URL, passed through normalizeURL before being stored.
 * @return The Interpreter wrapping the new implementation object.
 */
Interpreter Interpreter::fromXML(const std::string& xml, const std::string& baseURL) {
	URL absUrl = normalizeURL(baseURL);

	std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
	Interpreter result(impl);

	std::unique_ptr<XERCESC_NS::XercesDOMParser> domParser(new XERCESC_NS::XercesDOMParser());
	std::unique_ptr<XERCESC_NS::ErrorHandler> errorSink(new XERCESC_NS::HandlerBase());

	try {
		// configure the parser
		domParser->setValidationScheme(XERCESC_NS::XercesDOMParser::Val_Always);
		domParser->setDoNamespaces(true);
		domParser->useScanner(XERCESC_NS::XMLUni::fgWFXMLScanner);
		domParser->setErrorHandler(errorSink.get());

		// parse straight out of the caller's string; "fake" is the buffer id
		XERCESC_NS::MemBufInputSource source(reinterpret_cast<const XMLByte*>(xml.c_str()), xml.size(), X("fake"));
		domParser->parse(source);

		impl->_document = domParser->adoptDocument();
		impl->_baseURL = absUrl;
		InterpreterImpl::addInstance(impl);

	} catch (const XERCESC_NS::SAXParseException& e) {
		ERROR_PLATFORM_THROW(X(e.getMessage()).str());
	} catch (const XERCESC_NS::RuntimeException& e) {
		ERROR_PLATFORM_THROW(X(e.getMessage()).str());
	} catch (const XERCESC_NS::XMLException& e) {
		ERROR_PLATFORM_THROW(X(e.getMessage()).str());
	} catch (const XERCESC_NS::DOMException& e) {
		ERROR_PLATFORM_THROW(X(e.getMessage()).str());
	}

	return result;
}
Beispiel #3
0
// Rewrites a style URL that starts with the `mapbox` prefix.
//
// URLs that do not start with the prefix are returned unchanged.
// Otherwise the text between the prefix and the first '.' in the URL is
// taken as the user name and the URL is passed to normalizeURL with the
// path prefix "/styles/v1/<user>/" and the given access token.
std::string normalizeStyleURL(const std::string& url, const std::string& accessToken) {
    const bool hasMapboxPrefix = url.compare(0, mapbox.length(), mapbox) == 0;
    if (!hasMapboxPrefix) {
        return url;
    }

    // the user name runs from the end of the prefix up to the first dot
    const std::string::size_type userStart = mapbox.length();
    const std::string::size_type dotPos = url.find('.');
    const std::string user = url.substr(userStart, dotPos - userStart);

    const std::string pathPrefix = "/styles/v1/" + user + "/";
    return normalizeURL(url, pathPrefix, accessToken);
}
Beispiel #4
0
// Rewrites a source URL that starts with the `mapbox` prefix.
//
// URLs without the prefix are returned unchanged.  Otherwise ".json" is
// appended, the result is passed to normalizeURL with the "/v4/" path
// prefix and the access token, and "&secure" is appended to what comes
// back.
std::string normalizeSourceURL(const std::string& url, const std::string& accessToken) {
    if (url.compare(0, mapbox.length(), mapbox) == 0) {
        std::string tileJsonURL = normalizeURL(url + ".json", "/v4/", accessToken);

        // The secure flag tells the server to answer a TileJSON request
        // with SSL-ified resource references.
        tileJsonURL += "&secure";
        return tileJsonURL;
    }

    return url;
}
Beispiel #5
0
// Creates a request object matching the scheme of the normalized URL.
//
// The file scheme yields a FileResourceRequest, the http/https/ftp schemes
// an HTTPResourceRequest, and the atp scheme an AssetResourceRequest; each
// is constructed with `parent` and the normalized URL.  Any other scheme is
// logged with qDebug and nullptr is returned.
ResourceRequest* ResourceManager::createResourceRequest(QObject* parent, const QUrl& url) {
    auto target = normalizeURL(url);
    auto targetScheme = target.scheme();

    if (targetScheme == URL_SCHEME_FILE) {
        return new FileResourceRequest(parent, target);
    }

    const bool usesHttpRequest = targetScheme == URL_SCHEME_HTTP ||
                                 targetScheme == URL_SCHEME_HTTPS ||
                                 targetScheme == URL_SCHEME_FTP;
    if (usesHttpRequest) {
        return new HTTPResourceRequest(parent, target);
    }

    if (targetScheme == URL_SCHEME_ATP) {
        return new AssetResourceRequest(parent, target);
    }

    // nothing matched: report the scheme together with the caller's URL
    qDebug() << "Unknown scheme (" << targetScheme << ") for URL: " << url.url();

    return nullptr;
}
Beispiel #6
0
// Normalizes a QUrl.
//
// The URL's string form is first run through the string-based normalizeURL.
// If the resulting scheme is one of file/http/https/ftp/atp the URL is
// returned as is.  Otherwise the URL text is retried with the file scheme
// and ":///" in front; that variant is returned when it maps to a non-empty
// local file path, and the normalized URL is returned when it does not.
QUrl ResourceManager::normalizeURL(const QUrl& originalUrl) {
    QUrl url = QUrl(normalizeURL(originalUrl.toString()));
    auto scheme = url.scheme();

    const bool recognizedScheme =
        scheme == URL_SCHEME_FILE ||
        scheme == URL_SCHEME_HTTP || scheme == URL_SCHEME_HTTPS || scheme == URL_SCHEME_FTP ||
        scheme == URL_SCHEME_ATP;
    if (recognizedScheme) {
        return url;
    }

    // On Windows a plain path such as c:/filename parses with the drive
    // letter as its scheme; work around that by retrying as a file URL.
    QUrl fileCandidate{ URL_SCHEME_FILE + ":///" + url.toString() };
    if (fileCandidate.toLocalFile().isEmpty()) {
        return url;
    }
    return fileCandidate;
}
Beispiel #7
0
// Creates a request object matching the scheme of the normalized URL and
// moves it to the manager's thread.
//
// The file scheme yields a FileResourceRequest, the http/https/ftp schemes
// an HTTPResourceRequest, and the atp scheme an AssetResourceRequest.  Any
// other scheme is logged with qDebug and nullptr is returned.  `parent` is
// not used by this variant.
ResourceRequest* ResourceManager::createResourceRequest(QObject* parent, const QUrl& url) {
    auto target = normalizeURL(url);
    auto targetScheme = target.scheme();

    ResourceRequest* created = nullptr;

    if (targetScheme == URL_SCHEME_FILE) {
        created = new FileResourceRequest(target);
    } else if (targetScheme == URL_SCHEME_HTTP || targetScheme == URL_SCHEME_HTTPS || targetScheme == URL_SCHEME_FTP) {
        created = new HTTPResourceRequest(target);
    } else if (targetScheme == URL_SCHEME_ATP) {
        created = new AssetResourceRequest(target);
    }

    if (!created) {
        // no branch above matched the scheme
        qDebug() << "Unknown scheme (" << targetScheme << ") for URL: " << url.url();
        return nullptr;
    }
    Q_ASSERT(created);

    created->moveToThread(&_thread);
    return created;
}
Beispiel #8
0
/**
 * Build an Interpreter from an existing DOM element.
 *
 * A fresh document is created through the "core" DOM implementation and a
 * deep copy of the element is imported into it and appended as a child.
 * The normalized base URL is stored and the implementation object is
 * registered via addInstance.
 *
 * @param scxml   Element to copy into the new document.
 * @param baseURL Base URL, passed through normalizeURL before being stored.
 * @return The Interpreter wrapping the new implementation object.
 */
Interpreter Interpreter::fromElement(XERCESC_NS::DOMElement* scxml, const std::string& baseURL) {
	URL absUrl = normalizeURL(baseURL);

	std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
	Interpreter result(impl);

	// Work on a copy of the given DOM so that event listeners attached to
	// the original are left behind.
	XERCESC_NS::DOMImplementation* domImpl = XERCESC_NS::DOMImplementationRegistry::getDOMImplementation(X("core"));
	impl->_document = domImpl->createDocument();

	// deep import (needed to support xpath test150), then attach to the new document
	XERCESC_NS::DOMNode* imported = impl->_document->importNode(scxml, true);
	impl->_document->appendChild(imported);

	impl->_baseURL = absUrl;

	InterpreterImpl::addInstance(impl);
	return result;
}
Beispiel #9
0
/**
 * Build an Interpreter by parsing the document found at a URL.
 *
 * The URL is normalized and its string form is handed to a XercesDOMParser.
 * On success the parsed document is adopted, the normalized URL is stored
 * as the base URL and the implementation object is registered via
 * addInstance.  If one of the Xerces exception types caught below is
 * thrown, its message is logged through LOGD(USCXML_ERROR) and the
 * Interpreter is still returned; the three steps above are skipped from the
 * point of failure.
 *
 * @param url Location of the document to parse.
 * @return The Interpreter wrapping the new implementation object.
 */
Interpreter Interpreter::fromURL(const std::string& url) {
	URL absUrl = normalizeURL(url);

	std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
	Interpreter result(impl);

	std::unique_ptr<XERCESC_NS::XercesDOMParser> domParser(new XERCESC_NS::XercesDOMParser());
	domParser->setValidationScheme(XERCESC_NS::XercesDOMParser::Val_Always);
	domParser->setDoNamespaces(true);

	// there is no real schema, so use the well-formedness scanner
	domParser->useScanner(XERCESC_NS::XMLUni::fgWFXMLScanner);

	std::unique_ptr<XERCESC_NS::ErrorHandler> errorSink(new XERCESC_NS::HandlerBase());
	domParser->setErrorHandler(errorSink.get());

	try {
		// the parser is given the string form of the normalized URL
		std::string location = absUrl;
		domParser->parse(location.c_str());

		impl->_document = domParser->adoptDocument();
		impl->_baseURL = absUrl;
		InterpreterImpl::addInstance(impl);
	} catch (const XERCESC_NS::SAXParseException& e) {
		LOGD(USCXML_ERROR) << X(e.getMessage());
	} catch (const XERCESC_NS::RuntimeException& e) {
		LOGD(USCXML_ERROR) << X(e.getMessage());
	} catch (const XERCESC_NS::XMLException& e) {
		LOGD(USCXML_ERROR) << X(e.getMessage());
	} catch (const XERCESC_NS::DOMException& e) {
		LOGD(USCXML_ERROR) << X(e.getMessage());
	}

	return result;
}
Beispiel #10
0
/**
 * Build an Interpreter from an existing DOM document.
 *
 * With `copy` set, a fresh document is created through the "core" DOM
 * implementation and a deep copy of the given document's root element is
 * imported and appended to it.  Without `copy`, the given document pointer
 * is stored directly.  In both cases the normalized base URL is stored and
 * the implementation object is registered via addInstance.
 *
 * @param dom     Source document.
 * @param baseURL Base URL, passed through normalizeURL before being stored.
 * @param copy    Whether to work on a copy instead of `dom` itself.
 * @return The Interpreter wrapping the new implementation object.
 */
Interpreter Interpreter::fromDocument(XERCESC_NS::DOMDocument* dom, const std::string& baseURL, bool copy) {
	URL absUrl = normalizeURL(baseURL);

	std::shared_ptr<InterpreterImpl> impl(new InterpreterImpl());
	Interpreter result(impl);

	if (!copy) {
		// use the caller's document as is
		impl->_document = dom;
	} else {
		// Work on a copy of the given DOM so that event listeners attached
		// to the original are left behind.
		XERCESC_NS::DOMImplementation* domImpl = XERCESC_NS::DOMImplementationRegistry::getDOMImplementation(X("core"));
		impl->_document = domImpl->createDocument();

		// deep import of the root element (needed to support xpath test150)
		XERCESC_NS::DOMNode* imported = impl->_document->importNode(dom->getDocumentElement(), true);
		impl->_document->appendChild(imported);
	}

	impl->_baseURL = absUrl;

	InterpreterImpl::addInstance(impl);
	return result;
}
Beispiel #11
0
// Rewrites a glyphs URL that starts with the `mapbox` prefix.
//
// URLs without the prefix are returned unchanged; all others are passed to
// normalizeURL with the "/v4/" path prefix and the given access token.
std::string normalizeGlyphsURL(const std::string& url, const std::string& accessToken) {
    const bool hasMapboxPrefix = url.compare(0, mapbox.length(), mapbox) == 0;
    if (hasMapboxPrefix) {
        return normalizeURL(url, "/v4/", accessToken);
    }

    return url;
}
Beispiel #12
0
// Handles a single tag event.
//
//  * <a> tags: for every "href" attribute the value is run through prepURL,
//    copied into a local buffer and passed to normalizeURL.  Depending on the
//    boolean normalizeURL returns, the relative-url or absolute-url tag
//    definition is selected.  When that definition exists and is not
//    excluded, the URL is injected into _document.terms / _document.positions
//    (once whole, then split into alphanumeric pieces) and wrapped in the
//    selected tag; afterwards the anchor tag itself is opened.  If no "href"
//    attribute is found the event is forwarded to TaggedTextParser::handleTag.
//  * <base> tags: the "href" value is prepared and normalized the same way
//    and stored in base_url; without an "href" the event is forwarded.
//  * Every other tag is forwarded to TaggedTextParser::handleTag.
void indri::parse::HTMLParser::handleTag( TagEvent* te ) {

  // All tag names and attribute names will have been case folded by
  // the Tokenizer.

  if ( ! strcmp( te->name, "a" ) ) {  // <A HREF ...> tag

    bool handled_tag = false;

    // Check for an "href" attribute:

    for ( indri::utility::greedy_vector<indri::parse::AttributeValuePair>::iterator
            i = te->attributes.begin(); i != te->attributes.end(); i++ ) {
            
      if ( ! strcmp( (*i).attribute, "href" ) ) {

        // None of the three tag definitions is available, so there is
        // nothing to record.  Note that this returns from the whole
        // function without forwarding the event to TaggedTextParser.
        if ( ! _anchorTag && ! _relativeUrlTag && ! _absoluteUrlTag )
          return;

        // URL has already been extracted and is stored in (*i).value

        prepURL( (*i).value );

        // Work on a local copy limited to MAX_URL_LENGTH - 1 characters;
        // the buffer itself is four times that size.
        char tmp_buf[MAX_URL_LENGTH*4];
        strncpy( tmp_buf, (*i).value, lemur_compat::min<int>( strlen( (*i).value ), MAX_URL_LENGTH - 1 ) );
        tmp_buf[lemur_compat::min<int>( strlen( (*i).value ), MAX_URL_LENGTH - 1 )] = '\0';
        
        // normalizeURL rewrites tmp_buf in place; its return value is
        // used below to choose between the relative and absolute URL tag.
        bool relative = normalizeURL( tmp_buf );

        // if special url tags are requested, we'll
        // store the url of the anchor text in the document itself
        
        const TaggedTextParser::tag_properties* tagProps;
        if( !relative ) {
          tagProps = _absoluteUrlTag;
        } else {
          tagProps = _relativeUrlTag;
        }
        
        _p_conflater->conflate( te );
        // cnt counts the terms injected for this href; it is used to
        // offset the tag positions passed to addTag/endTag below.
        int cnt=0;
        if( tagProps && !tagProps->exclude && !_exclude ) {
          
          // Original flag check from TaggedTextParser::writeToken
          // (_exclude is already known to be false here because of the
          // enclosing condition, so this effectively tests _include).
          if ( ! ( _exclude || ! _include ) ) {

            // A HREF attribute value needs to be inserted at the
            // current position in the terms vector.  A TermExtent for
            // the attribute value needs to be inserted at the current
            // position in the positions vector.
          
            // Need to get position of attribute value from
            // AttributeValuePair

            //strip scheme, tokenize url, inject into positions and terms

            int len = (int)strlen( tmp_buf );
            // Allocate space within HTMLParser's Buffer
            // NOTE(review): the pointers pushed onto _document.terms below
            // point into _urlBuffer; presumably the buffer must not
            // relocate while the document is in use -- confirm.
            char* write_location = _urlBuffer.write( len + 1 );
            memcpy( write_location, tmp_buf, len + 1 );
            // hack to make whole url available to harvest links
            // (first copy: pushed as a single, unsplit term)
            _document.terms.push_back( write_location );
            // second copy: this one is split in place by the loops below
            write_location = _urlBuffer.write( len + 1 );
            memcpy( write_location, tmp_buf, len + 1 );
            cnt++; tokens_excluded--;
            // end hack -- dmf
            char *c;
            char *urlText=write_location;
            // true while the previous character was not alphanumeric, so
            // the next alphanumeric character starts a new term
            bool lastSkipped = true; 
        
            // skip the beginning stuff (http://)
            // NOTE(review): the loop does not stop at the first "//", so
            // urlText ends up just past the LAST "//" in the URL.
            for( c = urlText; *c; c++ ) {
              if( *c == '/' && c[1] && c[1] == '/' ) {
                urlText = c + 2;                            
              }
            }
            // Split the remaining text: each run of ASCII letters/digits
            // becomes one term, every other character is overwritten with
            // a terminator so the runs are separate C strings.
            for( c = urlText; *c; c++ ) {              
              if( *c >= 'A' && *c <= 'Z' ||
                  *c >= 'a' && *c <= 'z' ||
                  *c >= '0' && *c <= '9' ) 
                {
                  if( lastSkipped ) {
                    lastSkipped = false;
                    _document.terms.push_back( c );
                    // decrement number of tokens removed from the stream 
                    // so that future field positions line up correctly.
                    tokens_excluded--;
                    cnt++;
                  }
                } else {
                  lastSkipped = true;
                  *c = 0;
                }            
            }

            int tokBegin = (*i).begin;
            // update the positions: one extent per injected term, each one
            // unit wide, numbered consecutively from the attribute's begin.
            for (size_t n = _document.terms.size()-cnt; n < _document.terms.size(); n++) {
              // cant be sure there's actually text in document with relative
              TermExtent extent;
              extent.begin = tokBegin++;
              extent.end = tokBegin;
              _document.positions.push_back( extent );
            }
          }          
          // wrap the cnt injected terms in the selected URL tag
          addTag( tagProps->name, tagProps->name, te->pos );
          endTag( tagProps->name, tagProps->name, te->pos + cnt );
        }
        
        // open the anchor tag after the injected URL terms
        tagProps = _anchorTag;
        if( tagProps && !tagProps->exclude && !_exclude )
          addTag( tagProps->name, tagProps->name, te->pos + cnt );

        handled_tag = true;
      }
    }

    // no href attribute was found: fall back to the generic handling
    if ( ! handled_tag ) indri::parse::TaggedTextParser::handleTag( te );
  
  } else if ( ! strcmp( te->name, "base" ) ) { // <BASE HREF ...> tag

    bool handled_tag = false;

    // NOTE(review): this iterator type spells out a second template
    // argument (2) while the <a> branch above does not; presumably both
    // name the same type -- confirm.
    for ( indri::utility::greedy_vector<indri::parse::AttributeValuePair,2>::iterator
            i = te->attributes.begin(); i != te->attributes.end(); i++ ) {

      if ( ! strcmp( (*i).attribute, "href" ) ) {

        // URL has already been extracted and is stored in (*i).value

        prepURL( (*i).value );

        int len = (int)strlen( (*i).value );

        // local copy limited to MAX_URL_LENGTH - 1 characters
        char tmp_buf[MAX_URL_LENGTH*4];
        strncpy( tmp_buf, (*i).value, lemur_compat::min<int>( len, MAX_URL_LENGTH - 1) );
        tmp_buf[lemur_compat::min<int>( strlen( (*i).value ), MAX_URL_LENGTH - 1 )] = '\0';

        normalizeURL( tmp_buf );
        
        // store the normalized value as the new base URL, again limited
        // to MAX_URL_LENGTH - 1 characters plus terminator
        len = (int)strlen( tmp_buf );
        strncpy( base_url, tmp_buf, lemur_compat::min<int>( len, MAX_URL_LENGTH-1 ) );
        base_url[lemur_compat::min<int>( len, MAX_URL_LENGTH - 1 )] = '\0';

        handled_tag = true;
      }
    }

    // no href attribute was found: fall back to the generic handling
    if ( ! handled_tag ) TaggedTextParser::handleTag( te );

  } else { // any other tag

    TaggedTextParser::handleTag( te );
  }
}