void QueryDocument::parseQuery ( XProcessor& xproc, bool isQuery ) { ElementRef cookiesList = queryElement.getDocument().createElement ( queryElement, xem_web.cookies() ); queryElement.appendLastChild ( cookiesList ); ElementRef headersList = queryElement.getDocument().createElement ( queryElement, xem_web.headers() ); queryElement.appendLastChild ( headersList ); QParseState state = QParseState_HeadLine; String accu = ""; String method; ElementRef headerElement(*this); ElementRef responseElement(*this); ElementRef urlElement(*this); ElementRef urlParams(*this); ElementRef urlParam(*this); ElementRef cookieElement(*this); __ui64 contentLength = 0; bool transferEncodingChunked = false; #define Invalid() throwException(Exception, "Invalid character %d at state %d\n", r, state ); while ( state != QParseState_FinishedHTTPHeader && !reader.isFinished()) { int r = reader.getNextChar(); if ( r >= 0x80 ) { throwException ( Exception, "Invalid char %d\n", r ); } // Log_QParse ( "state=%d, r=%x (%d, %c)\n", state, r, r, r ); switch ( state ) { case QParseState_HeadLine: if ( r == ' ' ) { Log_QParse ( "Method : '%s'\n", accu.c_str() ); if ( isQuery ) { if ( accu == "GET" || accu == "POST" ) { ElementRef methodElement = addTextualElement ( queryElement, xem_web.method(), accu ); method = methodElement.getText(); state = QParseState_URLStart; accu = ""; continue; } else Invalid(); } else { if ( accu == "HTTP/1.1" ) { Log_QParse ( "Got a Response, protocol=%s\n", accu.c_str() ); addTextualElement ( queryElement, xem_web.protocol(), accu ); accu = ""; state = QParseState_ResponseCode; responseElement = createElement ( headersList, xem_web.response() ); headersList.appendLastChild ( responseElement ); } else { Invalid(); } } } else if ( r == '\r' || r == '\n' ) { Invalid(); } accu.appendUtf8(r); break; case QParseState_URLStart: if ( r == '/' ) { state = QParseState_URL; continue; } Invalid(); break; case QParseState_URL: if ( r == ' ' || r == '?' ) { Log_QParse ( "Url : '%s'\n", accu.c_str() ); urlElement = createElement ( queryElement, xem_web.url() ); queryElement.appendLastChild ( urlElement ); addTextualElement ( urlElement, xem_web.base(), accu ); accu = ""; if ( r == ' ' ) { state = QParseState_Protocol; } else { state = QParseState_URLParam; } continue; } else if ( r == '?' ) { Invalid(); } else if ( r == '\r' || r == '\n' ) { Invalid(); } else if ( r == '%' ) { Invalid (); } accu.appendUtf8(r); break; case QParseState_URLParam: if ( r == '\n' || r == '%' || r == '\r' ) { Invalid(); } else if ( r == '=' || r == ' ' || r == '&' ) { if ( ! urlParams ) { urlParams = createElement ( urlElement, xem_web.parameters() ); urlElement.appendLastChild ( urlParams ); } Log_QParse ( "Param : '%s'\n", accu.c_str() ); urlParam = createElement ( urlParams, xem_web.param() ); urlParams.appendLastChild ( urlParam ); urlParam.addAttr ( xem_web.name(), accu ); accu = ""; if ( r == '=' ) state = QParseState_URLParamValue; else if ( r == '&' ) state = QParseState_URLParam; else if ( r == ' ' ) state = QParseState_Protocol; else { Invalid(); } continue; } accu.appendUtf8(r); break; case QParseState_URLParamValue: if ( r == '&' || r == ' ') { Log_QParse ( "Param Value : '%s'\n", accu.c_str() ); ElementRef valueNode = createTextNode ( urlParam, accu ); urlParam.appendLastChild ( valueNode ); urlParam = ElementRef(*this); accu = ""; if ( r == '&' ) state = QParseState_URLParam; else state = QParseState_Protocol; continue; } accu.appendUtf8(r); break; case QParseState_Protocol: if ( r == ' ' ) { Invalid(); } else if ( r == '\r' ) { continue; } else if ( r == '\n' ) { Log_QParse ( "Protocol = '%s'\n", accu.c_str() ); addTextualElement ( queryElement, xem_web.protocol(), accu ); accu = ""; state = QParseState_BeginLine; continue; } accu.appendUtf8(r); break; case QParseState_ResponseCode: if ( r == ' ' ) { Log_QParse ( "ResponseCode : %s\n", accu.c_str() ); responseElement.addAttr ( xem_web.response_code(), accu ); accu = ""; state = QParseState_ResponseString; continue; } if ( '0' <= r && r <= '9' ) { accu.appendUtf8(r); continue; } Invalid(); break; case QParseState_ResponseString: if ( r == '\r' ) continue; else if ( r == '\n' ) { Log_QParse ( "ResponseString : %s\n", accu.c_str() ); responseElement.addAttr ( xem_web.response_string(), accu ); accu = ""; state = QParseState_BeginLine; continue; } accu.appendUtf8(r); break; case QParseState_BeginLine: if ( r == '\r' ) continue; else if ( r == '\n' ) { state = QParseState_FinishedHTTPHeader; break; } else if ( r == ' ' ) { Invalid(); } accu.appendUtf8(r); state = QParseState_FieldName; continue; case QParseState_FieldName: if ( r == ':' ) { Log_QParse ( "Field : '%s'\n", accu.c_str() ); if ( accu == "Cookie" ) { accu = ""; state = QParseState_CookieStart; continue; } headerElement = createElement ( headersList, xem_web.param() ); headersList.appendLastChild ( headerElement ); headerElement.addAttr ( xem_web.name(), accu ); accu = ""; state = QParseState_PostFieldName; continue; } else if ( r == ' ' || r == '\n' || r == '\r' ) { Invalid(); } accu.appendUtf8(r); break; case QParseState_PostFieldName: if ( r == ' ' ) { state = QParseState_Value; continue; } Invalid(); case QParseState_Value: if ( r == '\r') continue; else if ( r == '\n' ) { Log_QParse ( "Field value : '%s'\n", accu.c_str() ); ElementRef valueNode = createTextNode ( headerElement, accu ); headerElement.appendLastChild ( valueNode ); String fieldName = headerElement.getAttr(xem_web.name()); #ifdef __XEM_WEBSERVER_QUERYDOCUMENT_HAS_HEADERFIELDSMAP headerFieldsMap[fieldName] = valueNode.getText(); #endif if ( fieldName == "Content-Length" ) { contentLength = accu.toUI64(); Log_QParse ( "ContentLength : %llu\n", contentLength ); } else if ( fieldName == "Transfer-Encoding" && accu == "chunked" ) { Log_QParse ( "TransferEncoding chuncked !\n" ); transferEncodingChunked = true; } headerElement = ElementRef(*this); accu = ""; state = QParseState_BeginLine; continue; } accu.appendUtf8(r); break; case QParseState_CookieStart: if ( r == ' ' ) continue; accu.appendUtf8(r); state = QParseState_CookieName; break; case QParseState_CookieName: if ( r == '=' ) { Log_QParse ( "Cookie name : '%s'\n", accu.c_str() ); cookieElement = createElement ( cookiesList, xem_web.cookie() ); cookieElement.addAttr ( xem_web.name(), accu ); cookiesList.appendLastChild ( cookieElement ); accu = ""; state = QParseState_CookieValue; continue; } accu.appendUtf8(r); break; case QParseState_CookieValue: if ( r == '\r' ) continue; if ( r == ';' || r == '\n' ) { Log_QParse ( "Cookie value : '%s'\n", accu.c_str() ); ElementRef cookieValueNode = createTextNode ( cookieElement, accu ); cookieElement.appendLastChild ( cookieValueNode ); accu = ""; if ( r == ';' ) state = QParseState_CookieStart; else state = QParseState_BeginLine; continue; } accu.appendUtf8(r); break; default: Bug ( "Case %d Not implemented !\n", state ); } } if ( state != QParseState_FinishedHTTPHeader ) { throwException ( Exception, "HTTP reader not finished ! state=%d\n", state ); } if ( contentLength || transferEncodingChunked ) { if ( !isQuery || method == "POST" ) { ElementRef content = createElement ( queryElement, xem_web.content() ); queryElement.appendLastChild ( content ); BlobRef blob = content.addBlob(xem_web.blob_contents()); if ( contentLength ) parseToBlob(blob, contentLength); else parseChunkedToBlob(blob); } else { throwException ( Exception, "Invalid content-length with method = %s\n", method.c_str() ); } } }
void Url::parseString(const char* urlString, UtlBoolean isAddrSpec) { // If isAddrSpec: // userinfo@hostport;uriParameters?headerParameters // If !isAddrSpec: // DisplayName<userinfo@hostport;urlParameters?headerParameters>;fieldParameters # ifdef TIME_PARSE OsTimeLog timeLog; LOG_TIME("start "); # endif // Try to catch when a name-addr is passed but we are expecting an // addr-spec -- many name-addr's start with '<' or '"'. if (isAddrSpec && (urlString[0] == '<' || urlString[0] == '"')) { OsSysLog::add(FAC_SIP, PRI_ERR, "Url::parseString Invalid addr-spec found (probably name-addr format): '%s'", urlString); } int workingOffset = 0; // begin at the beginning... size_t afterAngleBrackets = UTL_NOT_FOUND; if (isAddrSpec) { mAngleBracketsIncluded = FALSE; } else // ! addr-spec { // Is there a display name on the front? mDisplayName.remove(0); LOG_TIME("display <"); RegEx displayName(DisplayName); if (displayName.SearchAt(urlString, workingOffset)) { LOG_TIME("display > "); switch (displayName.Matches() /* number of substrings that matched */) { case 2: // matched unquoted sequence of tokens displayName.MatchString(&mDisplayName, 1); break; case 3: // matched a double quoted string // see performance note on DisplayName mDisplayName.append("\""); displayName.MatchString(&mDisplayName, 2); mDisplayName.append("\""); break; default: assert(false); } // does not include whitespace or the '<' workingOffset = displayName.AfterMatch(0); } // Are there angle brackets around the URI? LOG_TIME("angles < "); RegEx angleBrackets(AngleBrackets); if (angleBrackets.SearchAt(urlString, workingOffset)) { LOG_TIME("angles > "); // yes, there are angle brackets workingOffset = angleBrackets.MatchStart(1); // inside the angle brackets afterAngleBrackets = angleBrackets.AfterMatch(0); // following the '>' /* * Note: We do not set mAngleBracketsIncluded just because we saw them * That is only used for explicit control from the outside. * The local knowledge of whether or not there are angle brackets * is whether or not afterAngleBrackets == UTL_NOT_FOUND */ } } /* * AMBIGUITY - there is a potential ambiguity when parsing real URLs. * * Consider the url 'foo:333' - it could be: * scheme 'foo' host '333' ('333' is a valid local host name - bad idea, but legal) * or host 'foo' port '333' (and scheme 'sip' is implied) * * Now make it worse by using 'sips' as a hostname: * 'sips:333' * scheme 'sips' host '333' * or host 'sips' port '333' (and scheme 'sip' is implied) * * We resolve the first case by treating anything left of the colon as a scheme if * it is one of the supported schemes. Otherwise, we set the scheme to the * default (sip) and go on so that it will be parsed as a hostname. This does not * do the right thing for the (scheme 'sips' host '333') case, but they get what * they deserve. */ // Parse the scheme (aka url type) LOG_TIME("scheme < "); RegEx supportedScheme(SupportedScheme); if ( (supportedScheme.SearchAt(urlString,workingOffset)) && (supportedScheme.MatchStart(0) == workingOffset) ) { LOG_TIME("scheme > "); // the scheme name matches one of the supported schemes mScheme = static_cast<Scheme>(supportedScheme.Matches()-1); workingOffset = supportedScheme.AfterMatch(0); // past the ':' } else { /* * It did not match one of the supported scheme names * so proceed on the assumption that it's a host and "sip:" is implied * Leave the workingOffset where it is (before the token). * The code below, through the parsing of host and port * treats this as an implicit 'sip:' url; if it parses ok * up to that point, it resets the scheme to SipsUrlScheme */ mScheme = UnknownUrlScheme; } // skip over any '//' following the scheme for the ones we know use that switch (mScheme) { case FileUrlScheme: case FtpUrlScheme: case HttpUrlScheme: case HttpsUrlScheme: case RtspUrlScheme: if (0==strncmp("//", urlString+workingOffset, 2)) { workingOffset += 2; } break; case UnknownUrlScheme: case SipUrlScheme: case SipsUrlScheme: case MailtoUrlScheme: default: break; } if (FileUrlScheme != mScheme) // no user part in file urls { // Parse the username and password LOG_TIME("userpass < "); RegEx usernameAndPassword(UsernameAndPassword); if ( (usernameAndPassword.SearchAt(urlString, workingOffset)) && usernameAndPassword.MatchStart(0) == workingOffset ) { LOG_TIME("userpass > "); usernameAndPassword.MatchString(&mUserId, 1); usernameAndPassword.MatchString(&mPassword, 2); workingOffset = usernameAndPassword.AfterMatch(0); } else { // username and password are optional, so not finding them is ok // leave workingOffset where it is } } // Parse the hostname and port LOG_TIME("hostport < "); RegEx hostAndPort(HostAndPort); if ( (hostAndPort.SearchAt(urlString,workingOffset)) && (hostAndPort.MatchStart(0) == workingOffset) ) { LOG_TIME("hostport > "); hostAndPort.MatchString(&mHostAddress,1); UtlString portStr; if (hostAndPort.MatchString(&portStr,2)) { mHostPort = atoi(portStr.data()); } workingOffset = hostAndPort.AfterMatch(0); if (UnknownUrlScheme == mScheme) { /* * Resolve AMBIGUITY * Since we were able to parse this as a host and port, it is now safe to * set the scheme to the implied 'sip:'. */ mScheme = SipUrlScheme; } } else { if (FileUrlScheme != mScheme) // no host is ok in a file URL { /* * This is not a file URL, so not having a recognized host name is invalid. * * Since we may have been called from a constructor, there is no way to * return an error, but at this point we know this is bad, so instead * we just log an error and set the scheme to the unknown url type and * clear any components that might have been set. */ OsSysLog::add(FAC_SIP, PRI_ERR, "Url::parseString no valid host found at char %d in '%s', " "isAddrSpec = %d", workingOffset, urlString, isAddrSpec ); mScheme = UnknownUrlScheme; mDisplayName.remove(0); mUserId.remove(0); mPassword.remove(0); } } // Next is a path if http, https, or ftp, // OR url parameters if sip or sips. // There can be no Url parameters for http, https, or ftp // because semicolon is a valid part of the path value switch ( mScheme ) { case FileUrlScheme: case FtpUrlScheme: case HttpUrlScheme: case HttpsUrlScheme: case RtspUrlScheme: { // this is an http, https, or ftp URL, so get the path LOG_TIME("path < "); RegEx urlPath(UrlPath); if ( (urlPath.SearchAt(urlString, workingOffset)) && (urlPath.MatchStart(0) == workingOffset) ) { LOG_TIME("path > "); urlPath.MatchString(&mPath,1); workingOffset = urlPath.AfterMatch(1); } } break; case SipUrlScheme: case SipsUrlScheme: { // it may have url parameters of the form ";" param "=" value ... // if it meets the right conditions: if ( isAddrSpec // in addr-spec, any param is a url param || afterAngleBrackets != UTL_NOT_FOUND // inside angle brackets there may be a url param ) { LOG_TIME("urlparm < "); RegEx urlParams(UrlParams); if ( (urlParams.SearchAt(urlString, workingOffset)) && (urlParams.MatchStart(0) == workingOffset) ) { LOG_TIME("urlparm > "); urlParams.MatchString(&mRawUrlParameters, 1); workingOffset = urlParams.AfterMatch(1); // actual parsing of the parameters is in parseUrlParameters // so that it only happens if someone asks for them. } } } break; case MailtoUrlScheme: default: // no path component break; } if (UnknownUrlScheme != mScheme) { // Parse any header or query parameters LOG_TIME("hdrparm < "); RegEx headerOrQueryParams(HeaderOrQueryParams); if( (headerOrQueryParams.SearchAt(urlString, workingOffset)) && (headerOrQueryParams.MatchStart(0) == workingOffset) ) { LOG_TIME("hdrparm > "); headerOrQueryParams.MatchString(&mRawHeaderOrQueryParameters, 1); workingOffset = headerOrQueryParams.AfterMatch(0); // actual parsing of the parameters is in parseHeaderOrQueryParameters // so that it only happens if someone asks for them. } // Parse the field parameters if (!isAddrSpec) // can't have field parameters in an addrspec { if (afterAngleBrackets != UTL_NOT_FOUND) { workingOffset = afterAngleBrackets; } LOG_TIME("fldparm < "); RegEx fieldParameters(FieldParams); if ( (fieldParameters.SearchAt(urlString, workingOffset)) && (fieldParameters.MatchStart(0) == workingOffset) ) { LOG_TIME("fldparm > "); fieldParameters.MatchString(&mRawFieldParameters, 1); // actual parsing of the parameters is in parseFieldParameters // so that it only happens if someone asks for them. } } } # ifdef TIME_PARSE UtlString timeDump; timeLog.getLogString(timeDump); printf("\n%s\n", timeDump.data()); # endif }