/**
 * Parses the head of an HTTP message (first line plus header fields) read
 * character by character from 'reader', and records what it finds as child
 * elements of queryElement:
 *   - xem_web.cookies() : one xem_web.cookie() element per parsed cookie,
 *   - xem_web.headers() : one xem_web.param() element per header field, plus a
 *                         xem_web.response() element when parsing a response,
 *   - xem_web.method(), xem_web.url(), xem_web.protocol() for requests.
 * When a body is announced (Content-Length or "Transfer-Encoding: chunked"),
 * it is read into a blob under a xem_web.content() element.
 *
 * @param xproc   Not referenced by this function body.
 * @param isQuery true to parse a request line ("GET /path?a=b HTTP/1.1"; only
 *                GET and POST are accepted), false to parse a status line
 *                (which must start with "HTTP/1.1").
 *
 * Errors are reported through throwException(Exception, ...): on an unexpected
 * character, when the reader finishes before the blank line ending the header,
 * and when a request other than POST announces a body.
 *
 * NOTE(review): header names are compared case-sensitively ("Cookie",
 * "Content-Length", "Transfer-Encoding"); HTTP field names are
 * case-insensitive, so differently-cased headers are not recognized here.
 */
void QueryDocument::parseQuery ( XProcessor& xproc, bool isQuery )
  {
    ElementRef cookiesList = queryElement.getDocument().createElement ( queryElement, xem_web.cookies() );
    queryElement.appendLastChild ( cookiesList );

    ElementRef headersList = queryElement.getDocument().createElement ( queryElement, xem_web.headers() );
    queryElement.appendLastChild ( headersList );

    QParseState state = QParseState_HeadLine;
    String accu = "";   // characters accumulated for the token currently being read
    String method;
    ElementRef headerElement(*this);
    ElementRef responseElement(*this);
    ElementRef urlElement(*this);
    ElementRef urlParams(*this);
    ElementRef urlParam(*this);
    ElementRef cookieElement(*this);

    __ui64 contentLength = 0;
    bool transferEncodingChunked = false;

// Reports the current character 'r' and parser state; relies on both names
// being in scope. Presumably throwException does not return - TODO confirm.
#define Invalid() throwException(Exception, "Invalid character %d at state %d\n", r, state );

    // Inside the switch below, 'continue' fetches the next character without
    // appending the current one to accu; 'break' does the same once the case
    // has done its own appending.
    while ( state != QParseState_FinishedHTTPHeader && !reader.isFinished())
      {
        int r = reader.getNextChar();
        if ( r >= 0x80 )
          {
            throwException ( Exception, "Invalid char %d\n", r );
          }
        // Log_QParse ( "state=%d, r=%x (%d, %c)\n", state, r, r, r );
        switch ( state )
        {
        case QParseState_HeadLine:
          // First token of the message: the method (request) or the protocol (response).
          if ( r == ' ' )
            {
              Log_QParse ( "Method : '%s'\n", accu.c_str() );
              if ( isQuery )
                {
                  if ( accu == "GET" || accu == "POST" )
                    {
                      ElementRef methodElement = addTextualElement ( queryElement, xem_web.method(), accu );
                      method = methodElement.getText();
                      state = QParseState_URLStart;
                      accu = "";
                      continue;
                    }
                  else
                    Invalid();
                }
              else
                {
                  if ( accu == "HTTP/1.1" )
                    {
                      Log_QParse ( "Got a Response, protocol=%s\n", accu.c_str() );
                      addTextualElement ( queryElement, xem_web.protocol(), accu );
                      accu = "";
                      state = QParseState_ResponseCode;
                      responseElement = createElement ( headersList, xem_web.response() );
                      headersList.appendLastChild ( responseElement );
                      // Skip the separator: without this the space would be appended
                      // to accu and end up as a leading blank in the response code.
                      continue;
                    }
                  else
                    {
                      Invalid();
                    }
                }
            }
          else if ( r == '\r' || r == '\n' )
            {
              Invalid();
            }
          accu.appendUtf8(r);
          break;
        case QParseState_URLStart:
          // The request target must begin with '/'; the slash itself is not stored.
          if ( r == '/' )
            {
              state = QParseState_URL;
              continue;
            }
          Invalid();
          break;
        case QParseState_URL:
          if ( r == ' ' || r == '?' )
            {
              Log_QParse ( "Url : '%s'\n", accu.c_str() );
              urlElement = createElement ( queryElement, xem_web.url() );
              queryElement.appendLastChild ( urlElement );
              addTextualElement ( urlElement, xem_web.base(), accu );

              accu = "";
              if ( r == ' ' )
                {
                  state = QParseState_Protocol;
                }
              else
                {
                  state = QParseState_URLParam;
                }
              continue;
            }
          else if ( r == '\r' || r == '\n' )
            {
              Invalid();
            }
          else if ( r == '%' )
            {
              // Percent-escapes are rejected in the path.
              Invalid ();
            }
          accu.appendUtf8(r);
          break;
        case QParseState_URLParam:
          if ( r == '\n' || r == '%' || r == '\r' )
            {
              Invalid();
            }
          else if ( r == '=' || r == ' ' || r == '&' )
            {
              // The parameters container is created lazily, on the first parameter.
              if ( ! urlParams )
                {
                  urlParams = createElement ( urlElement, xem_web.parameters() );
                  urlElement.appendLastChild ( urlParams );
                }
              Log_QParse ( "Param : '%s'\n", accu.c_str() );
              urlParam = createElement ( urlParams, xem_web.param() );
              urlParams.appendLastChild ( urlParam );
              urlParam.addAttr ( xem_web.name(), accu );
              accu = "";
              if ( r == '=' )
                state = QParseState_URLParamValue;
              else if ( r == '&' )
                state = QParseState_URLParam;
              else // r == ' ' : end of the request target
                state = QParseState_Protocol;
              continue;
            }
          accu.appendUtf8(r);
          break;
        case QParseState_URLParamValue:
          if ( r == '&' || r == ' ')
            {
              Log_QParse ( "Param Value : '%s'\n", accu.c_str() );
              ElementRef valueNode = createTextNode ( urlParam, accu );
              urlParam.appendLastChild ( valueNode );
              urlParam = ElementRef(*this);
              accu = "";
              if ( r == '&' )
                state = QParseState_URLParam;
              else
                state = QParseState_Protocol;
              continue;
            }
          accu.appendUtf8(r);
          break;
        case QParseState_Protocol:
          if ( r == ' ' )
            {
              Invalid();
            }
          else if ( r == '\r' )
            {
              continue;
            }
          else if ( r == '\n' )
            {
              Log_QParse ( "Protocol = '%s'\n", accu.c_str() );
              addTextualElement ( queryElement, xem_web.protocol(), accu );
              accu = "";
              state = QParseState_BeginLine;
              continue;
            }
          accu.appendUtf8(r);
          break;
        case QParseState_ResponseCode:
          // Digits only, terminated by a single space.
          if ( r == ' ' )
            {
              Log_QParse ( "ResponseCode : %s\n", accu.c_str() );
              responseElement.addAttr ( xem_web.response_code(), accu );
              accu = "";
              state = QParseState_ResponseString;
              continue;
            }
          if ( '0' <= r && r <= '9' )
            {
              accu.appendUtf8(r);
              continue;
            }
          Invalid();
          break;
        case QParseState_ResponseString:
          if ( r == '\r' )
            continue;
          else if ( r == '\n' )
            {
              Log_QParse ( "ResponseString : %s\n", accu.c_str() );
              responseElement.addAttr ( xem_web.response_string(), accu );
              accu = "";
              state = QParseState_BeginLine;
              continue;
            }
          accu.appendUtf8(r);
          break;
        case QParseState_BeginLine:
          // Start of a header line; an empty line ends the header section.
          if ( r == '\r' )
            continue;
          else if ( r == '\n' )
            {
              state = QParseState_FinishedHTTPHeader;
              break;
            }
          else if ( r == ' ' )
            {
              Invalid();
            }
          accu.appendUtf8(r);
          state = QParseState_FieldName;
          continue;
        case QParseState_FieldName:
          if ( r == ':' )
            {
              Log_QParse ( "Field : '%s'\n", accu.c_str() );
              // Cookies go to cookiesList rather than headersList.
              if ( accu == "Cookie" )
                {
                  accu = "";
                  state = QParseState_CookieStart;
                  continue;
                }
              headerElement = createElement ( headersList, xem_web.param() );
              headersList.appendLastChild ( headerElement );
              headerElement.addAttr ( xem_web.name(), accu );
              accu = "";
              state = QParseState_PostFieldName;
              continue;
            }
          else if ( r == ' ' || r == '\n' || r == '\r' )
            {
               Invalid();
            }
          accu.appendUtf8(r);
          break;
        case QParseState_PostFieldName:
          // Exactly one space is required after the ':' of a field name.
          if ( r == ' ' )
            {
              state = QParseState_Value;
              continue;
            }
          Invalid();
          break; // never fall through into QParseState_Value
        case QParseState_Value:
          if ( r == '\r')
            continue;
          else if ( r == '\n' )
            {
              Log_QParse ( "Field value : '%s'\n", accu.c_str() );
              ElementRef valueNode = createTextNode ( headerElement, accu );
              headerElement.appendLastChild ( valueNode );
              String fieldName = headerElement.getAttr(xem_web.name());
#ifdef __XEM_WEBSERVER_QUERYDOCUMENT_HAS_HEADERFIELDSMAP
              headerFieldsMap[fieldName] = valueNode.getText();
#endif
              if ( fieldName == "Content-Length" )
                {
                  contentLength = accu.toUI64();
                  Log_QParse ( "ContentLength : %llu\n", contentLength );
                }
              else if ( fieldName == "Transfer-Encoding" && accu == "chunked" )
                {
                  Log_QParse ( "TransferEncoding chuncked !\n" );
                  transferEncodingChunked = true;
                }
              headerElement = ElementRef(*this);
              accu = "";
              state = QParseState_BeginLine;
              continue;
            }
          accu.appendUtf8(r);
          break;
        case QParseState_CookieStart:
          // Skip blanks (and the CR of a line ending) before a cookie name.
          if ( r == ' ' || r == '\r' )
            continue;
          if ( r == '\n' )
            {
              // The Cookie line ended without a further cookie (empty value or a
              // trailing ';'): go back to header parsing instead of swallowing
              // the line break into a cookie name.
              state = QParseState_BeginLine;
              continue;
            }
          accu.appendUtf8(r);
          state = QParseState_CookieName;
          break;
        case QParseState_CookieName:
          if ( r == '=' )
            {
              Log_QParse ( "Cookie name : '%s'\n", accu.c_str() );
              cookieElement = createElement ( cookiesList, xem_web.cookie() );
              cookieElement.addAttr ( xem_web.name(), accu );
              cookiesList.appendLastChild ( cookieElement );

              accu = "";
              state = QParseState_CookieValue;
              continue;
            }
          else if ( r == '\r' || r == '\n' )
            {
              // A cookie name cannot span lines; otherwise the following header
              // lines would be absorbed into the name.
              Invalid();
            }
          accu.appendUtf8(r);
          break;
        case QParseState_CookieValue:
          if ( r == '\r' )
            continue;
          if ( r == ';' || r == '\n' )
            {
              Log_QParse ( "Cookie value : '%s'\n", accu.c_str() );
              ElementRef cookieValueNode = createTextNode ( cookieElement, accu );
              cookieElement.appendLastChild ( cookieValueNode );

              accu = "";
              if ( r == ';' )
                state = QParseState_CookieStart;
              else
                state = QParseState_BeginLine;
              continue;
            }
          accu.appendUtf8(r);
          break;
        default:
          Bug ( "Case %d Not implemented !\n", state );
        }
      }
    // The loop also exits when the reader runs dry; that is an error unless the
    // blank line terminating the header was seen.
    if ( state != QParseState_FinishedHTTPHeader )
      {
        throwException ( Exception, "HTTP reader not finished ! state=%d\n", state );
      }
    if ( contentLength || transferEncodingChunked )
      {
        // A body is accepted for any response, but only for POST requests.
        if ( !isQuery || method == "POST" )
          {
            ElementRef content = createElement ( queryElement, xem_web.content() );
            queryElement.appendLastChild ( content );
            BlobRef blob = content.addBlob(xem_web.blob_contents());
            // A non-zero Content-Length takes precedence over chunked encoding.
            if ( contentLength )
              parseToBlob(blob, contentLength);
            else
              parseChunkedToBlob(blob);
          }
        else
          {
            throwException ( Exception, "Invalid content-length with method = %s\n", method.c_str() );
          }
      }
  }
Beispiel #2
0
void Url::parseString(const char* urlString, UtlBoolean isAddrSpec)
{
   // If isAddrSpec:
   //                userinfo@hostport;uriParameters?headerParameters
   // If !isAddrSpec:
   //    DisplayName<userinfo@hostport;urlParameters?headerParameters>;fieldParameters

#  ifdef TIME_PARSE
   OsTimeLog timeLog;
   LOG_TIME("start    ");
#  endif

   // Try to catch when a name-addr is passed but we are expecting an
   // addr-spec -- many name-addr's start with '<' or '"'.
   if (isAddrSpec && (urlString[0] == '<' || urlString[0] == '"'))
   {
      OsSysLog::add(FAC_SIP, PRI_ERR,
                    "Url::parseString Invalid addr-spec found (probably name-addr format): '%s'",
                    urlString);
   }

   int workingOffset = 0; // begin at the beginning...
   
   size_t afterAngleBrackets = UTL_NOT_FOUND;
   
   if (isAddrSpec)
   {
      mAngleBracketsIncluded = FALSE; 
   }
   else // ! addr-spec
   {
      // Is there a display name on the front?
      mDisplayName.remove(0);
      LOG_TIME("display   <");
      RegEx displayName(DisplayName);
      if (displayName.SearchAt(urlString, workingOffset))
      {
         LOG_TIME("display   > ");
         switch (displayName.Matches() /* number of substrings that matched */)
         {
         case 2: // matched unquoted sequence of tokens
            displayName.MatchString(&mDisplayName, 1);
            break;
            
         case 3: // matched a double quoted string
            // see performance note on DisplayName
            mDisplayName.append("\"");
            displayName.MatchString(&mDisplayName, 2);
            mDisplayName.append("\"");
            break;

         default:
            assert(false);
         }

         // does not include whitespace or the '<'
         workingOffset = displayName.AfterMatch(0);
      }

      // Are there angle brackets around the URI?
      LOG_TIME("angles   < ");
      RegEx angleBrackets(AngleBrackets);
      if (angleBrackets.SearchAt(urlString, workingOffset))
      {
         LOG_TIME("angles   > ");
         // yes, there are angle brackets
         workingOffset = angleBrackets.MatchStart(1); // inside the angle brackets
         afterAngleBrackets = angleBrackets.AfterMatch(0); // following the '>'
         
         /*
          * Note: We do not set mAngleBracketsIncluded just because we saw them
          *       That is only used for explicit control from the outside.
          *       The local knowledge of whether or not there are angle brackets
          *       is whether or not afterAngleBrackets == UTL_NOT_FOUND
          */
      }
   }

      /*
       * AMBIGUITY - there is a potential ambiguity when parsing real URLs.
       *
       * Consider the url 'foo:333' - it could be:
       *       scheme 'foo' host '333' ('333' is a valid local host name - bad idea, but legal)
       *   or  host   'foo' port '333' (and scheme 'sip' is implied)
       *
       * Now make it worse by using 'sips' as a hostname:
       *   'sips:333'     
       *       scheme 'sips' host '333'
       *   or  host   'sips' port '333' (and scheme 'sip' is implied)
       *
       * We resolve the first case by treating anything left of the colon as a scheme if
       * it is one of the supported schemes.  Otherwise, we set the scheme to the
       * default (sip) and go on so that it will be parsed as a hostname.  This does not
       * do the right thing for the (scheme 'sips' host '333') case, but they get what
       * they deserve.
       */
   
   // Parse the scheme (aka url type)
   LOG_TIME("scheme   < ");
   RegEx supportedScheme(SupportedScheme);
   if (   (supportedScheme.SearchAt(urlString,workingOffset))
       && (supportedScheme.MatchStart(0) == workingOffset)
       )
      {
      LOG_TIME("scheme   > ");
      // the scheme name matches one of the supported schemes
      mScheme = static_cast<Scheme>(supportedScheme.Matches()-1);
      workingOffset = supportedScheme.AfterMatch(0); // past the ':'
   }
   else
   {
      /*
       * It did not match one of the supported scheme names
       * so proceed on the assumption that it's a host and "sip:" is implied
       * Leave the workingOffset where it is (before the token).
       * The code below, through the parsing of host and port
       * treats this as an implicit 'sip:' url; if it parses ok
       * up to that point, it resets the scheme to SipsUrlScheme
       */
      mScheme = UnknownUrlScheme;
   }


   // skip over any '//' following the scheme for the ones we know use that
   switch (mScheme)
   {
   case FileUrlScheme:
   case FtpUrlScheme:
   case HttpUrlScheme:
   case HttpsUrlScheme:
   case RtspUrlScheme:
      if (0==strncmp("//", urlString+workingOffset, 2))
      {
         workingOffset += 2;
      }
      break;

   case UnknownUrlScheme:
   case SipUrlScheme:
   case SipsUrlScheme:
   case MailtoUrlScheme:
   default:
      break;
   }
   
   if (FileUrlScheme != mScheme) // no user part in file urls
   {
      // Parse the username and password
      LOG_TIME("userpass   < ");
      RegEx usernameAndPassword(UsernameAndPassword);
      if (   (usernameAndPassword.SearchAt(urlString, workingOffset))
          && usernameAndPassword.MatchStart(0) == workingOffset 
          )
      {
         LOG_TIME("userpass   > ");
         usernameAndPassword.MatchString(&mUserId, 1);
         usernameAndPassword.MatchString(&mPassword, 2);
         workingOffset = usernameAndPassword.AfterMatch(0);
      }
      else
      {
         // username and password are optional, so not finding them is ok
         // leave workingOffset where it is
      }
   }

   // Parse the hostname and port
   LOG_TIME("hostport   < ");
   RegEx hostAndPort(HostAndPort);
   if (   (hostAndPort.SearchAt(urlString,workingOffset))
       && (hostAndPort.MatchStart(0) == workingOffset)
       )
   {
      LOG_TIME("hostport   > ");
      hostAndPort.MatchString(&mHostAddress,1);
      UtlString portStr;
      if (hostAndPort.MatchString(&portStr,2))
      {
         mHostPort = atoi(portStr.data());
      }

      workingOffset = hostAndPort.AfterMatch(0);

      if (UnknownUrlScheme == mScheme)
      {
         /*
          * Resolve AMBIGUITY
          *   Since we were able to parse this as a host and port, it is now safe to
          *   set the scheme to the implied 'sip:'.
          */
         mScheme = SipUrlScheme;
      }
   }
   else
   {
      if (FileUrlScheme != mScheme) // no host is ok in a file URL
      {
         /*
          * This is not a file URL, so not having a recognized host name is invalid.
          *
          * Since we may have been called from a constructor, there is no way to
          * return an error, but at this point we know this is bad, so instead
          * we just log an error and set the scheme to the unknown url type and
          * clear any components that might have been set.
          */
         OsSysLog::add(FAC_SIP, PRI_ERR,
                       "Url::parseString no valid host found at char %d in '%s', "
                       "isAddrSpec = %d",
                       workingOffset, urlString, isAddrSpec
                       );
         mScheme = UnknownUrlScheme;
         mDisplayName.remove(0);
         mUserId.remove(0);
         mPassword.remove(0);
      }
   }
   
   // Next is a path if http, https, or ftp,
   //      OR url parameters if sip or sips.
   // There can be no Url parameters for http, https, or ftp
   //    because semicolon is a valid part of the path value
   switch ( mScheme )
   {
   case FileUrlScheme:
   case FtpUrlScheme:
   case HttpUrlScheme:
   case HttpsUrlScheme:
   case RtspUrlScheme:
   {
      // this is an http, https, or ftp URL, so get the path
      LOG_TIME("path   < ");
      RegEx urlPath(UrlPath);
      if (   (urlPath.SearchAt(urlString, workingOffset))
          && (urlPath.MatchStart(0) == workingOffset)
          )
      {
         LOG_TIME("path   > ");
         urlPath.MatchString(&mPath,1);
         workingOffset = urlPath.AfterMatch(1);
      }
   }
   break;

   case SipUrlScheme:
   case SipsUrlScheme:
   {
      // it may have url parameters of the form ";" param "=" value ...
      //                if it meets the right conditions:
      if (   isAddrSpec                          // in addr-spec, any param is a url param
          || afterAngleBrackets != UTL_NOT_FOUND // inside angle brackets there may be a url param
          ) 
      {
         LOG_TIME("urlparm   < ");
         RegEx urlParams(UrlParams);
         if (   (urlParams.SearchAt(urlString, workingOffset))
             && (urlParams.MatchStart(0) == workingOffset)
             )
         {
            LOG_TIME("urlparm   > ");
            urlParams.MatchString(&mRawUrlParameters, 1);
            workingOffset = urlParams.AfterMatch(1);

            // actual parsing of the parameters is in parseUrlParameters
            // so that it only happens if someone asks for them.
         }
      }
   }
   break;
   
   case MailtoUrlScheme:
   default:
      // no path component
      break;
   }

   if (UnknownUrlScheme != mScheme)
   {
   // Parse any header or query parameters
      LOG_TIME("hdrparm   < ");
   RegEx headerOrQueryParams(HeaderOrQueryParams);
   if(   (headerOrQueryParams.SearchAt(urlString, workingOffset))
      && (headerOrQueryParams.MatchStart(0) == workingOffset)
      )
   {
         LOG_TIME("hdrparm   > ");
      headerOrQueryParams.MatchString(&mRawHeaderOrQueryParameters, 1);
      workingOffset = headerOrQueryParams.AfterMatch(0);
            
      // actual parsing of the parameters is in parseHeaderOrQueryParameters
      // so that it only happens if someone asks for them.
   }

   // Parse the field parameters
   if (!isAddrSpec) // can't have field parameters in an addrspec
   {
      if (afterAngleBrackets != UTL_NOT_FOUND)
      {
         workingOffset = afterAngleBrackets;
      }

         LOG_TIME("fldparm   < ");
         RegEx fieldParameters(FieldParams);
      if (   (fieldParameters.SearchAt(urlString, workingOffset))
          && (fieldParameters.MatchStart(0) == workingOffset)
          )
      {
            LOG_TIME("fldparm   > ");
         fieldParameters.MatchString(&mRawFieldParameters, 1);

         // actual parsing of the parameters is in parseFieldParameters
         // so that it only happens if someone asks for them.
      }
   }
}
#  ifdef TIME_PARSE
     UtlString timeDump;
   timeLog.getLogString(timeDump);
   printf("\n%s\n", timeDump.data());
#  endif
}