示例#1
0
// Return an iterator to the first character of the range [from, end) which
// is not found among the len characters starting at delims.
//
// If every character of the range occurs in delims (or the range is empty),
// end itself is returned.
static wxString::const_iterator
find_first_not_of(const wxChar *delims, size_t len,
                  const wxString::const_iterator& from,
                  const wxString::const_iterator& end)
{
    wxASSERT_MSG( from <= end,  wxT("invalid index") );

    // advance for as long as the current character is one of the delimiters
    wxString::const_iterator pos = from;
    while ( pos != end && wxTmemchr(delims, *pos, len) )
        ++pos;

    return pos;
}
示例#2
0
int URLDetector::FindURL(const wxChar *text, int& len)
{
   // offset of the current value of text from the initial one
   int offset = 0;

match:
   int pos = scan(text, len);
   if ( !len )
      return -1;

   // the provisional start and end of the URL, will be changed below
   const wxChar *start = text + pos;
   const wxChar *p = start + len;

   // there are 2 different cases: a mailto: URL or a mail address and
   // anything else which we need to treat differently
   bool isMail = *start == '@';

   if ( isMail )
   {
      // look for the start of the address
      start--;
      while ( start > text && IsLocalPartChar(*start) )
         start--;

      // have we stopped at '<'?
      bool hasAngleBracket = *start == '<';
      if ( !hasAngleBracket )
      {
         if ( !IsLocalPartChar(*start) )
         {
            // we went too far backwards
            start++;
         }
         //else: we stopped at the start of the text
      }
      //else: keep '<' as part of the URL

      // now look for the end of it
      while ( *p && IsDomainChar(*p) )
      {
         p++;
      }

      // finally we should either have the brackets from both sides or none
      // at all
      if ( hasAngleBracket )
      {
         if ( *p == '>' )
         {
            // take the right bracket as well
            p++;
         }
         else
         {
            // forget about the left one
            start++;
         }
      }
   }
   else // !bare mail address
   {
      for ( ;; )
      {
         size_t lenURL = 0;
         while ( IsURLChar(*p) )
         {
            lenURL++;
            p++;
         }

         // URLs are frequently so long that they're spread across multiple
         // lines, so try to see if this might be the case here
         //
         // first of all we need to check whether it is at the end of line but
         // we should allow some trailing spaces
         const wxChar* q = p;
         while ( *q == ' ' )
            q++;

         if ( q[0] != '\r' || q[1] != '\n' )
            break; // not at the line end

         // also check if it's really long enough to be wrapped:
         // the short URLs normally shouldn't be wrapped
         static const size_t URL_WRAP_LEN = 30; // min len of wrapped URL
         if ( lenURL < URL_WRAP_LEN )
            break; // too short

         if ( !IsURLChar(q[2]) )
            break; // doesn't seem to be continued on the next line

         // heuristic text for end of URL detection
         if ( p - start > 5 && !CanBeWrapped(p) )
         {
            // it seems that the URL ends here
            break;
         }

         p = q + 2; // go to the start of next line

         // Check that the beginning of next line is not the start of
         // another URL.
         //
         // Note that although '@' alone is recognized as the beginning
         // of an URL: here it should not be the case.
         int nextlen = 0;
         int nextpos = scan(p, nextlen);
         if ( nextlen && nextpos == 0 && *p != '@')
         {
            p -= 2;

            // The start of the next line being the start of an URL on its own,
            // do not join the two.
            break;
         }

         // check whether the next line starts with a word -- this is a good
         // indication that the URL hasn't wrapped
         q = p;
         while ( wxIsalpha(*q) )
            q++;

         if ( *q == _T(' ') || (wxStrchr(_T(".,:;"), *q) && q[1] == _T(' ')) )
         {
            // looks like we've a word (i.e. sequence of letters terminated by
            // space or punctuation) at the start of the next line
            p -= 2;
            break;
         }

         // another special case: subsequent dashes are very unusual in URLs
         // but often used as separator lines, so we assume that they indicate
         // the end of the URL if we find them on the next line.
         if ( p[0] == '-' && p[1] == '-' )
            break;

         // it might be a wrapped URL but it might be not: it seems like we
         // get way too many false positives if we suppose that it's always
         // the case... so restrict the wrapped URLs detection to the case
         // when they occur at the beginning of the line, possibly after some
         // white space as this is how people usually format them
         q = start;
         while ( q >= text && *q != '\n' )
         {
            q--;

            if ( !wxIsspace(*q) )
               break;
         }

         // Does the URL start at the beginning of the line, or does it have
         // a '<' just in front?
         if ( q >= text && *q != '\n' && *q != '<')
            break;

         // it did occur at the start (or after '<'), suppose the URL is
         // wrapped and so we continue on the next line (and no need to test
         // the first character, it had been already done above)
         p++;
      }
   }

   // truncate any punctuation at the end
   while ( strchr(".:,)]!?", *(p - 1)) )
      p--;

   // additional checks for the matches which didn't have an explicit scheme
   if ( isMail || text[pos + len - 3 /* len of "://" */ ] != _T(':') )
   {
      // '@' matches may result in false positives, as not every '@' character
      // is inside a mailto URL so try to weed them out by requiring that the
      // mail address has a reasonable minimal length ("*****@*****.**" and
      // "www.xy.fr" are probably the shortest ones we can have, hence 8)
      // which at least avoids matching the bare '@'s
      bool good = (p - start) >= 8;

      if ( good )
      {
         // also check that we have at least one dot in the domain part for the
         // mail addresses
         const wxChar *
            pDot = wxTmemchr(text + pos + 1, '.', p - text - pos - 1);
         if ( !pDot )
         {
            good = false;
         }
         else if ( !isMail )
         {
            // and has either two dots or at least a slash the other URLs,
            // otherwise it probably isn't an address/URL neither (stuff like
            // "... using ftp.If you ... " shouldn't be recognized as an URL)
            good = wxTmemchr(pDot + 1, '.', p - pDot - 1) != NULL ||
                     wxTmemchr(pDot + 1, '/', p - pDot - 1) != NULL;
         }
      }

      if ( !good )
      {
         const int offDiff = pos + len;
         offset += offDiff;
         text += offDiff;

         // slightly more efficient than recursion...
         goto match;
      }
   }

   // return the length of the match
   len = p - start;

   return start - text + offset;
}