// Scans [chars, chars + length) for the first e-mail address.
//
// For each '@' the character right after it must be an ASCII letter. The scan
// then moves forward for as long as IsDomainChar() (defined elsewhere) accepts
// the characters, and at every '.' checks whether the text that follows is a
// known top-level domain: one of the long names in longDomainNames, or a
// two-letter code allowed by domainTwoLetter. The top-level domain only counts
// when it is not continued by further domain characters. Once the end of the
// domain is known, the mailbox is collected by walking backwards from the '@'
// for as long as IsMailboxChar() (defined elsewhere) accepts the characters.
//
// chars   first character of the text to search
// length  number of characters available at chars
// s       receives the result: mStartResult is the offset of the first
//         character of the address, mEndResult the offset just past its last
//         character, both relative to the incoming chars pointer
//
// Returns FOUND_COMPLETE when an address was found and FOUND_NONE otherwise.
// Note that s->mEndResult is written as soon as a domain is recognized, so it
// may have been modified even when FOUND_NONE is returned.
FoundState FindPartialEMail(const UChar* chars, unsigned length,
    FindState* s)
{
    // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp
    // hand-edit at your own risk
    //
    // One entry per first letter 'a'..'z'; bit n of an entry is set when the
    // letter 'a' + n is accepted as the second letter of a two-letter domain.
    static const int domainTwoLetter[] = {
        0x02df797c,  // a followed by: [cdefgilmnoqrstuwxz]
        0x036e73fb,  // b followed by: [abdefghijmnorstvwyz]
        0x03b67ded,  // c followed by: [acdfghiklmnorsuvxyz]
        0x02005610,  // d followed by: [ejkmoz]
        0x001e00d4,  // e followed by: [ceghrstu]
        0x00025700,  // f followed by: [ijkmor]
        0x015fb9fb,  // g followed by: [abdefghilmnpqrstuwy]
        0x001a3400,  // h followed by: [kmnrtu]
        0x000f7818,  // i followed by: [delmnoqrst]
        0x0000d010,  // j followed by: [emop]
        0x0342b1d0,  // k followed by: [eghimnprwyz]
        0x013e0507,  // l followed by: [abcikrstuvy]
        0x03fffccd,  // m followed by: [acdghklmnopqrstuvwxyz]
        0x0212c975,  // n followed by: [acefgilopruz]
        0x00001000,  // o followed by: [m]
        0x014e3cf1,  // p followed by: [aefghklmnrstwy]
        0x00000001,  // q followed by: [a]
        0x00504010,  // r followed by: [eouw]
        0x032a7fdf,  // s followed by: [abcdeghijklmnortvyz]
        0x026afeec,  // t followed by: [cdfghjklmnoprtvwz]
        0x03041441,  // u followed by: [agkmsyz]
        0x00102155,  // v followed by: [aceginu]
        0x00040020,  // w followed by: [fs]
        0x00000000,  // x
        0x00180010,  // y followed by: [etu]
        0x00401001,  // z followed by: [amw]
    };

    // One entry per first letter 'a'..'z'. An entry is a packed list of the
    // longer domain names starting with that letter: each name is stored as a
    // length byte followed by the name without its first letter. The length
    // bytes are all below 0x20, which is how the matcher tells them (and the
    // terminating NUL) apart from letters.
    static char const* const longDomainNames[] = {
        "\x03" "ero" "\x03" "rpa",  // aero, arpa
        "\x02" "iz",  // biz
        "\x02" "at" "\x02" "om" "\x03" "oop",  // cat, com, coop
        NULL,  // d
        "\x02" "du",  // edu
        NULL,  // f
        "\x02" "ov",  // gov
        NULL,  // h
        "\x03" "nfo" "\x02" "nt",  // info, int
        "\x03" "obs",  // jobs
        NULL,  // k
        NULL,  // l
        "\x02" "il" "\x03" "obi" "\x05" "useum",  // mil, mobi, museum
        "\x03" "ame" "\x02" "et",  // name, net
        "\x02" "rg",  // org
        "\x02" "ro",  // pro
        NULL,  // q
        NULL,  // r
        NULL,  // s
        "\x05" "ravel",  // travel
        NULL,  // u
        NULL,  // v
        NULL,  // w
        NULL,  // x
        NULL,  // y
        NULL,  // z
    };

    const UChar* start = chars;
    const UChar* end = chars + length;
    while (chars < end) {
        UChar ch = *chars++;
        if (ch != '@')
            continue;
        const UChar* atLocation = chars - 1;
        if (chars >= end)
            break; // '@' is the last character: no domain left to read
        // search for domain
        ch = *chars++ | 0x20; // convert uppercase to lower
        if (ch < 'a' || ch > 'z')
            continue;
        while (chars < end) {
            ch = *chars++;
            if (IsDomainChar(ch) == false)
                goto nextAt;
            if (ch != '.')
                continue;
            // A top-level domain needs at least two letters. Check this before
            // touching the buffer so a trailing '.' never reads past the end.
            if (end - chars < 2)
                return FOUND_NONE;
            UChar firstLetter = *chars++ | 0x20; // first letter of the domain
            firstLetter -= 'a'; // wraps to a large value for anything below 'a'
            if (firstLetter > 'z' - 'a')
                continue; // non-letter followed '.'
            int secondLetterMask = domainTwoLetter[firstLetter];
            ch = *chars | 0x20; // second letter of the domain
            ch -= 'a';
            if (ch > 'z' - 'a') // 'z' itself is valid (.az, .cz, .nz, ...)
                continue;
            bool secondMatch = (secondLetterMask & 1 << ch) != 0;
            // Try the long names first; chars points at the second letter.
            const char* wordMatch = longDomainNames[firstLetter];
            int wordIndex = 0;
            while (wordMatch != NULL) {
                int len = *wordMatch++;
                char match;
                do {
                    match = wordMatch[wordIndex];
                    if (match < 0x20) // next length byte or NUL: whole name matched
                        goto foundDomainStart;
                    if (wordIndex >= end - chars)
                        break; // the text ends before the name does
                    // compare case-insensitively, like the two-letter path
                    if ((chars[wordIndex] | 0x20) != match)
                        break;
                    wordIndex++;
                } while (true);
                wordMatch += len;
                if (*wordMatch == '\0')
                    break;
                wordIndex = 0;
            }
            if (secondMatch) {
                wordIndex = 1; // two-letter domain: only the second letter remains
            foundDomainStart:
                chars += wordIndex;
                if (chars < end) {
                    // The domain goes on if another domain character follows,
                    // or a '.' that is itself followed by a domain character.
                    ch = *chars;
                    if (ch != '.') {
                        if (IsDomainChar(ch))
                            goto nextDot;
                    } else if (chars + 1 < end && IsDomainChar(chars[1]))
                        goto nextDot;
                }
                // found domain. Search backwards from '@' for beginning of email address
                s->mEndResult = chars - start;
                chars = atLocation;
                if (chars <= start)
                    goto nextAt; // nothing in front of the '@'
                ch = *--chars;
                if (ch == '.')
                    goto nextAt; // mailbox can't end in period
                do {
                    if (IsMailboxChar(ch) == false) {
                        chars++; // step back onto the last accepted character
                        break;
                    }
                    if (chars == start)
                        break;
                    ch = *--chars;
                } while (true);
                UChar firstChar = *chars;
                if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty
                    goto nextAt;
                s->mStartResult = chars - start;
                return FOUND_COMPLETE;
            }
        nextDot:
            ;
        }
    nextAt:
        // resume the search for the next '@' right after the current one
        chars = atLocation + 1;
    }
    return FOUND_NONE;
}
Esempio n. 2
0
int URLDetector::FindURL(const wxChar *text, int& len)
{
   // offset of the current value of text from the initial one
   int offset = 0;

match:
   int pos = scan(text, len);
   if ( !len )
      return -1;

   // the provisional start and end of the URL, will be changed below
   const wxChar *start = text + pos;
   const wxChar *p = start + len;

   // there are 2 different cases: a mailto: URL or a mail address and
   // anything else which we need to treat differently
   bool isMail = *start == '@';

   if ( isMail )
   {
      // look for the start of the address
      start--;
      while ( start > text && IsLocalPartChar(*start) )
         start--;

      // have we stopped at '<'?
      bool hasAngleBracket = *start == '<';
      if ( !hasAngleBracket )
      {
         if ( !IsLocalPartChar(*start) )
         {
            // we went too far backwards
            start++;
         }
         //else: we stopped at the start of the text
      }
      //else: keep '<' as part of the URL

      // now look for the end of it
      while ( *p && IsDomainChar(*p) )
      {
         p++;
      }

      // finally we should either have the brackets from both sides or none
      // at all
      if ( hasAngleBracket )
      {
         if ( *p == '>' )
         {
            // take the right bracket as well
            p++;
         }
         else
         {
            // forget about the left one
            start++;
         }
      }
   }
   else // !bare mail address
   {
      for ( ;; )
      {
         size_t lenURL = 0;
         while ( IsURLChar(*p) )
         {
            lenURL++;
            p++;
         }

         // URLs are frequently so long that they're spread across multiple
         // lines, so try to see if this might be the case here
         //
         // first of all we need to check whether it is at the end of line but
         // we should allow some trailing spaces
         const wxChar* q = p;
         while ( *q == ' ' )
            q++;

         if ( q[0] != '\r' || q[1] != '\n' )
            break; // not at the line end

         // also check if it's really long enough to be wrapped:
         // the short URLs normally shouldn't be wrapped
         static const size_t URL_WRAP_LEN = 30; // min len of wrapped URL
         if ( lenURL < URL_WRAP_LEN )
            break; // too short

         if ( !IsURLChar(q[2]) )
            break; // doesn't seem to be continued on the next line

         // heuristic text for end of URL detection
         if ( p - start > 5 && !CanBeWrapped(p) )
         {
            // it seems that the URL ends here
            break;
         }

         p = q + 2; // go to the start of next line

         // Check that the beginning of next line is not the start of
         // another URL.
         //
         // Note that although '@' alone is recognized as the beginning
         // of an URL: here it should not be the case.
         int nextlen = 0;
         int nextpos = scan(p, nextlen);
         if ( nextlen && nextpos == 0 && *p != '@')
         {
            p -= 2;

            // The start of the next line being the start of an URL on its own,
            // do not join the two.
            break;
         }

         // check whether the next line starts with a word -- this is a good
         // indication that the URL hasn't wrapped
         q = p;
         while ( wxIsalpha(*q) )
            q++;

         if ( *q == _T(' ') || (wxStrchr(_T(".,:;"), *q) && q[1] == _T(' ')) )
         {
            // looks like we've a word (i.e. sequence of letters terminated by
            // space or punctuation) at the start of the next line
            p -= 2;
            break;
         }

         // another special case: subsequent dashes are very unusual in URLs
         // but often used as separator lines, so we assume that they indicate
         // the end of the URL if we find them on the next line.
         if ( p[0] == '-' && p[1] == '-' )
            break;

         // it might be a wrapped URL but it might be not: it seems like we
         // get way too many false positives if we suppose that it's always
         // the case... so restrict the wrapped URLs detection to the case
         // when they occur at the beginning of the line, possibly after some
         // white space as this is how people usually format them
         q = start;
         while ( q >= text && *q != '\n' )
         {
            q--;

            if ( !wxIsspace(*q) )
               break;
         }

         // Does the URL start at the beginning of the line, or does it have
         // a '<' just in front?
         if ( q >= text && *q != '\n' && *q != '<')
            break;

         // it did occur at the start (or after '<'), suppose the URL is
         // wrapped and so we continue on the next line (and no need to test
         // the first character, it had been already done above)
         p++;
      }
   }

   // truncate any punctuation at the end
   while ( strchr(".:,)]!?", *(p - 1)) )
      p--;

   // additional checks for the matches which didn't have an explicit scheme
   if ( isMail || text[pos + len - 3 /* len of "://" */ ] != _T(':') )
   {
      // '@' matches may result in false positives, as not every '@' character
      // is inside a mailto URL so try to weed them out by requiring that the
      // mail address has a reasonable minimal length ("*****@*****.**" and
      // "www.xy.fr" are probably the shortest ones we can have, hence 8)
      // which at least avoids matching the bare '@'s
      bool good = (p - start) >= 8;

      if ( good )
      {
         // also check that we have at least one dot in the domain part for the
         // mail addresses
         const wxChar *
            pDot = wxTmemchr(text + pos + 1, '.', p - text - pos - 1);
         if ( !pDot )
         {
            good = false;
         }
         else if ( !isMail )
         {
            // and has either two dots or at least a slash the other URLs,
            // otherwise it probably isn't an address/URL neither (stuff like
            // "... using ftp.If you ... " shouldn't be recognized as an URL)
            good = wxTmemchr(pDot + 1, '.', p - pDot - 1) != NULL ||
                     wxTmemchr(pDot + 1, '/', p - pDot - 1) != NULL;
         }
      }

      if ( !good )
      {
         const int offDiff = pos + len;
         offset += offDiff;
         text += offDiff;

         // slightly more efficient than recursion...
         goto match;
      }
   }

   // return the length of the match
   len = p - start;

   return start - text + offset;
}