// Looks for the first e-mail address in the buffer [chars, chars + length).
//
// The scan is driven by '@': for each '@' the text after it is checked for a
// domain whose last label is either a known two-letter code (domainTwoLetter)
// or one of the longer names in longDomainNames, and the text before it is
// then walked backwards over mailbox characters.
//
// chars   first character of the buffer to scan
// length  number of characters in the buffer; nothing at or beyond
//         chars + length is read
// s       receives the result: on success s->mStartResult is the offset of the
//         first mailbox character and s->mEndResult the offset one past the
//         last domain character, both relative to the initial value of chars.
//         Note that s->mEndResult is also written when a domain was accepted
//         but the mailbox in front of the '@' was then rejected.
//
// Returns FOUND_COMPLETE when an address was found, FOUND_NONE otherwise.
//
// NOTE(review): the first two letters of the top-level domain are compared
// case-insensitively, but the longDomainNames comparison is case-sensitive, so
// something like ".COM" is not matched as "com" -- confirm whether that is
// intended.
FoundState FindPartialEMail(const UChar* chars, unsigned length, FindState* s)
{
    // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp
    // hand-edit at your own risk

    // Indexed by (first letter - 'a'); bit N is set when the letter 'a' + N may
    // follow the first letter to form a two-letter top-level domain.
    static const int domainTwoLetter[] = {
        0x02df797c, // a followed by: [cdefgilmnoqrstuwxz]
        0x036e73fb, // b followed by: [abdefghijmnorstvwyz]
        0x03b67ded, // c followed by: [acdfghiklmnorsuvxyz]
        0x02005610, // d followed by: [ejkmoz]
        0x001e00d4, // e followed by: [ceghrstu]
        0x00025700, // f followed by: [ijkmor]
        0x015fb9fb, // g followed by: [abdefghilmnpqrstuwy]
        0x001a3400, // h followed by: [kmnrtu]
        0x000f7818, // i followed by: [delmnoqrst]
        0x0000d010, // j followed by: [emop]
        0x0342b1d0, // k followed by: [eghimnprwyz]
        0x013e0507, // l followed by: [abcikrstuvy]
        0x03fffccd, // m followed by: [acdghklmnopqrstuvwxyz]
        0x0212c975, // n followed by: [acefgilopruz]
        0x00001000, // o followed by: [m]
        0x014e3cf1, // p followed by: [aefghklmnrstwy]
        0x00000001, // q followed by: [a]
        0x00504010, // r followed by: [eouw]
        0x032a7fdf, // s followed by: [abcdeghijklmnortvyz]
        0x026afeec, // t followed by: [cdfghjklmnoprtvwz]
        0x03041441, // u followed by: [agkmsyz]
        0x00102155, // v followed by: [aceginu]
        0x00040020, // w followed by: [fs]
        0x00000000, // x
        0x00180010, // y followed by: [etu]
        0x00401001, // z followed by: [amw]
    };

    // Indexed by (first letter - 'a'). Each entry packs the remainders (the
    // name without its first letter) of the longer top-level domains, every
    // remainder being preceded by a byte holding its length; the string's
    // terminating NUL ends the list.
    static char const* const longDomainNames[] = {
        "\x03" "ero" "\x03" "rpa", // aero, arpa
        "\x02" "iz", // biz
        "\x02" "at" "\x02" "om" "\x03" "oop", // cat, com, coop
        NULL, // d
        "\x02" "du", // edu
        NULL, // f
        "\x02" "ov", // gov
        NULL, // h
        "\x03" "nfo" "\x02" "nt", // info, int
        "\x03" "obs", // jobs
        NULL, // k
        NULL, // l
        "\x02" "il" "\x03" "obi" "\x05" "useum", // mil, mobi, museum
        "\x03" "ame" "\x02" "et", // name, net
        "\x02" "rg", // org
        "\x02" "ro", // pro
        NULL, // q
        NULL, // r
        NULL, // s
        "\x05" "ravel", // travel
        NULL, // u
        NULL, // v
        NULL, // w
        NULL, // x
        NULL, // y
        NULL, // z
    };

    const UChar* start = chars;
    const UChar* end = chars + length;
    while (chars < end) {
        UChar ch = *chars++;
        if (ch != '@')
            continue;
        const UChar* atLocation = chars - 1;
        // an '@' in the very last position has no domain behind it; stop here
        // instead of reading the character past the end of the buffer
        if (chars >= end)
            break;
        // search for domain
        ch = *chars++ | 0x20; // convert uppercase to lower
        if (ch < 'a' || ch > 'z')
            continue;
        while (chars < end) {
            ch = *chars++;
            if (IsDomainChar(ch) == false)
                goto nextAt;
            if (ch != '.')
                continue;
            // at least two letters must follow the '.'; check before reading so
            // that a trailing '.' does not cause a read past the buffer
            if (end - chars < 2)
                return FOUND_NONE;
            UChar firstLetter = *chars++ | 0x20; // first letter of the domain
            firstLetter -= 'a'; // UChar is unsigned: anything below 'a' wraps to a large value
            if (firstLetter > 'z' - 'a')
                continue; // non-letter followed '.'
            int secondLetterMask = domainTwoLetter[firstLetter];
            ch = *chars | 0x20; // second letter of the domain
            ch -= 'a';
            // '>' rather than '>=': 'z' is a valid second letter (az, cz, kz, ...)
            // and has its own bit in the table
            if (ch > 'z' - 'a')
                continue;
            bool secondMatch = (secondLetterMask & 1 << ch) != 0;
            // chars points at the second letter, which is where the packed
            // remainders in longDomainNames start as well
            const char* wordMatch = longDomainNames[firstLetter];
            int wordIndex = 0;
            while (wordMatch != NULL) {
                int len = *wordMatch++;
                char match;
                do {
                    match = wordMatch[wordIndex];
                    // reaching the next length byte or the final NUL means the
                    // whole remainder matched
                    if (match < 0x20)
                        goto foundDomainStart;
                    // running out of buffer counts as a mismatch
                    if (wordIndex >= end - chars || chars[wordIndex] != match)
                        break;
                    wordIndex++;
                } while (true);
                wordMatch += len;
                if (*wordMatch == '\0')
                    break;
                wordIndex = 0;
            }
            if (secondMatch) {
                wordIndex = 1;
        foundDomainStart:
                // step over the matched part of the top-level domain
                chars += wordIndex;
                if (chars < end) {
                    // the match only stands if the domain ends here: a further
                    // domain character, or a '.' followed by one, means this
                    // was just an inner label
                    ch = *chars;
                    if (ch != '.') {
                        if (IsDomainChar(ch))
                            goto nextDot;
                    } else if (chars + 1 < end && IsDomainChar(chars[1]))
                        goto nextDot;
                }
                // found domain. Search backwards from '@' for beginning of email address
                s->mEndResult = chars - start;
                chars = atLocation;
                if (chars <= start)
                    goto nextAt; // nothing in front of the '@'
                ch = *--chars;
                if (ch == '.')
                    goto nextAt; // mailbox can't end in period
                do {
                    if (IsMailboxChar(ch) == false) {
                        chars++; // step back onto the first mailbox character
                        break;
                    }
                    if (chars == start)
                        break;
                    ch = *--chars;
                } while (true);
                UChar firstChar = *chars;
                if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty
                    goto nextAt;
                s->mStartResult = chars - start;
                return FOUND_COMPLETE;
            }
    nextDot:
            ;
        }
nextAt:
        // resume the search for '@' just behind the one that was examined
        chars = atLocation + 1;
    }
    return FOUND_NONE;
}
int URLDetector::FindURL(const wxChar *text, int& len) { // offset of the current value of text from the initial one int offset = 0; match: int pos = scan(text, len); if ( !len ) return -1; // the provisional start and end of the URL, will be changed below const wxChar *start = text + pos; const wxChar *p = start + len; // there are 2 different cases: a mailto: URL or a mail address and // anything else which we need to treat differently bool isMail = *start == '@'; if ( isMail ) { // look for the start of the address start--; while ( start > text && IsLocalPartChar(*start) ) start--; // have we stopped at '<'? bool hasAngleBracket = *start == '<'; if ( !hasAngleBracket ) { if ( !IsLocalPartChar(*start) ) { // we went too far backwards start++; } //else: we stopped at the start of the text } //else: keep '<' as part of the URL // now look for the end of it while ( *p && IsDomainChar(*p) ) { p++; } // finally we should either have the brackets from both sides or none // at all if ( hasAngleBracket ) { if ( *p == '>' ) { // take the right bracket as well p++; } else { // forget about the left one start++; } } } else // !bare mail address { for ( ;; ) { size_t lenURL = 0; while ( IsURLChar(*p) ) { lenURL++; p++; } // URLs are frequently so long that they're spread across multiple // lines, so try to see if this might be the case here // // first of all we need to check whether it is at the end of line but // we should allow some trailing spaces const wxChar* q = p; while ( *q == ' ' ) q++; if ( q[0] != '\r' || q[1] != '\n' ) break; // not at the line end // also check if it's really long enough to be wrapped: // the short URLs normally shouldn't be wrapped static const size_t URL_WRAP_LEN = 30; // min len of wrapped URL if ( lenURL < URL_WRAP_LEN ) break; // too short if ( !IsURLChar(q[2]) ) break; // doesn't seem to be continued on the next line // heuristic text for end of URL detection if ( p - start > 5 && !CanBeWrapped(p) ) { // it seems that the URL ends here 
break; } p = q + 2; // go to the start of next line // Check that the beginning of next line is not the start of // another URL. // // Note that although '@' alone is recognized as the beginning // of an URL: here it should not be the case. int nextlen = 0; int nextpos = scan(p, nextlen); if ( nextlen && nextpos == 0 && *p != '@') { p -= 2; // The start of the next line being the start of an URL on its own, // do not join the two. break; } // check whether the next line starts with a word -- this is a good // indication that the URL hasn't wrapped q = p; while ( wxIsalpha(*q) ) q++; if ( *q == _T(' ') || (wxStrchr(_T(".,:;"), *q) && q[1] == _T(' ')) ) { // looks like we've a word (i.e. sequence of letters terminated by // space or punctuation) at the start of the next line p -= 2; break; } // another special case: subsequent dashes are very unusual in URLs // but often used as separator lines, so we assume that they indicate // the end of the URL if we find them on the next line. if ( p[0] == '-' && p[1] == '-' ) break; // it might be a wrapped URL but it might be not: it seems like we // get way too many false positives if we suppose that it's always // the case... so restrict the wrapped URLs detection to the case // when they occur at the beginning of the line, possibly after some // white space as this is how people usually format them q = start; while ( q >= text && *q != '\n' ) { q--; if ( !wxIsspace(*q) ) break; } // Does the URL start at the beginning of the line, or does it have // a '<' just in front? 
if ( q >= text && *q != '\n' && *q != '<') break; // it did occur at the start (or after '<'), suppose the URL is // wrapped and so we continue on the next line (and no need to test // the first character, it had been already done above) p++; } } // truncate any punctuation at the end while ( strchr(".:,)]!?", *(p - 1)) ) p--; // additional checks for the matches which didn't have an explicit scheme if ( isMail || text[pos + len - 3 /* len of "://" */ ] != _T(':') ) { // '@' matches may result in false positives, as not every '@' character // is inside a mailto URL so try to weed them out by requiring that the // mail address has a reasonable minimal length ("*****@*****.**" and // "www.xy.fr" are probably the shortest ones we can have, hence 8) // which at least avoids matching the bare '@'s bool good = (p - start) >= 8; if ( good ) { // also check that we have at least one dot in the domain part for the // mail addresses const wxChar * pDot = wxTmemchr(text + pos + 1, '.', p - text - pos - 1); if ( !pDot ) { good = false; } else if ( !isMail ) { // and has either two dots or at least a slash the other URLs, // otherwise it probably isn't an address/URL neither (stuff like // "... using ftp.If you ... " shouldn't be recognized as an URL) good = wxTmemchr(pDot + 1, '.', p - pDot - 1) != NULL || wxTmemchr(pDot + 1, '/', p - pDot - 1) != NULL; } } if ( !good ) { const int offDiff = pos + len; offset += offDiff; text += offDiff; // slightly more efficient than recursion... goto match; } } // return the length of the match len = p - start; return start - text + offset; }