// Return an iterator to the first character of the range [from, end) which
// is not one of the len characters stored in delims, or end itself if every
// character of the range is a delimiter (this includes the empty range).
//
// The range must be valid, i.e. from must not be beyond end: this is checked
// by an assert.
static wxString::const_iterator
find_first_not_of(const wxChar *delims,
                  size_t len,
                  const wxString::const_iterator& from,
                  const wxString::const_iterator& end)
{
   wxASSERT_MSG( from <= end, wxT("invalid index") );

   // skip over the delimiters: we stop either at the first character which
   // is not found among them or when the range is exhausted
   wxString::const_iterator cur = from;
   while ( cur != end && wxTmemchr(delims, *cur, len) )
      ++cur;

   return cur;
}
int URLDetector::FindURL(const wxChar *text, int& len) { // offset of the current value of text from the initial one int offset = 0; match: int pos = scan(text, len); if ( !len ) return -1; // the provisional start and end of the URL, will be changed below const wxChar *start = text + pos; const wxChar *p = start + len; // there are 2 different cases: a mailto: URL or a mail address and // anything else which we need to treat differently bool isMail = *start == '@'; if ( isMail ) { // look for the start of the address start--; while ( start > text && IsLocalPartChar(*start) ) start--; // have we stopped at '<'? bool hasAngleBracket = *start == '<'; if ( !hasAngleBracket ) { if ( !IsLocalPartChar(*start) ) { // we went too far backwards start++; } //else: we stopped at the start of the text } //else: keep '<' as part of the URL // now look for the end of it while ( *p && IsDomainChar(*p) ) { p++; } // finally we should either have the brackets from both sides or none // at all if ( hasAngleBracket ) { if ( *p == '>' ) { // take the right bracket as well p++; } else { // forget about the left one start++; } } } else // !bare mail address { for ( ;; ) { size_t lenURL = 0; while ( IsURLChar(*p) ) { lenURL++; p++; } // URLs are frequently so long that they're spread across multiple // lines, so try to see if this might be the case here // // first of all we need to check whether it is at the end of line but // we should allow some trailing spaces const wxChar* q = p; while ( *q == ' ' ) q++; if ( q[0] != '\r' || q[1] != '\n' ) break; // not at the line end // also check if it's really long enough to be wrapped: // the short URLs normally shouldn't be wrapped static const size_t URL_WRAP_LEN = 30; // min len of wrapped URL if ( lenURL < URL_WRAP_LEN ) break; // too short if ( !IsURLChar(q[2]) ) break; // doesn't seem to be continued on the next line // heuristic text for end of URL detection if ( p - start > 5 && !CanBeWrapped(p) ) { // it seems that the URL ends here 
break; } p = q + 2; // go to the start of next line // Check that the beginning of next line is not the start of // another URL. // // Note that although '@' alone is recognized as the beginning // of an URL: here it should not be the case. int nextlen = 0; int nextpos = scan(p, nextlen); if ( nextlen && nextpos == 0 && *p != '@') { p -= 2; // The start of the next line being the start of an URL on its own, // do not join the two. break; } // check whether the next line starts with a word -- this is a good // indication that the URL hasn't wrapped q = p; while ( wxIsalpha(*q) ) q++; if ( *q == _T(' ') || (wxStrchr(_T(".,:;"), *q) && q[1] == _T(' ')) ) { // looks like we've a word (i.e. sequence of letters terminated by // space or punctuation) at the start of the next line p -= 2; break; } // another special case: subsequent dashes are very unusual in URLs // but often used as separator lines, so we assume that they indicate // the end of the URL if we find them on the next line. if ( p[0] == '-' && p[1] == '-' ) break; // it might be a wrapped URL but it might be not: it seems like we // get way too many false positives if we suppose that it's always // the case... so restrict the wrapped URLs detection to the case // when they occur at the beginning of the line, possibly after some // white space as this is how people usually format them q = start; while ( q >= text && *q != '\n' ) { q--; if ( !wxIsspace(*q) ) break; } // Does the URL start at the beginning of the line, or does it have // a '<' just in front? 
if ( q >= text && *q != '\n' && *q != '<') break; // it did occur at the start (or after '<'), suppose the URL is // wrapped and so we continue on the next line (and no need to test // the first character, it had been already done above) p++; } } // truncate any punctuation at the end while ( strchr(".:,)]!?", *(p - 1)) ) p--; // additional checks for the matches which didn't have an explicit scheme if ( isMail || text[pos + len - 3 /* len of "://" */ ] != _T(':') ) { // '@' matches may result in false positives, as not every '@' character // is inside a mailto URL so try to weed them out by requiring that the // mail address has a reasonable minimal length ("*****@*****.**" and // "www.xy.fr" are probably the shortest ones we can have, hence 8) // which at least avoids matching the bare '@'s bool good = (p - start) >= 8; if ( good ) { // also check that we have at least one dot in the domain part for the // mail addresses const wxChar * pDot = wxTmemchr(text + pos + 1, '.', p - text - pos - 1); if ( !pDot ) { good = false; } else if ( !isMail ) { // and has either two dots or at least a slash the other URLs, // otherwise it probably isn't an address/URL neither (stuff like // "... using ftp.If you ... " shouldn't be recognized as an URL) good = wxTmemchr(pDot + 1, '.', p - pDot - 1) != NULL || wxTmemchr(pDot + 1, '/', p - pDot - 1) != NULL; } } if ( !good ) { const int offDiff = pos + len; offset += offDiff; text += offDiff; // slightly more efficient than recursion... goto match; } } // return the length of the match len = p - start; return start - text + offset; }