bool Tokenizer::SkipToOneOfChars(const wxChar* chars, bool supportNesting) { // skip everything until we find any one of chars while (1) { while (NotEOF() && !CharInString(CurrentChar(), chars)) { if (CurrentChar() == '"' || CurrentChar() == '\'') { // this is the case that match is inside a string! wxChar ch = CurrentChar(); MoveToNextChar(); SkipToChar(ch); } MoveToNextChar(); // make sure we skip comments if (CurrentChar() == '/') SkipComment(); // this will decide if it is a comment // use 'while' here to cater for consecutive blocks to skip (e.g. sometemplate<foo>(bar) // must skip <foo> and immediately after (bar)) // because if we don't, the next block won't be skipped ((bar) in the example) leading to weird // parsing results bool done = false; while (supportNesting && !done) { switch (CurrentChar()) { case '{': SkipBlock('{'); break; case '(': SkipBlock('('); break; case '[': SkipBlock('['); break; case '<': // don't skip if << operator if (NextChar() == '<') MoveToNextChar(2); // skip it and also the next '<' or the next '<' leads to a SkipBlock('<'); else SkipBlock('<'); break; default: done = true; break; } } } if (PreviousChar() != '\\') break; else { // check for "\\" if (m_TokenIndex - 2 >= 0 && m_Buffer.GetChar(m_TokenIndex - 2) == '\\') break; } MoveToNextChar(); } if (IsEOF()) return false; return true; }
// Extract the next chunk of text starting at the current position.
//
// `chunk`  receives the extracted text (empty at end of input).
// `isWord` is set to true only when the chunk is an identifier-like word
//          (starts with a letter or '_', continues with alphanumerics or '_').
//
// Non-word chunks are, in order of precedence: a run of skipped
// whitespace/unwanted text, a number, a quoted string/char literal, or a
// single character.
void TextCutter::GetChunk(wxString& chunk, bool& isWord)
{
    chunk  = wxEmptyString;
    isWord = false;

    if (IsEOF())
        return;

    const unsigned int begin = m_CurIdx;

    // Identifier-like word: consume it and report it as a word.
    if (isalpha(CurrentChar()) || CurrentChar() == '_')
    {
        for (; !IsEOF() && (isalnum(CurrentChar()) || CurrentChar() == '_'); MoveToNextChar())
            ;
        chunk  = m_Text.Mid(begin, m_CurIdx - begin);
        isWord = true;
        return;
    }

    SkipWhiteSpace();
    SkipUnwanted();

    // Only look for a number/string/single char when the skips above did not
    // move; otherwise the skipped text itself is the chunk.
    if (begin == m_CurIdx)
    {
        if (isdigit(CurrentChar()))
        {
            // numbers
            while (!IsEOF() && CharInString(CurrentChar(), "0123456789.abcdefABCDEFXxLl"))
                MoveToNextChar();
        }
        else if (CurrentChar() == '"' || CurrentChar() == '\'')
        {
            // string, char, etc.
            const wxChar quote = CurrentChar();
            MoveToNextChar();   // step over the opening quote
            SkipToChar(quote);
            MoveToNextChar();   // step over the closing quote
        }
        else
        {
            MoveToNextChar();   // any other single character
        }
    }

    chunk = m_Text.Mid(begin, m_CurIdx - begin);
}
// if we really move forward, return true, which means we have the new m_TokenIndex // if we stay here, return false bool Tokenizer::SkipComment() { if (IsEOF()) return false; bool cstyle; // C or C++ style comments //check the comment prompt if (CurrentChar() == '/') { if (NextChar() == '*') cstyle = true; else if (NextChar() == '/') cstyle = false; else return false; // Not a comment, return false; } else return false; // Not a comment, return false; TRACE(_T("SkipComment() : Start from line = %d"), m_LineNumber); MoveToNextChar(2); // Skip the comment prompt // Here, we are in the comment body while (true) { if (cstyle) // C style comment { SkipToChar('/'); if (PreviousChar() == '*') // end of a C style comment { MoveToNextChar(); break; } if (!MoveToNextChar()) break; } else // C++ style comment { TRACE(_T("SkipComment() : Need to call SkipToEOL() here at line = %d"), m_LineNumber); SkipToInlineCommentEnd(); break; } } return true; }
bool Tokenizer::SkipBlock(const wxChar& ch) { // skip blocks () [] {} <> wxChar match; switch (ch) { case '(': match = ')'; break; case '[': match = ']'; break; case '{': match = '}'; break; case '<': match = '>'; break; default : return false; } MoveToNextChar(); int count = 1; // counter for nested blocks (xxx()) while (NotEOF()) { bool noMove = false; if (CurrentChar() == '/') SkipComment(); // this will decide if it is a comment if (CurrentChar() == '"' || CurrentChar() == '\'') { // this is the case that match is inside a string! wxChar ch = CurrentChar(); MoveToNextChar(); SkipToChar(ch); MoveToNextChar(); // don't move to next char below if concatenating strings (e.g. printf("" "")) if (CurrentChar() == '"' || CurrentChar() == '\'') noMove = true; } if (CurrentChar() == ch) ++count; else if (CurrentChar() == match) --count; if (!noMove) MoveToNextChar(); if (count == 0) break; } if (IsEOF()) return false; return true; }
bool Tokenizer::SkipToInlineCommentEnd() { TRACE(_T("%s : line=%d, CurrentChar='%c', PreviousChar='%c', NextChar='%c'"), wxString(__PRETTY_FUNCTION__, wxConvUTF8).wc_str(), m_LineNumber, CurrentChar(), PreviousChar(), NextChar()); // skip everything until we find EOL while (true) { SkipToChar(_T('\n')); if (!IsBackslashBeforeEOL() || IsEOF()) break; else MoveToNextChar(); } TRACE(_T("SkipToInlineCommentEnd(): (END) We are now at line %d, CurrentChar='%c', PreviousChar='%c',") _T(" NextChar='%c'"), m_LineNumber, CurrentChar(), PreviousChar(), NextChar()); return NotEOF(); }
bool Tokenizer::SkipComment(bool skipWhiteAtEnd) // = true { // C/C++ style comments bool is_comment = CurrentChar() == '/' && (NextChar() == '/' || NextChar() == '*'); if (!is_comment) return true; bool cstyle = NextChar() == '*'; MoveToNextChar(2); while (1) { if (!cstyle) { if (!SkipToEOL(false, true)) return false; MoveToNextChar(); break; } else { if (SkipToChar('/')) { if (PreviousChar() == '*') { MoveToNextChar(); break; } MoveToNextChar(); } else return false; } } if (IsEOF()) return false; if (skipWhiteAtEnd && !SkipWhiteSpace()) return false; return CurrentChar() == '/' ? SkipComment() : true; // handle chained comments }
//vfc add bGetValue wxString Tokenizer::DoGetToken(bool bGetValue, bool bTemplate) { if (IsEOF()) return wxEmptyString; if (!SkipWhiteSpace()) return wxEmptyString; if (m_SkipUnwantedTokens && !SkipUnwanted(bGetValue)) return wxEmptyString; // if m_SkipUnwantedTokens is false, we need to handle comments here too if (!m_SkipUnwantedTokens) SkipComment(); int start = m_TokenIndex; wxString m_Str; wxChar c = CurrentChar(); if (c == '_' || wxIsalpha(c)) { // keywords, identifiers, etc. // operator== is cheaper than wxIsalnum, also MoveToNextChar already includes IsEOF while ( ( CurrentChar() == '_' || wxIsalnum(CurrentChar()) ) && MoveToNextChar() ) ; if (IsEOF()) return wxEmptyString; m_Str = m_Buffer.Mid(start, m_TokenIndex - start); m_IsOperator = m_Str.IsSameAs(TokenizerConsts::operator_str); } #ifdef __WXMSW__ // This is a Windows only bug! else if (c == 178 || c == 179 || c == 185) // fetch ?and ? { m_Str = c; MoveToNextChar(); } #endif else if (wxIsdigit(CurrentChar())) { // numbers while (NotEOF() && CharInString(CurrentChar(), _T("0123456789.abcdefABCDEFXxLl"))) MoveToNextChar(); if (IsEOF()) return wxEmptyString; m_Str = m_Buffer.Mid(start, m_TokenIndex - start); m_IsOperator = false; } else if (CurrentChar() == '"' || CurrentChar() == '\'') { // string, char, etc. wxChar match = CurrentChar(); MoveToNextChar(); // skip starting ' or " if (!SkipToChar(match)) return wxEmptyString; MoveToNextChar(); // skip ending ' or " m_Str = m_Buffer.Mid(start, m_TokenIndex - start); } else if (CurrentChar() == ':') { if (NextChar() == ':') { MoveToNextChar(); MoveToNextChar(); m_Str.assign(TokenizerConsts::colon_colon); // this only copies a pointer, but operator= allocates memory and does a memcpy! 
} else { MoveToNextChar(); m_Str.assign(TokenizerConsts::colon); } } else if (CurrentChar() == '<' && bTemplate) { wxChar match = _T('>'); MoveToNextChar(); if (!SkipToOneOfChars(_T(">\r\n")),false) return wxEmptyString; MoveToNextChar(); wxString tmp = m_Buffer.Mid(start+1,m_TokenIndex-start-2); tmp.Trim(); m_Str = _T("<"); m_Str += tmp; m_Str += _T(">");//m_Buffer.Mid(start, m_TokenIndex - start); } else if (CurrentChar() == '(') { m_IsOperator = false; // skip blocks () [] if (!SkipBlock(CurrentChar())) return wxEmptyString; wxString tmp = m_Buffer.Mid(start, m_TokenIndex - start); // tmp.Replace(_T("\t"), _T(" ")); // replace tabs with spaces // tmp.Replace(_T("\n"), _T(" ")); // replace LF with spaces // tmp.Replace(_T("\r"), _T(" ")); // replace CR with spaces { // this is much faster: size_t i; while((i = tmp.find_first_of(TokenizerConsts::tabcrlf)) != wxString::npos) //tmp[i] = _T(' '); tmp.SetAt(i,_T(' ')); } // fix-up arguments (remove excessive spaces/tabs/newlines) for (unsigned int i = 0; i < tmp.Length() - 1; ++i) { //skip spaces before '=' and ',' if (tmp.GetChar(i) == ' ' && (tmp.GetChar(i + 1) == ',' || tmp.GetChar(i + 1) == '=')) continue; if (tmp.GetChar(i) == '/' && tmp.GetChar(i + 1) == '*') { // skip C comments i += 2; while (i < tmp.Length() - 1) { if (tmp.GetChar(i) == '*' && tmp.GetChar(i + 1) == '/') break; ++i; } if (i >= tmp.Length() - 1 || tmp.GetChar(i + 1) != '/') continue; // we failed... 
i += 2; } else if (tmp.GetChar(i) == '=') { // skip default assignments ++i; int level = 0; // nesting parenthesis while (i < tmp.Length()) { if (tmp.GetChar(i) == '(') ++level; else if (tmp.GetChar(i) == ')') --level; if ((tmp.GetChar(i) == ',' && level == 0) || (tmp.GetChar(i) == ')' && level < 0)) break; ++i; } if (i < tmp.Length() && tmp.GetChar(i) == ',') --i; continue; // we are done here } if (i < tmp.Length() - 1) { if ((tmp.GetChar(i) == ' ') && (tmp.GetChar(i + 1) == ' ')) continue; // skip excessive spaces // in case of c-style comments "i" might already be tmp.Length() // thus do only add the current char otherwise. // otherwise the following statement: // m_Str << _T(')'); // below would add another closing bracket. m_Str << tmp.GetChar(i); } } m_Str << _T(')'); // add closing parenthesis (see "i < tmp.Length() - 1" in previous "for") // m_Str.Replace(_T(" "), _T(" ")); // replace two-spaces with single-space (introduced if it skipped comments or assignments) // m_Str.Replace(_T("( "), _T("(")); // m_Str.Replace(_T(" )"), _T(")")); //Str.Replace is massive overkill here since it has to allocate one new block per replacement CompactSpaces(m_Str); } else { if (CurrentChar() == '{') ++m_NestLevel; else if (CurrentChar() == '}') --m_NestLevel; m_Str = CurrentChar(); MoveToNextChar(); } if (m_LastWasPreprocessor && !m_Str.IsSameAs(_T("#")) && !m_LastPreprocessor.IsSameAs(_T("#"))) { if (!m_LastPreprocessor.IsSameAs(TokenizerConsts::include_str)) { // except for #include and #if[[n]def], all other preprocessor directives need only // one word exactly after the directive, e.g. #define THIS_WORD SkipToEOL(); } m_LastPreprocessor.Clear(); } if (m_LastWasPreprocessor) m_LastPreprocessor << m_Str; m_LastWasPreprocessor = false; return m_Str; }