void parseSourceFile(pSourceModule* pMod, bool debug=false) { boost::object_pool<pSourceRef> tokenPool; lexer::pLexer lexer(pMod->source()); void* pParser = corvusParseAlloc(malloc); #ifndef NDEBUG // DEBUG if (debug) corvusParseTrace(stderr, (char*)"trace: "); #endif // start at begining of source file AST::pParseContext& context = pMod->context(); context.incLineNum(); // line 1 context.setLastToken(tokenPool.construct(pSourceRef(lexer.sourceBegin(), 0))); context.setLastNewline(lexer.sourceBegin()); pSourceRef* curRange; pSourceCharIterator lastNL; bool inlineHtml = false; std::string HEREDOC_ID; pSourceCharIterator sourceEnd(lexer.sourceEnd()); lexer::rmatch match(lexer.sourceBegin(), lexer.sourceEnd()); do { corvus_nextLangToken(match); //std::cout << "match: [" << match.str() << "]\n"; // always make a range unless the scanner didn't match, in which case // we handle separately below if (match.id != match.npos()) { curRange = tokenPool.construct(pSourceRef(match.start, match.end-match.start)); context.setTokenLine(curRange); } switch (match.id) { case 0: { // end of input (success) break; } case T_CLOSE_TAG: { // we swallow one newline if it follows //if ((match.end != sourceEnd) && (*match.end == '\n')) // ++tokEnd; break; } case T_OPEN_TAG: { // state change (no parse), but count newlines from OPEN tag countNewlines(context, match, lastNL); break; } case ~0: // npos { // if state is HTML, collect characters for INLINE HTML token if (match.state == 0) { // we go until a single < is found, or end of input // this potentially breaks up inline htmls // at tags that don't turn out to be php open tags, // but that way we let the lexer handle the matching // and limit the special handler code here while ((*match.end != '<') && (match.end != sourceEnd)) { match.end++; } inlineHtml = true; curRange = tokenPool.construct(pSourceRef(match.start, match.end-match.start)); context.setTokenLine(curRange); countNewlines(context, match, lastNL); } // if state is HEREDOC, collect heredoc string, looking for heredoc id else if (match.state == 3) { // assert we have a heredoc ID assert(HEREDOC_ID.length() && "no heredoc id"); std::pair<pSourceCharIterator,pSourceCharIterator> idr = find_heredoc_id(HEREDOC_ID, lexer, match, pMod); countNewlines(context, match, lastNL); curRange = tokenPool.construct(pSourceRef(match.start, match.end-match.start)); context.setTokenLine(curRange); corvusParse(pParser, T_HEREDOC_STRING, curRange, pMod); match.start = idr.first; match.end = idr.second; curRange = tokenPool.construct(pSourceRef(match.start, match.end-match.start)); context.setTokenLine(curRange); corvusParse(pParser, T_HEREDOC_END, curRange, pMod); match.state = 1; HEREDOC_ID.clear(); } else { // unmatched token: error pMod->context().parseError(curRange, pSourceRange()); } break; } case T_HEREDOC_START: { // save the heredoc id so we can match the end pSourceCharIterator ms = match.start; while (*ms == '<' || *ms == ' ' || *ms == '\t' || *ms == '\'' || *ms == '"' ) ms++; if (*(match.end-2) == '"' || *(match.end-2) == '\'') HEREDOC_ID.assign(ms, match.end-2); else HEREDOC_ID.assign(ms, match.end-1); countNewlines(context, match, lastNL); corvusParse(pParser, T_HEREDOC_START, curRange, pMod); break; } case T_WHITESPACE: case T_INLINE_HTML: case T_DOC_COMMENT: case T_MULTILINE_COMMENT: case T_SINGLELINE_COMMENT: { // handle newlines countNewlines(context, match, lastNL); break; } default: { // parse corvusParse(pParser, match.id, curRange, pMod); break; } } // next token context.setLastToken(curRange); } while (match.id != 0); // finish parse corvusParse(pParser, 0, 0, pMod); // note, this may generate a parse error still context.finishParse(); // so don't finish until here corvusParseFree(pParser, free); }
void pLexer::dumpTokens(void) { std::string tokID; std::stringstream val; std::string HEREDOC_ID; rmatch match(sourceBegin_, sourceEnd_); do { corvus_nextLangToken(match); if (match.id == 0) { // end of input break; } else if (match.id == match.npos()) { // if state is HTML, collect characters for INLINE HTML token if (match.state == 0) { while ((*match.end != '<') && (match.end != sourceEnd_)) { match.end++; } std::cout << match.str() << " " << getTokenDescription(T_INLINE_HTML) << std::endl; } // if state is HEREDOC, collect heredoc string, looking for heredoc id else if (match.state == 3) { // assert we have a heredoc ID assert(HEREDOC_ID.length() && "no heredoc id"); match.end--; // we need to reverse this once to check for the // case of a heredoc with no body, only a newline look_for_id: if (sourceEnd_ - match.end < HEREDOC_ID.length()) { // the remaining source text is shorter than the heredocid length, // which means we're never going to match it std::cout << "dangling HEREDOC looking for: \"" << HEREDOC_ID << "\"" << std::endl; break; } pSourceCharIterator ms = match.end; pSourceCharIterator me = match.end+HEREDOC_ID.length(); std::string maybeID(ms, me); if (maybeID != HEREDOC_ID) { while ((*match.end != '\n') && (match.end != sourceEnd_)) { match.end++; } match.end++; // skip newline goto look_for_id; } // if we get here, we matched the heredoc id std::cout << match.str() << " " << getTokenDescription(T_DQ_STRING) << std::endl; match.start = ms; match.end = me; std::cout << match.str() << " " << getTokenDescription(T_HEREDOC_END) << std::endl; match.state = 1; HEREDOC_ID.clear(); } else { // unmatched character in PHP state std::cout << "breaking on unmatched: " << match.str() << std::endl; break; } } else { // matched // skip plain newlines in html state val.str(""); if (match.id == T_HEREDOC_START) { // save the heredoc id so we can match the end pSourceCharIterator ms = match.start; while (*ms == '<' || *ms == ' ' || *ms == '\t' || *ms == '\'' || *ms == '"' ) ms++; if (*(match.end-2) == '"' || *(match.end-2) == '\'') HEREDOC_ID.assign(ms, match.end-2); // cut end quote and newline else HEREDOC_ID.assign(ms, match.end-1); // just cut newline std::cout << std::string(match.start, match.end-1) << " T_HEREDOC_START" << std::endl; continue; } if (match.id != T_WHITESPACE) val << match.str(); if ((match.state == 0) && (val.str() == "\n")) continue; tokID = getTokenDescription(match.id); if (tokID.size() == 0) tokID = val.str(); std::cout << val.str() << " " << tokID << std::endl; } } while (match.id != 0); }