void Entity::handleTag( const Comment* openingTag, const Comment* closingTag, QTextStream& file, QIODevice& output) { file.seek(openingTag->getCommentEnd()+1); while (!file.atEnd()) { qint64 pos=file.pos(); if (pos>=closingTag->getCommentStart()){ file.seek(closingTag->getCommentEnd()+1); break; } QChar c1; file >> c1; const Comment* p= isSpecial(pos); if (p!=nullptr){ if (p->isAutoClosing()){ // output.write(p->comment); // output.write("<!--REMOVESTART-->"); p->output(this,output); // output.write("<!--REMOVEEND-->"); } else { const Comment* closingTag=findClosingTag(p,file); if (closingTag != nullptr){ QBuffer buf; buf.open(QBuffer::WriteOnly|QBuffer::Text); handleTag(p,closingTag,file,buf); buf.close(); if (false /*p->getTag()==STYLE_START*/){ //embedded_styles.append(buf.buffer()); } else { output.write(buf.buffer()); } } } } else if (isInOuput(pos)){ if (openingTag->isHTML() && c1=='\n'){ output.write(QString("<BR/>").toUtf8()); } else { if (!c1.isNonCharacter()) output.write(QString(c1).toUtf8()); } } } return ; }
bool Entity::executeComment(){ QFile fil(fileInfo.absoluteFilePath()); if (!fil.open(QIODevice::ReadOnly | QIODevice::Text)) return -1; QTextStream file_utf8(&fil); file_utf8.setAutoDetectUnicode(true); Comment dummyStartTag(DUMMY,-1,-1); Comment dummyEndTag(DUMMY,fil.size(),fil.size()); qDebug() << fil.size(); QBuffer mem; mem.open(QBuffer::WriteOnly|QBuffer::Text); handleTag(&dummyStartTag, &dummyEndTag, file_utf8,mem ); mem.close(); check(); QFileInfo outputfileInfo(outputSite,fileInfo.baseName()+".html"); QFile output(outputfileInfo.absoluteFilePath()); if (!output.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Truncate)) return -1; outputStringList(output,static_header_1); if (!embedded_styles.isEmpty()){ output.write(QString("<style>").toUtf8()); outputStringList(output,embedded_styles); output.write(QString("</style>").toUtf8()); } outputStringList(output,static_header_2); output.write(mem.buffer()); generatePageFooter(output); output.close(); }
//
// parse
//
// Builds the parser-owned _document from a tokenized document: copies text,
// content and metadata (with conflated metadata keys), replays the
// tokenizer's tag events through handleTag, and keeps only the tokens that
// fall inside an included, non-excluded region.  Returns a pointer to the
// internal _document, which is reset on every call.
//
indri::api::ParsedDocument* indri::parse::TaggedTextParser::parse( indri::parse::TokenizedDocument* document ) {
  _termBuffer.clear();
  // need to leave room here for relative->absolute URL expansion
  _termBuffer.grow( document->textLength * 4 );

  _document.text = document->text;
  _document.textLength = document->textLength;

  // Tags left over from the previous call are deleted here before the
  // vector is emptied.
  for ( size_t tagIndex = 0; tagIndex < _document.tags.size(); ++tagIndex ) {
    delete _document.tags[tagIndex];
  }

  _document.terms.clear();
  _document.tags.clear();
  _document.positions.clear();

  // Metadata is copied, then each key is passed through the conflater.
  _document.metadata = document->metadata;
  for ( size_t metaIndex = 0; metaIndex < _document.metadata.size(); ++metaIndex ) {
    _document.metadata[metaIndex].key = _p_conflater->conflate( _document.metadata[metaIndex].key );
  }

  _document.content = document->content;
  _document.contentLength = document->contentLength;

  token_pos = 0;
  tokens_excluded = 0;

  // i / j walk the token extents and token strings in lock step;
  // k walks the tag events emitted by the tokenizer.
  indri::utility::greedy_vector<TermExtent>::iterator i = document->positions.begin();
  indri::utility::greedy_vector<char*>::iterator j = document->terms.begin();
  indri::utility::greedy_vector<TagEvent>::iterator k = document->tags.begin();

  initialize( document, &_document );

  // Open a synthetic "document" tag that is the parent of every other tag.
  // (parameters need to be set for this in the FCE)
  TagEvent globalTag;
  globalTag.name = "document";
  globalTag.open_tag = true;
  globalTag.pos = 0;     // start at the beginning
  globalTag.begin = 0;
  globalTag.end = 0;
  handleTag( &globalTag );

  for ( ; i != document->positions.end(); ++i, ++j, ++token_pos ) {
    // Tags are handled before the token at the same position, so that a
    // tag opening an include region also covers that token.  Several tags
    // may share one token position.
    for ( ; k != document->tags.end() && token_pos == (*k).pos; ++k ) {
      TagEvent& event = *k;
      // Shift the tag position by the number of tokens dropped so far.
      event.pos -= tokens_excluded;
      handleTag( &event );
    }

    // Token-level filtering: keep the token only when it is inside an
    // include region and not inside an exclude region.
    const bool keepToken = !_exclude && _include;
    if ( keepToken ) {
      _document.positions.push_back( (*i) );
      _document.terms.push_back( (*j) );
    } else {
      tokens_excluded++;
    }
  }

  // All tokens consumed: process the tag events sitting at the final
  // token position (typically closing tags).
  for ( ; k != document->tags.end() && token_pos == (*k).pos; ++k ) {
    TagEvent& event = *k;
    event.pos -= tokens_excluded;
    handleTag( &event );
  }

  // Close the synthetic document tag at the end of the kept tokens / text.
  globalTag.open_tag = false;
  globalTag.pos = _document.positions.size();
  globalTag.begin = _document.textLength;
  globalTag.end = _document.textLength;
  handleTag( &globalTag );

  // Tag lists are actually written in the cleanup function:
  cleanup( document, &_document );

  return &_document;
}
// Drives the meta scanner's character-level state machine.
//
// Starting in |state|, characters are pulled one at a time with read() and
// dispatched on the current state.  While scanning, the function tracks
// whether the current tag name spells "meta" (metaState), how far the
// current attribute name matches CONTENT / CHARSET / HTTP_EQUIV (the
// *Index members), and feeds attribute value characters to
// handleCharInAttributeValue() / handleAttributeValue().  handleTag() is
// called when a tag ends with '>'; a true return from it stops the scan.
// On exit the current state is stored in stateSave.
//
// Control-flow conventions used throughout (the macros are defined outside
// this block - presumably NS_HTML5_BREAK(x) jumps to the label |x_end| and
// NS_HTML5_CONTINUE(x) jumps to the label |x|; confirm against the macro
// definitions):
//  - read() yielding -1 always leaves the whole state loop (presumably it
//    signals end of input - confirm).
//  - NS_HTML5_CONTINUE(stateloop) re-dispatches on the updated |state|.
//  - NS_HTML5_BREAK(<innerloop>) lands on |<innerloop>_end| at the bottom of
//    the case, which has no break, so execution falls through into the case
//    that follows.  The ORDER OF THE CASES IS THEREFORE SIGNIFICANT.
//  - A bare |continue| re-runs the innermost for (; ; ) loop, i.e. stays in
//    the same state and reads the next character.
void nsHtml5MetaScanner::stateLoop(PRInt32 state) {
  PRInt32 c = -1;
  // When true, the next state that honours it re-examines the current |c|
  // instead of calling read() again.
  bool reconsume = false;
  stateloop: for (; ; ) {
    switch(state) {
      // Plain data: skip everything until a '<' opens a tag.
      case NS_HTML5META_SCANNER_DATA: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '<': {
              state = NS_HTML5META_SCANNER_TAG_OPEN;
              NS_HTML5_BREAK(dataloop);
            }
            default: { continue; }
          }
        }
        dataloop_end: ;
      }
      // Just after '<': decide what kind of markup follows.
      case NS_HTML5META_SCANNER_TAG_OPEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            // 'm' may be the start of "meta": begin tracking the tag name.
            case 'm': case 'M': {
              metaState = NS_HTML5META_SCANNER_M;
              state = NS_HTML5META_SCANNER_TAG_NAME;
              NS_HTML5_BREAK(tagopenloop);
            }
            case '!': {
              state = NS_HTML5META_SCANNER_MARKUP_DECLARATION_OPEN;
              NS_HTML5_CONTINUE(stateloop);
            }
            // "<?" and "</" are skipped up to the next '>'.
            case '\?': case '/': {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // Any other ASCII letter starts a tag that cannot be "meta".
              if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
                metaState = NS_HTML5META_SCANNER_NO;
                state = NS_HTML5META_SCANNER_TAG_NAME;
                NS_HTML5_BREAK(tagopenloop);
              }
              // Not a tag after all: hand the same character back to DATA.
              state = NS_HTML5META_SCANNER_DATA;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        tagopenloop_end: ;
      }
      // Inside the tag name: advance metaState M -> E -> T -> A while the
      // letters arrive in that order; any other character resets it to NO.
      case NS_HTML5META_SCANNER_TAG_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': {
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(tagnameloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case 'e': case 'E': {
              if (metaState == NS_HTML5META_SCANNER_M) {
                metaState = NS_HTML5META_SCANNER_E;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            case 't': case 'T': {
              if (metaState == NS_HTML5META_SCANNER_E) {
                metaState = NS_HTML5META_SCANNER_T;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            case 'a': case 'A': {
              if (metaState == NS_HTML5META_SCANNER_T) {
                metaState = NS_HTML5META_SCANNER_A;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            default: {
              metaState = NS_HTML5META_SCANNER_NO;
              continue;
            }
          }
        }
        tagnameloop_end: ;
      }
      // Between attributes: skip whitespace, then prime the match indices
      // from the first character of the attribute name.  An index of 0 means
      // "first character matched"; PR_INT32_MAX means "cannot match".
      case NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': { continue; }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              // End of tag: a true result from handleTag() ends the scan.
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            // 'c' keeps both the CONTENT and the CHARSET match alive.
            case 'c': case 'C': {
              contentIndex = 0;
              charsetIndex = 0;
              httpEquivIndex = PR_INT32_MAX;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
            // 'h' keeps only the HTTP_EQUIV match alive.
            case 'h': case 'H': {
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              httpEquivIndex = 0;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
            default: {
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              httpEquivIndex = PR_INT32_MAX;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
          }
        }
        beforeattributenameloop_end: ;
      }
      // Inside an attribute name: keep matching against the known names.
      case NS_HTML5META_SCANNER_ATTRIBUTE_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': {
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '=': {
              // A value follows: reset the value buffer length and the
              // content-type match index.
              strBufLen = 0;
              contentTypeIndex = 0;
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE;
              NS_HTML5_BREAK(attributenameloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // Attribute names are only matched when the tag name was "meta".
              if (metaState == NS_HTML5META_SCANNER_A) {
                // Fold ASCII upper case to lower case before comparing.
                if (c >= 'A' && c <= 'Z') {
                  c += 0x20;
                }
                // Each index advances on a match and is otherwise poisoned
                // with PR_INT32_MAX so it can no longer match.
                if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
                  ++contentIndex;
                } else {
                  contentIndex = PR_INT32_MAX;
                }
                if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
                  ++charsetIndex;
                } else {
                  charsetIndex = PR_INT32_MAX;
                }
                if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
                  ++httpEquivIndex;
                } else {
                  httpEquivIndex = PR_INT32_MAX;
                }
              }
              continue;
            }
          }
        }
        attributenameloop_end: ;
      }
      // After '=': skip whitespace and pick the quoting style of the value.
      case NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': { continue; }
            case '\"': {
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
              NS_HTML5_BREAK(beforeattributevalueloop);
            }
            case '\'': {
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_SINGLE_QUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // First character of an unquoted value is consumed here.
              handleCharInAttributeValue(c);
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_UNQUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        beforeattributevalueloop_end: ;
      }
      // Double-quoted value: collect characters up to the closing '"'.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_DOUBLE_QUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '\"': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED;
              NS_HTML5_BREAK(attributevaluedoublequotedloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
        attributevaluedoublequotedloop_end: ;
      }
      // Right after a quoted value has been closed.
      case NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': {
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_BREAK(afterattributevaluequotedloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // Anything else starts the next attribute name: reprocess it.
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        afterattributevaluequotedloop_end: ;
      }
      // After '/' inside a tag: only '>' actually ends the tag.
      case NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG: {
        c = read();
        switch(c) {
          case -1: { NS_HTML5_BREAK(stateloop); }
          case '>': {
            if (handleTag()) {
              NS_HTML5_BREAK(stateloop);
            }
            state = NS_HTML5META_SCANNER_DATA;
            NS_HTML5_CONTINUE(stateloop);
          }
          default: {
            state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
            reconsume = true;
            NS_HTML5_CONTINUE(stateloop);
          }
        }
      }
      // Unquoted value: ends at whitespace or '>'.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_UNQUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              handleAttributeValue();
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
      }
      // Whitespace seen after an attribute name: either '=' introduces its
      // value or a new attribute name begins.
      case NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case ' ': case '\t': case '\n': case '\f': { continue; }
            case '/': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '=': {
              strBufLen = 0;
              contentTypeIndex = 0;
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              handleAttributeValue();
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            // Note: unlike BEFORE_ATTRIBUTE_NAME, only contentIndex and
            // charsetIndex are reset here; httpEquivIndex and
            // contentTypeIndex keep their previous values.
            case 'c': case 'C': {
              contentIndex = 0;
              charsetIndex = 0;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
      }
      // After "<!": a '-' may start a comment, anything else is skipped
      // up to the next '>'.
      case NS_HTML5META_SCANNER_MARKUP_DECLARATION_OPEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '-': {
              state = NS_HTML5META_SCANNER_MARKUP_DECLARATION_HYPHEN;
              NS_HTML5_BREAK(markupdeclarationopenloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        markupdeclarationopenloop_end: ;
      }
      // After "<!-": a second '-' completes the comment opener.
      case NS_HTML5META_SCANNER_MARKUP_DECLARATION_HYPHEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_START;
              NS_HTML5_BREAK(markupdeclarationhyphenloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        markupdeclarationhyphenloop_end: ;
      }
      // Right after "<!--".
      case NS_HTML5META_SCANNER_COMMENT_START: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_START_DASH;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_BREAK(commentstartloop);
            }
          }
        }
        commentstartloop_end: ;
      }
      // Comment body: skip until a '-' is seen.
      case NS_HTML5META_SCANNER_COMMENT: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_END_DASH;
              NS_HTML5_BREAK(commentloop);
            }
            default: { continue; }
          }
        }
        commentloop_end: ;
      }
      // One '-' seen inside a comment.
      case NS_HTML5META_SCANNER_COMMENT_END_DASH: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_END;
              NS_HTML5_BREAK(commentenddashloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        commentenddashloop_end: ;
      }
      // "--" seen inside a comment: '>' closes it, further '-' stay here.
      case NS_HTML5META_SCANNER_COMMENT_END: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '-': { continue; }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
      }
      // "<!---" : a dash directly after the comment opener.
      case NS_HTML5META_SCANNER_COMMENT_START_DASH: {
        c = read();
        switch(c) {
          case -1: { NS_HTML5_BREAK(stateloop); }
          case '-': {
            state = NS_HTML5META_SCANNER_COMMENT_END;
            NS_HTML5_CONTINUE(stateloop);
          }
          case '>': {
            state = NS_HTML5META_SCANNER_DATA;
            NS_HTML5_CONTINUE(stateloop);
          }
          default: {
            state = NS_HTML5META_SCANNER_COMMENT;
            NS_HTML5_CONTINUE(stateloop);
          }
        }
      }
      // Single-quoted value: collect characters up to the closing '\''.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_SINGLE_QUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '\'': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
      }
      // Skip everything up to and including the next '>'.
      case NS_HTML5META_SCANNER_SCAN_UNTIL_GT: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: { NS_HTML5_BREAK(stateloop); }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: { continue; }
          }
        }
      }
    }
  }
  stateloop_end: ;
  // Remember the state the scan stopped in.
  stateSave = state;
}