Ejemplo n.º 1
0
void Entity::handleTag( const Comment* openingTag, const Comment* closingTag, QTextStream& file, QIODevice& output) {

    file.seek(openingTag->getCommentEnd()+1);

    while (!file.atEnd()) {
        qint64 pos=file.pos();
        if (pos>=closingTag->getCommentStart()){
            file.seek(closingTag->getCommentEnd()+1);
            break;
        }
        QChar c1;
        file >> c1;
        const Comment* p= isSpecial(pos);

        if (p!=nullptr){
            if (p->isAutoClosing()){
                //     output.write(p->comment);
                //    output.write("<!--REMOVESTART-->");
                p->output(this,output);
                //    output.write("<!--REMOVEEND-->");
            } else {
                const Comment* closingTag=findClosingTag(p,file);
                if (closingTag != nullptr){
                    QBuffer buf;
                    buf.open(QBuffer::WriteOnly|QBuffer::Text);
                    handleTag(p,closingTag,file,buf);
                    buf.close();

                    if (false /*p->getTag()==STYLE_START*/){
                        //embedded_styles.append(buf.buffer());
                    } else {
                        output.write(buf.buffer());
                    }
                }
            }
        } else if (isInOuput(pos)){

            if (openingTag->isHTML() && c1=='\n'){
                output.write(QString("<BR/>").toUtf8());
            } else {
                if (!c1.isNonCharacter()) output.write(QString(c1).toUtf8());
            }
        }
    }

    return ;
}
Ejemplo n.º 2
0
/**
 * Renders this entity's source file into "<baseName>.html" inside
 * `outputSite`.
 *
 * The whole input file is run through handleTag() between two dummy tags
 * (one before the first character, one at the end of the file) and the
 * result is kept in memory. After check() has run, the output file is
 * written as: static_header_1, an optional <style> element holding
 * embedded_styles, static_header_2, the rendered body, and the page footer.
 *
 * @return true when the page was written; false when the input file or the
 *         output file could not be opened.
 */
bool Entity::executeComment(){

    QFile fil(fileInfo.absoluteFilePath());
    if (!fil.open(QIODevice::ReadOnly | QIODevice::Text))
        return false;   // was `return -1`, which converts to true

    QTextStream file_utf8(&fil);
    file_utf8.setAutoDetectUnicode(true);

    // Dummy tags bracketing the whole file so handleTag() processes it all.
    Comment dummyStartTag(DUMMY,-1,-1);
    Comment dummyEndTag(DUMMY,fil.size(),fil.size());

    qDebug() << fil.size();

    // Render the body first: embedded_styles is emitted before the body
    // below, so the body has to be produced ahead of the headers.
    QBuffer mem;
    mem.open(QBuffer::WriteOnly|QBuffer::Text);
    handleTag(&dummyStartTag, &dummyEndTag, file_utf8,mem );
    mem.close();

    check();

    QFileInfo outputfileInfo(outputSite,fileInfo.baseName()+".html");

    QFile output(outputfileInfo.absoluteFilePath());

    if (!output.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Truncate))
        return false;   // was `return -1`, which converts to true

    outputStringList(output,static_header_1);

    if (!embedded_styles.isEmpty()){
        output.write(QString("<style>").toUtf8());
        outputStringList(output,embedded_styles);
        output.write(QString("</style>").toUtf8());
    }

    outputStringList(output,static_header_2);

    output.write(mem.buffer());

    generatePageFooter(output);

    output.close();

    // The original fell off the end of a non-void function here.
    return true;
}
Ejemplo n.º 3
0
// Turns a tokenized document into the parser's internal ParsedDocument.
//
// The internal document is reset, text/content/metadata are taken over from
// the tokenizer output (metadata keys are conflated), and the token stream
// is walked once. Tag events recorded by the tokenizer are passed to
// handleTag() when their token position is reached; tokens are copied only
// while the parser is inside an include region and outside an exclude
// region. Everything is wrapped in a synthetic "document" tag.
//
// Returns a pointer to the parser-owned _document.
indri::api::ParsedDocument* indri::parse::TaggedTextParser::parse( indri::parse::TokenizedDocument* document ) {
  // The term buffer is sized generously: room is needed for
  // relative->absolute URL expansion.
  _termBuffer.clear();
  _termBuffer.grow( document->textLength * 4 );

  _document.text = document->text;
  _document.textLength = document->textLength;

  // Free the tag objects still held by _document before the lists are
  // emptied.
  for (size_t tagIdx = 0; tagIdx < _document.tags.size(); ++tagIdx) {
    delete _document.tags[tagIdx];
  }

  _document.terms.clear();
  _document.tags.clear();
  _document.positions.clear();

  // Metadata keys go through the same conflation as tag names.
  _document.metadata = document->metadata;
  for (size_t metaIdx = 0; metaIdx < _document.metadata.size(); ++metaIdx) {
    _document.metadata[metaIdx].key =
      _p_conflater->conflate(_document.metadata[metaIdx].key);
  }

  _document.content = document->content;
  _document.contentLength = document->contentLength;

  token_pos = 0;
  tokens_excluded = 0;

  // Cursors over the tokenizer output: extents, term strings, tag events.
  indri::utility::greedy_vector<TermExtent>::iterator extentIt =
    document->positions.begin();
  indri::utility::greedy_vector<char*>::iterator termIt =
    document->terms.begin();
  indri::utility::greedy_vector<TagEvent>::iterator tagIt =
    document->tags.begin();

  initialize(document, &_document);

  // Open a global parent element enclosing all other tags.
  // parameters need to be set for this in the FCE
  TagEvent globalTag;
  globalTag.name = "document";
  globalTag.open_tag = true;
  globalTag.pos = 0;
  globalTag.begin = 0;
  globalTag.end = 0;
  handleTag(&globalTag);

  for ( ; extentIt != document->positions.end(); ++extentIt, ++termIt, ++token_pos ) {

    // Tags come first: a tag at this token position may open an include
    // region, in which case the token at this position must be kept too.
    // Several tags can share one token position.
    while ( tagIt != document->tags.end() && token_pos == tagIt->pos ) {
      // Shift the tag position by the number of tokens dropped so far.
      tagIt->pos -= tokens_excluded;
      handleTag( &(*tagIt) );
      ++tagIt;
    }

    // Token-level processing hook: a token is kept only when it is inside
    // an include region and not inside an exclude region.
    if ( !_exclude && _include ) {
      _document.positions.push_back( *extentIt );
      _document.terms.push_back( *termIt );
    } else {
      ++tokens_excluded;
    }
  }

  // All tokens consumed: handle the tags sitting at the final position so
  // that anything still open gets closed.
  while ( tagIt != document->tags.end() && token_pos == tagIt->pos ) {
    tagIt->pos -= tokens_excluded;
    handleTag( &(*tagIt) );
    ++tagIt;
  }

  // Close the global document tag.
  globalTag.open_tag = false;
  globalTag.pos = _document.positions.size();
  globalTag.begin = _document.textLength;
  globalTag.end = _document.textLength;
  handleTag(&globalTag);

  // Tag lists are actually written in the cleanup function:
  cleanup(document, &_document);
  return &_document;
}
Ejemplo n.º 4
0
/**
 * Runs the meta-scanner state machine, starting in `state`.
 *
 * Characters are obtained from read(). The loop ends when read() returns -1
 * or when handleTag() returns true; the state reached at that point is
 * stored in stateSave.
 *
 * Structure notes:
 *  - `reconsume` set to true makes the next state that checks it reuse the
 *    current character `c` instead of calling read().
 *  - NS_HTML5_BREAK / NS_HTML5_CONTINUE are macros defined outside this
 *    function. Presumably NS_HTML5_BREAK(x) jumps to the `x_end` label and
 *    NS_HTML5_CONTINUE(x) restarts the loop labelled `x` -- confirm against
 *    the macro definitions.
 *  - Cases have no `break`: after an inner loop's `*_end` label, control
 *    falls through into the case that follows, so the order of the cases
 *    matters.
 *  - metaState records how much of the tag name matched "meta"
 *    (M, E, T, A, or NO), compared case-insensitively.
 *  - contentIndex / charsetIndex / httpEquivIndex count how many characters
 *    of the attribute name matched CONTENT / CHARSET / HTTP_EQUIV so far;
 *    PR_INT32_MAX marks a failed match.
 *
 * @param state the scanner state to start from.
 */
void 
nsHtml5MetaScanner::stateLoop(PRInt32 state)
{
  PRInt32 c = -1;
  bool reconsume = false;
  stateloop: for (; ; ) {
    switch(state) {
      // Plain data: skip everything until a '<' is seen.
      case NS_HTML5META_SCANNER_DATA: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '<': {
              state = NS_HTML5META_SCANNER_TAG_OPEN;
              NS_HTML5_BREAK(dataloop);
            }
            default: {
              continue;
            }
          }
        }
        dataloop_end: ;
      }
      // Character right after '<': decide what kind of markup follows.
      case NS_HTML5META_SCANNER_TAG_OPEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case 'm':
            case 'M': {
              // Possible start of a "meta" tag name.
              metaState = NS_HTML5META_SCANNER_M;
              state = NS_HTML5META_SCANNER_TAG_NAME;
              NS_HTML5_BREAK(tagopenloop);
            }
            case '!': {
              state = NS_HTML5META_SCANNER_MARKUP_DECLARATION_OPEN;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '\?':
            case '/': {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
                // Some other tag name: cannot be "meta".
                metaState = NS_HTML5META_SCANNER_NO;
                state = NS_HTML5META_SCANNER_TAG_NAME;
                NS_HTML5_BREAK(tagopenloop);
              }
              // Not a tag after all: hand the character back to DATA.
              state = NS_HTML5META_SCANNER_DATA;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        tagopenloop_end: ;
      }
      // Inside a tag name: advance metaState through M -> E -> T -> A, or
      // drop to NO on any character that breaks the sequence.
      case NS_HTML5META_SCANNER_TAG_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(tagnameloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case 'e':
            case 'E': {
              if (metaState == NS_HTML5META_SCANNER_M) {
                metaState = NS_HTML5META_SCANNER_E;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            case 't':
            case 'T': {
              if (metaState == NS_HTML5META_SCANNER_E) {
                metaState = NS_HTML5META_SCANNER_T;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            case 'a':
            case 'A': {
              if (metaState == NS_HTML5META_SCANNER_T) {
                metaState = NS_HTML5META_SCANNER_A;
              } else {
                metaState = NS_HTML5META_SCANNER_NO;
              }
              continue;
            }
            default: {
              metaState = NS_HTML5META_SCANNER_NO;
              continue;
            }
          }
        }
        tagnameloop_end: ;
      }
      // Between attributes: skip whitespace; the first character of the
      // attribute name primes the name-matching indexes.
      case NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              continue;
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              // End of tag: stop scanning entirely if handleTag() says so.
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case 'c':
            case 'C': {
              // Name may still turn out to be CONTENT or CHARSET.
              contentIndex = 0;
              charsetIndex = 0;
              httpEquivIndex = PR_INT32_MAX;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
            case 'h':
            case 'H': {
              // Name may still turn out to be HTTP_EQUIV.
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              httpEquivIndex = 0;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
            default: {
              // No tracked attribute name starts with this character.
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              httpEquivIndex = PR_INT32_MAX;
              contentTypeIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_BREAK(beforeattributenameloop);
            }
          }
        }
        beforeattributenameloop_end: ;
      }
      // Inside an attribute name: when the tag name matched "meta"
      // (metaState == A), compare the name against CONTENT, CHARSET and
      // HTTP_EQUIV one character at a time.
      case NS_HTML5META_SCANNER_ATTRIBUTE_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '=': {
              strBufLen = 0;
              contentTypeIndex = 0;
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE;
              NS_HTML5_BREAK(attributenameloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              if (metaState == NS_HTML5META_SCANNER_A) {
                // Fold ASCII upper case to lower case before comparing.
                if (c >= 'A' && c <= 'Z') {
                  c += 0x20;
                }
                if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
                  ++contentIndex;
                } else {
                  contentIndex = PR_INT32_MAX;
                }
                if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
                  ++charsetIndex;
                } else {
                  charsetIndex = PR_INT32_MAX;
                }
                if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
                  ++httpEquivIndex;
                } else {
                  httpEquivIndex = PR_INT32_MAX;
                }
              }
              continue;
            }
          }
        }
        attributenameloop_end: ;
      }
      // After '=': skip whitespace and pick the quoting style of the value.
      case NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              continue;
            }
            case '\"': {
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
              NS_HTML5_BREAK(beforeattributevalueloop);
            }
            case '\'': {
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_SINGLE_QUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // First character of an unquoted value.
              handleCharInAttributeValue(c);
              state = NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_UNQUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        beforeattributevalueloop_end: ;
      }
      // Double-quoted value: feed characters to handleCharInAttributeValue()
      // until the closing '"'.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_DOUBLE_QUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '\"': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED;
              NS_HTML5_BREAK(attributevaluedoublequotedloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
        attributevaluedoublequotedloop_end: ;
      }
      // Character right after a closing quote.
      case NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '/': {
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_BREAK(afterattributevaluequotedloop);
            }
            case '>': {
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              // Anything else is treated as the start of another attribute.
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        afterattributevaluequotedloop_end: ;
      }
      // After '/': a following '>' ends the tag; anything else is
      // reconsumed as the start of an attribute.
      case NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG: {
        c = read();
        switch(c) {
          case -1: {
            NS_HTML5_BREAK(stateloop);
          }
          case '>': {
            if (handleTag()) {
              NS_HTML5_BREAK(stateloop);
            }
            state = NS_HTML5META_SCANNER_DATA;
            NS_HTML5_CONTINUE(stateloop);
          }
          default: {
            state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
            reconsume = true;
            NS_HTML5_CONTINUE(stateloop);
          }
        }
      }
      // Unquoted value: ends at whitespace or '>'.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_UNQUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              handleAttributeValue();
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
      }
      // Whitespace seen after an attribute name: wait for '=', the end of
      // the tag, or the start of another attribute name.
      case NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_NAME: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case ' ':
            case '\t':
            case '\n':
            case '\f': {
              continue;
            }
            case '/': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_SELF_CLOSING_START_TAG;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '=': {
              strBufLen = 0;
              contentTypeIndex = 0;
              state = NS_HTML5META_SCANNER_BEFORE_ATTRIBUTE_VALUE;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              handleAttributeValue();
              if (handleTag()) {
                NS_HTML5_BREAK(stateloop);
              }
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case 'c':
            case 'C': {
              contentIndex = 0;
              charsetIndex = 0;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              contentIndex = PR_INT32_MAX;
              charsetIndex = PR_INT32_MAX;
              state = NS_HTML5META_SCANNER_ATTRIBUTE_NAME;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
      }
      // After "<!": a '-' may begin a comment; anything else is skipped up
      // to the next '>'.
      case NS_HTML5META_SCANNER_MARKUP_DECLARATION_OPEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '-': {
              state = NS_HTML5META_SCANNER_MARKUP_DECLARATION_HYPHEN;
              NS_HTML5_BREAK(markupdeclarationopenloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        markupdeclarationopenloop_end: ;
      }
      // After "<!-": a second '-' confirms a comment start.
      case NS_HTML5META_SCANNER_MARKUP_DECLARATION_HYPHEN: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_START;
              NS_HTML5_BREAK(markupdeclarationhyphenloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_SCAN_UNTIL_GT;
              reconsume = true;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        markupdeclarationhyphenloop_end: ;
      }
      // First character after "<!--".
      case NS_HTML5META_SCANNER_COMMENT_START: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_START_DASH;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_BREAK(commentstartloop);
            }
          }
        }
        commentstartloop_end: ;
      }
      // Comment body: skip characters until a '-' is seen.
      case NS_HTML5META_SCANNER_COMMENT: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_END_DASH;
              NS_HTML5_BREAK(commentloop);
            }
            default: {
              continue;
            }
          }
        }
        commentloop_end: ;
      }
      // One '-' seen inside a comment: a second one leads to COMMENT_END.
      case NS_HTML5META_SCANNER_COMMENT_END_DASH: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '-': {
              state = NS_HTML5META_SCANNER_COMMENT_END;
              NS_HTML5_BREAK(commentenddashloop);
            }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
        commentenddashloop_end: ;
      }
      // "--" seen inside a comment: '>' closes it, further '-' are
      // tolerated, anything else returns to the comment body.
      case NS_HTML5META_SCANNER_COMMENT_END: {
        for (; ; ) {
          c = read();
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            case '-': {
              continue;
            }
            default: {
              state = NS_HTML5META_SCANNER_COMMENT;
              NS_HTML5_CONTINUE(stateloop);
            }
          }
        }
      }
      // "<!---" seen: decide between comment end, early close, or body.
      case NS_HTML5META_SCANNER_COMMENT_START_DASH: {
        c = read();
        switch(c) {
          case -1: {
            NS_HTML5_BREAK(stateloop);
          }
          case '-': {
            state = NS_HTML5META_SCANNER_COMMENT_END;
            NS_HTML5_CONTINUE(stateloop);
          }
          case '>': {
            state = NS_HTML5META_SCANNER_DATA;
            NS_HTML5_CONTINUE(stateloop);
          }
          default: {
            state = NS_HTML5META_SCANNER_COMMENT;
            NS_HTML5_CONTINUE(stateloop);
          }
        }
      }
      // Single-quoted value: same handling as the double-quoted case, with
      // '\'' as the terminator.
      case NS_HTML5META_SCANNER_ATTRIBUTE_VALUE_SINGLE_QUOTED: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '\'': {
              handleAttributeValue();
              state = NS_HTML5META_SCANNER_AFTER_ATTRIBUTE_VALUE_QUOTED;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              handleCharInAttributeValue(c);
              continue;
            }
          }
        }
      }
      // Ignore everything up to and including the next '>'.
      case NS_HTML5META_SCANNER_SCAN_UNTIL_GT: {
        for (; ; ) {
          if (reconsume) {
            reconsume = false;
          } else {
            c = read();
          }
          switch(c) {
            case -1: {
              NS_HTML5_BREAK(stateloop);
            }
            case '>': {
              state = NS_HTML5META_SCANNER_DATA;
              NS_HTML5_CONTINUE(stateloop);
            }
            default: {
              continue;
            }
          }
        }
      }
    }
  }
  stateloop_end: ;
  // Remember the state reached so it is available after this call returns.
  stateSave = state;
}