Example #1
0
// Generates the ComTdbFastExtract TDB for this fast-extract operator.
//
// The child is code-generated first and its generated object is handed to
// ft_codegen() as the child TDB, together with the queue/buffer sizing
// defaults and the target, host, table, delimiter, header, null-string and
// record-separator strings copied into the generator's space. The target
// type, append, header, Hive-insert, compression and diagnostics flags are
// then set on the new TDB.
//
// Returns the value returned by ft_codegen().
short
PhysicalFastExtract::codeGen(Generator *generator)
{
  Space *space = generator->getSpace();
  CmpContext *cmpContext = generator->currentCmpContext();

  // Queue and buffer sizing comes from the GEN_FE_* defaults; the output,
  // request and reply buffers all share the single GEN_FE_BUFFER_SIZE value.
  const ULng32 downQueueMaxSize = getDefault(GEN_FE_SIZE_DOWN);
  const ULng32 upQueueMaxSize = getDefault(GEN_FE_SIZE_UP);
  const ULng32 defaultBufferSize = getDefault(GEN_FE_BUFFER_SIZE);
  const ULng32 outputBufferSize = defaultBufferSize;
  const ULng32 requestBufferSize = defaultBufferSize;
  const ULng32 replyBufferSize = defaultBufferSize;
  const ULng32 numOutputBuffers = getDefault(GEN_FE_NUM_BUFFERS);

  // used in runtime stats
  Cardinality estimatedRowCount = (Cardinality)
                       (getInputCardinality() * getEstRowsUsed()).getValue();

  ex_cri_desc * givenDesc = generator->getCriDesc(Generator::DOWN);

  // Allocate a new map table for this child, then generate the child. The
  // child's TDB and explain tuple are read back from the generator.
  generator->appendAtEnd();
  generator->setCriDesc(givenDesc, Generator::DOWN);
  child(0)->codeGen(generator);
  ComTdb * childTdb = (ComTdb *)(generator->getGenObj());
  ExplainTuple *firstExplainTuple = generator->getExplainTuple();

  Int32 hdfsPortNum = getHdfsPort();

  // For non-Hive targets, a delimiter / record separator that
  // isSpecialChar() recognises is replaced by a one-character string
  // holding the character it reports back through specChar.
  char * newDelimiter = (char *)getDelimiter().data();
  char specChar = '0';
  if (!isHiveInsert() && isSpecialChar(newDelimiter, specChar))
  {
    newDelimiter = new (cmpContext->statementHeap()) char[2];
    newDelimiter[0] = specChar;
    newDelimiter[1] = '\0';
  }

  char * newRecordSep = (char *)getRecordSeparator().data();
  specChar = '0';
  if (!isHiveInsert() && isSpecialChar(newRecordSep, specChar))
  {
    newRecordSep = new (cmpContext->statementHeap()) char[2];
    newRecordSep[0] = specChar;
    newRecordSep[1] = '\0';
  }

  // Copy every string the TDB refers to into the generator's space.
  char * targetName = AllocStringInSpace(*space, (char *)getTargetName().data());
  char * hdfsHostName = AllocStringInSpace(*space, (char *)getHdfsHostName().data());
  char * hiveTableName = AllocStringInSpace(*space, (char *)getHiveTableName().data());
  char * delimiter = AllocStringInSpace(*space, newDelimiter);
  char * header = AllocStringInSpace(*space, (char *)getHeader().data());
  char * nullString = AllocStringInSpace(*space, (char *)getNullString().data());
  char * recordSeparator = AllocStringInSpace(*space, newRecordSep);

  // newTdb goes in as NULL and is dereferenced below, so ft_codegen() is
  // expected to fill it in.
  ComTdbFastExtract *newTdb = NULL;
  short result = ft_codegen(generator,
                            *this,
                            newTdb,
                            estimatedRowCount,
                            targetName,
                            hdfsHostName,
                            hdfsPortNum,
                            hiveTableName,
                            delimiter,
                            header,
                            nullString,
                            recordSeparator,
                            downQueueMaxSize,
                            upQueueMaxSize,
                            outputBufferSize,
                            requestBufferSize,
                            replyBufferSize,
                            numOutputBuffers,
                            childTdb,
                            isSequenceFile());

  if (!generator->explainDisabled())
  {
    generator->setExplainTuple(addExplainInfo(newTdb, firstExplainTuple, 0, generator));
  }

  if (getTargetType() == FILE)
    newTdb->setTargetFile(1);
  else if (getTargetType() == SOCKET)
    newTdb->setTargetSocket(1);
  else
    GenAssert(0, "Unexpected Fast Extract target type");

  if (isAppend())
    newTdb->setIsAppend(1);
  if (includeHeader())
    newTdb->setIncludeHeader(1);

  if (isHiveInsert())
  {
    // A Hive insert never writes a header, whatever includeHeader() says.
    newTdb->setIsHiveInsert(1);
    newTdb->setIncludeHeader(0);
    // NOTE(review): this stores this node's own value back into itself;
    // presumably the TDB was meant to receive it -- confirm.
    setOverwriteHiveTable( getOverwriteHiveTable());
  }

  if (getCompressionType() != NONE)
  {
    if (getCompressionType() == LZO)
      newTdb->setCompressLZO(1);
    else
      GenAssert(0, "Unexpected Fast Extract compression type");
  }

  if ((ActiveSchemaDB()->getDefaults()).getToken(FAST_EXTRACT_DIAGS) == DF_ON)
    newTdb->setPrintDiags(1);

  return result;
}
Example #2
0
/*
 *  IdentifyInputToken(char *caCleanInputTokens[], namerecord_t *record_table)
 *
 *  Walks the first m_nCleanInputTokens entries of caCleanInputTokens and, for
 *  each token, inserts one record into record_table. Checks are tried in this
 *  order and the first match wins:
 *    1. reserved word (longer than one character)
 *    2. number
 *    3. variable
 *    4. special symbol (single character), merged with the following token
 *       when the two form a valid symbol pair (>=, <=, <>, :=)
 *
 *  A token that matches none of the checks produces no record in this
 *  function; any error reporting for illegal lexemes has to come from the
 *  helper checks, which are not visible here.
 */
void IdentifyInputToken(char *caCleanInputTokens[], namerecord_t *record_table){

    int rw = 0;             // non-zero result of isReserverdWord; rw - 1 indexes m_naWsym
    int ss = 0;             // non-zero result of isSpecialChar; ss - 1 indexes m_naSpecialSymbols
    int vs = 0;             // non-zero result of validSymbolPair, stored in the pair's record
    int tknlen = 0;
    int i = 0;
    int LexRecordIndex = 0; // next free slot in record_table

    for (i = 0; i < m_nCleanInputTokens ; i++) {

        char *str = caCleanInputTokens[i];
        tknlen = (int)strlen(str);
        // typedef enum {lexConstant = 1, lexVar, lexProc} eLexemeKind;
        // [integer = constant = 1], [variable = var = 2], [reserved word = proc = 3]

        // reserved word
        if ( ( rw = isReserverdWord(str) ) && tknlen > 1 ){
            insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 0, ' ', ' ', str, m_naWsym[(rw - 1)] );
            continue;
        }

        // number
        if  (stringIsNumber(str)){
            insertNamerecord_table(LexRecordIndex++, record_table, lexConstant, 0, ' ', ' ', str, 3 );
            continue;
        }

        // variable
        if  (isValidVariableAndNotReserved(str)){
            insertNamerecord_table(LexRecordIndex++, record_table, lexVar, 0, ' ', ' ', str, 2 );
            continue;
        }

        // single-character special symbol
        if  ( ( ss = isSpecialChar(str[0]) ) && tknlen == 1 ){

            // Look ahead one token (without consuming it yet) to see whether
            // the two symbols form a pair.
            if (i < m_nCleanInputTokens - 1){
                char *strNext = caCleanInputTokens[i + 1];
                int tknLenNext = (int)strlen(strNext);

                if (tknLenNext == 1) {
                    // Only symbol codes 7 and 11 are accepted as the second
                    // half of a pair -- presumably '=' and '>'; confirm
                    // against isSpecialChar.
                    int ssNext = isSpecialChar(strNext[0]);

                    if ( (ssNext == 7 || ssNext == 11) &&
                         (vs = validSymbolPair(str[0], strNext[0])) ) {
                        // consume the look-ahead token as part of this record
                        i++;
                        insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 1, str[0], strNext[0], " ", vs );
                        continue;
                    }
                }
            }

            // not part of a pair: record the symbol on its own
            insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 0, ' ', ' ', str, m_naSpecialSymbols[(ss - 1)]  );
            continue;
        }
    }

}
Example #3
0
// Splits `text` into rows of TextFields and assigns each field a type.
//
// reset() is called first, then the text is scanned one character at a time
// (converted with toLatin1()) by a small state machine. Each '\n' seen in the
// IDLE or MULTI_COMMENT state starts a new Row in m_rows. When a state meets
// a character it does not own, it steps `i` back by one so the character is
// handled again in the IDLE state. After scanning, pickColor() is called on
// every field of every row.
void SyntaxHighlighter::colorize(QString text)
{
    Row *currentRow;
    TextField *field = NULL;
    enum {IDLE,
        MULTI_COMMENT,
        SPACES,
        WORD, GLOBAL_INCLUDE_FILE, COMMENT1,COMMENT,
        STRING,
        ESCAPED_CHAR,
        INC_STRING
    } state = IDLE;
    char c = '\n';
    char prevC = ' ';
    char prevPrevC = ' ';
    bool isEscaped = false;
    // Index of the first character after the "/*" that opened the current
    // multi-line comment. The opening '*' must not double as the '*' of the
    // closing "*/", otherwise "/*/" would be treated as a complete comment.
    int multiCommentBodyStart = 0;
    
    reset();

    currentRow = new Row;
    m_rows.push_back(currentRow);
    

    for(int i = 0;i < text.size();i++)
    {
        c = text[i].toLatin1();

        // Was the last character an escape (a backslash that is not itself
        // preceded by a backslash)?
        if(prevC == '\\' && prevPrevC != '\\')
            isEscaped = true;
        else
            isEscaped = false;
        prevPrevC = prevC;
        prevC = c;
        
        
        switch(state)
        {   
            case IDLE:
            {
                if(c == '/')
                {
                    // Might start a comment; COMMENT1 decides on the next char.
                    state = COMMENT1;
                    field = new TextField;
                    field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == ' ' || c == '\t')
                {
                    state = SPACES;
                    field = new TextField;
                    field->m_type = TextField::SPACES;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '\'')
                {
                    state = ESCAPED_CHAR;
                    field = new TextField;
                    field->m_type = TextField::STRING;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '"')
                {
                    state = STRING;
                    field = new TextField;
                    if(currentRow->isCppRow)
                        field->m_type = TextField::INC_STRING;
                    else
                        field->m_type = TextField::STRING;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '<' && currentRow->isCppRow)
                {
                    // Is it a include string?
                    bool isIncString = false;
                    TextField *lastField = currentRow->getLastNonSpaceField();
                    if(lastField)
                    {
                        if(lastField->m_text == "include")
                            isIncString = true;
                    }

                    // Add the field
                    field = new TextField;
                    field->m_text = c;
                    if(isIncString)
                    {
                        state = INC_STRING;
                        field->m_type = TextField::INC_STRING;
                    }
                    else
                    {
                        field->m_type = TextField::WORD;
                        field->m_color = Qt::white;
                    }
                    currentRow->appendField(field);
                
                }
                else if(c == '#')
                {
                    // Only spaces (or comments) before the '#' at the line?
                    bool onlySpaces = true;
                    for(int j = 0;onlySpaces == true && j < currentRow->m_fields.size();j++)
                    {
                        if(currentRow->m_fields[j]->m_type != TextField::SPACES &&
                            currentRow->m_fields[j]->m_type != TextField::COMMENT)
                        {
                            onlySpaces = false;
                        }
                    }
                    currentRow->isCppRow = onlySpaces ? true : false;

                    // Create a new field structure
                    field = new TextField;
                    if(currentRow->isCppRow)
                        field->m_type = TextField::CPP_KEYWORD;
                    else
                        field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(isSpecialChar(c))
                {
                    // A special character is a one-character field of its own.
                    field = new TextField;
                    field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '\n')
                {
                    currentRow = new Row;
                    m_rows.push_back(currentRow);
                    state = IDLE;
                }
                else
                {
                    state = WORD;
                    field = new TextField;
                    if(QChar(c).isDigit())
                        field->m_type = TextField::NUMBER;
                    else
                        field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
            };break;
            case COMMENT1:
            {
                if(c == '*')
                {
                    field->m_text += c;
                    field->m_type = TextField::COMMENT;
                    field->m_color = Qt::green;
                    multiCommentBodyStart = i + 1;
                    state = MULTI_COMMENT;
                    
                }
                else if(c == '/')
                {
                    field->m_text += c;
                    field->m_type = TextField::COMMENT;
                    field->m_color = Qt::green;
                    state = COMMENT;
                }
                else
                {
                    // A lone '/': leave it as a WORD field and re-handle c.
                    i--;
                    state = IDLE;
                }
            };break;
            case MULTI_COMMENT:
            {
                if(c == '\n')
                {
                    // The comment continues on a new row in a new field.
                    currentRow = new Row;
                    m_rows.push_back(currentRow);

                    field = new TextField;
                    field->m_type = TextField::COMMENT;
                    currentRow->appendField(field);
                    
                }
                else if(c == '/' && i > multiCommentBodyStart && text[i-1].toLatin1() == '*')
                {
                    field->m_text += c;
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                }
            };break;
            case COMMENT:
            {
                if(c == '\n')
                {
                    i--;
                    state = IDLE;
                }
                else
                    field->m_text += c;
                    
            };break;
            case SPACES:
            {
                if(c == ' ' || c == '\t')
                {
                    field->m_text += c;
                }
                else
                {
                    i--;
                    field = NULL;
                    state = IDLE;
                }  
            };break;
            // No transition in this function enters GLOBAL_INCLUDE_FILE.
            case GLOBAL_INCLUDE_FILE:
            {
                if(!isEscaped && c == '\n')
                {
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                    if(c == '>')
                    {
                        state = IDLE;
                    }
                }
            };break;
            case ESCAPED_CHAR:
            {
                field->m_text += c;
                if(!isEscaped && c == '\'')
                {
                    field = NULL;
                    state = IDLE;
                }
            };break;
            case INC_STRING:
            {
                if(!isEscaped && c == '\n')
                {
                    i--;
                    field = NULL;
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                    if(!isEscaped && c == '>')
                    {
                        field = NULL;
                        state = IDLE;
                    }
                }
            };break;
            case STRING:
            {
                field->m_text += c;
                if(!isEscaped && c == '"')
                {
                    field = NULL;
                    state = IDLE;
                }
                  
            };break;
            case WORD:
            {
                if(isSpecialChar(c) || c == ' ' || c == '\t' || c == '\n')
                {
                    // The word is complete: classify it, then re-handle c.
                    i--;
                    if(currentRow->isCppRow)
                    {
                        if(isCppKeyword(field->m_text))
                            field->m_type = TextField::CPP_KEYWORD;
                    }
                    else
                    {
                        if(isKeyword(field->m_text))
                            field->m_type = TextField::KEYWORD;
                    }
    
                    field = NULL;
                    state = IDLE;
                }
                else
                {
                    
                    field->m_text += c;
                }
                
            };break;
        }
    }

    // A word that runs to the very end of the text never meets a delimiter
    // inside the loop, so it has to be classified here.
    if(state == WORD && field)
    {
        if(currentRow->isCppRow)
        {
            if(isCppKeyword(field->m_text))
                field->m_type = TextField::CPP_KEYWORD;
        }
        else
        {
            if(isKeyword(field->m_text))
                field->m_type = TextField::KEYWORD;
        }
    }

    // Let pickColor() process every field of every row.
    for(int r = 0;r < m_rows.size();r++)
    {
        Row *row = m_rows[r];

        for(int j = 0;j < row->m_fields.size();j++)
        {
            TextField* currentField = row->m_fields[j];
            pickColor(currentField);
        }
    }
}
int CommonLanguageAnalyzer::analyze_impl(const Term& input, void* data, HookType func)
{
    parse(input.text_);

    unsigned char topAndOrBit = Term::AND;
    int tempOffset = 0;
    int lastWordOffset = -1;

    while (nextToken())
    {
        if (len_ == 0)
            continue;

        if (bRemoveStopwords_ && isStopword())
            continue;

/*            {
            UString foo(token_, len_); string bar; foo.convertString(bar, UString::UTF_8);
            cout << "(" << bar << ") --<> " << isIndex_ << "," << offset_ << "," << isRaw_ << "," << level_ << endl;
            }*/

        if (bChinese_)
        {
            int curWordOffset = offset_;
            if (curWordOffset == lastWordOffset)
                topAndOrBit = Term::OR;
            else
                topAndOrBit = Term::AND;
            lastWordOffset = curWordOffset;
        }

        if (isIndex_)
        {
            if (isSpecialChar())
            {
                func(data, token_, len_, offset_, Term::SpecialCharPOS, Term::AND, level_, true);
                tempOffset = offset_;
                continue;
            }
            if (isRaw_)
            {
                func(data, token_, len_, offset_, pos_, Term::OR, level_, false);
                tempOffset = offset_;
                continue;
            }

            // foreign language, e.g. English
            if (isAlpha())
            {
                UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_;
                bool lowercaseIsDifferent = UString::toLowerString(token_, len_,
                                            lowercase_ustring_buffer_, term_ustring_buffer_limit_);

                char* lowercaseTerm = lowercase_string_buffer_;
                UString::convertString(UString::UTF_8, lowercaseTermUstr, len_, lowercase_string_buffer_, term_string_buffer_limit_);

                UString::CharT* stemmingTermUstr = NULL;
                size_t stemmingTermUstrSize = 0;

                UString::CharT * synonymResultUstr = NULL;
                size_t synonymResultUstrLen = 0;

                if (bExtractEngStem_)
                {
                    /// TODO: write a UCS2 based stemmer
                    string stem_term;
                    pStemmer_->stem(lowercaseTerm, stem_term);
                    if (strcmp(stem_term.c_str(), lowercaseTerm))
                    {
                        stemmingTermUstr = stemming_ustring_buffer_;
                        stemmingTermUstrSize = UString::toUcs2(UString::UTF_8,
                                stem_term.c_str(), stem_term.size(), stemming_ustring_buffer_, term_ustring_buffer_limit_);
                    }
                }

//              if (false /*bExtractSynonym_, preprocessed*/)
//              {
//                  pSynonymContainer_ = uscSPtr_->getSynonymContainer();
//                  pSynonymContainer_->searchNgetSynonym(lowercaseTerm, pSynonymResult_);
//                  char * synonymResult = pSynonymResult_->getHeadWord(0);
//                  if (synonymResult)
//                  {
//                      size_t synonymResultLen = strlen(synonymResult);
//                      if (synonymResultLen <= term_ustring_buffer_limit_)
//                      {
//                          synonymResultUstr = synonym_ustring_buffer_;
//                          synonymResultUstrLen = UString::toUcs2(synonymEncode_,
//                                  synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
//                      }
//                  }
//              }

                if (stemmingTermUstr || synonymResultUstr || (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent))
                {
                    /// have more than one output
                    if (bCaseSensitive_)
                    {
                        func(data,  token_, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (stemmingTermUstr)
                    {
                        func(data, stemmingTermUstr, stemmingTermUstrSize, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (synonymResultUstr)
                    {
                        func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                    if (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent)
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false);
                        tempOffset = offset_;
                    }
                }
                else
                {
                    /// have only one output
                    if (bCaseSensitive_)
                    {
                        func(data,  token_, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                    else
                    {
                        func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::AND, level_, false);
                        tempOffset = offset_;
                    }
                }
            }
            else
            {
//              if (false /*bExtractSynonym_, preprocessed*/)
//              {
//                  UString::CharT * synonymResultUstr = NULL;
//                  size_t synonymResultUstrLen = 0;

//                  pSynonymContainer_ = uscSPtr_->getSynonymContainer();
//                  pSynonymContainer_->searchNgetSynonym(nativeToken_, pSynonymResult_);

//                  bool hasSynonym = false;
//                  for (int i =0; i<pSynonymResult_->getSynonymCount(0); i++)
//                  {
//                      char * synonymResult = pSynonymResult_->getWord(0, i);
//                      if (synonymResult)
//                      {
//                          if (strcmp(nativeToken_, synonymResult) == 0)
//                          {
//                              //cout << "synonym self: "<<string(synonymResult) <<endl;
//                              continue;
//                          }
//                          //cout << "synonym : "<<string(synonymResult) <<endl;

//                          size_t synonymResultLen = strlen(synonymResult);
//                          if (synonymResultLen <= term_ustring_buffer_limit_)
//                          {
//                              synonymResultUstr = synonym_ustring_buffer_;
//                              synonymResultUstrLen = UString::toUcs2(synonymEncode_,
//                                      synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);
//                          }

//                          hasSynonym = true;
//                          func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false);
//                      }
//                  }

//                  if (hasSynonym)
//                  {
//                      func(data, token_, len_, offset_, pos_, Term::OR, level_+1, false);
//                  }
//                  else
//                  {
//                      func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
//                  }
//              }
//              else
                {
                    func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false);
                    tempOffset = offset_;
                }
            }
        }
    }
    return tempOffset + 1;
}