/**
 * Generates the runtime TDB (ComTdbFastExtract) for this Fast Extract operator.
 *
 * Steps, in order:
 *   1. read the queue and buffer sizing defaults,
 *   2. generate code for child(0) and pick up its TDB and explain tuple,
 *   3. copy the target / Hive / formatting strings into the generator space,
 *   4. build the TDB through ft_codegen(),
 *   5. attach explain info (unless disabled) and set the option flags on the TDB.
 *
 * @param generator  code generator supplying the space, CRI descriptors and
 *                   explain state.
 * @return the value returned by ft_codegen().
 */
short PhysicalFastExtract::codeGen(Generator *generator)
{
  short result = 0;
  Space *space = generator->getSpace();
  CmpContext *cmpContext = generator->currentCmpContext();

  const ULng32 downQueueMaxSize = getDefault(GEN_FE_SIZE_DOWN);
  const ULng32 upQueueMaxSize = getDefault(GEN_FE_SIZE_UP);

  // One default drives the output, request and reply buffer sizes.
  const ULng32 defaultBufferSize = getDefault(GEN_FE_BUFFER_SIZE);
  const ULng32 outputBufferSize = defaultBufferSize;
  const ULng32 requestBufferSize = defaultBufferSize;
  const ULng32 replyBufferSize = defaultBufferSize;
  const ULng32 numOutputBuffers = getDefault(GEN_FE_NUM_BUFFERS);

  // used in runtime stats
  Cardinality estimatedRowCount = (Cardinality)
    (getInputCardinality() * getEstRowsUsed()).getValue();

  ex_cri_desc *givenDesc = generator->getCriDesc(Generator::DOWN);

  // Generate the child first; its TDB and explain tuple are read back from
  // the generator. (No placeholder ComTdb is allocated any more: the old one
  // was overwritten right here and only consumed generator space.)
  generator->setCriDesc(givenDesc, Generator::DOWN);
  child(0)->codeGen(generator);
  ComTdb *childTdb = (ComTdb *)(generator->getGenObj());
  ExplainTuple *firstExplainTuple = generator->getExplainTuple();

  ComTdbFastExtract *newTdb = NULL;
  Int32 hdfsPortNum = getHdfsPort();

  // Outside of Hive inserts, a delimiter recognised by isSpecialChar() is
  // replaced by a one-character string built from specChar (which
  // isSpecialChar() is expected to fill in).
  char *newDelimiter = (char *)getDelimiter().data();
  char specChar = '0';
  if (!isHiveInsert() && isSpecialChar(newDelimiter, specChar))
  {
    newDelimiter = new (cmpContext->statementHeap()) char[2];
    newDelimiter[0] = specChar;
    newDelimiter[1] = '\0';
  }

  // Same treatment for the record separator.
  char *newRecordSep = (char *)getRecordSeparator().data();
  specChar = '0';
  if (!isHiveInsert() && isSpecialChar(newRecordSep, specChar))
  {
    newRecordSep = new (cmpContext->statementHeap()) char[2];
    newRecordSep[0] = specChar;
    newRecordSep[1] = '\0';
  }

  // Copy every string the TDB refers to into the generator space.
  char *targetName      = AllocStringInSpace(*space, (char *)getTargetName().data());
  char *hdfsHostName    = AllocStringInSpace(*space, (char *)getHdfsHostName().data());
  char *hiveTableName   = AllocStringInSpace(*space, (char *)getHiveTableName().data());
  char *delimiter       = AllocStringInSpace(*space, newDelimiter);
  char *header          = AllocStringInSpace(*space, (char *)getHeader().data());
  char *nullString      = AllocStringInSpace(*space, (char *)getNullString().data());
  char *recordSeparator = AllocStringInSpace(*space, newRecordSep);

  result = ft_codegen(generator,
                      *this,              // RelExpr &relExpr
                      newTdb,             // ComTdbFastExtract *&newTdb (output)
                      estimatedRowCount,
                      targetName,
                      hdfsHostName,
                      hdfsPortNum,
                      hiveTableName,
                      delimiter,
                      header,
                      nullString,
                      recordSeparator,
                      downQueueMaxSize,
                      upQueueMaxSize,
                      outputBufferSize,
                      requestBufferSize,
                      replyBufferSize,
                      numOutputBuffers,
                      childTdb,
                      isSequenceFile());

  // Everything below dereferences newTdb, so assert that it was produced.
  GenAssert(newTdb != NULL, "Fast Extract TDB was not generated");

  if (!generator->explainDisabled())
  {
    generator->setExplainTuple(addExplainInfo(newTdb, firstExplainTuple, 0, generator));
  }

  if (getTargetType() == FILE)
    newTdb->setTargetFile(1);
  else if (getTargetType() == SOCKET)
    newTdb->setTargetSocket(1);
  else
    GenAssert(0, "Unexpected Fast Extract target type");

  if (isAppend())
    newTdb->setIsAppend(1);

  if (isHiveInsert())
  {
    // Hive inserts never carry a header row.
    newTdb->setIsHiveInsert(1);
    newTdb->setIncludeHeader(0);
    // NOTE(review): this hands the operator's own overwrite flag back to the
    // operator and never reaches newTdb; presumably the TDB should receive
    // it -- confirm against ComTdbFastExtract's interface.
    setOverwriteHiveTable(getOverwriteHiveTable());
  }
  else if (includeHeader())
  {
    newTdb->setIncludeHeader(1);
  }

  if (getCompressionType() != NONE)
  {
    // LZO is the only compression type handled here.
    if (getCompressionType() == LZO)
      newTdb->setCompressLZO(1);
    else
      GenAssert(0, "Unexpected Fast Extract compression type");
  }

  if ((ActiveSchemaDB()->getDefaults()).getToken(FAST_EXTRACT_DIAGS) == DF_ON)
    newTdb->setPrintDiags(1);

  return result;
}
/*
 * IdentifyInputToken(char *caCleanInputTokens[], namerecord_t *record_table)
 *
 * Walks the m_nCleanInputTokens cleaned input tokens and appends one record to
 * record_table for every token it recognises. Categories are tried in this
 * order, and the first match wins:
 *   1. reserved word longer than one character -> lexProc, symbol taken from m_naWsym
 *   2. number                                  -> lexConstant, symbol 3
 *   3. variable                                -> lexVar, symbol 2
 *   4. single-character special symbol         -> lexProc; when the next token
 *      completes a legal symbol pair (>=, <=, <>, :=) both tokens are merged
 *      into one record, otherwise the symbol comes from m_naSpecialSymbols
 *
 * A token that matches none of these is skipped without a record; this
 * function itself prints no error and does not exit.
 * NOTE(review): rejecting illegal lexemes is presumably done inside the
 * validation helpers (stringIsNumber / isValidVariableAndNotReserved) -- confirm.
 */
void IdentifyInputToken(char *caCleanInputTokens[], namerecord_t *record_table){
    int i = 0;
    int LexRecordIndex = 0; // next free slot in the lexeme record table

    for (i = 0; i < m_nCleanInputTokens; i++) {
        char *str = caCleanInputTokens[i];
        size_t tknlen = strlen(str);
        int rw = 0;
        int ss = 0;

        // typedef enum {lexConstant = 1, lexVar, lexProc} eLexemeKind;
        // [integer = constant = 1], [variable = var = 2], [reserved word = proc = 3]

        // Reserved word: isReserverdWord() returns a 1-based index into m_naWsym.
        if ( ( rw = isReserverdWord(str) ) && tknlen > 1 ) {
            insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 0, ' ', ' ', str, m_naWsym[(rw - 1)]);
            continue;
        }

        // Number: length/validity problems are left to stringIsNumber().
        if (stringIsNumber(str)) {
            insertNamerecord_table(LexRecordIndex++, record_table, lexConstant, 0, ' ', ' ', str, 3);
            continue;
        }

        // Variable: length/validity problems are left to the helper.
        if (isValidVariableAndNotReserved(str)) {
            insertNamerecord_table(LexRecordIndex++, record_table, lexVar, 0, ' ', ' ', str, 2);
            continue;
        }

        // Special symbol: isSpecialChar() returns a 1-based index into m_naSpecialSymbols.
        if ( ( ss = isSpecialChar(str[0]) ) && tknlen == 1 ) {
            // Look ahead (without consuming) to see whether the next token
            // completes a two-character symbol.
            if (i < m_nCleanInputTokens - 1) {
                char *strNext = caCleanInputTokens[i + 1];
                size_t tknLenNext = strlen(strNext);

                if (tknLenNext == 1) {
                    int ssNext = isSpecialChar(strNext[0]);
                    int vs = 0;

                    // Only codes 7 and 11 may follow as the second half of a pair.
                    // NOTE(review): presumably these are the codes of '=' and '>' -- confirm
                    // against isSpecialChar().
                    if ( (ssNext == 7 || ssNext == 11)
                         && (vs = validSymbolPair(str[0], strNext[0])) ) {
                        i++; // the look-ahead token is consumed by the pair
                        insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 1, str[0], strNext[0], " ", vs);
                        continue;
                    }
                }
            }

            // Stand-alone symbol.
            insertNamerecord_table(LexRecordIndex++, record_table, lexProc, 0, ' ', ' ', str, m_naSpecialSymbols[(ss - 1)]);
            continue;
        }
    }
}
/**
 * Splits @a text into rows of typed text fields and assigns a colour to each.
 *
 * The text is scanned one Latin-1 character at a time by a small state
 * machine. Every '\n' seen in the IDLE or MULTI_COMMENT state starts a new
 * Row in m_rows. Several states hand the current character back to IDLE by
 * decrementing the loop index so that it is examined again. After the scan,
 * pickColor() is called on every field of every row.
 *
 * Differences from the earlier revision of this function:
 *  - a word that runs up to the end of the text is now checked against the
 *    keyword tables (previously this only happened when a delimiter followed);
 *  - the '*' that opens a block comment no longer counts as the '*' of the
 *    closing sequence, so "/" "*" "/" alone does not form a complete comment;
 *  - a character counts as escaped when it is preceded by an odd number of
 *    consecutive backslashes (previously only the two preceding characters
 *    were inspected).
 *
 * @param text  the source text to colorize.
 */
void SyntaxHighlighter::colorize(QString text)
{
    Row *currentRow;
    TextField *field = NULL;
    // GLOBAL_INCLUDE_FILE is kept for completeness; no transition in this
    // function enters it.
    enum {IDLE, MULTI_COMMENT, SPACES, WORD, GLOBAL_INCLUDE_FILE,
          COMMENT1, COMMENT, STRING, ESCAPED_CHAR, INC_STRING} state = IDLE;
    char c = '\n';
    bool isEscaped = false;
    int backslashRun = 0;          // consecutive '\\' ending at the newest index seen
    int newestIndexSeen = -1;      // guards the bookkeeping against re-examined characters
    bool starSeenInComment = false; // previous block-comment character was a '*'

    reset();

    currentRow = new Row;
    m_rows.push_back(currentRow);

    for(int i = 0;i < text.size();i++)
    {
        c = text[i].toLatin1();

        // Update the escape bookkeeping only the first time an index is
        // visited; a re-examined character keeps the value computed for it.
        if(i > newestIndexSeen)
        {
            isEscaped = (backslashRun % 2) == 1;
            backslashRun = (c == '\\') ? backslashRun + 1 : 0;
            newestIndexSeen = i;
        }

        switch(state)
        {
            case IDLE:
            {
                if(c == '/')
                {
                    // Possibly the start of a comment; decided in COMMENT1.
                    state = COMMENT1;
                    field = new TextField;
                    field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == ' ' || c == '\t')
                {
                    state = SPACES;
                    field = new TextField;
                    field->m_type = TextField::SPACES;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '\'')
                {
                    state = ESCAPED_CHAR;
                    field = new TextField;
                    field->m_type = TextField::STRING;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '"')
                {
                    state = STRING;
                    field = new TextField;
                    if(currentRow->isCppRow)
                        field->m_type = TextField::INC_STRING;
                    else
                        field->m_type = TextField::STRING;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '<' && currentRow->isCppRow)
                {
                    // Is it a include string?
                    bool isIncString = false;
                    TextField *lastField = currentRow->getLastNonSpaceField();
                    if(lastField)
                    {
                        if(lastField->m_text == "include")
                            isIncString = true;
                    }

                    // Add the field
                    field = new TextField;
                    field->m_text = c;
                    if(isIncString)
                    {
                        state = INC_STRING;
                        field->m_type = TextField::INC_STRING;
                    }
                    else
                    {
                        field->m_type = TextField::WORD;
                        field->m_color = Qt::white;
                    }
                    currentRow->appendField(field);
                }
                else if(c == '#')
                {
                    // Only spaces (or comments) before the '#' at the line?
                    bool onlySpaces = true;
                    for(int j = 0;onlySpaces == true && j < currentRow->m_fields.size();j++)
                    {
                        if(currentRow->m_fields[j]->m_type != TextField::SPACES &&
                           currentRow->m_fields[j]->m_type != TextField::COMMENT)
                        {
                            onlySpaces = false;
                        }
                    }
                    currentRow->isCppRow = onlySpaces;

                    // Create a new field structure
                    field = new TextField;
                    if(currentRow->isCppRow)
                        field->m_type = TextField::CPP_KEYWORD;
                    else
                        field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(isSpecialChar(c))
                {
                    // Each special character becomes a field of its own.
                    field = new TextField;
                    field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
                else if(c == '\n')
                {
                    currentRow = new Row;
                    m_rows.push_back(currentRow);
                }
                else
                {
                    state = WORD;
                    field = new TextField;
                    if(QChar(c).isDigit())
                        field->m_type = TextField::NUMBER;
                    else
                        field->m_type = TextField::WORD;
                    field->m_color = Qt::white;
                    currentRow->appendField(field);
                    field->m_text = c;
                }
            };break;

            case COMMENT1:
            {
                if(c == '*')
                {
                    field->m_text += c;
                    field->m_type = TextField::COMMENT;
                    field->m_color = Qt::green;
                    state = MULTI_COMMENT;
                    // The opening '*' must not double as the closing one.
                    starSeenInComment = false;
                }
                else if(c == '/')
                {
                    field->m_text += c;
                    field->m_type = TextField::COMMENT;
                    field->m_color = Qt::green;
                    state = COMMENT;
                }
                else
                {
                    // Just a lone '/': re-examine this character in IDLE.
                    i--;
                    state = IDLE;
                }
            };break;

            case MULTI_COMMENT:
            {
                if(c == '\n')
                {
                    // The comment continues in a fresh field on a new row.
                    currentRow = new Row;
                    m_rows.push_back(currentRow);

                    field = new TextField;
                    field->m_type = TextField::COMMENT;
                    currentRow->appendField(field);
                    starSeenInComment = false;
                }
                else if(starSeenInComment && c == '/')
                {
                    field->m_text += c;
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                    starSeenInComment = (c == '*');
                }
            };break;

            case COMMENT:
            {
                if(c == '\n')
                {
                    // Let IDLE start the new row.
                    i--;
                    state = IDLE;
                }
                else
                    field->m_text += c;
            };break;

            case SPACES:
            {
                if(c == ' ' || c == '\t')
                {
                    field->m_text += c;
                }
                else
                {
                    i--;
                    field = NULL;
                    state = IDLE;
                }
            };break;

            case GLOBAL_INCLUDE_FILE:
            {
                if(!isEscaped && c == '\n')
                {
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                    if(c == '>')
                    {
                        state = IDLE;
                    }
                }
            };break;

            case ESCAPED_CHAR:
            {
                field->m_text += c;
                if(!isEscaped && c == '\'')
                {
                    field = NULL;
                    state = IDLE;
                }
            };break;

            case INC_STRING:
            {
                if(!isEscaped && c == '\n')
                {
                    i--;
                    field = NULL;
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                    if(!isEscaped && c == '>')
                    {
                        field = NULL;
                        state = IDLE;
                    }
                }
            };break;

            case STRING:
            {
                field->m_text += c;
                if(!isEscaped && c == '"')
                {
                    field = NULL;
                    state = IDLE;
                }
            };break;

            case WORD:
            {
                if(isSpecialChar(c) || c == ' ' || c == '\t' || c == '\n')
                {
                    // The word is complete: classify it, then re-examine
                    // the delimiter in IDLE.
                    i--;

                    if(currentRow->isCppRow)
                    {
                        if(isCppKeyword(field->m_text))
                            field->m_type = TextField::CPP_KEYWORD;
                    }
                    else
                    {
                        if(isKeyword(field->m_text))
                            field->m_type = TextField::KEYWORD;
                    }

                    field = NULL;
                    state = IDLE;
                }
                else
                {
                    field->m_text += c;
                }
            };break;
        }
    }

    // A word that reaches the end of the text has no trailing delimiter to
    // trigger its classification, so do it here.
    if(state == WORD && field != NULL)
    {
        if(currentRow->isCppRow)
        {
            if(isCppKeyword(field->m_text))
                field->m_type = TextField::CPP_KEYWORD;
        }
        else
        {
            if(isKeyword(field->m_text))
                field->m_type = TextField::KEYWORD;
        }
    }

    // Colour every field now that all types are final.
    for(int r = 0;r < m_rows.size();r++)
    {
        Row *row = m_rows[r];

        for(int j = 0;j < row->m_fields.size();j++)
        {
            pickColor(row->m_fields[j]);
        }
    }
}
int CommonLanguageAnalyzer::analyze_impl(const Term& input, void* data, HookType func) { parse(input.text_); unsigned char topAndOrBit = Term::AND; int tempOffset = 0; int lastWordOffset = -1; while (nextToken()) { if (len_ == 0) continue; if (bRemoveStopwords_ && isStopword()) continue; /* { UString foo(token_, len_); string bar; foo.convertString(bar, UString::UTF_8); cout << "(" << bar << ") --<> " << isIndex_ << "," << offset_ << "," << isRaw_ << "," << level_ << endl; }*/ if (bChinese_) { int curWordOffset = offset_; if (curWordOffset == lastWordOffset) topAndOrBit = Term::OR; else topAndOrBit = Term::AND; lastWordOffset = curWordOffset; } if (isIndex_) { if (isSpecialChar()) { func(data, token_, len_, offset_, Term::SpecialCharPOS, Term::AND, level_, true); tempOffset = offset_; continue; } if (isRaw_) { func(data, token_, len_, offset_, pos_, Term::OR, level_, false); tempOffset = offset_; continue; } // foreign language, e.g. English if (isAlpha()) { UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_; bool lowercaseIsDifferent = UString::toLowerString(token_, len_, lowercase_ustring_buffer_, term_ustring_buffer_limit_); char* lowercaseTerm = lowercase_string_buffer_; UString::convertString(UString::UTF_8, lowercaseTermUstr, len_, lowercase_string_buffer_, term_string_buffer_limit_); UString::CharT* stemmingTermUstr = NULL; size_t stemmingTermUstrSize = 0; UString::CharT * synonymResultUstr = NULL; size_t synonymResultUstrLen = 0; if (bExtractEngStem_) { /// TODO: write a UCS2 based stemmer string stem_term; pStemmer_->stem(lowercaseTerm, stem_term); if (strcmp(stem_term.c_str(), lowercaseTerm)) { stemmingTermUstr = stemming_ustring_buffer_; stemmingTermUstrSize = UString::toUcs2(UString::UTF_8, stem_term.c_str(), stem_term.size(), stemming_ustring_buffer_, term_ustring_buffer_limit_); } } // if (false /*bExtractSynonym_, preprocessed*/) // { // pSynonymContainer_ = uscSPtr_->getSynonymContainer(); // 
pSynonymContainer_->searchNgetSynonym(lowercaseTerm, pSynonymResult_); // char * synonymResult = pSynonymResult_->getHeadWord(0); // if (synonymResult) // { // size_t synonymResultLen = strlen(synonymResult); // if (synonymResultLen <= term_ustring_buffer_limit_) // { // synonymResultUstr = synonym_ustring_buffer_; // synonymResultUstrLen = UString::toUcs2(synonymEncode_, // synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_); // } // } // } if (stemmingTermUstr || synonymResultUstr || (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent)) { /// have more than one output if (bCaseSensitive_) { func(data, token_, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false); tempOffset = offset_; } else { func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false); tempOffset = offset_; } if (stemmingTermUstr) { func(data, stemmingTermUstr, stemmingTermUstrSize, offset_, Term::EnglishPOS, Term::OR, level_+1, false); tempOffset = offset_; } if (synonymResultUstr) { func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false); tempOffset = offset_; } if (bCaseSensitive_ && bContainLower_ && lowercaseIsDifferent) { func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::OR, level_+1, false); tempOffset = offset_; } } else { /// have only one output if (bCaseSensitive_) { func(data, token_, len_, offset_, Term::EnglishPOS, Term::AND, level_, false); tempOffset = offset_; } else { func(data, lowercaseTermUstr, len_, offset_, Term::EnglishPOS, Term::AND, level_, false); tempOffset = offset_; } } } else { // if (false /*bExtractSynonym_, preprocessed*/) // { // UString::CharT * synonymResultUstr = NULL; // size_t synonymResultUstrLen = 0; // pSynonymContainer_ = uscSPtr_->getSynonymContainer(); // pSynonymContainer_->searchNgetSynonym(nativeToken_, pSynonymResult_); // bool hasSynonym = false; // for (int i =0; i<pSynonymResult_->getSynonymCount(0); i++) // { 
// char * synonymResult = pSynonymResult_->getWord(0, i); // if (synonymResult) // { // if (strcmp(nativeToken_, synonymResult) == 0) // { // //cout << "synonym self: "<<string(synonymResult) <<endl; // continue; // } // //cout << "synonym : "<<string(synonymResult) <<endl; // size_t synonymResultLen = strlen(synonymResult); // if (synonymResultLen <= term_ustring_buffer_limit_) // { // synonymResultUstr = synonym_ustring_buffer_; // synonymResultUstrLen = UString::toUcs2(synonymEncode_, // synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_); // } // hasSynonym = true; // func(data, synonymResultUstr, synonymResultUstrLen, offset_, NULL, Term::OR, level_+1, false); // } // } // if (hasSynonym) // { // func(data, token_, len_, offset_, pos_, Term::OR, level_+1, false); // } // else // { // func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false); // } // } // else { func(data, token_, len_, offset_, pos_, topAndOrBit, level_, false); tempOffset = offset_; } } } } return tempOffset + 1; }