/* Configure the set of cell delimiter characters.
**
** delims: NUL-terminated string; each character becomes a delimiter.
** Also precomputes m_lastDelimWasSpace so the reader starts in the right
** state when space/tab are themselves delimiters.
*/
void GLECSVData::setDelims(const char* delims) {
	// Clear the per-byte delimiter lookup table.
	unsigned int size = 256;
	for (unsigned int i = 0; i < size; i++) {
		m_delims[i] = false;
	}
	// Mark each requested character. BUGFIX: cast through unsigned char --
	// the previous (int) cast produced a *negative* array index for bytes
	// >= 0x80 on platforms where plain char is signed.
	int pos = 0;
	while (delims[pos] != 0) {
		m_delims[(unsigned char)delims[pos]] = true;
		pos++;
	}
	m_lastDelimWasSpace = isDelim(' ') || isDelim('\t');
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to simpleOpen().
**
** Returns SQLITE_OK with the OUT parameters filled in for each token,
** SQLITE_DONE when the input is exhausted, or SQLITE_NOMEM if the token
** buffer cannot be grown.
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        /* BUGFIX: realloc through a temporary so the existing buffer is
        ** neither leaked nor lost on OOM (the old code assigned the
        ** result straight to c->pToken). This also matches the pattern
        ** already used by porterNext() in this file. */
        void *pNew;
        c->nTokenAllocated = n+20;
        pNew = realloc(c->pToken, c->nTokenAllocated);
        if( pNew==NULL ) return SQLITE_NOMEM;
        c->pToken = pNew;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity. */
        unsigned char ch = p[iStartOffset+i];
        /* Fold ASCII to lower case; pass non-ASCII bytes through. */
        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
/* Lex an integer (with optional sign) from 'text'.
**
** On success stores the consumed length in *len and returns tok_integer.
** A lone '+' or '-' (no digits) is returned as tok_add / tok_sub.
** If the number is not followed by a delimiter, returns tok_error.
*/
token_type getIntToken(const char *text, int *len) {
  const int sign = (text[0] == '+' || text[0] == '-') ? 1 : 0;
  const int int_len = readInt(text + sign);

  /* The numeric run must be terminated by a delimiter. */
  if (!isDelim(text[sign + int_len])) {
    return tok_error;
  }

  /* A sign with no digits is actually an operator token. */
  if (sign == 1 && int_len == 0) {
    *len = 1;
    return (text[0] == '+') ? tok_add : tok_sub;
  }

  *len = sign + int_len;
  return tok_integer;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to porterOpen().
**
** Each returned token is run through the Porter stemming algorithm.
** Returns SQLITE_OK per token, SQLITE_DONE at end of input, or
** SQLITE_NOMEM if the token buffer cannot be grown.
*/
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  const char *z = c->zInput;

  while( c->iOffset<c->nInput ){
    int iStartOffset, ch;  /* NOTE: 'ch' is declared but unused here */

    /* Scan past delimiter characters */
    while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int n = c->iOffset-iStartOffset;
      if( n>c->nAllocated ){
        /* Grow via a temporary so the old buffer survives an OOM. */
        char *pNew;
        c->nAllocated = n+20;
        pNew = sqlite3_realloc(c->zToken, c->nAllocated);
        if( !pNew ) return SQLITE_NOMEM;
        c->zToken = pNew;
      }
      /* Stem the word into the output buffer; the stemmer fills *pnBytes
      ** with the (possibly shorter) stemmed length. */
      porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      *pzToken = c->zToken;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
// Return the first token starting at currentPosition, advancing
// currentPosition past the token (and past the terminating delimiter,
// when one is present).
//
// NOTE(review): the scan stops on a '\0' read through the iterator, i.e.
// it assumes the backing storage is NUL-terminated. Dereferencing
// std::string::end() is formally undefined behaviour -- confirm the
// container used here guarantees a readable terminator.
std::string Tokenizer::firstToken(void) {
	thisToken.erase();
	skipDelim();
	std::string::iterator iter = currentPosition;
	for ( ; *iter && !isDelim(*iter); iter++ ) {
		thisToken += *iter;
	}
	// FIX: compare the character against '\0' instead of the pointer
	// constant NULL, and use an if/else rather than a statement ternary.
	if (*iter == '\0') {
		currentPosition = iter;       // end of input: stay put
	} else {
		currentPosition = ++iter;     // skip the delimiter we stopped on
	}
	return thisToken;
}
// Advance to the next token in the buffer. The token is made available in
// mToken (NUL-terminated in place); returns false once input is exhausted.
bool Tokenizer::next(void) {
	mToken = mNextToken;
	if (!mToken || (*mToken == '\0')) {
		return false;
	}
	// Skip leading delimiters. BUGFIX: also stop at the terminator --
	// previously a tail consisting only of delimiter characters made this
	// loop (and the scan below) walk past the end of the buffer.
	while (*mToken != '\0' && isDelim(*mToken)) {
		mToken++;
	}
	if (*mToken == '\0') {
		mNextToken = mToken;
		return false;
	}
	// Find the end of the token. BUGFIX: guard against '\0' here too so a
	// token that ends the buffer does not scan beyond it.
	char *tokenend = mToken + 1;
	while (*tokenend != '\0' && !isDelim(*tokenend)) {
		tokenend++;
	}
	if (*tokenend == '\0') {
		mNextToken = tokenend;           // nothing after this token
	} else {
		*tokenend = '\0';                // terminate the token in place
		mNextToken = tokenend + 1;       // resume after the delimiter
	}
	return true;
}
void Invocation::getNoWarnings() { int kk = cmdParams.size(); for (int i = 0; i < kk; ++i) if (cmdParams[i] == "-nowarnings") { showWarnings = false; return; } string s1, s2, s; int x, ls; ifstream * in = new ifstream(params->getCFGFileName()); if (in->fail()) { in->close(); delete in; return; } while (getline(*in, s)) { trim(s); if (s[0] == '[' || s[0] == ';') continue; ls = s.length(); x = -1; for (int i = 0; i < ls; ++i) if (isDelim(s[i]) || s[i] == '=') { x = i; break; } if (x == -1) continue; s1 = lowercase(s.substr(0, x)); if (s1[0] != '-') s1 = "-" + s1; while (x < ls && s[x] != '=') ++x; if (x++ >= ls) continue; while (x < ls && isDelim(s[x])) ++x; if (x >= ls) continue; s2 = lowercase(s.substr(x)); if (s1 == "-nowarnings" && s2 == "true") { showWarnings = false; break; } } in->close(); delete in; }
// Consume a run of space characters starting at 'ch' and classify the
// first non-space character encountered: end-of-file, end-of-line, a
// delimiter (recorded in m_lastDelimWasSpace), or an ordinary character
// (which is pushed back for the next read).
GLECSVDataStatus GLECSVData::skipSpacesAndFirstDelim(GLEBYTE ch) {
	for (;;) {
		if (isSpace(ch)) {
			// Still inside the run of spaces -- keep reading.
			ch = readChar();
			continue;
		}
		if (ch == 0) {
			return GLECSVDataStatusEOF;
		}
		if (isEol(ch)) {
			return readNewline(ch);
		}
		if (isDelim(ch)) {
			m_lastDelimWasSpace = isSpace(ch);
			return GLECSVDataStatusOK;
		}
		// Ordinary character: it belongs to the next cell, so un-read it.
		goBack();
		return GLECSVDataStatusOK;
	}
}
// Return the next token from mInput, or nullptr once the input is empty.
// A token is a maximal run of characters of one class (delimiter
// characters vs ordinary characters); leading whitespace is skipped and
// whitespace or a class change ends the token. The token is copied into
// mBuff and mInput is advanced past the consumed characters.
const char *StatTokeniser::nextToken() {
	if (mInput[0] == '\0') {
		return nullptr;
	}
	int pos = 0;
	int outPos = 0;
	int tokenClass = 0;  // 0 = none yet, 1 = ordinary, 2 = delimiter
	for (char ch = mInput[0]; ch != '\0'; ch = mInput[++pos]) {
		if (IS_WHITE(ch)) {
			// Whitespace terminates a token in progress, else is skipped.
			if (tokenClass != 0) {
				break;
			}
			continue;
		}
		const int chClass = isDelim(ch) ? 2 : 1;
		if (tokenClass == 0) {
			tokenClass = chClass;          // first character fixes the class
		} else if (tokenClass != chClass) {
			break;                         // class change ends the token
		}
		mBuff[outPos++] = ch;
	}
	mBuff[outPos] = '\0';
	mInput += pos;
	return mBuff;
}
// Tokenize the stream line by line, appending tokens to 'out' and an
// ENDLINE token after each line.
//
// BUGFIX: loop on the result of std::getline itself. The old form
// `while(!in.fail() && !in.eof())` tested the stream *before* reading, so
// the final, failed getline was still processed as an empty line and a
// spurious trailing ENDLINE token was emitted.
void tokenize(std::istream &in, std::vector<token> &out) {
	std::string line;
	while (std::getline(in, line, '\n')) {
		std::string word;
		for (char c : line) {
			if (isDelim(c)) {
				// "attributes" aborts the rest of the line entirely;
				// "align" is dropped but scanning stops as well.
				if (word == "attributes") break;
				if (word == "align") { word.clear(); break; }
				check_for_token(word, out);
				check_for_delim(c, out);
			} else if (c != '\n') {
				word.push_back(c);
			}
		}
		check_for_token(word, out);  // flush a word ending at EOL
		out.push_back({ENDLINE, "\n"});
	}
}
// Read one (unquoted) cell from the CSV input.
//
// Quoted cells are delegated to readCellString(). Otherwise characters
// are consumed until EOF, end-of-line, a delimiter, or a comment start;
// in each case the cell collected so far is emitted (subject to the size
// checks) and the appropriate status is returned.
GLECSVDataStatus GLECSVData::readCell() {
	GLEBYTE ch = readSignificantChar();
	if (ch == '"' || ch == '\'') {
		// Quoted cell: handled separately (quote char passed through).
		return readCellString(ch);
	}
	unsigned int cellCount = 0;   // chars consumed, including trailing spaces
	unsigned int cellSize = 0;    // cell length excluding trailing spaces
	unsigned int cellPos = lastCharPos();
	while (true) {
		if (ch == 0) {
			// End of input: flush the pending cell if the size check allows.
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return GLECSVDataStatusEOF;
		} else if (isEol(ch)) {
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return readNewline(ch);
		} else if (isDelim(ch)) {
			// Remember whether the delimiter itself was whitespace.
			m_lastDelimWasSpace = isSpace(ch);
			if (isSizeCheckOKAtDelim(ch, cellSize)) {
				createCell(cellSize, cellPos);
			}
			return skipSpacesAndFirstDelim(ch);
		} else if (isComment(ch)) {
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return skipTillEol();
		}
		cellCount++;
		// Only extend the cell size on non-space characters, so trailing
		// spaces are excluded from the emitted cell.
		if (!isSpace(ch)) {
			cellSize = cellCount;
		}
		ch = readChar();
	}
	return GLECSVDataStatusOK;   // not reached; loop exits via returns above
}
// Scan the next token from 'expression' into 'token', classifying it in
// 'type' (VT_DEL operator, VT_NUM number, VT_VAR identifier, VT_DEF
// definition). Leading and trailing whitespace is consumed; the pointer
// 'expression' is left just past the token.
void SRExpressionsParser::parse(void)
{
	type = VT_UNKNOWN;
	token.reset();
	while(isspace(*expression)) expression++;
	if(isDelim(*expression))
	{
		// parse for valid ops -- single-char operator, possibly extended
		// to a two-char form (||, **, &&, !=, ==, <=, <<, >=, >>).
		type = VT_DEL;
		char op=*expression;
		token.add(*expression++);
		switch(op)
		{
		case '|':
			if (*expression == '|') token.add(*expression++);
			break;
		case '*':
			if (*expression == '*') token.add(*expression++);// power...
			break;
		case '&':
			if (*expression == '&') token.add(*expression++);
			break;
		case '!':
			if (*expression == '=') token.add(*expression++);
			break;
		case '=':
			if (*expression == '=') token.add(*expression++);
			break;
		case '<':
			// '<=' and '<<' (and, as written, even '<=<') are accepted.
			if (*expression == '=') token.add(*expression++);
			if (*expression == '<') token.add(*expression++);
			break;
		case '>':
			if (*expression == '=') token.add(*expression++);
			if (*expression == '>') token.add(*expression++);
			break;
		}
	}
	else if((expression[0]=='0') && (expression[1]=='x')) // hex value
	{
		type = VT_NUM;
		token.add(*expression++);   // '0'
		token.add(*expression++);   // 'x'
		while(isxdigit(*expression))
		{
			token.add(*expression++);
		}
	}
	else if(isNumeric(*expression))
	{
		type = VT_NUM;
		while(isNumeric(*expression) || (*expression=='e')) // floating point, scientific
		{
			if (*expression=='e')
			{
				// Exponent marker, optionally followed by a minus sign.
				token.add(*expression++);
				if (*expression=='-') token.add(*expression++);
			}
			else
			{
				token.add(*expression++);
			}
		}
		// Optional radix/format suffix (octal/binary/decimal).
		switch(*expression)
		{
		case 'o':
		case 'b':
		case 'd':
			token.add(*expression++);
			break;
		}
	}
	else if(isAlpha(*expression) || ((unsigned char)*expression>=0x80))
	{
		// Identifier: alphabetic or any non-ASCII byte.
		type = VT_VAR;
		while(isAlpha(*expression) || ((unsigned char)*expression>=0x80))
		{
			token.add(*expression++);
		}
	}
	else if(*expression)
	{
		// Unrecognized character: record it and signal a syntax error.
		token.add(*expression++);
		throw(E_SYNTAX);
	}
	// Post-scan classification: ':=' marks a definition (the ':' is
	// consumed; '=' is left for the next scan); a lone '=' marks a
	// variable assignment target.
	if((*expression==':')&&(expression[1]=='='))
	{
		expression++;
		type = VT_DEF;
	}
	else if((*expression=='=') && (expression[1]!='='))
	{
		type = VT_VAR;
	}
	while(isspace(*expression)) expression++;
}
/* Merge drive, path, name, and extension components into a filename.
**
** fnam:  output buffer, or NULL to have one allocated (via eno_malloc).
** dr:    drive spec ("C" style, or a UNC "\\server" spec when
**        SUPPORT_UNC_PATH is compiled in), or NULL.
** Xpath: directory path component, or NULL.
** nam:   base name, or NULL.
** ext:   extension (without the dot), or NULL.
**
** Returns the filled buffer, or NULL (0) if allocation failed.
** The 'delim' variable tracks what kind of separator the string
** currently ends with, so a '\\' is inserted only where needed.
*/
char *dfnmerge(char *fnam, const char * const dr, const char * const Xpath
	, const char * const nam, const char * const ext)
{	int len;
	const char *path;
	char *p;
	int delim;	/* type of last path component delimiter:
				 0: none
				 1: a "real" one ('/' or '\\') or none necessary
				 2: a colon
				*/

#ifdef SUPPORT_UNC_PATH
	DBG_ENTER("dfnumerge", Suppl_dfn)
#else
	DBG_ENTER("dfnmerge", Suppl_dfn)
#endif
	DBG_ARGUMENTS( ("dr=\"%s\", pa=\"%s\", na=\"%s\", ex=\"%s\", out=%p", dr, Xpath, nam, ext, fnam) )

	path = Xpath;
	if((p = fnam) == 0) {
		/* determine file nam length */
#ifdef SUPPORT_UNC_PATH
		/* A UNC drive spec needs its own length + separator; a plain
		   drive letter needs "X:" (2) + terminator. */
		len = dr? (*dr == '\\'? strlen(dr) + 1: 3): 1;
#else
		len = dr? 3: 1;		/* add the NUL terminator */
#endif
		if(path) {
			if(*path)
				len += strlen(path) + 1;
			else path = 0;		/* no path component specified */
		}
		if(nam)
			len += strlen(nam);
		if(ext)
			len += strlen(ext) + 1;	/* +1 for the '.' */
		if((fnam = p = eno_malloc(len)) == 0)
			DBG_RETURN_S( 0)
		*fnam = NUL;
	}

	if(dr) {
#ifdef SUPPORT_UNC_PATH
		if(*dr == '\\') {
			p = stpcpy(p, dr);
			/* Test if the drive spec already ends with a delimiter */
			delim = isDelim(p[-1]);
		}
		else {
#endif
			/* Plain drive letter: emit "X:" (skip the letter if empty). */
			if(0 != (*p = *dr))
				++p;
			*p++ = ':';
			/* The colon is not a delimiter for root directories */
			delim = 2;
#ifdef SUPPORT_UNC_PATH
		}
#endif
		*p = NUL;
	}
	else delim = 1;		/* no drive --> no delim necessary */

	if(path) {
		switch(delim) {
		case 0:		/* missing delimiter --> need one unless path has one */
			if(!isDelim(*path))
				*p++ = '\\';
			break;
		}
		if(!*path)		/* The root dir always requires a backslash */
			*p++ = '\\';
		/* Now, the delimiter is definitely there */
		p = stpcpy(p, path);
		/* Re-derive what the string currently ends with. */
		if(0 == (delim = isDelim(p[-1]))) {
			if(p[-1] == ':')
				delim = 2;
		}
	}

	if(nam) {
		if(!delim) {
			if(!isDelim(*nam))
				*p++ = '\\';
			delim = 1;
		}
		p = stpcpy(p, nam);
	}

	if(ext) {
		if(!delim) {
			/* NOTE(review): unlike the 'nam' case this inserts '\\'
			   unconditionally -- presumably intentional; confirm. */
			*p++ = '\\';
			delim = 1;
		}
		*p++ = '.';
		strcpy(p, ext);
	}

	DBG_RETURN_S( fnam)
}
// Write this block's code section from fInput to fOut for .pde generation.
//
// fOut:      destination stream for the generated code
// fInput:    source stream positioned at this block's code template
// t:         current indentation depth (tabs)
// printList: map of block titles -> already-printed flag
//
//******* okay, let's try this; this may be a fairly complicated way to write out the .pde for compiling
//******* and uploading, but it seems to work okay. This function reads a block of code from a file handed from the functions
//******* below, parses it to find references to contained blocks or dropdowns, and properly formats it
void block::printOut(ofstream* fOut,ifstream * fInput,int t, map<string,bool> * printList){
	bool printed=printList->find(title)->second;
	bool partP=partnerWritten(printList);
	//-------- init the buffer, the pos counters and the end flag
	string buffer;
	int strtPos=0,endPos=0;
	bool bEnd=false;
	//-------- while we're not at the end of the file and we haven't reached the end of the read section
	while ((*fInput).peek()!=EOF&&!bEnd) {
		//-------- get a line and reset the position counters
		getline((*fInput),buffer);
		strtPos=endPos=0;
		//-------- write t amount of tabs to fOut (skipped for the closing "}")
		for (int i=0; i<t&&buffer.compare("}"); i++) {
			*fOut << "\t";
		}
		//-------- if the buffer is not a single '}', then parse the line; otherwise, it indicates the end of the section
		if(buffer.compare("}")){
			//-------- init the foundTab var, and start stepping through the buffer
			int foundTab=0;
			for (unsigned int i=0; i<buffer.length(); i++) {
				//-------- if we find a '$'
				if(buffer[i]=='$'){
					//-------- step through the buffer until you find [ ,\n;")]
					strtPos=endPos=++i;
					while (!isDelim(buffer[endPos]," ,\n;\")")) {
						endPos++;
					}
					i=endPos-1; // move the step counter to the end of the word
					//-------- once you have found the segment of code after the $, store it in "temp" and split the string by "[]"
					//-------- doing this allows you to find which dropdown to look at (most often, it's 0)
					string temp(buffer,strtPos,endPos-strtPos);
					vector<string> tempVec=ofSplitString(temp, "[]");
					//-------- pos stores the vector position of the dd you are looking for
					int pos=0;
					if(tempVec.size()>1){
						pos=atoi(tempVec[1].c_str());
					}
					//-------- reassemble the string without the []
					if(tempVec.size()>2) temp = tempVec[0] + tempVec[2];
					// Skip the ';' that directly follows a $blockIn reference.
					if(temp.compare("blockIn")==0&&buffer[i+1]==';') i++;
					//-------- generate the map for use in the switch
					map<string,int> list;
					list["dd.num"]=0;
					list["dd.str"]=1;
					list["dd.ind"]=2;
					list["blockOn"]=3;
					list["blockIn"]=4;
					list["blockIf"]=5;
					// NOTE(review): an unrecognized $word makes find() return
					// end() and dereferencing it is undefined -- confirm all
					// template words are covered by the map above.
					switch (list.find(temp)->second) {
						case 0:
							//-------- if temp=="dd.num" write the value of the "pos" dropdown
							*fOut << ddGroup[pos].getValue();
							break;
						case 1:
							//-------- if temp=="dd.str" write the string of the value stored in the dd[pos]
							*fOut << ddGroup[pos].getString();
							break;
						case 2:
							//-------- if temp=="dd.ind" write the index of the value stored in the dd[pos]
							*fOut << ddGroup[pos].getIndex();
							break;
						case 3:
							//-------- if temp==blockOn, printout the loop function for each of the blocks on
							for(unsigned int i=0; i<blocksOn.size(); i++){
								blocksOn[i].printData("loop(){",fOut,t,printList);
							}
							break;
						case 4:{
							//-------- if temp==blockIn, printout the loop for each of the blocks inside
							// Back the write position up over the characters
							// already emitted before the reference.
							int outP=fOut->tellp();
							fOut->seekp(outP-2);
							for(unsigned int i=0; i<blocksIn.size(); i++){
								blocksIn[i].printData("loop(){",fOut,t+1,printList,false);
							}
							resetList(*this, *printList);
							for(unsigned int i=0; i<blocksIn.size(); i++){
								//blocksIn[i].printData("end(){",fOut,t+1,printList);
							}
							outP=fOut->tellp();
							fOut->seekp(outP-1);
							break;
						}
						case 5:
							//-------- if temp==blockIf, print the numBlocks
							//-------- just realized this won't work with more than one block, need to do it like the
							//-------- dd blocks above
							//-------- TODO: don't be a dumbshit
							for(unsigned int i=0; i<numBlocks.size(); i++){
								if(!numBlocks[i].placeHolder) numBlocks[i].printData("loop(){",fOut,0,printList);
								else {
									*fOut << "0";
								}
							}
							break;
						default:
							break;
					}
				}
				//-------- if the buff line is preceded by a '@', print the line only if the block has not already appeared
				else if(buffer[i]=='@'){
					if (printed) i=buffer.length();
				}
				//-------- if the buffer at the current pos is '~', print only if the block or a complement block
				//-------- has not been printed before
				else if(buffer[i]=='~'){
					if (partP) i=buffer.length();
				}
				//-------- increment the foundtab count if we find a '\t'
				else if(buffer[i]=='\t'){
					foundTab++;
					if (foundTab>1) *fOut << buffer[i];
				}
				//-------- if nothing else, printout the character
				else if(buffer[i]!='\n'&&buffer[i]!='\r') *fOut << buffer[i];
			}
			//-------- newline after buffer if it was not a numblock
			//-------- TODO: figure out why no new line if printed already
			if(!numBlock) *fOut << '\n';
		} else {
			//-------- if we found a '}' by itself, end.
			(*fInput).seekg (0, ios::end);
			bEnd=true;
		}
	}
}
// Register 'd' as a delimiter character, ignoring duplicates.
void Tokenizer::addDelimiter(char d) {
	if (isDelim(d)) {
		return;   // already registered
	}
	// NOTE(review): no capacity check on mDelims -- confirm mNumDelim
	// cannot exceed the array's size.
	mDelims[mNumDelim++] = d;
}
/** Feed the LineInfoList for the given text
  *
  * Splits vText into lines, optionally word-wrapping at vWidth using the
  * per-glyph advance widths. Words are delimited by isDelim(); a '\n'
  * always forces a line break.
  *
  * \param vText The text to draw
  * \param vWidth The allowed width
  * \param vOut The LineInfoList to feed
  * \param vWrap The word wrap parameter
  *
  */
void RainbruRPG::OgreGui::Font::
processText( const std::string& vText, float vWidth, LineInfoList& vOut,
             bool vWrap)const{
  // Get the total size of the text
  unsigned int count = (unsigned int)vText.size( );

  // Stores pixel width of line and word
  float lineWidth = 0.0f;
  float wordWidth = 0.0f;

  // Stores current word
  std::string word;

  // Stores the current line
  std::string line;

  unsigned int x;
  for ( x = 0; x < count; x++ ){
    char c = vText[x];

    // Add the new character to the current word
    Glyph* gl=getGlyph(c);
    wordWidth += gl->getSpace();
    word += c;

    bool delim=isDelim(c);
    // A delimiter (or the last character) flushes the pending word.
    if ( delim || ( x == ( count-1 ) ) ){
      // Is this line too long to fit?
      if ( vWrap && ( lineWidth + wordWidth > vWidth ) ){
        // Save current line (the word that overflowed starts the next one)
        vOut.push_back( LineInfo( line, lineWidth ) );

        // Reset line width
        lineWidth = 0.0f;
        line = "";
      }

      if ( c == '\n' ){
        // Save current line, including the word that ends at the newline
        vOut.push_back( LineInfo( line + word, lineWidth + wordWidth ) );

        // Reset line width
        lineWidth = 0.0f;
        wordWidth = 0.0f;
        line = "";
        word = "";
      }
      else{
        // Append the word (and its width) to the current line.
        lineWidth += wordWidth;
        line += word;
        wordWidth = 0.0f;
        word = "";
      }
    }
  }

  // Push any remaining text onto list
  // NOTE(review): this also pushes an empty LineInfo when vText is empty
  // or ends in '\n' -- presumably intentional; confirm callers expect it.
  vOut.push_back( LineInfo( line + word, lineWidth + wordWidth ) );
}
void Invocation::loadCFGFile() { ifstream * in = new ifstream(params->getCFGFileName()); if (in->fail()) { in->close(); delete in; if (params->getIsCFGFileSet()) warningsQueue.pb("configuration file \"" + params->getCFGFileName() + "\" was not found"); return; } string s, curParam, curValue; int ls, x; bool ok; while (getline(*in, s)) { trim(s); if (s[0] == '[' || s[0] == ';') continue; ls = s.length(); x = -1; for (int i = 0; i < ls; ++i) if (isDelim(s[i]) || s[i] == '=') { x = i; break; } if (x == -1) { warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message"); continue; } curParam = lowercase(s.substr(0, x)); if (curParam[0] != '-') curParam = "-" + curParam; while (x < ls && s[x] != '=') ++x; if (x++ >= ls) { warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message"); continue; } while (x < ls && isDelim(s[x])) ++x; if (x >= ls) { warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message"); continue; } curValue = s.substr(x); if (!flagsDict.count(curParam)) { warningsQueue.pb("unknown parameter \"" + curParam + "\" is defined in configuration file"); continue; } ok = !definedParams.count(curParam); if (!ok) warningsQueue.pb("parameter \"" + curParam + "\" redefinition if configuration file. 
Last definition is accepted"); int intValue; if (curParam == "-c") params->setCheckerFileName(curValue); else if (curParam == "-checkertl") { intValue = toi(curValue); if (intValue == -1) { warningsQueue.pb("parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found in configuration file"); continue; } params->setCheckerTimeLimit(intValue); } else if (curParam == "-i") params->setInputFileName(curValue); else if (curParam == "-ml") { intValue = toi(curValue); if (intValue == -1) { warningsQueue.pb("configuration file :: parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found"); continue; } params->setMemoryLimit(intValue); } else if (curParam == "-o") params->setOutputFileName(curValue); else if (curParam == "-p") params->setProgramFileName(curValue); else if (curParam == "-tc") { intValue = toi(curValue); if (intValue == -1) { warningsQueue.pb("configuration file :: parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found"); continue; } params->setTestsCount(intValue); } else if (curParam == "-tim") params->getInputFileMask()->setFileMask(curValue); else if (curParam == "-tl") { intValue = toi(curValue); if (intValue == -1) { warningsQueue.pb("parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found in configuration file"); continue; } params->setTimeLimit(intValue); } else if (curParam == "-tom") params->getOutputFileMask()->setFileMask(curValue); else if (curParam != "-nowarnings") { ok = false; warningsQueue.pb("parameter \"" + curParam + "\" can not be defined in configuration file"); } if (ok) definedParams.insert(curParam); } in->close(); delete in; }
/**
 * Generate a new token. There are basically three types of token we can
 * generate:
 * - A porter stemmed token. This is a word entirely comprised of ASCII
 *   characters. We run the porter stemmer algorithm against the word.
 *   Because we have no way to know what is and is not an English word
 *   (the only language for which the porter stemmer was designed), this
 *   could theoretically map multiple words that are not variations of the
 *   same word down to the same root, resulting in potentially unexpected
 *   result inclusions in the search results. We accept this result because
 *   there's not a lot we can do about it and false positives are much
 *   better than false negatives.
 * - A copied token; case/accent-folded but not stemmed. We call the porter
 *   stemmer for all non-CJK cases and it diverts to the copy stemmer if it
 *   sees any non-ASCII characters (after folding) or if the string is too
 *   long. The copy stemmer will shrink the string if it is deemed too long.
 * - A bi-gram token; two CJK-ish characters. For query reasons we generate a
 *   series of overlapping bi-grams. (We can't require the user to start their
 *   search based on the arbitrary context of the indexed documents.)
 *
 * It may be useful to think of this function as operating at the points
 * between characters. While we are considering the 'current' character (the
 * one after the 'point'), we are also interested in the 'previous' character
 * (the one preceding the point).
 * At any 'point', there are a number of possible situations which I will
 * illustrate with pairs of characters. 'a' means alphanumeric ASCII or a
 * non-ASCII character that is not bi-grammable or a delimeter, '.'
 * means a delimiter (space or punctuation), '&' means a bi-grammable
 * character.
 * - aa: We are in the midst of a token. State remains BIGRAM_ALPHA.
 * - a.: We will generate a porter stemmed or copied token. State was
 *       BIGRAM_ALPHA, gets set to BIGRAM_RESET.
 * - a&: We will generate a porter stemmed or copied token; we will set our
 *       state to BIGRAM_UNKNOWN to indicate we have seen one bigram character
 *       but that it is not yet time to emit a bigram.
 * - .a: We are starting a token. State was BIGRAM_RESET, gets set to
 *       BIGRAM_ALPHA.
 * - ..: We skip/eat the delimeters. State stays BIGRAM_RESET.
 * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char.
 * - &a: If the state was BIGRAM_USE, we generate a bi-gram token. If the
 *       state was BIGRAM_UNKNOWN we had only seen one CJK character and so
 *       don't do anything. State is set to BIGRAM_ALPHA.
 * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET.
 * - &&: We will generate a bi-gram token. State was either BIGRAM_UNKNOWN or
 *       BIGRAM_USE, gets set to BIGRAM_USE.
 */
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  const unsigned char *z = (unsigned char *) c->zInput;
  int len = 0;
  int state;

  while( c->iOffset < c->nInput ){
    int iStartOffset, numChars;

    /*
     * This loop basically has two modes of operation:
     * - general processing (iPrevBigramOffset == 0 here)
     * - CJK processing (iPrevBigramOffset != 0 here)
     *
     * In an general processing pass we skip over all the delimiters, leaving
     * us at a character that promises to produce a token. This could be a
     * CJK token (state == BIGRAM_USE) or an ALPHA token
     * (state == BIGRAM_ALPHA). If it was a CJK token, we transition into CJK
     * state for the next loop.
     * If it was an alpha token, our current offset is pointing at a delimiter
     * (which could be a CJK character), so it is good that our next pass
     * through the function and loop will skip over any delimiters. If the
     * delimiter we hit was a CJK character, the next time through we will
     * not treat it as a delimiter though; the entry state for that scan is
     * BIGRAM_RESET so the transition is not treated as a delimiter!
     *
     * The CJK pass always starts with the second character in a bi-gram
     * emitted as a token in the previous step. No delimiter skipping is
     * required because we know that first character might produce a token
     * for us. It only 'might' produce a token because the previous pass
     * performed no lookahead and cannot be sure it is followed by another
     * CJK character.
     */
    /* If we have a previous bigram offset */
    if (c->iPrevBigramOffset == 0) {
      /* Scan past delimiter characters */
      state = BIGRAM_RESET; /* reset */
      while (c->iOffset < c->nInput
             && isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
        c->iOffset += len;
      }
    } else {
      /* for bigram indexing, use previous offset */
      c->iOffset = c->iPrevBigramOffset;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    numChars = 0;

    /* Start from a reset state. This means the first character we see
    ** (which will not be a delimiter) determines which of ALPHA or CJK modes
    ** we are operating in. (It won't be a delimiter because in a 'general'
    ** pass as defined above, we will have eaten all the delimiters, and in
    ** a CJK pass we are guaranteed that the first character is CJK.) */
    state = BIGRAM_RESET; /* state is reset */

    /* Advance until it is time to emit a token.
    ** For ALPHA characters, this means advancing until we encounter a
    ** delimiter or a CJK character. iOffset will be pointing at the
    ** delimiter or CJK character, aka one beyond the last ALPHA character.
    ** For CJK characters this means advancing until we encounter an ALPHA
    ** character, a delimiter, or we have seen two consecutive CJK
    ** characters. iOffset points at the ALPHA/delimiter in the first 2 cases
    ** and the second of two CJK characters in the last case.
    ** Because of the way this loop is structured, iOffset is only updated
    ** when we don't terminate. However, if we terminate, len still contains
    ** the number of bytes in the character found at iOffset. (This is useful
    ** in the CJK case.) */
    while (c->iOffset < c->nInput
           && !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
      c->iOffset += len;
      numChars++;
    }

    if (state == BIGRAM_USE) {
      /* Split word by bigram */
      /* Right now iOffset is pointing at the second character in a pair.
      ** Save this offset so next-time through we start with that as the
      ** first character. */
      c->iPrevBigramOffset = c->iOffset;
      /* And now advance so that iOffset is pointing at the character after
      ** the second character in the bi-gram pair. Also count the char. */
      c->iOffset += len;
      numChars++;
    } else {
      /* Reset bigram offset */
      c->iPrevBigramOffset = 0;
    }

    /* We emit a token if:
    ** - there are two ideograms together,
    ** - there are three chars or more,
    ** - we think this is a query and wildcard magic is desired.
    ** We think is a wildcard query when we have a single character, it
    ** starts at the start of the buffer, it's CJK, our current offset is
    ** one shy of nInput and the character at iOffset is '*'. Because the
    ** state gets clobbered by the incidence of '*' our requirement for CJK
    ** is that the implied character length is at least 3 given that it
    ** takes at least 3 bytes to encode to 0x2000.
    **
    ** It is possible we have no token to emit here if iPrevBigramOffset was
    ** not 0 on entry and there was no second CJK character.
    ** iPrevBigramOffset will now be 0 if that is the case (and
    ** c->iOffset == iStartOffset). */
    if (/* allow two-character words only if in bigram */
        (numChars == 2 && state == BIGRAM_USE) ||
        /* otherwise, drop two-letter words (considered stop-words) */
        (numChars >= 3) ||
        /* wildcard case: */
        (numChars == 1 && iStartOffset == 0 && (c->iOffset >= 3) &&
         (c->iOffset == c->nInput - 1) && (z[c->iOffset] == '*'))) {
      /* figure out the number of bytes to copy/stem */
      int n = c->iOffset - iStartOffset;
      /* make sure there is enough buffer space */
      if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
        /* BUGFIX: realloc through a temporary -- assigning the result
        ** straight to c->zToken leaked the existing buffer on OOM. */
        void *pNew;
        c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
        pNew = sqlite3_realloc(c->zToken, c->nAllocated);
        if (pNew == NULL) return SQLITE_NOMEM;
        c->zToken = pNew;
      }

      if (state == BIGRAM_USE) {
        /* This is by bigram. So it is unnecessary to convert word */
        copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      } else {
        porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      }
      *pzToken = (const char*) c->zToken;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
// Advance currentPosition past any run of delimiter characters.
// NOTE(review): relies on *iter being readable at end-of-input (i.e. a
// NUL terminator stopping isDelim) -- same assumption firstToken() makes.
void Tokenizer::skipDelim(void) {
	std::string::iterator iter = currentPosition;
	while (isDelim(*iter)) {
		++iter;
	}
	currentPosition = iter;
}
/* Validate a "HH:MM"-style time string with a small state machine.
**
** timeStr: the characters to check (need not be NUL-terminated; strLen
**          bytes are examined).
** strLen:  number of bytes in timeStr.
** pState:  OUT -- final machine state (tm_state_end_illegal on error).
**
** Returns NO_ERR when the string ends in the hour or minute state (so a
** bare "HH" is also accepted, as before), else ERROR. Hours are limited
** to 0-23 and minutes to 0-59, each at most two digits.
*/
U8 timeLegal(U8* timeStr, U16 strLen, em_time_state* pState) {
	U16 i = 0, digitalLen = 0;
	/* value[] holds at most 2 digits; value[2] stays 0 so the buffer is
	** always NUL-terminated for atoi(). */
	U8 data, value[3] = { 0 };
	em_time_state state = tm_state_init;
	for (i = 0; i < strLen; i++) {
		data = timeStr[i];
		//printf("data: %c\n", data);
		switch (state) {
		case tm_state_init:
			/* First character must be a digit (start of the hour). */
			if (isdigit(data)) {
				value[digitalLen] = data;
				digitalLen++;
				state = tm_state_hour;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_hour:
			if (isdigit(data)) {
				if (digitalLen == 2) {          /* more than 2 hour digits */
					state = tm_state_end_illegal;
					goto result;
				}
				value[digitalLen] = data;
				/* BUGFIX: atoi() takes const char*; value is U8* (unsigned
				** char*), which is a constraint violation in C and a hard
				** error in C++ -- cast explicitly. */
				if (atoi((const char *)value) > 23) {
					state = tm_state_end_illegal;
					goto result;
				}
				digitalLen++;
			} else if (isDelim(data)) {
				/* Separator reached: reset the digit buffer for minutes. */
				digitalLen = 0;
				memset(value, 0, sizeof(value));
				state = tm_state_delim;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_delim:
			/* A digit must follow the separator. */
			if (isdigit(data)) {
				value[digitalLen] = data;
				digitalLen++;
				state = tm_state_min;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_min:
			if (isdigit(data)) {
				if (digitalLen == 2) {          /* more than 2 minute digits */
					state = tm_state_end_illegal;
					goto result;
				}
				value[digitalLen] = data;
				if (atoi((const char *)value) > 59) {
					state = tm_state_end_illegal;
					goto result;
				}
				digitalLen++;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		default:
			break;
		}
	}
result:
	//printf("reach result\n");
	*pState = state;
	if (state == tm_state_hour || state == tm_state_min)
		return NO_ERR;
	else
		return ERROR;
}