Esempio n. 1
0
void GLECSVData::setDelims(const char* delims) {
	// Rebuild the per-byte delimiter lookup table from a NUL-terminated
	// string of delimiter characters.
	const unsigned int size = 256;
	for (unsigned int i = 0; i < size; i++) {
		m_delims[i] = false;
	}
	// Index with an unsigned char: the previous (int) cast produced a
	// negative index for bytes >= 0x80 on platforms where char is signed.
	int pos = 0;
	while (delims[pos] != 0) {
		m_delims[(unsigned char)delims[pos]] = true;
		pos++;
	}
	// Remember whether whitespace itself acts as a delimiter.
	m_lastDelimWasSpace = isDelim(' ') || isDelim('\t');
}
/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        /* Grow the token buffer.  Use a temporary pointer so the original
        ** buffer is not leaked when realloc() fails: assigning the NULL
        ** result directly to c->pToken would orphan the old allocation. */
        char *pNew;
        c->nTokenAllocated = n+20;
        pNew = realloc(c->pToken, c->nTokenAllocated);
        if( pNew==NULL ) return SQLITE_NOMEM;
        c->pToken = pNew;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;

      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
Esempio n. 3
0
/* Classify the token at `text` as an integer literal or a bare sign
** operator.  On success *len receives the token length in characters.
** Returns tok_integer, tok_add, tok_sub, or tok_error when the token
** is not followed by a delimiter. */
token_type getIntToken(const char *text, int *len) {
  /* Detect an optional leading sign. */
  const int hasSign = (text[0] == '+' || text[0] == '-') ? 1 : 0;

  /* Count the digits that follow the (optional) sign. */
  const int digits = readInt(text + hasSign);

  /* The token must be terminated by a delimiter to be valid. */
  if (!isDelim(text[hasSign + digits])) {
    return tok_error;
  }

  /* A lone '+' or '-' (sign but no digits) is an operator, not a number. */
  if (hasSign == 1 && digits == 0) {
    *len = 1;
    return (text[0] == '+') ? tok_add : tok_sub;
  }

  *len = hasSign + digits;
  return tok_integer;
}
Esempio n. 4
0
/*
** Return the next token from the cursor.  The cursor must previously
** have been opened with porterOpen().
*/
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  porter_tokenizer_cursor *cur = (porter_tokenizer_cursor *) pCursor;
  const char *text = cur->zInput;

  while( cur->iOffset < cur->nInput ){
    int tokenStart;

    /* Skip any run of delimiter characters. */
    while( cur->iOffset < cur->nInput && isDelim(text[cur->iOffset]) ){
      cur->iOffset++;
    }

    /* Measure the run of non-delimiter characters that follows. */
    tokenStart = cur->iOffset;
    while( cur->iOffset < cur->nInput && !isDelim(text[cur->iOffset]) ){
      cur->iOffset++;
    }

    if( cur->iOffset > tokenStart ){
      int nTok = cur->iOffset - tokenStart;
      /* Grow the token buffer if needed; a temporary pointer keeps the
      ** old buffer reachable if the reallocation fails. */
      if( nTok > cur->nAllocated ){
        char *zNew;
        cur->nAllocated = nTok + 20;
        zNew = sqlite3_realloc(cur->zToken, cur->nAllocated);
        if( !zNew ) return SQLITE_NOMEM;
        cur->zToken = zNew;
      }
      /* Stem the word into the cursor's token buffer. */
      porter_stemmer(&text[tokenStart], nTok, cur->zToken, pnBytes);
      *pzToken = cur->zToken;
      *piStartOffset = tokenStart;
      *piEndOffset = cur->iOffset;
      *piPosition = cur->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
Esempio n. 5
0
std::string Tokenizer::firstToken(void) {
	// Return the first token starting at currentPosition, advancing the
	// position past the token (and past the delimiter that ended it).
	thisToken.erase();
	skipDelim();
	std::string::iterator iter = currentPosition;
	// Accumulate characters until the terminator or a delimiter.
	for( ; *iter && !isDelim(*iter); iter++ ) {
		thisToken += *iter;
	}
	// Compare against '\0', not the null-pointer constant NULL: *iter is a
	// char, so the original `*iter == NULL` compiled only by accident.
	if (*iter == '\0') {
		currentPosition = iter;         // stay on the terminator
	} else {
		currentPosition = ++iter;       // step past the delimiter
	}
	return thisToken;
}
Esempio n. 6
0
 // Advance to the next token, NUL-terminating it in place inside the
 // shared buffer.  Returns false when no more tokens remain.
 //
 // NOTE(review): both scanning loops are unguarded against the end of the
 // buffer; they rely on isDelim() treating '\0' as a delimiter (or on the
 // buffer ending in a delimiter).  Confirm isDelim's contract, and whether
 // input consisting only of delimiters is possible here.
 bool Tokenizer::next(void) {
   mToken = mNextToken;
   if (!mToken || (*mToken == '\0')) {
     return false;
   }
   // Skip leading delimiters to reach the start of the token.
   while (isDelim(*mToken)) {
     mToken++;
   }
   // Scan forward to the delimiter that ends the token.
   char *tokenend = mToken + 1;
   while (!isDelim(*tokenend)) {
     tokenend++;
   }
   // Terminate the token and record where the next scan should resume.
   if (*tokenend == '\0') {
     mNextToken = tokenend;
   } else {
     *tokenend = '\0';
     mNextToken = tokenend + 1;
   }
   return true;
 }
Esempio n. 7
0
void Invocation::getNoWarnings() {
	// Disable warnings if "-nowarnings" was given on the command line,
	// or if the configuration file contains "nowarnings=true".
	int kk = cmdParams.size();
	for (int i = 0; i < kk; ++i)
		if (cmdParams[i] == "-nowarnings") {
			showWarnings = false;
			return;
		}
	string s1, s2, s;
	int x, ls;
	// Stack-allocated stream: closed automatically on every exit path,
	// unlike the previous heap-allocated ifstream with manual close/delete.
	ifstream in(params->getCFGFileName());
	if (in.fail()) {
		return;
	}
	while (getline(in, s)) {
		trim(s);
		// Skip section headers and comment lines.
		if (s[0] == '[' || s[0] == ';') continue;
		ls = s.length();
		// Locate the end of the parameter name (first delimiter or '=').
		x = -1;
		for (int i = 0; i < ls; ++i)
			if (isDelim(s[i]) || s[i] == '=') {
				x = i;
				break;
			}
		if (x == -1) continue;
		// Normalize the parameter name to its "-name" form.
		s1 = lowercase(s.substr(0, x));
		if (s1[0] != '-') s1 = "-" + s1;
		// Advance past '=' and any delimiters to reach the value.
		while (x < ls && s[x] != '=') ++x;
		if (x++ >= ls) continue;
		while (x < ls && isDelim(s[x])) ++x;
		if (x >= ls) continue;
		s2 = lowercase(s.substr(x));
		if (s1 == "-nowarnings" && s2 == "true") {
			showWarnings = false;
			break;
		}
	}
}
Esempio n. 8
0
// Consume whitespace after a cell and swallow the first delimiter found.
// Returns EOF / newline status so the caller can track record boundaries;
// a non-space, non-delimiter character is pushed back for the next read.
GLECSVDataStatus GLECSVData::skipSpacesAndFirstDelim(GLEBYTE ch) {
	while (true) {
		if (!isSpace(ch)) {
			if (ch == 0) {
				return GLECSVDataStatusEOF;
			} else if (isEol(ch)) {
				return readNewline(ch);
			} else if (isDelim(ch)) {
				// NOTE(review): ch has already failed isSpace() above, so
				// this appears to always store false.  Confirm whether the
				// intent was to record the *skipped* space instead.
				m_lastDelimWasSpace = isSpace(ch);
				return GLECSVDataStatusOK;
			} else {
				// Start of the next cell: put the character back.
				goBack();
				return GLECSVDataStatusOK;
			}
		}
		ch = readChar();
	}
	return GLECSVDataStatusOK;	// unreachable: the loop exits only via return
}
Esempio n. 9
0
	// Return the next token from mInput, or nullptr at end of input.
	// A token is a maximal run of same-class characters: either all
	// delimiter characters or all ordinary characters.  Whitespace
	// separates tokens and is never part of one.
	const char *StatTokeniser::nextToken()
	{
		if (mInput[0] == '\0')
		{
			return nullptr;
		}

		// Class of the characters gathered so far:
		// 0 = none yet, 1 = ordinary, 2 = delimiter.
		int tokenClass = 0;
		int readPos = 0;
		int writePos = 0;
		for (char cur = mInput[0]; cur != '\0'; cur = mInput[++readPos])
		{
			if (IS_WHITE(cur))
			{
				// Whitespace ends a token in progress; leading
				// whitespace is simply skipped.
				if (tokenClass > 0)
				{
					break;
				}
				continue;
			}

			const int curClass = isDelim(cur) ? 2 : 1;
			if (tokenClass == 0)
			{
				tokenClass = curClass;
			}
			else if (tokenClass != curClass)
			{
				// Switching between delimiter and ordinary
				// characters ends the current token.
				break;
			}
			mBuff[writePos++] = cur;
		}
		mBuff[writePos] = '\0';
		mInput += readPos;
		return mBuff;
	}
Esempio n. 10
0
// Tokenize the stream line by line, appending tokens (and ENDLINE markers)
// to `out`.
//
// NOTE(review): when the word "attributes" is seen the rest of the line is
// skipped, yet that word still reaches check_for_token() after the loop;
// "align" is cleared first so it is dropped entirely.  Also, `word` is not
// cleared here after check_for_token -- presumably that helper consumes or
// clears it via the reference; confirm against its definition.
void tokenize(std::istream &in, std::vector<token> &out) {
    std::string line;
    while(!in.fail() && !in.eof()) {
        std::getline(in,line,'\n');
        std::string word;
        for (char c : line) {
            if (isDelim(c)) {
                // Special words that terminate scanning of this line.
                if (word == "attributes") break;
                if (word == "align") {word.clear();break;}
                check_for_token(word,out);
                check_for_delim(c,out);
            } else {
            if (c != '\n') word.push_back(c);
            }
        }
        // Flush the trailing word of the line, then mark the line end.
         check_for_token(word,out);
        out.push_back({ENDLINE,"\n"});
    }
}
Esempio n. 11
0
// Read one CSV cell starting at the current stream position.  Quoted cells
// are handed off to readCellString(); otherwise characters are consumed
// until EOF, end-of-line, a delimiter, or a comment marker, and the cell is
// created with trailing spaces trimmed (cellSize only advances on
// non-space characters).
GLECSVDataStatus GLECSVData::readCell() {
	GLEBYTE ch = readSignificantChar();
	if (ch == '"' || ch == '\'') {
		// Quoted cell: delegated entirely to the string reader.
		return readCellString(ch);
	}
	unsigned int cellCount = 0;        // chars consumed, including spaces
	unsigned int cellSize = 0;         // size up to the last non-space char
	unsigned int cellPos = lastCharPos();
	while (true) {
		if (ch == 0) {
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return GLECSVDataStatusEOF;
		} else if (isEol(ch)) {
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return readNewline(ch);
		} else if (isDelim(ch)) {
			// Record whether this delimiter was whitespace (affects how
			// consecutive delimiters are interpreted downstream).
			m_lastDelimWasSpace = isSpace(ch);
			if (isSizeCheckOKAtDelim(ch, cellSize)) {
				createCell(cellSize, cellPos);
			}
			return skipSpacesAndFirstDelim(ch);
		} else if (isComment(ch)) {
			if (isSizeCheckOKEndOfLine(cellSize)) {
				createCell(cellSize, cellPos);
			}
			return skipTillEol();
		}
		cellCount++;
		// Only extend the cell size on non-space chars: trailing spaces
		// before the delimiter are thereby excluded from the cell.
		if (!isSpace(ch)) {
			cellSize = cellCount;
		}
		ch = readChar();
	}
	return GLECSVDataStatusOK;	// unreachable: loop exits only via return
}
Esempio n. 12
0
// Scan one token from `expression` into `token` and classify it in `type`:
// VT_DEL for operators, VT_NUM for numeric literals (hex, decimal,
// scientific, or with an o/b/d radix suffix), VT_VAR for identifiers
// (including bytes >= 0x80), VT_DEF for ":=" definitions.  Throws E_SYNTAX
// on an unrecognizable character.
void SRExpressionsParser::parse(void)
{

	type = VT_UNKNOWN;
	token.reset();
	// Skip leading whitespace.
	while(isspace(*expression))
		expression++;
	if(isDelim(*expression))
	{
		// parse for valid ops

		type = VT_DEL;
		char op=*expression;
        token.add(*expression++);
		// Absorb the second character of two-character operators.
		switch(op)
		{
		case '|':
			if (*expression == '|')
                token.add(*expression++);
			break;
		case '*':
			if (*expression == '*')
                token.add(*expression++);// power...
			break;
		case '&':
			if (*expression == '&')
                token.add(*expression++);
			break;
		case '!':
			if (*expression == '=')
                token.add(*expression++);
			break;
		case '=':
			if (*expression == '=')
                token.add(*expression++);
			break;
		case '<':
			if (*expression == '=')
                token.add(*expression++);
			if (*expression == '<')
                token.add(*expression++);
			break;
		case '>':
			if (*expression == '=')
                token.add(*expression++);
			if (*expression == '>')
                token.add(*expression++);
			break;

		}
	}
	else if((expression[0]=='0') && (expression[1]=='x')) // hex value
	{
		type = VT_NUM;
        token.add(*expression++);
        token.add(*expression++);
		while(isxdigit(*expression))
		{
			token.add(*expression++);
		}
	}
	else
		if(isNumeric(*expression))
		{
			type = VT_NUM;
			while(isNumeric(*expression) || (*expression=='e')) // floating point, scientific
			{
				if (*expression=='e')
				{
                    token.add(*expression++);
					// Allow a negative exponent after 'e'.
					if (*expression=='-')
                        token.add(*expression++);
				}
				else
				{
                    token.add(*expression++);
				}
			}
			// Optional radix suffix (octal / binary / decimal).
			switch(*expression)
			{
			case 'o':
			case 'b':
			case 'd':
                    token.add(*expression++);
				break;
			}
		}
		else if(isAlpha(*expression) || ((unsigned char)*expression>=0x80))
		{
			// Identifier: alphabetic or any high-bit (non-ASCII) byte.
			type = VT_VAR;
			while(isAlpha(*expression) || ((unsigned char)*expression>=0x80))
            {
                token.add(*expression++);
			}
		}
		else if(*expression)
		{
			token.add(*expression++);
			throw(E_SYNTAX);
		}
		// Lookahead for definition / assignment markers.
		// NOTE(review): for ":=" only one character is consumed here, so
		// the '=' seems to remain in the input -- confirm whether the
		// caller (or the next parse) is expected to absorb it.
		if((*expression==':')&&(expression[1]=='='))
		{
			expression++;
			type = VT_DEF;
		}
		else if((*expression=='=') && (expression[1]!='='))
		{
			type = VT_VAR;
		}

		// Skip trailing whitespace so the next call starts on a token.
		while(isspace(*expression))
			expression++;
}
Esempio n. 13
0
/* Merge drive, path, name, and extension components into one file name.
** If fnam is NULL, a buffer of the required size is allocated (the caller
** owns it); otherwise the result is written into fnam.  Missing path
** component delimiters ('\\') are inserted as needed, tracked via `delim`.
** Returns the buffer, or NULL on allocation failure.
** NOTE(review): the DBG_ENTER string "dfnumerge" in the UNC branch looks
** like a typo for "dfnmerge" -- confirm against the debug-log conventions.
*/
char *dfnmerge(char *fnam, const char * const dr, const char * const Xpath
 , const char * const nam, const char * const ext)
{	int len;
	const char *path;
	char *p;
	int delim;		/* type of last path component delimiter:
						0: none
						1: a "real" one ('/' or '\\') or none necessary
						2: a colon
					*/

#ifdef SUPPORT_UNC_PATH
	DBG_ENTER("dfnumerge", Suppl_dfn)
#else
	DBG_ENTER("dfnmerge", Suppl_dfn)
#endif
	DBG_ARGUMENTS( ("dr=\"%s\", pa=\"%s\", na=\"%s\", ex=\"%s\", out=%p", dr, Xpath, nam, ext, fnam) )

	path = Xpath;
	if((p = fnam) == 0) {	/* determine file nam length */
		/* Sum the worst-case lengths of all components, including the
		** separators this function may insert and the NUL terminator. */
#ifdef SUPPORT_UNC_PATH
		len = dr? (*dr == '\\'? strlen(dr) + 1: 3): 1;
#else
		len = dr? 3: 1;		/* add the NUL terminator */
#endif
		if(path) {
			if(*path)
				len += strlen(path) + 1;
			else path = 0;	/* no path component specified */
		}
		if(nam) len += strlen(nam);
		if(ext) len += strlen(ext) + 1;
		if((fnam = p = eno_malloc(len)) == 0)
			DBG_RETURN_S( 0)
		*fnam = NUL;
	}

	if(dr) {
#ifdef SUPPORT_UNC_PATH
		if(*dr == '\\') {
			/* UNC name: copy it verbatim instead of "X:". */
			p = stpcpy(p, dr);
				/* Test if the drive spec already ends with a delimiter */
			delim = isDelim(p[-1]);
		}
		else {
#endif
			if(0 != (*p = *dr))
				++p;
			*p++ = ':';
				/* The colon is not a delimiter for root directories */
			delim = 2;
#ifdef SUPPORT_UNC_PATH
		}
#endif
		*p = NUL;
	}
	else delim = 1;		/* no drive --> no delim necessary */

	if(path) {
		switch(delim) {
		case 0:		/* missing delimiter --> need one unless path
						has one */
			if(!isDelim(*path)) *p++ = '\\';
			break;
		}
		if(!*path)		/* The root dir always requires a backslash */
			*p++ = '\\';
		/* Now, the delimiter is definitely there */
		p = stpcpy(p, path);
		/* Classify how the copied path ends, for the next component. */
		if(0 == (delim = isDelim(p[-1]))) {
			if(p[-1] == ':')
				delim = 2;
		}
	}

	if(nam)	{
		if(!delim) {
			if(!isDelim(*nam))
				*p++ = '\\';
			delim = 1;
		}
		p = stpcpy(p, nam);
	}

	if(ext) {
		if(!delim) {
			*p++ = '\\';
			delim = 1;
		}
		*p++ = '.';
		strcpy(p, ext);
	}

	DBG_RETURN_S( fnam)
}
Esempio n. 14
0
void block::printOut(ofstream* fOut,ifstream * fInput,int t, map<string,bool> * printList){
	//******* okay, let's try this; this may be a fairly complicated way to write out the .pde for compiling
	//******* and uploading, but it seems to work okay. This function reads a block of code from a file handed from the functions
	//******* below, parses it to find references to contained blocks or dropdowns, and properly formats it
	
	bool printed=printList->find(title)->second;
	bool partP=partnerWritten(printList);
	
	//-------- init the buffer, the pos counters and the end flag
	string buffer;
	int strtPos=0,endPos=0;
	bool bEnd=false;
	
	//-------- while we're not at the end of the file and we haven't reached the end of the read section
	while ((*fInput).peek()!=EOF&&!bEnd) {
		
		//-------- get a line and reset the position counters
		getline((*fInput),buffer);
		strtPos=endPos=0;
		
		//-------- write t amount of tabs to fOut
		for (int i=0; i<t&&buffer.compare("}"); i++) {
			*fOut << "\t";
		}
		//-------- if the buffer is not a single '}', then parse the line; otherwise, it indicates the end of the section
		if(buffer.compare("}")){
			//-------- init the foundTab var, and start stepping through the buffer
			int foundTab=0;
			for (unsigned int i=0; i<buffer.length(); i++) {
				//-------- if we find a '$'
				if(buffer[i]=='$'){
					//-------- step through the buffer until you find [ ,\n;")]
					//-------- NOTE(review): unguarded scan -- relies on the
					//-------- line always containing a closing delimiter
					//-------- after '$'; confirm the template files
					//-------- guarantee this, otherwise endPos runs past
					//-------- the end of the buffer.
					strtPos=endPos=++i;
					while (!isDelim(buffer[endPos]," ,\n;\")")) {
						endPos++;
					}
					i=endPos-1; // move the step counter to the end of the word
					
					//-------- once you have found the segment of code after the $, store it in "temp" and split the string by "[]"
					//-------- doing this allows you to find which dropdown to look at (most often, it's 0)
					string temp(buffer,strtPos,endPos-strtPos);
					vector<string> tempVec=ofSplitString(temp, "[]");
					
					//-------- pos stores the vector position of the dd you are looking for 
					int pos=0;
					if(tempVec.size()>1){
						pos=atoi(tempVec[1].c_str());
					}
					
					//-------- reassemble the string without the []
					if(tempVec.size()>2) temp = tempVec[0] + tempVec[2];
					
					if(temp.compare("blockIn")==0&&buffer[i+1]==';') i++;
					//-------- generate the map for use in the switch
					//-------- NOTE(review): list.find(temp)->second
					//-------- dereferences end() when temp is not one of
					//-------- the six keys below -- confirm all template
					//-------- '$' keywords are covered by this map.
					map<string,int> list;
					list["dd.num"]=0;
					list["dd.str"]=1;
          list["dd.ind"]=2;
					list["blockOn"]=3;
					list["blockIn"]=4;
					list["blockIf"]=5;
					switch (list.find(temp)->second) {
						case 0:
							//-------- if temp=="dd.num" write the value of the "pos" dropdown
							*fOut << ddGroup[pos].getValue();
							break;
						case 1:
							//-------- if temp=="dd.str" write the string of the value stored in the dd[pos]
							*fOut << ddGroup[pos].getString();
							break;
            case 2:
							//-------- if temp=="dd.ind" write the index of the value stored in the dd[pos]
							*fOut << ddGroup[pos].getIndex();
							break;
						case 3:
							//-------- if temp==blockOn, printout the loop function for each of the blocks on
							for(unsigned int i=0; i<blocksOn.size(); i++){
								blocksOn[i].printData("loop(){",fOut,t,printList);
							}
							break;
						case 4:{
							//-------- if temp==blockIn, printout the loop for each of the blocks inside
							//-------- the seekp calls rewind the output to trim characters already written
							int outP=fOut->tellp();
							fOut->seekp(outP-2);
							for(unsigned int i=0; i<blocksIn.size(); i++){
								blocksIn[i].printData("loop(){",fOut,t+1,printList,false);
							}
							resetList(*this, *printList);
							for(unsigned int i=0; i<blocksIn.size(); i++){
								//blocksIn[i].printData("end(){",fOut,t+1,printList);
							}
							outP=fOut->tellp();
							fOut->seekp(outP-1);
							break;
						}
						case 5:
							//-------- if temp==blockIf, print the numBlocks
							//-------- just realized this won't work with more than one block, need to do it like the
							//-------- dd blocks above
							//-------- TODO: don't be a dumbshit
							for(unsigned int i=0; i<numBlocks.size(); i++){
								if(!numBlocks[i].placeHolder)
									numBlocks[i].printData("loop(){",fOut,0,printList);
								else {
									*fOut << "0";
								}

							}
							break;
						default:
							break;
					}
				}
				//-------- if the buff line is preceded by a '@', print the line only if the block has not already appeared
				else if(buffer[i]=='@'){
					if (printed) i=buffer.length();
				}
				//-------- if the buffer at the current pos is '~', print only if the block or a complement block
				//-------- has not been printed before
				else if(buffer[i]=='~'){
					if (partP) i=buffer.length();
				}
				//-------- increment the foundtab count if we find a '\t'
				else if(buffer[i]=='\t'){
					foundTab++;
					if (foundTab>1) *fOut << buffer[i];
				}
				//-------- if nothing else, printout the character
				else if(buffer[i]!='\n'&&buffer[i]!='\r') *fOut << buffer[i];
			}
			//-------- newline after buffer if it was not a numblock
			//-------- TODO: figure out why no new line if printed already
			if(!numBlock) *fOut << '\n';
		}
		else {
			//-------- if we found a '}' by itself, end.
			(*fInput).seekg (0, ios::end);
			bEnd=true;
		}
	}
}
Esempio n. 15
0
 // Register d as a delimiter character, unless it is one already.
 // NOTE(review): there is no capacity check before writing at
 // mDelims[mNumDelim] -- confirm the array is sized for the maximum
 // number of delimiters callers may add.
 void Tokenizer::addDelimiter(char d) {
   if (!isDelim(d)) {
     mDelims[mNumDelim] = d;
     mNumDelim++;
   }
 }
Esempio n. 16
0
/** Feed the LineInfoList for the given text
  *
  * \param vText  The text to draw
  * \param vWidth The allowed width
  * \param vOut   The LineInfoList to feed
  * \param vWrap  The word wrap parameter
  *
  */
void RainbruRPG::OgreGui::Font::
processText( const std::string& vText, float vWidth, 
	     LineInfoList& vOut, bool vWrap)const{

  // Get the total size of the text
  unsigned int count = (unsigned int)vText.size( );

  // Stores pixel width of line and word
  float lineWidth = 0.0f;
  float wordWidth = 0.0f;

  // Stores current word
  std::string word;

  // Stores the current line
  std::string line;

  unsigned int x;
  for ( x = 0; x < count; x++ ){
    char c = vText[x];

    // Add the new character to the current word
    Glyph* gl=getGlyph(c);
    wordWidth += gl->getSpace();
    word += c;

    bool delim=isDelim(c);
    if ( delim || ( x == ( count-1 ) ) ){
      // Is this line too long to fit?
      if ( vWrap && ( lineWidth + wordWidth > vWidth ) ){
	// Save current line
	vOut.push_back( LineInfo( line, lineWidth ) );

	// Reset line width
	lineWidth = 0.0f;
	line = "";
      }
      if ( c == '\n' ){
	// Save current line
	vOut.push_back( LineInfo( line + word, lineWidth + wordWidth ) );

	// Reset line width
	lineWidth = 0.0f;
	wordWidth = 0.0f;

	line = "";
	word = "";
      }
      else{
	lineWidth += wordWidth;
	line += word;

	wordWidth = 0.0f;
	word = "";
      }
    }
  }
  
  // Push any remaining text onto list
  vOut.push_back( LineInfo( line + word, lineWidth + wordWidth ) );
}
Esempio n. 17
0
void Invocation::loadCFGFile() {
	ifstream * in = new ifstream(params->getCFGFileName());
	if (in->fail()) {
		in->close();
		delete in;
		if (params->getIsCFGFileSet()) warningsQueue.pb("configuration file \"" + params->getCFGFileName() + "\" was not found");
		return;
	}
	string s, curParam, curValue;
	int ls, x;
	bool ok;
	while (getline(*in, s)) {
		trim(s);
		if (s[0] == '[' || s[0] == ';') continue;
		ls = s.length();
		x = -1;
		for (int i = 0; i < ls; ++i)
			if (isDelim(s[i]) || s[i] == '=') {
				x = i;
				break;
			}
		if (x == -1) {
			warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message");
			continue;
		}
		curParam = lowercase(s.substr(0, x));
		if (curParam[0] != '-') curParam = "-" + curParam;
		while (x < ls && s[x] != '=') ++x;
		if (x++ >= ls) {
			warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message");
			continue;
		}
		while (x < ls && isDelim(s[x])) ++x;
		if (x >= ls) {
			warningsQueue.pb("error in configuration file format :: string \"" + s + "\" is incorrect. Use \"parameter=value\" definitions. Run with \"-helpconfig\" flag to see help message");
			continue;
		}
		curValue = s.substr(x);
		if (!flagsDict.count(curParam)) {
			warningsQueue.pb("unknown parameter \"" + curParam + "\" is defined in configuration file");
			continue;
		}
		ok = !definedParams.count(curParam);
		if (!ok) warningsQueue.pb("parameter \"" + curParam + "\" redefinition if configuration file. Last definition is accepted");
		int intValue;
		if (curParam == "-c") params->setCheckerFileName(curValue); else
		if (curParam == "-checkertl") {
			intValue = toi(curValue);
			if (intValue == -1) {
				warningsQueue.pb("parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found in configuration file");
				continue;
			}
			params->setCheckerTimeLimit(intValue);
		} else
		if (curParam == "-i") params->setInputFileName(curValue); else
		if (curParam == "-ml") {
			intValue = toi(curValue);
			if (intValue == -1) {
				warningsQueue.pb("configuration file :: parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found");
				continue;
			}
			params->setMemoryLimit(intValue);
		} else
		if (curParam == "-o") params->setOutputFileName(curValue); else
		if (curParam == "-p") params->setProgramFileName(curValue); else
		if (curParam == "-tc") {
			intValue = toi(curValue);
			if (intValue == -1) {
				warningsQueue.pb("configuration file :: parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found");
				continue;
			}
			params->setTestsCount(intValue);
		} else
		if (curParam == "-tim") params->getInputFileMask()->setFileMask(curValue); else
		if (curParam == "-tl") {
			intValue = toi(curValue);
			if (intValue == -1) {
				warningsQueue.pb("parameter \"" + curParam + "\" - expected value is a number but \"" + curValue + "\" found in configuration file");
				continue;
			}
			params->setTimeLimit(intValue);
		} else
		if (curParam == "-tom") params->getOutputFileMask()->setFileMask(curValue); else 
		if (curParam != "-nowarnings") {
			ok = false;
			warningsQueue.pb("parameter \"" + curParam + "\" can not be defined in configuration file");
		}
		if (ok) definedParams.insert(curParam);
	}
	in->close();
	delete in;
}
/**
 * Generate a new token.  There are basically three types of token we can
 *  generate:
 * - A porter stemmed token.  This is a word entirely comprised of ASCII
 *    characters.  We run the porter stemmer algorithm against the word.
 *    Because we have no way to know what is and is not an English word
 *    (the only language for which the porter stemmer was designed), this
 *    could theoretically map multiple words that are not variations of the
 *    same word down to the same root, resulting in potentially unexpected
 *    result inclusions in the search results.  We accept this result because
 *    there's not a lot we can do about it and false positives are much
 *    better than false negatives.
 * - A copied token; case/accent-folded but not stemmed.  We call the porter
 *    stemmer for all non-CJK cases and it diverts to the copy stemmer if it
 *    sees any non-ASCII characters (after folding) or if the string is too
 *    long.  The copy stemmer will shrink the string if it is deemed too long.
 * - A bi-gram token; two CJK-ish characters.  For query reasons we generate a
 *    series of overlapping bi-grams.  (We can't require the user to start their
 *    search based on the arbitrary context of the indexed documents.)
 *
 * It may be useful to think of this function as operating at the points between
 *  characters.  While we are considering the 'current' character (the one after
 *  the 'point'), we are also interested in the 'previous' character (the one
 *  preceding the point).
 * At any 'point', there are a number of possible situations which I will
 *  illustrate with pairs of characters. 'a' means alphanumeric ASCII or a
 *  non-ASCII character that is not bi-grammable or a delimeter, '.'
 *  means a delimiter (space or punctuation), '&' means a bi-grammable
 *  character.
 * - aa: We are in the midst of a token.  State remains BIGRAM_ALPHA.
 * - a.: We will generate a porter stemmed or copied token.  State was
 *        BIGRAM_ALPHA, gets set to BIGRAM_RESET.
 * - a&: We will generate a porter stemmed or copied token; we will set our
 *        state to BIGRAM_UNKNOWN to indicate we have seen one bigram character
 *        but that it is not yet time to emit a bigram.
 * - .a: We are starting a token.  State was BIGRAM_RESET, gets set to
 *        BIGRAM_ALPHA.
 * - ..: We skip/eat the delimeters.  State stays BIGRAM_RESET.
 * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char.
 * - &a: If the state was BIGRAM_USE, we generate a bi-gram token.  If the state
 *        was BIGRAM_UNKNOWN we had only seen one CJK character and so don't do
 *        anything.  State is set to BIGRAM_ALPHA.
 * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET.
 * - &&: We will generate a bi-gram token.  State was either BIGRAM_UNKNOWN or
 *        BIGRAM_USE, gets set to BIGRAM_USE.
 */
static int porterNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
  const char **pzToken,               /* OUT: *pzToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  const unsigned char *z = (unsigned char *) c->zInput;
  int len = 0;
  int state;

  while( c->iOffset < c->nInput ){
    int iStartOffset, numChars;

    /*
     * This loop basically has two modes of operation:
     * - general processing (iPrevBigramOffset == 0 here)
     * - CJK processing (iPrevBigramOffset != 0 here)
     *
     * In an general processing pass we skip over all the delimiters, leaving us
     *  at a character that promises to produce a token.  This could be a CJK
     *  token (state == BIGRAM_USE) or an ALPHA token (state == BIGRAM_ALPHA).
     * If it was a CJK token, we transition into CJK state for the next loop.
     * If it was an alpha token, our current offset is pointing at a delimiter
     *  (which could be a CJK character), so it is good that our next pass
     *  through the function and loop will skip over any delimiters.  If the
     *  delimiter we hit was a CJK character, the next time through we will
     *  not treat it as a delimiter though; the entry state for that scan is
     *  BIGRAM_RESET so the transition is not treated as a delimiter!
     * 
     * The CJK pass always starts with the second character in a bi-gram emitted
     *  as a token in the previous step.  No delimiter skipping is required
     *  because we know that first character might produce a token for us.  It
     *  only 'might' produce a token because the previous pass performed no
     *  lookahead and cannot be sure it is followed by another CJK character.
     *  This is why 
     */

    // If we have a previous bigram offset
    if (c->iPrevBigramOffset == 0) {
      /* Scan past delimiter characters */
      state = BIGRAM_RESET; /* reset */
      while (c->iOffset < c->nInput &&
             isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
        c->iOffset += len;
      }

    } else {
      /* for bigram indexing, use previous offset */
      c->iOffset = c->iPrevBigramOffset;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    numChars = 0;

    // Start from a reset state.  This means the first character we see
    //  (which will not be a delimiter) determines which of ALPHA or CJK modes
    //  we are operating in.  (It won't be a delimiter because in a 'general'
    //  pass as defined above, we will have eaten all the delimiters, and in
    //  a CJK pass we are guaranteed that the first character is CJK.)
    state = BIGRAM_RESET; /* state is reset */
    // Advance until it is time to emit a token.
    // For ALPHA characters, this means advancing until we encounter a delimiter
    //  or a CJK character.  iOffset will be pointing at the delimiter or CJK
    //  character, aka one beyond the last ALPHA character.
    // For CJK characters this means advancing until we encounter an ALPHA
    //  character, a delimiter, or we have seen two consecutive CJK
    //  characters.  iOffset points at the ALPHA/delimiter in the first 2 cases
    //  and the second of two CJK characters in the last case.
    // Because of the way this loop is structured, iOffset is only updated
    //  when we don't terminate.  However, if we terminate, len still contains
    //  the number of bytes in the character found at iOffset.  (This is useful
    //  in the CJK case.)
    while (c->iOffset < c->nInput &&
           !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
      c->iOffset += len;
      numChars++;
    }

    if (state == BIGRAM_USE) {
      /* Split word by bigram */
      // Right now iOffset is pointing at the second character in a pair.
      //  Save this offset so next-time through we start with that as the
      //  first character.
      c->iPrevBigramOffset = c->iOffset;
      // And now advance so that iOffset is pointing at the character after
      //  the second character in the bi-gram pair.  Also count the char.
      c->iOffset += len;
      numChars++;
    } else {
      /* Reset bigram offset */
      c->iPrevBigramOffset = 0;
    }

    /* We emit a token if:
     *  - there are two ideograms together,
     *  - there are three chars or more,
     *  - we think this is a query and wildcard magic is desired.
     * We think is a wildcard query when we have a single character, it starts
     *  at the start of the buffer, it's CJK, our current offset is one shy of
     *  nInput and the character at iOffset is '*'.  Because the state gets
     *  clobbered by the incidence of '*' our requirement for CJK is that the
     *  implied character length is at least 3 given that it takes at least 3
     *  bytes to encode to 0x2000.
     */
    // It is possible we have no token to emit here if iPrevBigramOffset was not
    //  0 on entry and there was no second CJK character.  iPrevBigramOffset
    //  will now be 0 if that is the case (and c->iOffset == iStartOffset).
    if (// allow two-character words only if in bigram
        (numChars == 2 && state == BIGRAM_USE) ||
        // otherwise, drop two-letter words (considered stop-words)
        (numChars >=3) ||
        // wildcard case:
        (numChars == 1 && iStartOffset == 0 &&
         (c->iOffset >= 3) &&
         (c->iOffset == c->nInput - 1) &&
         (z[c->iOffset] == '*'))) {
      /* figure out the number of bytes to copy/stem */
      int n = c->iOffset - iStartOffset;
      /* make sure there is enough buffer space */
      if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
        /* Use a temporary pointer so the old buffer is not leaked when
         * sqlite3_realloc() fails (assigning its NULL result directly to
         * c->zToken would orphan the previous allocation). */
        void *pNew;
        c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
        pNew = sqlite3_realloc(c->zToken, c->nAllocated);
        if (pNew == NULL)
          return SQLITE_NOMEM;
        c->zToken = pNew;
      }

      if (state == BIGRAM_USE) {
        /* This is by bigram. So it is unnecessary to convert word */
        copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      } else {
        porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
      }
      *pzToken = (const char*) c->zToken;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}
Esempio n. 19
0
// Advance currentPosition past any run of delimiter characters.
void Tokenizer::skipDelim(void) {
	std::string::iterator it = currentPosition;
	while (isDelim(*it)) {
		++it;
	}
	currentPosition = it;
}
Esempio n. 20
0
/* Validate a time string of the form "H[H]<delim>M[M]" via a small state
 * machine.  Hours must be 0-23 and minutes 0-59, each at most two digits.
 * *pState receives the final state; returns NO_ERR when the scan ends in
 * the hour or minute state, ERROR otherwise.
 * NOTE(review): ending in tm_state_hour means a bare hour with no minutes
 * (e.g. "12") is accepted, and tm_state_min accepts a single minute digit
 * (e.g. "12:3") -- confirm both are intended.
 * NOTE(review): atoi(value) passes a U8[3] (unsigned char array); this
 * relies on an implicit U8* -> char* conversion -- confirm it compiles
 * cleanly on all target toolchains.
 */
U8 timeLegal(U8* timeStr, U16 strLen, em_time_state* pState)
{
	U16 i = 0, digitalLen = 0;
	U8 data, value[3] = { 0 };
	em_time_state state = tm_state_init;

	for (i = 0; i < strLen; i++) {
		data = timeStr[i];
		//printf("data: %c\n", data);
		switch (state) {
		case tm_state_init:
			/* First character must be a digit (start of the hour). */
			if (isdigit(data)) {
				value[digitalLen] = data;
				digitalLen++;
				state = tm_state_hour;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_hour:
			/* Accumulate up to two hour digits; reject values > 23. */
			if (isdigit(data)) {
				if (digitalLen == 2) {
					state = tm_state_end_illegal;
					goto result;
				}
				value[digitalLen] = data;
				if (atoi(value) > 23) {
					state = tm_state_end_illegal;
					goto result;
				}
				digitalLen++;
			} else if (isDelim(data)) {
				/* Delimiter ends the hour field; reset for minutes. */
				digitalLen = 0;
				memset(value, 0, sizeof(value));
				state = tm_state_delim;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_delim:
			/* A digit must follow the delimiter (start of the minutes). */
			if (isdigit(data)) {
				value[digitalLen] = data;
				digitalLen++;
				state = tm_state_min;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		case tm_state_min:
			/* Accumulate up to two minute digits; reject values > 59. */
			if (isdigit(data)) {
				if (digitalLen == 2) {
					state = tm_state_end_illegal;
					goto result;
				}
				value[digitalLen] = data;
				if (atoi(value) > 59) {
					state = tm_state_end_illegal;
					goto result;
				}
				digitalLen++;
			} else {
				state = tm_state_end_illegal;
				goto result;
			}
			break;
		default:
			break;
		}
	}


result:
	//printf("reach result\n");
	*pState = state;
	if (state == tm_state_hour || state == tm_state_min)
		return NO_ERR;
	else
		return ERROR;
}