static int tokenize_inv_spec(OE oe) {
	Tokenizer tk = FunCallTokenizer_New(oe);
	tk->start((byte*)"INV(1,2,3);",11);

	Token inv = {0};
	Token one = {0};
	Token two = {0};
	Token three = {0};
	tk->nextToken(&inv);
	tk->nextToken(&one);
	tk->nextToken(&two);
	tk->nextToken(&three);
	FunCallTokenizer_Destroy(&tk);

	return inv.type == INV
	    && one.type == NUM && one.value.num == 1
	    && two.type == NUM && two.value.num == 2
	    && three.type == NUM && three.value.num == 3;
}
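The OE, Tokenizer, Token, and FunCallTokenizer_* names in these tests belong to the project under test and are not defined in this listing. The standalone C++ sketch below only illustrates the interface shape the tests appear to rely on (a start/nextToken pair producing INV/AND/XOR/NUM/DONE tokens with a numeric value); the enum values, the value layout, and SimpleTokenizer itself are assumptions, not the real FunCallTokenizer.

// Minimal stand-in inferred from the tests above; not the real FunCallTokenizer.
#include <cctype>
#include <cstdio>
#include <cstring>

typedef unsigned char byte;

enum TokenType { DONE, NUM, AND, XOR, INV };            // assumed token kinds

struct Token {
    TokenType type;
    struct { long num; } value;                         // tests read t.value.num
};

struct SimpleTokenizer {
    const byte *buf;
    unsigned len, pos;

    void start(const byte *b, unsigned l) { buf = b; len = l; pos = 0; }

    void nextToken(Token *t) {
        while (pos < len && !isalnum(buf[pos])) ++pos;  // skip punctuation
        if (pos >= len) { t->type = DONE; return; }
        if (isdigit(buf[pos])) {                        // number literal
            long v = 0;
            while (pos < len && isdigit(buf[pos])) v = v * 10 + (buf[pos++] - '0');
            t->type = NUM;
            t->value.num = v;
            return;
        }
        unsigned s = pos;                               // keyword: INV/AND/XOR
        while (pos < len && isalpha(buf[pos])) ++pos;
        if      (!strncmp((const char *)buf + s, "INV", 3)) t->type = INV;
        else if (!strncmp((const char *)buf + s, "AND", 3)) t->type = AND;
        else if (!strncmp((const char *)buf + s, "XOR", 3)) t->type = XOR;
        else t->type = DONE;                            // unknown word ends this sketch
    }
};

int main() {
    SimpleTokenizer tk = { 0, 0, 0 };
    Token t = { DONE, { 0 } };
    tk.start((const byte *)"INV(1,2,3);", 11);
    do {
        tk.nextToken(&t);
        printf("type=%d num=%ld\n", (int)t.type, t.value.num);
    } while (t.type != DONE);
    return 0;
}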
Example #2
void Semantics::setMatchMode(){

	Tokenizer tknzr = Tokenizer(this->ElementId,"-");
	this->ElementId = tknzr.nextToken();
	string str = tknzr.currToken();

	// MatchMode is true only if the part after the '-' is a whole number
	if(str.empty() || ((!isdigit(str[0])) && (str[0] != '-') && (str[0] != '+'))){
		this->MatchMode = false;
	}
	else{
		char *p = NULL;
		strtol(str.c_str(), &p, 10);
		this->MatchMode = (*p == 0);
	}

	if(tknzr.hasMoreTokens()){
		this->SemanticsId = tknzr.nextToken();
	}
}
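setMatchMode decides whether the trailing part of an element id is numeric by checking strtol's end pointer. Below is a small standalone C++ sketch of that idiom under the same interpretation; isWholeNumber is a hypothetical helper used only for illustration, not part of the project.

#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical helper showing the strtol end-pointer check used above:
// the token counts as numeric only if strtol consumes the entire string.
static bool isWholeNumber(const std::string &s) {
    if (s.empty()) return false;
    char *end = NULL;
    std::strtol(s.c_str(), &end, 10);
    return *end == '\0';
}

int main() {
    std::printf("%d %d %d\n", isWholeNumber("12"), isWholeNumber("12a"), isWholeNumber(""));
    return 0;   // prints: 1 0 0
}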
static int tokenize_file(OE oe) {
	uint lbuffer = 1060365;
	uint ands = 0, xors = 0, invs = 0, nums = 0, tokens = 0;
	Tokenizer tk = 0;
	Token tok = {0};
	byte * buffer = oe->getmem(lbuffer);
	uint fp = 0;
	oe->open("file ../test/AES",&fp);
	oe->read(fp,buffer,&lbuffer);
	oe->close(fp);
	tk = FunCallTokenizer_New(oe);
	tk->start(buffer,lbuffer);

	do {
		tk->nextToken(&tok);
		if (tok.type == INV) invs += 1;
		if (tok.type == AND) ands +=1;
		if (tok.type == XOR) xors += 1;
		if (tok.type == NUM) nums += 1;
		tokens++;
	} while(tok.type != DONE);

	DBG_P(oe,"\nANDS: %u\nXORS: %u\nINVS: %u\nNUMS: %u\nTOKENS: %u\n",ands,xors,invs,nums,tokens);
	oe->putmem(buffer);
	return ands == 6800 && xors == 24448 && nums == 139136 && tokens == 172076 && invs == 1691;
}
void XapianIndex::addTermsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	Xapian::Stem *pStemmer = NULL;
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			// R-prefix the raw term
			doc.add_posting(string("R") + term, termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(limitTermLength(prefix + term), termPos++);
		}
		else if (mode == STORE_STEM)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++);
		}
		else if (mode == STORE_BOTH)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			// Add both
			doc.add_posting(limitTermLength(prefix + term), termPos);
			// ...at the same position
			doc.add_posting(limitTermLength(prefix + stemmedTerm), termPos++);
		}
	}
#ifdef DEBUG
	cout << "XapianIndex::addTermsToDocument: added " << termPos << " terms" << endl;
#endif

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
static int tokenize_inv(OE oe) {
	Tokenizer tk = FunCallTokenizer_New(oe);
	tk->start((byte*)"INV",3);
	Token t = {0};
	tk->nextToken(&t);
	FunCallTokenizer_Destroy(&tk);
	return t.type == INV;
}
Example #6
void Line::infToPost(Tokenizer& tokens, string& errs)
/*****************************************************************************/
{
  stack<Operator*> ops;

  //loop through all tokens and handle them
  for ( ; !tokens.eol(); tokens.nextToken()) {
    if (tokens.token().type() == FUNC) {
      process_function(tokens, errs);
      ++_tempSize; //for the return value
    }
    else if (tokens.token().type() == DECL) {
      process_newvar(tokens, errs);
    }
    else if (tokens.token().type() == CONSTANT) {
      process_number(tokens);
    }
    else if (tokens.token().type() == VAR) {
      process_existing_var(tokens, errs);
    }
    else if (tokens.token().type() == OPERATOR) {
      process_operator(tokens, ops);
    }
    else if (tokens.token().type() == SEMICOLON ||
	     tokens.token().type() == OPENBRACE) {
      tokens.nextToken();
      assert(tokens.eol());
      break;
    }
    else {
      CHECKERR(true, syntax_err(tokens.token().value()))
    }
    if (errs != "") return;
  }

  //put remaining operators at the end of postfixLine
  while (!ops.empty()) {
    addNewObject(ops.top());
    ops.pop();
  }

  compile(errs, tokens);
  performNumericOps();
}
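The Line and Tokenizer classes used by infToPost are not shown in this listing. The self-contained C++ sketch below only illustrates the operator-stack (shunting-yard) idea behind the infix-to-postfix conversion, restricted to single-digit operands and the four basic operators; it is not the project's implementation.

#include <cctype>
#include <iostream>
#include <map>
#include <stack>
#include <string>

static std::string infixToPostfix(const std::string &expr) {
    std::map<char, int> prec = { {'+', 1}, {'-', 1}, {'*', 2}, {'/', 2} };
    std::stack<char> ops;
    std::string out;

    for (char c : expr) {
        if (std::isdigit((unsigned char)c)) {
            out += c;                           // operands go straight to output
        } else if (prec.count(c)) {
            // pop operators of higher or equal precedence first
            while (!ops.empty() && prec[ops.top()] >= prec[c]) {
                out += ops.top();
                ops.pop();
            }
            ops.push(c);
        }                                       // (parentheses omitted for brevity)
    }
    while (!ops.empty()) {                      // flush remaining operators
        out += ops.top();
        ops.pop();
    }
    return out;
}

int main() {
    std::cout << infixToPostfix("3+4*2") << std::endl;   // prints 342*+
    return 0;
}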
Example #7
void Semantics::setGmshCommand(const string& Command){

	int nofTokens = 0;
	string Gcommand = "", essiTag="";
	Tokenizer inpString = Tokenizer(Command," {,;}()");
	nofTokens = inpString.countTokens()-1;
	this->NofGmshVariables = nofTokens-1;
	Gcommand = Gcommand + inpString.nextToken() + "{ ";
	essiTag = essiTag + inpString.currToken() + "{";
	
	for( int i=0 ;i<nofTokens-1; i++){

		string variable= this->delSpaces(inpString.nextToken());
		
		vector<string>::iterator it;
		it = find(this->VarList.begin(),this->VarList.end(),variable);
		
		if (it != this->VarList.end()) 
 			*it = "variable";

		Gcommand = Gcommand +variable+" ,";
		essiTag = essiTag + " ,";
	}

	string variable= this->delSpaces(inpString.nextToken());

	if(variable.compare("")){
		this->NofGmshVariables++;
	}

	vector<string>::iterator it;
	it = find(this->VarList.begin(),this->VarList.end(),variable);
	if (it != this->VarList.end()) 
 		*it = "variable";
 	
	Gcommand = Gcommand +variable + " }";
	essiTag = essiTag + " }"+to_string(this->NofGmshVariables);
	
	// cout << Gcommand << endl;
	// cout << essiTag << endl;
	this->GmshCommand= Gcommand;
	this->setEssiTag(essiTag);

}
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, const string &language, StemmingMode mode) const
{
	Xapian::TermIterator termListIter = doc.termlist_begin();
	Xapian::Stem *pStemmer = NULL;
	string term;

	// Do we know what language to use for stemming ?
	if (language.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(language));
	}

	// Get the terms and remove the first posting for each
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			// R-prefix the raw term
			removeFirstPosting(doc, termListIter, string("R") + term);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			removeFirstPosting(doc, termListIter, limitTermLength(prefix + term));
		}
		else if (mode == STORE_STEM)
		{
			removeFirstPosting(doc, termListIter, limitTermLength(prefix + pStemmer->stem_word(term)));
		}
		else if (mode == STORE_BOTH)
		{
			string stemmedTerm = pStemmer->stem_word(term);

			removeFirstPosting(doc, termListIter, limitTermLength(prefix + term));
			if (stemmedTerm != term)
			{
				removeFirstPosting(doc, termListIter, limitTermLength(prefix + stemmedTerm));
			}
		}
	}

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
static int tokenize_num(OE oe) {
	Tokenizer tk = FunCallTokenizer_New(oe);
	tk->start((byte*)"145",3);
	Token t = {0};
	tk->nextToken(&t);
	FunCallTokenizer_Destroy(&tk);
	return t.type == NUM && t.value.num == 145;
}
Example #10
void Mapping::makeFunction(string Id, string GmshCommandList, string EssiCommand) {

    Tokenizer str = Tokenizer(GmshCommandList,"|");
    while(str.hasMoreTokens()) {

        Semantics semantic = Semantics( str.nextToken(),EssiCommand);
        semantic.setElementId(Id);
        this->Function.insert(pair<string,Semantics> (semantic.getEssiTag(),semantic));
    }
}
Example #11
void Line::process_newvar(Tokenizer& tokens, string& errs)
/*****************************************************************************/
{
  string type = tokens.token().value();
  tokens.nextToken();

  CHECKERR(tokens.eol() || tokens.token().type() != VAR, var_err())

  string name = tokens.token().value();
  tokens.nextToken();

  bool isArray = false;
  Line* sizePtr = NULL;

  //if the following token is an open index, we know that our new variable
  //is an array
  if (!tokens.eol() && tokens.token().type() == OPENINDEX) {
    vector<Token> size_expr;
    tokens.nextToken(); //move past openindex

    //get all the tokens that are part of the array's size expression
    while (!tokens.eol() && tokens.token().type() != CLOSEINDEX) {
      size_expr.push_back(tokens.token());
      tokens.nextToken();
    }
    CHECKERR ((size_expr.size() == 0), ara_err(name))
    CHECKERR ((tokens.token().type() != CLOSEINDEX), ara_err(name))

    isArray = true;
    Tokenizer tempToken(size_expr);
    sizePtr = new Line(tempToken, _parent, errs, true);
  }
  else
    tokens.previousToken();

  if (_parent->getVar(name) == NULL)
    add_newvar(type, name, sizePtr, isArray);
  else
    { CHECKERR(true, dec_err(name)) }
}
Example #12
void PhysicalGroup::setContents(const string& PhysicDesc){

	this->PhysicDes = PhysicDesc;

	Tokenizer tknzr = Tokenizer(PhysicDesc,"  \t\v\n\r\f\"$");
	this->Type = stoi(tknzr.nextToken());
	this->Id = stoi(tknzr.nextToken());
	this->PhysicTag = tknzr.nextToken();

	tknzr.setDelimiter("");
	boost::sregex_iterator end;
	string gmESSI_Command = trim(tknzr.nextToken());
	gmESSI_Command = gmESSI_Command.substr(0,gmESSI_Command.length()-1);

	boost::regex CheckRegex("\\[([^(\\[\\])]|\\(*\\)*)*(\\[([^(\\[\\])]|\\(*\\)*)*\\])*([^(\\[\\])]|\\(*\\)*)*\\]");
	boost::sregex_iterator its(gmESSI_Command.begin(), gmESSI_Command.end(), CheckRegex);

	for (; its != end; ++its){
		string SubgmESSI_Command = its->str();
		if(SubgmESSI_Command.compare(""))
    		this->Process(SubgmESSI_Command.substr(1,SubgmESSI_Command.length()-2));
	}
}
Example #13
void Line::process_existing_var(Tokenizer& tokens, string& errs)
/*****************************************************************************/
{
  string temp = tokens.token().value();
  Variable* v = _parent->getVar(temp);
  CHECKERR ((v == NULL), und_err(temp))

  //Note: we must allow for arrays to be passed to RTBoundFuncs without
  //having to use braces [].
  if (tokens.isArg()) {
    addNewObject(v);
    return;
  }

  //When we see an array variable, it must be followed by an index
  if (v->getObjectType() == ArrayVarOT) {
    tokens.nextToken();
    CHECKERR((tokens.eol() || tokens.token().type()!=OPENINDEX),ara_err(temp))
    tokens.nextToken(); //move past OPENINDEX
    vector<Token> index_list;

    //get all the tokens that are part of the array's index expression
    while (!tokens.eol() && tokens.token().type() != CLOSEINDEX) {
      index_list.push_back(tokens.token());
      tokens.nextToken();
    }
    CHECKERR ((index_list.size() == 0), ara_err(temp))
    CHECKERR ((tokens.eol()||tokens.token().type()!=CLOSEINDEX), ara_err(temp))

    Tokenizer tempToken(index_list);
    Line* indexPtr = new Line(tempToken, _parent, errs, true);
    ArrayIndex* ai = new ArrayIndex(v, indexPtr);
    addNewObject(ai);
  }
  else {
    addNewObject(v);
  }
}
Example #14
ConditionalBlock::
ConditionalBlock(map<string, Variable*> vars, Tokenizer& lines, string& errs) 
  : Block(vars) 
/*****************************************************************************/
{
  if (errs != "") return;

  _executed  = false;
  _condition = NULL;
  
  lines.nextToken(); //move past "if"

  //create the conditional statement
  _condition = new Line(lines, this, errs, true);
  if (errs != "") return;

  //create this block's substatements
  createSubStatements(lines, errs);
}
Example #15
int main()
{
	// todo: add a welcome message with instructions
	// todo: initialize your linked list and stack

	string printCommand = "?";
	string quitCommand = "quit";
	string input;

	Tokenizer tokenizer;
	string token;

	while(true)
	{
		getline(cin, input);
		if(input == printCommand)
		{
			// todo: print all variables in the linked list
		}
		else if(input == quitCommand)
		{
			break;
		}
		else
		{
			tokenizer.tokenize(input);
			while(tokenizer.nextToken(token))
			{
				// todo: use token and evaluate the expression
				// cout << "string token" << token << endl; // prints full string over the loop no spaces
			}
			// todo: check the result of the expression and either print it out or store it in the linked list
		}
	}

	return 0;
}
Example #16
int main() {
	try {
		TokenizerBuilder builder;
		builder.addTokenType(NUM, "[1-9][0-9]*");
		builder.addTokenType(ID, "[a-zA-Z_]\\w*");
		builder.addTokenType(PLUS, "\\+");
		builder.addTokenType(MINUS, "\\-");
		builder.addTokenType(MUL, "\\*");
		builder.addTokenType(DIV, "\\/");
		
		Tokenizer tokenizer = builder.build();
		tokenizer.setInputStream("val21+3 - 25900/_x-MAX_SIZE");
		
		
		while(tokenizer.hasNext()) {
			Token token = tokenizer.nextToken();
			cout << token.getTokenType() << ": \"" << token.getLexeme() << "\"\n";
		}
		
	} catch(std::exception& e) {
		cout << e.what() << endl;
	}
	return 0;
}
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
	Xapian::Stem *pStemmer = NULL;
	string upperCasePrefix("R");
	string term;

	// Do we know what language to use for stemming ?
	if (m_stemLanguage.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
	}

	// Terms starting with a capital letter are R-prefixed, unless a prefix is already defined
	if (prefix.empty() == false)
	{
		upperCasePrefix = prefix;
	}

	// Get the terms
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			doc.add_posting(upperCasePrefix + XapianDatabase::limitTermLength(term), termPos);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
		}
		else if (mode == STORE_STEM)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
		}
		else if (mode == STORE_BOTH)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			// Add both at the same position
			doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
			if (stemmedTerm != term)
			{
				// No point adding the same term twice
				doc.add_posting(prefix + XapianDatabase::limitTermLength(stemmedTerm), termPos);
			}
		}

		++termPos;
	}
#ifdef DEBUG
	cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
void XapianIndex::addPostingsToDocument(Tokenizer &tokens, Xapian::Document &doc,
                                        const string &prefix, Xapian::termcount &termPos, StemmingMode mode) const
{
    Xapian::Stem *pStemmer = NULL;
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (m_stemLanguage.empty() == false)
    {
        try
        {
            pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms
    while (tokens.nextToken(term) == true)
    {
        bool addStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
        }
        else if (mode == STORE_STEM)
        {
            addStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Add both
            doc.add_posting(prefix + XapianDatabase::limitTermLength(term), termPos);
            addStem = true;
        }

        // Don't stem if the term starts with a digit
        if ((addStem == true) &&
                (isdigit((int)term[0]) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            doc.add_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
        }

        ++termPos;
    }
#ifdef DEBUG
    cout << "XapianIndex::addPostingsToDocument: added " << termPos << " terms" << endl;
#endif

    if (pStemmer != NULL)
    {
        delete pStemmer;
    }
}
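The two addPostingsToDocument variants differ mainly in how stems are stored: the first adds the stem as a second posting at the same position, the second adds it as a position-less, Z-prefixed term. Below is a minimal standalone Xapian sketch of the second policy under stated assumptions: the document, terms, prefix, and language are arbitrary, and pinot's limitTermLength/StringManip helpers are omitted.

#include <cctype>
#include <iostream>
#include <string>
#include <xapian.h>

int main() {
    Xapian::Document doc;
    Xapian::Stem stemmer("english");              // language chosen arbitrarily
    Xapian::termcount termPos = 0;

    const char *raw[] = { "walking", "quickly", "42" };
    for (const char *r : raw) {
        std::string term(r);
        doc.add_posting(term, termPos);           // positional, unstemmed term
        if (isdigit((unsigned char)term[0]) == 0) // don't stem numbers
            doc.add_term("Z" + stemmer(term));    // Z-prefixed stem, no position
        ++termPos;
    }

    // List the terms that ended up in the document
    for (Xapian::TermIterator it = doc.termlist_begin(); it != doc.termlist_end(); ++it)
        std::cout << *it << std::endl;
    return 0;
}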
Example #19
void Line::process_function(Tokenizer& tokens, string& errs)
/*****************************************************************************/
{
  //create a function call object
  string temp = tokens.token().value();
  FunctionCall* funcCall = Registrar::generateCall(temp);
  CHECKERR((funcCall == NULL), func_err(temp))

  tokens.nextToken(); //move past funcname
  tokens.nextToken(); //move past open paren

  //put tokens into the argument lists.. We loop until we have seen the paren
  //that terminates this function call or until we have run out of tokens on
  //this line
  list< vector<Token> > args;
  vector<Token> currArg;
  int depth = 0;
  while(!((tokens.token().value() == ")" && depth == 0) || tokens.eol())) {

    //if we see a comma at paren depth zero, we have just reached the end of an
    //argument
    if (tokens.token().type() == COMMA && depth == 0) {
      assert(!currArg.empty());
      args.push_back(currArg);
      currArg.clear();
    }
    else {
      currArg.push_back(tokens.token());

      if (tokens.token() == Token(OPERATOR, "(", 0))
        ++depth;
      if (tokens.token() == Token(OPERATOR, ")", 0))
        --depth;
    }
    tokens.nextToken();
  }
  if (!currArg.empty())
    args.push_back(currArg);

  CHECKERR(
      tokens.eol() || tokens.token().value() != ")",
      arg_err(temp)
      )

  if (funcCall->hasVariableArgs()) {
    CHECKERR (
        args.size() < funcCall->getNumArgs(),
        arg_err(temp)
        )
  } else {
    CHECKERR(
        args.size() != funcCall->getNumArgs(),
        arg_err(temp)
        )
  }

  //Construct a Line for each argument
  list< vector<Token> >::iterator arg_itr = args.begin();
  for ( ; arg_itr != args.end(); ++arg_itr) {
    CHECKERR (((*arg_itr).size() == 0), arg_err(temp))
    Tokenizer tempToken(*arg_itr);
    Line* arg = new Line(tempToken, _parent, errs, true);
    funcCall->fillArg(arg);
  }

  addNewObject(funcCall);
}
Example #20
void Mapping::mapFile() {

    fstream mapFile(this->FileName, fstream::in);
    string line;

    while(getline(mapFile,line)) {

        Tokenizer str = Tokenizer(line,"#  \t\v\n\r\f");
        if(!delSpaces(str.nextToken()).compare("ELEMENT_ID"))
            break;
    }

    while(getline(mapFile,line)) {

        string strLine = delSpaces(line);
        Tokenizer str = Tokenizer(line,"#  \t\v\n\r\f\"");

        if(!str.nextToken().compare("ENDELEMENT_ID"))
            break;
        if(!str.currToken().substr(0,2).compare("//"))
            continue;
        if(delSpaces(str.currToken()).length()==0)
            continue;

        string elementDes="";
        string elementId = "";

        elementId = elementId + str.currToken();
        elementDes= elementDes + str.nextToken();
        str.setDelimiter("\"");
        elementDes= elementDes + str.nextToken();

        this->ElementMap.insert(pair<string,string>(elementId,elementDes));
    }

    while(getline(mapFile,line)) {

        Tokenizer str = Tokenizer(line,"#  \t\v\n\r\f");
        if(!delSpaces(str.nextToken()).compare("ESSI_TAGS"))
            break;
    }

    while(getline(mapFile,line)) {

        Tokenizer str = Tokenizer(line,"#  \t\v\n\r\f");
        if(!delSpaces(str.nextToken()).compare("ENDESSI_TAGS"))
            break;
        if(!delSpaces(str.currToken()).substr(0,2).compare("//"))
            continue;
        if(delSpaces(str.currToken()).length()==0)
            continue;

        this->EssiTagList.insert(str.currToken());
    }

    while(getline(mapFile,line)) {

        string Id, GmshCommandList, EssiCommand="";
        Tokenizer str = Tokenizer(line,"!  \t\v\n\r\f");

        if(!delSpaces(str.nextToken()).substr(0,2).compare("//"))
            continue;
        if(delSpaces(str.currToken()).length()==0)
            continue;

        Id = str.currToken();
        str.setDelimiter("<>");

        if(delSpaces(str.nextToken()).length()==0)
            GmshCommandList = delSpaces(str.nextToken());
        else
            GmshCommandList = delSpaces(str.currToken());

        str.setDelimiter("  \t\v\n\r\f");

        EssiCommand= EssiCommand + delSpaces(str.nextToken())+" ";
        str.setDelimiter("<>");
        EssiCommand= EssiCommand + str.nextToken();

        this->makeFunction(Id, GmshCommandList,EssiCommand);
    }

    mapFile.close();
}
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
	const string &prefix, const string &language, StemmingMode mode) const
{
	Xapian::TermIterator termListIter = doc.termlist_begin();
	Xapian::Stem *pStemmer = NULL;
	string upperCasePrefix("R");
	string term;

	// Do we know what language to use for stemming ?
	if (language.empty() == false)
	{
		pStemmer = new Xapian::Stem(StringManip::toLowerCase(language));
	}

	// Terms starting with a capital letter are R-prefixed, unless a prefix is already defined
	if (prefix.empty() == false)
	{
		upperCasePrefix = prefix;
	}

	// Get the terms and remove the first posting for each
	while (tokens.nextToken(term) == true)
	{
		if (term.empty() == true)
		{
			continue;
		}
		// Does it start with a capital letter ?
		if (isupper((int)term[0]) != 0)
		{
			removeFirstPosting(doc, termListIter, upperCasePrefix + term);
		}
		// Lower case the term
		term = StringManip::toLowerCase(term);

		// Stem the term ?
		if ((mode == STORE_UNSTEM) ||
			(pStemmer == NULL))
		{
			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
		}
		else if (mode == STORE_STEM)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm));
		}
		else if (mode == STORE_BOTH)
		{
#if XAPIAN_MAJOR_VERSION==0
			string stemmedTerm(pStemmer->stem_word(term));
#else
			string stemmedTerm((*pStemmer)(term));
#endif

			removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
			if (stemmedTerm != term)
			{
				removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(stemmedTerm));
			}
		}
	}

	if (pStemmer != NULL)
	{
		delete pStemmer;
	}
}
void XapianIndex::removeFirstPostingsFromDocument(Tokenizer &tokens, Xapian::Document &doc,
        const string &prefix, const string &language, StemmingMode mode) const
{
    Xapian::TermIterator termListIter = doc.termlist_begin();
    Xapian::Stem *pStemmer = NULL;
    string stemPrefix("Z");
    string term;

    // Do we know what language to use for stemming ?
    if (language.empty() == false)
    {
        try
        {
            pStemmer = new Xapian::Stem(StringManip::toLowerCase(language));
        }
        catch (const Xapian::Error &error)
        {
            cerr << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
        }
    }

    // Stems are Z-prefixed, unless a prefix is already defined
    if (prefix.empty() == false)
    {
        stemPrefix = prefix;
    }

    // Get the terms and remove the first posting for each
    while (tokens.nextToken(term) == true)
    {
        bool removeStem = false;

        if (term.empty() == true)
        {
            continue;
        }
        // Lower case the term
        term = StringManip::toLowerCase(term);

        // Stem the term ?
        if ((mode == STORE_UNSTEM) ||
                (pStemmer == NULL))
        {
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
        }
        else if (mode == STORE_STEM)
        {
            removeStem = true;
        }
        else if (mode == STORE_BOTH)
        {
            // Remove both
            removeFirstPosting(doc, termListIter, prefix + XapianDatabase::limitTermLength(term));
            removeStem = true;
        }

        // Since stems don't have positional information, we can't simply remove them
        // since any may appear more than once in the original document
        // We can only remove those that have some prefix set
        // Don't stem if the term starts with a digit
        if ((removeStem == true) &&
                (prefix.empty() == false) &&
                (isdigit((int)term[0]) == 0))
        {
#if XAPIAN_MAJOR_VERSION==0
            string stemmedTerm(pStemmer->stem_word(term));
#else
            string stemmedTerm((*pStemmer)(term));
#endif

            doc.remove_term(stemPrefix + XapianDatabase::limitTermLength(stemmedTerm));
        }
    }

    if (pStemmer != NULL)
    {
        delete pStemmer;
    }
}
Example #23
void Semantics::setEssiCommand(const string& Command){

	int nofTokens = 0;
	Tokenizer inpString = Tokenizer(Command," ") ;
	string Ecommand = "";
	string Fcommand = ""; // Filtered Command with spaces

	while( inpString.hasMoreTokens()){
		Fcommand = Fcommand + inpString.nextToken()+" ";
	}

	inpString.set(Fcommand,"{}");
	nofTokens = inpString.countTokens()-1;

	string prevTag = "variable";

	while(inpString.hasMoreTokens() && nofTokens-->0){

		string variable;
		Tokenizer Var = Tokenizer(inpString.nextToken(),"#()= ,");

		if(!(inpString.currToken()).compare(";")) break;                        // Termination Condition with ";"
		if((inpString.currToken()).back()=='\\'){							   // Escape sequence "\\"
			Ecommand = Ecommand + inpString.currToken().substr(0,inpString.currToken().length()-1) +Fcommand.substr(inpString.currIndex()-1,1);
			continue;					   
		}

		Ecommand = Ecommand + inpString.currToken() + "$";

		Var.setMode(1);
		Var.setcurrPos(inpString.currToken().length()-1);
		string currTag = (Var.nextToken());

		if (currTag.length()<=1)
			variable = prevTag; 		
		else{
			variable = currTag;
			prevTag= currTag;
		}

		set<string>::iterator it = this->EssiTagList.find(variable);
		if (it != this->EssiTagList.end()) {

			map<string,int>::iterator it = this->TagList.find(variable);

			if (it != this->TagList.end()){
				it->second=it->second+1;
				variable = variable + "#" +to_string(it->second);
			}
			else{
				this->TagList.insert(pair<string,int>(variable,1));
				variable = variable + "#1";
			}

			this->NofTagVariables++;
		}

		this->VarList.push_back(variable);
		this->EssiVarList.push_back(variable);
		// inpString.setDelimiter("{}#()=");
	}

	Ecommand = Ecommand+inpString.nextToken();

	this->EssiCommand = Ecommand;

	this->NofEssiVariables = this->VarList.size();
}