예제 #1
0
파일: main.cpp 프로젝트: mcandocia/thtag
int main(){
  //default initializations
  initializeStopwords();
  std::unordered_set<std::string> suffixes = getDefaultSuffixes();
  //everything else
  std::vector<std::string> rf = relevantFiles("examples",suffixes,false);//home/max/workspace/thtag/
  //printStringVector(rf);
  gramDict mainDict;
  std::vector<gramDict> dictVec;

  //read all files in example subdirectory and clean each dictionary and merge them
  for (std::vector<std::string>::const_iterator current = rf.begin();
   current != rf.end();++current)
  {
    std::cout<<(*current)<<'\n';

      gramDict tempDict;
      long fileSize = getFileSize((*current));
      std::cout<<"FILE SIZE: "<<fileSize<<'\n';
      readCall((*current),&tempDict,-3,60);
      basicNLP(&tempDict);
      trimToTopN(&tempDict,(int) std::max(2*log(fileSize+500), 6.00));
      merge_dictionaries(&mainDict,&tempDict);
      dictVec.push_back(tempDict);
  }
  read_dictionary(&mainDict);
  outputDictionary(&mainDict,"testDictOutput.txt");
  //create map from files to keywords
  convertCountToIndex(&mainDict);
  writeKeywordDataFile(&dictVec,&mainDict,"keywordMap.txt");
  outputFileList(&rf,"fileList.txt");
  std::cout<<"DONE!\n";

}
예제 #2
0
/**
 * Return the next token.
 *
 * If popToken() yields a non-NULL token, that token is returned unchanged.
 * Otherwise a new Ttoken is allocated with `new`, initialised to TOK_ILLEGAL,
 * and characters are read from Dinput until one of the branches below reports
 * a completed token.
 *
 * @return Pointer to the token. When the token was allocated here the caller
 *         receives the only pointer to it (presumably the caller is expected
 *         to delete it -- confirm against callers).
 */
Ttoken *Tmlex::getToken()
{
    // Prefer a token that popToken() already has available.
    Ttoken *token = popToken();
    if (NULL != token)
    {
        return(token);
    }

    token = new Ttoken;
    token->setType(TOK_ILLEGAL);

    int found = 0;
    while (!found)
    {
        int ch = Dinput.get();

        if (EOF == ch)
        {
            // End of file
            token->setType(TOK_END);
            token->setText("");
            token->setPosn(Dinput.posn());
            found = 1;
        }
        else if ('\n' == ch)
        {
            // End of line is reported as its own token with empty text
            token->setType(TOK_EOL);
            token->setText("");
            found = 1;
        }
        else if (isspace(ch))
        {
            // Ignore white space
        }
        else if (SEP_ESC == ch)
        {
            // Escape: look at the character that follows
            ch = Dinput.get();

            if ('\n' == ch)
            {
                // Escaped end-of-line: swallow it and keep scanning
            }
            else if (SEP_ESC == ch)
            {
                // Literal: a doubled escape is handed to readIdent
                Dinput.putBack(ch);
                token->setPosn(Dinput.posn());
                found = readIdent(*token);
            }
            else if (EOF != ch)
            {
                // Escaped character: push it back so the next iteration
                // reads it. EOF is not pushed back, matching the guard
                // used by the fallback branch at the end of this loop.
                Dinput.putBack(ch);
            }
        }
        else if ('#' == ch)
        {
            // Check for comment
            token->setPosn(Dinput.posn());
            // One more character is consumed before readComment runs.
            // NOTE(review): the value read is not used here -- confirm
            // that readComment expects this character to be gone.
            ch = Dinput.get();

            found = readComment(*token);
        }
        else if (SEP_TEXT == ch)
        {
            // Text
            token->setPosn(Dinput.posn());
            found = readText(*token);
        }
        else if (SEP_S_VAR == ch)
        {
            // Start an evaluated variable (define); its result is not
            // checked and no token is produced by this branch.
            readVar();
        }
        else if (SEP_CALL == ch)
        {
            // Call
            token->setPosn(Dinput.posn());
            found = readCall(*token);
        }
        else if (isSymb(ch))
        {
            // Symbol: give the character back so readSymb sees it
            token->setPosn(Dinput.posn());
            Dinput.putBack(ch);
            found = readSymb(*token);
        }
        else if (isIdent(ch))
        {
            // Identifier: give the character back so readIdent sees it
            token->setPosn(Dinput.posn());
            Dinput.putBack(ch);
            found = readIdent(*token);
        }
        else
        {
            // Unrecognised character: skip ahead to the next white space
            // (or EOF) and push the white-space character back.
            while ((EOF != ch) && !isspace(ch)) ch = Dinput.get();
            if (EOF != ch) Dinput.putBack(ch);
        }
    }

    return(token);
}
예제 #3
0
/**
 * Read a text token body from Dinput into `token`.
 *
 * Characters are consumed until SEP_TEXT, a newline, or EOF is read. The
 * collected characters are stored in a function-static buffer, so each call
 * overwrites the text produced by the previous one.
 *
 * @param token Receives type TOK_TEXT and the collected text.
 * @return Always 1.
 */
int Tmlex::readText(Ttoken &token)
{
    static TstrBuf buff;

    int           ch     = Dinput.get();
    Tstr          posn   = Dinput.posn();   // position used in the error message
    unsigned long length = 0;

    for (; !((EOF == ch) || ('\n' == ch) || (SEP_TEXT == ch)); ch = Dinput.get())
    {
        if (SEP_ESC == ch)
        {
            // Escaped character: translate the known escape letters and
            // copy anything else verbatim. An escape followed by a newline
            // or EOF adds nothing to the buffer.
            ch = Dinput.get();
            if ((EOF != ch) && ('\n' != ch))
            {
                if      ('n' == ch) buff[length++] = '\n';
                else if ('r' == ch) buff[length++] = '\r';
                else if ('t' == ch) buff[length++] = '\t';
                else if ('v' == ch) buff[length++] = '\v';
                else if ('b' == ch) buff[length++] = '\b';
                else                buff[length++] =  ch;
            }
        }
        else if (SEP_S_VAR == ch)
        {
            // Possible variable: keep the character itself when readVar
            // returns zero.
            if (!readVar()) buff[length++] = ch;
        }
        else if (SEP_CALL == ch)
        {
            // Call argument: the text of the call token is pushed onto
            // Dinput as a new Tpipe.
            Ttoken callTok;
            callTok.setPosn(Dinput.posn());
            readCall(callTok);
            Dinput.push(new Tpipe(callTok.text()));
        }
        else if (SEP_WHICH_OPEN == ch)
        {
            // Possible "which" argument.
            // NOTE(review): only the position of whichTok is set in this
            // block before its text is passed to findExecs -- confirm that
            // this is intended.
            Ttoken whichTok;
            whichTok.setPosn(Dinput.posn());
            Dinput.putBack(findExecs(whichTok.text()));
        }
        else
        {
            buff[length++] = ch;
        }
    }
    buff[length] = '\0';

    // The loop also stops on newline or EOF; only SEP_TEXT is a proper end.
    if (SEP_TEXT != ch)
    {
        printE("%s : Unterminated text\n", (char const*)posn);
    }

    token.setType(TOK_TEXT);
    token.setText((char const*)buff);

    return(1);
}