int main(){ //default initializations initializeStopwords(); std::unordered_set<std::string> suffixes = getDefaultSuffixes(); //everything else std::vector<std::string> rf = relevantFiles("examples",suffixes,false);//home/max/workspace/thtag/ //printStringVector(rf); gramDict mainDict; std::vector<gramDict> dictVec; //read all files in example subdirectory and clean each dictionary and merge them for (std::vector<std::string>::const_iterator current = rf.begin(); current != rf.end();++current) { std::cout<<(*current)<<'\n'; gramDict tempDict; long fileSize = getFileSize((*current)); std::cout<<"FILE SIZE: "<<fileSize<<'\n'; readCall((*current),&tempDict,-3,60); basicNLP(&tempDict); trimToTopN(&tempDict,(int) std::max(2*log(fileSize+500), 6.00)); merge_dictionaries(&mainDict,&tempDict); dictVec.push_back(tempDict); } read_dictionary(&mainDict); outputDictionary(&mainDict,"testDictOutput.txt"); //create map from files to keywords convertCountToIndex(&mainDict); writeKeywordDataFile(&dictVec,&mainDict,"keywordMap.txt"); outputFileList(&rf,"fileList.txt"); std::cout<<"DONE!\n"; }
// Return the next token from the input stream.
//
// A previously pushed-back token (popToken) takes priority; otherwise
// characters are consumed and classified until a complete token is
// recognized.  The caller owns the returned Ttoken*.
//
// Fix: the original accumulated every non-EOF character into a local
// TstrBuf `text` that was never read — dead work on each character,
// removed here.  All dispatch logic is unchanged.
Ttoken *Tmlex::getToken()
{
    Ttoken *token = NULL;
    int found = 0;

    // Serve a pushed-back token first, if any.
    if (NULL != (token = popToken()))
    {
        return(token);
    }
    else
    {
        token = new Ttoken;
    }

    token->setType(TOK_ILLEGAL);

    while (!found)
    {
        int ch = Dinput.get();

        if (EOF == ch)
        { // End of file
            token->setType(TOK_END);
            token->setText("");
            token->setPosn(Dinput.posn());
            found = 1;
        }
        else if ('\n' == ch)
        { // End of line
            token->setType(TOK_EOL);
            token->setText("");
            found = 1;
        }
        else if (isspace(ch))
        {
            // Ignore other white space.
        }
        else if (SEP_ESC == ch)
        { // Escape character
            ch = Dinput.get();
            if ('\n' == ch)
            {
                // Escaped end-of-line: line continuation, ignore it.
            }
            else if (SEP_ESC == ch)
            { // Doubled escape: read it as a literal identifier.
                Dinput.putBack(ch);
                token->setPosn(Dinput.posn());
                found = readIdent(*token);
            }
            else
            { // Escaped character: push back to be re-read literally.
                Dinput.putBack(ch);
            }
        }
        else if ('#' == ch)
        { // Comment.
            token->setPosn(Dinput.posn());
            // NOTE(review): the character right after '#' is fetched and
            // discarded before readComment() — presumably readComment
            // consumes the rest of the line; confirm it tolerates this.
            ch = Dinput.get();
            found = readComment(*token);
        }
        else if (SEP_TEXT == ch)
        { // Quoted text.
            token->setPosn(Dinput.posn());
            found = readText(*token);
        }
        else if (SEP_S_VAR == ch)
        { // Start an evaluated variable (define); yields no token here.
            readVar();
        }
        else if (SEP_CALL == ch)
        { // Call.
            token->setPosn(Dinput.posn());
            found = readCall(*token);
        }
        else if (isSymb(ch))
        { // Symbol.
            token->setPosn(Dinput.posn());
            Dinput.putBack(ch);
            found = readSymb(*token);
        }
        else if (isIdent(ch))
        { // Identifier.
            token->setPosn(Dinput.posn());
            Dinput.putBack(ch);
            found = readIdent(*token);
        }
        else
        {
            // Unrecognized character: skip the whole run up to the next
            // white space, leaving that white space in the stream.
            while ((EOF != ch) && !isspace(ch))
                ch = Dinput.get();
            if (EOF != ch)
                Dinput.putBack(ch);
        }
    }

    return(token);
}
// Read a SEP_TEXT-delimited text literal into `token`.
//
// Handles C-style escapes (\n \r \t \v \b), in-text variable expansion
// (SEP_S_VAR via readVar), call substitution (SEP_CALL, whose output is
// piped back into the input), and "which" lookup (SEP_WHICH_OPEN).
// Always returns 1 with token type TOK_TEXT; an unterminated literal
// (newline or EOF before the closing SEP_TEXT) is reported via printE
// but still produces the text read so far.
//
// NOTE: `buff` is static, so this method is not reentrant/thread-safe.
//
// Fix: the two inner `Ttoken token;` locals shadowed the `token`
// parameter (bugprone); renamed to callToken/whichToken.  No behavior
// change.
int Tmlex::readText(Ttoken &token)
{
    static TstrBuf buff;
    int ch = Dinput.get();
    Tstr posn = Dinput.posn();   // where the literal started, for diagnostics
    unsigned long pos = 0;

    while ((EOF != ch) && ('\n' != ch) && (SEP_TEXT != ch))
    {
        if (SEP_ESC == ch) // 005
        { // Escaped character: translate C-style escapes.
            ch = Dinput.get();
            if ((EOF != ch) && ('\n' != ch))  // escaped newline is dropped (continuation)
            {
                switch (ch)
                {
                    case 'n' : buff[pos++] = '\n'; break;
                    case 'r' : buff[pos++] = '\r'; break;
                    case 't' : buff[pos++] = '\t'; break;
                    case 'v' : buff[pos++] = '\v'; break;
                    case 'b' : buff[pos++] = '\b'; break;
                    default  : buff[pos++] = ch;   break;
                }
            }
        }
        else if (SEP_S_VAR == ch)
        { // Possible variable reference; keep the char if it wasn't one.
            if (!readVar())
                buff[pos++] = ch;
        }
        else if (SEP_CALL == ch) // 005
        { // Call argument: evaluate it and pipe its output into the input.
            Ttoken callToken;                    // renamed: was shadowing `token`
            callToken.setPosn(Dinput.posn());
            readCall(callToken);
            // Tpipe ownership presumably transfers to Dinput.push — confirm.
            Dinput.push(new Tpipe(callToken.text()));
        }
        else if (SEP_WHICH_OPEN == ch) // 005
        { // Possible which argument.
            Ttoken whichToken;                   // renamed: was shadowing `token`
            whichToken.setPosn(Dinput.posn());
            // NOTE(review): whichToken.text() is queried before anything
            // sets it — presumably empty; verify findExecs() expects that.
            Dinput.putBack(findExecs(whichToken.text()));
        }
        else
        {
            buff[pos++] = ch;
        }
        ch = Dinput.get();
    }

    buff[pos++] = '\0';

    if (SEP_TEXT != ch)
    { // Hit newline or EOF before the closing delimiter.
        printE("%s : Unterminated text\n", (char const*)posn);
    }

    token.setType(TOK_TEXT);
    token.setText((char const*)buff);
    return(1);
}