/**
 * Removes every space-delimited word of `text` that appears in the
 * `stopwords` member collection, editing `text` in place.
 *
 * If `stopwords` is empty on entry, loadStopwords() is called first
 * (presumably it fills `stopwords` -- confirm against its definition).
 *
 * A removed word takes the single space that follows it along with it.
 * A stopword that ends the text has no following space, so the space that
 * preceded it (if any) is left in place.
 *
 * @param text  String to filter; modified in place.
 */
void Misc::removeStopwords(std::string& text) {
    if (stopwords.empty())
        loadStopwords();

    // Walk the text word by word using offsets rather than iterators:
    // erase() invalidates iterators, whereas an offset stays meaningful.
    std::string::size_type pos = 0;
    while (pos < text.size()) {
        // The current word ends at the next space, or at the end of the
        // text when no space follows. npos is handled explicitly instead of
        // being narrowed to a signed int, which previously produced a
        // zero-length erase (and an endless loop) for a trailing stopword
        // located at offset 0.
        const std::string::size_type spacePos = text.find(' ', pos);
        const std::string::size_type wordEnd =
            (spacePos == std::string::npos) ? text.size() : spacePos;
        const std::string word = text.substr(pos, wordEnd - pos);

        if (std::find(stopwords.begin(), stopwords.end(), word) != stopwords.end()) {
            // Stopword found: drop it together with the space after it.
            // erase() clamps the count to the end of the string, so the
            // "+ 1" is harmless when the word is the last one.
            text.erase(pos, wordEnd - pos + 1);
            // pos is left unchanged: it now addresses the next word.
        } else {
            // Keep the word and skip past the space after it (if any).
            pos = wordEnd + 1;
        }
    }
}
int main(int argc, char* argv[]) { if (argc != 6) { cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl; return -1; } selftest(); loadSentences(argv[1]); loadPattern(argv[2]); loadStopwords(argv[3], argv[4]); int corpusTokensN = 0; for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) { const string &text = sentences[sentenceID]; string alpha = text; for (size_t i = 0; i < alpha.size(); ++ i) { if (isalpha(alpha[i])) { alpha[i] = tolower(alpha[i]); } else { if (alpha[i] != '\'') { alpha[i] = ' '; } } } corpusTokensN += splitBy(alpha, ' ').size(); string outsideText = alpha; if (sentenceID > 0) { outsideText += " " + sentences[sentenceID - 1]; } if (sentenceID + 1 < sentences.size()) { outsideText += " " + sentences[sentenceID + 1]; } for (size_t i = 0; i < outsideText.size(); ++ i) { if (isalpha(outsideText[i])) { outsideText[i] = tolower(outsideText[i]); } else { outsideText[i] = ' '; } } vector<string> outside = splitBy(outsideText, ' '); unordered_map<string, int> outsideCnt; FOR (token, outside) { ++ outsideCnt[*token]; } vector< pair<int, int> > positions; tree.search(" " + alpha + " ", positions); unordered_map<string, int> patternCnt; FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); ++ patternCnt[pattern]; } FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); vector<string> tokens = splitBy(pattern, ' '); unordered_map<string, int> tokenCnt; int delta = patternCnt[pattern]; for (size_t i = 0; i < tokens.size(); ++ i) { tokenCnt[tokens[i]] += delta; } for (size_t i = 0; i < tokens.size(); ++ i) { if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) { f[pattern][i] += 1; sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]]; } } total[pattern] += 1; if (st > 0 && ed < (int)text.size()) { if (text[st - 1] == '(' && text[ed] == ')') { 
parenthesis[pattern] += 1; } if (text[st - 1] == '"' && text[ed] == '"') { quote[pattern] += 1; } } bool found = false; for (int i = st; i < ed && !found; ++ i) { found |= text[i] == '-'; } dash[pattern] += found; bool valid = true; for (int i = st; i < ed && valid; ++ i) { if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) { if (text[i] < 'A' && text[i] > 'Z') { valid = false; } } } capital[pattern] += valid; }