Exemplo n.º 1
0
void Misc::removeStopwords(std::string& text) {
	if (stopwords.empty())
		loadStopwords();
	// Voy a recorrer palabra por palabra del string text
	std::string word;
	std::string::iterator it = text.begin();
	while (it != text.end()) {
		int itPos = std::distance(text.begin(), it);
		int nextSpacePos = text.find(' ', itPos);
		word = text.substr(itPos, nextSpacePos - itPos);
		std::vector<std::string>::iterator stopwordIt =
				std::find(stopwords.begin(), stopwords.end(), word);
		if (stopwordIt != stopwords.end()) {
			// Se encontro la stopword
			text.erase(itPos, nextSpacePos - itPos + 1);
		} else {
			it += word.size();
			if (it != text.end())
				++it; // Me salteo el espacio (si hay)
		}
	}

}
Exemplo n.º 2
0
int main(int argc, char* argv[])
{
    if (argc != 6) {
        cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl;
        return -1;
    }
    selftest();
    
    loadSentences(argv[1]);
    loadPattern(argv[2]);
    loadStopwords(argv[3], argv[4]);
    
    int corpusTokensN = 0;
    for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) {
        const string &text = sentences[sentenceID];
        string alpha = text;
        for (size_t i = 0; i < alpha.size(); ++ i) {
            if (isalpha(alpha[i])) {
                alpha[i] = tolower(alpha[i]);
            } else {
                if (alpha[i] != '\'') {
					alpha[i] = ' ';
				}
            }
        }
        corpusTokensN += splitBy(alpha, ' ').size();
        
        string outsideText = alpha;
        if (sentenceID > 0) {
            outsideText += " " + sentences[sentenceID - 1];
        }
        if (sentenceID + 1 < sentences.size()) {
            outsideText += " " + sentences[sentenceID + 1];
        }
        for (size_t i = 0; i < outsideText.size(); ++ i) {
            if (isalpha(outsideText[i])) {
                outsideText[i] = tolower(outsideText[i]);
            } else {
                outsideText[i] = ' ';
            }
        }
        
        vector<string> outside = splitBy(outsideText, ' ');
        unordered_map<string, int> outsideCnt;
        FOR (token, outside) {
            ++ outsideCnt[*token];
        }
        
        vector< pair<int, int> > positions;
        tree.search(" " + alpha + " ", positions);
        
        unordered_map<string, int> patternCnt;
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            ++ patternCnt[pattern];
        }
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            
            vector<string> tokens = splitBy(pattern, ' ');
            unordered_map<string, int> tokenCnt;
            int delta = patternCnt[pattern];
            for (size_t i = 0; i < tokens.size(); ++ i) {
                tokenCnt[tokens[i]] += delta;
            }
            for (size_t i = 0; i < tokens.size(); ++ i) {
                if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) {
                    f[pattern][i] += 1;
                    sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]];
                }
            }
            
            total[pattern] += 1;
            
            if (st > 0 && ed < (int)text.size()) {
                if (text[st - 1] == '(' && text[ed] == ')') {
                    parenthesis[pattern] += 1;
                }
                if (text[st - 1] == '"' && text[ed] == '"') {
                    quote[pattern] += 1;
                }
            }
            
            bool found = false;
            for (int i = st; i < ed && !found; ++ i) {
                found |= text[i] == '-';
            }
            dash[pattern] += found;
            
            bool valid = true;
            for (int i = st; i < ed && valid; ++ i) {
                if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) {
                    if (text[i] < 'A' && text[i] > 'Z') {
                        valid = false;
                    }
                }
            }
            capital[pattern] += valid;
        }