Esempio n. 1
0
/**
 * Feeds one label into the sentence under construction.
 *
 * Labels whose `phon` ends in '-' are ignored, unless it ends in "#-".
 * For an accepted label:
 *  - if isWordBoundary(l.phon) holds, `type` is updated from
 *    isSentenceDelimiter(l.pros) when `l.pros` is non-empty, and then
 *      - on the very first boundary (`start` set) the label simply opens the
 *        current word and `start` is cleared;
 *      - otherwise the current word is closed (its last position is set to
 *        `l.first`, finish(l) is called), appended to `words`, and a fresh
 *        word built from `sb_sym` is opened with this label;
 *  - otherwise the label is appended to the current word.
 *
 * @param l label to add.
 */
void c_SentenceEntry::add(const c_LabelEntry & l)
{
  // Length-guarded suffix tests: the unguarded form indexed phon[size()-1] /
  // phon[size()-2], which wraps around for an empty or one-character phon.
  const bool endsWithDash =
      l.phon.size() >= 1 && l.phon[l.phon.size() - 1] == '-';
  const bool endsWithHashDash =
      l.phon.size() >= 2 && l.phon[l.phon.size() - 2] == '#' && endsWithDash;

  if (endsWithDash && !endsWithHashDash) {
    return;  // trailing '-' (other than "#-"): label is skipped
  }

  if (!isWordBoundary(l.phon)) {
    // Inside a word: just accumulate the label.
    word.add(l);
    return;
  }

  if (l.pros.size()) {
    type = isSentenceDelimiter(l.pros);
  }

  if (start) {
    // First boundary of the sentence: nothing to close yet.
    word.add(l);
    word.setPros(l.pros);
    word.setFirst(l.first);
    start = false;
  }
  else {
    // Close the word in progress and store it ...
    word.setLast(l.first);
    word.finish(l);
    words.push_back(word);
    // ... then open the next one with this label.
    word = c_WordEntry(sb_sym);
    word.setPros(l.pros);
    word.setFirst(l.first);
    word.add(l);
  }
}
/**
 * Reads characters from `stream_` until EOF, splits them into sentences and
 * writes each accepted sentence to `out` on its own line.
 *
 * A sentence ends at a run of sentence delimiters (as decided by
 * isSentenceDelimiter), except that a '.' directly following a word for which
 * isAbbreviation() holds does not end the sentence. Line breaks (isCRLF) are
 * merged into a single space. A sentence is written only if it contains at
 * least MIN_SENTENCE_LEN words; shorter ones are dropped.
 *
 * Text remaining in the buffer at EOF without a closing delimiter is not
 * written.
 *
 * @param out   destination stream for the accepted sentences.
 * @param count set to the number of sentences written to `out`.
 */
void TextSplitter::getSentences(std::ostream &out, size_t &count) {
    std::string buffer;   // Text of the sentence being assembled
    std::string lastWord; // Last read word
    size_t wordCount = 0; // Count words in the current sentence
    // Must be int, not char: get() returns an int so that EOF can be told
    // apart from every valid byte value.
    int c;
    count = 0;
    while ((c = stream_.get()) != EOF) {
        // If we reached sentence delimiter (.?!) we should:
        // 1. Skip other delimiters (situations like '??!')
        // 2. Check if last word was abbreviation (then do not split by dot)
        // 3. Check for closing quote (do not move closing quote to next sentence)
        if (isSentenceDelimiter(static_cast<char>(c))) {
            // Skip abbreviations
            if (c == '.' && isAbbreviation(lastWord)) {
                lastWord.clear();
                ++wordCount;
                buffer.push_back(static_cast<char>(c));
                continue;
            }
            if (!lastWord.empty()) { ++wordCount; }
            while (c != EOF && isSentenceDelimiter(static_cast<char>(c))) {
                buffer.push_back(static_cast<char>(c));
                c = stream_.get();
            }
            while (c == ' ') { c = stream_.get(); }
            if (c == '"') {
                buffer.push_back('"');
                c = stream_.get();
            }
            if (wordCount >= MIN_SENTENCE_LEN) {
                // '\n' instead of std::endl: no need to flush per sentence.
                out << removeStartingSpaces(buffer) << '\n';
                ++count;
            }
            // Reset for the next sentence even when this one was too short,
            // so its words are not counted towards the following sentence.
            wordCount = 0;
            buffer.clear();   // clear sentence buffer
            lastWord.clear(); // clear word buffer
        }
        // If we reached '\n' we will merge lines by space
        if (c != EOF && isCRLF(static_cast<char>(c))) {
            while (c != EOF && isCRLF(static_cast<char>(c))) { c = stream_.get(); }
            if (!buffer.empty()) {
                buffer.push_back(' ');
            }
            if (!lastWord.empty()) {
                ++wordCount;
                lastWord.clear();
            }
        }
        // One of the inner reads above may have hit end of input; there is
        // no character left to store in that case.
        if (c == EOF) {
            break;
        }
        const char ch = static_cast<char>(c);
        buffer.push_back(ch);
        if (isWordDelimiter(ch)) {
            // A word delimiter closes the word in progress, if any
            if (!lastWord.empty()) {
                ++wordCount;
                lastWord.clear();
            }
        } else {
            lastWord.push_back(ch);
        }
    }
}