Exemplo n.º 1
0
/*
 * Load the stopword list named by the STOPLIST config entry into the
 * global `stoplist` uthash table.
 *
 * Each whitespace-delimited term in the file is lowercased, stemmed,
 * and inserted once (duplicates are skipped).  A missing STOPLIST
 * config entry is silently ignored; a file that cannot be opened
 * prints a warning to stderr.
 */
void InitStoplistConfig()
{
  char *stoplist_path = Config("STOPLIST");
  if (stoplist_path == NULL) {
    return;
  }

  FILE *fp = fopen(stoplist_path, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Unable to load stoplist: %s\n", stoplist_path);
    return;
  }
  char term[TERM_MAX_LEN+1];

  /* Build a scanf format with an explicit field width so an over-long
   * term in the file cannot overflow `term` (a bare "%s" is unbounded). */
  char fmt[16];
  snprintf(fmt, sizeof fmt, "%%%ds", TERM_MAX_LEN);

  for (;;) {
    /* %s already skips leading whitespace, so the "\n" in the original
     * format was redundant; behavior for well-formed input is unchanged. */
    if (fscanf(fp, fmt, term) < 1) break;
    strToLower(term);
    Stem(term);
    Stopword *newStopword;
    HASH_FIND_STR(stoplist, term, newStopword);
    if (newStopword == NULL) {
      newStopword = malloc(sizeof(Stopword));
      if (newStopword == NULL) {
        /* Out of memory: stop loading rather than dereference NULL. */
        break;
      }
      strcpy(newStopword->t, term);
      HASH_ADD_STR(stoplist, t, newStopword);
    }
  }
  fclose(fp);
}
Exemplo n.º 2
0
void ParallelCorpus::ReadDocuments(std::ifstream* in,
                                   vector<Document>* docs,
                                   Vocab* vocab,
                                   bool use_stemming) {
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t");
  std::string line;

  Document doc;
  while (getline(*in, line)) {
    Sentence current_sentence;
    tokenizer line_tokenizer(line, sep);
    for (tokenizer::iterator it = line_tokenizer.begin();
         it != line_tokenizer.end(); ++it) {
      string token = *it;
      if (use_lowercase_) {
        boost::to_lower(token);
      }
      if (use_stemming) {
        Stem(token);
      }
      current_sentence.push_back(vocab->AddWord(token));
    }
    if (current_sentence.size() > 0) {
      doc.push_back(current_sentence);
    } else {
      // An empty line indicates a document boundary
      if (doc.size() > 0) {
        docs->push_back(doc);
        doc.clear();
      }
    }
  }
  if (doc.size() > 0) {
    docs->push_back(doc);
  }
}
Exemplo n.º 3
0
bool ParallelCorpus::ReadParallelData(const string& source_file,
                                      const string& target_file) {
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t");
  std::string line;

  vector<Sentence> source_sents;
  std::ifstream source_in(source_file.c_str());
  if (source_in.good()) {
    Document doc;
    while (getline(source_in, line)) {
      Sentence current_sentence;
      tokenizer line_tokenizer(line, sep);
      for (tokenizer::iterator it = line_tokenizer.begin();
           it != line_tokenizer.end(); ++it) {
        string token = *it;
        if (use_lowercase_) {
          boost::to_lower(token);
        }
        if (source_stemming_) {
          Stem(token);
        }
        current_sentence.push_back(source_vocab_.AddWord(token));
      }
      source_sents.push_back(current_sentence);
    }
    source_in.close();
  } else {
    return false;
  }

  vector<Sentence> target_sents;
  std::ifstream target_in(target_file.c_str());
  if (target_in.good()) {
    Document doc;
    while (getline(target_in, line)) {
      Sentence current_sentence;
      tokenizer line_tokenizer(line, sep);
      for (tokenizer::iterator it = line_tokenizer.begin();
           it != line_tokenizer.end(); ++it) {
        string token = *it;
        if (use_lowercase_) {
          boost::to_lower(token);
        }
        if (target_stemming_) {
          Stem(token);
        }
        current_sentence.push_back(target_vocab_.AddWord(token));
      }
      target_sents.push_back(current_sentence);
    }
    target_in.close();
  } else {
    return false;
  }
  if (source_sents.size() != target_sents.size()) {
    return false;
  }
  for (int i = 0; i < source_sents.size(); ++i) {
    //if ((source_sents.at(i).size() > 0)
    //  && (target_sents.at(i).size() > 0)) {
      DocumentPair doc_pair;
      doc_pair.first.push_back(source_sents.at(i));
      doc_pair.second.push_back(target_sents.at(i));
      doc_pairs_.push_back(doc_pair);
    //}
  }
  return true;
}