/* Stringification helpers so TERM_MAX_LEN can bound the fscanf read
 * below (assumes TERM_MAX_LEN is an integer literal macro and that no
 * equivalent helper already exists in this codebase). */
#define STR_(x) #x
#define STR(x) STR_(x)

/* Load the stoplist file named by the STOPLIST config entry into the
 * global uthash table. Each term is lowercased and stemmed so lookups
 * match the normalization applied to document tokens. */
void InitStoplistConfig() {
  char *stoplist_path = Config("STOPLIST");
  if (stoplist_path == NULL) {
    return;
  }
  FILE *fp = fopen(stoplist_path, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Unable to load stoplist: %s\n", stoplist_path);
    return;
  }
  char term[TERM_MAX_LEN + 1];
  for (;;) {
    /* Bound the read to TERM_MAX_LEN characters; a bare "%s" would let
     * an overlong term overflow the buffer. The trailing "\n" in the
     * original format string was redundant and has been dropped. */
    if (fscanf(fp, "%" STR(TERM_MAX_LEN) "s", term) < 1) break;
    strToLower(term);
    Stem(term);
    /* Only insert terms not already present in the table. */
    Stopword *newStopword;
    HASH_FIND_STR(stoplist, term, newStopword);
    if (newStopword == NULL) {
      newStopword = malloc(sizeof(Stopword));
      strcpy(newStopword->t, term);
      HASH_ADD_STR(stoplist, t, newStopword);
    }
  }
  fclose(fp);
}
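/* For reference, a minimal sketch of how the loaded table would be
 * queried at filter time. IsStopword is hypothetical (not part of the
 * original source); it assumes the global `stoplist` head pointer and
 * the Stopword struct used above, and that the caller has already
 * lowercased and stemmed `term`, mirroring InitStoplistConfig. */
int IsStopword(const char *term) {
  Stopword *entry = NULL;
  HASH_FIND_STR(stoplist, term, entry);
  return entry != NULL;
}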
// Read documents from a stream of tokenized text: one sentence per
// line, with a blank line marking a document boundary. Tokens are
// optionally lowercased and stemmed before being interned in `vocab`.
void ParallelCorpus::ReadDocuments(std::ifstream* in,
                                   vector<Document>* docs,
                                   Vocab* vocab,
                                   bool use_stemming) {
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t");
  std::string line;
  Document doc;
  while (getline(*in, line)) {
    Sentence current_sentence;
    tokenizer line_tokenizer(line, sep);
    for (tokenizer::iterator it = line_tokenizer.begin();
         it != line_tokenizer.end(); ++it) {
      string token = *it;
      if (use_lowercase_) {
        boost::to_lower(token);
      }
      if (use_stemming) {
        Stem(token);
      }
      current_sentence.push_back(vocab->AddWord(token));
    }
    if (current_sentence.size() > 0) {
      doc.push_back(current_sentence);
    } else {
      // An empty line indicates a document boundary.
      if (doc.size() > 0) {
        docs->push_back(doc);
        doc.clear();
      }
    }
  }
  // Flush the final document if the file does not end with a blank line.
  if (doc.size() > 0) {
    docs->push_back(doc);
  }
}
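// Hedged usage sketch (not from the original source): the file name,
// the default-constructed Vocab, and the `corpus` instance are
// assumptions. The expected input format, per ReadDocuments above:
//
//   a b c        <- sentence 1 of document 1
//   d e          <- sentence 2 of document 1
//                <- blank line: document boundary
//   f g h        <- sentence 1 of document 2
//
std::ifstream in("documents.tok");
std::vector<Document> docs;
Vocab vocab;
corpus.ReadDocuments(&in, &docs, &vocab, /*use_stemming=*/false);
// Each entry of docs is one Document: a vector of Sentences, each a
// vector of the ids produced by vocab.AddWord().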
// Read sentence-aligned parallel text: line i of source_file is
// aligned with line i of target_file. Every aligned sentence pair
// becomes a single-sentence document pair in doc_pairs_.
bool ParallelCorpus::ReadParallelData(const string& source_file,
                                      const string& target_file) {
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t");
  std::string line;

  // Read and tokenize the source side.
  vector<Sentence> source_sents;
  std::ifstream source_in(source_file.c_str());
  if (source_in.good()) {
    while (getline(source_in, line)) {
      Sentence current_sentence;
      tokenizer line_tokenizer(line, sep);
      for (tokenizer::iterator it = line_tokenizer.begin();
           it != line_tokenizer.end(); ++it) {
        string token = *it;
        if (use_lowercase_) {
          boost::to_lower(token);
        }
        if (source_stemming_) {
          Stem(token);
        }
        current_sentence.push_back(source_vocab_.AddWord(token));
      }
      source_sents.push_back(current_sentence);
    }
    source_in.close();
  } else {
    return false;
  }

  // Read and tokenize the target side the same way.
  vector<Sentence> target_sents;
  std::ifstream target_in(target_file.c_str());
  if (target_in.good()) {
    while (getline(target_in, line)) {
      Sentence current_sentence;
      tokenizer line_tokenizer(line, sep);
      for (tokenizer::iterator it = line_tokenizer.begin();
           it != line_tokenizer.end(); ++it) {
        string token = *it;
        if (use_lowercase_) {
          boost::to_lower(token);
        }
        if (target_stemming_) {
          Stem(token);
        }
        current_sentence.push_back(target_vocab_.AddWord(token));
      }
      target_sents.push_back(current_sentence);
    }
    target_in.close();
  } else {
    return false;
  }

  // The files must be sentence-aligned: equal line counts.
  if (source_sents.size() != target_sents.size()) {
    return false;
  }
  for (size_t i = 0; i < source_sents.size(); ++i) {
    // Empty sentence pairs are kept; a disabled filter in the original
    // skipped pairs where either side had size() == 0.
    DocumentPair doc_pair;
    doc_pair.first.push_back(source_sents.at(i));
    doc_pair.second.push_back(target_sents.at(i));
    doc_pairs_.push_back(doc_pair);
  }
  return true;
}
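// Hedged usage sketch (not from the original source): the file names
// are placeholders, and ParallelCorpus is assumed to be
// default-constructible here.
ParallelCorpus corpus;
if (!corpus.ReadParallelData("train.src", "train.tgt")) {
  // false means a file could not be opened or the two files do not
  // have the same number of lines (i.e. they are not sentence-aligned).
  std::cerr << "Failed to load parallel data." << std::endl;
}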