void LexicalTable::load( char *fileName ) { cerr << "Loading lexical translation table from " << fileName; ifstream inFile; inFile.open(fileName); if (inFile.fail()) { cerr << " - ERROR: could not open file\n"; exit(1); } istream *inFileP = &inFile; char line[LINE_MAX_LENGTH]; int i=0; while(true) { i++; if (i%100000 == 0) cerr << "." << flush; SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; vector<string> token = tokenize( line ); if (token.size() != 3) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; continue; } double prob = atof( token[2].c_str() ); WORD_ID wordT = vcbT.storeIfNew( token[0] ); WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = prob; } cerr << endl; }
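// For reference, the file format expected by the loader above, reconstructed
// from the parsing code (the words themselves are illustrative, not from a
// real lexicon): three whitespace-separated tokens per line -- target word,
// source word, probability.
//
//   house haus 0.8
//   home haus 0.15
//   the NULL 0.02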
bool PhraseAlignment::create(const char line[], int lineID ) { vector< string > token = tokenize( line ); int item = 1; PHRASE phraseF, phraseE; for (size_t j=0; j<token.size(); j++) { if (token[j] == "|||") item++; else { if (item == 1) phraseF.push_back( vcbF.storeIfNew( token[j] ) ); else if (item == 2) phraseE.push_back( vcbE.storeIfNew( token[j] ) ); else if (item == 3) { int e,f; sscanf(token[j].c_str(), "%d-%d", &f, &e); if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) { cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n"; } else { if (alignedToE.size() == 0) { vector< size_t > dummy; for(size_t i=0; i<phraseE.size(); i++) alignedToE.push_back( dummy ); for(size_t i=0; i<phraseF.size(); i++) alignedToF.push_back( dummy ); foreign = phraseTableF.storeIfNew( phraseF ); english = phraseTableE.storeIfNew( phraseE ); } alignedToE[e].push_back( f ); alignedToF[f].push_back( e ); } } } } return (item>2); // real phrase pair, not just foreign phrase }
void QuizFrame::editCurrentTerm() { if( controller->isQuizInProgress() ) { Folder* vocabTree = controller->getVocabTree(); Term* term = controller->getCurrentTerm(); if( !term ) { QMessageBox::warning( this, QObject::tr( "Information" ), tr( "DissociatedWord" ) ); return; } Vocabulary* vocab = vocabTree->getVocabulary( term->getVocabId() ); if( vocab == NULL || !vocab->isTermExists( term->getId() ) ) { QMessageBox::warning( this, QObject::tr( "Information" ), tr( "DissociatedWord" ) ); return; } TermDialog dialog( *vocab, controller, this, *term ); int result = dialog.exec(); if( result ) { QString firstLang( controller->getQuizFirstLanguage() ); QString testLang( controller->getQuizTestLanguage() ); Term newTerm = dialog.getTerm(); Translation firstLangTrans = newTerm.getTranslation( firstLang ); Translation testLangTrans = newTerm.getTranslation( testLang ); term->addTranslation( firstLangTrans ); term->addTranslation( testLangTrans ); BilingualKey commentKey( firstLang, testLang ); term->addComment( commentKey, newTerm.getComment( commentKey ) ); term->setImagePath( newTerm.getImagePath() ); vocab->setModificationDate( QDateTime::currentDateTime() ); vocab->setDirty( true ); setTerm( newTerm ); } } }
void LexicalTable::load( const string &filePath ) { cerr << "Loading lexical translation table from " << filePath; ifstream inFile; inFile.open(filePath.c_str()); if (inFile.fail()) { cerr << " - ERROR: could not open file\n"; exit(1); } istream *inFileP = &inFile; string line; int i=0; while(getline(*inFileP, line)) { i++; if (i%100000 == 0) cerr << "." << flush; vector<string> token = tokenize( line.c_str() ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; continue; } double prob = atof( token[2].c_str() ); WORD_ID wordE = vcbE.storeIfNew( token[0] ); WORD_ID wordF = vcbF.storeIfNew( token[1] ); ltable[ wordF ][ wordE ] = prob; } cerr << endl; }
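// The scoring code further below (computeLexicalTranslation) relies on a
// permissive lookup into ltable. A minimal sketch of such a lookup is given
// here for context only; it assumes ltable is a nested std::map, and the
// fallback value for unseen entries (1.0, i.e. no penalty) is an assumption,
// not necessarily what the original LexicalTable does.
double LexicalTable::permissiveLookup(WORD_ID wordS, WORD_ID wordT) {
  map< WORD_ID, map< WORD_ID, double > >::const_iterator s = ltable.find(wordS);
  if (s == ltable.end()) return 1.0;        // source word never seen
  map< WORD_ID, double >::const_iterator t = s->second.find(wordT);
  if (t == s->second.end()) return 1.0;     // pair never seen
  return t->second;
}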
void SearchDialog::editResultTerm() { ResultListItem* currItem = (ResultListItem*)resultsListView->currentItem(); if( currItem ) { Term* term = currItem->getTerm(); if( term ) { Vocabulary* vocab = controller->getVocabTree()->getVocabulary( term->getVocabId() ); TermDialog dialog( *vocab, controller, this, *term ); #if defined(Q_WS_HILDON) dialog.showFullScreen(); #endif int result = dialog.exec(); if( result ) { Term newTerm = dialog.getTerm(); term->addTranslation( newTerm.getTranslation( controller->getPreferences().getFirstLanguage() ) ); term->addTranslation( newTerm.getTranslation( controller->getPreferences().getTestLanguage() ) ); BilingualKey commentKey( controller->getPreferences().getFirstLanguage(), controller->getPreferences().getTestLanguage() ); term->addComment( commentKey, newTerm.getComment( commentKey ) ); term->setImagePath( newTerm.getImagePath() ); currItem->updateUi(); vocab->setModificationDate( QDateTime::currentDateTime() ); vocab->setDirty( true ); } } } }
reg_t kSetSynonyms(EngineState *s, int argc, reg_t *argv) { SegManager *segMan = s->_segMan; reg_t object = argv[0]; List *list; Node *node; int script; int numSynonyms = 0; Vocabulary *voc = g_sci->getVocabulary(); // Only SCI0-SCI1 EGA games had a parser. In newer versions, this is a stub if (getSciVersion() > SCI_VERSION_1_EGA_ONLY) return s->r_acc; voc->clearSynonyms(); list = s->_segMan->lookupList(readSelector(segMan, object, SELECTOR(elements))); node = s->_segMan->lookupNode(list->first); while (node) { reg_t objpos = node->value; int seg; script = readSelectorValue(segMan, objpos, SELECTOR(number)); seg = s->_segMan->getScriptSegment(script); if (seg > 0) numSynonyms = s->_segMan->getScript(seg)->getSynonymsNr(); if (numSynonyms) { const byte *synonyms = s->_segMan->getScript(seg)->getSynonyms(); if (synonyms) { debugC(kDebugLevelParser, "Setting %d synonyms for script.%d", numSynonyms, script); if (numSynonyms > 16384) { error("Segtable corruption: script.%03d has %d synonyms", script, numSynonyms); /* We used to reset the corrupted value here. I really don't think it's appropriate. * Lars */ } else for (int i = 0; i < numSynonyms; i++) { synonym_t tmp; tmp.replaceant = READ_LE_UINT16(synonyms + i * 4); tmp.replacement = READ_LE_UINT16(synonyms + i * 4 + 2); voc->addSynonym(tmp); } } else warning("Synonyms of script.%03d were requested, but script is not available", script); } node = s->_segMan->lookupNode(node->succ); } debugC(kDebugLevelParser, "A total of %d synonyms are active now.", numSynonyms); return s->r_acc; }
void test_simple_vocabulary(void) { Vocabulary* voc = new Vocabulary("/home/sasha/work/data/test/stopwords.csv"); cout << voc->contains("the") << endl; cout << voc->contains(".") << endl; cout << voc->contains(")") << endl; cout << voc->contains(",") << endl; delete voc; }
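// For context, a minimal (hypothetical) vocabulary class that would satisfy
// this test: an unordered_set filled line-by-line from the stopword file.
// This is a sketch under the assumption of one token per line, not the
// project's actual Vocabulary class.
#include <fstream>
#include <string>
#include <unordered_set>
class SimpleVocabulary {
public:
  explicit SimpleVocabulary(const std::string& path) {
    std::ifstream in(path.c_str());
    std::string token;
    while (std::getline(in, token))
      tokens_.insert(token);   // one stopword/punctuation mark per line
  }
  bool contains(const std::string& token) const { return tokens_.count(token) > 0; }
private:
  std::unordered_set<std::string> tokens_;
};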
// Check for equal non-terminal alignment in case of SCFG rules. // Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const { if (!hierarchicalFlag) return true; // all or none of the phrasePair's word alignment matrices match, so just pick one const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first; assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1); assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size()); // loop over all symbols but the left hand side of the rule for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) { if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) { size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin()); size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin()); if (thisTargetToSourceAlignment->at(i).size() != 1 || otherTargetToSourceAlignment->at(i).size() != 1 || thisAlign != otherAlign) { return false; } } } return true; }
void SearchDialog::search() { const Preferences& prefs = controller->getPreferences(); QList<TermKey> results = controller->search( queryField->currentText(), prefs.getFirstLanguage(), prefs.getTestLanguage() ); resultsListView->clear(); for( QList<TermKey>::ConstIterator it = results.begin(); it != results.end(); it++ ) { const TermKey& termKey = *it; Term* term = controller->getTerm( termKey ); Vocabulary* vocab = controller->getVocabTree()->getVocabulary( termKey.getVocabId() ); if( vocab ) { ResultListItem* resultItem = new ResultListItem( resultsListView, term, prefs.getFirstLanguage(), prefs.getTestLanguage(), vocab->getTitle(), vocab->getParent()->getHumanReadablePath(), prefs.isAltInTermListShown() ); resultItem->setFont( 0, prefs.getMediumFont( prefs.getFirstLanguage() ) ); resultItem->setFont( 1, prefs.getMediumFont( prefs.getTestLanguage() ) ); } } resultsCounterLabel->setText( tr( "%1 term(s) found" ).arg( results.count() ) ); updateUi(); }
// read in a phrase pair and store it
void PhraseAlignment::create(const vector<string>& token, int lineID) {
  int item = 1;
  PHRASE phraseS, phraseT;
  for (size_t j = 0; j < token.size(); ++j) {
    if (token[j] == "|||")
      item++;
    else if (item == 1) // source phrase
      phraseS.push_back( vcbS.storeIfNew( token[j] ) );
    else if (item == 2) // target phrase
      phraseT.push_back( vcbT.storeIfNew( token[j] ) );
    else if (item == 3) { // alignment
      int s = strtol(token[j].substr(0, token[j].find("-")).c_str(), NULL, 10);
      int t = strtol(token[j].substr(token[j].find("-") + 1).c_str(), NULL, 10);
      // cast to size_t to avoid signed/unsigned comparison warnings
      if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
        cerr << "WARNING: phrase pair " << lineID
             << " has alignment point (" << s << ", " << t << ")"
             << " out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
      } else {
        // first alignment point? -> initialize
        if (alignedToT.size() == 0) {
          assert(alignedToS.size() == 0);
          size_t numTgtSymbols = (hierarchicalFlag ? phraseT.size()-1 : phraseT.size());
          alignedToT.resize(numTgtSymbols);
          size_t numSrcSymbols = (hierarchicalFlag ? phraseS.size()-1 : phraseS.size());
          alignedToS.resize(numSrcSymbols);
          source = phraseTableS.storeIfNew( phraseS );
          target = phraseTableT.storeIfNew( phraseT );
        }
        // add alignment point
        alignedToT[t].insert( s );
        alignedToS[s].insert( t );
      }
    } else if (item == 4) // count
      count = strtof(token[j].c_str(), NULL);
  }
  if (item == 3) count = 1.0;
  if (item < 3 || item > 4) {
    cerr << "ERROR: faulty line " << lineID << ": ";
    for (vector<string>::const_iterator i = token.begin(); i != token.end(); ++i)
      cerr << *i << " ";
    cerr << endl;
  }
}
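// An illustrative extract-file line in the format this parser consumes,
// assembled from the branches above (the phrase pair itself is made up):
//
//   der kleine hund ||| the little dog ||| 0-0 1-1 2-2 ||| 2
//
// The four "|||"-separated fields are source phrase, target phrase,
// alignment points "s-t", and an optional count; without the fourth field,
// count defaults to 1.0. For this line, alignedToS[1] ends up as {1} and
// alignedToT[2] as {2}.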
void print_topics(const int num_per, const Vocabulary<W>& vocab) {
  int topic_idx = 0;
  // take each topic by const reference to avoid copying the distribution
  for (const std::vector<double>& topic : prior_word_) {
    INFO << "Topic " << topic_idx;
    std::vector<size_t> sorted_topic = ferrum::sort_indices(topic, false);
    for (size_t item_idx = 0; item_idx < static_cast<size_t>(num_per); ++item_idx) {
      size_t which = sorted_topic[item_idx];
      INFO << "\t" << topic[which] << "\t" << vocab.word(which);
    }
    ++topic_idx;
  }
}
int parse_sentence(const string& sentence, const Vocabulary& vocab, real subsample_thres, unsigned* p_seed, vector<uint64_t>* words) { istringstream iss(sentence); uint64_t total_cnt = vocab.total_cnt(); int word_cnt = 0; string word; while (iss >> word) { uint64_t word_id; if (!vocab.find_word_id(word, &word_id)) { continue; } ++word_cnt; if (subsample_thres > 0) { double t = subsample_thres * total_cnt / vocab.get_word_cnt(word_id); double remain_prob = (sqrt(1 / t) + 1) * t; // not the same as the paper, which is sqrt(t) if (remain_prob < static_cast<real>(rand_r(p_seed)) / RAND_MAX) { continue; } } words->push_back(word_id); } return word_cnt; }
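// A small self-contained check of the subsampling arithmetic above; the
// corpus numbers are illustrative, not from real data. For a word that makes
// up 1% of a 10M-token corpus with subsample_thres = 1e-4:
//   t = 1e-4 * 1e7 / 1e5 = 0.01, remain_prob = (sqrt(1/0.01) + 1) * 0.01 = 0.11,
// so the word survives roughly 11% of the time.
#include <cmath>
#include <cstdio>
int main() {
  double subsample_thres = 1e-4;
  double total_cnt = 1e7;    // tokens in corpus
  double word_cnt = 1e5;     // occurrences of the frequent word
  double t = subsample_thres * total_cnt / word_cnt;
  double remain_prob = (std::sqrt(1 / t) + 1) * t;
  std::printf("remain_prob = %.2f\n", remain_prob);  // prints 0.11
  return 0;
}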
void FissionReactor::updateVocabulary(const Vocabulary& v) { // first update anything in the Reactor base class that might be needed ConfigWriteLock cfg_lock(*this); Reactor::updateVocabulary(v); v.refreshTerm(m_input_event_type); v.refreshTerm(m_input_event_term); boost::mutex::scoped_lock codec_lock(m_codec_mutex); if (m_codec_ptr) m_codec_ptr->updateVocabulary(v); }
void save_word_vec(ostream& os, const Net& net, const Vocabulary& vocab) { size_t sz = net.hidden_layer_size(); const vector<Word>& words = vocab.vocab(); os << words.size() << " " << sz << endl; for (size_t i = 0; i != words.size(); ++i) { os << words[i].word; const real* v = net.get_input_vec(i); for (size_t j = 0; j != sz; ++j) { os << " " << v[j]; } os << endl; } }
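// A matching reader for the text format written by save_word_vec (header
// line "vocab_size dim", then one word plus its vector per line). This is a
// hedged sketch: the output containers here are plain std::vector/float
// stand-ins, only the file layout is taken from the writer above.
#include <istream>
#include <string>
#include <vector>
bool load_word_vec(std::istream& is, std::vector<std::string>* words,
                   std::vector<std::vector<float> >* vecs) {
  size_t n = 0, sz = 0;
  if (!(is >> n >> sz)) return false;           // header: vocab size and dimension
  words->resize(n);
  vecs->assign(n, std::vector<float>(sz));
  for (size_t i = 0; i < n; ++i) {
    if (!(is >> (*words)[i])) return false;     // the word itself
    for (size_t j = 0; j < sz; ++j)
      if (!(is >> (*vecs)[i][j])) return false; // its embedding components
  }
  return true;
}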
inline void MapBackToStr(const vector<WORD_ID>& wid, vector<WORD>& tok, Vocabulary& vocab, vector<size_t>& NT_index) {
  tok.resize(wid.size());
  NT_index.clear();
  // use size_t to match wid.size() and avoid a signed/unsigned comparison
  for (size_t i = 0; i < wid.size(); i++) {
    if (!ShouldIgnore(wid[i], vocab)) {
      tok[i] = vocab.getWord(wid[i]);
    }
    if (IsNT(wid[i], vocab)) {
      NT_index.push_back(i);
    }
  }
}
double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) {
  // unaligned word counter
  double unaligned = 1.0;
  // only checking target words - source words are caught when computing inverse
  for (size_t ti = 0; ti < alignment->alignedToT.size(); ti++) {
    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
      unaligned *= 2.718; // multiply by (approximately) e per unaligned function word
    }
  }
  return unaligned;
}
void print_set(FILE *f_inv, Vocabulary &voc, Translations &trans, Translations &occured_words) {
  int trans_no = 0;
  FOR_EACH (Translations, j, trans) {
    string_t translation = *j;
    // skip translations that were already printed
    if (occured_words.count(translation) > 0) {
      continue;
    }
    occured_words.insert(translation);
    if (trans_no > 0) {
      fprintf(f_inv, ", ");
    }
    fprintf(f_inv, "%s", translation.c_str());
    // recurse into further translations of this word, if any
    TranslationMap::iterator ref = voc.translation_map().find('\1' + translation);
    if (ref != voc.translation_map().end()) {
      fprintf(f_inv, ", ");
      print_set(f_inv, voc, ref->second, occured_words);
    }
    ++trans_no;
  }
}
void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &bestAlignment, ostream &out) {
  // output target symbols, except root, in rule table format
  for (size_t i = 0; i < phraseT.size()-1; ++i) {
    const std::string &word = vcbT.getWord(phraseT[i]);
    if (!stringToTreeFlag || !isNonTerminal(word)) {
      out << word << " ";
      continue;
    }
    // get corresponding source non-terminal and output pair
    std::set<size_t> alignmentPoints = bestAlignment.alignedToT[i];
    assert(alignmentPoints.size() == 1);
    size_t j = *(alignmentPoints.begin());
    if (inverseFlag) {
      out << word << vcbS.getWord(phraseS[j]) << " ";
    } else {
      out << vcbS.getWord(phraseS[j]) << word << " ";
    }
  }
  // output target root symbol
  out << vcbT.getWord(phraseT.back());
}
void LexicalTable::load(char *fileName) {
  cerr << "Loading lexical translation table from " << fileName;
  Bz2LineReader inFile(fileName, Bz2LineReader::UNCOMPRESSED);
  int i = 0;
  // an empty line is treated as end of input
  for (string line = inFile.readLine(); !line.empty(); line = inFile.readLine()) {
    if (++i % 100000 == 0) cerr << "." << flush;
    vector<string> token = tokenize(line.c_str());
    if (token.size() != 3) {
      cerr << "line " << i << " in " << fileName
           << " has wrong number of tokens (" << token.size() << "), skipping:\n"
           << line << endl;
      continue;
    }
    WORD_ID wordT = vcbT.storeIfNew( token[0] );
    WORD_ID wordS = vcbS.storeIfNew( token[1] );
    ltable[ wordS ][ wordT ] = strtod(token[2].c_str(), NULL);
  }
  cerr << endl;
}
int main(int argc, char * argv[]) {
  Options options = ProcessOptions(argc, argv);
  auto str_path_in = options.path_in.string();
  auto path_out = options.path_out;
  if (boost::filesystem::create_directory(path_out)) {
    std::cerr << "creating target directory\n";
  }
  provenance = std::string();
  provenance += "vocab collected on " + get_str_time() + "\n";
  provenance += "source corpus : " + str_path_in + "\n";
  std::cerr << "assigning ids\n";
  vocab.read_from_dir(str_path_in);
  provenance += "words in corpus : " + FormatHelper::ConvertToStr(vocab.cnt_words_processed) + "\n";
  provenance += "unique words : " + FormatHelper::ConvertToStr(vocab.cnt_words) + "\n";
  vocab.reduce(options.min_frequency);
  provenance += "filtered with minimal frequency: " + FormatHelper::ConvertToStr(options.min_frequency) + "\n";
  provenance += "unique words : " + FormatHelper::ConvertToStr(vocab.cnt_words) + "\n";
  std::cerr << "creating list of frequencies\n";
  vocab.freq_per_id.resize(vocab.cnt_words);
  vocab.lst_id2word.resize(vocab.cnt_words);
  std::fill(vocab.freq_per_id.begin(), vocab.freq_per_id.end(), 0);
  std::cerr << "populating frequencies\n";
  vocab.populate_frequency();
  vocab.reassign_ids(vocab.freq_per_id);
  vocab.populate_frequency();
  vocab.populate_ids();
  std::cerr << "dumping ids and frequencies\n";
  vocab.dump_ids((path_out / boost::filesystem::path("ids")).string());
  vocab.dump_frequency((path_out / boost::filesystem::path("frequencies")).string());
  write_value_to_file((path_out / boost::filesystem::path("cnt_unique_words")).string(), vocab.cnt_words);
  write_value_to_file((path_out / boost::filesystem::path("cnt_words")).string(), vocab.cnt_words_processed);
  write_vector_to_file((path_out / boost::filesystem::path("freq_per_id")).string(), vocab.freq_per_id);
  write_value_to_file((path_out / boost::filesystem::path("provenance.txt")).string(), provenance);
  return 0;
}
double computeLexicalTranslation( PHRASE &phraseS, PHRASE &phraseT, PhraseAlignment *alignment ) {
  // lexical translation probability
  double lexScore = 1.;
  int null = vcbS.getWordID("NULL");
  // all target words have to be explained
  for (size_t ti = 0; ti < alignment->alignedToT.size(); ++ti) {
    const set<size_t>& srcIndices = alignment->alignedToT[ti];
    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      lexScore *= lexTable.permissiveLookup(null, phraseT[ti]);
    } else {
      // go through all the aligned words to compute the average
      double thisWordScore = 0.;
      for (set<size_t>::const_iterator p = srcIndices.begin(); p != srcIndices.end(); ++p) {
        thisWordScore += lexTable.permissiveLookup(phraseS[*p], phraseT[ti]);
      }
      lexScore *= (thisWordScore / srcIndices.size());
    }
  }
  return lexScore;
}
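// In formula form, the loop above computes the usual lexical weight
//
//   lex(t | s, a) = prod_{i} ( 1/|A_i| ) * sum_{j in A_i} w(t_i | s_j),
//
// where A_i is the set of source positions aligned to target position i and
// unaligned target words (A_i empty) are explained by the special NULL token.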
void Codec::setConfig(const Vocabulary& v, const xmlNodePtr config_ptr) { PlatformPlugin::setConfig(v, config_ptr); // determine the type of event used by the codec std::string codec_event_str; if (! ConfigManager::getConfigOption(EVENT_ELEMENT_NAME, codec_event_str, config_ptr)) throw EmptyEventException(getId()); // find the Term reference number for the event type Vocabulary::TermRef event_type = v.findTerm(codec_event_str); if (event_type == Vocabulary::UNDEFINED_TERM_REF) throw UnknownTermException(codec_event_str); m_event_term = v[event_type]; // make sure that it is an object type Term if (m_event_term.term_type != Vocabulary::TYPE_OBJECT) throw NotAnObjectException(codec_event_str); }
// check if two word alignments between a phrase pair "match",
// i.e. they do not differ in the alignment of non-terminals
bool PhraseAlignment::match( const PhraseAlignment& other ) {
  if (other.target != target || other.source != source) return false;
  if (!hierarchicalFlag) return true;
  PHRASE phraseT = phraseTableT.getPhrase( target );
  assert(phraseT.size() == alignedToT.size() + 1);
  assert(alignedToT.size() == other.alignedToT.size());
  // loop over all words (note: 0 = left hand side of rule)
  for (size_t i = 0; i < phraseT.size()-1; ++i) {
    if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
      if (alignedToT[i].size() != 1 ||
          other.alignedToT[i].size() != 1 ||
          *(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
        return false;
    }
  }
  return true;
}
void TransformReactor::setConfig(const Vocabulary& v, const xmlNodePtr config_ptr) {
  // first set config options for the Reactor base class
  ConfigWriteLock cfg_lock(*this);
  Reactor::setConfig(v, config_ptr);

  // clear the current configuration
  m_transforms.clear();

  // Outgoing Event type -- i.e. what will the outgoing event be transformed into
  // Default (UNDEFINED_TERM_REF) -- make it the same as incoming event type
  // <OutgoingEvent>obj-term</OutgoingEvent>
  m_event_type = Vocabulary::UNDEFINED_TERM_REF;
  std::string event_type_str;
  if (ConfigManager::getConfigOption(OUTGOING_EVENT_ELEMENT_NAME, event_type_str, config_ptr)) {
    if (!event_type_str.empty())
      m_event_type = v.findTerm(event_type_str);
  }

  // This really doesn't make much sense anymore -- you can wire the delivery of the original right through
  // it would make sense, if it was possible to deliver "if-not-changed" but TR2 always changes...
  // <DeliverOriginal>always|if-not-changed|never</DeliverOriginal> -> DEFAULT: never
  m_deliver_original = DO_NEVER;
  std::string deliver_original_str;
  if (ConfigManager::getConfigOption(DELIVER_ORIGINAL_NAME, deliver_original_str, config_ptr)) {
    if (deliver_original_str == "true" || deliver_original_str == "always")
      m_deliver_original = DO_ALWAYS;
    else if (deliver_original_str == "if-not-changed")
      m_deliver_original = DO_SOMETIMES;
    // Could add code to throw if d_o_s is not "never"
  }

  // What fields/terms of the original event should be COPIED into the new event
  // <CopyOriginal>all-terms|if-not-defined|none</CopyOriginal> -> DEFAULT: if-not-defined
  m_copy_original = COPY_UNCHANGED;
  std::string copy_original_str;
  if (ConfigManager::getConfigOption(COPY_ORIGINAL_ELEMENT_NAME, copy_original_str, config_ptr)) {
    if (copy_original_str == "all-terms")
      m_copy_original = COPY_ALL;
    else if (copy_original_str == "none")
      m_copy_original = COPY_NONE;
    // Could add code to throw if c_o_s is not "if-not-defined"
  }

  // now, parse transformation rules
  // [rpt] <Transformation>
  xmlNodePtr transformation_node = config_ptr;
  while ( (transformation_node = ConfigManager::findConfigNodeByName(TRANSFORMATION_ELEMENT_NAME, transformation_node)) != NULL) {
    // parse new Transformation rule

    // get the Term used for the Transformation rule
    // <Term>src-term</Term>
    std::string term_id;
    if (! ConfigManager::getConfigOption(TERM_ELEMENT_NAME, term_id, transformation_node->children))
      throw EmptyTermException(getId());

    // make sure that the Term is valid
    const Vocabulary::TermRef term_ref = v.findTerm(term_id);
    if (term_ref == Vocabulary::UNDEFINED_TERM_REF)
      throw UnknownTermException(getId());

    // get the Type of transformation
    // <Type>AssignValue|AssignTerm|Lookup|Rules</Type>
    std::string type_str;
    if (! ConfigManager::getConfigOption(TYPE_ELEMENT_NAME, type_str, transformation_node->children))
      throw EmptyTypeException(getId()); // TODO: Improve the error message

    // Add the transformation
    const bool debug_mode = getReactionEngine().getDebugMode();
    Transform *new_transform;
    if (type_str == "AssignValue")
      new_transform = new TransformAssignValue(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "AssignTerm")
      new_transform = new TransformAssignTerm(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "Lookup")
      new_transform = new TransformLookup(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "Rules")
      new_transform = new TransformRules(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "Regex")
      new_transform = new TransformRegex(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "SplitTerm")
      new_transform = new TransformSplitTerm(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "JoinTerm")
      new_transform = new TransformJoinTerm(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "URLEncode")
      new_transform = new TransformURLEncode(v, v[term_ref], transformation_node->children, debug_mode);
    else if (type_str == "URLDecode")
      new_transform = new TransformURLDecode(v, v[term_ref], transformation_node->children, debug_mode);
    else
      throw InvalidTransformation(type_str);
    m_transforms.push_back(new_transform);

    // step to the next Transformation rule
    transformation_node = transformation_node->next;
  }
}
void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) { if (phrasePair.size() == 0) return; map<int, int> countE; map<int, int> alignmentE; int totalCount = 0; int currentCount = 0; int maxSameCount = 0; int maxSame = -1; int old = -1; for(size_t i=0; i<phrasePair.size(); i++) { if (i>0) { if (phrasePair[old].english == phrasePair[i].english) { if (! phrasePair[i].equals( phrasePair[old] )) { if (currentCount > maxSameCount) { maxSameCount = currentCount; maxSame = i-1; } currentCount = 0; } } else { // wrap up old E if (currentCount > maxSameCount) { maxSameCount = currentCount; maxSame = i-1; } alignmentE[ phrasePair[old].english ] = maxSame; // if (maxSameCount != totalCount) // cout << "max count is " << maxSameCount << "/" << totalCount << endl; // get ready for new E totalCount = 0; currentCount = 0; maxSameCount = 0; maxSame = -1; } } countE[ phrasePair[i].english ]++; old = i; currentCount++; totalCount++; } // wrap up old E if (currentCount > maxSameCount) { maxSameCount = currentCount; maxSame = phrasePair.size()-1; } alignmentE[ phrasePair[old].english ] = maxSame; // if (maxSameCount != totalCount) // cout << "max count is " << maxSameCount << "/" << totalCount << endl; // output table typedef map< int, int >::iterator II; PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign ); size_t index = 0; for(II i = countE.begin(); i != countE.end(); i++) { //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n"; //cerr << index << endl; // foreign phrase (unless inverse) if (! inverseFlag) { for(size_t j=0; j<phraseF.size(); j++) { phraseTableFile << vcbF.getWord( phraseF[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // english phrase PHRASE phraseE = phraseTableE.getPhrase( i->first ); for(size_t j=0; j<phraseE.size(); j++) { phraseTableFile << vcbE.getWord( phraseE[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; // foreign phrase (if inverse) if (inverseFlag) { for(size_t j=0; j<phraseF.size(); j++) { phraseTableFile << vcbF.getWord( phraseF[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // phrase pair frequency phraseTableFile << i->second; //source phrase pair frequency phraseTableFile << " " << phrasePair.size(); // source phrase length phraseTableFile << " " << phraseF.size(); // target phrase length phraseTableFile << " " << phraseE.size(); phraseTableFile << endl; index += i->second; } }
int main() {
  int imgNum = 300;
  vector<Mat> imgVec(imgNum);
  vector<string> nameVec(imgNum);
  vector<vector<KeyPoint> > keyPointsVec(imgNum);
  vector<Mat> descriptorsVec(imgNum);
  for (int i = 0; i < imgNum; i++) {
    char fileName[1024] = {0}; // zero-initialize the buffer ({NULL} misuses a pointer constant)
    sprintf(fileName, "/home/lili/workspace/SLAM/vocabTree/Lip6IndoorDataSet/Images/lip6kennedy_bigdoubleloop_%06d.ppm", i);
    nameVec[i] = string(fileName);
    imgVec[i] = imread(nameVec[i], CV_LOAD_IMAGE_GRAYSCALE);
  }

  //-- Step 1: Detect the keypoints using SURF Detector
  int minHessian = 400;
  SurfFeatureDetector detector(minHessian);
  SurfDescriptorExtractor extractor;
  vector<unsigned int> labels;
  for (int i = 0; i < imgNum; i++) {
    detector.detect(imgVec[i], keyPointsVec[i]);
    extractor.compute(imgVec[i], keyPointsVec[i], descriptorsVec[i]);
    // remember which image each descriptor row came from
    for (int j = 0; j < descriptorsVec[i].rows; j++) {
      labels.push_back(i);
    }
  }
  Mat all_descriptors;
  for (size_t i = 0; i < descriptorsVec.size(); i++) {
    all_descriptors.push_back(descriptorsVec[i]);
  }
  assert((int)labels.size() == all_descriptors.rows);
  cout << "all_descriptors.rows " << all_descriptors.rows << endl;

  Vocabulary vocab;
  vocab.indexedDescriptors_ = all_descriptors;

  /// add new image to the randomized kd tree
  vector<KeyPoint> newKeypoints;
  Mat newDescriptors;
  {
    string newImageName = "/home/lili/workspace/SLAM/vocabTree/Lip6IndoorDataSet/Images/lip6kennedy_bigdoubleloop_000350.ppm";
    Mat newImg = imread(newImageName, CV_LOAD_IMAGE_GRAYSCALE);
    detector.detect(newImg, newKeypoints);
    extractor.compute(newImg, newKeypoints, newDescriptors);
    cout << "newDescriptors.rows: " << newDescriptors.rows << endl;
  }
  vocab.notIndexedDescriptors_ = newDescriptors;

  /// clustering
  int clustersNum;
  Mat clusters(15000, 64, CV_32F);
  clustersNum = vocab.clustering(all_descriptors, clusters);
  cout << "clustersNum " << clustersNum << endl;

  /// flann build tree
  clock_t begin1 = clock();
  vocab.update();
  clock_t end1 = clock();
  double buildTree_time = double(end1 - begin1) / CLOCKS_PER_SEC;
  cout.precision(5);
  cout << "buildTree time " << buildTree_time << endl;

  /// QueryImage
  vector<KeyPoint> queryKeypoints;
  Mat queryDescriptors;
  {
    string queryImageName = "/home/lili/workspace/SLAM/vocabTree/Lip6IndoorDataSet/Images/lip6kennedy_bigdoubleloop_000381.ppm";
    Mat queryImg = imread(queryImageName, CV_LOAD_IMAGE_GRAYSCALE);
    detector.detect(queryImg, queryKeypoints);
    extractor.compute(queryImg, queryKeypoints, queryDescriptors);
    cout << "queryDescriptors.rows: " << queryDescriptors.rows << endl;
  }

  Mat indices;
  Mat dists;
  int k = 2;
  clock_t begin2 = clock();
  vocab.search(queryDescriptors, indices, dists, k);
  clock_t end2 = clock();
  double query_time = double(end2 - begin2) / CLOCKS_PER_SEC;
  cout.precision(5);
  cout << "query time " << query_time << endl;

  std::vector<int> indicesVec(indices.rows * indices.cols);
  if (indices.isContinuous()) {
    indicesVec.assign((int*)indices.datastart, (int*)indices.dataend);
  }
  cout << "indicesVec.size() " << indicesVec.size() << endl;

  /// Process Nearest Neighbor Distance Ratio: iterate per query row, not per
  /// flattened index (dists only has indices.rows rows), and take the best
  /// match at offset i*k in the flattened indices
  float nndRatio = 0.8;
  for (int i = 0; i < indices.rows; i++) {
    if (dists.at<float>(i, 0) < nndRatio * dists.at<float>(i, 1)) {
      int best = indicesVec[i * k];
      cout << "query descriptor " << i << " matched descriptor " << best
           << " image label " << labels[best] << endl;
    }
  }
  return 0;
}
void SearchDialog::doRemoveTerms( bool allowSelectTrans /* = true */, bool confirmBeforeRemove /* = true */ ) {
  int selectedItemCount = 0;
  // Find all the translation languages of the selected terms.
  QStringList translationLanguages;
  for( int i = 0; i < resultsListView->topLevelItemCount(); i++ ) {
    ResultListItem* termItem = (ResultListItem*)resultsListView->topLevelItem( i );
    if( termItem->isSelected() ) {
      selectedItemCount++;
      Term* term = termItem->getTerm();
      for( Term::TranslationMap::ConstIterator it = term->translationsBegin(); it != term->translationsEnd(); it++ ) {
        const Translation& trans = *it;
        if( !translationLanguages.contains( trans.getLanguage() ) )
          translationLanguages.append( trans.getLanguage() );
      }
    }
  }
  if( selectedItemCount == 0 )
    return;

  if( translationLanguages.count() <= 2 ) {
    int response;
    if( confirmBeforeRemove ) {
      QMessageBox msgBox( QObject::tr( "Warning" ), tr( "ConfirmRemoveSelectedTerms" ), QMessageBox::Warning,
        QMessageBox::Yes, QMessageBox::No | QMessageBox::Default | QMessageBox::Escape, QMessageBox::NoButton, this );
      msgBox.setButtonText( QMessageBox::Yes, tr( "Yes" ) );
      msgBox.setButtonText( QMessageBox::No, tr( "No" ) );
      response = msgBox.exec();
    }
    else
      response = QMessageBox::Yes;

    if( response == QMessageBox::Yes ) {
      // Iterate backwards so deleting items does not shift the indices
      // of the items still to be visited.
      for( int i = resultsListView->topLevelItemCount() - 1; i >= 0; i-- ) {
        ResultListItem* termItem = (ResultListItem*)resultsListView->topLevelItem( i );
        if( termItem->isSelected() ) {
          Term* term = termItem->getTerm();
          Vocabulary* vocab = controller->getVocabTree()->getVocabulary( term->getVocabId() );
          if( !term->getImagePath().isNull() ) {
            QDir imagePath( term->getImagePath() );
            if( imagePath.isRelative() ) {
              // Renamed from imagePath to avoid shadowing the QDir above.
              const QString& absoluteImagePath = controller->getApplicationDirName() + "/" + vocab->getParent()->getPath() +
                "/v-" + QString::number( vocab->getId() ) + "/" + term->getImagePath();
              QFile imageFile( absoluteImagePath );
              if( imageFile.exists() ) {
                if( !imageFile.remove() )
                  cerr << "Could not remove image " << qPrintable( absoluteImagePath ) << endl;
              }
            }
          }
          vocab->removeTerm( term->getId() );
          delete( termItem );
          vocab->setModificationDate( QDateTime::currentDateTime() );
          vocab->setDirty( true );
        }
      }
      resultsListView->clearSelection();
      updateUi();
      emit termsRemoved();
    }
  }
  else {
    int response;
    QStringList selectedLanguages;
    if( allowSelectTrans ) {
      TranslationSelectionDialog msgBox( tr( "MultipleTranslationsDetectedForRemoveTermsCaption" ),
        tr( "MultipleTranslationsDetectedForRemoveTerms" ), translationLanguages,
        TranslationSelectionDialog::selectionModeTargetLanguage, controller, this );
      msgBox.setMaximumHeight( size().height() - 40 );
      msgBox.setMaximumWidth( size().width() - 40 );
      response = msgBox.exec();
      if( response )
        selectedLanguages = msgBox.getSelectedLanguages();
    }
    else {
      selectedLanguages = QStringList();
      selectedLanguages.append( controller->getPreferences().getFirstLanguage() );
      selectedLanguages.append( controller->getPreferences().getTestLanguage() );
    }
    if( selectedLanguages.count() == 0 )
      return;

    // Iterate backwards here as well: terms without remaining translations
    // are deleted from the list view during the traversal.
    for( int i = resultsListView->topLevelItemCount() - 1; i >= 0; i-- ) {
      ResultListItem* termItem = (ResultListItem*)resultsListView->topLevelItem( i );
      if( termItem->isSelected() ) {
        Term* term = termItem->getTerm();
        Vocabulary* vocab = controller->getVocabTree()->getVocabulary( term->getVocabId() );
        for( QStringList::ConstIterator it = selectedLanguages.begin(); it != selectedLanguages.end(); it++ ) {
          QString lang = *it;
          term->removeTranslation( lang );
        }
        if( term->getTranslationCount() == 0 ) {
          if( !term->getImagePath().isNull() ) {
            QDir imagePath( term->getImagePath() );
            if( imagePath.isRelative() ) {
              const QString& absoluteImagePath = controller->getApplicationDirName() + "/" + vocab->getParent()->getPath() +
                "/v-" + QString::number( vocab->getId() ) + "/" + term->getImagePath();
              QFile imageFile( absoluteImagePath );
              if( imageFile.exists() ) {
                if( !imageFile.remove() )
                  cerr << "Could not remove image " << qPrintable( absoluteImagePath ) << endl;
              }
            }
          }
          vocab->removeTerm( term->getId() );
          delete( termItem );
          vocab->setModificationDate( QDateTime::currentDateTime() );
          vocab->setDirty( true );
        }
      }
    }
    resultsListView->clearSelection();
    updateUi();
  }
}
reg_t kParse(EngineState *s, int argc, reg_t *argv) { SegManager *segMan = s->_segMan; reg_t stringpos = argv[0]; Common::String string = s->_segMan->getString(stringpos); char *error; reg_t event = argv[1]; g_sci->checkVocabularySwitch(); Vocabulary *voc = g_sci->getVocabulary(); voc->parser_event = event; reg_t params[2] = { s->_segMan->getParserPtr(), stringpos }; ResultWordListList words; bool res = voc->tokenizeString(words, string.c_str(), &error); voc->parserIsValid = false; /* not valid */ if (res && !words.empty()) { voc->synonymizeTokens(words); s->r_acc = make_reg(0, 1); #ifdef DEBUG_PARSER debugC(kDebugLevelParser, "Parsed to the following blocks:"); for (ResultWordListList::const_iterator i = words.begin(); i != words.end(); ++i) { debugCN(2, kDebugLevelParser, " "); for (ResultWordList::const_iterator j = i->begin(); j != i->end(); ++j) { debugCN(2, kDebugLevelParser, "%sType[%04x] Group[%04x]", j == i->begin() ? "" : " / ", j->_class, j->_group); } debugCN(2, kDebugLevelParser, "\n"); } #endif voc->replacePronouns(words); int syntax_fail = voc->parseGNF(words); if (syntax_fail) { s->r_acc = make_reg(0, 1); writeSelectorValue(segMan, event, SELECTOR(claimed), 1); invokeSelector(s, g_sci->getGameObject(), SELECTOR(syntaxFail), argc, argv, 2, params); /* Issue warning */ debugC(kDebugLevelParser, "Tree building failed"); } else { voc->parserIsValid = true; voc->storePronounReference(); writeSelectorValue(segMan, event, SELECTOR(claimed), 0); #ifdef DEBUG_PARSER voc->dumpParseTree(); #endif } } else { s->r_acc = make_reg(0, 0); writeSelectorValue(segMan, event, SELECTOR(claimed), 1); if (error) { s->_segMan->strcpy(s->_segMan->getParserPtr(), error); debugC(kDebugLevelParser, "Word unknown: %s", error); /* Issue warning: */ invokeSelector(s, g_sci->getGameObject(), SELECTOR(wordFail), argc, argv, 2, params); free(error); return make_reg(0, 1); /* Tell them that it didn't work */ } } return s->r_acc; }
void FissionReactor::setConfig(const Vocabulary& v, const xmlNodePtr config_ptr) { // first set config options for the Reactor base class ConfigWriteLock cfg_lock(*this); Reactor::setConfig(v, config_ptr); // get the input event type std::string config_str; if (! ConfigManager::getConfigOption(INPUT_EVENT_TYPE_ELEMENT_NAME, config_str, config_ptr)) throw EmptyInputEventTypeException(getId()); // find vocabulary term for input event type Vocabulary::TermRef term_ref = v.findTerm(config_str); if (term_ref == Vocabulary::UNDEFINED_TERM_REF) throw UnknownTermException(config_str); m_input_event_type = v[term_ref]; // make sure that term is object/event type if (m_input_event_type.term_type != Vocabulary::TYPE_OBJECT) throw NotAnObjectException(config_str); // get the input event term if (! ConfigManager::getConfigOption(INPUT_EVENT_TERM_ELEMENT_NAME, config_str, config_ptr)) throw EmptyInputEventTermException(getId()); // find vocabulary term for input event term term_ref = v.findTerm(config_str); if (term_ref == Vocabulary::UNDEFINED_TERM_REF) throw UnknownTermException(config_str); m_input_event_term = v[term_ref]; // only string types are currently supported for input event term switch (m_input_event_term.term_type) { case Vocabulary::TYPE_NULL: case Vocabulary::TYPE_OBJECT: case Vocabulary::TYPE_INT8: case Vocabulary::TYPE_INT16: case Vocabulary::TYPE_INT32: case Vocabulary::TYPE_UINT8: case Vocabulary::TYPE_UINT16: case Vocabulary::TYPE_UINT32: case Vocabulary::TYPE_INT64: case Vocabulary::TYPE_UINT64: case Vocabulary::TYPE_FLOAT: case Vocabulary::TYPE_DOUBLE: case Vocabulary::TYPE_LONG_DOUBLE: case Vocabulary::TYPE_DATE_TIME: case Vocabulary::TYPE_DATE: case Vocabulary::TYPE_TIME: throw TermNotStringException(config_str); break; case Vocabulary::TYPE_SHORT_STRING: case Vocabulary::TYPE_STRING: case Vocabulary::TYPE_LONG_STRING: case Vocabulary::TYPE_CHAR: case Vocabulary::TYPE_BLOB: case Vocabulary::TYPE_ZBLOB: break; // these are all OK } // get the codec to use boost::mutex::scoped_lock codec_lock(m_codec_mutex); if (! ConfigManager::getConfigOption(CODEC_ELEMENT_NAME, m_codec_id, config_ptr)) throw EmptyCodecException(getId()); m_codec_ptr = getCodecFactory().getCodec(m_codec_id); PION_ASSERT(m_codec_ptr); codec_lock.unlock(); // check if we should copy all terms from original event m_copy_all_terms = false; std::string copy_all_terms_str; if (ConfigManager::getConfigOption(COPY_ALL_TERMS_ELEMENT_NAME, copy_all_terms_str, config_ptr)) { if (copy_all_terms_str == "true") m_copy_all_terms = true; } // get list of terms to copy from original event m_copy_terms.clear(); xmlNodePtr copy_term_node = config_ptr; while ((copy_term_node = ConfigManager::findConfigNodeByName(COPY_TERM_ELEMENT_NAME, copy_term_node)) != NULL) { xmlChar *xml_char_ptr = xmlNodeGetContent(copy_term_node); if (xml_char_ptr != NULL) { const std::string copy_term_str(reinterpret_cast<char*>(xml_char_ptr)); xmlFree(xml_char_ptr); if (! copy_term_str.empty()) { // find the term in the Vocabulary term_ref = v.findTerm(copy_term_str); if (term_ref == Vocabulary::UNDEFINED_TERM_REF) throw UnknownTermException(copy_term_str); // add it to the copy terms collection m_copy_terms.push_back(v[term_ref]); } } // step to the next copy term copy_term_node = copy_term_node->next; } }
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile ) {
  if (phrasePair.size() == 0) return;

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );

  // compute count
  float count = 0;
  for (size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }

  // collect count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    totalDistinct++;
    int countInt = count + 0.99999;
    if (countInt <= COC_MAX)
      countOfCounts[ countInt ]++;
  }

  // compute PCFG score (initialized so it is never read uninitialized)
  float pcfgScore = 0.0f;
  if (pcfgFlag && !inverseFlag) {
    float pcfgSum = 0;
    for (size_t i=0; i<phrasePair.size(); ++i) {
      pcfgSum += phrasePair[i]->pcfgSum;
    }
    pcfgScore = pcfgSum / count;
  }

  // output phrases
  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // do not output if hierarchical and count below threshold
  if (hierarchicalFlag && count < minCountHierarchical) {
    for (size_t j=0; j<phraseS.size()-1; j++) {
      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
        return;
    }
  }

  // source phrase (unless inverse)
  if (! inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
  }

  // unaligned word penalty
  if (unalignedFlag) {
    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  // unaligned function word penalty
  if (unalignedFWFlag) {
    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  // target-side PCFG score
  if (pcfgFlag && !inverseFlag) {
    phraseTableFile << " " << pcfgScore;
  }

  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
      for (size_t j = 0; j < phraseT.size() - 1; j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          if (bestAlignment->alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            phraseTableFile.flush();
            assert(bestAlignment->alignedToT[ j ].size() == 1);
          }
          size_t sourcePos = *(bestAlignment->alignedToT[ j ].begin());
          phraseTableFile << sourcePos << "-" << j << " ";
        }
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model
      for (size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment->alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }

  // counts
  phraseTableFile << " ||| " << totalCount << " " << count;
  if (kneserNeyFlag)
    phraseTableFile << " " << distinctCount;

  // nt lengths
  if (outputNTLengths) {
    phraseTableFile << " ||| ";
    if (!inverseFlag) {
      // 1st = sourcePos, 2nd = length, 3rd = prob
      map<size_t, map<size_t, float> > sourceProb, targetProb;
      calcNTLengthProb(phrasePair, sourceProb, targetProb);
      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
      outputNTLengthProbs(phraseTableFile, targetProb, "T");
    }
  }

  phraseTableFile << endl;
}
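// An illustrative (made-up) line in the emitted phrase-table format, for the
// non-hierarchical, non-inverse case with lexFlag and wordAlignmentFlag set:
//
//   der kleine hund ||| the little dog ||| 0.0067 ||| 0-0 1-1 2-2 ||| 10 2
//
// i.e. source ||| target ||| scores ||| alignment ||| totalCount count,
// followed by distinctCount when kneserNeyFlag is set.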