void PinyinPhraseLib::create_pinyin_index () { if (!m_pinyin_table || !m_pinyin_table->size()) return; clear_phrase_index (); uint32 pinyin_offset = 0; WideString content; Phrase phrase; for (uint32 i=0; i<m_phrase_lib.number_of_phrases (); i++) { phrase = m_phrase_lib.get_phrase_by_index (i); content = phrase.get_content (); std::vector<PinyinKeyVector> key_vv; m_pinyin_table->find_key_strings (key_vv, content); for (uint32 j=0; j<key_vv.size(); j++) { for (uint32 k=0; k<key_vv[j].size(); k++) m_pinyin_lib.push_back (key_vv[j][k]); insert_pinyin_phrase_into_index (phrase.get_phrase_offset (), pinyin_offset); pinyin_offset = m_pinyin_lib.size (); } #if 0 if (key_vv.size () > 1 && content.length () > 1) { for (uint32 x=0; x<key_vv.size (); x++) { std::cerr << phrase.frequency () << "\t| " << utf8_wcstombs (content) << " ="; for (uint32 y=0; y<key_vv[x].size (); y++) std::cerr << " " << key_vv[x][y]; std::cerr << "\n"; } } #endif std::cout << "." << std::flush; } sort_phrase_tables (); std::cout << "Phrase Number = " << count_phrase_number () << "\n"; }
bool PinyinPhraseLib::insert_phrase_into_index (const Phrase &phrase, const PinyinKeyVector &keys) { if (!phrase.valid ()) return false; // First find out all of the chars which have no valid key in keys. WideString content = phrase.get_content (); WideString nokey_content; PinyinKeyVector final_keys; std::vector<uint32> content_state; std::vector<PinyinKeyVector> key_vv; uint32 pinyin_offset = m_pinyin_lib.size (); uint32 i,j,k; for (i=0; i<content.length (); ++i) { if (i < keys.size () && keys [i].get_initial () != SCIM_PINYIN_ZeroInitial && keys [i].get_final () != SCIM_PINYIN_ZeroFinal) { //This key is valid, store it into final_key. final_keys.push_back (keys [i]); content_state.push_back (1); } else { //This key is invalid, put the content into nokey_content, //and store a zero key into final_keys, //and store the position into invalid_key_pos. nokey_content.push_back (content [i]); final_keys.push_back (PinyinKey ()); content_state.push_back (0); } } if (nokey_content.length ()) m_pinyin_table->find_key_strings (key_vv, nokey_content); else key_vv.push_back (PinyinKeyVector ()); std::sort (m_phrases [content.length () -1].begin (), m_phrases [content.length () -1].end (), PinyinKeyExactLessThan ()); if (m_pinyin_lib.capacity () < m_pinyin_lib.size () + key_vv.size () * content.length ()) m_pinyin_lib.reserve (m_pinyin_lib.size () + key_vv.size () * content.length () + 1); for (i=0; i<key_vv.size(); ++i) { for (j=0, k=0; j<content.length (); ++j) { if (content_state [j]) m_pinyin_lib.push_back (final_keys [j]); else m_pinyin_lib.push_back (key_vv [i][k++]); } insert_pinyin_phrase_into_index (phrase.get_phrase_offset (), pinyin_offset); pinyin_offset = m_pinyin_lib.size (); } std::sort (m_phrases [content.length () -1].begin (), m_phrases [content.length () -1].end (), m_pinyin_key_less); return true; }