// Prints a one-line description of this parser to stdout: the caller-supplied
// `intro` prefix, the unit name selected by Type.Val (0, 1 or 2) and the
// alphabet size. For Type.Val == 2 every key of WordToIdH is additionally
// printed in square brackets on a second line. Any other Type.Val prints nothing.
void TStrParser::WhoAmI(const TStr& intro) const {
  const int Kind = Type.Val;
  if (Kind == 0) {
    printf("%s: Words [AlphN = %d]\n", intro.CStr(), GetAlphabetSize());
  } else if (Kind == 1) {
    printf("%s: Syllabels [AlphN = %d]\n", intro.CStr(), GetAlphabetSize());
  } else if (Kind == 2) {
    printf("%s: Chararcters [AlphN = %d]\n", intro.CStr(), GetAlphabetSize());
    // Dump the whole alphabet, one bracketed key after another.
    int KeyId = 0;
    while (KeyId < WordToIdH.Len()) {
      printf("[%s] ", WordToIdH.GetKey(KeyId).CStr());
      ++KeyId;
    }
    printf("\n");
  }
}
// Fills WeightV with one weight per alphabet entry:
//   log(DocsParsed / WordToIdH[AlphC]), divided by the largest such value,
// so that the largest weight becomes 1.
// NOTE(review): WordToIdH[AlphC] is presumably the number of documents that
// contain entry AlphC (which would make this an IDF weight) -- confirm.
//
// WeightV is always re-generated with GetAlphabetSize() elements.
// Two degenerate cases are handled explicitly:
//   * empty alphabet: WeightV is left empty (there is no maximum to look up);
//   * largest weight is exactly 0: the weights are returned un-normalised,
//     because dividing by 0 would turn them into NaN.
void TStrParser::GetIDFWeightV(TFltV& WeightV) {
  const int AlphN = GetAlphabetSize();
  WeightV.Gen(AlphN);
  // Indexing an empty vector with the position of its maximum is invalid.
  if (AlphN <= 0) { return; }

  for (int AlphC = 0; AlphC < AlphN; AlphC++) {
    WeightV[AlphC] = log((double)DocsParsed / WordToIdH[AlphC]);
  }

  const double MaxVal = WeightV[WeightV.GetMxValN()];
  // Exact zero check on purpose: it only guards the division below.
  if (MaxVal == 0.0) { return; }

  for (int AlphC = 0; AlphC < AlphN; AlphC++) {
    WeightV[AlphC] /= MaxVal;
  }
}
void CTrieHolder::UpdatePossibleOutputSymbols (const yset<size_t>& CurrentStates, yvector<bool>& PossibleOutputSymbols) const { PossibleOutputSymbols.resize(GetAlphabetSize(), false); for (yset<size_t>::const_iterator it = CurrentStates.begin(); it != CurrentStates.end(); it++ ) { UpdatePossibleOutputSymbolsbyOnState(*it, PossibleOutputSymbols); for (int r = m_Nodes[(*it)].m_FailureFunction; r != -1; r = m_Nodes[r].m_FailureFunction) UpdatePossibleOutputSymbolsbyOnState(r, PossibleOutputSymbols); }; };
void CTrieHolder::ConvertAuxChildrenToNormal() { m_Children.clear(); m_Children.reserve(m_ChildrenAux.size()); for (size_t NodeNo=0; NodeNo < m_Nodes.size(); NodeNo++) { m_Nodes[NodeNo].m_ChildrenIndex = m_Children.size(); for (size_t i=0; i<GetAlphabetSize(); i++) if (GetChildrenAux(NodeNo)[i] != -1) { CTrieRelation R; R.m_ChildNo = GetChildrenAux(NodeNo)[i]; R.m_RelationChar = i; m_Children.push_back(R); }; }; m_ChildrenAux.clear(); };
//#pragma optimize( "", off ) void CTrieHolder::CreateChildrenSequence(CTSI begin, CTSI end, size_t ParentNo, size_t WorkRuleNo) { assert (begin < end); // creating a child CTrieNode T; T.m_Parent = ParentNo; T.m_Depth = m_Nodes[ParentNo].m_Depth+1; T.m_IncomingSymbol = *begin; assert (T.m_IncomingSymbol < (int)GetAlphabetSize()); AddNode(T); // registering this child size_t ChildNo = m_Nodes.size() - 1; assert (GetChildrenAux(ParentNo)[T.m_IncomingSymbol] == -1); GetChildrenAux(ParentNo)[T.m_IncomingSymbol] = ChildNo; // inserting the next child if (end - begin > 1) CreateChildrenSequence(begin+1, end, ChildNo, WorkRuleNo); else m_Nodes[ChildNo].m_GrammarRuleNo = WorkRuleNo; };
void CTrieHolder::CreateTrie(const yset< CWorkRule >& Patterns) { assert(!Patterns.empty()); m_Nodes.clear(); m_ChildrenAux.clear(); m_Nodes.reserve(2*Patterns.size()); m_ChildrenAux.reserve(2*Patterns.size()*GetAlphabetSize()); // inserting root AddNode(CTrieNode ()); yset< CWorkRule >::const_iterator iter, prev_iter; iter = prev_iter = Patterns.begin(); size_t RuleNo = 0; CreateChildrenSequence(iter->m_RightPart.m_Items.begin(), iter->m_RightPart.m_Items.end(), 0, RuleNo); RuleNo++; for (iter++; iter != Patterns.end(); iter++, RuleNo++) { const CWorkRule& P = *iter; assert (!P.m_RightPart.m_Items.empty()); // Starter should be the node of the previous pattern, from which we should start // current sequence. // Example1: // Previous = abcd // Current = abd // We have graph (1) -a-> (2) -b-> (3) -c-> (4) -d-> (5) // Starter should be pointed to node 3. // Example2: // Previous = abc // Current = abcd // We have graph (1) -a-> (2) -b-> (3) -c-> (4) // Starter should be pointed to node 4. size_t Starter = 0; size_t CharNo =0; for (; CharNo < P.m_RightPart.m_Items.size(); CharNo++) { if ((CharNo == prev_iter->m_RightPart.m_Items.size()) || (P.m_RightPart.m_Items[CharNo] != (*prev_iter).m_RightPart.m_Items[CharNo]) ) break; Starter = GetChildrenAux(Starter)[P.m_RightPart.m_Items[CharNo]]; assert ((int)Starter != -1); }; if (CharNo < P.m_RightPart.m_Items.size()) { CreateChildrenSequence(P.m_RightPart.m_Items.begin()+CharNo, P.m_RightPart.m_Items.end(), Starter, RuleNo); } else { assert (P.m_RightPart.m_Items.size() == prev_iter->m_RightPart.m_Items.size()); // a grammar can has structural ambiguity, which causes dublicates in patterns //ErrorMessage( "a dublicate is found"); }; prev_iter = iter; }; ConvertAuxChildrenToNormal(); };
// Appends a copy of T to m_Nodes and grows m_ChildrenAux by GetAlphabetSize()
// entries, all set to -1, so that the new node has one auxiliary slot per symbol.
void CTrieHolder::AddNode(const CTrieNode& T)
{
    m_Nodes.push_back(T);

    const size_t SlotCount = GetAlphabetSize();
    m_ChildrenAux.insert(m_ChildrenAux.end(), SlotCount, -1);
}