unsigned CShuangpinSegmentor::_push (unsigned ch) { int startFrom = 0; bool isInputPy; EShuangpinType shpType; m_pystr.push_back (ch); const int len = m_pystr.size(); if (m_hasInvalid) { startFrom = len - 1; m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID)); goto RETURN; } shpType = s_shpData.getShuangpinType(); isInputPy = ( islower(ch) || (ch == ';' && (shpType == MS2003 || shpType == ZIGUANG)) ); if (!isInputPy) { startFrom = len - 1; IPySegmentor::ESegmentType seg_type; if (ch == '\'' && m_inputBuf.size() > 1) seg_type = IPySegmentor::SYLLABLE_SEP; else seg_type = IPySegmentor::STRING; m_segs.push_back (TSegment (ch, startFrom, 1, seg_type)); m_nAlpha += 1; m_nLastValidPos += 1; } else { bool bCompleted = !((len - m_nAlpha)%2) && isInputPy; char buf[4]; if (bCompleted) { sprintf(buf, "%c%c", m_pystr[len-2], ch); } else { sprintf(buf, "%c", ch); } startFrom = _encode(buf, ch, bCompleted); if (startFrom < 0) { m_hasInvalid = true; startFrom = m_pystr.size() - 1; m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID)); } } RETURN:; if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled()) if ( m_segs.back().m_type == SYLLABLE) _addFuzzySyllables (m_segs.back ()); return startFrom; }
// TOTEST unsigned CHunpinSegmentor::_push (unsigned ch) { m_pystr.push_back (ch); TSegmentVec::iterator ite = m_segs.size() > 0 ? m_segs.end() - 1 : m_segs.begin() - 1; const unsigned maxStringCount = 6; unsigned syllableCount = 0; unsigned stringCount = 0; for(; ite != m_segs.begin() - 1 ; ite --) { stringCount += (*ite).m_len; syllableCount ++; if (stringCount > maxStringCount) { syllableCount --; break; } } unsigned strlen = m_pystr.size(); unsigned ret; for(int index = syllableCount ; index >= 0; index --) { TSegmentVec::iterator it = m_segs.end() - index; unsigned tmpl; unsigned v; if(index != 0) { if((strlen - (*it).m_start) == 2) { char buf[4]; sprintf(buf, "%c%c", m_pystr[(*it).m_start], m_pystr[(*it).m_start+1]); int startFrom = _encode(buf); if(startFrom >= 0) break; } v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rbegin() + strlen - (*it).m_start, tmpl); if(tmpl == (strlen - (*it).m_start)) { TSegmentVec new_segs(1, TSegment(v, (*it).m_start, tmpl)); m_segs.erase (m_segs.end()-index, m_segs.end()); std::copy (new_segs.rbegin(), new_segs.rend(), back_inserter (m_segs)); break; } } else { v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rbegin() + 1, tmpl); if(tmpl == 0) { IPySegmentor::ESegmentType seg_type; if (ch == '\'' && m_inputBuf.size() > 1) { seg_type = IPySegmentor::SYLLABLE_SEP; } else if (islower (ch)) { seg_type = IPySegmentor::INVALID; } else { seg_type = IPySegmentor::STRING; } ret = m_pystr.size () - 1; m_segs.push_back (TSegment (ch, ret, 1, seg_type)); } else { ret = m_pystr.size () - 1; m_segs.push_back (TSegment (v, ret, 1)); } } } TSegment &last_seg = m_segs.back(); if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled()) if ( m_segs.back().m_type == SYLLABLE) _addFuzzySyllables (last_seg); return last_seg.m_start; }