Ejemplo n.º 1
0
unsigned CShuangpinSegmentor::_push (unsigned ch)
{
    int startFrom = 0;
    bool isInputPy;
    EShuangpinType shpType;

    m_pystr.push_back (ch);
    const int len = m_pystr.size();
    if (m_hasInvalid) {
        startFrom = len - 1;
        m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID));
        goto RETURN;
    }

    shpType = s_shpData.getShuangpinType();
    isInputPy = ( islower(ch) ||
                   (ch == ';' && (shpType == MS2003 || shpType == ZIGUANG)) );
    
    if (!isInputPy) { 
        startFrom = len - 1;
        
        IPySegmentor::ESegmentType seg_type;
        if (ch == '\'' && m_inputBuf.size() > 1)
            seg_type = IPySegmentor::SYLLABLE_SEP;
        else
            seg_type = IPySegmentor::STRING;
        m_segs.push_back (TSegment (ch, startFrom, 1, seg_type));
        m_nAlpha += 1;
        m_nLastValidPos += 1;
    } else {
        bool bCompleted = !((len - m_nAlpha)%2) && isInputPy;
        char buf[4];
        if (bCompleted) {
            sprintf(buf, "%c%c", m_pystr[len-2], ch);
        } else {
            sprintf(buf, "%c", ch);
        }
        startFrom = _encode(buf, ch, bCompleted);
        if (startFrom < 0) {
            m_hasInvalid = true;
            startFrom = m_pystr.size() - 1;
            m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID));
        }
    }

RETURN:;

    if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
        if ( m_segs.back().m_type == SYLLABLE)
            _addFuzzySyllables (m_segs.back ());

    return startFrom;
}
Ejemplo n.º 2
0
// TOTEST
unsigned CHunpinSegmentor::_push (unsigned ch)
{
	m_pystr.push_back (ch);
	
	TSegmentVec::iterator ite =  m_segs.size() > 0 ? m_segs.end() - 1 : m_segs.begin() - 1;
	const unsigned maxStringCount = 6;
	unsigned syllableCount = 0;
	unsigned stringCount = 0;
	for(; ite != m_segs.begin() - 1 ; ite --) {
		stringCount += (*ite).m_len;
		syllableCount ++;
		if (stringCount > maxStringCount) {
			syllableCount --;
			break;
		}
	}

	unsigned strlen = m_pystr.size();
	unsigned ret;

	for(int index = syllableCount ; index >= 0; index --) {

		TSegmentVec::iterator it = m_segs.end() - index;
		unsigned tmpl;
		unsigned v;
		if(index != 0) {
			
			if((strlen - (*it).m_start) == 2) {
				char buf[4];
				sprintf(buf, "%c%c", m_pystr[(*it).m_start], m_pystr[(*it).m_start+1]);
				int startFrom = _encode(buf);
				if(startFrom >= 0)  break;
			}
			
			v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rbegin() + strlen - (*it).m_start, tmpl);

			if(tmpl == (strlen - (*it).m_start)) {
				TSegmentVec new_segs(1, TSegment(v, (*it).m_start, tmpl));
				m_segs.erase (m_segs.end()-index, m_segs.end());
				std::copy (new_segs.rbegin(), new_segs.rend(), back_inserter (m_segs));

				break;
			}
		}
		else {
			v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rbegin() + 1, tmpl);
			if(tmpl == 0) {
				IPySegmentor::ESegmentType seg_type;
				if (ch == '\'' && m_inputBuf.size() > 1) {

					seg_type = IPySegmentor::SYLLABLE_SEP;
				}
				else if (islower (ch)) {

					seg_type = IPySegmentor::INVALID;
				}
				else {

					seg_type = IPySegmentor::STRING;
				}
				ret = m_pystr.size () - 1;
				m_segs.push_back (TSegment (ch, ret, 1, seg_type));

			}
			else {
				ret = m_pystr.size () - 1;
				m_segs.push_back (TSegment (v, ret, 1));				
			}
		}
	}	
	
	TSegment &last_seg = m_segs.back();
	if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
        if ( m_segs.back().m_type == SYLLABLE)
            _addFuzzySyllables (last_seg);
	
	return last_seg.m_start;

}