Пример #1
0
/*
 *   "int isValidVariableAndNotReserved(char *str)"
 *   Decide whether a token may be used as a variable name.
 *   Returns 0 for a NULL pointer, a reserved word, a numeric string, or a
 *   one-character token whose charType() is 3.
 *   Returns 1 when the first character has charType() 2 (a letter, going by
 *   the original comments) and the length does not exceed MAX_VAR_LEN.
 *   Every other token is reported with printError(err31, str) and the
 *   program exits with EXIT_FAILURE.
 *   Only the first character and the total length are examined here; the
 *   characters in the middle of the token are not inspected.
 */
int isValidVariableAndNotReserved(char *str){

    // nothing to validate
    if (str == NULL) {
        return 0;
    }

    // reserved words and numeric strings are never variable names
    if (isReserverdWord(str) || stringIsNumber(str)) {
        return 0;
    }

    int nameLength = strlen(str);

    // a lone character of type 3 is rejected quietly, without an error
    if (nameLength == 1 && charType(str[0]) == 3) {
        return 0;
    }

    // the name must begin with a type 2 character and fit the size limit
    int beginsCorrectly = (charType(str[0]) == 2);
    int fitsLimit = !(nameLength > MAX_VAR_LEN);
    if (!beginsCorrectly || !fitsLimit) {
        printError(err31, str);
        exit(EXIT_FAILURE);
    }

    return 1;
}
Пример #2
0
/*
 *   "void splitInputTokens(char cleanSrc[], char *caCleanInputTokens[])"
 *   Walk cleanSrc from index 0 through m_nCleanCount and cut it into tokens.
 *   Characters with a non-zero charType() are gathered in a local buffer.
 *   The buffer is flushed as one token whenever the following character is
 *   special, has charType() 0, or the character just gathered was special.
 *   A flushed token is copied into storage obtained from
 *   cleanInputTokenCalloc() and stored at
 *   caCleanInputTokens[m_nCleanInputTokens], which is then incremented.
 *   NOTE(review): nothing here limits a token to MAX_STR characters or the
 *   token array to its capacity - confirm the callers guarantee both.
 */
void splitInputTokens(char cleanSrc[], char *caCleanInputTokens[]){

    char pending[MAX_STR] = " ";
    int pendingLen = 0;
    int lastWasSpecial = 0;
    int pos = 0;

    while (pos <= m_nCleanCount) {

        // gather a non-empty character, remember whether it was special,
        // and step past it
        if (charType(cleanSrc[pos])) {
            lastWasSpecial = isSpecialChar(cleanSrc[pos]);
            pending[pendingLen] = cleanSrc[pos];
            pendingLen++;
            pos++;
        }

        // cleanSrc[pos] is now the character after the one gathered, or the
        // empty character the scan stopped on
        int tokenEnds = isSpecialChar(cleanSrc[pos])
                        || (charType(cleanSrc[pos]) == 0)
                        || lastWasSpecial;
        if (!tokenEnds) {
            continue;
        }

        // store whatever has been gathered so far as one token
        if (pendingLen != 0) {
            caCleanInputTokens[m_nCleanInputTokens] = cleanInputTokenCalloc(pendingLen);
            strcpy(caCleanInputTokens[m_nCleanInputTokens], pending);
            m_nCleanInputTokens++;
        }

        // start the next token from an all-zero buffer
        memset(pending, 0, sizeof(pending));
        pendingLen = 0;

        // an empty character is consumed here; any other character is
        // gathered by the next pass of the loop
        if (charType(cleanSrc[pos]) == 0) {
            pos++;
        }
    }

}
Пример #3
0
// -----------------------------------------------------------------------------
// CWPRoot::VisitL
// Visits one parameter of the characteristic currently held in iCharStack and
// records it as a potential link target or as a link source.
// Does nothing when there is no current characteristic.
// -----------------------------------------------------------------------------
//
void CWPRoot::VisitL( CWPParameter& aParameter )
    {
    FLOG( _L( "[Provisioning] CWPRoot::VisitL Parameter:" ) );

    if( !iCharStack )
        {
        return;
        }

    const TInt ownerType = iCharStack->Type();
    const TInt id = aParameter.ID();

    // ID of a logical proxy or of an access point: a potential link target
    const bool namesTarget =
        ( ownerType == KWPPxLogical && id == EWPParameterProxyID )
        || ( ownerType == KWPNapDef && id == EWPParameterNapID );

    // Internet-capable NAPDEF
    const bool internetNap =
        ( ownerType == KWPNapDef && id == EWPParameterInternet );

    // Parameter that refers to another characteristic: a link source
    const bool namesSource =
        id == EWPParameterToNapID
        || id == EWPParameterToProxy
        || ( ownerType == KWPBootstrap && id == EWPParameterProxyID );

    if( namesTarget )
        {
        // Only the ID is stored here; iProviders is not touched
        iProviderIDs->AppendL( aParameter.Value() );
        }
    else if( internetNap )
        {
        // The characteristic is listed under the fixed KInternet ID
        User::LeaveIfError( iProviders.Append( iCharStack ) );
        iProviderIDs->AppendL( KInternet );
        }
    else if( namesSource )
        {
        // Remember the wanted ID together with the characteristic asking
        iNeededIDs->AppendL( aParameter.Value() );
        User::LeaveIfError( iNeeders.Append( iCharStack ) );
        }
    }
Пример #4
0
// -----------------------------------------------------------------------------
// CWPRoot::VisitL
// Visits a characteristic: makes it the current one (iCharStack) while its
// contents are visited, then restores the previous current characteristic.
// -----------------------------------------------------------------------------
//
void CWPRoot::VisitL( CWPCharacteristic& aCharacteristic )
    {
    FLOG( _L( "[Provisioning] CWPRoot::VisitL Char:" ) );

    const TInt type = aCharacteristic.Type();

    // Make this characteristic current, remembering the previous one
    CWPCharacteristic* const previous = iCharStack;
    iCharStack = &aCharacteristic;

    // Every characteristic is entered with this object as the visitor
    aCharacteristic.AcceptL( *this );

    // For a logical proxy or access point: when the ID list has ended up
    // exactly one entry ahead of the provider list, add this characteristic
    // to the provider list so that the two lists have equal length again
    if( ( type == KWPPxLogical || type == KWPNapDef )
        && iProviders.Count() == iProviderIDs->Count() - 1 )
        {
        User::LeaveIfError( iProviders.Append( &aCharacteristic ) );
        }

    // Restore the previous current characteristic
    iCharStack = previous;
    }
Пример #5
0
/*
 * eatSpace: move mark forward (or backward when reverse is set) for as long
 * as the character under it is classified as BLOCK_WHITESPACE.
 *
 * When incrementMark()/decrementMark() returns NULL the scan carries on with
 * nextLine()/prevLine(), unless EditFlags.OperatorWantsMove is set, in which
 * case NULL is returned at once.
 *
 * Returns the pointer for the final mark position. The result is NULL when
 * the mark has no pointer to begin with, or when nextLine()/prevLine()
 * returns NULL.
 */
static char *eatSpace( i_mark *mark, bool reverse )
{
    char        *cur;

    cur = ptrFromMark( mark );
    if( cur == NULL ) {
        return( NULL );
    }
    while( charType( *cur, true ) == BLOCK_WHITESPACE ) {
        cur = reverse ? decrementMark( mark ) : incrementMark( mark );
        if( cur != NULL ) {
            continue;
        }
        /* could not step any further with the mark functions */
        if( EditFlags.OperatorWantsMove ) {
            return( NULL );
        }
        cur = reverse ? prevLine( mark ) : nextLine( mark );
        if( cur == NULL ) {
            break;
        }
    }
    return( cur );

} /* eatSpace */
Пример #6
0
/**
 * Handler for the character that follows a '/'.
 *
 * Every recognised character class first calls pop(); what happens next
 * depends on the class of c:
 *  - '*' class:   the block-comment handler is pushed, no '/' is added
 *  - '/' class:   the line-comment handler is pushed, no '/' is added
 *  - token class: '/' and c are both added
 *  - newline, whitespace, quote, special: '/' alone is added and m_emit is set
 * Any other class leaves the tokeniser untouched.
 *
 * @param c the character to classify
 * @return always true
 */
bool ScriptTokeniser::tokeniseSolidus (char c)
{
	switch (charType(c)) {
	case eCharStar:
		pop();
		push(Tokenise(&ScriptTokeniser::tokeniseBlockComment));
		break; // the slash is not emitted
	case eCharSolidus:
		pop();
		push(Tokenise(&ScriptTokeniser::tokeniseComment));
		break; // the slash is not emitted
	case eCharToken:
		// the slash and the character after it are both added
		pop();
		add('/');
		add(c);
		break;
	case eCharSpecial:
	case eCharQuote:
	case eWhitespace:
	case eNewline:
		// a slash on its own
		pop();
		add('/');
		m_emit = true;
		break;
	default:
		break;
	}
	return true;
}
Пример #7
0
/**
 * Handler that picks the next handler from the class of c.
 *
 *  - '/' class:      pushes the solidus handler, c is not added
 *  - quote class:    pushes the quoted-token handler, c is not added
 *  - special class:  pushes the special handler and adds c
 *  - token or '*':   pushes the token handler and adds c
 * Newlines and every other class (whitespace included) do nothing.
 *
 * @param c the character to classify
 * @return always true
 */
bool ScriptTokeniser::tokeniseDefault (char c)
{
	switch (charType(c)) {
	case eCharSolidus:
		push(Tokenise(&ScriptTokeniser::tokeniseSolidus));
		break;
	case eCharQuote:
		push(Tokenise(&ScriptTokeniser::tokeniseQuotedToken));
		break;
	case eCharSpecial:
		push(Tokenise(&ScriptTokeniser::tokeniseSpecial));
		add(c);
		break;
	case eCharStar:
	case eCharToken:
		push(Tokenise(&ScriptTokeniser::tokeniseToken));
		add(c);
		break;
	case eNewline:
	default:
		// nothing to do for this character
		break;
	}
	return true;
}
Пример #8
0
/*
 *   "int stringIsNumber(char *str)"
 *   Report whether every character of str has charType() 1 (a digit, going
 *   by the original comments).
 *   Returns 0 for a NULL pointer or as soon as a character of another type
 *   is found, and 1 otherwise.
 *   A string that passes the character test but is longer than 5 characters
 *   is reported with printError(err25, str) and the program exits with
 *   EXIT_FAILURE.
 *   NOTE(review): an empty string contains no offending character and
 *   therefore returns 1 - confirm that callers never pass one.
 */
int stringIsNumber(char *str){

    // base case check for null
    if (str == NULL) {
        return 0;
    }

    // walk the string once; the first character that is not of type 1
    // settles the answer
    int digitCount = 0;
    for (const char *cursor = str; *cursor != '\0'; ++cursor) {
        if (charType(*cursor) != 1) {
            return 0;
        }
        digitCount++;
    }

    // numerical, but too long to be accepted
    if (digitCount > 5) {
        printError(err25, str);
        exit(EXIT_FAILURE);
    }

    return 1;

}
Пример #9
0
/*
 * MarkEndOfNextWordForward - starting from curr, leave in result the mark of
 *                            the last character of the next run of
 *                            same-typed characters (forwards direction)
 *
 * big is handed through to charType(). Returns ERR_NO_ERR on success and
 * ERR_NOT_THAT_MANY_WORDS when neither incrementMark() nor nextLine() can
 * move the mark, or when eatSpace() returns NULL.
 */
vi_rc MarkEndOfNextWordForward( i_mark *result, i_mark *curr, bool big )
{
    char        *p;
    btype       here;
    btype       after;

    noWrap = false;
    *result = *curr;

    /*
     * The IsChangeWord flag is consumed here. With it set, the mark stays
     * where it is when it sits on an end-of-line character or when the next
     * character is of a different type.
     */
    if( EditFlags.IsChangeWord ) {
        EditFlags.IsChangeWord = false;
        p = ptrFromMark( result );
        if( p == NULL ) {
            return( ERR_NO_ERR );
        }
        here = charType( p[0], big );
        after = charType( p[1], big );
        if( here == BLOCK_ENDOFLINE || here != after ) {
            return( ERR_NO_ERR );
        }
    }

    /* step off the current character, going to the next line if needed */
    if( incrementMark( result ) == NULL && nextLine( result ) == NULL ) {
        if( EditFlags.OperatorWantsMove ) {
            *result = *curr;
            return( ERR_NO_ERR );
        }
        return( ERR_NOT_THAT_MANY_WORDS );
    }

    p = eatSpace( result, false );
    if( p == NULL ) {
        return( ERR_NOT_THAT_MANY_WORDS );
    }

    /* advance while the following character has the same type */
    here = charType( *p, big );
    while( charType( p[1], big ) == here ) {
        p = incrementMark( result );
        if( p == NULL ) {
            break;
        }
    }
    return( ERR_NO_ERR );

} /* MarkEndOfNextWordForward */
Пример #10
0
/*
 * GimmeCurrentEntireWordDim - scan forwards and backwards from CurrentPos to
 *                             find the columns spanned by the current word
 *
 * On success *sc and *ec receive the start and end columns and ERR_NO_ERR is
 * returned. ERR_NO_WORD_TO_FIND is returned, with *sc and *ec untouched, when
 * CurrentPos has no pointer or the character there is neither BLOCK_WORD nor
 * BLOCK_DELIM. big is handed through to charType().
 */
vi_rc GimmeCurrentEntireWordDim( int *sc, int *ec, bool big )
{
    i_mark      origin, left, right;
    char        *p;
    int         line_len;
    btype       kind;

    noWrap = false;
    origin = CurrentPos;
    p = ptrFromMark( &origin );
    if( p == NULL ) {
        return( ERR_NO_WORD_TO_FIND );
    }
    kind = charType( *p, big );
    if( kind != BLOCK_WORD && kind != BLOCK_DELIM ) {
        return( ERR_NO_WORD_TO_FIND );
    }

    /* run right until the type changes or the column passes the line length */
    right = origin;
    line_len = LineLength( right.line );
    while( charType( *p, big ) == kind && right.column <= line_len ) {
        p = incrementMark( &right );
        if( p == NULL ) {
            break;
        }
    }

    /* run left until the type changes or the column drops below 1 */
    left = origin;
    p = ptrFromMark( &left );
    while( charType( *p, big ) == kind && left.column >= 1 ) {
        p = decrementMark( &left );
        if( p == NULL ) {
            break;
        }
    }

    /* report one column inside each of the two final mark positions */
    *sc = left.column + 1;
    *ec = right.column - 1;
    return( ERR_NO_ERR );

} /* GimmeCurrentEntireWordDim */
Пример #11
0
/*********************************************************************
 *
 *  Func Name  : GetWordType
 *
 *  Description: Get the type of word
 *              
 *
 *  Parameters : sWord: the word

 *  Returns    : the type
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
int CDictionary::GetWordType(char *sWord)
{
   int nType=charType((unsigned char *)sWord),nLen=strlen(sWord);
   if(nLen>0&&nType==CT_CHINESE&&IsAllChinese((unsigned char *)sWord))
	   return WT_CHINESE;//Chinese word
   else if(nLen>0&&nType==CT_DELIMITER)
       return WT_DELIMITER;//Delimiter
   else
	   return WT_OTHER;//other invalid
}
Пример #12
0
/*
 * printHashTable: print every entry of the hash table to stdout, one line
 * per entry in the form "name = type, scope, storage".
 *
 * hashtab holds HASHSIZE bucket heads; each bucket is a list linked through
 * the next field and terminated by NULL. The type text comes from
 * charType(np->type).
 */
void
printHashTable(struct nlist** hashtab)
{
    struct nlist* np;
    int i;

    for (i = 0; i < HASHSIZE; i++) {
        for (np = hashtab[i]; np != NULL; np = np->next) {
            /*
             * Print the type text straight from charType(). It used to be
             * copied with strcpy() into a MAX_ID_LEN + 1 byte buffer first,
             * which could overflow on a longer text and served no other
             * purpose.
             */
            const char *typeText = charType(np->type);
            printf("%s = %s, %s, %s\n", np->name, typeText, np->scope, np->storage);
        }
    }
}
Пример #13
0
/*
 * MarkStartOfNextWordBackward - starting from curr, leave in result the mark
 *                               of the first character of the previous run of
 *                               same-typed characters (backwards direction)
 *
 * big is handed through to charType(). Returns ERR_NO_ERR on success and
 * ERR_NOT_THAT_MANY_WORDS when prevLine() or eatSpace() returns NULL.
 */
vi_rc MarkStartOfNextWordBackward( i_mark *result, i_mark *curr, bool big )
{
    char        *p;
    btype       kind;

    noWrap = false;
    *result = *curr;

    /* step back one character, going to the previous line if needed */
    if( decrementMark( result ) == NULL ) {
        if( prevLine( result ) == NULL ) {
            return( ERR_NOT_THAT_MANY_WORDS );
        }
        if( EditFlags.Modeless ) {
            /* in modeless operation: advance the mark once and stop here */
            incrementMark( result );
            return( ERR_NO_ERR );
        }
    }

    p = eatSpace( result, true );
    if( p == NULL ) {
        return( ERR_NOT_THAT_MANY_WORDS );
    }

    /*
     * the loop below inspects the character before the mark, so it must not
     * run when the mark already sits in column 1.
     */
    if( result->column <= 1 ) {
        return( ERR_NO_ERR );
    }

    kind = charType( *p, big );
    while( charType( p[-1], big ) == kind ) {
        p = decrementMark( result );
        if( p == NULL || result->column == 1 ) {
            break;
        }
    }
    return( ERR_NO_ERR );

} /* MarkStartOfNextWordBackward */
Пример #14
0
/*
 * MarkStartOfNextWordForward: given a mark curr which denotes a logical
 * current position, place in result the start of the next logical word.
 * The scan may continue on the following line.
 *
 * big is handed through to charType(). Returns ERR_NO_ERR on success and
 * ERR_NOT_THAT_MANY_WORDS when nextLine() returns NULL.
 */
vi_rc MarkStartOfNextWordForward( i_mark *result, i_mark *curr, bool big )
{
    char        *p;
    btype       kind;

    noWrap = false;
    *result = *curr;

    /* no character under the mark: continue from the next line */
    p = ptrFromMark( result );
    if( p == NULL ) {
        p = nextLine( result );
        if( p == NULL ) {
            return( ERR_NOT_THAT_MANY_WORDS );
        }
        if( EditFlags.Modeless ) {
            return( ERR_NO_ERR );
        }
    }

    /* skip the rest of the run of characters the mark is sitting in */
    kind = charType( *p, big );
    while( charType( *p, big ) == kind ) {
        p = incrementMark( result );
        if( p != NULL ) {
            continue;
        }
        /* incrementMark() could not advance any further */
        if( EditFlags.OperatorWantsMove || EditFlags.Modeless ) {
            return( ERR_NO_ERR );
        }
        if( nextLine( result ) == NULL ) {
            return( ERR_NOT_THAT_MANY_WORDS );
        }
        break;
    }

    /* the word starts after any whitespace that follows */
    eatSpace( result, false );
    return( ERR_NO_ERR );

} /* MarkStartOfNextWordForward */
Пример #15
0
// Split a person name into surname, optional second surname and given name,
// and decide whether the result is acceptable as a Chinese person name.
//
// sPersonName : the name to split (NUL-terminated); the code steps through it
//               two bytes at a time, so it presumably holds double-byte
//               characters (GB2312/GBK?) - TODO confirm the encoding
// sSurname    : out, receives a 4-byte, 2-byte or empty surname
// sSurname2   : out, receives a second 2-byte surname or stays empty
// sGivenName  : out, receives the remainder of the name
// personDict  : dictionary queried with IsExist()/GetFrequency()
//
// Returns false when the name is rejected and true otherwise. The output
// buffers may already have been written when false is returned.
// NOTE(review): the output buffers are written with strcpy()/strncpy() and
// no size is passed in - callers must provide buffers large enough for an
// 8-byte name plus terminator.
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
{
	int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
	char sTemp[3];
	if(nLen<3||nLen>8)//Only names of 3 to 8 bytes are considered
		return false;
	while(i<nLen)//Reject the name if any character is neither CT_CHINESE nor CT_OTHER
	{
		nCharType=charType((unsigned char*)sPersonName+i);
		if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
			return false;
		i+=2;
	}
	sSurname2[0]=0;//init 
	//Try the first 4 bytes as surname, then the first 2 bytes, then no surname;
	//sSurname is shortened in place for each attempt
	strncpy(sSurname,sPersonName,nSurNameLen);	
	sSurname[nSurNameLen]=0;
	if(!personDict.IsExist(sSurname,1))
	{
		nSurNameLen=2;
		sSurname[nSurNameLen]=0;
		if(!personDict.IsExist(sSurname,1))
		{
			nSurNameLen=0;
			sSurname[nSurNameLen]=0;
		}
	}
	//Everything after the surname is the given name for now
	strcpy(sGivenName,sPersonName+nSurNameLen);
	if(nLen>6)
	{
		strncpy(sTemp,sPersonName+nSurNameLen,2);
		sTemp[2]=0;//Get the second possible surname
		if(personDict.IsExist(sTemp,1))
		{//Hongkong women's name: Surname+surname+given name
			strcpy(sSurname2,sTemp);
			strcpy(sGivenName,sPersonName+nSurNameLen+2);
		}
	}
	//Frequency of the surname and of the first 2 bytes of the given name
	nFreq=personDict.GetFrequency(sSurname,1);
	strncpy(sTemp,sGivenName,2);
	sTemp[2]=0;
	nFreqGiven=personDict.GetFrequency(sTemp,2);
	//Unless a 4-byte surname was found, reject the split when any of these holds:
	// - no surname was found and the name is longer than 4 bytes
	// - the given name is longer than 4 bytes
	// - GetForeignCharCount(sPersonName) is at least 3 and both frequencies are
	//   below a fraction of the reference entries' frequencies
	// - the surname frequency is below 10 and GetForeignCharCount(sGivenName)
	//   equals half the byte length of the name after the surname
	if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
		return false;
	if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
	{//The whole 4-byte name is reported as a given name by m_uPerson
		return false;
	}
	return true;
}
Пример #16
0
/**
 * Handler used while a token is being collected.
 *
 *  - token, '*' and '/' classes: c is added
 *  - special, quote, whitespace and newline classes: pop() is called and
 *    m_emit is set; c itself is not added
 * Any other class leaves the tokeniser untouched.
 *
 * @param c the character to classify
 * @return always true
 */
bool ScriptTokeniser::tokeniseToken (char c)
{
	switch (charType(c)) {
	case eCharToken:
	case eCharStar:
	case eCharSolidus:
		// still part of the token
		add(c);
		break;
	case eCharSpecial:
	case eCharQuote:
	case eWhitespace:
	case eNewline:
		// the token is complete
		pop();
		m_emit = true;
		break;
	default:
		break;
	}
	return true;
}
Пример #17
0
/**
 * Handler used between an opening quote and the closing quote.
 *
 *  - quote class: pop() is called and the end-quote handler is pushed
 *  - special, '*', '/', token and whitespace classes: c is added
 * Newlines and every other class are ignored.
 *
 * @param c the character to classify
 * @return always true
 */
bool ScriptTokeniser::tokeniseQuotedToken (char c)
{
	switch (charType(c)) {
	case eCharQuote:
		// closing quote
		pop();
		push(Tokenise(&ScriptTokeniser::tokeniseEndQuote));
		break;
	case eCharSpecial:
	case eCharStar:
	case eCharSolidus:
	case eCharToken:
	case eWhitespace:
		// everything on the line belongs to the quoted text
		add(c);
		break;
	case eNewline:
	default:
		break;
	}
	return true;
}
Пример #18
0
//Guess the candidate tags of word item No. nIndex.
//
//Depending on m_tagType, a list of candidate tags is written to
//m_nTags[nIndex][0..n-1], with a matching weight for each in
//m_dFrequency[nIndex][0..n-1]. The number of candidates n is stored in
//*pSubIndex (0 for TT_NORMAL and for unknown tag types).
//Every weight is 1 divided by a value derived from
//m_context.GetFrequency(0,tag).
//Always returns true.
//
//NOTE(review): several denominators are GetFrequency(0,tag)*8 with no "+1";
//if GetFrequency() can return 0 those weights are a floating-point division
//by zero - confirm.
//NOTE(review): neither nIndex nor the number of candidates is checked
//against the dimensions of m_nTags/m_dFrequency.
bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
{
	int j=0,i=nIndex,nCharType;
	unsigned int nLen;
	switch(m_tagType)
	{
	case TT_NORMAL:
		//No candidates are produced for the normal tag set
		break;
	case TT_PERSON:
		j=0;
		if(CC_Find("××",m_sWords[nIndex]))
		{
			//Word found by CC_Find() with the pattern above: single candidate, tag 6
			m_nTags[i][j]=6;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
		}
		else
		{
			m_nTags[i][j]=0;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
			nLen=strlen(m_sWords[nIndex]);
			if(nLen>=4)
			{
				//4 bytes or more: tag 0 is listed a second time, then 11, 12, 13
				m_nTags[i][j]=0;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
			}
			else if(nLen==2)
			{
				//Exactly 2 bytes: tag 0 is listed a second time
				m_nTags[i][j]=0;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
				nCharType=charType((unsigned char *)m_sWords[nIndex]);
				if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
				{
					//Tags 1 to 4 only for CT_OTHER or CT_CHINESE
					m_nTags[i][j]=1;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
					m_nTags[i][j]=2;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
					m_nTags[i][j]=3;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
					m_nTags[i][j]=4;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
				}
					//Despite the indentation the following lines are outside
					//the if above and run for every 2-byte word
					m_nTags[i][j]=11;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
					m_nTags[i][j]=12;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
					m_nTags[i][j]=13;
					m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
			}
		}
		break;
	case TT_PLACE:
		j=0;
		m_nTags[i][j]=0;
		m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
		nLen=strlen(m_sWords[nIndex]);
		if(nLen>=4)
		{
			//4 bytes or more: tags 11, 12, 13
			m_nTags[i][j]=11;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
			m_nTags[i][j]=12;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
			m_nTags[i][j]=13;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
		}
		else if(nLen==2)
		{
			//Exactly 2 bytes: tag 0 is listed a second time
			m_nTags[i][j]=0;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
			nCharType=charType((unsigned char *)m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				//Tags 1 to 4 only for CT_OTHER or CT_CHINESE
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
				m_nTags[i][j]=4;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
			}
				//Despite the indentation the following lines are outside the
				//if above and run for every 2-byte word
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
		}
		break;
	case TT_TRANS:
		j=0;
		nLen=strlen(m_sWords[nIndex]);
		
		m_nTags[i][j]=0;
		m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);

		if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
		{
			//Not purely Chinese: letter-only words get tags 1, 11, 2, 3, 12, 13;
			//tags 41, 42, 43 are added in every case
			if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
			{
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
/*			}
			if(IsAllNum((unsigned char *)m_sWords[nIndex])||IsAllLetter((unsigned char *)m_sWords[nIndex]))
			{
*/				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
			}
			m_nTags[i][j]=41;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
			m_nTags[i][j]=42;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
			m_nTags[i][j]=43;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		else if(nLen>=4)
		{
			//Purely Chinese, 4 bytes or more: tags 41, 42, 43
			m_nTags[i][j]=41;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
			m_nTags[i][j]=42;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
			m_nTags[i][j]=43;
			m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		else if(nLen==2)
		{
			//Purely Chinese, exactly 2 bytes
			nCharType=charType((unsigned char *)m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				m_nTags[i][j]=1;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
				m_nTags[i][j]=2;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
				m_nTags[i][j]=3;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
				m_nTags[i][j]=30;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
				m_nTags[i][j]=11;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
				m_nTags[i][j]=12;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
				m_nTags[i][j]=13;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
				m_nTags[i][j]=21;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
				m_nTags[i][j]=22;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
				m_nTags[i][j]=23;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
			}
				//Despite the indentation the following lines are outside the
				//if above and run for every 2-byte word
				m_nTags[i][j]=41;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
				m_nTags[i][j]=42;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
				m_nTags[i][j]=43;
				m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
		}
		break;
	default:
		break;
	}
	//Report how many candidates were written
	*pSubIndex=j;
	return true;
}
//Cut sSentence into atoms.
//
//An atom is the SENTENCE_BEGIN or SENTENCE_END marker, a single character
//whose type is CT_CHINESE, CT_INDEX, CT_DELIMITER or CT_OTHER, or a run of
//consecutive characters that share any other type. A '.' that is directly
//followed by a digit is typed CT_NUM so that it stays inside the number.
//
//Atom No. j is stored as text in m_sAtom[j], its byte length in
//m_nAtomLength[j] and its type in m_nAtomPOS[j]; the number of atoms is
//stored in m_nAtomCount.
//
//sSentence : NUL-terminated sentence; a byte of 0x80 or above is taken as
//            the first byte of a two-byte character
//Returns   : always true
//
//NOTE(review): j is not checked against the capacity of the atom arrays.
bool CSegGraph::AtomSegment(char *sSentence)
{
	unsigned int i=0,j=0,nCurType,nNextType;
	//i is the byte offset into the sentence string
	//j is the index of the atom being built
	//The sentence is not modified below, so its length is taken once
	const unsigned int nSentenceLen=strlen(sSentence);
	char sChar[3];
	sChar[2]=0;//Set the char ending
	m_sAtom[j][0]=0;//Start with an empty first atom
	m_nAtomLength[j]=0;
	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
	{
		strcpy(m_sAtom[j],SENTENCE_BEGIN);//The begin marker is an atom of its own
		m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
		m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
		i+=m_nAtomLength[j];
		j+=1;
		m_sAtom[j][0]=0;//Start the next atom empty
		m_nAtomLength[j]=0;
	}
	while(i<nSentenceLen)
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{
			strcpy(m_sAtom[j],SENTENCE_END);//The end marker is an atom of its own
			m_nAtomLength[j]=strlen(SENTENCE_END);
			m_nAtomPOS[j]=CT_SENTENCE_END;//init
			i+=m_nAtomLength[j];
			j+=1;
			m_sAtom[j][0]=0;//Start the next atom empty
			m_nAtomLength[j]=0;
			continue;
		}
		sChar[0]=*(sSentence+i);//Get the char with first byte
		sChar[1]=0;//
		i+=1;
		//Two byte char. The test is done on the unsigned value so that it
		//does not depend on whether plain char is signed on the platform;
		//the length check keeps a trailing first byte from consuming the
		//terminator.
		if((unsigned char)sChar[0]>=0x80&&i<nSentenceLen)
		{
			sChar[1]=*(sSentence+i);//Get the char with second byte
			i+=1;//i increased by 1
		}
		strcat(m_sAtom[j],sChar);
		nCurType=charType((unsigned char *)sChar);
		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
			nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
		m_nAtomPOS[j]=nCurType;
		//Record its property, just convience for continuous processing

		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
		{//Chinese char, index number,delimiter and other is treated as atom
			m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
			j+=1;//Skip to next atom
			m_sAtom[j][0]=0;//init
		}
		else
		{//Number,single char, letter
			nNextType=255;//Value used when there is no next char
			if(i<nSentenceLen)
				nNextType=charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType||i==nSentenceLen)
			//Reaching end or next char type is different from current char
			{
				m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
				j+=1;
				m_sAtom[j][0]=0;//init
			}
		}
	}
	m_nAtomCount=j;//The count of segmentation atoms
	return true;
}
Пример #20
0
/*********************************************************************
 *
 *  Func Name  : PreProcessing
 *
 *  Description: Strip blanks from both ends of the word (in place) and
 *               split it into an index and the text to look up
 *
 *  Parameters : sWord   : the word; rewritten in place when it has
 *                         leading or trailing blanks
 *               nId     : out, CC_ID() of the first two bytes for a
 *                         Chinese word, 3755 for a delimiter
 *               sWordRet: out, the word without its first two bytes for
 *                         a Chinese word, the whole word for a delimiter
 *               bAdd    : not used by the active code
 *
 *  Returns    : true when the word was classified as CT_CHINESE or
 *               CT_DELIMITER; false for an empty or all-blank word and
 *               for every other type (nId and sWordRet are then left
 *               untouched)
 *  Author     : Kevin Zhang  
 *  History    : 
 *              1.create 2002-1-9
 *********************************************************************/
bool CDictionary::PreProcessing(char *sWord, int *nId, char *sWordRet,bool bAdd)
{
   //NOTE(review): the type is taken from the start of the word before the
   //blanks are stripped, so a word with leading blanks is classified by
   //the blank - confirm that this is intended
   int nType=charType((unsigned char *)sWord),nLen=strlen(sWord);
   //Position for the delimeters
   int nEnd=nLen-1,nBegin=0;
   if(nLen==0)
	   return false;
   while(nEnd>=0&&sWord[nEnd]==' ')
		nEnd-=1;
   while(nBegin<=nEnd&&sWord[nBegin]==' ')
		nBegin+=1;
   if(nBegin>nEnd)
	   return false;//nothing but blanks
   if(nEnd!=nLen-1||nBegin!=0)
   {
	   //Source and destination overlap inside sWord, which strncpy does
	   //not allow; memmove is defined for overlapping ranges
	   memmove(sWord,sWord+nBegin,nEnd-nBegin+1);
	   sWord[nEnd-nBegin+1]=0;
   }
   if(nType==CT_CHINESE)//&&IsAllChinese((unsigned char *)sWord)
   {//Chinese word
	   *nId=CC_ID(sWord[0],sWord[1]);
		   //Get the inner code of the first Chinese Char
		strcpy(sWordRet,&sWord[2]);//store the word,not store the first Chinese Char
		return true;
   }
   if(nType==CT_DELIMITER)
   {//Delimiter
	   *nId=3755;
       strcpy(sWordRet,sWord);//store the whole word
	   return true;
   }
   //Handling that the original source kept only as commented-out code:
   //Chinese numbers (rewritten before the classification above),
   //CT_NUM -> 3756, CT_LETTER -> 3757, CT_SINGLE -> 3758, CT_INDEX -> 3759.
   //Those types are rejected here.
   return false;//other invalid
}