Exemplo n.º 1
0
void ParsePY (char *strPY, ParsePYStruct * parsePY, PYPARSEINPUTMODE mode)
{
    char           *strP;
    int             iIndex;
    int             iTemp;
    char            str_Map[3];
    char            strTemp[7];

    parsePY->iMode = PARSE_SINGLEHZ;
    strP = strPY;
    parsePY->iHZCount = 0;

    if (bSP) {
	char            strQP[7];
	char            strJP[3];

	strJP[2] = '\0';

	while (*strP) {
	    strJP[0] = *strP++;
	    strJP[1] = *strP;
	    SP2QP (strJP, strQP);
	    MapPY (strQP, str_Map, mode);

	    if (!*strP) {
		strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
		strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
		break;
	    }

	    iIndex = FindPYFAIndex (strQP, 0);
	    if (iIndex != -1) {
		strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
		strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
		strP++;
	    }
	    else {
		strJP[1] = '\0';
		SP2QP (strJP, strQP);
		if (!MapPY (strQP, str_Map, mode))
		    strcpy (parsePY->strMap[parsePY->iHZCount], strJP);
		else
		    strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
		strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
	    }

	    if (*strP == PY_SEPARATOR) {
		strcat (parsePY->strPYParsed[parsePY->iHZCount - 1], PY_SEPARATOR_S);
		while (*strP == PY_SEPARATOR )
		    strP++;
	    }
	}
    }
    else {
	Bool            bSeperator = False;

	do {
	    iIndex = FindPYFAIndex (strP, 1);

	    if (iIndex != -1) {
		strTemp[0] = PYTable[iIndex].strPY[strlen (PYTable[iIndex].strPY) - 1];
		iTemp = -1;
		if (strTemp[0] == 'g' || strTemp[0] == 'n') {
		    strncpy (strTemp, strP, strlen (PYTable[iIndex].strPY) - 1);
		    strTemp[strlen (PYTable[iIndex].strPY) - 1] = '\0';

		    iTemp = FindPYFAIndex (strTemp, 0);
		    if (iTemp != -1) {
			iTemp = FindPYFAIndex (strP + strlen (PYTable[iTemp].strPY), 1);
			if (iTemp != -1) {
			    if (strlen (PYTable[iTemp].strPY) == 1 || !strcmp ("ng", PYTable[iTemp].strPY))
				iTemp = -1;
			}
			if (iTemp != -1) {
			    strncpy (strTemp, strP, strlen (PYTable[iIndex].strPY) - 1);
			    strTemp[strlen (PYTable[iIndex].strPY) - 1] = '\0';
			}
		    }
		}
		if (iTemp == -1)
		    strcpy (strTemp, PYTable[iIndex].strPY);
		MapPY (strTemp, str_Map, mode);
		strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
		strP += strlen (strTemp);

		if (bSeperator) {
		    bSeperator = False;
		    parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
		    parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
		}
		else
		    parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
		strcat (parsePY->strPYParsed[parsePY->iHZCount++], strTemp);
	    }
	    else {
		if (bFullPY && *strP != PY_SEPARATOR)
		    parsePY->iMode = PARSE_ERROR;

		iIndex = IsConsonant (strP, 1);
		if (-1 != iIndex) {
		    parsePY->iMode = PARSE_ERROR;

		    if (bSeperator) {
			bSeperator = False;
			parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
			parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
		    }
		    else
			parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
		    strcat (parsePY->strPYParsed[parsePY->iHZCount], consonantMapTable[iIndex].strPY);
		    MapPY (consonantMapTable[iIndex].strPY, str_Map, mode);
		    strcpy (parsePY->strMap[parsePY->iHZCount++], str_Map);
		    strP += strlen (consonantMapTable[iIndex].strPY);
		}
		else {
		    iIndex = IsSyllabary (strP, 1);
		    if (-1 != iIndex) {
			if (bSeperator) {
			    bSeperator = False;
			    parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
			    parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
			}
			else
			    parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
			strcat (parsePY->strPYParsed[parsePY->iHZCount], syllabaryMapTable[iIndex].strPY);
			MapPY (syllabaryMapTable[iIndex].strPY, str_Map, mode);
			strcpy (parsePY->strMap[parsePY->iHZCount++], str_Map);

			strP += strlen (syllabaryMapTable[iIndex].strPY);
			if (parsePY->iMode != PARSE_ERROR)
			    parsePY->iMode = PARSE_ABBR;
		    }
		    else {	//必定是分隔符
			strP++;
			bSeperator = True;
			parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
			parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
			parsePY->strMap[parsePY->iHZCount][0] = '0';
			parsePY->strMap[parsePY->iHZCount][1] = '0';
			parsePY->strMap[parsePY->iHZCount][2] = '\0';
		    }
		}
	    }
	} while (*strP);
    }

    if (strPY[strlen (strPY) - 1] == PY_SEPARATOR && !bSP)
	parsePY->iHZCount++;

    if (parsePY->iMode != PARSE_ERROR) {
	parsePY->iMode = parsePY->iMode & PARSE_ABBR;
	if (parsePY->iHZCount > 1)
	    parsePY->iMode = parsePY->iMode | PARSE_PHRASE;
	else
	    parsePY->iMode = parsePY->iMode | PARSE_SINGLEHZ;
    }
}
Exemplo n.º 2
0
void ParsePY(FcitxPinyinConfig *pyconfig, const char *strPY, ParsePYStruct * parsePY, PYPARSEINPUTMODE mode, boolean bSP)
{
    const char           *strP;
    int             iIndex;
    int             iTemp;
    char            str_Map[3];
    char            strTemp[7];

    parsePY->iMode = PARSE_SINGLEHZ;
    strP = strPY;
    parsePY->iHZCount = 0;

    if (bSP) {
        char            strQP[7];
        char            strJP[3];

        strJP[2] = '\0';

        while (*strP) {
            strJP[0] = *strP++;
            strJP[1] = *strP;
            SP2QP(pyconfig, strJP, strQP);
            MapPY(pyconfig, strQP, str_Map, mode);

            if (!*strP) {
                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                break;
            }

            iIndex = FindPYFAIndex(pyconfig, strQP, 0);

            if (iIndex != -1) {
                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                strP++;
            } else {
                strJP[1] = '\0';
                SP2QP(pyconfig, strJP, strQP);

                if (!MapPY(pyconfig, strQP, str_Map, mode))
                    strcpy(parsePY->strMap[parsePY->iHZCount], strJP);
                else
                    strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);

                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
            }

            if (*strP == PY_SEPARATOR) {
                strcat(parsePY->strPYParsed[parsePY->iHZCount - 1], PY_SEPARATOR_S);

                while (*strP == PY_SEPARATOR)
                    strP++;
            }
        }
    } else {
        boolean            bSeperator = false;

        do {
            iIndex = FindPYFAIndex(pyconfig, strP, 1);

            if (iIndex != -1) {
                size_t lIndex = strlen(pyconfig->PYTable[iIndex].strPY);
                strTemp[0] = pyconfig->PYTable[iIndex].strPY[lIndex - 1];
                iTemp = -1;

                /*
                 * if the end of pinyin is 'g', 'n', 'e'
                 * there might be another possbility, for example "wanan" can be "wa nan" and "wan an"
                 * try resolve these problem here
                 */
                if (strTemp[0] == 'g' || strTemp[0] == 'n' || strTemp[0] == 'e' || strTemp[0] == 'a') {
                    strncpy(strTemp, strP, lIndex - 1);
                    strTemp[lIndex - 1] = '\0';

                    /* for example we have "wan", so we try to check "wa" is valid or not, with exact match */
                    iTemp = FindPYFAIndex(pyconfig, strTemp, 0);

                    /* if "wa" is valid */
                    if (iTemp != -1) {
                        /* also check "nan" is valid or not */
                        int firstIndex;
                        firstIndex = iTemp;
                        iTemp = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iTemp].strPY), 1);

                        /* if still is valid */
                        if (iTemp != -1) {
                            /*
                             * length 1 split is what we must avoid,
                             * for example, "nin" can be "ni n", but no separator can for "nin" if we split here
                             *
                             * and "ying" can be also "yi ng", for just the same case"
                             */
                            if (strlen(pyconfig->PYTable[iTemp].strPY) == 1 || !strcmp("ng", pyconfig->PYTable[iTemp].strPY))
                                iTemp = -1;
                        }

                        if (iTemp != -1) {
                            /* check the general frequency that this shoud split or not */
                            int index2 = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iIndex].strPY), 1);

                            boolean resplit = false;
                            do {
                                /* prefer longer */
                                if (index2 == -1) {
                                    resplit = true;
                                    break;
                                }

                                size_t length1 = strlen(pyconfig->PYTable[iIndex].strPY) + strlen(pyconfig->PYTable[index2].strPY);
                                size_t length2 = strlen(pyconfig->PYTable[firstIndex].strPY) + strlen(pyconfig->PYTable[iTemp].strPY);
                                if (length1 != length2) {
                                    resplit = (length1 < length2);
                                    break;
                                }

                                double freq1 = LookupPYFreq(pyconfig, iIndex, index2);
                                double freq2 = LookupPYFreq(pyconfig, firstIndex, iTemp);

                                resplit = (freq1 <= freq2);
                            } while(0);

                            if (resplit) {
                                strncpy(strTemp, strP, lIndex - 1);
                                strTemp[lIndex - 1] = '\0';
                            }
                            else
                                iTemp = -1;
                        }
                    }
                }

                if (iTemp == -1)
                    strcpy(strTemp, pyconfig->PYTable[iIndex].strPY);

                MapPY(pyconfig, strTemp, str_Map, mode);

                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);

                strP += strlen(strTemp);

                if (bSeperator) {
                    bSeperator = false;
                    parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                    parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                } else
                    parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';

                strcat(parsePY->strPYParsed[parsePY->iHZCount++], strTemp);
            } else {
                if (pyconfig->bFullPY && *strP != PY_SEPARATOR)
                    parsePY->iMode = PARSE_ERROR;

                iIndex = IsConsonant(strP, 1);

                if (-1 != iIndex) {
                    parsePY->iMode = PARSE_ERROR;

                    if (bSeperator) {
                        bSeperator = false;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                    } else
                        parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';

                    strcat(parsePY->strPYParsed[parsePY->iHZCount], consonantMapTable[iIndex].strPY);

                    MapPY(pyconfig, consonantMapTable[iIndex].strPY, str_Map, mode);

                    strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);

                    strP += strlen(consonantMapTable[iIndex].strPY);
                } else {
                    iIndex = IsSyllabary(strP, 1);

                    if (-1 != iIndex) {
                        if (bSeperator) {
                            bSeperator = false;
                            parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                            parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        } else
                            parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';

                        strcat(parsePY->strPYParsed[parsePY->iHZCount], syllabaryMapTable[iIndex].strPY);

                        MapPY(pyconfig, syllabaryMapTable[iIndex].strPY, str_Map, mode);

                        strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);

                        strP += strlen(syllabaryMapTable[iIndex].strPY);

                        if (parsePY->iMode != PARSE_ERROR)
                            parsePY->iMode = PARSE_ABBR;
                    } else {
                        //必定是分隔符
                        strP++;
                        bSeperator = true;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        parsePY->strMap[parsePY->iHZCount][0] = '0';
                        parsePY->strMap[parsePY->iHZCount][1] = '0';
                        parsePY->strMap[parsePY->iHZCount][2] = '\0';
                    }
                }
            }
        } while (*strP);
    }

    if (strPY[strlen(strPY) - 1] == PY_SEPARATOR && !bSP)
        parsePY->iHZCount++;

    if (parsePY->iMode != PARSE_ERROR) {
        parsePY->iMode = parsePY->iMode & PARSE_ABBR;

        if (parsePY->iHZCount > 1)
            parsePY->iMode = parsePY->iMode | PARSE_PHRASE;
        else
            parsePY->iMode = parsePY->iMode | PARSE_SINGLEHZ;
    }
}
Exemplo n.º 3
0
void CreatePYBase(void)
{
    _PyStruct      *head, *pyList, *temp, *t;
    char            strPY[7], strHZ[UTF8_MAX_LENGTH * 80 + 1], strMap[3];
    int             iIndex, iCount, i;
    int             iBaseCount;
    int             s = 0;
    int             tt = 0;

    head = (_PyStruct *) malloc(sizeof(_PyStruct));
    head->prev = head;
    head->next = head;

    iBaseCount = 0;

    while (PYTable_template[iBaseCount].strPY[0] != '\0')
        iBaseCount++;

    for (iIndex = 0; iIndex < iBaseCount; iIndex++)
        YY[iIndex] = 0;

    iIndex = 0;

    while (!feof(fps)) {
        fscanf(fps, "%s", strPY);
        fscanf(fps, "%s\n", strHZ);

        if (MapPY(&pyconfig, strPY, strMap, PY_PARSE_INPUT_SYSTEM)) {
            for (i = 0; i < iBaseCount; i++)
                if ((!strcmp(PYTable_template[i].strPY, strPY)) && PYTable_template[i].control == PYTABLE_NONE)
                    YY[i] += 1;

            iIndex++;

            if (fcitx_utf8_strlen(strHZ) > 1) {
                int8_t charLen = fcitx_utf8_char_len(strHZ);
                fprintf(stderr, "%s length is larger that 1, truncated to ", strHZ);
                strHZ[charLen] = '\0';
                fprintf(stderr, "%s.\n", strHZ);
            }

            temp = (_PyStruct *) malloc(sizeof(_PyStruct));

            strcpy(temp->strHZ, strHZ);
            strcpy(temp->strPY, strMap);
            pyList = head->prev;

            while (pyList != head) {
                if (strcmp(pyList->strPY, strMap) <= 0)
                    break;

                pyList = pyList->prev;
            }

            temp->next = pyList->next;

            temp->prev = pyList;
            pyList->next->prev = temp;
            pyList->next = temp;
        } else
            fprintf(stderr, "%s Error!!!!\n", strPY);
    }

    iCount = 0;

    for (i = 0; i < iBaseCount; i++) {
        if (YY[i])
            iCount++;
    }

    fwrite(&iCount, sizeof(int), 1, fp1);

    printf("Groups: %d\n", iCount);
    iAllCount = iIndex;

    pyList = head->next;

    strcpy(strPY, pyList->strPY);
    iCount = 0;
    t = pyList;

    while (pyList != head) {
        if (!strcmp(strPY, pyList->strPY)) {
            iCount++;
        } else {
            tt++;
            fwrite(strPY, sizeof(char) * 2, 1, fp1);
            fwrite(&iCount, sizeof(int), 1, fp1);

            for (i = 0; i < iCount; i++) {
                int8_t len = strlen(t->strHZ);
                fwrite(&len, sizeof(int8_t), 1, fp1);
                fwrite(t->strHZ, sizeof(char) * len , 1, fp1);

                t = t->next;
            }

            s += iCount;

            t = pyList;
            iCount = 1;
            strcpy(strPY, pyList->strPY);
        }

        pyList = pyList->next;
    }

    fwrite(strPY, sizeof(char) * 2, 1, fp1);

    fwrite(&iCount, sizeof(int), 1, fp1);

    for (i = 0; i < iCount; i++) {
        int8_t len = strlen(t->strHZ);
        fwrite(&len, sizeof(int8_t), 1, fp1);
        fwrite(t->strHZ, sizeof(char) * len , 1, fp1);
        t = t->next;
    }

    s += iCount;

    fclose(fp1);
    fclose(fps);
}