/*
 * Split the pinyin input string strPY into per-character pieces and store
 * the result in parsePY.
 *
 * For every piece found, parsePY->strPYParsed[i] receives the text of the
 * piece (prefixed or suffixed with PY_SEPARATOR where the user typed one) and
 * parsePY->strMap[i] receives the map code produced by MapPY().
 * parsePY->iHZCount is the number of pieces and parsePY->iMode describes the
 * overall result:
 *   - PARSE_ERROR   a piece could not be matched as a full pinyin while
 *                   bFullPY is set, or IsConsonant() matched at some position;
 *   - otherwise the PARSE_ABBR bit is kept if it was set (IsSyllabary()
 *     matched at some position) and combined with PARSE_PHRASE when more than
 *     one piece was found, or PARSE_SINGLEHZ otherwise.
 *
 * The global bSP selects the ShuangPin path (two key strokes per piece,
 * converted with SP2QP()); otherwise the input is matched against PYTable.
 *
 * An empty strPY yields iHZCount == 0 and iMode == PARSE_SINGLEHZ.
 *
 * NOTE(review): iHZCount is never checked against the capacity of
 * parsePY->strMap / parsePY->strPYParsed; that capacity is not visible
 * here, so callers presumably bound the input length -- confirm.
 */
void ParsePY (char *strPY, ParsePYStruct * parsePY, PYPARSEINPUTMODE mode)
{
    char *strP;
    int iIndex;
    int iTemp;
    char str_Map[3];
    char strTemp[7];

    parsePY->iMode = PARSE_SINGLEHZ;
    strP = strPY;
    parsePY->iHZCount = 0;

    /*
     * Nothing to parse.  Without this guard the do/while loop below would
     * step past the terminating NUL and strPY[strlen (strPY) - 1] would read
     * in front of the buffer.
     */
    if (!*strPY)
        return;

    if (bSP) {
        char strQP[7];
        char strJP[3];

        strJP[2] = '\0';
        while (*strP) {
            /* Take the next two key strokes as one ShuangPin candidate. */
            strJP[0] = *strP++;
            strJP[1] = *strP;
            SP2QP (strJP, strQP);
            MapPY (strQP, str_Map, mode);

            /* Only one key stroke was left: store it as the last piece. */
            if (!*strP) {
                strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                break;
            }

            iIndex = FindPYFAIndex (strQP, 0);
            if (iIndex != -1) {
                /* Both key strokes form a pinyin: consume the second one too. */
                strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                strP++;
            }
            else {
                /* Fall back to the first key stroke on its own. */
                strJP[1] = '\0';
                SP2QP (strJP, strQP);
                if (!MapPY (strQP, str_Map, mode))
                    strcpy (parsePY->strMap[parsePY->iHZCount], strJP);
                else
                    strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy (parsePY->strPYParsed[parsePY->iHZCount++], strJP);
            }

            /* Attach one separator to the piece just stored, skip repeated ones. */
            if (*strP == PY_SEPARATOR) {
                strcat (parsePY->strPYParsed[parsePY->iHZCount - 1], PY_SEPARATOR_S);
                while (*strP == PY_SEPARATOR)
                    strP++;
            }
        }
    }
    else {
        /* True while a separator has been read and not yet attached to a piece. */
        Bool bSeperator = False;

        do {
            iIndex = FindPYFAIndex (strP, 1);
            if (iIndex != -1) {
                /* Last letter of the matched table entry. */
                strTemp[0] = PYTable[iIndex].strPY[strlen (PYTable[iIndex].strPY) - 1];
                iTemp = -1;

                /*
                 * When the match ends in 'g' or 'n', that letter may instead
                 * start the next pinyin.  Try the match without its last
                 * letter and see whether the rest still parses.
                 */
                if (strTemp[0] == 'g' || strTemp[0] == 'n') {
                    strncpy (strTemp, strP, strlen (PYTable[iIndex].strPY) - 1);
                    strTemp[strlen (PYTable[iIndex].strPY) - 1] = '\0';

                    /* The shortened text must itself be a pinyin (lookup mode 0). */
                    iTemp = FindPYFAIndex (strTemp, 0);
                    if (iTemp != -1) {
                        /* What follows the shortened text must start a pinyin as well. */
                        iTemp = FindPYFAIndex (strP + strlen (PYTable[iTemp].strPY), 1);
                        if (iTemp != -1) {
                            /* Reject a one-letter follower and a bare "ng". */
                            if (strlen (PYTable[iTemp].strPY) == 1 || !strcmp ("ng", PYTable[iTemp].strPY))
                                iTemp = -1;
                        }
                        if (iTemp != -1) {
                            strncpy (strTemp, strP, strlen (PYTable[iIndex].strPY) - 1);
                            strTemp[strlen (PYTable[iIndex].strPY) - 1] = '\0';
                        }
                    }
                }

                /* No shorter split chosen: use the table entry as matched. */
                if (iTemp == -1)
                    strcpy (strTemp, PYTable[iIndex].strPY);

                MapPY (strTemp, str_Map, mode);
                strcpy (parsePY->strMap[parsePY->iHZCount], str_Map);
                strP += strlen (strTemp);

                /* Prefix the piece with the pending separator, if any. */
                if (bSeperator) {
                    bSeperator = False;
                    parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                    parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                }
                else
                    parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                strcat (parsePY->strPYParsed[parsePY->iHZCount++], strTemp);
            }
            else {
                /* Not a full pinyin: an error in full-pinyin mode unless it is a separator. */
                if (bFullPY && *strP != PY_SEPARATOR)
                    parsePY->iMode = PARSE_ERROR;

                iIndex = IsConsonant (strP, 1);
                if (-1 != iIndex) {
                    /* An entry of consonantMapTable matched: stored, but flagged as an error. */
                    parsePY->iMode = PARSE_ERROR;

                    if (bSeperator) {
                        bSeperator = False;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                    }
                    else
                        parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                    strcat (parsePY->strPYParsed[parsePY->iHZCount], consonantMapTable[iIndex].strPY);
                    MapPY (consonantMapTable[iIndex].strPY, str_Map, mode);
                    strcpy (parsePY->strMap[parsePY->iHZCount++], str_Map);
                    strP += strlen (consonantMapTable[iIndex].strPY);
                }
                else {
                    iIndex = IsSyllabary (strP, 1);
                    if (-1 != iIndex) {
                        /* An entry of syllabaryMapTable matched: the input is an abbreviation. */
                        if (bSeperator) {
                            bSeperator = False;
                            parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                            parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        }
                        else
                            parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                        strcat (parsePY->strPYParsed[parsePY->iHZCount], syllabaryMapTable[iIndex].strPY);
                        MapPY (syllabaryMapTable[iIndex].strPY, str_Map, mode);
                        strcpy (parsePY->strMap[parsePY->iHZCount++], str_Map);
                        strP += strlen (syllabaryMapTable[iIndex].strPY);
                        if (parsePY->iMode != PARSE_ERROR)
                            parsePY->iMode = PARSE_ABBR;
                    }
                    else {
                        /*
                         * Must be a separator.  Remember it and pre-fill the
                         * current slot; the slot is only counted if the input
                         * ends here (see below), otherwise the next piece
                         * overwrites it.
                         */
                        strP++;
                        bSeperator = True;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        parsePY->strMap[parsePY->iHZCount][0] = '0';
                        parsePY->strMap[parsePY->iHZCount][1] = '0';
                        parsePY->strMap[parsePY->iHZCount][2] = '\0';
                    }
                }
            }
        } while (*strP);
    }

    /* A trailing separator counts as a piece of its own (not in ShuangPin mode). */
    if (strPY[strlen (strPY) - 1] == PY_SEPARATOR && !bSP)
        parsePY->iHZCount++;

    if (parsePY->iMode != PARSE_ERROR) {
        /* Keep only the abbreviation bit, then add the phrase / single flag. */
        parsePY->iMode = parsePY->iMode & PARSE_ABBR;
        if (parsePY->iHZCount > 1)
            parsePY->iMode = parsePY->iMode | PARSE_PHRASE;
        else
            parsePY->iMode = parsePY->iMode | PARSE_SINGLEHZ;
    }
}
/*
 * Split the pinyin input string strPY into per-character pieces and store
 * the result in parsePY.
 *
 * pyconfig  pinyin configuration; supplies PYTable and bFullPY and is passed
 *           on to SP2QP(), MapPY(), FindPYFAIndex() and LookupPYFreq().
 * strPY     NUL-terminated user input.
 * parsePY   output; for every piece i, strPYParsed[i] receives the text of
 *           the piece (with PY_SEPARATOR attached where the user typed one)
 *           and strMap[i] the map code produced by MapPY().  iHZCount is the
 *           number of pieces.
 * mode      passed through to MapPY().
 * bSP       true selects the ShuangPin path (two key strokes per piece,
 *           converted with SP2QP()); false matches against PYTable.
 *
 * parsePY->iMode on return:
 *   - PARSE_ERROR   a piece could not be matched as a full pinyin while
 *                   pyconfig->bFullPY is set, or IsConsonant() matched;
 *   - otherwise the PARSE_ABBR bit is kept if it was set (IsSyllabary()
 *     matched at some position) and combined with PARSE_PHRASE when more than
 *     one piece was found, or PARSE_SINGLEHZ otherwise.
 *
 * An empty strPY yields iHZCount == 0 and iMode == PARSE_SINGLEHZ.
 *
 * NOTE(review): iHZCount is never checked against the capacity of
 * parsePY->strMap / parsePY->strPYParsed; that capacity is not visible
 * here, so callers presumably bound the input length -- confirm.
 */
void ParsePY(FcitxPinyinConfig *pyconfig, const char *strPY, ParsePYStruct * parsePY, PYPARSEINPUTMODE mode, boolean bSP)
{
    const char *strP;
    int iIndex;
    int iTemp;
    char str_Map[3];
    char strTemp[7];

    parsePY->iMode = PARSE_SINGLEHZ;
    strP = strPY;
    parsePY->iHZCount = 0;

    /*
     * Nothing to parse.  Without this guard the do/while loop below would
     * step past the terminating NUL and strPY[strlen(strPY) - 1] would read
     * in front of the buffer.
     */
    if (!*strPY)
        return;

    if (bSP) {
        char strQP[7];
        char strJP[3];

        strJP[2] = '\0';
        while (*strP) {
            /* Take the next two key strokes as one ShuangPin candidate. */
            strJP[0] = *strP++;
            strJP[1] = *strP;
            SP2QP(pyconfig, strJP, strQP);
            MapPY(pyconfig, strQP, str_Map, mode);

            /* Only one key stroke was left: store it as the last piece. */
            if (!*strP) {
                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                break;
            }

            iIndex = FindPYFAIndex(pyconfig, strQP, 0);
            if (iIndex != -1) {
                /* Both key strokes form a pinyin: consume the second one too. */
                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
                strP++;
            } else {
                /* Fall back to the first key stroke on its own. */
                strJP[1] = '\0';
                SP2QP(pyconfig, strJP, strQP);
                if (!MapPY(pyconfig, strQP, str_Map, mode))
                    strcpy(parsePY->strMap[parsePY->iHZCount], strJP);
                else
                    strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strcpy(parsePY->strPYParsed[parsePY->iHZCount++], strJP);
            }

            /* Attach one separator to the piece just stored, skip repeated ones. */
            if (*strP == PY_SEPARATOR) {
                strcat(parsePY->strPYParsed[parsePY->iHZCount - 1], PY_SEPARATOR_S);
                while (*strP == PY_SEPARATOR)
                    strP++;
            }
        }
    } else {
        /* true while a separator has been read and not yet attached to a piece */
        boolean bSeperator = false;

        do {
            iIndex = FindPYFAIndex(pyconfig, strP, 1);
            if (iIndex != -1) {
                size_t lIndex = strlen(pyconfig->PYTable[iIndex].strPY);

                /* last letter of the matched table entry */
                strTemp[0] = pyconfig->PYTable[iIndex].strPY[lIndex - 1];
                iTemp = -1;

                /*
                 * If the pinyin ends in 'g', 'n', 'e' or 'a' there might be
                 * another possibility, for example "wanan" can be "wa nan"
                 * or "wan an".  Try to resolve these cases here.
                 */
                if (strTemp[0] == 'g' || strTemp[0] == 'n' || strTemp[0] == 'e' || strTemp[0] == 'a') {
                    strncpy(strTemp, strP, lIndex - 1);
                    strTemp[lIndex - 1] = '\0';

                    /* for "wan", check with an exact match whether "wa" is valid */
                    iTemp = FindPYFAIndex(pyconfig, strTemp, 0);

                    /* if "wa" is valid */
                    if (iTemp != -1) {
                        /* also check whether "nan" is valid */
                        int firstIndex;
                        firstIndex = iTemp;
                        iTemp = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iTemp].strPY), 1);

                        /* if that is still valid */
                        if (iTemp != -1) {
                            /*
                             * A split that leaves a piece of length 1 must be
                             * avoided: "nin" could become "ni n", which would
                             * make plain "nin" impossible to type without a
                             * separator.  "ying" -> "yi ng" is the same case.
                             */
                            if (strlen(pyconfig->PYTable[iTemp].strPY) == 1 || !strcmp("ng", pyconfig->PYTable[iTemp].strPY))
                                iTemp = -1;
                        }

                        if (iTemp != -1) {
                            /*
                             * Both splits are possible.  Decide between
                             * (long match + what follows it) and
                             * (short match + what follows it).
                             */
                            int index2 = FindPYFAIndex(pyconfig, strP + strlen(pyconfig->PYTable[iIndex].strPY), 1);
                            boolean resplit = false;

                            do {
                                /* nothing parses after the long match: take the short one */
                                if (index2 == -1) {
                                    resplit = true;
                                    break;
                                }

                                /* prefer the pair that covers more characters */
                                size_t length1 = strlen(pyconfig->PYTable[iIndex].strPY) + strlen(pyconfig->PYTable[index2].strPY);
                                size_t length2 = strlen(pyconfig->PYTable[firstIndex].strPY) + strlen(pyconfig->PYTable[iTemp].strPY);
                                if (length1 != length2) {
                                    resplit = (length1 < length2);
                                    break;
                                }

                                /* same coverage: let LookupPYFreq() decide, ties go to the short match */
                                double freq1 = LookupPYFreq(pyconfig, iIndex, index2);
                                double freq2 = LookupPYFreq(pyconfig, firstIndex, iTemp);
                                resplit = (freq1 <= freq2);
                            } while (0);

                            if (resplit) {
                                strncpy(strTemp, strP, lIndex - 1);
                                strTemp[lIndex - 1] = '\0';
                            } else
                                iTemp = -1;
                        }
                    }
                }

                /* no shorter split chosen: use the table entry as matched */
                if (iTemp == -1)
                    strcpy(strTemp, pyconfig->PYTable[iIndex].strPY);

                MapPY(pyconfig, strTemp, str_Map, mode);
                strcpy(parsePY->strMap[parsePY->iHZCount], str_Map);
                strP += strlen(strTemp);

                /* prefix the piece with the pending separator, if any */
                if (bSeperator) {
                    bSeperator = false;
                    parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                    parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                } else
                    parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                strcat(parsePY->strPYParsed[parsePY->iHZCount++], strTemp);
            } else {
                /* not a full pinyin: an error in full-pinyin mode unless it is a separator */
                if (pyconfig->bFullPY && *strP != PY_SEPARATOR)
                    parsePY->iMode = PARSE_ERROR;

                iIndex = IsConsonant(strP, 1);
                if (-1 != iIndex) {
                    /* an entry of consonantMapTable matched: stored, but flagged as an error */
                    parsePY->iMode = PARSE_ERROR;

                    if (bSeperator) {
                        bSeperator = false;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                    } else
                        parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                    strcat(parsePY->strPYParsed[parsePY->iHZCount], consonantMapTable[iIndex].strPY);
                    MapPY(pyconfig, consonantMapTable[iIndex].strPY, str_Map, mode);
                    strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);
                    strP += strlen(consonantMapTable[iIndex].strPY);
                } else {
                    iIndex = IsSyllabary(strP, 1);
                    if (-1 != iIndex) {
                        /* an entry of syllabaryMapTable matched: the input is an abbreviation */
                        if (bSeperator) {
                            bSeperator = false;
                            parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                            parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        } else
                            parsePY->strPYParsed[parsePY->iHZCount][0] = '\0';
                        strcat(parsePY->strPYParsed[parsePY->iHZCount], syllabaryMapTable[iIndex].strPY);
                        MapPY(pyconfig, syllabaryMapTable[iIndex].strPY, str_Map, mode);
                        strcpy(parsePY->strMap[parsePY->iHZCount++], str_Map);
                        strP += strlen(syllabaryMapTable[iIndex].strPY);
                        if (parsePY->iMode != PARSE_ERROR)
                            parsePY->iMode = PARSE_ABBR;
                    } else {
                        /*
                         * Must be a separator.  Remember it and pre-fill the
                         * current slot; the slot is only counted if the input
                         * ends here (see below), otherwise the next piece
                         * overwrites it.
                         */
                        strP++;
                        bSeperator = true;
                        parsePY->strPYParsed[parsePY->iHZCount][0] = PY_SEPARATOR;
                        parsePY->strPYParsed[parsePY->iHZCount][1] = '\0';
                        parsePY->strMap[parsePY->iHZCount][0] = '0';
                        parsePY->strMap[parsePY->iHZCount][1] = '0';
                        parsePY->strMap[parsePY->iHZCount][2] = '\0';
                    }
                }
            }
        } while (*strP);
    }

    /* a trailing separator counts as a piece of its own (not in ShuangPin mode) */
    if (strPY[strlen(strPY) - 1] == PY_SEPARATOR && !bSP)
        parsePY->iHZCount++;

    if (parsePY->iMode != PARSE_ERROR) {
        /* keep only the abbreviation bit, then add the phrase / single flag */
        parsePY->iMode = parsePY->iMode & PARSE_ABBR;
        if (parsePY->iHZCount > 1)
            parsePY->iMode = parsePY->iMode | PARSE_PHRASE;
        else
            parsePY->iMode = parsePY->iMode | PARSE_SINGLEHZ;
    }
}
void CreatePYBase(void) { _PyStruct *head, *pyList, *temp, *t; char strPY[7], strHZ[UTF8_MAX_LENGTH * 80 + 1], strMap[3]; int iIndex, iCount, i; int iBaseCount; int s = 0; int tt = 0; head = (_PyStruct *) malloc(sizeof(_PyStruct)); head->prev = head; head->next = head; iBaseCount = 0; while (PYTable_template[iBaseCount].strPY[0] != '\0') iBaseCount++; for (iIndex = 0; iIndex < iBaseCount; iIndex++) YY[iIndex] = 0; iIndex = 0; while (!feof(fps)) { fscanf(fps, "%s", strPY); fscanf(fps, "%s\n", strHZ); if (MapPY(&pyconfig, strPY, strMap, PY_PARSE_INPUT_SYSTEM)) { for (i = 0; i < iBaseCount; i++) if ((!strcmp(PYTable_template[i].strPY, strPY)) && PYTable_template[i].control == PYTABLE_NONE) YY[i] += 1; iIndex++; if (fcitx_utf8_strlen(strHZ) > 1) { int8_t charLen = fcitx_utf8_char_len(strHZ); fprintf(stderr, "%s length is larger that 1, truncated to ", strHZ); strHZ[charLen] = '\0'; fprintf(stderr, "%s.\n", strHZ); } temp = (_PyStruct *) malloc(sizeof(_PyStruct)); strcpy(temp->strHZ, strHZ); strcpy(temp->strPY, strMap); pyList = head->prev; while (pyList != head) { if (strcmp(pyList->strPY, strMap) <= 0) break; pyList = pyList->prev; } temp->next = pyList->next; temp->prev = pyList; pyList->next->prev = temp; pyList->next = temp; } else fprintf(stderr, "%s Error!!!!\n", strPY); } iCount = 0; for (i = 0; i < iBaseCount; i++) { if (YY[i]) iCount++; } fwrite(&iCount, sizeof(int), 1, fp1); printf("Groups: %d\n", iCount); iAllCount = iIndex; pyList = head->next; strcpy(strPY, pyList->strPY); iCount = 0; t = pyList; while (pyList != head) { if (!strcmp(strPY, pyList->strPY)) { iCount++; } else { tt++; fwrite(strPY, sizeof(char) * 2, 1, fp1); fwrite(&iCount, sizeof(int), 1, fp1); for (i = 0; i < iCount; i++) { int8_t len = strlen(t->strHZ); fwrite(&len, sizeof(int8_t), 1, fp1); fwrite(t->strHZ, sizeof(char) * len , 1, fp1); t = t->next; } s += iCount; t = pyList; iCount = 1; strcpy(strPY, pyList->strPY); } pyList = pyList->next; } fwrite(strPY, sizeof(char) * 2, 1, fp1); 
fwrite(&iCount, sizeof(int), 1, fp1); for (i = 0; i < iCount; i++) { int8_t len = strlen(t->strHZ); fwrite(&len, sizeof(int8_t), 1, fp1); fwrite(t->strHZ, sizeof(char) * len , 1, fp1); t = t->next; } s += iCount; fclose(fp1); fclose(fps); }