/*
 * Dump the phonetic table to the debug output.
 *
 * After GTK and the settings are initialised and the table is loaded with
 * pho_load(), every index group in idx_pho[] is walked.  For each entry of a
 * group the group's key is printed with prph(), followed by the entry's
 * string (pho_idx_str) and its count from ch_pho[].
 *
 * Any command-line argument is rejected through p_err(); only the default
 * table location is supported.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
  gtk_init(&argc, &argv);
  load_setttings();

  if (argc > 1)
    p_err("Currently only support ~/.config/hime/pho.tab2");

  pho_load();

  int group = 0;
  while (group < idxnum_pho) {
    phokey_t group_key = idx_pho[group].key;
    int entry = idx_pho[group].start;
    /* The end of a group is the start of the following one.  For the last
     * group this reads idx_pho[idxnum_pho] -- presumably a terminating entry
     * set up by pho_load(); TODO confirm. */
    int entry_end = idx_pho[group + 1].start;

    while (entry < entry_end) {
      prph(group_key);
      char *text = pho_idx_str(entry);
      dbg(" %s %d\n", text, ch_pho[entry].count);
      entry++;
    }

    group++;
  }

  return 0;
}
/*
 * Print a sequence of phonetic keys.
 *
 * ks  : array of keys to print
 * ksN : number of keys in ks; nothing is printed when it is <= 0
 *
 * Each key is printed with prph() and followed by a single space emitted
 * through dbg().
 */
void prphs(phokey_t *ks, int ksN)
{
  phokey_t *cur = ks;
  int remaining = ksN;

  while (remaining > 0) {
    prph(*cur);
    dbg(" ");
    cur++;
    remaining--;
  }
}
/*
 * Recursively choose the best-scoring segmentation of the edit buffer from
 * position `start` up to `tsin_parse_len`.
 *
 * For every candidate length `plen` of the first segment (1 .. MAX_PHRASE_LEN,
 * limited by the remaining buffer) the function:
 *   1. builds a default segment in pbest[0] from the characters currently in
 *      the buffer (gbuf[] in gtab mode, tss.chpho[] otherwise);
 *   2. looks the segment's keys up with tsin_seek() and, among the entries of
 *      exactly `plen` characters whose keys match, keeps the one with the
 *      highest usecount;
 *   3. obtains the result for the rest of the buffer (start + plen ..) from
 *      the cache or by recursing, storing it in pbest[1..];
 *   4. scores the combination and keeps it if it beats the best so far.
 *
 * Parameters:
 *   start            first buffer position to parse
 *   out              receives (tsin_parse_len - start) TSIN_PARSE entries of
 *                    the best segmentation found
 *   r_match_phr_N    receives the number of matched phrases in that result
 *   r_no_match_ch_N  receives the number of unmatched characters in it
 *
 * Returns the accumulated usecount of the best segmentation, clamped so it is
 * never negative.
 *
 * NOTE(review): pbest[] holds MAX_PH_BF_EXT+1 entries but is cleared and
 * copied using tsin_parse_len; this assumes tsin_parse_len never exceeds
 * MAX_PH_BF_EXT+1 -- confirm against the callers.
 */
int tsin_parse_recur(int start, TSIN_PARSE *out, short *r_match_phr_N, short *r_no_match_ch_N)
{
  int plen;
  double bestscore = -1;
  int bestusecount = 0;

  /* Defaults reported when no candidate is accepted: nothing matched and
   * every remaining character is unmatched. */
  *r_match_phr_N = 0;
  *r_no_match_ch_N = tsin_parse_len - start;

  for(plen=1; start + plen <= tsin_parse_len && plen <= MAX_PHRASE_LEN; plen++) {
#if DBG
    dbg("---- aa st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
#endif
    /* Stop extending the segment once its last character carries
     * FLAG_CHPHO_PHRASE_USER_HEAD (presumably the head of a phrase fixed by
     * the user -- verify against the flag's definition). */
    if (plen > 1) {
      if (tsin_is_gtab) {
        if (gbuf[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
          break;
      } else
      if (tss.chpho[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
        break;
    }

    /* Key buffers for the three supported key widths; ppp points at the one
     * selected by ph_key_sz. */
    phokey_t pp[MAX_PHRASE_LEN + 1];
    u_int pp32[MAX_PHRASE_LEN + 1];
    u_int64_t pp64[MAX_PHRASE_LEN + 1];
    int sti, edi;
    TSIN_PARSE pbest[MAX_PH_BF_EXT+1];
    /* MAXV is added back inside the score's log(); starting at 5-MAXV makes
     * the log argument 5 when no usecount has been added.  The macro stays
     * defined for the rest of the translation unit. */
#define MAXV 1000
    int maxusecount = 5-MAXV;
    int remlen;
    short match_phr_N=0, no_match_ch_N = plen;
    void *ppp;

    if (ph_key_sz==2)
      ppp=pp;
    else
    if (ph_key_sz==4)
      ppp=pp32;
    else
      ppp=pp64;

    bzero(pbest, sizeof(TSIN_PARSE) * tsin_parse_len);

    /* Default first segment: the characters as they currently are in the
     * buffer, not yet flagged as a phrase. */
    pbest[0].len = plen;
    pbest[0].start = start;
    int i, ofs;

    if (tsin_is_gtab)
      for(ofs=i=0; i < plen; i++)
        ofs += utf8cpy((char *)pbest[0].str + ofs, gbuf[start + i].ch);
    else
      for(ofs=i=0; i < plen; i++)
        ofs += utf8cpy((char *)pbest[0].str + ofs, tss.chpho[start + i].ch);

#if DBG
    dbg("st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
#endif

    /* Collect the keys of the candidate segment into the selected buffer. */
    if (tsin_is_gtab)
      extract_gtab_key(start, plen, ppp);
    else {
      extract_pho(start, plen, (phokey_t *)ppp);
      if (c_pinyin_set)
        mask_tone(pp, plen, c_pinyin_set + start);
    }

#if DBG
    /* NOTE(review): this debug loop reads c_pinyin_set without the NULL
     * check used elsewhere in this function. */
    for(i=0; i < plen; i++) {
      prph(pp[i]); dbg("%d", c_pinyin_set[i+start]);
    }
    dbg("\n");
#endif

    char *pinyin_set = c_pinyin_set ? c_pinyin_set+start:NULL;

    /* No entries for these keys: a multi-character candidate ends the search
     * over longer lengths, a single character is still scored as unmatched. */
    if (!tsin_seek(ppp, plen, &sti, &edi, pinyin_set)) {
//      dbg("tsin_seek not found...\n");
      if (plen > 1)
        break;
      goto next;
    }

    /* Key buffers for the entry being loaded; pho points at the one selected
     * by ph_key_sz. */
    phokey_t mtk[MAX_PHRASE_LEN];
    u_int mtk32[MAX_PHRASE_LEN];
    u_int64_t mtk64[MAX_PHRASE_LEN];
    void *pho;

    if (ph_key_sz==2)
      pho=mtk;
    else
    if (ph_key_sz==4)
      pho=mtk32;
    else
      pho=mtk64;

    /* Scan the range [sti, edi) returned by tsin_seek(). */
    for (;sti < edi; sti++) {
      char mtch[MAX_PHRASE_LEN*CH_SZ+1];
      char match_len;
      usecount_t usecount;

      load_tsin_entry(sti, &match_len, &usecount, pho, (u_char *)mtch);

      if (match_len < plen)
        continue;

      if (tsin_is_gtab) {
        if (check_gtab_fixed_mismatch(start, mtch, plen))
          continue;
      } else
      if (check_fixed_mismatch(start, mtch, plen))
        continue;

      if (usecount < 0)
        usecount = 0;

      /* Compare the first plen keys of the entry with the candidate's keys;
       * i < plen afterwards means a mismatch.  This i shadows the outer one. */
      int i;
      if (ph_key_sz==2) {
        if (c_pinyin_set) {
//          mask_tone(pp, plen, c_pinyin_set + start);
          mask_tone(mtk, plen, c_pinyin_set + start);
        }

        for(i=0;i < plen;i++)
          if (mtk[i]!=pp[i])
            break;
      } else
      if (ph_key_sz==4) {
        for(i=0;i < plen;i++)
          if (mtk32[i]!=pp32[i])
            break;
      } else {
        for(i=0;i < plen;i++)
          if (mtk64[i]!=pp64[i])
            break;
      }

      if (i < plen)
        continue;

      /* Only entries of exactly plen characters are accepted. */
      if (match_len > plen) {
        continue;
      }

      /* Keep only the entry with the strictly highest usecount so far. */
      if (usecount <= maxusecount)
        continue;

      pbest[0].len = plen;
      maxusecount = usecount;
      utf8cpyN((char *)pbest[0].str, mtch, plen);
      pbest[0].flag |= FLAG_TSIN_PARSE_PHRASE;
      match_phr_N = 1;
      no_match_ch_N = 0;
#if DBG
      utf8_putcharn(mtch, plen);
      dbg(" plen %d usecount:%d ", plen, usecount);
      utf8_putcharn(mtch, plen);
      dbg("\n");
#endif
    }

next:
#if 0
    if (!match_phr_N) {
      if (tsin_is_gtab) {
        if (!(gbuf[start].ch[0] & 0x80))
          no_match_ch_N = 0;
      } else
      if (!(tss.chpho[start].ch[0] & 0x80))
        no_match_ch_N = 0;
    }
#else
//    dbg("no_match_ch_N %d\n", no_match_ch_N);
#endif

    /* Add the result for the rest of the buffer, taken from the cache when
     * available and otherwise computed recursively and then cached. */
    remlen = tsin_parse_len - (start + plen);

    if (remlen) {
      int next = start + plen;
      CACHE *pca;

      short smatch_phr_N, sno_match_ch_N;
      int uc;

      if ((pca = cache_lookup(next))) {
        uc = pca->usecount;
        smatch_phr_N = pca->match_phr_N;
        sno_match_ch_N = pca->no_match_ch_N;
        memcpy(&pbest[1], pca->best, (tsin_parse_len - next) * sizeof(TSIN_PARSE));
      } else {
        uc = tsin_parse_recur(next, &pbest[1], &smatch_phr_N, &sno_match_ch_N);
//        dbg("   gg %d\n", smatch_phr_N);
        add_cache(next, uc, &pbest[1], smatch_phr_N, sno_match_ch_N, tsin_parse_len);
      }

      match_phr_N += smatch_phr_N;
      no_match_ch_N += sno_match_ch_N;
      maxusecount += uc;
    }

    /* Score grows with the logarithm of the total usecount and is divided by
     * (phrase count)^10 and (unmatched characters)^20, so fewer segments and
     * fewer unmatched characters dominate.  The 1.0E-6 terms keep the
     * divisors non-zero when a count is 0. */
    double score = log((double)maxusecount + MAXV) /
      (pow((double)match_phr_N, 10)+ 1.0E-6) / (pow((double)no_match_ch_N, 20) + 1.0E-6);

#if DBG
    dbg("st:%d plen:%d zz muse:%d ma:%d noma:%d  score:%.4e %.4e\n", start, plen,
        maxusecount, match_phr_N, no_match_ch_N, score, bestscore);
#endif

    /* Record this candidate as the new best and publish it to the caller. */
    if (score > bestscore) {
#if DBG
      dbg("is best org %.4e\n", bestscore);
#endif
      bestscore = score;
      memcpy(out, pbest, sizeof(TSIN_PARSE) * (tsin_parse_len - start));

#if DBG
      dbg("    str:%d  ", start);
      int i;
      for(i=0;  i < tsin_parse_len - start; i++) {
        utf8_putcharn((char *)out[i].str, out[i].len);
      }
      dbg("\n");
#endif

      bestusecount = maxusecount;
      *r_match_phr_N = match_phr_N;
      *r_no_match_ch_N = no_match_ch_N;
    }
  }

  /* maxusecount starts negative (5-MAXV); never report a negative total. */
  if (bestusecount < 0)
    bestusecount = 0;

  return bestusecount;
}
int main(int argc, char **argv) { char *fname = "pho.tab2.src"; FILE *fp; char s[64]; int phrase_area_N=0; char *phrase_area = NULL; if (!getenv("NO_GTK_INIT")) gtk_init(&argc, &argv); if (argc > 1) fname = argv[1]; if ((fp=fopen(fname,"rb"))==NULL) p_err("cannot open %s\n", fname); while (!feof(fp)) { s[0]=0; myfgets(s,sizeof(s),fp); int len=strlen(s); if (s[len-1]=='\n') s[--len]=0; if (len==0) continue; phokey_t kk=0; char *p = s; while (*p && *p!=' ' && *p!=9) { if (kk==(BACK_QUOTE_NO << 9)) kk|=*p; else kk |= lookup((u_char *)p); p += utf8_sz(p); } items[itemsN].key = kk; p++; char *str = p; while (*p && *p != ' ' && *p!=9) p++; *p = 0; p++; int slen = strlen(str); if (slen==utf8_sz(str)) { u8cpy((char *)items[itemsN].ch, str); } else { dbg("str %s\n", str); int newN = phrase_area_N + slen + 1; phrase_area = trealloc(phrase_area, char, newN); strcpy(phrase_area + phrase_area_N, str); items[itemsN].ch[0] = PHO_PHRASE_ESCAPE; items[itemsN].ch[1] = phrase_area_N & 0xff; items[itemsN].ch[2] = (phrase_area_N>>8) & 0xff; items[itemsN].ch[3] = (phrase_area_N>>16) & 0xff; phrase_area_N = newN; } items[itemsN].count = atoi(p); items[itemsN].oseq = itemsN; itemsN++; } fclose(fp); qsort(items, itemsN, sizeof(PHITEM), qcmp_key_del); int i; #if 1 int newN = 1; for(i=1;i<itemsN;i++) if (qcmp_key_del(&items[i-1], &items[i])) items[newN++] = items[i]; else { #if 0 prph(items[i].key); utf8_putchar((char *)items[i].ch); dbg("\n"); #endif } if (itemsN != newN) { dbg("deleted %d %d\n",itemsN, newN); itemsN = newN; } #endif qsort(items, itemsN, sizeof(PHITEM), qcmp_key); PHO_IDX pho_idx[3000]; u_short pho_idxN=0; for(i=0; i < itemsN; ) { phokey_t key = items[i].key; pho_idx[pho_idxN].key = key; pho_idx[pho_idxN].start = i; pho_idxN++; int j; for (j=i+1; j < itemsN && items[j].key == key; j++); int l; for(l=i; l<j; l++) { bchcpy(pho_items[pho_itemsN].ch, items[l].ch); pho_items[pho_itemsN].count = items[l].count; pho_itemsN++; } i = j; } char *tp = strstr(fname, ".tab2.src"); if 
(!tp) p_err("file name should be *.tab2.src"); tp = strstr(fname, ".src"); *tp=0; char *fname_out = fname; if ((fp=fopen(fname_out,"wb"))==NULL) p_err("cannot create %s\n", fname_out); fwrite("PH",1,2,fp); // dbg("pho_itemsN:%d pho_idxN:%d\n", pho_itemsN, pho_idxN); fwrite(&pho_idxN, sizeof(u_short), 1, fp); fwrite(&pho_itemsN, sizeof(pho_itemsN), 1, fp); fwrite(&phrase_area_N, sizeof(phrase_area_N), 1, fp); #if 0 fclose(fp); exit(0); #endif fwrite(pho_idx, sizeof(PHO_IDX), pho_idxN, fp); fwrite(pho_items, sizeof(PHO_ITEM), pho_itemsN, fp); fwrite(phrase_area, 1, phrase_area_N, fp); fclose(fp); if (getenv("HIME_NO_RELOAD")==NULL) { /* caleb- does found where "reload" is used. * caleb- think the send_hime_message() here does nothing. */ send_hime_message(GDK_DISPLAY(), "reload"); } return 0; }