Beispiel #1
0
/*
 * Dump the loaded pho table: for every index entry, print each item in its
 * range as "<key> <string> <count>", one item per line.
 *
 * The tool takes no file argument; passing one is reported through p_err().
 */
int main(int argc, char **argv)
{
  gtk_init(&argc, &argv);
  load_setttings();

  if (argc > 1) {
    p_err("Currently only support ~/.config/hime/pho.tab2");
  }

  pho_load();

  int grp = 0;
  while (grp < idxnum_pho) {
    phokey_t grp_key = idx_pho[grp].key;
    int first = idx_pho[grp].start;
    /* the range of this key ends where the next index entry starts */
    int limit = idx_pho[grp+1].start;
    int ent = first;

    while (ent < limit) {
      prph(grp_key);
      char *text = pho_idx_str(ent);
      dbg(" %s %d\n", text, ch_pho[ent].count);
      ent++;
    }

    grp++;
  }

  return 0;
}
Beispiel #2
0
/* Print the first ksN keys of ks via prph(), emitting one space after each. */
void prphs(phokey_t *ks, int ksN)
{
  int n = 0;

  while (n < ksN) {
    prph(ks[n]);
    dbg(" ");
    n++;
  }
}
Beispiel #3
0
/*
 * Recursively choose a segmentation of the buffer positions
 * [start, tsin_parse_len): every prefix length plen is tried as the first
 * segment, the remainder is parsed recursively (or taken from the cache), and
 * the split with the highest score is kept.
 *
 *   start            first buffer position to parse
 *   out              receives the chosen segments; (tsin_parse_len - start)
 *                    TSIN_PARSE entries are copied whenever a better split
 *                    is found
 *   r_match_phr_N    receives the number of matched phrases in the chosen split
 *   r_no_match_ch_N  receives the number of characters in the chosen split
 *                    that are not covered by a matched phrase
 *
 * Returns the accumulated use count of the chosen split, clamped to >= 0.
 */
int tsin_parse_recur(int start, TSIN_PARSE *out,
                     short *r_match_phr_N, short *r_no_match_ch_N)
{
  int plen;
  double bestscore = -1;
  int bestusecount = 0;
  /* Defaults reported if no candidate ever beats bestscore: nothing matched. */
  *r_match_phr_N = 0;
  *r_no_match_ch_N = tsin_parse_len - start;


  /* plen = length of the candidate first segment starting at `start`. */
  for(plen=1; start + plen <= tsin_parse_len && plen <= MAX_PHRASE_LEN; plen++) {
#if DBG
    dbg("---- aa st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
#endif
    /* Stop extending once the candidate's last position carries
     * FLAG_CHPHO_PHRASE_USER_HEAD; such a position may only be the first
     * element of a segment (presumably the head of a user-chosen phrase). */
    if (plen > 1) {
      if (tsin_is_gtab) {
        if (gbuf[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
          break;
      } else
        if (tss.chpho[start+plen-1].flag & FLAG_CHPHO_PHRASE_USER_HEAD)
          break;
    }

    /* Key buffers for the candidate, one per supported key width;
     * ppp is pointed at the one selected by ph_key_sz below. */
    phokey_t pp[MAX_PHRASE_LEN + 1];
    u_int pp32[MAX_PHRASE_LEN + 1];
    u_int64_t pp64[MAX_PHRASE_LEN + 1];
    int sti, edi;
    /* pbest[0] is the candidate segment, pbest[1..] the parse of the rest. */
    TSIN_PARSE pbest[MAX_PH_BF_EXT+1];
#define MAXV 1000
    /* Starts at 5-MAXV so that maxusecount + MAXV (used in the score) is 5
     * when no database entry matches; any usecount >= 0 beats this value. */
    int maxusecount = 5-MAXV;
    int remlen;
    /* Until a phrase matches, all plen characters count as unmatched. */
    short match_phr_N=0, no_match_ch_N = plen;
    void *ppp;

    if (ph_key_sz==2)
      ppp=pp;
    else if (ph_key_sz==4)
      ppp=pp32;
    else
      ppp=pp64;

    /* NOTE(review): clears tsin_parse_len entries of an array that holds
     * MAX_PH_BF_EXT+1 — assumes tsin_parse_len <= MAX_PH_BF_EXT+1; confirm. */
    bzero(pbest, sizeof(TSIN_PARSE) * tsin_parse_len);

    pbest[0].len = plen;
    pbest[0].start = start;
    int i, ofs;

    /* Default text of the segment: the characters currently in the buffer. */
    if (tsin_is_gtab)
      for(ofs=i=0; i < plen; i++)
        ofs += utf8cpy((char *)pbest[0].str + ofs, gbuf[start + i].ch);
    else
      for(ofs=i=0; i < plen; i++)
        ofs += utf8cpy((char *)pbest[0].str + ofs, tss.chpho[start + i].ch);

#if DBG
    dbg("st:%d hh plen:%d ", start, plen);utf8_putchar(tss.chpho[start].ch); dbg("\n");
#endif

    /* Collect the keys of the candidate into the selected buffer. */
    if (tsin_is_gtab)
      extract_gtab_key(start, plen, ppp);
    else {
      extract_pho(start, plen, (phokey_t *)ppp);
      /* NOTE(review): mask_tone is given pp (the 2-byte buffer) regardless of
       * ph_key_sz — presumably this path always runs with ph_key_sz==2; confirm. */
      if (c_pinyin_set)
        mask_tone(pp, plen, c_pinyin_set + start);
    }

#if DBG
    for(i=0; i < plen; i++) {
      prph(pp[i]); dbg("%d", c_pinyin_set[i+start]);
    }
    dbg("\n");
#endif

    /* Look up the range [sti, edi) of database entries for these keys. */
    char *pinyin_set = c_pinyin_set ? c_pinyin_set+start:NULL;
    if (!tsin_seek(ppp, plen, &sti, &edi, pinyin_set)) {
//      dbg("tsin_seek not found...\n");
      /* Nothing found: give up on this and all longer prefixes, except that a
       * single character is still scored below as an unmatched segment. */
      if (plen > 1)
        break;
      goto next;
    }

    /* Key buffers for the database entry being examined, again one per key
     * width; pho is pointed at the one selected by ph_key_sz. */
    phokey_t mtk[MAX_PHRASE_LEN];
    u_int mtk32[MAX_PHRASE_LEN];
    u_int64_t mtk64[MAX_PHRASE_LEN];
    void *pho;

    if (ph_key_sz==2)
      pho=mtk;
    else if (ph_key_sz==4)
      pho=mtk32;
    else
      pho=mtk64;

    /* Scan the candidate entries, keeping the exact-length match with the
     * highest use count. */
    for (;sti < edi; sti++) {
      char mtch[MAX_PHRASE_LEN*CH_SZ+1];
      char match_len;
      usecount_t usecount;

      load_tsin_entry(sti, &match_len, &usecount, pho, (u_char *)mtch);

      if (match_len < plen)
        continue;

      /* Skip entries rejected by the fixed-character check for this range. */
      if (tsin_is_gtab) {
        if (check_gtab_fixed_mismatch(start, mtch, plen))
          continue;
      } else
      if (check_fixed_mismatch(start, mtch, plen))
        continue;

      if (usecount < 0)
        usecount = 0;

      /* Compare the entry's first plen keys with the candidate's keys;
       * i < plen afterwards means some key differed. */
      int i;
      if (ph_key_sz==2) {
        if (c_pinyin_set) {
//          mask_tone(pp, plen, c_pinyin_set + start);
          /* pp was already masked above; mask the entry's keys the same way. */
          mask_tone(mtk, plen, c_pinyin_set + start);
        }
        for(i=0;i < plen;i++)
          if (mtk[i]!=pp[i])
            break;
      } else if (ph_key_sz==4) {
        for(i=0;i < plen;i++)
          if (mtk32[i]!=pp32[i])
            break;
      } else {
        for(i=0;i < plen;i++)
          if (mtk64[i]!=pp64[i])
            break;
      }

      if (i < plen)
        continue;

      /* Entries longer than the candidate are not accepted: together with the
       * match_len < plen test above, only exact-length entries remain. */
      if (match_len > plen) {
        continue;
      }

      /* Keep only an entry that strictly beats the best use count so far. */
      if (usecount <= maxusecount)
        continue;

      /* New best entry: replace the segment text and mark it as a phrase. */
      pbest[0].len = plen;
      maxusecount = usecount;
      utf8cpyN((char *)pbest[0].str, mtch, plen);
      pbest[0].flag |= FLAG_TSIN_PARSE_PHRASE;

      match_phr_N = 1;
      no_match_ch_N = 0;
#if DBG
      utf8_putcharn(mtch, plen);
      dbg("   plen %d usecount:%d  ", plen, usecount);
        utf8_putcharn(mtch, plen);
      dbg("\n");
#endif
    }


next:

#if 0
    if (!match_phr_N) {
      if (tsin_is_gtab) {
        if (!(gbuf[start].ch[0] & 0x80))
          no_match_ch_N = 0;
      } else
      if (!(tss.chpho[start].ch[0] & 0x80))
        no_match_ch_N = 0;
    }
#else
//	dbg("no_match_ch_N %d\n", no_match_ch_N);
#endif

    /* Number of buffer positions left after this segment. */
    remlen = tsin_parse_len - (start + plen);


    if (remlen) {
      int next = start + plen;
      CACHE *pca;

      short smatch_phr_N, sno_match_ch_N;
      int uc;

      /* Parse the remainder: reuse a cached result for position `next` if one
       * exists, otherwise recurse and store the result in the cache. */
      if ((pca = cache_lookup(next))) {
        uc = pca->usecount;
        smatch_phr_N = pca->match_phr_N;
        sno_match_ch_N = pca->no_match_ch_N;
        memcpy(&pbest[1], pca->best, (tsin_parse_len - next) * sizeof(TSIN_PARSE));
      } else {
        uc = tsin_parse_recur(next, &pbest[1], &smatch_phr_N, &sno_match_ch_N);
//        dbg("   gg %d\n", smatch_phr_N);
        add_cache(next, uc, &pbest[1], smatch_phr_N, sno_match_ch_N, tsin_parse_len);
      }

      /* Fold the remainder's statistics into those of the whole split. */
      match_phr_N += smatch_phr_N;
      no_match_ch_N += sno_match_ch_N;
      maxusecount += uc;
    }


    /* Score grows with log(use count) and is divided by match_phr_N^10 and
     * no_match_ch_N^20, so fewer phrases and, even more strongly, fewer
     * unmatched characters win; the 1.0E-6 terms avoid division by zero. */
    double score = log((double)maxusecount + MAXV) /
      (pow((double)match_phr_N, 10)+ 1.0E-6) / (pow((double)no_match_ch_N, 20) + 1.0E-6);

#if DBG
    dbg("st:%d plen:%d zz muse:%d ma:%d noma:%d  score:%.4e %.4e\n", start, plen,
        maxusecount, match_phr_N, no_match_ch_N, score, bestscore);
#endif
    /* Best split so far: publish it to the caller's outputs. */
    if (score > bestscore) {
#if DBG
      dbg("is best org %.4e\n", bestscore);
#endif
      bestscore = score;
      memcpy(out, pbest, sizeof(TSIN_PARSE) * (tsin_parse_len - start));

#if DBG
      dbg("    str:%d  ", start);
      int i;
      for(i=0;  i < tsin_parse_len - start; i++) {
        utf8_putcharn((char *)out[i].str, out[i].len);
      }
      dbg("\n");
#endif

      bestusecount = maxusecount;
      *r_match_phr_N = match_phr_N;
      *r_no_match_ch_N = no_match_ch_N;
    }
  }

  /* maxusecount can still be negative (it starts at 5-MAXV); never return that. */
  if (bestusecount < 0)
    bestusecount = 0;

  return bestusecount;
}
Beispiel #4
0
/*
 * Compile a textual phonetic table (*.tab2.src) into its binary *.tab2 form.
 *
 * Usage: <prog> [file.tab2.src]        (default input: "pho.tab2.src")
 *
 * Every non-empty input line is read as three fields separated by a space or
 * a tab: the key spelling, a character or phrase, and a count.  Lines that
 * lack the second field are reported through dbg() and ignored.
 *
 * The output file is the input name without its ".src" suffix and contains,
 * in order: the bytes "PH", pho_idxN (u_short), pho_itemsN, phrase_area_N,
 * the PHO_IDX array, the PHO_ITEM array and the phrase area.
 *
 * Environment:
 *   NO_GTK_INIT     if set, gtk_init() is not called
 *   HIME_NO_RELOAD  if set, no "reload" message is sent afterwards
 *
 * Returns 0; failures are reported through p_err(), which the code relies on
 * not returning.
 */
int main(int argc, char **argv)
{
  char *fname = "pho.tab2.src";
  FILE *fp;
  char s[64];
  int phrase_area_N=0;
  char *phrase_area = NULL;

  if (!getenv("NO_GTK_INIT"))
    gtk_init(&argc, &argv);

  if (argc > 1)
    fname = argv[1];

  if ((fp=fopen(fname,"rb"))==NULL)
    p_err("cannot open %s\n", fname);


  while (!feof(fp)) {
    s[0]=0;
    myfgets(s,sizeof(s),fp);
    int len=strlen(s);

    /* len may be 0 (e.g. at end of file): never look at s[-1] */
    if (len > 0 && s[len-1]=='\n')
      s[--len]=0;

    if (len==0)
      continue;

    phokey_t kk=0;
    char *p = s;
    char *line_end = s + len;

    /* Field 1: the key, accumulated one UTF-8 symbol at a time. */
    while (*p && *p!=' ' && *p!='\t') {
      if (kk==(BACK_QUOTE_NO << 9))
        kk|=*p;
      else
        kk |= lookup((u_char *)p);

      /* a truncated multi-byte sequence must not carry p past the NUL */
      int sz = utf8_sz(p);
      if (sz > line_end - p)
        sz = line_end - p;
      p += sz;
    }

    /* Skip the separator only if there is one; stepping over the NUL would
     * make the code below parse stale bytes left from a previous line. */
    if (*p)
      p++;

    /* Field 2: a single character or a phrase. */
    char *str = p;
    while (*p && *p != ' ' && *p!='\t')
      p++;

    if (*p) {
      *p = 0;
      p++;
    }

    if (!*str) {
      dbg("malformed line ignored: %s\n", s);
      continue;
    }

    items[itemsN].key = kk;

    int slen = strlen(str);
    if (slen==utf8_sz(str)) {
      u8cpy((char *)items[itemsN].ch, str);
    } else {
      /* Phrases live in a separate area; the item stores an escape byte
       * followed by the 24-bit offset of the phrase in that area. */
      dbg("str %s\n", str);
      int area_end = phrase_area_N + slen + 1;
      phrase_area = trealloc(phrase_area, char, area_end);
      if (!phrase_area)
        p_err("out of memory\n");
      strcpy(phrase_area + phrase_area_N, str);
      items[itemsN].ch[0] = PHO_PHRASE_ESCAPE;
      items[itemsN].ch[1] = phrase_area_N & 0xff;
      items[itemsN].ch[2] = (phrase_area_N>>8) & 0xff;
      items[itemsN].ch[3] = (phrase_area_N>>16) & 0xff;
      phrase_area_N = area_end;
    }

    /* Field 3: the count; p is at the NUL when the field is missing -> 0. */
    items[itemsN].count = atoi(p);
    items[itemsN].oseq = itemsN;

    itemsN++;
  }

  fclose(fp);


  /* Sort so that duplicates become adjacent, then keep the first of each run.
   * Guarded: with no items at all the count must stay 0, not become 1. */
  qsort(items, itemsN, sizeof(PHITEM), qcmp_key_del);
  int i;

  if (itemsN > 0) {
    int keptN = 1;
    for(i=1;i<itemsN;i++)
      if (qcmp_key_del(&items[i-1], &items[i]))
        items[keptN++] = items[i];

    if (itemsN != keptN) {
      dbg("deleted %d %d\n",itemsN, keptN);
      itemsN = keptN;
    }
  }

  qsort(items, itemsN, sizeof(PHITEM), qcmp_key);

  PHO_IDX pho_idx[3000];
  u_short pho_idxN=0;
  const size_t pho_idx_max = sizeof(pho_idx) / sizeof(pho_idx[0]);

  /* One index entry per distinct key, pointing at the first of its items. */
  for(i=0; i < itemsN; ) {
    if (pho_idxN >= pho_idx_max)
      p_err("too many distinct keys (limit %d)\n", (int)pho_idx_max);

    phokey_t key = items[i].key;
    pho_idx[pho_idxN].key = key;
    pho_idx[pho_idxN].start = i;
    pho_idxN++;

    int j;

    for (j=i+1; j < itemsN && items[j].key == key; j++);

    int l;
    for(l=i; l<j; l++) {
      bchcpy(pho_items[pho_itemsN].ch, items[l].ch);
      pho_items[pho_itemsN].count = items[l].count;
      pho_itemsN++;
    }

    i = j;
  }

  char *tp = strstr(fname, ".tab2.src");
  if (!tp)
    p_err("file name should be *.tab2.src");

  /* Build the output name in a private copy: fname may point at a string
   * literal, which must not be written to.  The name is cut right after the
   * ".tab2" of the match found above. */
  char *fname_out = NULL;
  fname_out = trealloc(fname_out, char, strlen(fname) + 1);
  if (!fname_out)
    p_err("out of memory\n");
  strcpy(fname_out, fname);
  fname_out[(tp - fname) + strlen(".tab2")] = 0;

  if ((fp=fopen(fname_out,"wb"))==NULL)
    p_err("cannot create %s\n", fname_out);

  fwrite("PH",1,2,fp);
  fwrite(&pho_idxN, sizeof(u_short), 1, fp);
  fwrite(&pho_itemsN, sizeof(pho_itemsN), 1, fp);
  fwrite(&phrase_area_N, sizeof(phrase_area_N), 1, fp);
  fwrite(pho_idx, sizeof(PHO_IDX), pho_idxN, fp);
  fwrite(pho_items, sizeof(PHO_ITEM), pho_itemsN, fp);

  /* phrase_area is still NULL when no phrase was seen */
  if (phrase_area_N > 0)
    fwrite(phrase_area, 1, phrase_area_N, fp);

  /* A short write sets the stream's error flag; do not leave a truncated
   * table behind unnoticed. */
  if (ferror(fp))
    p_err("error writing %s\n", fname_out);

  if (fclose(fp))
    p_err("error closing %s\n", fname_out);

  if (getenv("HIME_NO_RELOAD")==NULL) {
    /* NOTE (caleb): could not find where the "reload" message is handled;
     * this send_hime_message() call possibly has no effect.
     */
    send_hime_message(GDK_DISPLAY(), "reload");
  }

  return 0;
}