コード例 #1
0
ファイル: pho2pinyin.c プロジェクト: medicalwei/hime
/*
 * Determine whether the configured phonetic keyboard is a pinyin one.
 *
 * Fetches the PHONETIC_KEYBOARD setting via get_hime_conf_fstr() (the
 * third argument "zo-asdf" is presumably the fallback value -- confirm
 * against get_hime_conf_fstr) and checks whether the resulting string
 * contains "pinyin".  The outcome is stored in b_pinyin; on a match
 * load_pin_juyin() is called as well.
 *
 * Returns the value of b_pinyin.
 */
gboolean is_pinyin_kbm()
{
  char keyboard_name[32];

  get_hime_conf_fstr(PHONETIC_KEYBOARD, keyboard_name, "zo-asdf");

  /* Not a pinyin mapping: record that and leave without loading the table. */
  if (strstr(keyboard_name, "pinyin") == NULL) {
    b_pinyin = 0;
    return b_pinyin;
  }

  b_pinyin = 1;
  load_pin_juyin();
  return b_pinyin;
}
コード例 #2
0
ファイル: hime-tsa2d32.c プロジェクト: b4283/hime
int main(int argc, char **argv)
{
  FILE *fp,*fw;
  char s[1024];
  u_char chbuf[MAX_PHRASE_LEN * CH_SZ];
  u_short phbuf[80];
  u_int phbuf32[80];
  u_int64_t phbuf64[80];
  int i,j,idx,len, ofs;
  u_short kk;
  u_int64_t kk64;
  int hashidx[TSIN_HASH_N];
  u_char clen;
  int lineCnt=0;
  gboolean reload = getenv("HIME_NO_RELOAD")==NULL;

  if (reload) {
    dbg("need reload\n");
  } else {
    dbg("NO_GTK_INIT\n");
  }

  if (getenv("NO_GTK_INIT")==NULL)
    gtk_init(&argc, &argv);

  dbg("enter %s\n", argv[0]);

  if (argc < 2)
    p_err("must specify input file");


  init_TableDir();

  if ((fp=fopen(argv[1], "rb"))==NULL) {
     printf("Cannot open %s\n", argv[1]);
     exit(-1);
  }

  skip_utf8_sigature(fp);
  char *outfile;
  int fofs = ftell(fp);
  myfgets(s, sizeof(s), fp);
  if (strstr(s, "!!pinyin")) {
    b_pinyin = TRUE;
    printf("is pinyin\n");
    load_pin_juyin();
  } else
    fseek(fp, fofs, SEEK_SET);

  fofs = ftell(fp);
  int keybits=0, maxkey=0;
  char keymap[128];
  char kno[128];
  bzero(kno, sizeof(kno));
  myfgets(s, sizeof(s), fp);
  puts(s);
  if (strstr(s, TSIN_GTAB_KEY)) {
    is_gtab = TRUE;
    lineCnt++;

    if (argc < 3)
      p_err("useage %s input_file output_file", argv[0]);

    outfile = argv[2];

    len=strlen((char *)s);
    if (s[len-1]=='\n')
      s[--len]=0;
    char aa[128];
    keymap[0]=' ';
    sscanf(s, "%s %d %d %s", aa, &keybits, &maxkey, keymap+1);
    for(i=0; keymap[i]; i++)
      kno[keymap[i]]=i;

    if (maxkey * keybits > 32)
      gtabkey64 = TRUE;
  } else {
    if (argc==3)
      outfile = argv[2];
    else
      outfile = "tsin32";

    fseek(fp, fofs, SEEK_SET);
  }



  INMD inmd, *cur_inmd = &inmd;

  char *cphbuf;
  if (is_gtab) {
    cur_inmd->keybits = keybits;
    if (gtabkey64) {
      cphbuf = (char *)phbuf64;
      phsz = 8;
      key_cmp = key_cmp64;
      hash_shift = TSIN_HASH_SHIFT_64;
      cur_inmd->key64 = TRUE;
    } else {
      cphbuf = (char *)phbuf32;
      phsz = 4;
      hash_shift = TSIN_HASH_SHIFT_32;
      key_cmp = key_cmp32;
      cur_inmd->key64 = FALSE;
    }
    cur_inmd->last_k_bitn = (((cur_inmd->key64 ? 64:32) / cur_inmd->keybits) - 1) * cur_inmd->keybits;
    dbg("cur_inmd->last_k_bitn %d\n", cur_inmd->last_k_bitn);
  } else {
      cphbuf = (char *)phbuf;
      phsz = 2;
      key_cmp = key_cmp16;
      hash_shift = TSIN_HASH_SHIFT;
  }

  dbg("phsz: %d\n", phsz);

  phcount=ofs=0;
  while (!feof(fp)) {
    usecount_t usecount=0;

    lineCnt++;

    myfgets((char *)s,sizeof(s),fp);
    len=strlen((char *)s);
    if (s[0]=='#')
      continue;

    if (strstr(s, TSIN_GTAB_KEY))
      continue;

    if (s[len-1]=='\n')
      s[--len]=0;

    if (len==0)
      continue;

    i=0;
    int chbufN=0;
    int charN = 0;
    while (s[i]!=' ' && i<len) {
      int len = utf8_sz((char *)&s[i]);

      memcpy(&chbuf[chbufN], &s[i], len);

      i+=len;
      chbufN+=len;
      charN++;
    }

    while ((i < len && s[i]==' ') || s[i]=='\t')
      i++;

    int phbufN=0;
    while (i<len && phbufN < charN && s[i]!=' ') {
      if (is_gtab) {
        kk64=0;
        int idx=0;
        while (s[i]!=' ' && i<len) {
          int k = kno[s[i]];
          kk64|=(u_int64_t)k << ( LAST_K_bitN - idx*keybits);
          i++;
          idx++;
        }

        if (phsz==8)
          phbuf64[phbufN++]=kk64;
        else
          phbuf32[phbufN++]=(u_int)kk64;
      } else {
        kk=0;
        if (b_pinyin) {
          kk = pinyin2phokey(s+i);
          while (s[i]!=' ' && i<len)
            i++;
        } else {
          while (s[i]!=' ' && i<len) {
            if (kk==(BACK_QUOTE_NO << 9))
              kk|=s[i];
            else
              kk |= lookup((u_char *)&s[i]);

            i+=utf8_sz((char *)&s[i]);
          }
        }

        phbuf[phbufN++]=kk;
      }

      i++;
    }

    if (phbufN!=charN) {
      p_err("%s   Line %d problem in phbufN!=chbufN %d != %d\n", s, lineCnt, phbufN, chbufN);
    }

    clen=phbufN;

    while (i<len && s[i]==' ')
      i++;

    if (i==len)
      usecount = 0;
    else
      usecount = atoi((char *)&s[i]);

    /*      printf("len:%d\n", clen); */

    if (phcount >= phidxsize) {
      phidxsize+=1024;
      if (!(phidx=(int *)realloc(phidx,phidxsize*4))) {
        puts("realloc err");
        exit(1);
      }
    }

    phidx[phcount++]=ofs;

    int new_bfN = ofs + 1 + sizeof(usecount_t)+ phsz * clen + chbufN;

    if (bfsize < new_bfN) {
      bfsize = new_bfN + 1024*1024;
      bf = (char *)realloc(bf, bfsize);
    }

    memcpy(&bf[ofs++],&clen,1);
    memcpy(&bf[ofs],&usecount, sizeof(usecount_t)); ofs+=sizeof(usecount_t);

    memcpy(&bf[ofs], cphbuf, clen * phsz);
    ofs+=clen * phsz;

    memcpy(&bf[ofs], chbuf, chbufN);
    ofs+=chbufN;
  }
  fclose(fp);

  /* dumpbf(bf,phidx); */

  puts("Sorting ....");

  qsort(phidx,phcount, sizeof(phidx[0]),qcmp);

  if (!(sf=(u_char *)malloc(bfsize))) {
    puts("malloc err");
    exit(1);
  }

  if (!(sidx=(int *)malloc(phidxsize*sizeof(int)))) {
    puts("malloc err");
    exit(1);
  }


  // delete duplicate
  ofs=0;
  j=0;
  for(i=0;i<phcount;i++) {
    idx = phidx[i];
    sidx[j]=ofs;
    len=bf[idx];
    int tlen = utf8_tlen(&bf[idx + 1 + sizeof(usecount_t) + phsz*len], len);
    clen= phsz*len + tlen + 1 + sizeof(usecount_t);

    if (i && !qcmp_eq(&phidx[i-1], &phidx[i]))
      continue;

    memcpy(&sf[ofs], &bf[idx], clen);
    j++;
    ofs+=clen;
  }

  phcount=j;
#if 1
  puts("Sorting by usecount ....");
  qsort(sidx, phcount, 4, qcmp_usecount);
#endif

  for(i=0;i<256;i++)
    hashidx[i]=-1;

  for(i=0;i<phcount;i++) {
    idx=sidx[i];
    idx+= 1 + sizeof(usecount_t);
    int v;

    if (phsz==2) {
      phokey_t kk;
      memcpy(&kk, &sf[idx], phsz);
      v = kk >> TSIN_HASH_SHIFT;
    } else if (phsz==4) {