/*
 * Build a dictionary file from a source corpus.
 *
 * `args` is parsed as a 3-tuple of strings: (type, source_file, target_file).
 * `type` is compared by prefix only:
 *   - starts with "unigram":   source_file is opened with UnigramCorpusReader,
 *                              imported into a UnigramDict and saved to
 *                              target_file.
 *   - starts with "thesaurus": ThesaurusDict::import(source_file, target_file)
 *                              is called.
 *
 * Returns a new Python int holding the import() result, a new reference to
 * None when `type` matches neither prefix, or NULL when argument parsing
 * fails (PyArg_Parse has then set the Python exception).
 */
PyObject * PyMmseg_BuildDict(PyObject * self, PyObject* args)
{
    (void)self; /* instance state is not used here */

    char *type;
    char *source_file;
    char *target_file;
    if (!PyArg_Parse(args, "(sss)", &type, &source_file, &target_file))
        return NULL;

    if (strncmp("unigram", type, 7) == 0) {
        UnigramCorpusReader ur;
        ur.open(source_file, NULL);
        int ret = 0;
        {
            /* Inner scope: the dictionary object is destroyed before the
             * result is boxed and returned. */
            UnigramDict ud;
            ret = ud.import(ur);
            ud.save(target_file);
        }
        return Py_BuildValue("i", ret);
    }

    if (strncmp("thesaurus", type, 9) == 0) {
        ThesaurusDict tdict;
        int ret = tdict.import(source_file, target_file);
        return Py_BuildValue("i", ret);
    }

    /* Unknown type. Py_RETURN_NONE takes the reference the caller will own;
     * returning the bare Py_None pointer would under-count the singleton. */
    Py_RETURN_NONE;
}
int main(int argc, char **argv) { int c; const char* corpus_file = NULL; const char* uni_corpus_file = NULL; const char* thesaurus_file = NULL; const char* out_file = NULL; const char* dict_path = NULL; const char* target_file = NULL; char out_buf[512]; if(argc < 2){ usage(argv[0]); exit(0); } u1 bPlainText = 0; u1 bUcs2 = 0; while ((c = getopt(argc, argv, "t:b:u:d:o:rU")) != -1) { switch (c) { case 'o': target_file = optarg; break; case 'u': uni_corpus_file = optarg; break; case 'b': corpus_file = optarg; break; case 'd': dict_path = optarg; break; case 't': thesaurus_file = optarg; break; case 'r': bPlainText = 1; break; case 'U': bUcs2 = 1; break; case 'h': usage(argv[0]); exit(0); default: fprintf(stderr, "Illegal argument \"%c\"\n", c); return 1; } } if(optind < argc) { out_file = argv[optind]; } if(thesaurus_file) { ThesaurusDict tdict; tdict.import(thesaurus_file, target_file); //ThesaurusDict ldict; //ldict.load("thesaurus.lib"); return 0; } if(corpus_file){ //build Synonyms dictionary SynonymsDict dict; dict.import(corpus_file); if(target_file) dict.save(target_file); else dict.save("synonyms.dat"); //debug use //dict.load("synonyms.dat"); //printf("%s\n", dict.exactMatch("c#")); return 0; } if(!corpus_file && !dict_path) { //build unigram if(!out_file) { //build the output filename size_t len = strlen(uni_corpus_file); memcpy(out_buf,uni_corpus_file,len); memcpy(&out_buf[len],".uni\0",5); out_file = out_buf; } if(target_file) { out_file = target_file; } UnigramCorpusReader ur; ur.open(uni_corpus_file,bPlainText?"plain":NULL); if(!bUcs2){ UnigramDict ud; int ret = ud.import(ur); ud.save(out_file); //check int i = 0; for(i=0;i<ur.count();i++) { UnigramRecord* rec = ur.getAt(i); if(ud.exactMatch(rec->key.c_str()) == rec->count){ continue; }else{ printf("error!!!"); } }//end for }else{ printf("UCS2 used as inner encoding, is unsupported\n"); } return 0; }else if(!dict_path){ //not segment mode. 
//build bigram if(!out_file) { //build the output filename size_t len = strlen(corpus_file); memcpy(out_buf,corpus_file,len); memcpy(&out_buf[len],".bi\0",4); out_file = out_buf; } printf("Bigram build unsupported.\n"); }//end if(!corpus_file) //Segment mode { SegmenterManager* mgr = new SegmenterManager(); int nRet = 0; if(dict_path) nRet = mgr->init(dict_path); else{ usage(argv[0]); exit(0); } if(nRet == 0){ //init ok, do segment. Segmenter* seg = mgr->getSegmenter(); segment(out_file,seg); } delete mgr; } return 0; }