/**
 * Python binding: tokenize a string with the segmenter manager stored on
 * the "cmmseg" module.
 *
 * @param self  unused.
 * @param args  a 1-tuple holding the text to segment (parsed with "(s)").
 * @return a new list containing one string per token, or NULL with a Python
 *         exception set when the module cannot be imported, no segmenter
 *         manager has been registered under "__segmgr", the arguments are
 *         invalid, or an allocation fails.
 */
PyObject *segment(PyObject *self, PyObject *args) {
	SegmenterManager *mgr = NULL;
	{
		// PyImport_ImportModule returns a new reference (or NULL with an
		// exception already set); it must be checked and released.
		PyObject *module = PyImport_ImportModule("cmmseg");
		if (!module) {
			return NULL;
		}
		// Both lookups below yield borrowed references; nothing to release.
		PyObject *module_dict = PyModule_GetDict(module);
		PyObject *c_api_object = module_dict
			? PyDict_GetItemString(module_dict, "__segmgr")
			: NULL;
		if (c_api_object && PyCObject_Check(c_api_object)) {
			mgr = (SegmenterManager*)PyCObject_AsVoidPtr(c_api_object);
		}
		Py_DECREF(module);
	}
	if (!mgr) {
		PyErr_SetString(PyExc_ValueError, "Needs load segment dictionary library frist!");
		return NULL;
	}

	// Parse the arguments before acquiring anything that needs cleanup.
	char *fromPython = NULL;
	if (!PyArg_Parse(args, "(s)", &fromPython)) {
		return NULL;
	}

	PyObject *seg_result = PyList_New(0);
	if (!seg_result) {
		return NULL;
	}

	// Per the original author's note, getSegmenter(false) hands out a
	// segmenter that is not taken from the pool, so it is deleted here.
	Segmenter *seg = mgr->getSegmenter(false);
	if (!seg) {
		Py_DECREF(seg_result);
		PyErr_SetString(PyExc_RuntimeError, "failed to create segmenter");
		return NULL;
	}
	seg->setBuffer((u1*)fromPython, (u4)strlen(fromPython));

	while (1) {
		u2 len = 0, symlen = 0;
		char *tok = (char*)seg->peekToken(len, symlen);
		if (!tok || !*tok || !len) {
			break;
		}
		// Append the token; PyList_Append takes its own reference, so ours
		// is dropped afterwards regardless of the outcome.
		PyObject *res = PyString_FromStringAndSize(tok, len);
		int rc = res ? PyList_Append(seg_result, res) : -1;
		Py_XDECREF(res);
		if (rc != 0) {
			// An exception is already set; discard the partial result.
			Py_DECREF(seg_result);
			seg_result = NULL;
			break;
		}
		seg->popToken(len);
	}

	delete seg;
	return seg_result;
}
 /**
  * Python binding: load the segmentation dictionary and register the
  * resulting SegmenterManager on the "cmmseg" module as "__segmgr".
  *
  * If a manager is already registered the call is a no-op and `args` is not
  * inspected.
  *
  * @param self  returned (with a new reference) when a manager is already
  *              registered and self is non-NULL.
  * @param args  a 1-tuple holding the dictionary path (parsed with "(s)").
  * @return a new reference to `self` or to the "cmmseg" module on success;
  *         NULL with a Python exception set on failure.
  *
  * NOTE(review): the manager is wrapped in a PyCObject with no destructor,
  * so it is never freed once registered — presumably intended to live for
  * the whole process; confirm.
  */
 PyObject *init(PyObject *self, PyObject *args) {
	// New reference (or NULL with an exception set); released on every path
	// that does not return it to the caller.
	PyObject *module = PyImport_ImportModule("cmmseg");
	if (!module) {
		return NULL;
	}

	// Already initialised? Then there is nothing to load.
	{
		PyObject *module_dict = PyModule_GetDict(module);
		if (module_dict) {
			PyObject *c_api_object = PyDict_GetItemString(module_dict, "__segmgr");
			if (c_api_object && PyCObject_Check(c_api_object)) {
				if (self) {
					// self is borrowed; the caller expects an owned reference.
					Py_INCREF(self);
					Py_DECREF(module);
					return self;
				}
				// self may be NULL for module-level functions; returning it
				// would signal an error without an exception set.
				return module;
			}
		}
	}

	char *fromPython = NULL;
	if (!PyArg_Parse(args, "(s)", &fromPython)) {
		Py_DECREF(module);
		PyErr_SetString(PyExc_ValueError, "invalid dict_path");
		return NULL;
	}

	SegmenterManager *mgr = new SegmenterManager();
	if (mgr->init(fromPython) != 0) {
		delete mgr;
		Py_DECREF(module);
		PyErr_SetString(PyExc_ValueError, "invalid dict_path");
		return NULL;
	}

	// Bind the manager to the module object.
	PyObject *c_api_object = PyCObject_FromVoidPtr((void *)mgr, NULL);
	if (!c_api_object) {
		delete mgr;
		Py_DECREF(module);
		return NULL;
	}
	// PyModule_AddObject steals the reference only on success.
	if (PyModule_AddObject(module, "__segmgr", c_api_object) != 0) {
		Py_DECREF(c_api_object);
		delete mgr;
		Py_DECREF(module);
		return NULL;
	}
	return module;
}
Exemplo n.º 3
0
int mmseg_init(char *path)
{
	if (!path) {
		return -1;
	}
	if (mgr_) {
		return 0;
	}
	SegmenterManager *mgr = new SegmenterManager();
	int ret = mgr->init(path);
	if (ret != 0) {
		delete mgr;
		cerr << "error: invalid dict path\n";
		return -1;
	}
	mgr_ = mgr;
	return 0;
}
Exemplo n.º 4
0
int main(int argc, char **argv) {
	int c;
	const char* corpus_file = NULL;
	const char* uni_corpus_file = NULL;
	const char* thesaurus_file = NULL;
	const char* out_file = NULL;
	const char* dict_path = NULL;
	const char* target_file = NULL;
	char out_buf[512];
	
	if(argc < 2){
		usage(argv[0]);
		exit(0);
	}
	u1 bPlainText = 0;
	u1 bUcs2 = 0;
	while ((c = getopt(argc, argv, "t:b:u:d:o:rU")) != -1) {
		switch (c) {
		case 'o':
			target_file = optarg;
			break;
		case 'u':
			uni_corpus_file = optarg;
			break;
		case 'b':
			corpus_file = optarg;
			break;
		case 'd':
			dict_path = optarg;
			break;
		case 't':
			thesaurus_file = optarg;
			break;
		case 'r':
			bPlainText = 1;
			break;
		case 'U':
			bUcs2 = 1;
			break;
		case 'h':
			usage(argv[0]);
			exit(0);
		default:
			fprintf(stderr, "Illegal argument \"%c\"\n", c);
			return 1;
		}
	}

	if(optind < argc) {
		out_file = argv[optind];
	}

	if(thesaurus_file) {
		ThesaurusDict tdict;
		tdict.import(thesaurus_file, target_file);
		//ThesaurusDict ldict;
		//ldict.load("thesaurus.lib");
		return 0;
	}

	if(corpus_file){
		//build Synonyms dictionary
		SynonymsDict dict;
		dict.import(corpus_file);
		if(target_file)
		   dict.save(target_file);
		else
		   dict.save("synonyms.dat");
		//debug use
		//dict.load("synonyms.dat");
		//printf("%s\n", dict.exactMatch("c#"));
		return 0;
	}

	if(!corpus_file && !dict_path) {
		//build unigram 
		if(!out_file) {
			//build the output filename
			size_t len = strlen(uni_corpus_file);
			memcpy(out_buf,uni_corpus_file,len);
			memcpy(&out_buf[len],".uni\0",5);
			out_file = out_buf;
		}
		
		if(target_file) {
			out_file = target_file;
		}
		
		UnigramCorpusReader ur;
		ur.open(uni_corpus_file,bPlainText?"plain":NULL);
		if(!bUcs2){
			UnigramDict ud;
			int ret = ud.import(ur);
			ud.save(out_file);		
			//check
			int i = 0;
			for(i=0;i<ur.count();i++)
			{
				UnigramRecord* rec = ur.getAt(i);
				
				if(ud.exactMatch(rec->key.c_str()) == rec->count){
					continue;
				}else{
					printf("error!!!");
				}
			}//end for
		}else{
			printf("UCS2 used as inner encoding, is unsupported\n");
		}
		return 0;
	}else
	if(!dict_path){ //not segment mode.
		//build bigram
		if(!out_file) {
			//build the output filename
			size_t len = strlen(corpus_file);
			memcpy(out_buf,corpus_file,len);
			memcpy(&out_buf[len],".bi\0",4);
			out_file = out_buf;
		}
		printf("Bigram build unsupported.\n");
	}//end if(!corpus_file)
	//Segment mode
	{
		SegmenterManager* mgr = new SegmenterManager();
		int nRet = 0;
		if(dict_path)
			nRet = mgr->init(dict_path);
		else{
			usage(argv[0]);
			exit(0);
		}
		if(nRet == 0){
			//init ok, do segment.
			Segmenter* seg = mgr->getSegmenter();
			segment(out_file,seg);
		}
		delete mgr;
	}
	
	return 0;
}