Example #1
0
/*
 * Python binding: build a dictionary file from a source file.
 *
 * args is parsed as a 3-tuple of strings (type, source_file, target_file).
 * type is matched by prefix:
 *   "unigram"   - read source_file with UnigramCorpusReader, import it into a
 *                 UnigramDict and save the dictionary to target_file.
 *   "thesaurus" - ThesaurusDict::import(source_file, target_file).
 *
 * Returns a Python int holding the import() result for the two known types,
 * None for any other type, or NULL when argument parsing fails.
 */
PyObject * PyMmseg_BuildDict(PyObject * self, PyObject* args)
{
	char *type;
	char *source_file;
	char *target_file;

	(void)self; // no per-instance state is used by this function

	if (!PyArg_Parse(args, "(sss)", &type, &source_file, &target_file))
		return NULL;

	if(strncmp("unigram", type, 7) == 0) {
		UnigramCorpusReader ur;
		ur.open(source_file, NULL);
		int ret = 0;
		{
			// Scoped so the dictionary is destroyed before the result
			// object is built and before the reader goes away.
			UnigramDict ud;
			ret = ud.import(ur);
			ud.save(target_file);
		}
		return Py_BuildValue("i",ret);
	}

	if(strncmp("thesaurus", type, 9) == 0 ) {
		ThesaurusDict tdict;
		int ret = tdict.import(source_file, target_file);
		return Py_BuildValue("i",ret);
	}

	// Unknown type. Py_None must be returned as a new reference; returning
	// it without an incref would drop one reference per call.
	Py_RETURN_NONE;
}
Example #2
0
int main(int argc, char **argv) {
	int c;
	const char* corpus_file = NULL;
	const char* uni_corpus_file = NULL;
	const char* thesaurus_file = NULL;
	const char* out_file = NULL;
	const char* dict_path = NULL;
	const char* target_file = NULL;
	char out_buf[512];
	
	if(argc < 2){
		usage(argv[0]);
		exit(0);
	}
	u1 bPlainText = 0;
	u1 bUcs2 = 0;
	while ((c = getopt(argc, argv, "t:b:u:d:o:rU")) != -1) {
		switch (c) {
		case 'o':
			target_file = optarg;
			break;
		case 'u':
			uni_corpus_file = optarg;
			break;
		case 'b':
			corpus_file = optarg;
			break;
		case 'd':
			dict_path = optarg;
			break;
		case 't':
			thesaurus_file = optarg;
			break;
		case 'r':
			bPlainText = 1;
			break;
		case 'U':
			bUcs2 = 1;
			break;
		case 'h':
			usage(argv[0]);
			exit(0);
		default:
			fprintf(stderr, "Illegal argument \"%c\"\n", c);
			return 1;
		}
	}

	if(optind < argc) {
		out_file = argv[optind];
	}

	if(thesaurus_file) {
		ThesaurusDict tdict;
		tdict.import(thesaurus_file, target_file);
		//ThesaurusDict ldict;
		//ldict.load("thesaurus.lib");
		return 0;
	}

	if(corpus_file){
		//build Synonyms dictionary
		SynonymsDict dict;
		dict.import(corpus_file);
		if(target_file)
		   dict.save(target_file);
		else
		   dict.save("synonyms.dat");
		//debug use
		//dict.load("synonyms.dat");
		//printf("%s\n", dict.exactMatch("c#"));
		return 0;
	}

	if(!corpus_file && !dict_path) {
		//build unigram 
		if(!out_file) {
			//build the output filename
			size_t len = strlen(uni_corpus_file);
			memcpy(out_buf,uni_corpus_file,len);
			memcpy(&out_buf[len],".uni\0",5);
			out_file = out_buf;
		}
		
		if(target_file) {
			out_file = target_file;
		}
		
		UnigramCorpusReader ur;
		ur.open(uni_corpus_file,bPlainText?"plain":NULL);
		if(!bUcs2){
			UnigramDict ud;
			int ret = ud.import(ur);
			ud.save(out_file);		
			//check
			int i = 0;
			for(i=0;i<ur.count();i++)
			{
				UnigramRecord* rec = ur.getAt(i);
				
				if(ud.exactMatch(rec->key.c_str()) == rec->count){
					continue;
				}else{
					printf("error!!!");
				}
			}//end for
		}else{
			printf("UCS2 used as inner encoding, is unsupported\n");
		}
		return 0;
	}else
	if(!dict_path){ //not segment mode.
		//build bigram
		if(!out_file) {
			//build the output filename
			size_t len = strlen(corpus_file);
			memcpy(out_buf,corpus_file,len);
			memcpy(&out_buf[len],".bi\0",4);
			out_file = out_buf;
		}
		printf("Bigram build unsupported.\n");
	}//end if(!corpus_file)
	//Segment mode
	{
		SegmenterManager* mgr = new SegmenterManager();
		int nRet = 0;
		if(dict_path)
			nRet = mgr->init(dict_path);
		else{
			usage(argv[0]);
			exit(0);
		}
		if(nRet == 0){
			//init ok, do segment.
			Segmenter* seg = mgr->getSegmenter();
			segment(out_file,seg);
		}
		delete mgr;
	}
	
	return 0;
}