/*
 * Mecab_analysis: analyze `str` with the tagger held in `m` and store one
 * "surface,feature" string per morpheme in m->feature; m->size receives the
 * number of stored strings. BOS/EOS sentinel nodes are not stored.
 *
 * When `m` already holds a result (m->size > 0 or m->feature != NULL),
 * Mecab_refresh() is called first; it is defined elsewhere and is expected to
 * release the previous result.
 *
 * Returns TRUE on success. Returns FALSE when m->mecab is NULL, when
 * mecab_sparse_tonode() returns NULL, or when an allocation fails. On
 * allocation failure everything allocated by this call is freed and
 * m->feature / m->size are set to NULL / 0.
 */
BOOL Mecab_analysis(Mecab *m, const char *str)
{
    int count = 0;
    int i = 0;
    mecab_node_t *head;
    mecab_node_t *node;

    if (m->mecab == NULL)
        return FALSE;

    if (m->size > 0 || m->feature != NULL)
        Mecab_refresh(m);

    head = (mecab_node_t *) mecab_sparse_tonode(m->mecab, str);
    if (head == NULL)
        return FALSE;

    /* First pass: count the real morphemes so the table can be sized once. */
    for (node = head; node != NULL; node = node->next) {
        if (node->stat != MECAB_BOS_NODE && node->stat != MECAB_EOS_NODE)
            count++;
    }

    m->feature = (char **) calloc(count, sizeof(char *));
    /* calloc(0, ...) may legitimately return NULL, so only a non-empty
     * request counts as a failure. */
    if (m->feature == NULL && count > 0)
        return FALSE;

    /* Second pass: build "surface,feature" for every real morpheme. */
    for (node = head; node != NULL; node = node->next) {
        size_t surface_len;
        size_t feature_len;
        char *entry;

        if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE)
            continue;

        surface_len = node->length;
        feature_len = strlen(node->feature);

        /* +2: one byte for the comma, one for the terminating NUL. */
        entry = (char *) calloc(surface_len + feature_len + 2, sizeof(char));
        if (entry == NULL) {
            /* Roll back so the caller never sees a half-filled table. */
            while (i > 0)
                free(m->feature[--i]);
            free(m->feature);
            m->feature = NULL;
            m->size = 0;
            return FALSE;
        }

        /* node->surface points into the input and is not NUL-terminated at
         * the token boundary, hence the explicit length. */
        memcpy(entry, node->surface, surface_len);
        entry[surface_len] = ',';
        memcpy(entry + surface_len + 1, node->feature, feature_len + 1);

        m->feature[i++] = entry;
    }

    m->size = count;
    return TRUE;
}
// MecabNode Tagger.ParseToNode(String input) static KMETHOD Tagger_ParseToNode(KonohaContext *kctx, KonohaStack *sfp) { struct _kTagger *mecab = (struct _kTagger *)sfp[0].asObject; const char *input = kString_text(sfp[1].asString); const mecab_node_t* node = mecab_sparse_tonode(mecab->mecab, input); struct _kMecabNode* ret = (struct _kMecabNode *)KLIB new_kObject(kctx, OnStack, KGetReturnType(sfp), 0); ret->node = node; KReturn(ret); }
main(int argc, char **argv){ char input_file[128]; strcpy(input_file,argv[1]); #else int mecab_analyze (char *input_file){ #endif char input[MAX_TEXT_SIZE]; char analyzed_text[MAX_TEXT_SIZE]; char wk_buff[MAX_TEXT_SIZE]; char wk_file_name[256]; char title_buff[256]; mecab_t *mecab; const mecab_node_t *node; FILE *wfp; char surface_buff[256]; char key_list[MAX_KEY_NUMBERS][MAX_KEY_LENGTH]; int key_numbers; strcpy(wk_file_name,TO_MECAB_FILE_DIR); strcat(wk_file_name,input_file); if(read_text(wk_file_name,input,title_buff)){ fprintf(stderr,"[%s] not found\n",wk_file_name); return(-1); } /**** remove(wk_file_name); ****/ // edit character e.g. ' '' { 0x0a edit_input_text(input); /** memset(analyzed_text,'\0',sizeof(analyzed_text)); if(!modify_text(analyzed_text,input)){ strcpy(wk_buff,analyzed_text); while(1){ memset(analyzed_text,'\0',sizeof(analyzed_text)); if(modify_text(analyzed_text,wk_buff)) break; strcpy(wk_buff,analyzed_text); } } **/ strcpy(wk_file_name,TO_HIBARI_FILE_DIR); strcat(wk_file_name,input_file); if((wfp = fopen(wk_file_name,"w")) == NULL){ fprintf(stderr,"[%s] could not open\n",wk_file_name); return(-1); } /* fprintf(wfp,"{\"%s\"}.\n",input); // first write message */ fprintf(wfp,"{\"%s\"}.\n",title_buff); // write wiki title mecab = mecab_new2(""); CHECK(mecab); mecab_set_lattice_level(mecab, 0); // mecab_set_lattice_level(mecab, 1); node = mecab_sparse_tonode(mecab, input); CHECK(node); memset(key_list,'\0',sizeof(key_list)); for (key_numbers=0; node; node = node->next) { strncpy(surface_buff,node->surface,node->length); surface_buff[node->length] ='\0'; #ifdef UNIT_TEST printf("名詞:[%s] 文字種:[%d] ID:[%d]\n",surface_buff,node->char_type,node->posid); #endif if (node->length <= 1) continue; if (omitted_word(surface_buff)) continue; // check charcter type switch(node->posid){ case 3: //記号 case 4: //数字 case 5: //記号 case 6: //記号 case 7: //記号 break; case 36: // ' case 37: case 38: case 39: case 40: case 41: case 42: case 43: case 44: case 45: case 46: 
case 47: case 48: case 49: case 50: case 51: case 52: case 53: case 54: case 55: case 56: case 57: case 58: case 59: case 60: case 67: if(!check_duplicate((char *)key_list,surface_buff,key_numbers)){ fprintf(wfp,"{\"%s\"}.\n",surface_buff); key_numbers ++; } break; case 61: // 非自立 名詞 case 62: case 63: case 64: case 65: case 66: break; default: // printf("[%s] ID:[%d]\n",surface_buff,node->posid); break; } #ifdef NOT_USE printf(" %s %d %d %d %d posid:[%d] %d %d %d %f %f %f %ld\n", node->feature, (int)(node->surface - input), (int)(node->surface - input + node->length), node->rcAttr, node->lcAttr, node->posid, (int)node->char_type, (int)node->stat, (int)node->isbest, node->alpha, node->beta, node->prob, node->cost); #endif } fclose(wfp); mecab_destroy(mecab); return 0; }
int main (int argc, char **argv) { char input[] = "太郎は次郎が持っている本を花子に渡した。"; mecab_model_t *model, *another_model; mecab_t *mecab; mecab_lattice_t *lattice; const mecab_node_t *node; const char *result; int i; size_t len; model = mecab_model_new(argc, argv); CHECK(model); mecab = mecab_model_new_tagger(model); CHECK(mecab); lattice = mecab_model_new_lattice(model); CHECK(lattice); mecab_lattice_set_sentence(lattice, input); mecab_parse_lattice(mecab, lattice); printf("RESULT: %s\n", mecab_lattice_tostr(lattice)); node = mecab_lattice_get_bos_node(lattice); for (; node; node = node->next) { printf("%d ", node->id); if (node->stat == MECAB_BOS_NODE) printf("BOS"); else if (node->stat == MECAB_EOS_NODE) printf("EOS"); else fwrite (node->surface, sizeof(char), node->length, stdout); printf(" %s %d %d %d %d %d %d %d %d %f %f %f %ld\n", node->feature, (int)(node->surface - input), (int)(node->surface - input + node->length), node->rcAttr, node->lcAttr, node->posid, (int)node->char_type, (int)node->stat, (int)node->isbest, node->alpha, node->beta, node->prob, node->cost); } len = mecab_lattice_get_size(lattice); for (i = 0; i <= len; ++i) { mecab_node_t *b, *e; b = mecab_lattice_get_begin_nodes(lattice, (size_t)i); e = mecab_lattice_get_end_nodes(lattice, (size_t)i); for (; b; b = b->bnext) { printf("B[%d] %s\t%s\n", i, b->surface, b->feature); } for (; e; e = e->enext) { printf("E[%d] %s\t%s\n", i, e->surface, e->feature); } } mecab_lattice_set_sentence(lattice, input); mecab_lattice_set_request_type(lattice, MECAB_NBEST); mecab_parse_lattice(mecab, lattice); for (i = 0; i < 10; ++i) { fprintf(stdout, "%s", mecab_lattice_tostr(lattice)); if (!mecab_lattice_next(lattice)) { break; } } mecab_lattice_set_sentence(lattice, input); mecab_lattice_set_request_type(lattice, MECAB_MARGINAL_PROB); mecab_lattice_set_theta(lattice, 0.001); mecab_parse_lattice(mecab, lattice); node = mecab_lattice_get_bos_node(lattice); for (; node; node = node->next) { fwrite(node->surface, 
sizeof(char), node->length, stdout); fprintf(stdout, "\t%s\t%f\n", node->feature, node->prob); } mecab_set_lattice_level(mecab, 0); mecab_set_all_morphs(mecab, 1); node = mecab_sparse_tonode(mecab, input); CHECK(node); for (; node; node = node->next) { fwrite (node->surface, sizeof(char), node->length, stdout); printf("\t%s\n", node->feature); } const mecab_dictionary_info_t *d = mecab_dictionary_info(mecab); for (; d; d = d->next) { printf("filename: %s\n", d->filename); printf("charset: %s\n", d->charset); printf("size: %d\n", d->size); printf("type: %d\n", d->type); printf("lsize: %d\n", d->lsize); printf("rsize: %d\n", d->rsize); printf("version: %d\n", d->version); } mecab_destroy(mecab); mecab_lattice_destroy(lattice); mecab_model_destroy(model); return 0; }
// Appends every element of `items` to `out`. Each of the first Ngram-1
// elements is followed by one space. `it` is the caller-supplied scratch
// iterator; it equals items.end() on return.
static void setMeCabMap_join(string & out, list <string> & items, list <string>::iterator & it, int Ngram)
{
  int pos = 1;
  for (it = items.begin(); it != items.end(); it++) {
    out.append(*it);
    if (pos < Ngram) {
      out.append(" ");
    }
    pos++;
  }
}

// Adds one occurrence of `key` to `counts`: increments an existing entry or
// inserts the key with a count of 1. `it` is the caller-supplied scratch
// iterator and holds the result of the lookup on return.
static void setMeCabMap_count(map<string, int> & counts, map<string, int>::iterator & it, const string & key)
{
  it = counts.find(key);
  if (it != counts.end()) {
    it->second = it->second + 1;
  } else {
    counts.insert(make_pair(key, 1));
  }
}

/*
 * setMeCabMap: split one piece of text into units, build N-grams of `Ngram`
 * consecutive units and count each N-gram in both ma0 and ma1.
 *
 * typeSet selects the unit:
 *   0      one multibyte character (no MeCab analysis of the text);
 *   1      one morpheme: the 7th feature field, or the surface form when
 *          genkei == 1 or that field is "*". The N-gram key is then
 *          "<units> <1st feature fields> <2nd feature fields>";
 *   2      the first feature field of each morpheme;
 *   other  nothing is collected.
 *
 * input      text to process. For typeSet == 0 it is modified in place: the
 *            first '\n' is replaced by '\0'.
 * ma0, ma1   N-gram -> count maps; both receive every N-gram.
 * pma0, pma, iter, hinsi_it, saibun_it
 *            scratch iterators owned by the caller; overwritten here.
 * strL       sliding window of units. It is not cleared here, so it keeps up
 *            to Ngram-1 units between calls.
 * hinsi, saibun
 *            sliding windows of the 1st and 2nd feature fields, kept in step
 *            with strL when typeSet == 1.
 * Ppos2, pos_n
 *            when pos_n > 0 and typeSet == 1, only morphemes whose first
 *            feature field equals one of Ppos2[0..pos_n-1] are used;
 *            pos_n == 0 accepts all.
 * Ngram      window length.
 * genkei     1 = always use the surface form (typeSet == 1 only).
 * dic        argument string passed to mecab_new2().
 *
 * Returns R_NilValue. CHECK() is defined elsewhere and handles MeCab errors.
 *
 * Units whose first byte is in 0x01..0x20 (control characters and space) are
 * ignored in every mode.
 */
SEXP setMeCabMap(int typeSet, char input[], map<string, int> & ma0, map<string, int> & ma1, map<string, int>::iterator & pma0, map<string, int>::iterator & pma, list <string> & strL, list <string>::iterator & iter, list <string> & hinsi, list <string>::iterator & hinsi_it, list <string> & saibun, list <string>::iterator & saibun_it, vector<string> & Ppos2, int pos_n, int Ngram, int genkei, const char * dic ){

  mecab_t *mecab;
  mecab_node_t *node;
  int i, j, posC = 0;
  char *p;
  wchar_t wbuf [BUF4];      // wide-character copy of the input
  memset (wbuf, 0, sizeof wbuf);
  unsigned int wz = 0;
  string target;            // N-gram key being assembled
  char target2[BUF3];       // one character re-encoded as multibyte

  // http://mecab.sourceforge.net/mecab.html
  mecab = mecab_new2 (dic);
  CHECK(mecab);

  if(typeSet == 0){
    ////////////////////////////// character units //////////////////////////////

    // Cut the input at the first newline.
    p = strchr( input, '\n' );
    if ( p != NULL ) {
      *p = '\0';
    }

    if(strlen(input) > 0){
      // Convert to wide characters. The limit is the smaller of the byte
      // count and the capacity of wbuf minus one, so the zero-filled last
      // element always terminates the string.
      const size_t wcap = sizeof wbuf / sizeof wbuf[0] - 1;
      const size_t nbytes = strlen(input);
      mbstowcs(wbuf, input, nbytes < wcap ? nbytes : wcap);

      const size_t wlen = wcslen(wbuf);
      for( wz = 0; wz < wlen ; wz++){
        // Start from an empty string so a failed conversion is seen as
        // "no character" by the length test below.
        target2[0] = '\0';
#if defined(WIN32) || defined(WIN64) || defined(_WIN32) || defined(_WIN64)
        wsprintf(target2, "%lc", wbuf[wz]);   // wsprintf on Windows
#elif defined(__MINGW32__) || defined(__MINGW64__)
        wsprintf(target2, "%lc", wbuf[wz]);   // wsprintf on Windows
#else
        snprintf(target2, sizeof target2, "%lc", wbuf[wz]);
#endif

        if(strlen(target2) < 1){
          break;
        }
        // Skip control characters and space.
        if( *target2 > 0x00 && *target2 < 0x21 ){
          continue;
        }
        // NOTE(review): both comparisons use the same literal as found in
        // this file; the second was possibly meant to be a full-width space
        // - confirm against the upstream source.
        if( strcmp((char *) target2, " ") == 0 || strcmp((char *) target2, " ")==0){
          continue;
        }

        strL.push_back( target2);
        if(strL.size() >= (unsigned int) Ngram){
          target.erase();
          setMeCabMap_join(target, strL, iter, Ngram);

          // This mode uses `pma` for both maps.
          setMeCabMap_count(ma0, pma, target);
          setMeCabMap_count(ma1, pma, target);

          strL.pop_front();
        }
      }
    }
  } else {
    ////////////////////////// morpheme / feature units //////////////////////////

    node = ( mecab_node_t * ) mecab_sparse_tonode(mecab, input);
    CHECK(node);

    for (; node; node = node->next) {
      if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) {
        continue;
      }

      // Surface form. node->surface is not NUL-terminated at the token end,
      // so it is copied by length into a string (no fixed-size buffer).
      const string surface(node->surface, node->length);
      if(surface.empty() || surface[0] == '\0'){
        continue;
      }
      // Skip control characters and space.
      if( surface[0] > 0x00 && surface[0] < 0x21 ){
        continue;
      }

      // strtok() writes into its argument, so the comma-separated feature
      // string is tokenized in a private copy sized to fit.
      const size_t featLen = strlen(node->feature);
      if(featLen < 1){
        continue;
      }
      vector<char> fields(node->feature, node->feature + featLen + 1);

      p = strtok(&fields[0], "," );   // first feature field
      j = 1;                          // 1-based index of the field in p

      if(typeSet == 2){
        // Count by the first feature field only.
        if( p != NULL ){
          strL.push_back(p);
          p = NULL;
        }
      }else if(typeSet == 1){
        // Count by morpheme, optionally filtered on the first feature field.
        if(p != NULL){
          if(pos_n == 0){
            hinsi.push_back(p);
            posC = 1;
          }else{
            for(i = 0; i < pos_n; i++){
              if(strcmp(p, Ppos2[i].c_str()) == 0){
                posC = 1;
                hinsi.push_back(p);
                break;
              }
            }
          }
          if(posC != 1){
            // Not a requested category: drop this morpheme entirely.
            p = NULL;
            posC = 0;
            continue;
          }
        }

        while ( p != NULL ) {
          if(j == 2){
            saibun.push_back(p);      // second feature field
          }
          else if( j == 7){
            if(genkei == 1 || p == NULL || strcmp(p, "*") == 0){
              strL.push_back(surface.c_str());   // surface form
            }
            else{
              // NOTE(review): in the collapsed original the text
              // `strL.push_back(buf1);` follows the trailing comment of this
              // statement. It is treated as part of that comment, because two
              // pushes per morpheme would put strL out of step with
              // hinsi/saibun - confirm against the upstream source.
              strL.push_back(p);                 // seventh feature field
            }
          }
          p = strtok( NULL,"," );
          j++;
          if(j > 7){
            p = NULL;                 // fields after the seventh are unused
          }
        }
        posC = 0;
      }

      // Once the window holds Ngram units, record the N-gram and slide.
      if(strL.size() >= (unsigned int) Ngram){
        target.erase();
        setMeCabMap_join(target, strL, iter, Ngram);

        if(typeSet == 1){
          target.append(" ");
          setMeCabMap_join(target, hinsi, hinsi_it, Ngram);
          target.append(" ");
          setMeCabMap_join(target, saibun, saibun_it, Ngram);
        }

        setMeCabMap_count(ma0, pma0, target);
        setMeCabMap_count(ma1, pma, target);

        strL.pop_front();
        if(typeSet == 1){
          // pop_front() on an empty list is undefined, so guard it in case a
          // morpheme had fewer feature fields than expected.
          if(!hinsi.empty()){
            hinsi.pop_front();
          }
          if(!saibun.empty()){
            saibun.pop_front();
          }
        }
      }
    }
  }

  mecab_destroy(mecab);

  return (R_NilValue);
}