static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    grn_bool have_same_encoding_dictionary;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);
    if (!have_same_encoding_dictionary) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "MeCab has no dictionary that uses the context encoding"
                       ": <%s>",
                       grn_encoding_to_string(encoding));
    }
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
  }
#endif
}
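/* get_mecab_encoding() is called above (and in the variants below) but is not
   shown in this collection. A minimal sketch of how it can be implemented with
   the public MeCab C API: mecab_dictionary_info() returns the loaded
   dictionaries, and each entry carries a `charset' string. The charset
   spellings compared here are an assumption (dictionaries vary), so treat the
   mapping as illustrative, not as groonga's verbatim helper. Requires
   <strings.h> for strcasecmp() and the groonga plugin headers for the
   GRN_ENC_* constants. */
static grn_encoding
get_mecab_encoding(mecab_t *mecab)
{
  const mecab_dictionary_info_t *info = mecab_dictionary_info(mecab);
  if (info && info->charset) {
    const char *charset = info->charset;
    if (strcasecmp(charset, "utf8") == 0 ||
        strcasecmp(charset, "utf-8") == 0) {
      return GRN_ENC_UTF8;
    }
    if (strcasecmp(charset, "euc-jp") == 0) {
      return GRN_ENC_EUC_JP;
    }
    if (strcasecmp(charset, "shift_jis") == 0 ||
        strcasecmp(charset, "sjis") == 0) {
      return GRN_ENC_SJIS;
    }
  }
  return GRN_ENC_NONE;
}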
static void
Tagger_Init(KonohaContext *kctx, kObject *o, void *conf)
{
  struct _kTagger *mecab = (struct _kTagger *)o;
  const char *dummy = "";  /* empty option string: mecab_new2() creates a default tagger */
  mecab->mecab = mecab_new2(dummy);
  DBG_ASSERT(mecab->mecab != NULL);
}
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    int have_same_encoding_dictionary = 0;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);
    if (!have_same_encoding_dictionary) {
      ERR(GRN_TOKENIZER_ERROR,
          "MeCab has no dictionary that uses the context encoding: <%s>",
          grn_enctostr(encoding));
    }
  } else {
    ERR(GRN_TOKENIZER_ERROR,
        "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
        mecab_strerror(NULL));
  }
#endif
}
static emacs_value
Fmecab_new(emacs_env *env, ptrdiff_t nargs, emacs_value args[], void *data)
{
  emacs_value type = env->type_of(env, args[0]);
  mecab_t *mecab;

  if (!env->is_not_nil(env, args[0])) {
    mecab = mecab_new(0, NULL);
  } else if (env->eq(env, type, env->intern(env, "string"))) {
    ptrdiff_t size;
    char *dict = retrieve_string(env, args[0], &size);
    mecab = mecab_new2(dict);
    free(dict);
  } else if (env->eq(env, type, env->intern(env, "vector"))) {
    int argc = (int)env->vec_size(env, args[0]);
    char **argv = (char **)malloc(sizeof(char *) * argc);
    for (int i = 0; i < argc; ++i) {
      ptrdiff_t size;
      emacs_value dict = env->vec_get(env, args[0], i);
      argv[i] = retrieve_string(env, dict, &size);
    }
    mecab = mecab_new(argc, argv);
    for (int i = 0; i < argc; ++i) {
      free(argv[i]);
    }
    free(argv);
  } else {
    /* The original passed sizeof("Invalid argument"), which counts the
       terminating NUL; make_string() expects the byte length without it. */
    emacs_value errmsg = env->make_string(env, "Invalid argument",
                                          sizeof("Invalid argument") - 1);
    env->non_local_exit_signal(env, env->intern(env, "error"), errmsg);
    return env->intern(env, "nil");
  }
  return env->make_user_ptr(env, el_mecab_free, mecab);
}
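/* retrieve_string() is used above but not shown. A minimal sketch using the
   standard Emacs module API: copy_string_contents() is called once with a
   NULL buffer to obtain the required size (including the terminating NUL),
   then again to copy the bytes. The NULL-on-failure convention here is an
   assumption, not necessarily the original helper's behavior. */
static char *
retrieve_string(emacs_env *env, emacs_value str, ptrdiff_t *size)
{
  char *buf = NULL;
  if (env->copy_string_contents(env, str, NULL, size)) {
    buf = (char *)malloc(*size);
    if (buf && !env->copy_string_contents(env, str, buf, size)) {
      free(buf);
      buf = NULL;
    }
  }
  return buf;
}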
/*
  This function is called both for full-text search queries and for documents
  to be indexed, so the input may be short or long. The return value is
  ignored; on error, `ctx->rc' is set to an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) {
        *p = '\0';
      }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
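/* mecab_global_error_message(), used above and in the first snippet, is not
   shown here. It wraps mecab_strerror(NULL), which reports why the most
   recent mecab_new()/mecab_new2() call failed. A minimal sketch of the idea;
   the fallback string and the old-version caveat are assumptions about
   groonga's real helper, which also special-cases MeCab releases that do not
   set the global error message. */
static const char *
mecab_global_error_message(void)
{
  const char *message = mecab_strerror(NULL);
  if (!message || message[0] == '\0') {
    return "Unknown";  /* e.g. old MeCab versions leave the message unset */
  }
  return message;
}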
#ifdef UNIT_TEST  /* assumption: the opening conditional was dropped from this
                     snippet; UNIT_TEST matches the debug blocks below */
int main(int argc, char **argv)
{
  char input_file[128];
  strcpy(input_file, argv[1]);
#else
int mecab_analyze(char *input_file)
{
#endif
  char input[MAX_TEXT_SIZE];
  char analyzed_text[MAX_TEXT_SIZE];
  char wk_buff[MAX_TEXT_SIZE];
  char wk_file_name[256];
  char title_buff[256];
  mecab_t *mecab;
  const mecab_node_t *node;
  FILE *wfp;
  char surface_buff[256];
  char key_list[MAX_KEY_NUMBERS][MAX_KEY_LENGTH];
  int key_numbers;

  strcpy(wk_file_name, TO_MECAB_FILE_DIR);
  strcat(wk_file_name, input_file);
  if (read_text(wk_file_name, input, title_buff)) {
    fprintf(stderr, "[%s] not found\n", wk_file_name);
    return -1;
  }
  /**** remove(wk_file_name); ****/

  /* normalize characters such as quotes, braces, and 0x0a */
  edit_input_text(input);
  /**
  memset(analyzed_text, '\0', sizeof(analyzed_text));
  if (!modify_text(analyzed_text, input)) {
    strcpy(wk_buff, analyzed_text);
    while (1) {
      memset(analyzed_text, '\0', sizeof(analyzed_text));
      if (modify_text(analyzed_text, wk_buff)) break;
      strcpy(wk_buff, analyzed_text);
    }
  }
  **/

  strcpy(wk_file_name, TO_HIBARI_FILE_DIR);
  strcat(wk_file_name, input_file);
  if ((wfp = fopen(wk_file_name, "w")) == NULL) {
    fprintf(stderr, "[%s] could not open\n", wk_file_name);
    return -1;
  }
  /* fprintf(wfp, "{\"%s\"}.\n", input);  // first write message */
  fprintf(wfp, "{\"%s\"}.\n", title_buff);  /* write the wiki title */

  mecab = mecab_new2("");
  CHECK(mecab);
  mecab_set_lattice_level(mecab, 0);
  /* mecab_set_lattice_level(mecab, 1); */
  node = mecab_sparse_tonode(mecab, input);
  CHECK(node);

  memset(key_list, '\0', sizeof(key_list));
  for (key_numbers = 0; node; node = node->next) {
    strncpy(surface_buff, node->surface, node->length);
    surface_buff[node->length] = '\0';
#ifdef UNIT_TEST
    printf("noun:[%s] char type:[%d] ID:[%d]\n",
           surface_buff, node->char_type, node->posid);
#endif
    if (node->length <= 1) continue;
    if (omitted_word(surface_buff)) continue;

    /* filter by part-of-speech ID */
    switch (node->posid) {
    case 3:  /* symbol */
    case 4:  /* digit */
    case 5:  /* symbol */
    case 6:  /* symbol */
    case 7:  /* symbol */
      break;
    case 36: case 37: case 38: case 39: case 40:
    case 41: case 42: case 43: case 44: case 45:
    case 46: case 47: case 48: case 49: case 50:
    case 51: case 52: case 53: case 54: case 55:
    case 56: case 57: case 58: case 59: case 60:
    case 67:
      if (!check_duplicate((char *)key_list, surface_buff, key_numbers)) {
        fprintf(wfp, "{\"%s\"}.\n", surface_buff);
        key_numbers++;
      }
      break;
    case 61:  /* non-independent noun */
    case 62: case 63: case 64: case 65: case 66:
      break;
    default:
      /* printf("[%s] ID:[%d]\n", surface_buff, node->posid); */
      break;
    }
#ifdef NOT_USE
    printf(" %s %d %d %d %d posid:[%d] %d %d %d %f %f %f %ld\n",
           node->feature,
           (int)(node->surface - input),
           (int)(node->surface - input + node->length),
           node->rcAttr, node->lcAttr, node->posid,
           (int)node->char_type, (int)node->stat, (int)node->isbest,
           node->alpha, node->beta, node->prob, node->cost);
#endif
  }
  fclose(wfp);
  mecab_destroy(mecab);
  return 0;
}
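/* The CHECK() macro used above (and in setMeCabMap below) is not defined in
   these snippets. MeCab's sample C code defines it roughly as follows; the
   exact failure behavior in these projects may differ (e.g. what they return),
   so treat this as a sketch of the convention, not the projects' macro. */
#define CHECK(eval) if (!(eval)) { \
    fprintf(stderr, "Exception: %s\n", mecab_strerror(mecab)); \
    mecab_destroy(mecab); \
    return -1; }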
/*
  This function is called both for full-text search queries and for documents
  to be indexed, so the input may be short or long. The return value is
  ignored; on error, `ctx->rc' is set to an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf, *p;
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) {
      *p = '\0';
    }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
/*
  This function is called both for full-text search queries and for documents
  to be indexed, so the input may be short or long. The return value is
  ignored; on error, `ctx->rc' is set to an error code (not GRN_SUCCESS).
*/
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf, *p;
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, len;

  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR,
            "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }
  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) {
    return NULL;
  }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR,
        "mecab_sparse_tostr failed len=%d err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }
  /* A certain version of mecab returns trailing lf or spaces. */
  for (p = buf + bufsize - 2;
       buf <= p && isspace(*(unsigned char *)p);
       p--) {
    *p = '\0';
  }
  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = p + 1;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}
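/* All three mecab_init() variants above lazily create one shared mecab_t
   ("sole_mecab") with double-checked locking: the pointer is tested before
   and after taking the lock, because another thread may have created it in
   between, and the later parse calls are serialized under the same lock since
   the snippets treat the shared handle as unsafe for concurrent use. A
   minimal sketch of the pattern, with hypothetical lock()/unlock() primitives
   standing in for the groonga mutex and critical-section wrappers: */
static mecab_t *shared_mecab = NULL;

static mecab_t *
get_shared_mecab(void)
{
  if (!shared_mecab) {       /* fast path: already initialized */
    lock();
    if (!shared_mecab) {     /* re-check: another thread may have won the race */
      shared_mecab = mecab_new2("-Owakati");
    }
    unlock();
  }
  return shared_mecab;       /* NULL if mecab_new2() failed */
}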
// former parameters: map<string, int> & ma1, char *Ppos[]
SEXP setMeCabMap(int typeSet, char input[],
                 map<string, int> &ma0, map<string, int> &ma1,
                 map<string, int>::iterator &pma0,
                 map<string, int>::iterator &pma,
                 list<string> &strL, list<string>::iterator &iter,
                 list<string> &hinsi, list<string>::iterator &hinsi_it,
                 list<string> &saibun, list<string>::iterator &saibun_it,
                 vector<string> &Ppos2, int pos_n, int Ngram, int genkei,
                 const char *dic)
{
  mecab_t *mecab;
  mecab_node_t *node;
  int i, j, posC = 0, xx = 0;
  char buf1[BUF1];     // 2010 12 17: was [128]/[512]; holds the input surface form
  char buf2[BUF3];
  char buf3[BUF2];     // 2010 12 17: was [64]/[512]; for symbol checks
  char buf4[BUF2];     // 2010 12 17: was [64]/[1024]; for symbol checks
  string str;
  char *p;
  wchar_t wbuf[BUF4];  // wide-character buffer: number of Japanese characters + 1
  memset(wbuf, 0, sizeof wbuf);  // 2015 12 18
  unsigned int wz = 0;
  string target;
  char target2[BUF3];

  // http://mecab.sourceforge.net/mecab.html
  mecab = mecab_new2(dic);
  // e.g. mecab_new2(" -u user.dic"); mecab_new2(" -d mecab\\dic\\ipadic -O ruby");
  CHECK(mecab);

  if (typeSet == 0) {  // character-unit n-grams
    p = strchr(input, '\n');
    if (p != NULL) {
      *p = '\0';  // replace a trailing newline with the terminator
    }
    if (strlen(input) > 0) {
      // convert the multibyte string to a wide-character string
      mbstowcs(wbuf, input, strlen(input));
      for (wz = 0; wz < wcslen(wbuf); wz++) {  // 2005 07 22, 2008 04 05
#if defined(WIN32) || defined(WIN64) || defined(_WIN32) || defined(_WIN64) || \
    defined(__MINGW32__) || defined(__MINGW64__)
        wsprintf(target2, "%lc", wbuf[wz]);  // wsprintf on Windows
#else
        sprintf(target2, "%lc", wbuf[wz]);   // sprintf on Linux
#endif
        if (strlen(target2) < 1) {
          break;
        }
        if (*target2 > 0x00 && *target2 < 0x21) {  // control/escape characters
          continue;
        }
        // skip half-width and full-width spaces
        if (strcmp((char *)target2, " ") == 0 ||
            strcmp((char *)target2, "　") == 0) {
          continue;
        } else {
          strL.push_back(target2);
          if (strL.size() >= (unsigned int)Ngram) {
            target.erase();
            xx = 1;
            for (iter = strL.begin(); iter != strL.end(); iter++) {
              target.append(*iter);
              if (xx < Ngram) {
                target.append(" ");
              }
              xx++;
            }
            xx = 1;
            // is this n-gram already in the global map?
            pma = ma0.find(target);
            if (pma != ma0.end()) {
              pma->second = pma->second + 1;  // increment the count
            } else {
              ma0.insert(make_pair(target, 1));  // new entry, count 1
            }
            // do the same for the per-document map
            pma = ma1.find(target);
            if (pma != ma1.end()) {
              pma->second = pma->second + 1;
            } else {
              ma1.insert(make_pair(target, 1));
            }
            strL.pop_front();
          }  // if strL.size() >= Ngram
        }
      }  // for wz < wcslen
    }  // if strlen(input) > 0
  } else {  // typeSet != 0: morpheme or part-of-speech n-grams
    node = (mecab_node_t *)mecab_sparse_tonode(mecab, input);
    CHECK(node);
    // walk the result nodes: one node per morpheme in the sentence
    for (; node; node = node->next) {
      if (node->stat == MECAB_BOS_NODE)
        continue;
      else if (node->stat == MECAB_EOS_NODE)
        continue;
      else {  // neither BOS nor EOS
        strncpy(buf1, node->surface, node->length);  // original surface form
        buf1[node->length] = '\0';
        if (strlen(buf1) < 1) {
          continue;
        }
        if (buf1[0] > 0x00 && buf1[0] < 0x21) {  // control/escape characters
          continue;
        }
        // feature string for this node: 9 comma-separated fields
        strcpy(buf2, node->feature);
        if (strlen(buf2) < 1) {
          continue;
        }
        p = strtok(buf2, ",");  // split the feature string
        j = 1;
        if (typeSet == 2) {  // count by part of speech
          if (j == 1 && p != NULL) {  // first POS field
            strL.push_back(p);
            p = NULL;
          }
        } else if (typeSet == 1) {  // count by morpheme (base form)
          if (j == 1 && p != NULL) {
            sprintf(buf3, "%s", p);
            // if (mSym < 1 && strcmp(buf3, KIGO) == 0) { p = NULL; continue; }
            //   (skip symbols entirely, excluded from totals)
            if (pos_n == 0) {
              hinsi.push_back(buf3);
              posC = 1;
            } else {
              for (i = 0; i < pos_n; i++) {
                sprintf(buf4, "%s", Ppos2[i].c_str());  // 2011 03 10: was Ppos[i]
                if (strcmp(buf3, buf4) == 0) {
                  posC = 1;
                  hinsi.push_back(buf3);
                  break;
                }
              }
            }
            if (posC != 1) {
              p = NULL;
              posC = 0;
              continue;
            }
          }
          while (p != NULL) {
            if (j == 2) {         // POS subcategory
              saibun.push_back(p);
            } else if (j == 7) {  // base-form field
              if (genkei == 1 || p == NULL || strcmp(p, "*") == 0) {
                strL.push_back(buf1);  // original surface form
              } else {
                strL.push_back(p);     // base form
                // strL.push_back(buf1);  (assumed commented out in the
                //   original: pushing both would break the n-gram window)
              }
            }
            p = strtok(NULL, ",");
            j++;
            if (j > 7) {
              p = NULL;
            }
          }  // while (p != NULL)
          posC = 0;
        }  // else if typeSet == 1
      }  // neither BOS nor EOS

      // extraction done; once the window is full, record the n-gram
      if (strL.size() >= (unsigned int)Ngram) {
        target.erase();  // reset the output string
        target.append("");
        xx = 1;
        for (iter = strL.begin(); iter != strL.end(); iter++) {
          target.append(*iter);
          if (xx < Ngram) {
            target.append(" ");
          }
          xx++;
        }
        xx = 1;
        if (typeSet == 1) {
          target.append(" ");
          for (hinsi_it = hinsi.begin(); hinsi_it != hinsi.end(); hinsi_it++) {
            target.append(*hinsi_it);
            if (xx < Ngram) {
              target.append(" ");
            }
            xx++;
          }
          xx = 1;
          target.append(" ");
          for (saibun_it = saibun.begin(); saibun_it != saibun.end();
               saibun_it++) {
            target.append(*saibun_it);
            if (xx < Ngram) {
              target.append(" ");
            }
            xx++;
          }
          xx = 1;
        }  // if (typeSet == 1)

        // is this n-gram already in the global map?
        pma0 = ma0.find(target);
        if (pma0 != ma0.end()) {
          pma0->second = pma0->second + 1;  // increment the count
        } else {
          ma0.insert(make_pair(target, 1));  // new entry, count 1
        }
        // and in the per-document map?
        pma = ma1.find(target);
        if (pma != ma1.end()) {
          pma->second = pma->second + 1;
        } else {
          ma1.insert(make_pair(target, 1));
        }
        strL.pop_front();  // drop the first element of the window
        if (typeSet == 1) {
          hinsi.pop_front();
          saibun.pop_front();
        }
      }  // if (strL.size() >= Ngram)
    }  // for (; node;)
  }
  mecab_destroy(mecab);
  return R_NilValue;
}