Example #1
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    grn_bool have_same_encoding_dictionary;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
    mecab_destroy(mecab);

    if (!have_same_encoding_dictionary) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "MeCab has no dictionary that uses the context encoding"
                       ": <%s>",
                       grn_encoding_to_string(encoding));
    }
  } else {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
                     mecab_global_error_message());
  }
#endif
}
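
The `get_mecab_encoding()` helper is not part of this listing. As a rough sketch of what it plausibly does (assuming the standard MeCab C API and Groonga's `grn_encoding` constants; the real Groonga helper may differ), it reads the charset of the first loaded dictionary and maps it to an encoding value:

#include <string.h>
#include <strings.h>
#include <mecab.h>

/* Hypothetical sketch, not the actual Groonga helper: map the charset
   reported by MeCab's first dictionary to a grn_encoding value. */
static grn_encoding
get_mecab_encoding(mecab_t *mecab)
{
  const mecab_dictionary_info_t *info = mecab_dictionary_info(mecab);
  if (info && info->charset) {
    if (strcasecmp(info->charset, "utf-8") == 0 ||
        strcasecmp(info->charset, "utf8") == 0) {
      return GRN_ENC_UTF8;
    }
    if (strcasecmp(info->charset, "euc-jp") == 0) {
      return GRN_ENC_EUC_JP;
    }
    if (strcasecmp(info->charset, "shift_jis") == 0 ||
        strcasecmp(info->charset, "sjis") == 0) {
      return GRN_ENC_SJIS;
    }
  }
  return GRN_ENC_NONE;
}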
Example #2
static void Tagger_Init(KonohaContext *kctx, kObject *o, void *conf)
{
	struct _kTagger *mecab = (struct _kTagger *)o;
	const char* dummy = ""; /* empty option string: use the default dictionary */
	mecab->mecab = mecab_new2(dummy);
	DBG_ASSERT(mecab->mecab != NULL);
}
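
Note that `Tagger_Init` allocates a tagger but nothing in this snippet frees it. A matching finalizer would call `mecab_destroy()`; the sketch below assumes a KonohaScript free callback with the same `(kctx, o)` shape as `Tagger_Init` (the name `Tagger_Free` and its exact signature are assumptions, not taken from the project):

/* Hypothetical finalizer sketch for the object initialized above. */
static void Tagger_Free(KonohaContext *kctx, kObject *o)
{
	struct _kTagger *mecab = (struct _kTagger *)o;
	if (mecab->mecab != NULL) {
		mecab_destroy(mecab->mecab);  /* release the MeCab tagger */
		mecab->mecab = NULL;
	}
}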
Example #3
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;

  mecab = mecab_new2("-Owakati");
  if (mecab) {
    grn_encoding encoding;
    int have_same_encoding_dictionary = 0;

    encoding = GRN_CTX_GET_ENCODING(ctx);
    have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab);
    mecab_destroy(mecab);

    if (!have_same_encoding_dictionary) {
      ERR(GRN_TOKENIZER_ERROR,
          "MeCab has no dictionary that uses the context encoding: <%s>",
          grn_enctostr(encoding));
    }
  } else {
    ERR(GRN_TOKENIZER_ERROR,
        "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
        mecab_strerror(NULL));
  }
#endif
}
Example #4
static emacs_value
Fmecab_new(emacs_env *env, ptrdiff_t nargs, emacs_value args[], void *data)
{
	emacs_value type = env->type_of(env, args[0]);
	mecab_t *mecab;

	if (!env->is_not_nil(env, args[0])) {
		mecab = mecab_new(0, NULL);
	} else if (env->eq(env, type, env->intern(env, "string"))) {
		ptrdiff_t size;
		char *dict = retrieve_string(env, args[0], &size);
		mecab = mecab_new2(dict);
		free(dict);
	} else if (env->eq(env, type, env->intern(env, "vector"))) {
		int argc = (int)env->vec_size(env, args[0]);
		char **argv = (char**)malloc(sizeof(char*) * argc);

		for (int i = 0; i < argc; ++i) {
			ptrdiff_t size;
			emacs_value dict = env->vec_get(env, args[0], i);
			argv[i] = retrieve_string(env, dict, &size);
		}

		mecab = mecab_new(argc, argv);

		for (int i = 0; i < argc; ++i) {
			free(argv[i]);
		}
		free(argv);
	} else {
		emacs_value errmsg = env->make_string(env, "Invalid argument",
						      sizeof("Invalid argument") - 1); /* length excludes the NUL */
		env->non_local_exit_signal(env, env->intern(env, "error"), errmsg);
		return env->intern(env, "nil");
	}

	return env->make_user_ptr(env, el_mecab_free, mecab);
}
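
`retrieve_string()` is defined elsewhere in this module. It presumably follows the usual emacs-module idiom, sketched here under that assumption: call `copy_string_contents` once with a NULL buffer to learn the required size (including the terminating NUL), then again to fill a malloc'd buffer that the caller frees:

#include <stdlib.h>
#include <emacs-module.h>

/* Sketch of the common idiom; the real helper may differ. */
static char *
retrieve_string(emacs_env *env, emacs_value str, ptrdiff_t *size)
{
	char *buf;

	*size = 0;
	/* First call with a NULL buffer: reports the size, NUL included. */
	if (!env->copy_string_contents(env, str, NULL, size)) {
		return NULL;
	}
	buf = malloc(*size);
	if (buf == NULL) {
		return NULL;
	}
	/* Second call copies the UTF-8 contents into buf. */
	env->copy_string_contents(env, str, buf, size);
	return buf;
}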
Example #5
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_global_error_message());
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
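
`sole_mecab` and `sole_mecab_mutex` are plugin-level globals shared by every tokenizer instance, which is why `mecab_init` uses double-checked locking around `mecab_new2()`. A sketch of how such globals would typically be set up and torn down in the plugin entry points, assuming Groonga's `grn_plugin_mutex_open`/`grn_plugin_mutex_close` API (the registration code is not part of this listing):

/* Assumed shape of the plugin lifecycle hooks; not shown in the
   examples above. */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (!sole_mecab_mutex) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
  }
  return ctx->rc;
}

grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  if (sole_mecab) {
    mecab_destroy(sole_mecab);
    sole_mecab = NULL;
  }
  if (sole_mecab_mutex) {
    grn_plugin_mutex_close(ctx, sole_mecab_mutex);
    sole_mecab_mutex = NULL;
  }
  return GRN_SUCCESS;
}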
Example #6
#ifdef UNIT_TEST
int
main(int argc, char **argv){
char input_file[128];
strcpy(input_file,argv[1]);

#else
int
mecab_analyze (char *input_file){
#endif
	char input[MAX_TEXT_SIZE];
	char analyzed_text[MAX_TEXT_SIZE];
	char wk_buff[MAX_TEXT_SIZE];
	char wk_file_name[256];
	char title_buff[256];
	mecab_t *mecab;
	const mecab_node_t *node;
	FILE *wfp;
	char surface_buff[256];
	char key_list[MAX_KEY_NUMBERS][MAX_KEY_LENGTH];
	int key_numbers;

	strcpy(wk_file_name,TO_MECAB_FILE_DIR);
	strcat(wk_file_name,input_file);
	if(read_text(wk_file_name,input,title_buff)){
    	fprintf(stderr,"[%s] not found\n",wk_file_name);
		return(-1);
	}
	/****
	 remove(wk_file_name);
	****/
    // normalize characters, e.g. quotes ' '', brace {, and 0x0a (LF)
	edit_input_text(input);

	/**
	memset(analyzed_text,'\0',sizeof(analyzed_text));
	if(!modify_text(analyzed_text,input)){
		strcpy(wk_buff,analyzed_text);
		while(1){
			memset(analyzed_text,'\0',sizeof(analyzed_text));
			if(modify_text(analyzed_text,wk_buff))
				break;
			strcpy(wk_buff,analyzed_text);
		}
	}
	**/

	strcpy(wk_file_name,TO_HIBARI_FILE_DIR);
	strcat(wk_file_name,input_file);
  	if((wfp = fopen(wk_file_name,"w")) == NULL){
   		fprintf(stderr,"[%s] could not open\n",wk_file_name);
   		return(-1);
  	}
	/*
	fprintf(wfp,"{\"%s\"}.\n",input); // first write message
	*/
	fprintf(wfp,"{\"%s\"}.\n",title_buff); // write wiki title

  	mecab = mecab_new2("");
  	CHECK(mecab);

  	mecab_set_lattice_level(mecab, 0);   
  	// mecab_set_lattice_level(mecab, 1);   

  	node = mecab_sparse_tonode(mecab, input);
  	CHECK(node);
  	memset(key_list,'\0',sizeof(key_list));
  	for (key_numbers=0;  node; node = node->next) {
      	if (node->length >= sizeof(surface_buff)) continue; /* guard against overflow */
      	strncpy(surface_buff,node->surface,node->length);
      	surface_buff[node->length] ='\0';
#ifdef UNIT_TEST
		printf("名詞:[%s] 文字種:[%d] ID:[%d]\n",surface_buff,node->char_type,node->posid);
#endif
      	if (node->length <= 1)
			continue;
		if (omitted_word(surface_buff))
			continue;
      // check the part-of-speech ID
    	switch(node->posid){
			case 3:  // symbol
			case 4:  // number
			case 5:  // symbol
			case 6:  // symbol
			case 7:  // symbol
       	      	break;
			case 36: // '
			case 37:
			case 38:
			case 39:
			case 40:
			case 41:
			case 42:
			case 43:
			case 44:
			case 45:
			case 46:
			case 47:
			case 48:
			case 49:
			case 50:
			case 51:
			case 52:
			case 53:
			case 54:
			case 55:
			case 56:
			case 57:
			case 58:
			case 59:
			case 60:
			case 67:
	    		if(!check_duplicate((char *)key_list,surface_buff,key_numbers)){
					fprintf(wfp,"{\"%s\"}.\n",surface_buff);
					key_numbers ++;
				}
          		break;
			case 61: // dependent noun
			case 62:
			case 63:
			case 64:
			case 65:
			case 66:
          		break;
        	default:
				//		printf("[%s] ID:[%d]\n",surface_buff,node->posid);
          		break;
    	}

#ifdef NOT_USE
    printf(" %s %d %d %d %d posid:[%d] %d %d %d %f %f %f %ld\n",
	   node->feature,
	   (int)(node->surface - input),
	   (int)(node->surface - input + node->length),
	   node->rcAttr,
	   node->lcAttr,
	   node->posid,
	   (int)node->char_type,
	   (int)node->stat,
	   (int)node->isbest,
	   node->alpha,
	   node->beta,
	   node->prob,
	   node->cost);
#endif
  	}
  	fclose(wfp);
  	mecab_destroy(mecab);
   
  	return 0;
}
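
`check_duplicate()` and `omitted_word()` are helpers outside this listing. Given the flat `key_list[MAX_KEY_NUMBERS][MAX_KEY_LENGTH]` buffer the caller passes, `check_duplicate` plausibly scans the keys recorded so far and records the word when it is new; the sketch below is a guess at that contract (nonzero on duplicate, 0 and store otherwise), not the original helper:

#include <string.h>

/* Hypothetical sketch of check_duplicate(); assumes the MAX_KEY_*
   constants used by the example above, and that key_list was
   zero-filled by the caller. */
static int
check_duplicate(char *key_list, const char *word, int key_numbers)
{
  int i;
  for (i = 0; i < key_numbers && i < MAX_KEY_NUMBERS; i++) {
    if (strcmp(key_list + i * MAX_KEY_LENGTH, word) == 0) {
      return 1;  /* already recorded */
    }
  }
  if (key_numbers < MAX_KEY_NUMBERS) {
    strncpy(key_list + (size_t)key_numbers * MAX_KEY_LENGTH,
            word, MAX_KEY_LENGTH - 1);
  }
  return 0;      /* new word; the caller increments key_numbers */
}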
Example #7
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  char *buf = NULL, *p;  /* initialized: buf is tested below even when tokenization fails */
  const char *s;
  grn_mecab_tokenizer *tokenizer;
  unsigned int bufsize;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_new2() failed on mecab_init(): %s",
                         mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_enctostr(sole_mecab_encoding),
                     grn_enctostr(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           normalized_string,
                                           normalized_string_length,
                                           query->encoding);

  if (tokenizer->have_tokenized_delimiter) {
    tokenizer->buf = NULL;
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    s = mecab_sparse_tostr2(tokenizer->mecab,
                            normalized_string,
                            normalized_string_length);
    if (!s) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab] "
                       "mecab_sparse_tostr() failed len=%d err=%s",
                       normalized_string_length,
                       mecab_strerror(tokenizer->mecab));
    } else {
      bufsize = strlen(s) + 1;
      if (!(buf = GRN_PLUGIN_MALLOC(ctx, bufsize))) {
        GRN_PLUGIN_LOG(ctx, GRN_LOG_ALERT,
                       "[tokenizer][mecab] "
                       "buffer allocation on mecab_init failed !");
      } else {
        memcpy(buf, s, bufsize);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!s || !buf) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
    /* A certain version of mecab returns trailing lf or spaces. */
    for (p = buf + bufsize - 2;
         buf <= p && isspace(*(unsigned char *)p);
         p--) { *p = '\0'; }
    tokenizer->buf = buf;
    tokenizer->next = buf;
    tokenizer->end = p + 1;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
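
Examples #5, #7, and #8 all end the non-delimiter branch with the same loop, because some MeCab versions append a linefeed or spaces to the `-Owakati` output. The same trimming idiom in isolation, as a standalone program (the sample string is made up):

#include <ctype.h>
#include <stdio.h>

/* Standalone illustration of the trailing-whitespace trim used above:
   walk back from the last content byte, overwriting whitespace with
   NUL; `end` is left pointing one past the last non-space byte. */
int
main(void)
{
  char buf[] = "watashi no namae \n";
  unsigned int bufsize = sizeof(buf);  /* counts the NUL, like strlen(s) + 1 */
  char *p, *end;

  for (p = buf + bufsize - 2;
       buf <= p && isspace(*(unsigned char *)p);
       p--) { *p = '\0'; }
  end = p + 1;

  printf("trimmed: <%s> (%d bytes)\n", buf, (int)(end - buf));
  return 0;
}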
Example #8
/*
  This function is called for a full text search query or a document to be
  indexed. This means that both short/long strings are given.
  The return value of this function is ignored. When an error occurs in this
  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str;
  int nflags = 0;
  char *buf = NULL, *p;  /* initialized: buf is tested below even when tokenization fails */
  const char *s;
  grn_obj *table = args[0];
  grn_obj_flags table_flags;
  grn_encoding table_encoding;
  grn_mecab_tokenizer *token;
  unsigned int bufsize, len;
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }
  if (!sole_mecab) {
    CRITICAL_SECTION_ENTER(sole_mecab_lock);
    if (!sole_mecab) {
      sole_mecab = mecab_new2("-Owakati");
      if (!sole_mecab) {
        ERR(GRN_TOKENIZER_ERROR, "mecab_new2 failed on grn_mecab_init: %s",
            mecab_strerror(NULL));
      } else {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  }
  if (!sole_mecab) {
    return NULL;
  }
  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
  if (table_encoding != sole_mecab_encoding) {
    ERR(GRN_TOKENIZER_ERROR,
        "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
        grn_enctostr(sole_mecab_encoding), grn_enctostr(table_encoding));
    return NULL;
  }
  if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
  token->mecab = sole_mecab;
  token->encoding = table_encoding;
  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                    nflags, token->encoding))) {
    GRN_FREE(token);
    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
    return NULL;
  }
  len = token->nstr->norm_blen;
  CRITICAL_SECTION_ENTER(sole_mecab_lock);
  s = mecab_sparse_tostr2(token->mecab, token->nstr->norm, len);
  if (!s) {
    ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d err=%s",
        len, mecab_strerror(token->mecab));
  } else {
    bufsize = strlen(s) + 1;
    if (!(buf = GRN_MALLOC(bufsize))) {
      GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
    } else {
      memcpy(buf, s, bufsize);
    }
  }
  CRITICAL_SECTION_LEAVE(sole_mecab_lock);
  if (!s || !buf) {
    grn_str_close(ctx, token->nstr);
    GRN_FREE(token);
    return NULL;
  }
  /* A certain version of mecab returns trailing lf or spaces. */
  for (p = buf + bufsize - 2;
       buf <= p && isspace(*(unsigned char *)p);
       p--) { *p = '\0'; }
  user_data->ptr = token;
  token->buf = buf;
  token->next = buf;
  token->end = p + 1;
  GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->stat_, 0);
  return NULL;
}
Example #9
SEXP setMeCabMap(int typeSet, char input[],
                 map<string, int> &ma0, map<string, int> &ma1,
                 map<string, int>::iterator &pma0, map<string, int>::iterator &pma,
                 list<string> &strL, list<string>::iterator &iter,
                 list<string> &hinsi, list<string>::iterator &hinsi_it,
                 list<string> &saibun, list<string>::iterator &saibun_it,
                 vector<string> &Ppos2, int pos_n, int Ngram, int genkei,
                 const char *dic){
  
  mecab_t *mecab;
  mecab_node_t *node;
	
 int i, j, posC = 0, xx = 0;
 char buf1[BUF1];   // stores the input surface form
 char buf2[BUF3];
 char buf3[BUF2];   // used for symbol checking
 char buf4[BUF2];   // used for symbol checking
 string str;
 char *p;

 wchar_t wbuf[BUF4];            /* wide string: number of Japanese characters + 1 */
 memset(wbuf, 0, sizeof wbuf);
 unsigned int wz = 0;

 string target;
 char target2[BUF3];
			
				

  //	http://mecab.sourceforge.net/mecab.html
  mecab = mecab_new2 (dic);// e.g. mecab_new2(" -u user.dic") or mecab_new2(" -d mecab/dic/ipadic -O ruby")
  CHECK(mecab);


	//Rprintf("%s strlen of input= %d\n", input, strlen(input)); 
			
  if(typeSet == 0){// character-by-character mode
	p = strchr( input, '\n' );
	/* if a newline character is present */
	if ( p != NULL )
	  {
		/* replace the newline with a terminator */
		*p = '\0';
	  }
		
	//			Rprintf("strlen of input= %d\n", strlen(input));  
	if(strlen(input) > 0){
	  //Rprintf("%s\n", input);
				
	  //		  Rprintf("in strlen(input) > 0  %s \n",  file_name );				
	  mbstowcs(wbuf, input,  strlen(input));/* convert the multibyte string to a wide string */

	  //for(int z = 0; z <  (wcslen(wbuf) - Ngram); z++){
	  for( wz = 0; wz <  wcslen(wbuf) ; wz++){

#if defined(WIN32) || defined(WIN64) || defined(_WIN32) || defined(_WIN64)
		wsprintf(target2, "%lc", wbuf[wz]);// Windows uses wsprintf
#elif  defined(__MINGW32__) || defined(__MINGW64__)
		wsprintf(target2, "%lc", wbuf[wz]);// Windows uses wsprintf
#else
		sprintf(target2, "%lc", wbuf[wz]);// Linux uses sprintf
#endif
		
		//				Rprintf("target2 = %s\n", target2);
		if(strlen(target2) < 1){
		  break;
		}
		// control/escape characters such as 0x0e
		if( *target2 > 0x00 && *target2 < 0x21 ){
		  continue;
		}
		// skip half-width and full-width spaces
		if( strcmp((char *) target2, " ") == 0 ||  strcmp((char *) target2, "　") == 0){
		  continue;
		} else{
				  
		  /////////////// new_begin //////////////// character-by-character section
 
		  // target = target2;   

		  strL.push_back( target2);
					
		  if(strL.size() >= (unsigned int) Ngram){
			//					Rprintf("in if(strL.size) \n");					
			target.erase();
			//target.append("[");
			xx = 1;
			for ( iter = strL.begin(); iter != strL.end(); iter++){
			  //						Rprintf("in for\n");
			  //						Rprintf("str %s\n", * iter);
			  target.append( *iter);
			  if(xx < Ngram){
				 target.append(" ");//target.append("-");
			  }
			  xx++;
			  //					  Rprintf("xx = %d\n", xx);
						
			}
			xx = 1;
			//target.append("]");
			//					Rprintf("target %s\n", target);
			//					Rprintf("before m1.find \n");
			// is this N-gram already in the global map?
			pma = ma0.find(target);
			if(pma != ma0.end()){
			  pma->second =  pma->second + 1;// increment its count
			}
			else{// not in the map yet: insert a new entry
			  ma0.insert(make_pair(target, 1));// 1 means first occurrence
			}
			// do the same for the per-document map
			pma = ma1.find(target);
			if(pma != ma1.end()){
			  pma->second =  pma->second + 1;// increment its count
			}
			else{// not in the map yet: insert a new entry
			  ma1.insert(make_pair(target, 1));// 1 means first occurrence
			}
					  
			strL.pop_front();
		  }//_if strSize>= Ngram
		}// _else_end
				  
		////////////////////////////////////// new _end ////
	  }//_for2_< wcslen
	  
	}// if_strlen_>_0_end




			  
			  
  } else {// typeSet != 0: morpheme or part-of-speech mode



	////////////////////////////////////////////////////////////////
	//			Rprintf("after fgets input =  %s\n",input );
	node = (mecab_node_t *) mecab_sparse_tonode(mecab, input);
	CHECK(node);
	//			Rprintf("node check" );		
	/// walk the nodes of the parse result
	for (;  node; node = node->next) {// one node per morpheme in the sentence
	  //			  		printf("%d ", node->id);
			  
	  if (node->stat == MECAB_BOS_NODE)
		//printf("BOS");
		continue;
	  else if (node->stat == MECAB_EOS_NODE)
		//printf("EOS");
		continue;
	  else {// anything other than BOS/EOS
		strncpy(buf1, node->surface, node->length) ;// original surface form
		buf1[node->length] = '\0';// append the NUL terminator
		// strlen excludes the terminating NUL ('\0'), so empty strings are skipped here
		if(strlen(buf1) < 1){
		  continue;
		}
		
		//< 2005 11 07> //Rprintf("%s\n", buf1);			
		//if( atoi(buf1) >  0x00 &&  atoi(buf1) < 0x0e ){// if( atoi(buf1)  == 0x0e){//エスケープ記号類
		if( buf1[0] > 0x00 && buf1[0] < 0x21 ){//エスケープ記号類0x0e // strlen(buf1) == 1 &&
		  continue;
		}// </ 2005 11 07>
		
		//		buf1[node->length] = '\0';// 末尾にNULLを加える// 2006 06 移動				 
// 		if(strlen(buf1) < 1){// 2006 06 移動		
// 		  continue;
// 		}
		//				Rprintf("buf1 = %s\n", buf1);
				
		strcpy(buf2, node->feature);// feature string for this node; 9 comma-separated fields
		if(strlen(buf2) < 1){
		  continue;
		}
		//				Rprintf("buf2 = %s\n", buf2);

		//////////////

				
		p = strtok(buf2, "," );// split the feature string
		// determine the part of speech
		j = 1;
		////////////////////////////////////////////////////////////////////

					
		if(typeSet == 2){// count by part-of-speech tag
		  if( j == 1 && p != NULL ){// POS field 1
			strL.push_back(p);
			p = NULL;
		  }


					
		}else if(typeSet == 1){// count by morpheme base form
						  
		  //////////////////////////////////////////////


		  if(j == 1 &&  p != NULL){
			sprintf(buf3, "%s", p);
			// (older revisions skipped the "symbol" POS here entirely, excluding it from the totals)
			if(pos_n == 0){
			  hinsi.push_back(buf3);
			  posC = 1;
			}else{
			  for(i = 0; i < pos_n; i++){
			    sprintf(buf4, "%s", Ppos2[i].c_str());	// 2011 03 10 sprintf(buf4, "%s", Ppos[i]);				
				//					Rprintf("buf4 %s\n", buf4);
				
				if(strcmp(buf3, buf4) == 0){
				  posC = 1;
				  hinsi.push_back(buf3);
				  break;
				}
			  }
			}
			if(posC != 1){
			  p = NULL;
			  posC = 0;
			  continue;
			}
		  }
				  
				
		  while ( p != NULL ) {
					
			 if(j == 2){// POS field 2
			   saibun.push_back(p);
			 } else if( j == 7){
			  if(genkei == 1 || p == NULL || strcmp(p, "*") == 0){
				strL.push_back(buf1);// original surface form
			  }
			  else{
				strL.push_back(p);// base form
			  }
			}
			}
			p = strtok( NULL,"," );
			j++;
			if(j > 7){
			  p = NULL;
			}

				  
		  }// while(p != NULL)
		  posC = 0;
		} // else if typset = 1

				  
	  }  // end of non-BOS/EOS handling

	  ////////////// extraction finished
	  if(strL.size() >= (unsigned int) Ngram){// once the list holds a full N-gram, record it
		target.erase();// reset the output string
		target.append("");
		xx = 1;
		for ( iter = strL.begin(); iter != strL.end(); iter++){
		  // Rprintf("in for\n");
		  //sprintf(buf3, "%s", *iter);
		  //Rprintf("str %s\n", *iter);
		  //Rprintf("after Rprintf in for\n");
		  target.append( *iter);// target.append( buf3); //target.append( *iter);
		  //					Rprintf("target append\n");
		  if(xx < Ngram){
			 target.append(" ");//target.append("-");
		  }
		  xx++;
		} // for 
		xx = 1;
		if(typeSet == 1){
		  target.append(" ");
		  for ( hinsi_it = hinsi.begin(); hinsi_it != hinsi.end(); hinsi_it++){
		  // Rprintf("in for\n");
			//sprintf(buf3, "%s", *iter);
			//Rprintf("str %s\n", *iter);
			//Rprintf("after Rprintf in for\n");
			target.append( *hinsi_it);// target.append( buf3); //target.append( *iter);
		  //					Rprintf("target append\n");
			if(xx < Ngram){
			   target.append(" ");//target.append("-");
			}
			xx++;
		  } // for
		
		  xx = 1;
		
		  target.append(" ");
		  for ( saibun_it = saibun.begin(); saibun_it != saibun.end(); saibun_it++){
			// Rprintf("in for\n");
			//sprintf(buf3, "%s", *iter);
		  //Rprintf("str %s\n", *iter);
		  //Rprintf("after Rprintf in for\n");
			target.append( *saibun_it);// target.append( buf3); //target.append( *iter);
			//					Rprintf("target append\n");
		  if(xx < Ngram){
			 target.append(" ");//target.append("-");
		  }
		  xx++;
		  } // for
		
		  xx = 1;
		}//if(typeSet == 1){
		
		
		pma0 = ma0.find(target);// is this N-gram already in the global map?
		if(pma0 != ma0.end()){
		  pma0->second =  pma0->second + 1;// increment its count
		}
		else{// not in the map yet: insert a new entry
		  ma0.insert(make_pair(target, 1));// 1 means first occurrence
		}

		pma = ma1.find(target);// already in the per-document map?
		if(pma != ma1.end()){
		  pma->second =  pma->second + 1;// increment its count
		}
		else{// not in the map yet: insert a new entry
		  ma1.insert(make_pair(target, 1));// 1 means first occurrence
		}

		strL.pop_front();// drop the oldest element
		
		if(typeSet == 1){
		  hinsi.pop_front();
		  saibun.pop_front();
		}
	  }				  // if(strL.size() >= Ngram)
				
	}//for(;node;)// Rprintf("node check ended\n");
			  
  }
  	mecab_destroy(mecab);


	return (R_NilValue);// return 0;


}
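
Examples #6 and #9 both rely on a `CHECK()` macro defined outside the listings. MeCab's C API sample code ships a macro of roughly this shape, so the definition here is a sketch modeled on it; the macros these examples actually use may differ (for instance returning -1 or `R_NilValue` instead of exiting):

#include <stdio.h>
#include <stdlib.h>

/* Sketch modeled on the CHECK macro in MeCab's C API samples;
   expects a mecab_t *mecab in scope, as in the examples above. */
#define CHECK(eval) if (!(eval)) { \
    fprintf(stderr, "Exception: %s\n", mecab_strerror(mecab)); \
    mecab_destroy(mecab); \
    exit(EXIT_FAILURE); \
  }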