コード例 #1
0
/*
 * stemfile - copy the stream f to stdout, replacing every run of letters
 * with its lower-cased, stemmed form.
 *
 * z  stemmer state handed through to porter_stem().
 * f  input stream; read with getc() until EOF.
 *
 * Non-letter characters are echoed unchanged with putchar().  Each word is
 * collected in the buffer s, which is declared outside this function together
 * with its current capacity i_max; the buffer is grown by INC bytes whenever
 * it fills up.
 *
 * If the buffer cannot be grown, a message is written to stderr and the
 * function returns early; s and i_max are left untouched so the caller still
 * owns a valid buffer.
 */
void stemfile(struct stemmer * z, FILE * f)
{
   while (TRUE)
   {
      int ch = getc(f);

      if (ch == EOF) return;

      if (!LETTER(ch))
      {
         putchar(ch);
         continue;
      }

      int i = 0;

      while (TRUE)
      {
         if (i == i_max)
         {
            /* Grow through a temporary: assigning realloc()'s result
               straight to s would lose the old buffer on failure and the
               store below would then dereference NULL. */
            void * grown = realloc(s, i_max + INC + 1);

            if (grown == NULL)
            {
               fprintf(stderr, "stemfile: out of memory\n");
               return;
            }
            s = grown;
            i_max += INC;
         }

         ch = tolower(ch); /* forces lower case */

         s[i] = ch;
         i++;

         ch = getc(f);
         if (!LETTER(ch))
         {
            /* Push the terminator back so the outer loop echoes it
               (ungetc() of EOF is a no-op). */
            ungetc(ch, f);
            break;
         }
      }

      /* Call the stemmer on s[0..i-1] and use its result to
         zero-terminate the string in s. */
      s[porter_stem(z, s, i - 1) + 1] = 0;
      printf("%s", s);
   }
}
コード例 #2
0
ファイル: trec-DocParser.c プロジェクト: jiaul/JSENG
/*
 * get_trec_doc - read tokens from fin and fill *doc with the terms of the
 * next document.
 *
 * Tokens are fetched with get_token().  The tags "<doc>", "</doc>" and
 * "<docno>" are recognised; any other token that starts with '<' or ends
 * with '>' is skipped.  Once a "<docno>" has been seen, every token that is
 * not found in the stopword table is (optionally) stemmed and appended to
 * doc->terms.
 *
 * Returns 1 as soon as "</doc>" is read.  If get_token() stops returning a
 * positive value first, returns doc->len (the number of terms collected).
 */
int get_trec_doc(FILE *fin, document_t *doc)
{
  char TEMP_BUF[BUF_SIZE];
  int insideDoc = 0;
  int acceptText = 0;
  stop_t st;

  doc->len = 0;

  while (get_token(fin, TEMP_BUF, BUF_SIZE) > 0) {
     size_t tok_len;

     if (!strcmp(TEMP_BUF, "<doc>")) {
        insideDoc = 1;
        doc->len = 0;
        continue;
     }

     if (!strcmp(TEMP_BUF, "</doc>")) {
        insideDoc = 0;
        return 1;
     }

     if (insideDoc && !strcmp(TEMP_BUF, "<docno>")) {
        /* Keep the fgetc() result in an int: comparing a value already
           narrowed to char against EOF either never matches (unsigned char)
           or mistakes byte 0xFF for end-of-file (signed char). */
        int ch;
        int n = 0;

        /* Skip leading white space; the first other character starts the
           document number. */
        do {
           ch = fgetc(fin);
        } while (ch != EOF && isspace(ch));

        if (ch != EOF)
           doc->no[n++] = (char) ch;

        /* NOTE(review): doc->no's capacity is not visible here, so the copy
           is still unbounded -- add a limit once the size is known. */
        while ((ch = fgetc(fin)) != EOF && ch != '<' && !isspace(ch))
           doc->no[n++] = (char) ch;

        /* Give the terminator back to the tokenizer. */
        if (ch != EOF)
           ungetc(ch, fin);
        doc->no[n] = '\0';

        acceptText = 1;
        continue;
     }

     /* Ignore other tags.  The length check keeps the index in range for an
        empty token. */
     tok_len = strlen(TEMP_BUF);
     if (TEMP_BUF[0] == '<' || (tok_len > 0 && TEMP_BUF[tok_len - 1] == '>'))
        continue;

     if (acceptText) {
        /* Clamp the token to the maximum term length. */
        TEMP_BUF[MAX_TERM_LEN - 1] = '\0';
        st.word = TEMP_BUF;

        if (bsearch(&st, stopword, stop_size, sizeof(stop_t), comp_stop) == NULL) {
           if (STEM_FLAG)
              porter_stem(TEMP_BUF);

           /* NOTE(review): an empty token skips the copy but still consumes
              a slot below -- confirm that is intended. */
           if (strlen(TEMP_BUF) > 0)
              strcpy(doc->terms[doc->len].term, TEMP_BUF);
           doc->terms[doc->len].pos = doc->len;

           /* Stop advancing near capacity; later terms overwrite the last
              slot. */
           if (doc->len < MAX_DOC_LEN - 2)
              doc->len++;
        }
     }
  }

  return doc->len;
}
コード例 #3
0
/*
 * ch_ir_indexer_print_stats - print indexer statistics to stdout.
 *
 * px_indexer_ctxt  Indexer context whose x_stats and token hash map are read.
 * puc_token        NULL to print the overall token, timing and index
 *                  statistics; otherwise a token that is stemmed with
 *                  porter_stem() and looked up in the token hash map, in
 *                  which case its document frequency, occurrence count and
 *                  postings are printed if it is found.
 *
 * Returns eCH_IR_RET_INVALID_ARGS when px_indexer_ctxt is NULL and
 * eCH_IR_RET_SUCCESS otherwise (also when the token is not found).
 *
 * Counters are cast to unsigned int and printed with %u so the conversion
 * specifier matches the argument.  The compression ratio is reported as
 * 0.000 when the compressed size is zero instead of dividing by zero.
 */
CH_IR_RET_E ch_ir_indexer_print_stats (
   CH_IR_INDEXER_CTXT_X *px_indexer_ctxt,
   uint8_t *puc_token)
{
   CH_IR_RET_E e_ret_val = eCH_IR_RET_FAILURE;
   HM_FOR_EACH_PARAMS_X x_for_each_param = {eHM_DATA_STRUCT_INVALID};
   HM_RET_E e_hm_ret = eHM_RET_FAILURE;
   uint32_t ui_total_time_ms = 0;
   HM_NODE_DATA_X x_node_data = { eHM_KEY_TYPE_INVALID };
   CH_IR_TOKEN_HM_ENTRY_X *px_token_hm_entry = NULL;
   float f_compression_ratio = 0.0f;

   if (NULL == px_indexer_ctxt)
   {
      CH_IR_LOG_MED("Invalid Args");
      e_ret_val = eCH_IR_RET_INVALID_ARGS;
      goto CLEAN_RETURN;
   }

   if (NULL == puc_token)
   {
      /* The "most frequent words" table below is compiled out. */
#if 0
      printf ("30 most frequent words:\n");
      printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------",
         "------------------------------", "----------", "---------");
      printf ("| %7s | %30s | %10s | %7s|\n", "Sl. No.", "Token", "Occurances",
         "Frequency");
      printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------",
         "------------------------------", "----------", "---------");

      x_for_each_param.e_data_structure = eHM_DATA_STRUCT_LINKED_LIST;
      x_for_each_param.e_direction = eHM_FOR_EACH_DIRECTION_FORWARD;
      e_hm_ret = hm_for_each_v2 (px_indexer_ctxt->hl_token_hm,
         &x_for_each_param, ch_ir_indexer_token_hm_for_each_cbk,
         px_indexer_ctxt);

      printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------",
         "------------------------------", "----------", "---------");
#endif
      printf ("\n");

      printf ("******************Token Stats******************\n");
      printf ("* Total Tokens Ignored           : %10u *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_num_tokens_ignored);
      printf ("* Total Unique Tokens            : %10u *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_num_unique_tokens);
      printf ("* Total Tokens                   : %10u *\n",
         (unsigned int) px_indexer_ctxt->x_stats.x_tokenizer_stats.ui_token_count);
      printf ("***********************************************\n");

      printf ("\n");

      /* Total build time is the sum of the individual phase timings. */
      ui_total_time_ms = px_indexer_ctxt->x_stats.ui_tokenization_time_ms
         + px_indexer_ctxt->x_stats.ui_token_sort_time_ms
         + px_indexer_ctxt->x_stats.ui_index_serialization_time_ms
         + px_indexer_ctxt->x_stats.ui_index_compression_time_ms
         + px_indexer_ctxt->x_stats.ui_compressed_index_serialization_time_ms;

      printf ("*****************Timing Stats******************\n");
      printf ("* Tokenization                   : %7u ms *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_tokenization_time_ms);
      printf ("* Token Sorting                  : %7u ms *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_token_sort_time_ms);
      printf ("* Index Serialization            : %7u ms *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_index_serialization_time_ms);
      printf ("* Index Compression              : %7u ms *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_index_compression_time_ms);
      printf ("* Compressed Index Serialization : %7u ms *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_compressed_index_serialization_time_ms);
      printf ("* -------------------------------+----------- *\n");
      printf ("* Total Time To Build Index      : %7u ms *\n",
         (unsigned int) ui_total_time_ms);
      printf ("***********************************************\n");

      printf ("\n");

      /* Only divide when there is a compressed index to compare against. */
      if (0 != px_indexer_ctxt->x_stats.ui_compressed_index_size_bytes)
      {
         f_compression_ratio =
            (float) px_indexer_ctxt->x_stats.ui_uncompressed_index_size_bytes /
            (float) px_indexer_ctxt->x_stats.ui_compressed_index_size_bytes;
      }

      printf ("******************Index Stats******************\n");
      printf ("* Number of Inverted List       : %8u    *\n",
         (unsigned int) px_indexer_ctxt->x_stats.ui_num_unique_tokens);
      printf ("* Uncompressed Index            : %8u KB *\n",
         (unsigned int) (px_indexer_ctxt->x_stats.ui_uncompressed_index_size_bytes / 1024));
      printf ("* Compressed Index              : %8u KB *\n",
         (unsigned int) (px_indexer_ctxt->x_stats.ui_compressed_index_size_bytes / 1024));
      printf ("* Compression Ratio             : %8.3f    *\n",
         f_compression_ratio);

      printf ("***********************************************\n");

      printf ("\n");
   }
   else
   {
      /* Look the token up by its stemmed form.
         NOTE(review): porter_stem()'s result is used unchecked -- confirm it
         cannot return NULL. */
      puc_token = (uint8_t *) porter_stem ((char *) puc_token);

      (void) pal_memset (&x_node_data, 0x00, sizeof(x_node_data));
      x_node_data.e_hm_key_type = eHM_KEY_TYPE_STRING;
      x_node_data.u_hm_key.puc_str_key = puc_token;
      e_hm_ret = hm_search_node (px_indexer_ctxt->hl_token_hm, &x_node_data);
      if (eHM_RET_HM_NODE_FOUND == e_hm_ret)
      {
         px_token_hm_entry = (CH_IR_TOKEN_HM_ENTRY_X *) x_node_data.p_data;

         printf ("\nToken %s Found: Document Frequency: %u, Total Occurances: %u\n",
            (const char *) puc_token,
            (unsigned int) px_token_hm_entry->x_postings.ui_doc_freq,
            (unsigned int) px_token_hm_entry->ui_num_occurances);
         printf ("Interpret the following data as (Doc Id, Term Frequency, Gap)\n");

         /* Walk the token's postings; the callback is handed the indexer
            context. The traversal result is not examined. */
         x_for_each_param.e_data_structure = eHM_DATA_STRUCT_LINKED_LIST;
         x_for_each_param.e_direction = eHM_FOR_EACH_DIRECTION_FORWARD;
         e_hm_ret = hm_for_each_v2 (px_token_hm_entry->x_postings.hl_posting_hm,
            &x_for_each_param, ch_ir_indexer_postings_hm_for_each_cbk,
            px_indexer_ctxt);

         printf ("\n");
      }
   }
   e_ret_val = eCH_IR_RET_SUCCESS;
CLEAN_RETURN:
   return e_ret_val;
}