/*
 * Copy stream f to standard output, replacing every maximal run of
 * characters accepted by LETTER() with its lower-cased stem.
 *
 * z  stemmer state, passed through unchanged to porter_stem().
 * f  input stream, read with getc() until EOF.
 *
 * Characters rejected by LETTER() are echoed verbatim with putchar().
 * Words are collected in the file-level buffer s, whose capacity is tracked
 * in i_max and grown INC characters at a time.
 *
 * If the buffer cannot be grown, a message is written to stderr and the
 * function returns early; s and i_max are left unchanged and the partially
 * read word is not printed.
 */
void stemfile(struct stemmer * z, FILE * f)
{
   while (TRUE)
   {
      int ch = getc(f);
      if (ch == EOF) return;

      if (!LETTER(ch))
      {
         putchar(ch);
         continue;
      }

      int i = 0;
      while (TRUE)
      {
         if (i == i_max)
         {
            /* Grow through a temporary: assigning realloc()'s result
               straight to s would lose (leak) the old buffer on failure
               and the next store would go through a null pointer. */
            void * grown = realloc(s, i_max + INC + 1);
            if (grown == NULL)
            {
               fputs("stemfile: out of memory\n", stderr);
               return;
            }
            s = grown;
            i_max += INC;
         }

         ch = tolower(ch); /* forces lower case */
         s[i] = ch;
         i++;

         ch = getc(f);
         if (!LETTER(ch))
         {
            /* push the terminating character back so the outer loop
               handles it (echo, or EOF detection) */
            ungetc(ch, f);
            break;
         }
      }

      /* call the stemmer on s[0..i-1] and use its result to
         zero-terminate the string in s */
      s[porter_stem(z, s, i - 1) + 1] = 0;
      printf("%s", s);
   }
}
/*
 * Read tokens from fin with get_token() and fill doc with the terms of the
 * next document.
 *
 * Token handling, in order:
 *   "<doc>"    - marks the start of a document and resets doc->len to 0.
 *   "</doc>"   - ends the document; the function returns 1.
 *   "<docno>"  - (only after "<doc>") the following characters are read
 *                directly from fin into doc->no: leading whitespace is
 *                skipped, then characters are copied up to (not including)
 *                the next whitespace, '<', or end of file.  After this,
 *                ordinary tokens are accepted as text.
 *   any other token starting with '<' or ending with '>' is ignored.
 *   ordinary tokens are truncated to MAX_TERM_LEN - 1 characters, dropped
 *   if bsearch() finds them in the stopword table, stemmed with
 *   porter_stem() when STEM_FLAG is set, and appended to doc->terms with
 *   pos set to their index.
 *
 * Returns 1 when "</doc>" is seen; otherwise, once get_token() returns a
 * value <= 0, returns doc->len.
 *
 * NOTE(review): the write into doc->no is not bounded here because the
 * capacity of doc->no is not visible from this function - confirm that the
 * field is large enough for any document number in the input.
 */
int get_trec_doc(FILE *fin, document_t *doc)
{
    char TEMP_BUF[BUF_SIZE];
    int insideDoc = 0;
    int acceptText = 0;
    stop_t st;

    doc->len = 0;

    while (get_token(fin, TEMP_BUF, BUF_SIZE) > 0) {
        if (!strcmp(TEMP_BUF, "<doc>")) {
            insideDoc = 1;
            doc->len = 0;
            continue;
        }

        if (!strcmp(TEMP_BUF, "</doc>")) {
            return 1;
        }

        if (insideDoc && !strcmp(TEMP_BUF, "<docno>")) {
            /* Keep fgetc()'s result in an int: comparing a stored char with
               EOF never matches when char is unsigned and mistakes byte
               0xFF for end of file when it is signed.  The int value is
               also what isspace() and ungetc() require. */
            int ch;
            int i = 0;

            do {
                ch = fgetc(fin);
            } while (ch != EOF && isspace(ch));

            /* the first non-blank character is always stored, even '<' */
            if (ch != EOF) {
                doc->no[i++] = (char) ch;
            }

            while ((ch = fgetc(fin)) != EOF && ch != '<' && !isspace(ch)) {
                doc->no[i++] = (char) ch;
            }

            /* give the delimiter back so get_token() sees it */
            if (ch != EOF) {
                ungetc(ch, fin);
            }
            doc->no[i] = '\0';

            acceptText = 1;
            continue;
        }

        /* ignore other tags; the length guard keeps an empty token from
           indexing TEMP_BUF[-1] */
        size_t tok_len = strlen(TEMP_BUF);
        if (TEMP_BUF[0] == '<' || (tok_len > 0 && TEMP_BUF[tok_len - 1] == '>')) {
            continue;
        }

        if (!acceptText) {
            continue;
        }

        TEMP_BUF[MAX_TERM_LEN - 1] = '\0';
        st.word = TEMP_BUF;
        if (bsearch(&st, stopword, stop_size, sizeof(stop_t), comp_stop) != NULL) {
            continue; /* stopword */
        }

        if (STEM_FLAG) {
            porter_stem(TEMP_BUF);
        }

        /* A term that is empty after stemming is skipped entirely.
           Previously only the strcpy was skipped, so the slot kept
           whatever text it held before and was still counted. */
        if (TEMP_BUF[0] == '\0') {
            continue;
        }

        strcpy(doc->terms[doc->len].term, TEMP_BUF);
        doc->terms[doc->len].pos = doc->len;
        /* once the limit is reached the last slot is overwritten by each
           further term instead of growing the document */
        if (doc->len < MAX_DOC_LEN - 2) {
            doc->len++;
        }
    }

    return doc->len;
}
/*
 * Print indexer information to standard output.
 *
 * px_indexer_ctxt  indexer context whose x_stats and hl_token_hm are read.
 * puc_token        NULL to print the overall token, timing and index
 *                  statistics; otherwise a token to look up.
 *
 * With a token, the token is passed through porter_stem() and searched in
 * px_indexer_ctxt->hl_token_hm.  When the search reports
 * eHM_RET_HM_NODE_FOUND, its document frequency and occurrence count are
 * printed and ch_ir_indexer_postings_hm_for_each_cbk is run over its
 * postings list.  Nothing is printed for a token that is not found.
 *
 * Returns eCH_IR_RET_INVALID_ARGS when px_indexer_ctxt is NULL, and
 * eCH_IR_RET_SUCCESS in every other case (the results of the hash-map calls
 * do not affect the return value).
 */
CH_IR_RET_E ch_ir_indexer_print_stats (
   CH_IR_INDEXER_CTXT_X *px_indexer_ctxt,
   uint8_t *puc_token)
{
   HM_FOR_EACH_PARAMS_X x_for_each_param = {eHM_DATA_STRUCT_INVALID};
   HM_RET_E e_hm_ret = eHM_RET_FAILURE;
   uint32_t ui_total_time_ms = 0;
   HM_NODE_DATA_X x_node_data = { eHM_KEY_TYPE_INVALID };
   CH_IR_TOKEN_HM_ENTRY_X *px_token_hm_entry = NULL;

   if (NULL == px_indexer_ctxt)
   {
      CH_IR_LOG_MED("Invalid Args");
      return eCH_IR_RET_INVALID_ARGS;
   }

   /* Single-token query: handled first so the statistics report below
      needs no extra nesting. */
   if (NULL != puc_token)
   {
      puc_token = (uint8_t *) porter_stem ((char *) puc_token);

      (void) pal_memset (&x_node_data, 0x00, sizeof(x_node_data));
      x_node_data.e_hm_key_type = eHM_KEY_TYPE_STRING;
      x_node_data.u_hm_key.puc_str_key = puc_token;

      e_hm_ret = hm_search_node (px_indexer_ctxt->hl_token_hm, &x_node_data);
      if (eHM_RET_HM_NODE_FOUND != e_hm_ret)
      {
         return eCH_IR_RET_SUCCESS;
      }

      px_token_hm_entry = (CH_IR_TOKEN_HM_ENTRY_X *) x_node_data.p_data;
      printf ("\nToken %s Found: Document Frequency: %d, Total Occurances: %d\n",
         puc_token,
         px_token_hm_entry->x_postings.ui_doc_freq,
         px_token_hm_entry->ui_num_occurances);
      printf ("Interpret the following data as (Doc Id, Term Frequency, Gap)\n");

      x_for_each_param.e_data_structure = eHM_DATA_STRUCT_LINKED_LIST;
      x_for_each_param.e_direction = eHM_FOR_EACH_DIRECTION_FORWARD;
      e_hm_ret = hm_for_each_v2 (px_token_hm_entry->x_postings.hl_posting_hm,
         &x_for_each_param,
         ch_ir_indexer_postings_hm_for_each_cbk,
         px_indexer_ctxt);
      printf ("\n");

      return eCH_IR_RET_SUCCESS;
   }

   /* The "most frequent words" table is compiled out. */
#if 0
   printf ("30 most frequent words:\n");
   printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------", "------------------------------", "----------", "---------");
   printf ("| %7s | %30s | %10s | %7s|\n", "Sl. No.", "Token", "Occurances", "Frequency");
   printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------", "------------------------------", "----------", "---------");
   x_for_each_param.e_data_structure = eHM_DATA_STRUCT_LINKED_LIST;
   x_for_each_param.e_direction = eHM_FOR_EACH_DIRECTION_FORWARD;
   e_hm_ret = hm_for_each_v2 (px_indexer_ctxt->hl_token_hm,
      &x_for_each_param,
      ch_ir_indexer_token_hm_for_each_cbk,
      px_indexer_ctxt);
   printf ("|-%7s-+-%30s-+-%10s-+-%7s|\n", "-------", "------------------------------", "----------", "---------");
#endif

   /* Token counters. */
   printf ("\n");
   printf ("******************Token Stats******************\n");
   printf ("* Total Tokens Ignored : %10d *\n",
      px_indexer_ctxt->x_stats.ui_num_tokens_ignored);
   printf ("* Total Unique Tokens : %10d *\n",
      px_indexer_ctxt->x_stats.ui_num_unique_tokens);
   printf ("* Total Tokens : %10d *\n",
      px_indexer_ctxt->x_stats.x_tokenizer_stats.ui_token_count);
   printf ("***********************************************\n");
   printf ("\n");

   /* Per-phase timings; the total is the sum of the five phases printed. */
   ui_total_time_ms = px_indexer_ctxt->x_stats.ui_tokenization_time_ms
      + px_indexer_ctxt->x_stats.ui_token_sort_time_ms
      + px_indexer_ctxt->x_stats.ui_index_serialization_time_ms
      + px_indexer_ctxt->x_stats.ui_index_compression_time_ms
      + px_indexer_ctxt->x_stats.ui_compressed_index_serialization_time_ms;

   printf ("*****************Timing Stats******************\n");
   printf ("* Tokenization : %7d ms *\n",
      px_indexer_ctxt->x_stats.ui_tokenization_time_ms);
   printf ("* Token Sorting : %7d ms *\n",
      px_indexer_ctxt->x_stats.ui_token_sort_time_ms);
   printf ("* Index Serialization : %7d ms *\n",
      px_indexer_ctxt->x_stats.ui_index_serialization_time_ms);
   printf ("* Index Compression : %7d ms *\n",
      px_indexer_ctxt->x_stats.ui_index_compression_time_ms);
   printf ("* Compressed Index Serialization : %7d ms *\n",
      px_indexer_ctxt->x_stats.ui_compressed_index_serialization_time_ms);
   printf ("* -------------------------------+----------- *\n");
   printf ("* Total Time To Build Index : %7d ms *\n", ui_total_time_ms);
   printf ("***********************************************\n");
   printf ("\n");

   /* Index sizes are stored in bytes and shown in KB (integer division). */
   printf ("******************Index Stats******************\n");
   printf ("* Number of Inverted List : %8d *\n",
      px_indexer_ctxt->x_stats.ui_num_unique_tokens);
   printf ("* Uncompressed Index : %8d KB *\n",
      px_indexer_ctxt->x_stats.ui_uncompressed_index_size_bytes / 1024);
   printf ("* Compressed Index : %8d KB *\n",
      px_indexer_ctxt->x_stats.ui_compressed_index_size_bytes / 1024);
   /* NOTE(review): the ratio divides by the compressed size without a
      zero check. */
   printf ("* Compression Ratio : %8.3f *\n",
      (float) px_indexer_ctxt->x_stats.ui_uncompressed_index_size_bytes
         / (float) px_indexer_ctxt->x_stats.ui_compressed_index_size_bytes);
   printf ("***********************************************\n");
   printf ("\n");

   return eCH_IR_RET_SUCCESS;
}