int main() { HEGGHANDLE hHandle = eggPath_open("file:///tmp/"); HEGGINDEXWRITER hIndexWriter = eggIndexWriter_open(hHandle, ""); int cnt = 100; index_t i = 0; while(i != cnt) { HEGGDOCUMENT hDocument = eggDocument_new(); HEGGFIELD hField1 = eggField_new("content", "hello", strlen("hello")+1, EGG_NOT_ANALYZED | EGG_INDEX_STRING | EGG_STORAGE); eggDocument_add(hDocument, hField1); eggIndexWriter_add_document(hIndexWriter, hDocument); eggDocument_delete(hDocument); i++; } if(eggIndexWriter_optimize(hIndexWriter)) { printf("optimize success! \n"); } eggIndexWriter_close(hIndexWriter); eggPath_close(hHandle); return 0; }
int main() { HEGGHANDLE hHandle = eggPath_open("file:///tmp/"); HEGGINDEXREADER hIndexReader = eggIndexReader_open(hHandle); HEGGINDEXSEARCHER hIndexSearcher = eggIndexSearcher_new(hIndexReader); HEGGQUERY hq = eggQuery_new_string("content", "is good", strlen("is good"), ANALYZER_CWSLEX); if(hq != EGG_NULL) { printf("query init OK! \n"); } //填0取所有结果,非0按填的值取个数 HEGGTOPCOLLECTOR hTopCollector = eggTopCollector_new(0); eggTopCollector_set_sorttype(hTopCollector, EGG_TOPSORT_WEIGHT); printf("eggTopCollector sortType is EGG_TOPSORT_WEIGHT \n"); EBOOL ret = eggIndexSearcher_search_with_query(hIndexSearcher, hTopCollector, hq); if (ret == EGG_TRUE) { //对最后结果进行排序 //EGG_TOPSORT_WEIGHT: 按document的weight排序 //EGG_TOPSORT_SCORE: 按查询关键字的相关度排序(打分排序) //EGG_TOPSORT_NOT: 不排序 HEGGSCOREDOC lp_score_doc = eggTopCollector_top_docs(hTopCollector); count_t cnt = eggTopCollector_total_hits(hTopCollector); index_t i = 0; printf("have hit %u documents\n", cnt); while (i != cnt) { HEGGDOCUMENT lp_eggDocument = EGG_NULL; eggIndexReader_get_document(hIndexReader, lp_score_doc[i].idDoc, &lp_eggDocument); HEGGFIELD lp_field = eggDocument_get_field(lp_eggDocument, "content"); unsigned len = 0; char *val = eggField_get_value(lp_field, &len); printf("id : [%llu], content : [%s], weight : [%d]\n", EGGDID_DOCID(&lp_score_doc[i].idDoc), val, eggDocument_get_weight(lp_eggDocument)); lp_field = 0; eggDocument_delete(lp_eggDocument); i++; } } eggTopCollector_delete(hTopCollector); eggQuery_delete(hq); eggIndexSearcher_delete(hIndexSearcher); eggIndexReader_close(hIndexReader); eggPath_close(hHandle); return 0; }
/*
 * Build the dictionary word list for one analyzer.
 *
 * Looks up the "dict" records of analyzerName through eggSySRecorder_get_dict,
 * walks the hits from the last one to the first, and for every document that
 * has both an EGG_SYS_DICTNAME and an EGG_SYS_DICTKEY value pushes the pair
 * onto a block-item list (tag "NR", value 1000000).
 *
 * analyzerName: analyzer whose dictionary is requested.
 * Returns the resulting list, NULL when nothing was pushed, or EGG_NULL when
 * analyzerName is a null pointer.
 */
PUBLIC P_NEW_BLOCK_ITEM eggAnalyzer_get_dictlist(char *analyzerName)
{
    if (!analyzerName)
    {
        return EGG_NULL;
    }

    P_NEW_BLOCK_ITEM pResult = NULL;

    HEGGTOPCOLLECTOR hCollector = eggSySRecorder_get_dict("dict", analyzerName);
    HEGGSCOREDOC hits = eggTopCollector_top_docs(hCollector);
    count_t remaining = eggTopCollector_total_hits(hCollector);
    HEGGINDEXREADER hReader = eggSySRecorder_alloc_reader();

    while (remaining != 0)
    {
        remaining--;

        HEGGDOCUMENT hDoc = EGG_NULL;
        eggIndexReader_get_document(hReader, hits[remaining].idDoc, &hDoc);

        HEGGFIELD hNameField = eggDocument_get_field(hDoc, EGG_SYS_DICTNAME);
        HEGGFIELD hKeyField = eggDocument_get_field(hDoc, EGG_SYS_DICTKEY);

        size32_t nameLen = 0;
        size32_t keyLen = 0;
        char *rawName = eggField_get_value(hNameField, &nameLen);
        char *rawKey = eggField_get_value(hKeyField, &keyLen);

        if (rawName && rawKey)
        {
            /* Work on NUL-terminated copies bounded by the reported lengths. */
            char *nameCopy = strndup(rawName, nameLen);
            char *keyCopy = strndup(rawKey, keyLen);

            eggPrtLog_info("eggAnalyzer", "[%s], [%s]\n", nameCopy, keyCopy);
            pResult = BlockItemPushWord(pResult, nameCopy, keyCopy, "NR", 1000000);

            free(nameCopy);
            free(keyCopy);
        }

        /* rawName / rawKey are not freed here; only the document is deleted. */
        eggDocument_delete(hDoc);
    }

    eggTopCollector_delete(hCollector);
    eggSySRecorder_free_reader((void**)&hReader);

    return pResult;
}
int main(int argc, char* argv[]) { HEGGHANDLE hHandle = eggPath_open("file:///egg/"); HEGGINDEXREADER hIndexReader = eggIndexReader_open(hHandle); HEGGINDEXSEARCHER hIndexSearcher = eggIndexSearcher_new(hIndexReader); HEGGQUERY hq = eggQuery_new_string("content", argv[1], strlen(argv[1]), ""); if(hq != EGG_NULL) { printf("query init OK! \n"); } HEGGTOPCOLLECTOR hTopCollector = eggTopCollector_new(0); EBOOL ret = eggIndexSearcher_search_with_query(hIndexSearcher, hTopCollector, hq); if (ret == EGG_TRUE) { HEGGSCOREDOC lp_score_doc = eggTopCollector_top_docs(hTopCollector); count_t cnt = eggTopCollector_total_hits(hTopCollector); index_t i = 0; printf("have hit %u documents\n", cnt); while (i != cnt) { HEGGDOCUMENT lp_eggDocument = EGG_NULL; eggIndexReader_get_document(hIndexReader, lp_score_doc[i].idDoc, &lp_eggDocument); HEGGFIELD lp_field = eggDocument_get_field(lp_eggDocument, "content"); unsigned len = 0; char *val = eggField_get_value(lp_field, &len); printf("id : [%llu], content : [%s], \n", EGGDID_DOCID(&lp_score_doc[i].idDoc), val); lp_field = 0; eggDocument_delete(lp_eggDocument); i++; } } eggTopCollector_delete(hTopCollector); eggQuery_delete(hq); eggIndexSearcher_delete(hIndexSearcher); eggIndexReader_close(hIndexReader); eggPath_close(hHandle); return 0; }
int main() { //ImLexAnalyzer* p_la = (ImLexAnalyzer*)ImCnLexAnalyzer_new(); // ImLexAnalyzer* p_la = (ImLexAnalyzer*)ImCnLexAnalyzer_new(); HEGGDIRECTORY hDirectory = eggDirectory_open("/ape/ImRoBot5/index/bbstest"); HEGGINDEXREADER hIndexReader = eggIndexReader_open(hDirectory); HEGGINDEXSEARCHER hIndexSearcher = eggIndexSearcher_new(hIndexReader); HEGGQUERY h1, h2, h3; h1 = eggQuery_new_string("title", "人", 3, ANALYZER_CWSLEX); //h2 = eggQuery_new_string("content", "new", 3, p_la); // h2 = eggQuery_new_string("body", "some description", 16, p_la); // h3 = eggQuery_new_int32("price", 199); // h2 = eggQuery_and(h3, h2); //h1 = eggQuery_or(h2, h1); // h3 = h2 = 0; HEGGTOPCOLLECTOR hTopCollector = eggTopCollector_new(0); int ret = eggIndexSearcher_search_with_query(hIndexSearcher, hTopCollector, h1); if (ret == EGG_TRUE) { eggTopCollector_normalized(hTopCollector, EGG_TOPSORT_SCORE); // eggTopCollector_normalized(hTopCollector, EGG_TOPSORT_NOT); HEGGSCOREDOC lp_score_doc = eggTopCollector_top_docs(hTopCollector); count_t cnt = eggTopCollector_total_hits(hTopCollector); printf("have hit %u documents\n", cnt); if (cnt > 0) { printf("last document: id[%llu]\n", lp_score_doc[cnt-1].idDoc); HEGGDOCUMENT lp_eggDocument = EGG_NULL; eggIndexReader_get_document(hIndexReader, lp_score_doc[cnt-1].idDoc, &lp_eggDocument); HEGGFIELD lp_field = eggDocument_get_field(lp_eggDocument,"title"); unsigned len = 0; char *val = eggField_get_value(lp_field, &len); printf("last document: body[%.*s]\n", len, val); lp_field = 0; eggDocument_delete(lp_eggDocument); } } eggTopCollector_delete(hTopCollector); eggQuery_delete(h1); eggIndexSearcher_delete(hIndexSearcher); eggIndexReader_close(hIndexReader); eggDirectory_close(hDirectory); ImLexAnalyzer_delete(p_la); }
void CeggItfTest::testReIndex(char* dir_path) { struct timeval tv_start, tv_end; int i = 0; gettimeofday(&tv_start, EGG_NULL); void *hEggHandle = eggPath_open(dir_path); HEGGINDEXWRITER hIndexWrite; hIndexWrite = eggIndexWriter_open(hEggHandle, ANALYZER_CWSLEX); char *buf = EGG_NULL; size_t fileSize = 0; int count = 0; char databuf[4096]; char c= 0; getchar(); while (( c = getchar() ) == 'c') { getchar(); offset64_t id = 0; printf("id :"); scanf("%llu", &id); getchar(); HEGGDOCUMENT hDocument = eggDocument_new(); sprintf(databuf, "body : %s", buf); HEGGFIELD hField1 = eggField_new("body1", "159", strlen("159"), EGG_NOT_ANALYZED | EGG_INDEX_STRING | EGG_STORAGE); eggDocument_add(hDocument, hField1); // eggIndexWriter_reIndex_document(hIndexWrite, hDocument, id); eggDocument_delete(hDocument); } eggIndexWriter_optimize(hIndexWrite); eggIndexWriter_close(hIndexWrite); gettimeofday(&tv_end, EGG_NULL); return ; }
void CeggItfTest::testExportDoc(char* dir_path) { void *hEggHandle = eggPath_open(dir_path); HEGGINDEXREADER hIndexReader = eggIndexReader_open(hEggHandle); offset64_t n_cursor = 0; HEGGDOCUMENT lp_eggDocument = EGG_NULL; while(lp_eggDocument = eggIndexReader_export_document(hIndexReader, &n_cursor)) { HEGGFIELD lp_field = eggDocument_get_field(lp_eggDocument, "content"); unsigned int len = 0; if(lp_field) printf("%s", eggField_get_value(lp_field, &len)); eggDocument_delete(lp_eggDocument); } return ; }
int main() { HEGGHANDLE hHandle = eggPath_open("file:///tmp/"); HEGGINDEXWRITER hIndexWriter = eggIndexWriter_open(hHandle, ""); int cnt = 20; index_t i = 0; while(i != cnt) { HEGGDOCUMENT hDocument = eggDocument_new(); HEGGFIELD hField1 = eggField_new("sex", (char*)&persons[i].sex, 1, EGG_NOT_ANALYZED | EGG_INDEX_STRING | EGG_STORAGE); HEGGFIELD hField2 = eggField_new("num", (void*)&persons[i].num, sizeof(int), EGG_NOT_ANALYZED | EGG_INDEX_INT32 | EGG_STORAGE); eggDocument_add(hDocument, hField1); eggDocument_add(hDocument, hField2); eggIndexWriter_add_document(hIndexWriter, hDocument); eggDocument_delete(hDocument); i++; } if(eggIndexWriter_optimize(hIndexWriter)) { printf("optimize success! \n"); } eggIndexWriter_close(hIndexWriter); eggPath_close(hHandle); return 0; }
void CeggItfTest::testIndexSearch(char* dir_path) { char key[1000] = {0}; type_t op = EGG_TOPSORT_SCORE; HEGGHANDLE hEggHandle = eggPath_open(dir_path); HEGGINDEXREADER hIndexReader = eggIndexReader_open(hEggHandle); HEGGINDEXSEARCHER hIndexSearcher = eggIndexSearcher_new(hIndexReader); char fieldName[200] = ""; HEGGQUERY h1; h1 = getQuery(); char c; printf("key range search?(y/n)"); scanf("%c", &c); if(c == 'y') { printf("FieldName: "); scanf("%s", fieldName); int startPrice = 0; int endPrice = 0; printf("start Price: "); scanf("%d", &startPrice); printf("end Price: "); scanf("%d", &endPrice); HEGGQUERY h2 = 0; op = EGG_TOPSORT_ORDERBY; h2 = eggQuery_new_int32range(fieldName, startPrice, endPrice); h1 = eggQuery_and(h1, h2); } HEGGTOPCOLLECTOR hTopCollector = eggTopCollector_new(0); switch (1) { case 1: eggTopCollector_set_orderby(hTopCollector, 2, "num1", 1, "num2", 1); break; case 2: eggTopCollector_set_sorttype(hTopCollector, EGG_TOPSORT_SCORE); break; default: eggTopCollector_set_sorttype(hTopCollector, EGG_TOPSORT_NOT); break; } struct timeval vstart, vend; gettimeofday(&vstart, 0); EBOOL ret = eggIndexSearcher_search_with_query(hIndexSearcher, hTopCollector, h1); gettimeofday(&vend, 0); printf("search_with_query time : %f\n", (double)(vend.tv_sec - vstart.tv_sec) + (double)(vend.tv_usec - vstart.tv_usec)/1000000); if(ret ==EGG_FALSE) { printf("no key !\n"); exit(1); } // eggTopCollector_delete(hTopCollector); // eggQuery_delete(h1); // eggIndexSearcher_delete(hIndexSearcher); // eggIndexReader_close(hIndexReader); // eggPath_close(hEggHandle); // return ; if (0) { // deprecated HEGGQUERY hQuery_tmp = 0; //取时间范围 hQuery_tmp = eggQuery_new_stringrange("time", "1", "2"); //按时间排序 eggIndexSearcher_filter(hIndexSearcher, hTopCollector, hQuery_tmp, 1); //按相关度排序 //eggIndexSearcher_filter(hIndexSearcher, hTopCollector, hQuery_tmp, 0); eggQuery_delete(hQuery_tmp); } HEGGSCOREDOC lp_score_doc = eggTopCollector_top_docs(hTopCollector); count_t cnt = 
eggTopCollector_total_hits(hTopCollector); index_t idx = 0; printf("count : %d\n", cnt); // return ; #if(0) HEGGDOCUMENT* ppeggDocument = EGG_NULL; eggIndexReader_get_documentSet(hIndexReader, lp_score_doc, cnt, &ppeggDocument); while(idx != cnt) { printf("--------------------------\n"); HEGGFIELD lp_field = eggDocument_get_field(ppeggDocument[idx], "f_id"); unsigned int len = 0; if(lp_field) printf("count %d id : %lld \nf_id : %s ", idx, EGGDID_DOCID(&(lp_score_doc[idx].idDoc)), eggField_get_value(lp_field, &len)); eggDocument_delete(ppeggDocument[idx]); idx++; } #endif #if(0) while(idx != cnt && idx < 10000) { HEGGDOCUMENT lp_eggDocument = EGG_NULL; printf("%lld ----\n", lp_score_doc[idx].idDoc); eggIndexReader_get_document(hIndexReader, lp_score_doc[idx].idDoc, &lp_eggDocument); HEGGFIELD lp_field = eggDocument_get_field(lp_eggDocument, "f_id"); // HEGGFIELD lp_field2 = eggDocument_get_field(lp_eggDocument, "random"); //HEGGFIELD lp_field3 = eggDocument_get_field(lp_eggDocument, "num1"); //HEGGFIELD lp_field4 = eggDocument_get_field(lp_eggDocument, "num2"); // HEGGFIELD lp_field3 = eggDocument_get_field(lp_eggDocument, "spanfield2"); unsigned int len = 0; unsigned int len2 = 0; unsigned int len3 = 0; if(lp_field) printf("count %d id : %lld f_id: %s \n", idx, EGGDID_DOCID(&(lp_score_doc[idx].idDoc)), eggField_get_value(lp_field, &len) ); // if(lp_field3) // printf("count %d id : %lld content : %s weightfield: %d\n", idx, EGGDID_DOCID(&(lp_score_doc[idx].idDoc)), eggField_get_value(lp_field3, &len), eggField_get_value(lp_field3, &len3)); /* { char **pkeywords; size16_t *pkeySz; // int **ppos = NULL; count_t nums; eggTopCollector_get_keyPosition(hTopCollector, EGGDID_DOCID(&lp_score_doc[idx].idDoc), "content", &pkeywords, &pkeySz, NULL, &nums); int i; for (i = 0; i < nums; i++) { printf("Key[%.*s]\n", pkeySz[i], pkeywords[i]); } free(pkeySz); for (i = 0; i < nums; i++) { free(pkeywords[i]); } free(pkeywords); } */ // lp_field = eggDocument_get_field(lp_eggDocument, 
"price"); // printf("date : [%s] \n", eggField_get_value(lp_field, &len)); eggDocument_delete(lp_eggDocument); idx++; // usleep(5000); } #endif eggTopCollector_delete(hTopCollector); eggQuery_delete(h1); eggIndexSearcher_delete(hIndexSearcher); eggIndexReader_close(hIndexReader); eggPath_close(hEggHandle); }
void CeggItfTest::testIndexAdd(char* dir_path) { int i = 0; HEGGHANDLE hEggHandle = eggPath_open(dir_path); HEGGINDEXWRITER hIndexWrite; FILE* fp_file = fopen("kfc.txt", "r+"); hIndexWrite = eggIndexWriter_open(hEggHandle, "weightfield"); char *buf = EGG_NULL; size_t fileSize = 0; int count = 0; char databuf[4096]; srand(time(0)); while (getline(&buf, &fileSize, fp_file) != -1 & count < 5) { count++; if (buf[strlen(buf)-1] == '\n') { buf[strlen(buf)-1] = '\0'; } // printf("%s\n", buf); HEGGDOCUMENT hDocument = eggDocument_new(); char buftmp[1024]; sprintf(buftmp, "%s",buf); // memcpy(buftmp, ); // HEGGFIELD hField1 = eggField_new("content", buftmp, strlen(buftmp), EGG_ANALYZED | EGG_INDEX_STRING | EGG_STORAGE); HEGGFIELD hField1 = eggField_new("content", "1", strlen("1"), EGG_ANALYZED | EGG_INDEX_STRING | EGG_STORAGE); int count1 = count % 100; char numbuf[10]; sprintf(numbuf, "%d", count1); HEGGFIELD hField2 = eggField_new("random", (char*)&count1, 4, EGG_NOT_ANALYZED|EGG_RANGE_INDEX | EGG_INDEX_INT32 | EGG_STORAGE); int num1 = rand()%100; HEGGFIELD hField3 = eggField_new("num1", (char*)&num1, 4, EGG_NOT_ANALYZED|EGG_RANGE_INDEX | EGG_INDEX_INT32 | EGG_STORAGE); int num2 = rand()%100; HEGGFIELD hField4 = eggField_new("num2", (char*)&num2, 4, EGG_NOT_ANALYZED|EGG_RANGE_INDEX | EGG_INDEX_INT32 | EGG_STORAGE); HEGGFIELD hField5 = eggField_new("title", "111", 3, EGG_NOT_ANALYZED|EGG_INDEX_STRING | EGG_STORAGE); eggDocument_add(hDocument, hField1); eggDocument_add(hDocument, hField2); eggDocument_add(hDocument, hField3); eggDocument_add(hDocument, hField4); eggDocument_add(hDocument, hField5); eggIndexWriter_add_document(hIndexWrite, hDocument); eggDocument_delete(hDocument); if (count % 5000 == 0) { printf("count %d\n", count); break; eggIndexWriter_optimize(hIndexWrite); } // sleep(1); } eggIndexWriter_optimize(hIndexWrite); eggIndexWriter_close(hIndexWrite); eggPath_close(hEggHandle ); fclose(fp_file); free(buf); return ; }
/*
 * Apply a batch of index operations carried in a network package.
 *
 * The payload following the package header is a sequence of unit packages,
 * each an EGGNETUNITPACKAGE header followed by `size` bytes of data:
 *   EGG_PACKAGE_OPTIMIZE_ADD    - serialized document to add
 *   EGG_PACKAGE_OPTIMIZE_DELETE - EGGDID of the document to delete
 *   EGG_PACKAGE_OPTIMIZE_MODIFY - EGGDID followed by a serialized document
 *   any other type              - same layout, applied as incremental modify
 * All operations go to g_hEggMemEggHandle->hWriter. The index optimize pass
 * itself is not run here.
 *
 * hNetPackage: request package; may be invalid.
 * Returns a new EGG_PACKAGE_OPTIMIZE response holding one EGG_PACKAGE_RET
 * value: EGG_NET_IVDHANDLE for an invalid request, EGG_TRUE otherwise.
 */
PRIVATE HEGGNETPACKAGE eggMemServer_processing_optimize(HEGGNETPACKAGE hNetPackage)
{
    HEGGNETPACKAGE lp_res_package = eggNetPackage_new(EGG_PACKAGE_OPTIMIZE);
    EBOOL ret;

    if(POINTER_IS_INVALID(hNetPackage))
    {
        ret = EGG_NET_IVDHANDLE;
        lp_res_package = eggNetPackage_add(lp_res_package, &ret, sizeof(ret), EGG_PACKAGE_RET);
        return lp_res_package;
    }

    size32_t n_iter_sz = 0;
    char* lp_data_str = (char*)(hNetPackage + 1);

    struct timeval tvstart,tvend;
    gettimeofday(&tvstart, 0);

    /* '<' instead of '!=' so a unit whose size overshoots eSize ends the
     * walk instead of running past the end of the package. */
    while(n_iter_sz < hNetPackage->eSize)
    {
        HEGGNETUNITPACKAGE lp_unit_package = (HEGGNETUNITPACKAGE)(lp_data_str + n_iter_sz);
        char* lp_unit_data = (char*)(lp_unit_package + 1);

        if(lp_unit_package->ty == EGG_PACKAGE_OPTIMIZE_ADD)
        {
            /* NOTE(review): lp_doc_node is never freed in this function -
             * presumably eggDocument_unserialization takes ownership; confirm. */
            HEGGDOCNODE lp_doc_node = (HEGGDOCNODE)malloc(lp_unit_package->size);
            memcpy(lp_doc_node, lp_unit_data, lp_unit_package->size);

            HEGGDOCUMENT lp_document = eggDocument_unserialization(lp_doc_node);
            eggIndexWriter_add_document(g_hEggMemEggHandle->hWriter, lp_document);
            eggDocument_delete(lp_document);
        }
        else if(lp_unit_package->ty == EGG_PACKAGE_OPTIMIZE_DELETE)
        {
            EGGDID dId = {0};

            /* Never copy more than the id can hold: `size` comes from the
             * package and must not be allowed to overrun the stack object. */
            size_t n_copy_sz = lp_unit_package->size;
            if(n_copy_sz > sizeof(dId))
            {
                n_copy_sz = sizeof(dId);
            }
            memcpy(&dId, lp_unit_data, n_copy_sz);

            eggIndexWriter_delete_document(g_hEggMemEggHandle->hWriter, dId );
        }
        else if(lp_unit_package->size >= sizeof(EGGDID))
        {
            /* modify / incremental modify: document id, then the document.
             * Units too short to hold an id are skipped, because
             * `size - sizeof(dId)` would otherwise wrap around. */
            EGGDID dId = {0};
            size_t n_doc_sz = lp_unit_package->size - sizeof(dId);

            memcpy(&dId, lp_unit_data, sizeof(dId));

            HEGGDOCNODE lp_doc_node = (HEGGDOCNODE)malloc(n_doc_sz);
            memcpy(lp_doc_node, lp_unit_data + sizeof(dId), n_doc_sz);

            HEGGDOCUMENT lp_document = eggDocument_unserialization(lp_doc_node);

            /* NOTE(review): unlike the ADD branch, the document is not
             * deleted after these calls (the delete is disabled in the
             * original) - presumably the writer keeps it; confirm. */
            if(lp_unit_package->ty == EGG_PACKAGE_OPTIMIZE_MODIFY)
            {
                eggIndexWriter_modify_document(g_hEggMemEggHandle->hWriter, dId, lp_document);
            }
            else
            {
                eggIndexWriter_incrementmodify_document(g_hEggMemEggHandle->hWriter, dId, lp_document);
            }
        }

        n_iter_sz += sizeof(EGGNETUNITPACKAGE) + lp_unit_package->size;
    }

    gettimeofday(&tvend, 0);
    fprintf(stderr, "ADD doc time %f\n" , (tvend.tv_sec - tvstart.tv_sec) + (double)(tvend.tv_usec - tvstart.tv_usec)/1000000 );

    ret = EGG_TRUE;
    lp_res_package = eggNetPackage_add(lp_res_package, &ret, sizeof(ret), EGG_PACKAGE_RET);

    return lp_res_package;
}