예제 #1
0
/* Build a Doc with 'title' and 'content' fields taken from the supplied
 * NUL-terminated strings and add it to `indexer` with a boost of 1.0. */
static void
S_add_document(Indexer *indexer, const char *title, const char *content) {
    Doc *new_doc = Doc_new(NULL, 0);

    // 'title' field: wrap the C string, store it, then drop our references.
    String *title_key = Str_newf("title");
    String *title_val = Str_new_from_utf8(title, strlen(title));
    Doc_Store(new_doc, title_key, (Obj*)title_val);
    DECREF(title_key);
    DECREF(title_val);

    // 'content' field, handled the same way.
    String *content_key = Str_newf("content");
    String *content_val = Str_new_from_utf8(content, strlen(content));
    Doc_Store(new_doc, content_key, (Obj*)content_val);
    DECREF(content_key);
    DECREF(content_val);

    Indexer_Add_Doc(indexer, new_doc, 1.0);

    DECREF(new_doc);
}
예제 #2
0
/* Index three small documents into a RAM folder using two highlightable
 * full-text fields ("content", and "alt" with a 0.1 boost), then run the
 * raw-excerpt, highlight-excerpt and create-excerpt tests against a single
 * phrase-AND-term query. */
static void
test_highlighting(TestBatchRunner *runner) {
    // Schema: both fields share one tokenizer; only 'alt' gets a custom boost.
    Schema *schema = Schema_new();
    StandardTokenizer *std_tokenizer = StandardTokenizer_new();
    FullTextType *content_type = FullTextType_new((Analyzer*)std_tokenizer);
    FullTextType_Set_Highlightable(content_type, true);
    FullTextType *alt_type = FullTextType_new((Analyzer*)std_tokenizer);
    FullTextType_Set_Highlightable(alt_type, true);
    FullTextType_Set_Boost(alt_type, 0.1f);
    String *content_field = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content_field, (FieldType*)content_type);
    String *alt_field = (String*)SSTR_WRAP_UTF8("alt", 3);
    Schema_Spec_Field(schema, alt_field, (FieldType*)alt_type);
    DECREF(content_type);
    DECREF(alt_type);
    DECREF(std_tokenizer);

    RAMFolder *ram_folder = RAMFolder_new(NULL);
    Indexer *indexer = Indexer_new(schema, (Obj*)ram_folder, NULL, 0);

    {
        // Document 1: the shared test string in 'content'.
        Doc *doc = Doc_new(NULL, 0);
        String *text
            = (String *)SSTR_WRAP_UTF8(TEST_STRING, TEST_STRING_LEN);
        Doc_Store(doc, content_field, (Obj*)text);
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }

    {
        // Document 2: a short sentence in 'content'.
        Doc *doc = Doc_new(NULL, 0);
        String *text
            = (String *)SSTR_WRAP_UTF8("\"I see,\" said the blind man.", 28);
        Doc_Store(doc, content_field, (Obj*)text);
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }

    {
        // Document 3: unrelated 'content', plus the test string with a
        // 35-byte suffix in the low-boost 'alt' field.
        Doc *doc = Doc_new(NULL, 0);
        String *content_text
            = (String *)SSTR_WRAP_UTF8("x but not why or 2ee", 20);
        Doc_Store(doc, content_field, (Obj*)content_text);
        String *alt_text
            = (String *)SSTR_WRAP_UTF8(TEST_STRING
                                       " and extra stuff so it scores lower",
                                       TEST_STRING_LEN + 35);
        Doc_Store(doc, alt_field, (Obj*)alt_text);
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }

    Indexer_Commit(indexer);
    DECREF(indexer);

    // Search the freshly built index.  The length 14 covers the 12 literal
    // bytes plus PHI (presumably a 2-byte UTF-8 sequence -- defined elsewhere).
    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)ram_folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("\"x y z\" AND " PHI, 14);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);

    test_Raw_Excerpt(runner, searcher, query);
    test_Highlight_Excerpt(runner, searcher, query);
    test_Create_Excerpt(runner, searcher, query, hits);

    DECREF(hits);
    DECREF(searcher);
    DECREF(ram_folder);
    DECREF(schema);
}
/* Build an in-memory index from the shared test document set.
 *
 * Every element of TestUtils_doc_set() is stored in the "content" field of
 * its own Doc and added to an Indexer backed by a new RAMFolder, which is
 * then committed.
 *
 * Returns the RAMFolder (cast to Folder*).  It is not DECREF'd here, so the
 * caller is presumably responsible for releasing it.
 */
static Folder*
S_create_index(void)
{
    Schema     *schema  = (Schema*)TestSchema_new();
    RAMFolder  *folder  = RAMFolder_new(NULL);
    VArray     *doc_set = TestUtils_doc_set();
    Indexer    *indexer = Indexer_new(schema, (Obj*)folder, NULL, NULL, 0);
    u32_t i, max;

    // The set size is read once, in the loop initializer.
    for (i = 0, max = VA_Get_Size(doc_set); i < max; i++) {
        static CharBuf field = ZCB_LITERAL("content");
        Doc *doc = Doc_new(NULL, 0);
        Doc_Store(doc, &field, VA_Fetch(doc_set, i));
        Indexer_Add_Doc(indexer, doc, 1.0f);
        DECREF(doc);
    }

    Indexer_Commit(indexer);

    // Drop everything except the folder, which is handed back to the caller.
    DECREF(doc_set);
    DECREF(indexer);
    DECREF(schema);

    return (Folder*)folder;
}
예제 #4
0
/* Index one long document of filler sentences containing the terms "NNN"
 * and "MMM", search for both, and check that the 200-unit excerpt produced
 * by the Highlighter still contains at least one of the two terms. */
static void
test_hl_selection(TestBatchRunner *runner) {
    // Schema with a single highlightable 'content' field.
    Schema *schema = Schema_new();
    StandardTokenizer *std_tokenizer = StandardTokenizer_new();
    FullTextType *hl_type = FullTextType_new((Analyzer*)std_tokenizer);
    FullTextType_Set_Highlightable(hl_type, true);
    String *content_field = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content_field, (FieldType*)hl_type);
    DECREF(hl_type);
    DECREF(std_tokenizer);

    RAMFolder *ram_folder = RAMFolder_new(NULL);
    Indexer *indexer = Indexer_new(schema, (Obj*)ram_folder, NULL, 0);

    // Fixture text: the two search terms sit in neighbouring sentences,
    // surrounded by filler.
    static char filler_text[] =
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla NNN bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla MMM bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. ";
    Doc *doc = Doc_new(NULL, 0);
    String *body = (String *)SSTR_WRAP_UTF8(filler_text, strlen(filler_text));
    Doc_Store(doc, content_field, (Obj*)body);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    Indexer_Commit(indexer);
    DECREF(indexer);

    // Search for both terms and build an excerpt from the first hit.
    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)ram_folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("NNN MMM", 7);
    Highlighter *highlighter
        = Highlighter_new(searcher, query, content_field, 200);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
    HitDoc *first_hit = Hits_Next(hits);
    String *excerpt = Highlighter_Create_Excerpt(highlighter, first_hit);
    String *needle_mmm = (String*)SSTR_WRAP_UTF8("MMM", 3);
    String *needle_nnn = (String*)SSTR_WRAP_UTF8("NNN", 3);
    TEST_TRUE(runner,
              Str_Find(excerpt, needle_mmm) >= 0
              || Str_Find(excerpt, needle_nnn) >= 0,
              "Sentence boundary algo doesn't chop terms");

    DECREF(excerpt);
    DECREF(first_hit);
    DECREF(hits);
    DECREF(highlighter);
    DECREF(searcher);
    DECREF(ram_folder);
    DECREF(schema);
}
예제 #5
0
파일: TestSimple.c 프로젝트: kidaa/lucy
/* Exercise the Simple API against a RAM folder: add documents one at a
 * time, check the hit count after each step, destroy and re-create the
 * Simple object, iterate over hits, and finally add a document in a second
 * field. */
static void
test_simple(TestBatchRunner *runner) {
    RAMFolder *ram_folder = RAMFolder_new(NULL);
    String    *lang       = SSTR_WRAP_UTF8("en", 2);
    Simple    *simple     = Simple_new((Obj*)ram_folder, lang);

    String *food_key = SSTR_WRAP_UTF8("food", 4);

    {
        // One document: it must be found by a search issued right away.
        Doc *corn_doc = Doc_new(NULL, 0);
        String *corn = SSTR_WRAP_UTF8("creamed corn", 12);
        Doc_Store(corn_doc, food_key, (Obj*)corn);
        Simple_Add_Doc(simple, corn_doc);
        DECREF(corn_doc);

        String *term = SSTR_WRAP_UTF8("creamed", 7);
        uint32_t hit_count = Simple_Search(simple, term, 0, 10);
        TEST_INT_EQ(runner, hit_count, 1, "Search works right after add");
    }

    {
        // Two documents: the same query now reports both.
        Doc *spinach_doc = Doc_new(NULL, 0);
        String *spinach = SSTR_WRAP_UTF8("creamed spinach", 15);
        Doc_Store(spinach_doc, food_key, (Obj*)spinach);
        Simple_Add_Doc(simple, spinach_doc);
        DECREF(spinach_doc);

        String *term = SSTR_WRAP_UTF8("creamed", 7);
        uint32_t hit_count = Simple_Search(simple, term, 0, 10);
        TEST_INT_EQ(runner, hit_count, 2, "Search returns total hits");
    }

    {
        // Add a third document, then drop the Simple object and open a new
        // one on the same folder before searching.
        Doc *broccoli_doc = Doc_new(NULL, 0);
        String *broccoli = SSTR_WRAP_UTF8("creamed broccoli", 16);
        Doc_Store(broccoli_doc, food_key, (Obj*)broccoli);
        Simple_Add_Doc(simple, broccoli_doc);
        DECREF(broccoli_doc);

        DECREF(simple);
        simple = Simple_new((Obj*)ram_folder, lang);

        String *term = SSTR_WRAP_UTF8("cream", 5);
        uint32_t hit_count = Simple_Search(simple, term, 0, 10);
        TEST_INT_EQ(runner, hit_count, 3, "commit upon destroy");

        // Walk every hit and check the stored 'food' value.
        for (HitDoc *hit = Simple_Next(simple);
             hit != NULL;
             hit = Simple_Next(simple)
            ) {
            String *stored_food = (String*)HitDoc_Extract(hit, food_key);
            TEST_TRUE(runner, Str_Starts_With_Utf8(stored_food, "cream", 5),
                      "Next");
            DECREF(stored_food);
            DECREF(hit);
        }
    }

    {
        // A document in a different field ('band') with a capitalised value.
        Doc *band_doc = Doc_new(NULL, 0);
        String *band_key = SSTR_WRAP_UTF8("band", 4);
        String *band_name = SSTR_WRAP_UTF8("Cream", 5);
        Doc_Store(band_doc, band_key, (Obj*)band_name);
        Simple_Add_Doc(simple, band_doc);
        DECREF(band_doc);

        String *term = SSTR_WRAP_UTF8("cream", 5);
        uint32_t hit_count = Simple_Search(simple, term, 0, 10);
        TEST_INT_EQ(runner, hit_count, 4,
                    "Search uses correct EasyAnalyzer");
    }

    DECREF(simple);
    DECREF(ram_folder);
}
예제 #6
0
/* Build an in-memory index with two full-text fields over the shared test
 * document set.
 *
 *   - "plain": tokenized by the regex \S+.
 *   - "fancy": tokenized by the regex \w+, then passed through a stop filter
 *     whose stop list contains only "x".
 *
 * Each string from TestUtils_doc_set() is stored in both fields of its own
 * Doc.  Returns the RAMFolder (cast to Folder*); it is not DECREF'd here, so
 * the caller is presumably responsible for releasing it.
 */
static Folder*
build_index(void) {
    // Plain type.
    String         *pattern   = Str_newf("\\S+");
    RegexTokenizer *tokenizer = RegexTokenizer_new(pattern);
    FullTextType   *plain     = FullTextType_new((Analyzer*)tokenizer);

    // Fancy type.

    String         *word_pattern   = Str_newf("\\w+");
    RegexTokenizer *word_tokenizer = RegexTokenizer_new(word_pattern);

    Hash *stop_list = Hash_new(0);
    Hash_Store_Utf8(stop_list, "x", 1, (Obj*)CFISH_TRUE);
    SnowballStopFilter *stop_filter = SnowStop_new(NULL, stop_list);

    // NOTE(review): word_tokenizer and stop_filter are never DECREF'd below;
    // presumably Vec_Push takes over the caller's reference -- confirm.
    Vector *analyzers = Vec_new(0);
    Vec_Push(analyzers, (Obj*)word_tokenizer);
    Vec_Push(analyzers, (Obj*)stop_filter);
    PolyAnalyzer *fancy_analyzer = PolyAnalyzer_new(NULL, analyzers);

    FullTextType *fancy = FullTextType_new((Analyzer*)fancy_analyzer);

    // Schema.
    Schema *schema   = Schema_new();
    String *plain_str = Str_newf("plain");
    String *fancy_str = Str_newf("fancy");
    Schema_Spec_Field(schema, plain_str, (FieldType*)plain);
    Schema_Spec_Field(schema, fancy_str, (FieldType*)fancy);

    // Indexer.
    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    // Index documents: the same string goes into both fields.
    Vector *doc_set = TestUtils_doc_set();
    for (uint32_t i = 0; i < Vec_Get_Size(doc_set); ++i) {
        String *content_string = (String*)Vec_Fetch(doc_set, i);
        Doc *doc = Doc_new(NULL, 0);
        Doc_Store(doc, plain_str, (Obj*)content_string);
        Doc_Store(doc, fancy_str, (Obj*)content_string);
        Indexer_Add_Doc(indexer, doc, 1.0);
        DECREF(doc);
    }
    Indexer_Commit(indexer);

    // Clean up everything except the folder, which is returned.
    DECREF(doc_set);
    DECREF(indexer);
    DECREF(fancy_str);
    DECREF(plain_str);
    DECREF(schema);
    DECREF(fancy);
    DECREF(fancy_analyzer);
    DECREF(analyzers);
    DECREF(stop_list);
    DECREF(word_pattern);
    DECREF(plain);
    DECREF(tokenizer);
    DECREF(pattern);

    return (Folder*)folder;
}
예제 #7
0
파일: indexer.c 프로젝트: apache/lucy
/* Parse one source file into a Doc.
 *
 * Opens "<uscon_source>/<filename>", reads the first line as the title and
 * the following run of 7-bit characters as the body text, derives a category
 * from the filename prefix ("art", "amend" or "preamble"), and returns a new
 * Doc with the fields 'title', 'content', 'url' (the filename) and
 * 'category'.
 *
 * Any failure (out of memory, file cannot be opened, content cannot be
 * extracted, unknown filename prefix) prints a diagnostic to stderr and
 * terminates the process with exit(1).
 */
Doc*
S_parse_file(const char *filename) {
    // Room for "<uscon_source>/<filename>" plus the terminating NUL.
    size_t bytes = strlen(uscon_source) + 1 + strlen(filename) + 1;
    char *path = malloc(bytes);
    if (path == NULL) {
        perror("malloc");
        exit(1);
    }
    snprintf(path, bytes, "%s/%s", uscon_source, filename);

    FILE *stream = fopen(path, "r");
    if (stream == NULL) {
        perror(path);
        exit(1);
    }

    // The 'm' modifier makes fscanf allocate the buffers for both
    // conversions; they are released with free() below.
    char *title    = NULL;
    char *bodytext = NULL;
    if (fscanf(stream, "%m[^\r\n] %m[\x01-\x7F]", &title, &bodytext) != 2) {
        fprintf(stderr, "Can't extract title/bodytext from '%s'", path);
        exit(1);
    }

    const char *category = NULL;
    if (S_starts_with(filename, "art")) {
        category = "article";
    }
    else if (S_starts_with(filename, "amend")) {
        category = "amendment";
    }
    else if (S_starts_with(filename, "preamble")) {
        category = "preamble";
    }
    else {
        fprintf(stderr, "Can't derive category for %s", filename);
        exit(1);
    }

    Doc *doc = Doc_new(NULL, 0);

    {
        // Store 'title' field
        String *field = Str_newf("title");
        String *value = Str_new_from_utf8(title, strlen(title));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    {
        // Store 'content' field
        String *field = Str_newf("content");
        String *value = Str_new_from_utf8(bodytext, strlen(bodytext));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    {
        // Store 'url' field
        String *field = Str_newf("url");
        String *value = Str_new_from_utf8(filename, strlen(filename));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    {
        // Store 'category' field
        String *field = Str_newf("category");
        String *value = Str_new_from_utf8(category, strlen(category));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    fclose(stream);
    free(bodytext);
    free(title);
    free(path);
    return doc;
}