Esempio n. 1
0
static Schema*
S_create_schema() {
    // Create a new schema.
    Schema *schema = Schema_new();

    // Create an analyzer.
    String       *language = Str_newf("en");
    EasyAnalyzer *analyzer = EasyAnalyzer_new(language);

    // Specify fields.

    FullTextType *type = FullTextType_new((Analyzer*)analyzer);

    {
        String *field_str = Str_newf("title");
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(field_str);
    }

    {
        String *field_str = Str_newf("content");
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(field_str);
    }

    DECREF(language);
    DECREF(analyzer);
    DECREF(type);
    return schema;
}
Esempio n. 2
0
static void
test_highlighting(TestBatchRunner *runner) {
    Schema *schema = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    FullTextType *plain_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(plain_type, true);
    FullTextType *dunked_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(dunked_type, true);
    FullTextType_Set_Boost(dunked_type, 0.1f);
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content, (FieldType*)plain_type);
    String *alt = (String*)SSTR_WRAP_UTF8("alt", 3);
    Schema_Spec_Field(schema, alt, (FieldType*)dunked_type);
    DECREF(plain_type);
    DECREF(dunked_type);
    DECREF(tokenizer);

    RAMFolder *folder = RAMFolder_new(NULL);
    Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    Doc *doc = Doc_new(NULL, 0);
    String *string = (String *)SSTR_WRAP_UTF8(TEST_STRING, TEST_STRING_LEN);
    Doc_Store(doc, content, (Obj*)string);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    doc = Doc_new(NULL, 0);
    string = (String *)SSTR_WRAP_UTF8("\"I see,\" said the blind man.", 28);
    Doc_Store(doc, content, (Obj*)string);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    doc = Doc_new(NULL, 0);
    string = (String *)SSTR_WRAP_UTF8("x but not why or 2ee", 20);
    Doc_Store(doc, content, (Obj*)string);
    string = (String *)SSTR_WRAP_UTF8(TEST_STRING
                                     " and extra stuff so it scores lower",
                                     TEST_STRING_LEN + 35);
    Doc_Store(doc, alt, (Obj*)string);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    Indexer_Commit(indexer);
    DECREF(indexer);

    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("\"x y z\" AND " PHI, 14);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);

    test_Raw_Excerpt(runner, searcher, query);
    test_Highlight_Excerpt(runner, searcher, query);
    test_Create_Excerpt(runner, searcher, query, hits);

    DECREF(hits);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
Esempio n. 3
0
static Schema*
S_create_schema() {
    // Create a new schema.
    Schema *schema = Schema_new();

    // Create an analyzer.
    String       *language = Str_newf("en");
    EasyAnalyzer *analyzer = EasyAnalyzer_new(language);

    // Specify fields.

    {
        String *field_str = Str_newf("title");
        FullTextType *type = FullTextType_new((Analyzer*)analyzer);
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(type);
        DECREF(field_str);
    }

    {
        String *field_str = Str_newf("content");
        FullTextType *type = FullTextType_new((Analyzer*)analyzer);
        FullTextType_Set_Highlightable(type, true);
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(type);
        DECREF(field_str);
    }

    {
        String *field_str = Str_newf("url");
        StringType *type = StringType_new();
        StringType_Set_Indexed(type, false);
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(type);
        DECREF(field_str);
    }

    {
        String *field_str = Str_newf("category");
        StringType *type = StringType_new();
        StringType_Set_Stored(type, false);
        Schema_Spec_Field(schema, field_str, (FieldType*)type);
        DECREF(type);
        DECREF(field_str);
    }

    DECREF(analyzer);
    DECREF(language);
    return schema;
}
Esempio n. 4
0
static void
test_hl_selection(TestBatchRunner *runner) {
    Schema *schema = Schema_new();
    StandardTokenizer *tokenizer = StandardTokenizer_new();
    FullTextType *plain_type = FullTextType_new((Analyzer*)tokenizer);
    FullTextType_Set_Highlightable(plain_type, true);
    String *content = (String*)SSTR_WRAP_UTF8("content", 7);
    Schema_Spec_Field(schema, content, (FieldType*)plain_type);
    DECREF(plain_type);
    DECREF(tokenizer);

    RAMFolder *folder = RAMFolder_new(NULL);
    Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    static char test_string[] =
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla NNN bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla MMM bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. "
        "bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla bla. ";
    Doc *doc = Doc_new(NULL, 0);
    String *string = (String *)SSTR_WRAP_UTF8(test_string, strlen(test_string));
    Doc_Store(doc, content, (Obj*)string);
    Indexer_Add_Doc(indexer, doc, 1.0f);
    DECREF(doc);

    Indexer_Commit(indexer);
    DECREF(indexer);

    Searcher *searcher = (Searcher*)IxSearcher_new((Obj*)folder);
    Obj *query = (Obj*)SSTR_WRAP_UTF8("NNN MMM", 7);
    Highlighter *highlighter = Highlighter_new(searcher, query, content, 200);
    Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
    HitDoc *hit = Hits_Next(hits);
    String *excerpt = Highlighter_Create_Excerpt(highlighter, hit);
    String *mmm = (String*)SSTR_WRAP_UTF8("MMM", 3);
    String *nnn = (String*)SSTR_WRAP_UTF8("NNN", 3);
    TEST_TRUE(runner, Str_Find(excerpt, mmm) >= 0 || Str_Find(excerpt, nnn) >= 0,
              "Sentence boundary algo doesn't chop terms");

    DECREF(excerpt);
    DECREF(hit);
    DECREF(hits);
    DECREF(highlighter);
    DECREF(searcher);
    DECREF(folder);
    DECREF(schema);
}
Esempio n. 5
0
TestSchema*
TestSchema_init(TestSchema *self)
{
    Tokenizer *tokenizer = Tokenizer_new(NULL);
    FullTextType *type = FullTextType_new((Analyzer*)tokenizer);

    Schema_init((Schema*)self);
    FullTextType_Set_Highlightable(type, true);
    Schema_Spec_Field(self, &content, (FieldType*)type);
    DECREF(type);
    DECREF(tokenizer);

    return self;
}
Esempio n. 6
0
void
Schema_Eat_IMP(Schema *self, Schema *other) {
    if (!Schema_is_a(self, Schema_get_class(other))) {
        THROW(ERR, "%o not a descendent of %o",
              Schema_get_class_name(self), Schema_get_class_name(other));
    }

    SchemaIVARS *const ovars = Schema_IVARS(other);
    HashIterator *iter = HashIter_new(ovars->types);
    while (HashIter_Next(iter)) {
        String    *field = HashIter_Get_Key(iter);
        FieldType *type  = (FieldType*)HashIter_Get_Value(iter);
        Schema_Spec_Field(self, field, type);
    }
    DECREF(iter);
}
Esempio n. 7
0
Schema*
Schema_Load_IMP(Schema *self, Obj *dump) {
    Hash *source = (Hash*)CERTIFY(dump, HASH);
    String *class_name
        = (String*)CERTIFY(Hash_Fetch_Utf8(source, "_class", 6), STRING);
    Class *klass = Class_singleton(class_name, NULL);
    Schema *loaded = (Schema*)Class_Make_Obj(klass);
    Hash *type_dumps
        = (Hash*)CERTIFY(Hash_Fetch_Utf8(source, "fields", 6), HASH);
    Vector *analyzer_dumps
        = (Vector*)CERTIFY(Hash_Fetch_Utf8(source, "analyzers", 9), VECTOR);
    Vector *analyzers
        = (Vector*)Freezer_load((Obj*)analyzer_dumps);
    UNUSED_VAR(self);

    // Start with a blank Schema.
    Schema_init(loaded);
    SchemaIVARS *const loaded_ivars = Schema_IVARS(loaded);
    Vec_Grow(loaded_ivars->uniq_analyzers, Vec_Get_Size(analyzers));

    HashIterator *iter = HashIter_new(type_dumps);
    while (HashIter_Next(iter)) {
        String *field     = HashIter_Get_Key(iter);
        Hash   *type_dump = (Hash*)CERTIFY(HashIter_Get_Value(iter), HASH);
        String *type_str  = (String*)Hash_Fetch_Utf8(type_dump, "type", 4);
        if (type_str) {
            if (Str_Equals_Utf8(type_str, "fulltext", 8)) {
                // Replace the "analyzer" tick with the real thing.
                Obj *tick
                    = CERTIFY(Hash_Fetch_Utf8(type_dump, "analyzer", 8), OBJ);
                Analyzer *analyzer
                    = (Analyzer*)Vec_Fetch(analyzers,
                                          (uint32_t)Json_obj_to_i64(tick));
                if (!analyzer) {
                    THROW(ERR, "Can't find analyzer for '%o'", field);
                }
                Hash_Store_Utf8(type_dump, "analyzer", 8, INCREF(analyzer));
                FullTextType *type
                    = (FullTextType*)S_load_type(FULLTEXTTYPE,
                                                 (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "string", 6)) {
                StringType *type
                    = (StringType*)S_load_type(STRINGTYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "blob", 4)) {
                BlobType *type
                    = (BlobType*)S_load_type(BLOBTYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "i32_t", 5)) {
                Int32Type *type
                    = (Int32Type*)S_load_type(INT32TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "i64_t", 5)) {
                Int64Type *type
                    = (Int64Type*)S_load_type(INT64TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "f32_t", 5)) {
                Float32Type *type
                    = (Float32Type*)S_load_type(FLOAT32TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else if (Str_Equals_Utf8(type_str, "f64_t", 5)) {
                Float64Type *type
                    = (Float64Type*)S_load_type(FLOAT64TYPE, (Obj*)type_dump);
                Schema_Spec_Field(loaded, field, (FieldType*)type);
                DECREF(type);
            }
            else {
                THROW(ERR, "Unknown type '%o' for field '%o'", type_str, field);
            }
        }
        else {
            FieldType *type
                = (FieldType*)CERTIFY(Freezer_load((Obj*)type_dump),
                                      FIELDTYPE);
            Schema_Spec_Field(loaded, field, type);
            DECREF(type);
        }
    }
    DECREF(iter);

    DECREF(analyzers);

    return loaded;
}
Esempio n. 8
0
static Folder*
build_index() {
    // Plain type.
    String         *pattern   = Str_newf("\\S+");
    RegexTokenizer *tokenizer = RegexTokenizer_new(pattern);
    FullTextType   *plain     = FullTextType_new((Analyzer*)tokenizer);

    // Fancy type.

    String         *word_pattern   = Str_newf("\\w+");
    RegexTokenizer *word_tokenizer = RegexTokenizer_new(word_pattern);

    Hash *stop_list = Hash_new(0);
    Hash_Store_Utf8(stop_list, "x", 1, (Obj*)CFISH_TRUE);
    SnowballStopFilter *stop_filter = SnowStop_new(NULL, stop_list);

    Vector *analyzers = Vec_new(0);
    Vec_Push(analyzers, (Obj*)word_tokenizer);
    Vec_Push(analyzers, (Obj*)stop_filter);
    PolyAnalyzer *fancy_analyzer = PolyAnalyzer_new(NULL, analyzers);

    FullTextType *fancy = FullTextType_new((Analyzer*)fancy_analyzer);

    // Schema.
    Schema *schema   = Schema_new();
    String *plain_str = Str_newf("plain");
    String *fancy_str = Str_newf("fancy");
    Schema_Spec_Field(schema, plain_str, (FieldType*)plain);
    Schema_Spec_Field(schema, fancy_str, (FieldType*)fancy);

    // Indexer.
    RAMFolder *folder  = RAMFolder_new(NULL);
    Indexer   *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);

    // Index documents.
    Vector *doc_set = TestUtils_doc_set();
    for (uint32_t i = 0; i < Vec_Get_Size(doc_set); ++i) {
        String *content_string = (String*)Vec_Fetch(doc_set, i);
        Doc *doc = Doc_new(NULL, 0);
        Doc_Store(doc, plain_str, (Obj*)content_string);
        Doc_Store(doc, fancy_str, (Obj*)content_string);
        Indexer_Add_Doc(indexer, doc, 1.0);
        DECREF(doc);
    }
    Indexer_Commit(indexer);

    // Clean up.
    DECREF(doc_set);
    DECREF(indexer);
    DECREF(fancy_str);
    DECREF(plain_str);
    DECREF(schema);
    DECREF(fancy);
    DECREF(fancy_analyzer);
    DECREF(analyzers);
    DECREF(stop_list);
    DECREF(word_pattern);
    DECREF(plain);
    DECREF(tokenizer);
    DECREF(pattern);

    return (Folder*)folder;
}