Пример #1
0
// Read data from file, update the index
void updateIndex(string data_file, Indexer *index, Analyzer* analyzer)
{
    /// Set up the Schema
    Schema *schema = Schema::create(srch2is::DefaultIndex);
    schema->setPrimaryKey("primaryKey");
    schema->setSearchableAttribute("description", 2);
    schema->setScoringExpression("idf_score*doc_boost");

    Record *record = new Record(schema);

    unsigned docsCounter = 0;
    string line;

    ifstream data(data_file.c_str());

    /// Read records from file
    /// the file should have two fields, seperated by '^'
    /// the first field is the primary key, the second field is a searchable attribute
    while(getline(data,line))
    {
        unsigned cellCounter = 0;
        stringstream  lineStream(line);
        string cell;

        while(getline(lineStream,cell,'^') && cellCounter < 3 )
        {
            if (cellCounter == 0)
            {
                record->setPrimaryKey(cell.c_str());
            }
            else if (cellCounter == 1)
            {
                record->setSearchableAttributeValue(0, cell);
            }
            else
            {
                float recordBoost = atof(cell.c_str());
                record->setRecordBoost(recordBoost);
            }

            cellCounter++;
        }

        index->addRecord(record, analyzer);

        docsCounter++;

        record->clear();
    }

    cout << "#New Docs Inserted:" << docsCounter << endl;

    sleep(4);
    cout << "Index updated." << endl;

    data.close();

    delete schema;
}
// Read data from file, update the index
void updateIndex(string data_file, Indexer *index)
{
    /// Set up the Schema
    Schema *schema = Schema::create(srch2is::DefaultIndex);
    schema->setPrimaryKey("primaryKey");
    schema->setSearchableAttribute("description", 2);

    Record *record = new Record(schema);

    unsigned docsCounter = 0;
    string line;

    ifstream data(data_file.c_str());

    /// Read records from file
    /// the file should have two fields, seperated by '^'
    /// the first field is the primary key, the second field is a searchable attribute
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, NULL, "", srch2is::STANDARD_ANALYZER);
    while(getline(data,line))
    {
        unsigned cellCounter = 0;
        stringstream  lineStream(line);
        string cell;

        while(getline(lineStream,cell,'^') && cellCounter < 2 )
        {
            if (cellCounter == 0)
            {
                record->setPrimaryKey(cell.c_str());
            }
            else
            {
                record->setSearchableAttributeValue(0, cell);
            }

            cellCounter++;
        }
        index->addRecord(record, analyzer);
        docsCounter++;

        record->clear();
    }

    cout << "#Docs Read:" << docsCounter << endl;

    index->commit();
    sleep(11);
    cout << "Index updated." << endl;

    data.close();

    delete schema;
    delete analyzer;
}
Пример #3
0
// Read data from file, build the index, and save the index to disk
void buildIndex(string data_file, string index_dir)
{
    /// Set up the Schema
    Schema *schema = Schema::create(srch2is::DefaultIndex);
    schema->setPrimaryKey("primaryKey");
    schema->setSearchableAttribute("description", 2);
    schema->setScoringExpression("idf_score*doc_boost");

    /// Create an Analyzer
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, NULL, "",
                                      srch2is::STANDARD_ANALYZER);

    /// Create an index writer
    unsigned mergeEveryNSeconds = 2;
    unsigned mergeEveryMWrites = 5;
    unsigned updateHistogramEveryPMerges = 1;
    unsigned updateHistogramEveryQWrites = 5;
    IndexMetaData *indexMetaData = new IndexMetaData( new CacheManager(),
    		mergeEveryNSeconds, mergeEveryMWrites,
    		updateHistogramEveryPMerges, updateHistogramEveryQWrites, index_dir);
    Indexer *indexer = Indexer::create(indexMetaData, analyzer, schema);

    Record *record = new Record(schema);

    unsigned docsCounter = 0;
    string line;

    ifstream data(data_file.c_str());

    /// Read records from file
    /// the file should have two fields, seperated by '^'
    /// the first field is the primary key, the second field is a searchable attribute
    while(getline(data,line))
    {
        unsigned cellCounter = 0;
        stringstream  lineStream(line);
        string cell;

        while(getline(lineStream,cell,'^') && cellCounter < 3 )
        {
            if (cellCounter == 0)
            {
                record->setPrimaryKey(cell.c_str());
            }
            else if (cellCounter == 1)
            {
                record->setSearchableAttributeValue(0, cell);
            }
            else
            {
                float recordBoost = atof(cell.c_str());
                record->setRecordBoost(recordBoost);
            }

            cellCounter++;
        }

        indexer->addRecord(record, analyzer);

        docsCounter++;

        record->clear();
    }

    cout << "#Docs Read:" << docsCounter << endl;

    indexer->commit();
    indexer->save();

    cout << "Index saved." << endl;

    data.close();

    delete indexer;
    delete indexMetaData;
    delete analyzer;
    delete schema;
}
Пример #4
0
void addRecords()
{
    ///Create Schema
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);
    schema->setPrimaryKey("article_id"); // integer, not searchable
    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text
    
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();
    Record *record = new Record(schema);
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");

    unsigned mergeEveryNSeconds = 3;
    unsigned mergeEveryMWrites = 5;
    unsigned updateHistogramEveryPMerges = 1;
    unsigned updateHistogramEveryQWrites = 5;
    string INDEX_DIR = ".";
    IndexMetaData *indexMetaData = new IndexMetaData( NULL,
    		mergeEveryNSeconds, mergeEveryMWrites,
    		updateHistogramEveryPMerges, updateHistogramEveryQWrites,
    		INDEX_DIR);
    Indexer *index = Indexer::create(indexMetaData, analyzer, schema);
    
    record->setPrimaryKey(1001);
    record->setSearchableAttributeValue("article_authors", "Tom Smith and Jack Lennon");
    record->setSearchableAttributeValue("article_title", "come Yesterday Once More");
    record->setRecordBoost(10);
    index->addRecord(record, analyzer);

    record->clear();
    record->setPrimaryKey(1008);
    record->setSearchableAttributeValue(0, "Jimi Hendrix");
    record->setSearchableAttributeValue(1, "Little wing");
    record->setRecordBoost(90);
    index->addRecord(record, analyzer);

    index->commit();
    //index->commit();
    //index->print_Index();

    std::cout << "print 1 $$$$$$$$$$$$$$" << std::endl;

    record->clear();
    record->setPrimaryKey(1007);
    record->setSearchableAttributeValue(0, "Jimaai Hendaarix");
    record->setSearchableAttributeValue(1, "Littaale waaing");
    record->setRecordBoost(90);
    index->addRecord(record, analyzer);

    //index->print_Index();

    std::cout << "print 2 $$$$$$$$$$$$$$" << std::endl;

    delete schema;
    delete record;
    delete analyzer;
    delete index;
    syn->free();
}
Пример #5
0
void testIndexData()
{
    /// Create Schema
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);
    schema->setPrimaryKey("article_id"); // integer, not searchable
    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text

    /// Create Analyzer
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();

    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");

    /// Create IndexData
    string INDEX_DIR = ".";
    IndexData *indexData = IndexData::create(INDEX_DIR,
                                            analyzer,
                                            schema,
                                            srch2::instantsearch::DISABLE_STEMMER_NORMALIZER);

    Record *record = new Record(schema);

    record->setPrimaryKey(1001);
    record->setSearchableAttributeValue("article_authors", "Tom Smith and Jack Lennon");
    record->setSearchableAttributeValue("article_title", "come Yesterday Once More");
    record->setRecordBoost(10);
    indexData->_addRecord(record, analyzer);

    record->clear();
    record->setPrimaryKey(1008);
    record->setSearchableAttributeValue(0, "Jimi Hendrix");
    record->setSearchableAttributeValue(1, "Little wing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);

    indexData->finishBulkLoad();
    //index->print_Index();

    record->clear();
    record->setPrimaryKey(1007);
    record->setSearchableAttributeValue(0, "Jimaai Hendaarix");
    record->setSearchableAttributeValue(1, "Littaale waaing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);

    //index->print_Index();

    /// test Trie
    Trie_Internal *trie = indexData->trie;

    typedef boost::shared_ptr<TrieRootNodeAndFreeList > TrieRootNodeSharedPtr;
    TrieRootNodeSharedPtr rootSharedPtr;
    trie->getTrieRootNode_ReadView(rootSharedPtr);
    TrieNode *root = rootSharedPtr->root;

    (void)(root);

    ASSERT( trie->getTrieNodeFromUtf8String( root, "and")->getId() < trie->getTrieNodeFromUtf8String( root, "come")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "come")->getId() < trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() < trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jack")->getId() < trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimi")->getId() < trie->getTrieNodeFromUtf8String( root, "lennon")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "lennon")->getId() < trie->getTrieNodeFromUtf8String( root, "little")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "little")->getId() < trie->getTrieNodeFromUtf8String( root, "more")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "more")->getId() < trie->getTrieNodeFromUtf8String( root, "once")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "once")->getId() < trie->getTrieNodeFromUtf8String( root, "smith")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "smith")->getId() < trie->getTrieNodeFromUtf8String( root, "tom")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "tom")->getId() < trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "wing")->getId() < trie->getTrieNodeFromUtf8String( root, "yesterday")->getId() );

    // we assume that there is no background thread does merge,
    // or even if there is such a background thread, it didn't have a chance to do the merge
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimaai") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Hendaarix") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Littaale") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "waaing") == NULL );

    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMinId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );

    /// test ForwardIndex
    ForwardIndex *forwardIndex = indexData->forwardIndex;
    shared_ptr<vectorview<ForwardListPtr> > forwardListDirectoryReadView;
    forwardIndex->getForwardListDirectory_ReadView(forwardListDirectoryReadView);
    float score = 0;
    unsigned keywordId = 1;
    // define the attributeBitmap only in debug mode
#if ASSERT_LEVEL > 0
    vector<unsigned> attributeBitmap;
#endif
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0,
    		trie->getTrieNodeFromUtf8String( root, "jack")->getId(),
    		trie->getTrieNodeFromUtf8String( root, "lennon")->getId(),
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == true );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0,
    		trie->getTrieNodeFromUtf8String( root, "smith")->getId() + 1,
    		trie->getTrieNodeFromUtf8String( root, "tom")->getId() - 1,
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == false );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1,
    		trie->getTrieNodeFromUtf8String( root, "hendrix")->getId(),
    		trie->getTrieNodeFromUtf8String( root, "jimi")->getId(),
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == true );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1,
    		trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 1,
    		trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 2,
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == false );

    /// test InvertedIndex
    InvertedIndex *invertedIndex = indexData->invertedIndex;

    (void)(forwardIndex);
    (void)(invertedIndex);
    (void)score;
    (void)keywordId;

    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "and")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "come")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "hendrix")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jack")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jimi")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "lennon")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "little")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "more")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "once")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "smith")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "tom")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "wing")->getInvertedListOffset() ) == 1);


    delete schema;
    delete record;
    delete analyzer;
    delete indexData;
    syn->free();
}
Пример #6
0
void test1()
{
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);

    schema->setPrimaryKey("article_id"); // integer, not searchable

    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text

    // create an analyzer
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");
    
    unsigned mergeEveryNSeconds = 3;
    unsigned mergeEveryMWrites = 5;
    unsigned updateHistogramEveryPMerges = 1;
    unsigned updateHistogramEveryQWrites = 5;
    string INDEX_DIR = "test";
    IndexMetaData *indexMetaData = new IndexMetaData( new CacheManager(),
    		mergeEveryNSeconds, mergeEveryMWrites,
    		updateHistogramEveryPMerges, updateHistogramEveryQWrites,
    		INDEX_DIR);
    
    Indexer *index = Indexer::create(indexMetaData, analyzer, schema);
    Record *record = new Record(schema);
    char* authorsCharStar = new char[30];
    char* titleCharStar = new char[30];

    //generate random characers
    srand ( time(NULL) );
    // create a record of 3 attributes
    for (unsigned i = 0; i < 1000; i++)
    {
        record->setPrimaryKey(i + 1000);

        sprintf(authorsCharStar,"John %cLen%cnon",(rand() % 50)+65,(rand() % 10)+65);
        string authors = string(authorsCharStar);
        record->setSearchableAttributeValue("article_authors", authors);

        sprintf(titleCharStar,"Yesterday %cOnc%ce %cMore",
                (rand()%59)+65, (rand()%59)+65, (rand()%10)+65);
        string title = string(titleCharStar);
        record->setSearchableAttributeValue("article_title", title);

        record->setRecordBoost(rand() % 100);
        index->addRecord(record, analyzer);

        // for creating another record
        record->clear();
    }

    // build the index
    index->commit();

    //indexer->printNumberOfBytes();

    delete[] authorsCharStar;
    delete[] titleCharStar;
    delete record;
    delete index;
    delete analyzer;
    delete schema;
}
Пример #7
0
int main (int argc, char** argv)
{
  UnitTest t (21);

  // (blank)
  bool good = true;
  Record record;

  try {record = Record ("");}
  catch (std::string& e){t.diag (e); good = false;}
  t.notok (good, "Record::Record ('')");

  // []
  good = true;
  try {record = Record ("[]");}
  catch (std::string& e){t.diag (e); good = false;}
  t.notok (good, "Record::Record ('[]')");

  // [name:value]
  good = true;
  try {record = Record ("[name:value]");}
  catch (std::string& e){t.diag (e); good = false;}
  t.ok (good, "Record::Record ('[name:value]')");
  t.is (record.get ("name"), "value", "name=value");

  // [name:"value"]
  good = true;
  try {record = Record ("[name:\"value\"]");}
  catch (std::string& e){t.diag (e); good = false;}
  t.ok (good, "Record::Record ('[name:\"value\"]')");
  t.is (record.get ("name"), "value", "name=value");

  // [name:"one two"]
  good = true;
  try {record = Record ("[name:\"one two\"]");}
  catch (std::string& e){t.diag (e); good = false;}
  t.ok (good, "Record::Record ('[name:\"one two\"]')");
  t.is (record.get ("name"), "one two", "name=one two");

  // [one:two three:four]
  good = true;
  try {record = Record ("[one:\"two\" three:\"four\"]");}
  catch (std::string& e){t.diag (e); good = false;}
  t.ok (good, "Record::Record ('[one:\"two\" three:\"four\"]')");
  t.is (record.get ("one"), "two", "one=two");
  t.is (record.get ("three"), "four", "three=four");

  // Record::set
  record.clear ();
  record.set ("name", "value");
  t.is (record.composeF4 (), "[name:\"value\"]\n", "Record::set");

  // Record::has
  t.ok    (record.has ("name"), "Record::has");
  t.notok (record.has ("woof"), "Record::has not");

  // Record::get_int
  record.set ("one", 1);
  t.is (record.composeF4 (), "[name:\"value\" one:\"1\"]\n", "Record::set");
  t.is (record.get_int ("one"), 1, "Record::get_int");

  // Record::get_ulong
  record.set ("two", "4294967295");
  t.is (record.composeF4 (), "[name:\"value\" one:\"1\" two:\"4294967295\"]\n", "Record::set");
  t.is ((size_t)record.get_ulong ("two"), (size_t)4294967295, "Record::get_ulong");

  // Record::remove
  record.remove ("one");
  record.remove ("two");
  t.is (record.composeF4 (), "[name:\"value\"]\n", "Record::remove");

  // Record::all
  std::vector <Att> all = record.all ();
  t.is (all.size (), (size_t)1, "Record::all size");
  t.is (all[0].name (), "name", "Record::all[0].name ()");

  return 0;
}
Пример #8
0
// Read data from file, update the index
void updateIndexAndLookupRecord(string data_file, Indexer *index)
{
    /// Set up the Schema
    Schema *schema = Schema::create(srch2is::DefaultIndex);
    schema->setPrimaryKey("primaryKey");
    schema->setSearchableAttribute("description", 2);

    Record *record = new Record(schema);

    unsigned docsCounter = 0;
    string line;

    ifstream data(data_file.c_str());

    string recordToLookup;

    /// Read one record from file to insert
    /// the file should have two fields, seperated by '^'
    /// the first field is the primary key, the second field is a searchable attribute
    if ( getline(data,line) )
    {
        unsigned cellCounter = 0;
        stringstream  lineStream(line);
        string cell;

        while(getline(lineStream,cell,'^') && cellCounter < 3 )
        {
            if (cellCounter == 0)
            {
                record->setPrimaryKey(cell.c_str());

                recordToLookup = cell;

                // test looking up a record that doesn't exist
                ASSERT(index->lookupRecord(recordToLookup) == LU_ABSENT_OR_TO_BE_DELETED);
            }
            else if (cellCounter == 1)
            {
                record->setSearchableAttributeValue(0, cell);
            }
            else
            {
                float recordBoost = atof(cell.c_str());
                record->setRecordBoost(recordBoost);
            }

            cellCounter++;
        }
        Analyzer* analyzer = getAnalyzer();
        index->addRecord(record, analyzer);
        delete analyzer;

        // test looking up a record that was just inserted and NOT merged yet
        ASSERT(index->lookupRecord(recordToLookup) == LU_TO_BE_INSERTED);

        docsCounter++;

        record->clear();
    }

    cout << "#New Docs Inserted:" << docsCounter << endl;

    sleep(11);

    // test looking up a record that was merged
    ASSERT(index->lookupRecord(recordToLookup) == LU_PRESENT_IN_READVIEW_AND_WRITEVIEW);

    cout << "Index updated." << endl;

    data.close();

    delete schema;
}
Indexer *buildIndex(string data_file, string index_dir, string expression)
{
    /// Set up the Schema
    Schema *schema = Schema::create(srch2is::DefaultIndex, srch2::instantsearch::POSITION_INDEX_FIELDBIT);
    schema->setPrimaryKey("id");
    schema->setSearchableAttribute("name", 2);
    schema->setSearchableAttribute("category", 1);
    schema->setScoringExpression(expression);

    /// Create an Analyzer
    SynonymContainer *syn = SynonymContainer::getInstance(string(""), SYNONYM_DONOT_KEEP_ORIGIN);
    ProtectedWordsContainer *prot = ProtectedWordsContainer::getInstance("");
    AnalyzerInternal *simpleAnlyzer = new StandardAnalyzer(NULL, NULL, prot, syn, string(""));

    Analyzer *analyzer = new Analyzer(NULL, NULL, prot, syn, "", srch2is::STANDARD_ANALYZER);

    /// Create an index writer
    unsigned mergeEveryNSeconds = 3;
    unsigned mergeEveryMWrites = 5;
    unsigned updateHistogramEveryPMerges = 1;
    unsigned updateHistogramEveryQWrites = 5;
    IndexMetaData *indexMetaData = new IndexMetaData( new CacheManager(),
    		mergeEveryNSeconds, mergeEveryMWrites,
    		updateHistogramEveryPMerges, updateHistogramEveryQWrites,
    		index_dir);
    Indexer *indexer = Indexer::create(indexMetaData, analyzer, schema);

    Record *record = new Record(schema);

    unsigned docsCounter = 0;
    string line;

    ifstream data(data_file.c_str());

    /// Read records from file
    /// the file should have two fields, seperated by '^'
    /// the first field is the primary key, the second field is a searchable attribute
    while(getline(data,line))
    {
        unsigned cellCounter = 0;
        stringstream  lineStream(line);
        string cell;

        while(getline(lineStream,cell,'^') && cellCounter < 4 )
        {
            if (cellCounter == 0)
            {
                record->setPrimaryKey(cell.c_str());
            }
            else if (cellCounter == 1)
            {
                record->setSearchableAttributeValue(0, cell);
            }
            else if (cellCounter == 2)
            {
                record->setSearchableAttributeValue(1, cell);
            }
            else if (cellCounter == 3)
            {
                record->setRecordBoost(atof(cell.c_str()));
            }

            cellCounter++;
        }

        indexer->addRecord(record, analyzer);

        docsCounter++;

        record->clear();
    }

    cout << "#Docs Read:" << docsCounter << endl;

    indexer->commit();

    data.close();

    delete record;
    delete analyzer;
    delete schema;
    prot->free();
    syn->free();

    return indexer;
}
Пример #10
0
/**
 * Keyword: Record
 *   Pink:  2, 5, 6
 *   Pinksyponzi: 4
 *
 *   Floyd: 2, 6
 *   Floydsyponzi: 4
 *
 *   Shine: 2, 3
 *   Shinesyponzi: 4
 */
void addRecords() {
    ///Create Schema
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);
    schema->setPrimaryKey("article_id"); // integer, not searchable
    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text

    Record *record = new Record(schema);
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");

    unsigned mergeEveryNSeconds = 3;
    unsigned mergeEveryMWrites = 5;
    unsigned updateHistogramEveryPMerges = 1;
    unsigned updateHistogramEveryQWrites = 5;
    srch2is::IndexMetaData *indexMetaData = new srch2is::IndexMetaData(NULL,
            mergeEveryNSeconds, mergeEveryMWrites,
            updateHistogramEveryPMerges, updateHistogramEveryQWrites,
            INDEX_DIR);
    srch2is::Indexer *index = srch2is::Indexer::create(indexMetaData, analyzer,
            schema);

    record->setPrimaryKey(1001);
    record->setSearchableAttributeValue("article_authors",
            "Tom Smith and Jack Lennon");
    record->setSearchableAttributeValue("article_title",
            "come Yesterday Once More");
    record->setRecordBoost(10);
	string testRecord = "test string";
	record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);

    record->clear();
    record->setPrimaryKey(1002);
    record->setSearchableAttributeValue(0, "George Harris");
    record->setSearchableAttributeValue(1, "Here comes the sun");
    record->setRecordBoost(20);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1003);
    record->setSearchableAttributeValue(0, "Pink Floyd");
    record->setSearchableAttributeValue(1, "Shine on you crazy diamond");
    record->setRecordBoost(30);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1004);
    record->setSearchableAttributeValue(0, "Uriah Hepp");
    record->setSearchableAttributeValue(1, "Come Shine away Melinda ");
    record->setRecordBoost(40);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1005);
    record->setSearchableAttributeValue(0, "Pinksyponzi Floydsyponzi");
    record->setSearchableAttributeValue(1,
            "Shinesyponzi on Wish you were here");
    record->setRecordBoost(50);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1006);
    record->setSearchableAttributeValue(0, "U2 2345 Pink");
    record->setSearchableAttributeValue(1, "with or without you");
    record->setRecordBoost(60);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1007);
    record->setSearchableAttributeValue(0, "Led Zepplelin");
    record->setSearchableAttributeValue(1, "Stairway to Heaven pink floyd");
    record->setRecordBoost(80);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    record->clear();
    record->setPrimaryKey(1008);
    record->setSearchableAttributeValue(0, "Jimi Hendrix");
    record->setSearchableAttributeValue(1, "Little wing");
    record->setRecordBoost(90);
    record->setInMemoryData(testRecord.c_str(), testRecord.length());
    index->addRecord(record, analyzer);
    ///TODO: Assert that This record must not be added
    /// 1) Repeat of primary key
    /// 2) Check when adding junk data liek &*^#^%%
    /*record->clear();
     record->setPrimaryKey(1001);
     record->setAttributeValue(0, "jimi pink");
     record->setAttributeValue(1, "comes junk 2345 $%^^#");
     record->setBoost(100);
     index->addRecordBeforeCommit(record);*/
    index->commit();
    index->save();

    delete schema;
    delete record;
    delete analyzer;
    delete index;
}