// Runs TrecDocumentProcessor over the "1.gz" fixture found under the test
// path and compares every produced field, in order, with the answer built
// from TEST_FILE2.
// NOTE(review): the fixture is presumably the compressed form of TEST_FILE2
// -- confirm against the test data.
//
// process() is called repeatedly for as long as docSource.toBeContinued()
// returns true; the field index keeps running across all produced documents,
// and the total number of fields seen must equal the answer size.
void TrecDocumentProcessorTestCase::testProcessGZipFile()
{
    TrecDocumentProcessor processor;
    processor.init(m_pDocSchema.get(), m_pDocTemp.get());
    
    DocumentSource docSource(m_pDocSchema.get());
    RawDocumentPtr pRawDoc = new RawDocument();
    pRawDoc->setPath(getTestPath() + "/1.gz");
    docSource.setRawDocument(pRawDoc);

    Answer ans;
    makeAnswer(TEST_FILE2, ans);

    size_t i = 0;
    do
    {
        processor.process(docSource);

        DocumentPtr pDoc = docSource.stealLastDocument();
        CPPUNIT_ASSERT(pDoc);

        Document::Iterator it = pDoc->iterator();
        while (it.hasNext())
        {
            const Field* pField = it.next();
            // Fail cleanly instead of indexing past the end of the answer
            // when the processor emits more fields than expected.
            CPPUNIT_ASSERT(i < ans.size());
            CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName());
            CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str()));
            ++i;
        }
    } while(docSource.toBeContinued());

    // Every expected field must have been produced (catches the "too few" case).
    CPPUNIT_ASSERT_EQUAL(ans.size(), i);
}
void StandardDocumentProcessorTestCase::testProcessWithEmptyField()
{
    String sPath = writeTestFile("file_with_empty_field.txt", TEST_FILE_WITH_EMPTY_FIELD);

    StandardDocumentProcessor processor;
    processor.init(m_pDocSchema.get());
    
    DocumentSource docSource(m_pDocSchema.get());
    RawDocumentPtr pRawDoc = new RawDocument();
    pRawDoc->setPath(sPath);
    docSource.setRawDocument(pRawDoc);

    processor.process(docSource);

    DocumentPtr pDoc = docSource.stealLastDocument();
    CPPUNIT_ASSERT(pDoc.isNotNull());

    Answer ans;
    makeAnswer(TEST_FILE_WITH_EMPTY_FIELD, ans);

    Document::Iterator it = pDoc->iterator();
    CPPUNIT_ASSERT_EQUAL(ans.size(), it.size());
    size_t i = 0;
    while (it.hasNext())
    {
        const Field* pField = it.next();
//        cout << ans[i].first << " : " << ans[i].second << endl;
        CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName());
        CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str()));
        ++i;
    }
}
void TrecDocumentProcessorTestCase::testProcessFileMisField()
{
    string sPath = writeTestFile("trec_file3.txt", TEST_FILE_MISS_FIELD);

    TrecDocumentProcessor processor;
    processor.init(m_pDocSchema.get(), m_pDocTemp.get());
    
    DocumentSource docSource(m_pDocSchema.get());
    RawDocumentPtr pRawDoc = new RawDocument();
    pRawDoc->setPath(sPath);
    docSource.setRawDocument(pRawDoc);

    processor.process(docSource);

    DocumentPtr pDoc = docSource.stealLastDocument();
    CPPUNIT_ASSERT(pDoc);

    Answer ans;
    makeAnswer(TEST_FILE_MISS_FIELD, ans);

    Document::Iterator it = pDoc->iterator();
    CPPUNIT_ASSERT_EQUAL(ans.size(), it.size());
    size_t i = 0;
    while (it.hasNext())
    {
        const Field* pField = it.next();
//        cout << ans[i].first << " : " << ans[i].second << endl;
        CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName());
        CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str()));
        ++i;
    }
}
// Writes TEST_FILE2 to "file2.txt" and runs StandardDocumentProcessor over
// it, calling process() repeatedly for as long as docSource.toBeContinued()
// returns true. Every field of every produced document is compared, in
// order, with the answer built from TEST_FILE2; the field index keeps
// running across documents and must end up equal to the answer size.
void StandardDocumentProcessorTestCase::testProcessMultiFile()
{
    string sPath = writeTestFile("file2.txt", TEST_FILE2);

    StandardDocumentProcessor processor;
    processor.init(m_pDocSchema.get());
    
    DocumentSource docSource(m_pDocSchema.get());

    Answer ans;
    makeAnswer(TEST_FILE2, ans);
    RawDocumentPtr pRawDoc = new RawDocument();
    pRawDoc->setPath(sPath);
    docSource.setRawDocument(pRawDoc);

    size_t i = 0;
    do
    {
        processor.process(docSource);

        DocumentPtr pDoc = docSource.stealLastDocument();
        CPPUNIT_ASSERT(pDoc.isNotNull());

        Document::Iterator it = pDoc->iterator();
        while (it.hasNext())
        {
            const Field* pField = it.next();
            // Fail cleanly instead of indexing past the end of the answer
            // when the processor emits more fields than expected.
            CPPUNIT_ASSERT(i < ans.size());
            CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName());
            CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str()));
            ++i;
        }
    } while(docSource.toBeContinued());

    // Every expected field must have been produced (catches the "too few" case).
    CPPUNIT_ASSERT_EQUAL(ans.size(), i);
}
void TrecDocumentProcessorTestCase::testProcessTrecFileWithMeta()
{
    m_pDocSchema = new DocumentSchema();
    m_pDocSchema->addTextField("field1");
    m_pDocSchema->addTextField("field2");
    m_pDocSchema->addTextField("url");
    m_pDocSchema->addTextField("date");
    m_pDocSchema->addTextField("content_type");
    m_pDocSchema->addTextField("content_length");
    m_pDocSchema->addTextField("last_modified");
    m_pDocSchema->addTextField("title");
    m_pDocSchema->addTextField("body");

    m_pDocTemp = new DocumentTemplate();
    m_pDocTemp->setDocTag("DOC");
    m_pDocTemp->setEmbeddedField("body");

    m_pDocTemp->addMeta("url", "url");
    m_pDocTemp->addMeta("Date", "date");
    m_pDocTemp->addMeta("Content-Type", "content_type");
    m_pDocTemp->addMeta("Content-Length", "content_length");
    m_pDocTemp->addMeta("Last-Modified", "last_modified");
    m_pDocTemp->addMeta("title", "title");

    m_pDocTemp->addTag("field1", "field1");
    m_pDocTemp->addTag("field2", "field2");
    m_pDocTemp->addTag("DOCHDR");

    m_pDocTemp->makeSure(m_pDocSchema.get());

    string sPath = writeTestFile("trec_file3.txt", TEST_FILE3);

    TrecDocumentProcessor processor;
    processor.setContentType("html");
    processor.init(m_pDocSchema.get(), m_pDocTemp.get());
    
    DocumentSource docSource(m_pDocSchema.get());

    RawDocumentPtr pRawDoc = new RawDocument();
    pRawDoc->setPath(sPath);
    docSource.setRawDocument(pRawDoc);

    processor.process(docSource);

    DocumentPtr pDoc = docSource.stealLastDocument();
    CPPUNIT_ASSERT(pDoc);

    Document::Iterator it = pDoc->iterator();
    CPPUNIT_ASSERT(it.hasNext());
    const Field* pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("value1"), string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("value2"), string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("http://www.ram.org:80/ramblings/movies/jimmy_hollywood.html"),
                         string(pField->getValue().c_str()));
    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("Wednesday, 01-Jan-97 15:20:23 GMT"),
                         string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("text/html"),
                         string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("1873"),
                         string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string("Thursday, 23-Nov-95 03:11:57 GMT"),
                         string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string(" Jimmy Hollywood movie review "),
                         string(pField->getValue().c_str()));

    CPPUNIT_ASSERT(it.hasNext());
    pField = it.next();
    CPPUNIT_ASSERT_EQUAL(string(" Jimmy Hollywood "),
                         string(pField->getValue().c_str()));
}