void TrecDocumentProcessorTestCase::testProcessGZipFile() { TrecDocumentProcessor processor; processor.init(m_pDocSchema.get(), m_pDocTemp.get()); DocumentSource docSource(m_pDocSchema.get()); RawDocumentPtr pRawDoc = new RawDocument(); pRawDoc->setPath(getTestPath() + "/1.gz"); docSource.setRawDocument(pRawDoc); Answer ans; makeAnswer(TEST_FILE2, ans); size_t i = 0; do { processor.process(docSource); DocumentPtr pDoc = docSource.stealLastDocument(); CPPUNIT_ASSERT(pDoc); Document::Iterator it = pDoc->iterator(); while (it.hasNext()) { const Field* pField = it.next(); CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName()); CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str())); ++i; } } while(docSource.toBeContinued()); CPPUNIT_ASSERT_EQUAL(ans.size(), i); }
void TrecDocumentProcessorTestCase::testProcessFileMisField() { string sPath = writeTestFile("trec_file3.txt", TEST_FILE_MISS_FIELD); TrecDocumentProcessor processor; processor.init(m_pDocSchema.get(), m_pDocTemp.get()); DocumentSource docSource(m_pDocSchema.get()); RawDocumentPtr pRawDoc = new RawDocument(); pRawDoc->setPath(sPath); docSource.setRawDocument(pRawDoc); processor.process(docSource); DocumentPtr pDoc = docSource.stealLastDocument(); CPPUNIT_ASSERT(pDoc); Answer ans; makeAnswer(TEST_FILE_MISS_FIELD, ans); Document::Iterator it = pDoc->iterator(); CPPUNIT_ASSERT_EQUAL(ans.size(), it.size()); size_t i = 0; while (it.hasNext()) { const Field* pField = it.next(); // cout << ans[i].first << " : " << ans[i].second << endl; CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName()); CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str())); ++i; } }
void StandardDocumentProcessorTestCase::testProcessWithEmptyField() { String sPath = writeTestFile("file_with_empty_field.txt", TEST_FILE_WITH_EMPTY_FIELD); StandardDocumentProcessor processor; processor.init(m_pDocSchema.get()); DocumentSource docSource(m_pDocSchema.get()); RawDocumentPtr pRawDoc = new RawDocument(); pRawDoc->setPath(sPath); docSource.setRawDocument(pRawDoc); processor.process(docSource); DocumentPtr pDoc = docSource.stealLastDocument(); CPPUNIT_ASSERT(pDoc.isNotNull()); Answer ans; makeAnswer(TEST_FILE_WITH_EMPTY_FIELD, ans); Document::Iterator it = pDoc->iterator(); CPPUNIT_ASSERT_EQUAL(ans.size(), it.size()); size_t i = 0; while (it.hasNext()) { const Field* pField = it.next(); // cout << ans[i].first << " : " << ans[i].second << endl; CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName()); CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str())); ++i; } }
void StandardDocumentProcessorTestCase::testProcessMultiFile() { string sPath = writeTestFile("file2.txt", TEST_FILE2); StandardDocumentProcessor processor; processor.init(m_pDocSchema.get()); DocumentSource docSource(m_pDocSchema.get()); Answer ans; makeAnswer(TEST_FILE2, ans); RawDocumentPtr pRawDoc = new RawDocument(); pRawDoc->setPath(sPath); docSource.setRawDocument(pRawDoc); size_t i = 0; do { processor.process(docSource); DocumentPtr pDoc = docSource.stealLastDocument(); CPPUNIT_ASSERT(pDoc.isNotNull()); Document::Iterator it = pDoc->iterator(); while (it.hasNext()) { const Field* pField = it.next(); CPPUNIT_ASSERT_EQUAL(ans[i].first, pField->getFieldSchema()->getName()); CPPUNIT_ASSERT_EQUAL(ans[i].second, std::string(pField->getValue().c_str())); ++i; } } while(docSource.toBeContinued()); CPPUNIT_ASSERT_EQUAL(ans.size(), i); }
/**
 * Advances the directory iterator to the next acceptable entry and stores
 * its path in the given raw document.
 *
 * An entry is skipped when isFile() is false for it, or when a file filter
 * is set and reports the iterator's current path as filtered.
 *
 * @param pRawDoc raw document whose path is set to the fetched file's path
 * @return true if an entry was accepted and pRawDoc was updated; false once
 *         the directory iterator has no more entries
 */
bool DirectoryFileFetcher::fetchNext(RawDocumentPtr& pRawDoc)
{
    while (m_pDirIterator->hasNext())
    {
        const File& entry = m_pDirIterator->next();

        bool bAccepted = entry.isFile();
        if (bAccepted)
        {
            if (m_pFileFilter)
            {
                // The filter is consulted with the iterator's current path.
                const Path& entryPath = m_pDirIterator->getPath();
                if (m_pFileFilter->isFiltered(entryPath))
                {
                    FX_DEBUG("File : [%s] filtered", entryPath.toString().c_str());
                    bAccepted = false;
                }
            }
        }

        if (bAccepted)
        {
            FX_DEBUG("Fetching file : [%s].", entry.getPath().c_str());
            pRawDoc->setPath(entry.getPath());
            return true;
        }
    }

    // Iterator exhausted without finding an acceptable file.
    return false;
}
void TrecDocumentProcessorTestCase::testProcessTrecFileWithMeta() { m_pDocSchema = new DocumentSchema(); m_pDocSchema->addTextField("field1"); m_pDocSchema->addTextField("field2"); m_pDocSchema->addTextField("url"); m_pDocSchema->addTextField("date"); m_pDocSchema->addTextField("content_type"); m_pDocSchema->addTextField("content_length"); m_pDocSchema->addTextField("last_modified"); m_pDocSchema->addTextField("title"); m_pDocSchema->addTextField("body"); m_pDocTemp = new DocumentTemplate(); m_pDocTemp->setDocTag("DOC"); m_pDocTemp->setEmbeddedField("body"); m_pDocTemp->addMeta("url", "url"); m_pDocTemp->addMeta("Date", "date"); m_pDocTemp->addMeta("Content-Type", "content_type"); m_pDocTemp->addMeta("Content-Length", "content_length"); m_pDocTemp->addMeta("Last-Modified", "last_modified"); m_pDocTemp->addMeta("title", "title"); m_pDocTemp->addTag("field1", "field1"); m_pDocTemp->addTag("field2", "field2"); m_pDocTemp->addTag("DOCHDR"); m_pDocTemp->makeSure(m_pDocSchema.get()); string sPath = writeTestFile("trec_file3.txt", TEST_FILE3); TrecDocumentProcessor processor; processor.setContentType("html"); processor.init(m_pDocSchema.get(), m_pDocTemp.get()); DocumentSource docSource(m_pDocSchema.get()); RawDocumentPtr pRawDoc = new RawDocument(); pRawDoc->setPath(sPath); docSource.setRawDocument(pRawDoc); processor.process(docSource); DocumentPtr pDoc = docSource.stealLastDocument(); CPPUNIT_ASSERT(pDoc); Document::Iterator it = pDoc->iterator(); CPPUNIT_ASSERT(it.hasNext()); const Field* pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("value1"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("value2"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("http://www.ram.org:80/ramblings/movies/jimmy_hollywood.html"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); 
CPPUNIT_ASSERT_EQUAL(string("Wednesday, 01-Jan-97 15:20:23 GMT"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("text/html"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("1873"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string("Thursday, 23-Nov-95 03:11:57 GMT"), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string(" Jimmy Hollywood movie review "), string(pField->getValue().c_str())); CPPUNIT_ASSERT(it.hasNext()); pField = it.next(); CPPUNIT_ASSERT_EQUAL(string(" Jimmy Hollywood "), string(pField->getValue().c_str())); }