/*
 * @Function Name : ExtractTxt
 * @Description   : Get the text content of the file. Passes m_pszFileData and
 *                  m_ulFileLen to PlainTextExtractor::processFile, which stores
 *                  its output in m_strText; on success m_ulTextLen is set to
 *                  the length of m_strText.
 * @Return Value  : Operation status: true if processFile succeeded,
 *                  false otherwise (m_ulTextLen is left untouched then).
 */
bool C_HtmlFile::ExtractTxt()
{
    PlainTextExtractor extractor;
    extractor.setVerboseLogging(true);

    const bool extracted = extractor.processFile(m_pszFileData, m_ulFileLen, m_strText);
    if (extracted)
    {
        // Keep the cached length in sync with the freshly extracted text.
        m_ulTextLen = m_strText.length();
    }
    return extracted;
}
int officeparser(char* src, char* workspace,char *curpath,char* resfile) { cout << "starting process office file" << endl; char srcpath[1024]= {0}; char destpath[1024]= {0}; sprintf(srcpath,"%s/%s/%s",workspace,curpath,src); sprintf(destpath,"%s/temps/%s",workspace,resfile); PlainTextExtractor extractor; //string filename_str = filename; string text; if (!extractor.processFile(srcpath, text)) { cout << " office file processing failed..." << endl; return -1; } ofstream fout(destpath,ios::app); if(!fout) { cout << " can't open result file..." << endl; return -1; } /**/ Metadata meta; if(extractor.extractMetadata(srcpath, meta)) { tm cd = meta.creationDate(); tm lmd = meta.lastModificationDate(); fout << "Author: " << meta.author() << endl; fout << "Creation date: " << asctime(&cd) << endl; fout << "Last modified by: " << meta.lastModifiedBy() << endl; fout << "Last modification date: " << asctime(&lmd) << endl; fout << "Page count: " << meta.pageCount() << endl; fout << "Word count: " << meta.wordCount() << endl; } /**/ fout << text << endl; fout.close(); cout << " processing complete" << endl; return 0; }