int TextHandler::convert_pdf_to_text(const string & filepath, const TextID & tid, string & text) { //读取原文件 ByteBuffer data; if(false == read_file(filepath, data) ) { return ERR_READ_PDF_INPUT_FILE_FAILED; } //生成source文件 string source_file_path = m_default_file_path + tid + PDF_SOURCE_POSTFIX; if(false == write_file(source_file_path, data.GetPtr(), data.GetLength()) ) { return ERR_WRITE_PDF_SOURCE_FILE; } string pdf_ext_filepath = m_default_file_path + tid + PDF_EXT_TXT_POSTFIX; //call xpdf string cmd = "./mdata/pdf/pdftotext -cfg ./mdata/pdf/xpdf-chinese-simplified/xpdfrc -enc GBK " + source_file_path + " " + pdf_ext_filepath; system(cmd.c_str()); //读取抽取的文件 ByteBuffer tmp_data; if(false == read_file(pdf_ext_filepath, tmp_data)) { return ERR_READ_PDF_EXT_FILE_FAILED; } tmp_data.String(text); return SUCCESS; }
// tool functions bool TextHandler::read_file(const string & file_path, ByteBuffer & data) { int length = 0; ifstream file(file_path.c_str(), ios::binary); if(false == file.good()) { cerr << "Read txt result file failed, path = " << file_path << endl; return false; } file.seekg (0, ios::end); length = file.tellg(); file.seekg (0, ios::beg); data.Resize(length); file.read(data.GetPtr(), length); file.close(); return true; }
int TextHandler::convert_html_to_text(const string & filepath, const TextID & tid, string & text) { //读取原文件 ByteBuffer data; if(false == read_file(filepath, data) ) { return ERR_READ_HTML_INPUT_FILE_FAILED; } //生成source文件 string source_file_path = m_default_file_path + tid + HTML_SOURCE_POSTFIX; if(false == write_file(source_file_path, data.GetPtr(), data.GetLength()) ) { return ERR_WRITE_HTML_SOURCE_FILE; } string html_ext_filepath = m_default_file_path + tid + HTML_EXT_TXT_POSTFIX; string html_info_filepath = m_default_file_path + tid + HTML_INFO_POSTFIX; //call html parser string cmd = "java -jar ./mdata/html/htmlAnalysis.jar -g \"" + source_file_path + "\" \"" + html_ext_filepath + "\" \"" + html_info_filepath + "\""; system(cmd.c_str()); //读取抽取的文件 ByteBuffer tmp_data; if(false == read_file(html_ext_filepath, tmp_data)) { return ERR_READ_HTML_EXT_FILE_FAILED; } tmp_data.String(text); return SUCCESS; }