// on_load_text_2 int TextHandler::convert_txt_to_text(const string & filepath, string & text) { ByteBuffer data; if(false == read_file(filepath, data) ) { return ERR_READ_TXT_INPUT_FILE_FAILED; } data.String(text); return SUCCESS; }
// Convert a Word document to plain text: shell out to an external Java
// extractor that writes an XML file into the word workspace, then parse
// that XML and concatenate the text of every <document>/<p> element,
// one paragraph per line, into `text`.
// Returns SUCCESS, ERR_READ_WORD_EXT_FILE_FAILED (extractor output
// missing/unreadable), or ERR_PARSE_WORD_XML.
int TextHandler::convert_word_to_text(const string & filepath, const TextID & tid, const string & file_type, string & text)
{
    // Build the extractor command line.
    // NOTE(review): filepath / file_type / tid / m_word_workspace are
    // interpolated into a shell command with only double quotes around them —
    // embedded quotes or shell metacharacters would escape; confirm callers
    // sanitize these values (command-injection risk).
    string cmd = "java -jar ./mdata/word/WordDocumentExtractor.jar parse -t \"" + file_type + "\" -i \"" + filepath + "\" -m \"" + m_word_workspace + "/" + tid + ".word.tmp\" -x \"" + m_word_workspace + "/" + tid + ".word.xml\"";
    // NOTE(review): system()'s return value is ignored; a failed extraction
    // is only detected indirectly when read_file() below cannot open the
    // output file.
    system(cmd.c_str());
    // Read the extractor's XML output file.
    string word_ext_result_file = m_word_workspace + "/" + tid + ".word.xml";
    ByteBuffer data;
    if(false == read_file(word_ext_result_file, data) ) {
        return ERR_READ_WORD_EXT_FILE_FAILED;
    }
    // Parse the extracted XML and collect paragraph text.
    stringstream ss;
    TiXmlDocument xmldoc;
    xmldoc.Parse(data.String().c_str());
    TiXmlHandle docHandle( &xmldoc );
    try {
        size_t idx = 0;
        // Handle-based lookup: each <p> child is re-queried by index, which
        // safely yields a null element when idx runs past the last paragraph.
        TiXmlElement * elem = docHandle.FirstChild("document").Child("p", idx).ToElement();
        while(elem) {
            const char * tmp = elem->GetText();  // NULL for an empty <p>
            if(tmp)
                ss << tmp << endl;
            else
                ss << endl;  // keep empty paragraphs as blank lines
            ++idx;
            elem = docHandle.FirstChild("document").Child("p", idx).ToElement();
        }
    }catch (...) {
        // NOTE(review): TinyXML does not normally throw; this guard looks
        // defensive — confirm whether it is still needed.
        return ERR_PARSE_WORD_XML;
    }
    // Release XML resources.
    xmldoc.Clear();
    text = ss.str();
    return SUCCESS;
}
// tool functions for build result int TextHandler::build_base_result_file(const TextID & tid, const string & build_id, const vector<boost::tuple<string, string, string, size_t>> & result_vec, const bool is_blank_between_sent, const bool is_bilingual, string & base_result_file_path, string & result_text) { //读取base source文件 ByteBuffer source; string source_file_path = m_default_file_path + tid + BASE_TEXT_SOURCE_POSTFIX; if(false == read_file(source_file_path, source)) { return ERR_READ_BASE_SOURCE_FILE; } //读取base pos文件 ByteBuffer pos_data; string pos_file_path = m_default_file_path + tid + BASE_TEXT_POS_POSTFIX; if(false == read_file(pos_file_path, pos_data) ) { return ERR_READ_BASE_POS_FILE; } //生成结果文件 TextFormat format; if(false == format.UnSerialization(pos_data.String()) ) { return ERR_PARSE_RESULT_FORMAT; } //为每个段落生成tgt string curr_para_tgt; vector<string> para_tgt_vec; para_tgt_vec.resize(format.Size(), ""); size_t i = 0; for(i=0; i<result_vec.size(); ++i) { string final_tgt; if(result_vec[i].get<2>().size() > 0) final_tgt += result_vec[i].get<2>(); else final_tgt += result_vec[i].get<1>(); if(result_vec[i].get<3>() >= format.Size()) { lerr << "formate restor failed. 
para_size = " << format.Size() << " result_vec[" << i << "].para_idx = " << result_vec[i].get<2>() << endl; continue; } if(i+1 < result_vec.size() && is_blank_between_sent) para_tgt_vec[result_vec[i].get<3>()] += final_tgt + " "; else para_tgt_vec[result_vec[i].get<3>()] += final_tgt; } //重新生成原文 result_text.clear(); size_t para_idx = 0; size_t offset = 0; while(offset < source.GetLength()) { if(para_idx < format.Size()) { size_t len = format[para_idx]._offset - offset; //输出非翻译段落 if(len > 0) { result_text += source.String().substr(offset, len); } //输出翻译段落 if(is_bilingual) result_text += source.String().substr(format[para_idx]._offset, format[para_idx]._len) + " ["; result_text += para_tgt_vec[para_idx]; if(is_bilingual) result_text += "] "; //更新偏移量 offset = format[para_idx]._offset + format[para_idx]._len; ++para_idx; } else { size_t len = source.GetLength() - offset; //输出非翻译段落 if(len > 0) { result_text += source.String().substr(offset, len); } break; } } //输出结果到文件 base_result_file_path = m_default_file_path + tid + "." + build_id + BASE_TEXT_TARGET_POSTFIX; if(false == write_file(base_result_file_path, result_text.c_str(), result_text.size()) ) { return ERR_WRITE_BASE_RESULT_FILE; } return SUCCESS; }