예제 #1
0
// on_load_text_2
int TextHandler::convert_txt_to_text(const string & filepath, string & text)
{
    ByteBuffer data;

    if(false == read_file(filepath, data) )
    {
        return ERR_READ_TXT_INPUT_FILE_FAILED;
    }

    data.String(text);

    return SUCCESS;
}
예제 #2
0
int TextHandler::convert_word_to_text(const string & filepath, const TextID & tid, const string & file_type, string & text)
{
    string cmd = "java -jar ./mdata/word/WordDocumentExtractor.jar parse -t \"" + file_type + "\" -i \"" + filepath + "\" -m \"" + m_word_workspace + "/" + tid + ".word.tmp\" -x \"" + m_word_workspace + "/"  + tid + ".word.xml\"";
    system(cmd.c_str());

    //读取原文件
    string word_ext_result_file = m_word_workspace + "/" + tid + ".word.xml";
    ByteBuffer data;

    if(false == read_file(word_ext_result_file, data) )
    {
        return ERR_READ_WORD_EXT_FILE_FAILED;
    }

    //解析抽取出的XML
    stringstream ss;

    TiXmlDocument xmldoc;
    xmldoc.Parse(data.String().c_str());
    TiXmlHandle docHandle( &xmldoc );

    try
    {
        size_t idx = 0;
        TiXmlElement * elem = docHandle.FirstChild("document").Child("p", idx).ToElement();

        while(elem)
        {
            const char * tmp = elem->GetText();
            if(tmp)
                ss << tmp << endl;
            else
                ss << endl;

            ++idx;
            elem = docHandle.FirstChild("document").Child("p", idx).ToElement();
        }

    }catch (...)
    {
        return ERR_PARSE_WORD_XML;
    }

    //清理XML资源
    xmldoc.Clear();

    text = ss.str();
    return SUCCESS;
}
예제 #3
0
// tool functions for build result
int TextHandler::build_base_result_file(const TextID & tid,
                                        const string & build_id,
                                        const vector<boost::tuple<string, string, string, size_t>> & result_vec,
                                        const bool is_blank_between_sent,
                                        const bool is_bilingual,
                                        string & base_result_file_path,
                                        string & result_text)
{
    //读取base source文件
    ByteBuffer source;
    string source_file_path = m_default_file_path + tid + BASE_TEXT_SOURCE_POSTFIX;

    if(false == read_file(source_file_path, source))
    {
        return ERR_READ_BASE_SOURCE_FILE;
    }

    //读取base pos文件
    ByteBuffer pos_data;
    string pos_file_path = m_default_file_path + tid + BASE_TEXT_POS_POSTFIX;

    if(false == read_file(pos_file_path, pos_data) )
    {
        return ERR_READ_BASE_POS_FILE;
    }

    //生成结果文件
    TextFormat format;
    if(false == format.UnSerialization(pos_data.String()) )
    {
        return ERR_PARSE_RESULT_FORMAT;
    }

    //为每个段落生成tgt
    string curr_para_tgt;
    vector<string> para_tgt_vec;
    para_tgt_vec.resize(format.Size(), "");

    size_t i = 0;

    for(i=0; i<result_vec.size(); ++i)
    {

        string final_tgt;

        if(result_vec[i].get<2>().size() > 0)
            final_tgt += result_vec[i].get<2>();
        else
            final_tgt += result_vec[i].get<1>();

        if(result_vec[i].get<3>() >= format.Size())
        {
            lerr << "formate restor failed. para_size = " << format.Size() << " result_vec[" << i << "].para_idx = " << result_vec[i].get<2>() << endl;
            continue;
        }

        if(i+1 < result_vec.size() && is_blank_between_sent)
            para_tgt_vec[result_vec[i].get<3>()] += final_tgt + " ";
        else
            para_tgt_vec[result_vec[i].get<3>()] += final_tgt;
    }

    //重新生成原文
    result_text.clear();
    size_t para_idx = 0;
    size_t offset = 0;

    while(offset < source.GetLength())
    {
        if(para_idx < format.Size())
        {
            size_t len = format[para_idx]._offset - offset;

            //输出非翻译段落
            if(len > 0)
            {
                result_text += source.String().substr(offset, len);
            }

            //输出翻译段落
            if(is_bilingual)
                result_text += source.String().substr(format[para_idx]._offset, format[para_idx]._len) + " [";

            result_text += para_tgt_vec[para_idx];

            if(is_bilingual)
                result_text += "] ";

            //更新偏移量
            offset = format[para_idx]._offset + format[para_idx]._len;
            ++para_idx;
        }
        else
        {
            size_t len = source.GetLength() - offset;

            //输出非翻译段落
            if(len > 0)
            {
                result_text += source.String().substr(offset, len);
            }

            break;
        }
    }

    //输出结果到文件
    base_result_file_path =  m_default_file_path + tid + "." + build_id + BASE_TEXT_TARGET_POSTFIX;

    if(false == write_file(base_result_file_path, result_text.c_str(), result_text.size()) )
    {
        return ERR_WRITE_BASE_RESULT_FILE;
    }

    return SUCCESS;
}