Exemplo n.º 1
0
/**
 * Extracts the text of a PDF file by running the bundled xpdf `pdftotext` tool.
 *
 * The input file is copied to a working file named
 * m_default_file_path + tid + PDF_SOURCE_POSTFIX, pdftotext is asked to write
 * its output (requested with `-enc GBK`) to
 * m_default_file_path + tid + PDF_EXT_TXT_POSTFIX, and that output file is
 * then read back into `text`.
 *
 * @param filepath  Path of the PDF file to convert.
 * @param tid       Identifier used to build the working file names.
 * @param text      Output: receives the content of the extracted text file.
 * @return SUCCESS, or one of ERR_READ_PDF_INPUT_FILE_FAILED,
 *         ERR_WRITE_PDF_SOURCE_FILE, ERR_READ_PDF_EXT_FILE_FAILED.
 */
int TextHandler::convert_pdf_to_text(const string & filepath, const TextID & tid, string & text)
{
    // Read the original input file.
    ByteBuffer data;

    if(false == read_file(filepath, data) )
    {
        return ERR_READ_PDF_INPUT_FILE_FAILED;
    }

    // Write the working copy ("source" file) that pdftotext will consume.
    string source_file_path = m_default_file_path + tid + PDF_SOURCE_POSTFIX;

    if(false == write_file(source_file_path, data.GetPtr(), data.GetLength()) )
    {
        return ERR_WRITE_PDF_SOURCE_FILE;
    }

    string pdf_ext_filepath = m_default_file_path + tid + PDF_EXT_TXT_POSTFIX;

    // Call xpdf. Both paths are double-quoted (as convert_html_to_text does) so
    // that a path containing spaces is still passed as a single argument.
    // NOTE(review): double quotes do not neutralise `"`, `$` or backticks; if
    // tid or m_default_file_path can carry untrusted text, escape it properly.
    string cmd = "./mdata/pdf/pdftotext -cfg ./mdata/pdf/xpdf-chinese-simplified/xpdfrc -enc GBK \"" + source_file_path + "\" \"" + pdf_ext_filepath + "\"";
    const int status = system(cmd.c_str());

    // A non-zero status means the tool could not be run or reported an error.
    // Stop here: an output file left behind by an earlier run must not be
    // returned as if it were the result of this conversion.
    if(0 != status)
    {
        std::cerr << "pdftotext failed, status = " << status << ", cmd = " << cmd << std::endl;
        return ERR_READ_PDF_EXT_FILE_FAILED;
    }

    // Read the extracted text file.
    ByteBuffer tmp_data;
    if(false == read_file(pdf_ext_filepath, tmp_data))
    {
        return ERR_READ_PDF_EXT_FILE_FAILED;
    }

    tmp_data.String(text);

    return SUCCESS;
}
Exemplo n.º 2
0
// tool functions
/**
 * Reads the whole content of a file into `data`.
 *
 * The file is opened in binary mode, its size is obtained by seeking to the
 * end, `data` is resized to that size and the bytes are read into it.
 *
 * @param file_path  Path of the file to read.
 * @param data       Output buffer; resized to the file size and filled.
 * @return true when the complete file was read; false when the file cannot be
 *         opened, its size cannot be determined or does not fit in an int, or
 *         fewer bytes than expected could be read. A message is written to
 *         cerr on every failure.
 */
bool TextHandler::read_file(const string & file_path, ByteBuffer & data)
{
    ifstream file(file_path.c_str(), ios::binary);

    if(false == file.good())
    {
        cerr << "Read txt result file failed, path = " << file_path << endl;
        return false;
    }

    // Determine the file size: seek to the end, note the position, rewind.
    file.seekg (0, ios::end);
    const streamoff size = file.tellg();
    file.seekg (0, ios::beg);

    // tellg() yields -1 on failure, and a size above the int range would be
    // silently truncated; the round-trip comparison detects the latter without
    // needing any extra header.
    const int length = static_cast<int>(size);
    if(size < 0 || false == file.good() || static_cast<streamoff>(length) != size)
    {
        cerr << "Read txt result file failed, cannot determine size, path = " << file_path << endl;
        return false;
    }

    data.Resize(length);

    // Nothing to read for an empty file.
    if(length > 0)
    {
        file.read(data.GetPtr(), length);

        // A short read means the buffer is only partially filled; report it
        // instead of returning incomplete data as success.
        if(file.gcount() != length)
        {
            cerr << "Read txt result file failed, short read, path = " << file_path << endl;
            return false;
        }
    }

    // The ifstream destructor closes the file on every return path.
    return true;
}
Exemplo n.º 3
0
/**
 * Extracts the text of an HTML file by running the bundled htmlAnalysis.jar.
 *
 * The input file is copied to a working file named
 * m_default_file_path + tid + HTML_SOURCE_POSTFIX. The jar is then invoked
 * with that file plus two further paths built from HTML_EXT_TXT_POSTFIX and
 * HTML_INFO_POSTFIX; the HTML_EXT_TXT_POSTFIX file is read back into `text`.
 *
 * @param filepath  Path of the HTML file to convert.
 * @param tid       Identifier used to build the working file names.
 * @param text      Output: receives the content of the extracted text file.
 * @return SUCCESS, or one of ERR_READ_HTML_INPUT_FILE_FAILED,
 *         ERR_WRITE_HTML_SOURCE_FILE, ERR_READ_HTML_EXT_FILE_FAILED.
 */
int TextHandler::convert_html_to_text(const string & filepath, const TextID & tid, string & text)
{
    // Read the original input file.
    ByteBuffer data;

    if(false == read_file(filepath, data) )
    {
        return ERR_READ_HTML_INPUT_FILE_FAILED;
    }

    // Write the working copy ("source" file) that the HTML parser will consume.
    string source_file_path = m_default_file_path + tid + HTML_SOURCE_POSTFIX;

    if(false == write_file(source_file_path, data.GetPtr(), data.GetLength()) )
    {
        return ERR_WRITE_HTML_SOURCE_FILE;
    }

    string html_ext_filepath = m_default_file_path + tid + HTML_EXT_TXT_POSTFIX;
    string html_info_filepath = m_default_file_path + tid + HTML_INFO_POSTFIX;

    // Call the HTML parser.
    // NOTE(review): double quotes do not neutralise `"`, `$` or backticks; if
    // tid or m_default_file_path can carry untrusted text, escape it properly.
    string cmd = "java -jar ./mdata/html/htmlAnalysis.jar -g \"" + source_file_path + "\" \"" + html_ext_filepath + "\" \"" + html_info_filepath + "\"";
    const int status = system(cmd.c_str());

    if(-1 == status)
    {
        // The command could not be launched at all, so no output was produced
        // by this call; do not fall through and read a possibly stale file.
        std::cerr << "Failed to launch html parser, cmd = " << cmd << std::endl;
        return ERR_READ_HTML_EXT_FILE_FAILED;
    }

    if(0 != status)
    {
        // The jar's exit-code contract is not visible from here, so a non-zero
        // status is only reported; the read below decides success or failure.
        std::cerr << "html parser returned status = " << status << ", cmd = " << cmd << std::endl;
    }

    // Read the extracted text file.
    ByteBuffer tmp_data;
    if(false == read_file(html_ext_filepath, tmp_data))
    {
        return ERR_READ_HTML_EXT_FILE_FAILED;
    }

    tmp_data.String(text);

    return SUCCESS;
}