/
PopplerTools.cpp
148 lines (130 loc) · 4.9 KB
/
PopplerTools.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include "PopplerTools.h"
#include "PMSettings.h"
#include "Logger.h"
#include <QDebug>
// Candidate executable names for each Poppler command-line tool that is
// probed by FindPDFImages(). Each table lists the plain name first and the
// ".exe" variant second (presumably for Windows), and is terminated by a NULL
// entry so it can be walked without a separate length.
namespace {
    // Candidates for the image extraction tool.
    const char* PDFIMAGES_NAMES[] = {
        "pdfimages",
        "pdfimages.exe",
        NULL
    };

    // Candidates for the PDF merge tool.
    const char* PDFUNITE_NAMES[] = {
        "pdfunite",
        "pdfunite.exe",
        NULL
    };

    // Candidates for the text extraction tool.
    const char* PDFTOTEXT_NAMES[] = {
        "pdftotext",
        "pdftotext.exe",
        NULL
    };
}
/**
 * Probes a list of candidate executable names and reports whether one of them
 * can be launched.
 *
 * Despite its name this helper is not specific to pdfimages: the constructor
 * uses it for pdfimages, pdfunite and pdftotext alike, so the diagnostics name
 * the executable that was actually searched for.
 *
 * Each candidate is started with the single argument "--version" and the call
 * blocks (without timeout) until that run ends. A candidate counts as found
 * when the process reports QProcess::UnknownError afterwards, which Qt
 * documents as the default value of error(), i.e. no error was recorded. The
 * exit code of the probe run is not inspected.
 *
 * @param pdfimagesProcess Process object to configure. On success its program
 *                         stays set to the candidate that worked; its
 *                         arguments are left at "--version" and must be
 *                         replaced by the caller before the next run.
 * @param imageNames       Array of candidate names terminated by a null entry.
 * @return true if one candidate could be launched, false otherwise (including
 *         a null or empty candidate list).
 */
bool FindPDFImages(QProcess &pdfimagesProcess, const char* imageNames[]) {
    if (imageNames == 0 || imageNames[0] == 0) {
        qWarning("no executable names given to search for");
        return false;
    }

    pdfimagesProcess.setArguments(QStringList("--version"));

    for (const char** candidate = imageNames; *candidate != 0; ++candidate) {
        pdfimagesProcess.setProgram(*candidate);
        pdfimagesProcess.start();
        pdfimagesProcess.waitForFinished(-1);

        // UnknownError is the "nothing went wrong" state of QProcess::error().
        if (pdfimagesProcess.error() == QProcess::UnknownError) {
            return true;
        }
        qDebug() << "executable not found:" << *candidate;
    }

    // The first entry is the plain tool name; use it to identify what is missing.
    qWarning() << "executable not found:" << imageNames[0];
    return false;
}
/**
 * Sets up the temporary directory policy and probes for the three Poppler
 * command-line tools.
 *
 * Each probe launches the candidate executables via FindPDFImages(), so
 * construction blocks until those probe runs have ended. The results are
 * stored in the Found...Exec members and each process object keeps the
 * program name that was tried last (the working one on success).
 */
PopplerTools::PopplerTools()
{
    // With debugging enabled the temporary directory is not removed
    // automatically (presumably so its contents can be inspected).
    const bool debugEnabled = PMSettings::IsDebugEnabled();
    TempDir.setAutoRemove(!debugEnabled);

    FoundPDFImagesExec = FindPDFImages(PDFImageProcess, PDFIMAGES_NAMES);
    FoundPDFUniteExec  = FindPDFImages(PDFUniteProcess, PDFUNITE_NAMES);
    FoundPDFToTextExec = FindPDFImages(PDFToTextProcess, PDFTOTEXT_NAMES);
}
/**
 * Page-wise merge; not implemented in this class.
 *
 * All parameters are ignored. Calling this function terminates the program
 * through abort(), so callers must not invoke it.
 */
void PopplerTools::Merge(const QStringList &/*inputFiles*/, const PageList &/*pageList*/, const QString &/*outputFile*/, const MetaDataList &/*meta_data_list*/)
{
abort(); // not implemented
}
void PopplerTools::MergePDF(const QStringList &inputFiles, const QString &outputFile, const MetaDataList &/*meta_data_list*/)
{
QStringList arguments;
arguments.append(inputFiles);
arguments.append(outputFile);
PDFUniteProcess.setArguments(arguments);
PDFUniteProcess.start();
PDFUniteProcess.waitForFinished();
Logger::Log(PDFUniteProcess);
}
/**
 * Extracts the image of each requested page into its own file by running the
 * pdfimages process once per page.
 *
 * For every entry of pageList, pdfimages is invoked with "-all" and with both
 * "-f" and "-l" set to the same page number, so only that page is processed.
 * page.first selects the input file by index into inputFiles; page.second + 1
 * is passed as the page number (presumably page.second is zero-based - verify
 * against the callers).
 *
 * pdfimages derives the real output file name from the given root name, so
 * after each run the destination directory is searched for files starting
 * with that root. A page contributes to the result only when exactly one such
 * file is found (or the root name itself exists).
 *
 * @param inputFiles Paths of the source PDF files.
 * @param pageList   Pages to extract.
 * @param outputPath Destination directory; only used when the member TempDir
 *                   is not valid, otherwise TempDir's path is used.
 * @return Paths of the extracted files, in page order, for the pages where a
 *         single output file could be identified. Empty if pdfimages was not
 *         found at construction time.
 */
QStringList PopplerTools::WriteToSeparatePages(const QStringList &inputFiles, const PageList &pageList, const QString &outputPath)
{
    QStringList outputFiles;
    if (!FoundPDFImagesExec) {
        Logger::Log(Logger::ERROR, "pdfimages not found. No image extraction from PDF possible");
        return outputFiles;
    }

    int pageIndex = 0;
    QString destinationPath = outputPath;
    if (TempDir.isValid()) {
        destinationPath = TempDir.path();
    }
    qDebug() << "Tempdir: " << destinationPath;
    QDir dir(destinationPath);

    foreach (const PageEntry& page, pageList) {
        QString outputFile = destinationPath + QDir::separator() + QString("temp_doc_%1.tmp").arg(++pageIndex);
        const int pageNumber = page.second + 1;

        QStringList arguments;
        arguments << "-all"
                  << "-l" << QString::number(pageNumber)
                  << "-f" << QString::number(pageNumber)
                  << inputFiles.at(page.first)
                  << outputFile;

        PDFImageProcess.setArguments(arguments);
        PDFImageProcess.start();
        PDFImageProcess.waitForFinished();
        Logger::Log(PDFImageProcess);

        // correct the filename. pdfimages does not tell us how the file will be called
        QFileInfo fileTofind(outputFile);
        if (!fileTofind.exists()) {
            // Match on the complete root name ("temp_doc_1.tmp*"), not on the
            // base name ("temp_doc_1*"): the latter would also pick up the
            // outputs of pages 10, 11, ... that may still be present in the
            // directory from an earlier call.
            // NOTE(review): a stale output of the same page from an earlier
            // call is still matched if this run produced no image - confirm
            // whether old files should be removed before extracting.
            const QStringList foundFiles = dir.entryList(QStringList() << (fileTofind.fileName() + "*"), QDir::Files);
            if (foundFiles.size() == 1) {
                outputFile = destinationPath + QDir::separator() + foundFiles.first();
            } else {
                if (foundFiles.size() > 1) {
                    qWarning() << "Found to many images for this page: " << fileTofind.baseName() << ". Maybe not from scanned source?";
                } else {
                    qWarning() << "Could not find correct output for file " << fileTofind.baseName() << ". Found these pdfimages " << foundFiles;
                }
                outputFile = "";
            }
        }

        if (QFileInfo(outputFile).exists()) {
            outputFiles.append(outputFile);
        }
    }
    return outputFiles;
}
/**
 * Extracts the text of a PDF file by running the pdftotext process.
 *
 * pdftotext is asked to write into "text_file.txt" inside a fresh temporary
 * directory; that file is then read back and returned. The temporary
 * directory lives only for the duration of this call.
 *
 * @param file The PDF file; only its file name is used, the QFile itself is
 *             neither opened nor read here.
 * @return The extracted text, or an empty string if pdftotext is not
 *         available, the temporary directory could not be created, or the
 *         output file is missing or unreadable. Each failure is logged.
 */
QString PopplerTools::ReadText(QFile &file)
{
    QString resultText;

    if (!FoundPDFToTextExec) {
        Logger::Log(Logger::ERROR, "pdftotext not found. No text extraction from PDF possible");
        return resultText;
    }

    QTemporaryDir tempDir;
    if (!tempDir.isValid()) {
        // Without this check an empty path would turn the output target
        // into "/text_file.txt".
        Logger::Log(Logger::ERROR, "could not create temporary directory for text extraction");
        return resultText;
    }

    // Declared after tempDir so it is closed before the directory is removed.
    QFile outputFile(tempDir.path() + "/text_file.txt");

    PDFToTextProcess.setArguments(QStringList() << file.fileName() << outputFile.fileName());
    PDFToTextProcess.start();
    PDFToTextProcess.waitForFinished();
    Logger::Log(PDFToTextProcess);

    if (!outputFile.exists()) {
        Logger::Log(Logger::ERROR, "could not read textfile: " + outputFile.fileName());
        return resultText;
    }
    if (!outputFile.open(QIODevice::ReadOnly)) {
        Logger::Log(Logger::ERROR, "could not open textfile: " + outputFile.fileName() + ": " + outputFile.errorString());
        return resultText;
    }

    // pdftotext writes UTF-8 unless told otherwise with -enc; decode explicitly
    // instead of relying on the implicit QByteArray -> QString conversion.
    resultText = QString::fromUtf8(outputFile.readAll());
    return resultText;
}
/// Returns the result of the constructor's probe for the pdfimages executable
/// (true if one of the candidate names could be launched).
bool PopplerTools::HasFoundPDFImagesExec() const
{
return FoundPDFImagesExec;
}
/// Returns the result of the constructor's probe for the pdfunite executable
/// (true if one of the candidate names could be launched).
bool PopplerTools::HasFoundPDFUniteExec() const
{
return FoundPDFUniteExec;
}
/// Returns the result of the constructor's probe for the pdftotext executable
/// (true if one of the candidate names could be launched).
bool PopplerTools::HasFoundPDFToTextExec() const
{
return FoundPDFToTextExec;
}