// Index any PDFs that are attached. Basically it turns the PDF into text and adds it the same // way as a note's body void NoteIndexer::indexPdf(qint32 reslid) { NSqlQuery sql(global.db); if (reslid <= 0) return; QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf"; QString text = ""; Poppler::Document *doc = Poppler::Document::load(file); if (doc == NULL || doc->isEncrypted() || doc->isLocked()) return; for (int i=0; i<doc->numPages(); i++) { QRectF rect; text = text + doc->page(i)->text(rect) + QString(" "); } // Add the new content. it is basically a text version of the note with a weight of 100. sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)"); sql.bindValue(":lid", reslid); sql.bindValue(":weight", 100); sql.bindValue(":source", "recognition"); sql.bindValue(":content", text); sql.exec(); }
// Index any PDFs that are attached. Basically it turns the PDF into text and adds it the same // way as a note's body void IndexRunner::indexPdf(qint32 lid, Resource &r) { if (!keepRunning || pauseIndexing) { indexTimer->start(); return; } ResourceTable rtable(db); qint32 reslid = rtable.getLid(r.guid); if (lid <= 0) { indexTimer->start(); return; } QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf"; QString text = ""; Poppler::Document *doc = Poppler::Document::load(file); if (doc == NULL || doc->isEncrypted() || doc->isLocked()) { indexTimer->start(); return; } for (int i=0; keepRunning && !pauseIndexing && i<doc->numPages(); i++) { QRectF rect; text = text + doc->page(i)->text(rect) + QString(" "); } IndexRecord *rec = new IndexRecord(); rec->content = text; rec->source = "recognition"; rec->weight = 100; rec->lid = lid; if (indexHash->contains(lid)) { delete indexHash->value(lid); indexHash->remove(lid); } indexHash->insert(lid, rec); }
int main( int argc, char **argv ) { QApplication a( argc, argv ); // QApplication required! QTime t; t.start(); QDir directory( argv[1] ); foreach ( const QString &fileName, directory.entryList() ) { if (fileName.endsWith("pdf") ) { qDebug() << "Doing" << fileName.toLatin1().data() << ":"; Poppler::Document *doc = Poppler::Document::load( directory.canonicalPath()+"/"+fileName ); if (!doc) { qWarning() << "doc not loaded"; } else if ( doc->isLocked() ) { if (! doc->unlock( "", "password" ) ) { qWarning() << "couldn't unlock document"; delete doc; } } else { doc->pdfVersion(); doc->info("Title"); doc->info("Subject"); doc->info("Author"); doc->info("Keywords"); doc->info("Creator"); doc->info("Producer"); doc->date("CreationDate").toString(); doc->date("ModDate").toString(); doc->numPages(); doc->isLinearized(); doc->isEncrypted(); doc->okToPrint(); doc->okToCopy(); doc->okToChange(); doc->okToAddNotes(); doc->pageMode(); for( int index = 0; index < doc->numPages(); ++index ) { Poppler::Page *page = doc->page( index ); QImage image = page->renderToImage(); page->pageSize(); page->orientation(); delete page; std::cout << "."; std::cout.flush(); } std::cout << std::endl; delete doc; } } } std::cout << "Elapsed time: " << (t.elapsed()/1000) << "seconds" << std::endl; }
int main( int argc, char **argv ) { QApplication a( argc, argv ); // QApplication required! if ( argc != 3) { qWarning() << "usage: test-password-qt4 owner-password filename"; exit(1); } Poppler::Document *doc = Poppler::Document::load(argv[2], argv[1]); if (!doc) { qWarning() << "doc not loaded"; exit(1); } // output some meta-data int major = 0, minor = 0; doc->getPdfVersion( &major, &minor ); qDebug() << " PDF Version: " << qPrintable(QString::fromLatin1("%1.%2").arg(major).arg(minor)); qDebug() << " Title: " << doc->info("Title"); qDebug() << " Subject: " << doc->info("Subject"); qDebug() << " Author: " << doc->info("Author"); qDebug() << " Key words: " << doc->info("Keywords"); qDebug() << " Creator: " << doc->info("Creator"); qDebug() << " Producer: " << doc->info("Producer"); qDebug() << " Date created: " << doc->date("CreationDate").toString(); qDebug() << " Date modified: " << doc->date("ModDate").toString(); qDebug() << "Number of pages: " << doc->numPages(); qDebug() << " Linearised: " << doc->isLinearized(); qDebug() << " Encrypted: " << doc->isEncrypted(); qDebug() << " OK to print: " << doc->okToPrint(); qDebug() << " OK to copy: " << doc->okToCopy(); qDebug() << " OK to change: " << doc->okToChange(); qDebug() << "OK to add notes: " << doc->okToAddNotes(); qDebug() << " Page mode: " << doc->pageMode(); QStringList fontNameList; foreach( const Poppler::FontInfo &font, doc->fonts() ) fontNameList += font.name(); qDebug() << " Fonts: " << fontNameList.join( ", " ); Poppler::Page *page = doc->page(0); qDebug() << " Page 1 size: " << page->pageSize().width()/72 << "inches x " << page->pageSize().height()/72 << "inches"; PDFDisplay test( doc ); // create picture display test.setWindowTitle("Poppler-Qt4 Test"); test.show(); // show it return a.exec(); // start event loop }
// Index any resources void IndexRunner::indexRecognition(qint32 lid, Resource &r) { if (!keepRunning || pauseIndexing) { //indexTimer->start(); return; } // Add filename or source url to search index if (r.attributes.isSet()) { NSqlQuery sql(db); ResourceAttributes a = r.attributes; if (a.fileName.isSet()) { sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)"); sql.bindValue(":lid", lid); sql.bindValue(":weight", 100); sql.bindValue(":source", "recognition"); sql.bindValue(":content", QString(a.fileName)); sql.exec(); } if (a.sourceURL.isSet()) { sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)"); sql.bindValue(":lid", lid); sql.bindValue(":weight", 100); sql.bindValue(":source", "recognition"); sql.bindValue(":content", QString(a.sourceURL)); sql.exec(); } } // Make sure we have something to look through. Data recognition; if (r.recognition.isSet()) recognition = r.recognition; if (!recognition.body.isSet()) return; QDomDocument doc; QString emsg; doc.setContent(recognition.body, &emsg); // look for text tags QDomNodeList anchors = doc.documentElement().elementsByTagName("t"); #if QT_VERSION < 0x050000 for (unsigned int i=0; keepRunning && !pauseIndexing && i<anchors.length(); i++) { #else for (int i=0; keepRunning && !pauseIndexing && i<anchors.length(); i++) { #endif QApplication::processEvents(); QDomElement enmedia = anchors.at(i).toElement(); QString weight = enmedia.attribute("w"); QString text = enmedia.text(); if (text != "") { IndexRecord *rec = new IndexRecord(); rec->weight = weight.toInt(); rec->lid = lid; rec->content = text; rec->source = "recognition"; if (indexHash->contains(lid)) { delete indexHash->value(lid); indexHash->remove(lid); } indexHash->insert(lid, rec); } } } // Index any PDFs that are attached. Basically it turns the PDF into text and adds it the same // way as a note's body void IndexRunner::indexPdf(qint32 lid, Resource &r) { if (!global.indexPDFLocally) return; if (!keepRunning || pauseIndexing) { //indexTimer->start(); return; } ResourceTable rtable(db); qint32 reslid = rtable.getLid(r.guid); if (lid <= 0) { //indexTimer->start(); return; } QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf"; QString text = ""; Poppler::Document *doc = Poppler::Document::load(file); if (doc == nullptr || doc->isEncrypted() || doc->isLocked()) { //indexTimer->start(); return; } for (int i=0; keepRunning && !pauseIndexing && i<doc->numPages(); i++) { QRectF rect; text = text + doc->page(i)->text(rect) + QString(" "); } IndexRecord *rec = new IndexRecord(); rec->content = text; rec->source = "recognition"; rec->weight = 100; rec->lid = lid; if (indexHash->contains(lid)) { delete indexHash->value(lid); indexHash->remove(lid); } indexHash->insert(lid, rec); } // Index any files that are attached. void IndexRunner::indexAttachment(qint32 lid, Resource &r) { if (!officeFound) return; QLOG_DEBUG() << "indexing attachment to note " << lid; if (!keepRunning || pauseIndexing) { //indexTimer->start(); return; } ResourceTable rtable(db); qint32 reslid = rtable.getLid(r.guid); if (lid <= 0) { //indexTimer->start(); return; } QLOG_DEBUG() << "Resource " << reslid; QString extension = ""; ResourceAttributes attributes; if (r.attributes.isSet()) attributes = r.attributes; if (attributes.fileName.isSet()) { extension = attributes.fileName; int i = extension.indexOf("."); if (i != -1) extension = extension.mid(i); } if (extension != ".doc" && extension != ".xls" && extension != ".ppt" && extension != ".docx" && extension != ".xlsx" && extension != ".pptx" && extension != ".pps" && extension != ".pdf" && extension != ".odt" && extension != ".odf" && extension != ".ott" && extension != ".odm" && extension != ".html" && extension != ".txt" && extension != ".oth" && extension != ".ods" && extension != ".ots" && extension != ".odg" && extension != ".otg" && extension != ".odp" && extension != ".otp" && extension != ".odb" && extension != ".oxt" && extension != ".htm" && extension != ".docm") return; QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +extension; QFile dataFile(file); if (!dataFile.exists()) { QDir dir(global.fileManager.getDbaDirPath()); QStringList filterList; filterList.append(QString::number(lid)+".*"); QStringList list= dir.entryList(filterList, QDir::Files); if (list.size() > 0) { file = global.fileManager.getDbaDirPath()+list[0]; } } QString outDir = global.fileManager.getTmpDirPath(); QProcess sofficeProcess; QString cmd = "soffice --headless --convert-to txt:\"Text\" --outdir " +outDir + " " +file; sofficeProcess.start(cmd, QIODevice::ReadWrite|QIODevice::Unbuffered); QLOG_DEBUG() << "Starting soffice "; sofficeProcess.waitForStarted(); QLOG_DEBUG() << "Waiting for completion"; sofficeProcess.waitForFinished(); int rc = sofficeProcess.exitCode(); QLOG_DEBUG() << "soffice Errors:" << sofficeProcess.readAllStandardError(); QLOG_DEBUG() << "soffice Output:" << sofficeProcess.readAllStandardOutput(); QLOG_DEBUG() << "return code:" << rc; if (rc == 255) { QLOG_ERROR() << "soffice not found. Disabling attachment indexing."; this->officeFound = false; return; } QFile txtFile(outDir+QString::number(reslid) +".txt"); if (txtFile.open(QIODevice::ReadOnly)) { QString text; text = txtFile.readAll(); NSqlQuery sql(db); db->lockForWrite(); sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, 'recognition', :content)"); sql.bindValue(":lid", lid); sql.bindValue(":weight", 100); text = global.normalizeTermForSearchAndIndex(text); sql.bindValue(":content", text); QLOG_DEBUG() << "Adding note resource to index DB"; sql.exec(); db->unlock(); txtFile.close(); } QDir dir; dir.remove(outDir+QString::number(reslid) +".txt"); } void IndexRunner::flushCache() { if (indexHash->size() <= 0) return; QDateTime start = QDateTime::currentDateTimeUtc(); NSqlQuery sql(db); db->lockForWrite(); sql.exec("begin"); QHash<qint32, IndexRecord*>::iterator i; // Start adding words to the index. Every 200 sql insertions we do a commit int commitCount = 200; for (i=indexHash->begin(); keepRunning && !pauseIndexing && i!=indexHash->end(); ++i) { qint32 lid = i.key(); IndexRecord *rec = i.value(); qint32 weight = rec->weight; QString source = rec->source; QString content = rec->content; delete rec; // Delete any old content sql.prepare("Delete from SearchIndex where lid=:lid and source=:source"); sql.bindValue(":lid", lid); sql.bindValue(":source", source); sql.exec(); // Add the new content. it is basically a text version of the note with a weight of 100. sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)"); sql.bindValue(":lid", lid); sql.bindValue(":weight", weight); sql.bindValue(":source", source); content = global.normalizeTermForSearchAndIndex(content); sql.bindValue(":content", content); sql.exec(); commitCount--; if (commitCount <= 0) { sql.exec("commit"); commitCount = 200; } } indexHash->clear(); sql.exec("commit"); sql.finish(); db->unlock(); QDateTime finish = QDateTime::currentDateTimeUtc(); QLOG_DEBUG() << "Index Cache Flush Complete: " << finish.toMSecsSinceEpoch() - start.toMSecsSinceEpoch() << " milliseconds."; } void IndexRunner::busy(bool value, bool finished) { iAmBusy=value; emit(this->indexDone(finished)); }
int main( int argc, char **argv ) { QApplication a( argc, argv ); // QApplication required! Q_UNUSED( argc ); Q_UNUSED( argv ); QTime t; t.start(); QDir dbDir( QStringLiteral( "./pdfdb" ) ); if ( !dbDir.exists() ) { qWarning() << "Database directory does not exist"; } QStringList excludeSubDirs; excludeSubDirs << QStringLiteral("000048") << QStringLiteral("000607"); const QStringList dirs = dbDir.entryList(QStringList() << QStringLiteral("0000*"), QDir::Dirs); foreach ( const QString &subdir, dirs ) { if ( excludeSubDirs.contains(subdir) ) { // then skip it } else { QString path = "./pdfdb/" + subdir + "/data.pdf"; std::cout <<"Doing " << path.toLatin1().data() << " :"; Poppler::Document *doc = Poppler::Document::load( path ); if (!doc) { qWarning() << "doc not loaded"; } else { int major = 0, minor = 0; doc->getPdfVersion( &major, &minor ); doc->info(QStringLiteral("Title")); doc->info(QStringLiteral("Subject")); doc->info(QStringLiteral("Author")); doc->info(QStringLiteral("Keywords")); doc->info(QStringLiteral("Creator")); doc->info(QStringLiteral("Producer")); doc->date(QStringLiteral("CreationDate")).toString(); doc->date(QStringLiteral("ModDate")).toString(); doc->numPages(); doc->isLinearized(); doc->isEncrypted(); doc->okToPrint(); doc->okToCopy(); doc->okToChange(); doc->okToAddNotes(); doc->pageMode(); for( int index = 0; index < doc->numPages(); ++index ) { Poppler::Page *page = doc->page( index ); page->renderToImage(); page->pageSize(); page->orientation(); delete page; std::cout << "."; std::cout.flush(); } std::cout << std::endl; delete doc; } } } std::cout << "Elapsed time: " << (t.elapsed()/1000) << std::endl; }