示例#1
0
// Index any PDFs that are attached.  Basically it turns the PDF into text and adds it the same
// way as a note's body
void NoteIndexer::indexPdf(qint32 reslid) {

    NSqlQuery sql(global.db);
    if (reslid <= 0)
        return;

    QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf";

    QString text = "";
    Poppler::Document *doc = Poppler::Document::load(file);
    if (doc == NULL || doc->isEncrypted() || doc->isLocked())
        return;

    for (int i=0; i<doc->numPages(); i++) {
        QRectF rect;
        text = text + doc->page(i)->text(rect) + QString(" ");
    }

    // Add the new content.  it is basically a text version of the note with a weight of 100.
    sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)");
    sql.bindValue(":lid", reslid);
    sql.bindValue(":weight", 100);
    sql.bindValue(":source", "recognition");
    sql.bindValue(":content", text);
    sql.exec();
}
示例#2
0
// Index any PDFs that are attached.  Basically it turns the PDF into text and adds it the same
// way as a note's body
void IndexRunner::indexPdf(qint32 lid, Resource &r) {
    if (!keepRunning || pauseIndexing) {
        indexTimer->start();
        return;
    }
    ResourceTable rtable(db);
    qint32 reslid = rtable.getLid(r.guid);
    if (lid <= 0) {
        indexTimer->start();
        return;
    }
    QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf";

    QString text = "";
    Poppler::Document *doc = Poppler::Document::load(file);
    if (doc == NULL || doc->isEncrypted() || doc->isLocked()) {
        indexTimer->start();
        return;
    }
    for (int i=0; keepRunning && !pauseIndexing && i<doc->numPages(); i++) {
        QRectF rect;
        text = text + doc->page(i)->text(rect) + QString(" ");
    }
    IndexRecord *rec = new IndexRecord();
    rec->content = text;
    rec->source = "recognition";
    rec->weight = 100;
    rec->lid = lid;
    if (indexHash->contains(lid)) {
        delete indexHash->value(lid);
        indexHash->remove(lid);
    }
    indexHash->insert(lid, rec);
}
int main( int argc, char **argv )
{
    QApplication a( argc, argv );               // QApplication required!

    QTime t;
    t.start();

    QDir directory( argv[1] );
    foreach ( const QString &fileName, directory.entryList() ) {
        if (fileName.endsWith("pdf") ) {
	    qDebug() << "Doing" << fileName.toLatin1().data() << ":";
	    Poppler::Document *doc = Poppler::Document::load( directory.canonicalPath()+"/"+fileName );
	    if (!doc) {
		qWarning() << "doc not loaded";
	    } else if ( doc->isLocked() ) {
	        if (! doc->unlock( "", "password" ) ) {
		    qWarning() << "couldn't unlock document";
		    delete doc;
		}
	    } else {
		doc->pdfVersion();
		doc->info("Title");
		doc->info("Subject");
		doc->info("Author");
		doc->info("Keywords");
		doc->info("Creator");
		doc->info("Producer");
		doc->date("CreationDate").toString();
		doc->date("ModDate").toString();
		doc->numPages();
		doc->isLinearized();
		doc->isEncrypted();
		doc->okToPrint();
		doc->okToCopy();
		doc->okToChange();
		doc->okToAddNotes();
		doc->pageMode();

		for( int index = 0; index < doc->numPages(); ++index ) {
		    Poppler::Page *page = doc->page( index );
		    QImage image = page->renderToImage();
		    page->pageSize();
		    page->orientation();
		    delete page;
		    std::cout << ".";
		    std::cout.flush();
		}
		std::cout << std::endl;
		delete doc;
	    }
	}
    }

    std::cout << "Elapsed time: " << (t.elapsed()/1000) << "seconds" << std::endl;

}
示例#4
0
int main( int argc, char **argv )
{
    QApplication a( argc, argv );               // QApplication required!

    if ( argc != 3)
    {
	qWarning() << "usage: test-password-qt4 owner-password filename";
	exit(1);
    }
  
    Poppler::Document *doc = Poppler::Document::load(argv[2], argv[1]);
    if (!doc)
    {
	qWarning() << "doc not loaded";
	exit(1);
    }
  
    // output some meta-data
    int major = 0, minor = 0;
    doc->getPdfVersion( &major, &minor );
    qDebug() << "    PDF Version: " << qPrintable(QString::fromLatin1("%1.%2").arg(major).arg(minor));
    qDebug() << "          Title: " << doc->info("Title");
    qDebug() << "        Subject: " << doc->info("Subject");
    qDebug() << "         Author: " << doc->info("Author");
    qDebug() << "      Key words: " << doc->info("Keywords");
    qDebug() << "        Creator: " << doc->info("Creator");
    qDebug() << "       Producer: " << doc->info("Producer");
    qDebug() << "   Date created: " << doc->date("CreationDate").toString();
    qDebug() << "  Date modified: " << doc->date("ModDate").toString();
    qDebug() << "Number of pages: " << doc->numPages();
    qDebug() << "     Linearised: " << doc->isLinearized();
    qDebug() << "      Encrypted: " << doc->isEncrypted();
    qDebug() << "    OK to print: " << doc->okToPrint();
    qDebug() << "     OK to copy: " << doc->okToCopy();
    qDebug() << "   OK to change: " << doc->okToChange();
    qDebug() << "OK to add notes: " << doc->okToAddNotes();
    qDebug() << "      Page mode: " << doc->pageMode();
    QStringList fontNameList;
    foreach( const Poppler::FontInfo &font, doc->fonts() )
	fontNameList += font.name();
    qDebug() << "          Fonts: " << fontNameList.join( ", " );

    Poppler::Page *page = doc->page(0);
    qDebug() << "    Page 1 size: " << page->pageSize().width()/72 << "inches x " << page->pageSize().height()/72 << "inches";

    PDFDisplay test( doc );        // create picture display
    test.setWindowTitle("Poppler-Qt4 Test");
    test.show();                            // show it

    return a.exec();                        // start event loop
}
示例#5
0
// Index any resources
void IndexRunner::indexRecognition(qint32 lid, Resource &r) {

    if (!keepRunning || pauseIndexing) {
        //indexTimer->start();
        return;
    }

    // Add filename or source url to search index
    if (r.attributes.isSet()) {
        NSqlQuery sql(db);
        ResourceAttributes a = r.attributes;
        if (a.fileName.isSet()) {
            sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)");
            sql.bindValue(":lid", lid);
            sql.bindValue(":weight", 100);
            sql.bindValue(":source", "recognition");
            sql.bindValue(":content", QString(a.fileName));
            sql.exec();
        }
        if (a.sourceURL.isSet()) {
            sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)");
            sql.bindValue(":lid", lid);
            sql.bindValue(":weight", 100);
            sql.bindValue(":source", "recognition");
            sql.bindValue(":content", QString(a.sourceURL));
            sql.exec();
        }
    }


    // Make sure we have something to look through.
    Data recognition;
    if (r.recognition.isSet())
        recognition = r.recognition;
    if (!recognition.body.isSet())
        return;

    QDomDocument doc;
    QString emsg;
    doc.setContent(recognition.body, &emsg);

    // look for text tags
    QDomNodeList anchors = doc.documentElement().elementsByTagName("t");
#if QT_VERSION < 0x050000
    for (unsigned int i=0; keepRunning && !pauseIndexing && i<anchors.length(); i++) {
#else
    for (int i=0; keepRunning && !pauseIndexing && i<anchors.length(); i++) {
#endif
        QApplication::processEvents();
        QDomElement enmedia = anchors.at(i).toElement();
        QString weight = enmedia.attribute("w");
        QString text = enmedia.text();
        if (text != "") {
            IndexRecord *rec = new IndexRecord();
            rec->weight = weight.toInt();
            rec->lid = lid;
            rec->content = text;
            rec->source = "recognition";
            if (indexHash->contains(lid)) {
                delete indexHash->value(lid);
                indexHash->remove(lid);
            }
            indexHash->insert(lid, rec);
        }
    }
}


// Index any PDFs that are attached.  Basically it turns the PDF into text and adds it the same
// way as a note's body
void IndexRunner::indexPdf(qint32 lid, Resource &r) {
    if (!global.indexPDFLocally)
        return;
    if (!keepRunning || pauseIndexing) {
        //indexTimer->start();
        return;
    }
    ResourceTable rtable(db);
    qint32 reslid = rtable.getLid(r.guid);
    if (lid <= 0) {
        //indexTimer->start();
        return;
    }
    QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +".pdf";

    QString text = "";
    Poppler::Document *doc = Poppler::Document::load(file);
    if (doc == nullptr || doc->isEncrypted() || doc->isLocked()) {
        //indexTimer->start();
        return;
    }
    for (int i=0; keepRunning && !pauseIndexing && i<doc->numPages(); i++) {
        QRectF rect;
        text = text + doc->page(i)->text(rect) + QString(" ");
    }
    IndexRecord *rec = new IndexRecord();
    rec->content = text;
    rec->source = "recognition";
    rec->weight = 100;
    rec->lid = lid;
    if (indexHash->contains(lid)) {
        delete indexHash->value(lid);
        indexHash->remove(lid);
    }
    indexHash->insert(lid, rec);
}




// Index any files that are attached.
void IndexRunner::indexAttachment(qint32 lid, Resource &r) {
    if (!officeFound)
        return;
    QLOG_DEBUG() << "indexing attachment to note " << lid;
    if (!keepRunning || pauseIndexing) {
        //indexTimer->start();
        return;
    }
    ResourceTable rtable(db);
    qint32 reslid = rtable.getLid(r.guid);
    if (lid <= 0) {
        //indexTimer->start();
        return;
    }
    QLOG_DEBUG() << "Resource " << reslid;
    QString extension = "";
    ResourceAttributes attributes;
    if (r.attributes.isSet())
        attributes = r.attributes;
    if (attributes.fileName.isSet()) {
        extension = attributes.fileName;
        int i = extension.indexOf(".");
	if (i != -1)
	  extension = extension.mid(i);
    }
    if (extension != ".doc"  && extension != ".xls"  && extension != ".ppt" &&
        extension != ".docx" && extension != ".xlsx" && extension != ".pptx" &&
        extension != ".pps"  && extension != ".pdf"  && extension != ".odt"  &&
        extension != ".odf"  && extension != ".ott"  && extension != ".odm"  &&
        extension != ".html" && extension != ".txt"  && extension != ".oth"  &&
        extension != ".ods"  && extension != ".ots"  && extension != ".odg"  &&
        extension != ".otg"  && extension != ".odp"  && extension != ".otp"  &&
        extension != ".odb"  && extension != ".oxt"  && extension != ".htm"  &&
        extension != ".docm")
                return;

    QString file = global.fileManager.getDbaDirPath() + QString::number(reslid) +extension;
    QFile dataFile(file);
    if (!dataFile.exists()) {
        QDir dir(global.fileManager.getDbaDirPath());
        QStringList filterList;
        filterList.append(QString::number(lid)+".*");
        QStringList list= dir.entryList(filterList, QDir::Files);
        if (list.size() > 0) {
            file = global.fileManager.getDbaDirPath()+list[0];
        }
    }

    QString outDir = global.fileManager.getTmpDirPath();

    QProcess sofficeProcess;
    QString cmd = "soffice --headless --convert-to txt:\"Text\" --outdir "
                    +outDir + " "
                    +file;

    sofficeProcess.start(cmd,
                         QIODevice::ReadWrite|QIODevice::Unbuffered);

    QLOG_DEBUG() << "Starting soffice ";
    sofficeProcess.waitForStarted();
    QLOG_DEBUG() << "Waiting for completion";
    sofficeProcess.waitForFinished();
    int rc = sofficeProcess.exitCode();
    QLOG_DEBUG() << "soffice Errors:" << sofficeProcess.readAllStandardError();
    QLOG_DEBUG() << "soffice Output:" << sofficeProcess.readAllStandardOutput();
    QLOG_DEBUG() << "return code:" << rc;
    if (rc == 255) {
        QLOG_ERROR() << "soffice not found.  Disabling attachment indexing.";
        this->officeFound = false;
        return;
    }
    QFile txtFile(outDir+QString::number(reslid) +".txt");
    if (txtFile.open(QIODevice::ReadOnly)) {
        QString text;
        text = txtFile.readAll();
        NSqlQuery sql(db);
        db->lockForWrite();
        sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, 'recognition', :content)");
        sql.bindValue(":lid", lid);
        sql.bindValue(":weight", 100);

        text = global.normalizeTermForSearchAndIndex(text);
        sql.bindValue(":content", text);

        QLOG_DEBUG() << "Adding note resource to index DB";
        sql.exec();
        db->unlock();
        txtFile.close();
    }
    QDir dir;
    dir.remove(outDir+QString::number(reslid) +".txt");
}


void IndexRunner::flushCache() {
    if (indexHash->size() <= 0)
        return;
    QDateTime start = QDateTime::currentDateTimeUtc();
    NSqlQuery sql(db);
    db->lockForWrite();
    sql.exec("begin");
    QHash<qint32, IndexRecord*>::iterator i;

    // Start adding words to the index.  Every 200 sql insertions we do a commit
    int commitCount = 200;

    for (i=indexHash->begin(); keepRunning && !pauseIndexing && i!=indexHash->end(); ++i) {
        qint32 lid = i.key();
        IndexRecord *rec = i.value();
        qint32 weight = rec->weight;
        QString source = rec->source;
        QString content = rec->content;
        delete rec;

        // Delete any old content
        sql.prepare("Delete from SearchIndex where lid=:lid and source=:source");
        sql.bindValue(":lid", lid);
        sql.bindValue(":source", source);
        sql.exec();

        // Add the new content.  it is basically a text version of the note with a weight of 100.
        sql.prepare("Insert into SearchIndex (lid, weight, source, content) values (:lid, :weight, :source, :content)");
        sql.bindValue(":lid", lid);
        sql.bindValue(":weight", weight);
        sql.bindValue(":source", source);

        content = global.normalizeTermForSearchAndIndex(content);
        sql.bindValue(":content", content);

        sql.exec();
        commitCount--;
        if (commitCount <= 0) {
            sql.exec("commit");
            commitCount = 200;
        }
    }
    indexHash->clear();
    sql.exec("commit");

    sql.finish();
    db->unlock();
    QDateTime finish = QDateTime::currentDateTimeUtc();

    QLOG_DEBUG() << "Index Cache Flush Complete: " <<
                    finish.toMSecsSinceEpoch() - start.toMSecsSinceEpoch()
                    << " milliseconds.";
}



void IndexRunner::busy(bool value, bool finished) {
    iAmBusy=value;
    emit(this->indexDone(finished));
}
int main( int argc, char **argv )
{
    QApplication a( argc, argv );               // QApplication required!

    Q_UNUSED( argc );
    Q_UNUSED( argv );

    QTime t;
    t.start();
    QDir dbDir( QStringLiteral( "./pdfdb" ) );
    if ( !dbDir.exists() ) {
	qWarning() << "Database directory does not exist";
    }

    QStringList excludeSubDirs;
    excludeSubDirs << QStringLiteral("000048") << QStringLiteral("000607");

    const QStringList dirs = dbDir.entryList(QStringList() << QStringLiteral("0000*"), QDir::Dirs);
    foreach ( const QString &subdir, dirs ) {
	if ( excludeSubDirs.contains(subdir) ) {
	    // then skip it
	} else {
	    QString path = "./pdfdb/" + subdir + "/data.pdf";
	    std::cout <<"Doing " << path.toLatin1().data() << " :";
	    Poppler::Document *doc = Poppler::Document::load( path );
	    if (!doc) {
		qWarning() << "doc not loaded";
	    } else {
		int major = 0, minor = 0;
		doc->getPdfVersion( &major, &minor );
		doc->info(QStringLiteral("Title"));
		doc->info(QStringLiteral("Subject"));
		doc->info(QStringLiteral("Author"));
		doc->info(QStringLiteral("Keywords"));
		doc->info(QStringLiteral("Creator"));
		doc->info(QStringLiteral("Producer"));
		doc->date(QStringLiteral("CreationDate")).toString();
		doc->date(QStringLiteral("ModDate")).toString();
		doc->numPages();
		doc->isLinearized();
		doc->isEncrypted();
		doc->okToPrint();
		doc->okToCopy();
		doc->okToChange();
		doc->okToAddNotes();
		doc->pageMode();

		for( int index = 0; index < doc->numPages(); ++index ) {
		    Poppler::Page *page = doc->page( index );
		    page->renderToImage();
		    page->pageSize();
		    page->orientation();
		    delete page;
		    std::cout << ".";
		    std::cout.flush();
		}
		std::cout << std::endl;
		delete doc;
	    }
	}
    }

    std::cout << "Elapsed time: " << (t.elapsed()/1000) << std::endl;

}