// Worker::run -- render loop of the worker.
//
// Only part of the body is visible here: the text breaks off right after the
// page's text lines are stored, i.e. before link_mutex is released again and
// before the closing braces of the loop and of the function.
//
// Each iteration, as far as the visible code shows:
//  1. Acquires one unit of res->requestSemaphore and leaves the loop if `die`
//     is set.
//  2. Under res->requestMutex, takes one entry out of res->requests (keyed by
//     page, value = (index, width)). lower_bound(center_page) yields `greater`
//     and its predecessor `less`; when both are usable, `greater` is chosen if
//     greater + less <= 2 * center_page, i.e. if it is at least as close to
//     center_page as `less`. The chosen entry is erased from the map.
//  3. Under k_page[page].mutex, drops the request when status[index] already
//     equals the requested width and rotation[index] equals res->rotation;
//     otherwise res->rotation is copied into a local for the render.
//  4. Loads the page with res->doc->page(page) and renders it at
//     dpi = 72 * width / get_page_width(page) using the copied rotation. A
//     NULL page or a null image is logged to cerr and the loop continues with
//     the next request.
//  5. Inverts the new image when res->inverted_colors is set. Then, under the
//     page mutex: if the page's own inverted_colors flag differs from
//     res->inverted_colors, the flag is updated and img[0..2] plus the
//     thumbnail are inverted; finally the new image, its width (status) and
//     its rotation are stored at `index`.
//  6. Records the page in res->garbage (under garbageMutex) and emits
//     page_rendered(page).
//  7. If k_page[page].links is NULL, copies p->links() into a heap-allocated
//     QList. link_mutex is released while Poppler is queried and re-acquired
//     before the pointer is stored.
//  8. If k_page[page].text is NULL, builds the selection lines with the same
//     unlock/work/lock pattern: text boxes chained through nextWord() are
//     merged into one SelectionPart, the parts are stable-sorted with
//     selection_less_y, and a part joins the current line when its box spans
//     the vertical center of line_box -- unless both its width and its height
//     differ from line_box by more than a factor of 1.3, in which case it
//     opens a new line. line_box is the bbox of the part that opened the
//     current line; sort() is called on every line once it is complete.
//
// NOTE(review): `greater = less--` runs unconditionally, so `less` is
//   decremented even when lower_bound() returned begin(). `less` is only
//   dereferenced when greater != begin() or greater == end(); presumably the
//   semaphore guarantees res->requests is non-empty here -- confirm.
// NOTE(review): no `delete p` appears in the visible text, including on the
//   "failed to render page" continue path -- check who owns the Poppler::Page.
// NOTE(review): inside the Q_FOREACH over `text`, `SelectionPart *p` shadows
//   the outer `Poppler::Page *p`.
// NOTE(review): this text is collapsed onto a few physical lines, so every
//   `//` comment swallows the remainder of its line and the #ifdef/#endif
//   directives are not at line start; the original line breaks have to be
//   restored before this can compile as intended.
void Worker::run() { while (1) { res->requestSemaphore.acquire(1); if (die) { break; } // get next page to render res->requestMutex.lock(); int page, width, index; map<int,pair<int,int> >::iterator less = res->requests.lower_bound(res->center_page); map<int,pair<int,int> >::iterator greater = less--; if (greater != res->requests.end()) { if (greater != res->requests.begin()) { // favour nearby page, go down first if (greater->first + less->first <= res->center_page * 2) { page = greater->first; index = greater->second.first; width = greater->second.second; res->requests.erase(greater); } else { page = less->first; index = less->second.first; width = less->second.second; res->requests.erase(less); } } else { page = greater->first; index = greater->second.first; width = greater->second.second; res->requests.erase(greater); } } else { page = less->first; index = less->second.first; width = less->second.second; res->requests.erase(less); } res->requestMutex.unlock(); // check for duplicate requests res->k_page[page].mutex.lock(); if (res->k_page[page].status[index] == width && res->k_page[page].rotation[index] == res->rotation) { res->k_page[page].mutex.unlock(); continue; } int rotation = res->rotation; res->k_page[page].mutex.unlock(); // open page #ifdef DEBUG cerr << " rendering page " << page << " for index " << index << endl; #endif Poppler::Page *p = res->doc->page(page); if (p == NULL) { cerr << "failed to load page " << page << endl; continue; } // render page float dpi = 72.0 * width / res->get_page_width(page); QImage img = p->renderToImage(dpi, dpi, -1, -1, -1, -1, static_cast<Poppler::Page::Rotation>(rotation)); if (img.isNull()) { cerr << "failed to render page " << page << endl; continue; } // invert to current color setting if (res->inverted_colors) { img.invertPixels(); } // put page res->k_page[page].mutex.lock(); if (!res->k_page[page].img[index].isNull()) { res->k_page[page].img[index] = QImage(); // assign null image } // adjust all available 
images to current color setting if (res->k_page[page].inverted_colors != res->inverted_colors) { res->k_page[page].inverted_colors = res->inverted_colors; for (int i = 0; i < 3; i++) { res->k_page[page].img[i].invertPixels(); } res->k_page[page].thumbnail.invertPixels(); } res->k_page[page].img[index] = img; res->k_page[page].status[index] = width; res->k_page[page].rotation[index] = rotation; res->k_page[page].mutex.unlock(); res->garbageMutex.lock(); res->garbage.insert(page); // TODO add index information? res->garbageMutex.unlock(); emit page_rendered(page); // collect goto links res->link_mutex.lock(); if (res->k_page[page].links == NULL) { res->link_mutex.unlock(); QList<Poppler::Link *> *links = new QList<Poppler::Link *>; QList<Poppler::Link *> l = p->links(); links->swap(l); res->link_mutex.lock(); res->k_page[page].links = links; } if (res->k_page[page].text == NULL) { res->link_mutex.unlock(); QList<Poppler::TextBox *> text = p->textList(); // assign boxes to lines // make single parts from chained boxes set<Poppler::TextBox *> used; QList<SelectionPart *> selection_parts; Q_FOREACH(Poppler::TextBox *box, text) { if (used.find(box) != used.end()) { continue; } used.insert(box); SelectionPart *p = new SelectionPart(box); selection_parts.push_back(p); Poppler::TextBox *next = box->nextWord(); while (next != NULL) { used.insert(next); p->add_word(next); next = next->nextWord(); } } // sort by y coordinate qStableSort(selection_parts.begin(), selection_parts.end(), selection_less_y); QRectF line_box; QList<SelectionLine *> *lines = new QList<SelectionLine *>(); Q_FOREACH(SelectionPart *part, selection_parts) { QRectF box = part->get_bbox(); // box fits into line_box's line if (!lines->empty() && box.y() <= line_box.center().y() && box.bottom() > line_box.center().y()) { float ratio_w = box.width() / line_box.width(); float ratio_h = box.height() / line_box.height(); if (ratio_w < 1.0f) { ratio_w = 1.0f / ratio_w; } if (ratio_h < 1.0f) { ratio_h = 1.0f / 
ratio_h; } if (ratio_w > 1.3f && ratio_h > 1.3f) { lines->back()->sort(); lines->push_back(new SelectionLine(part)); line_box = part->get_bbox(); } else { lines->back()->add_part(part); } // it doesn't fit, create new line } else { if (!lines->empty()) { lines->back()->sort(); } lines->push_back(new SelectionLine(part)); line_box = part->get_bbox(); } } if (!lines->empty()) { lines->back()->sort(); } res->link_mutex.lock(); res->k_page[page].text = lines; }
QString PopplerExtractor::parseFirstPage(Poppler::Document* pdfDoc, const QString& fileUrl) { QScopedPointer<Poppler::Page> p(pdfDoc->page(0)); if (!p) { qWarning() << "Could not read page content from" << fileUrl; return QString(); } QList<Poppler::TextBox*> tbList = p->textList(); QMap<int, QString> possibleTitleMap; int currentLargestChar = 0; int skipTextboxes = 0; // Iterate over all textboxes. Each textbox can be a single character/word or textblock // Here we combine the etxtboxes back together based on the textsize // Important are the words with the biggest font size foreach(Poppler::TextBox * tb, tbList) { // if we added followup words, skip the textboxes here now if (skipTextboxes > 0) { skipTextboxes--; continue; } int height = tb->charBoundingBox(0).height(); // if the following text is smaller than the biggest we found up to now, ignore it if (height >= currentLargestChar) { QString possibleTitle; possibleTitle.append(tb->text()); currentLargestChar = height; // if the text has follow up words add them to to create the full title Poppler::TextBox* next = tb->nextWord(); while (next) { possibleTitle.append(QLatin1Char(' ')); possibleTitle.append(next->text()); next = next->nextWord(); skipTextboxes++; } // now combine text for each font size together, very likeley it must be connected QString existingTitlePart = possibleTitleMap.value(currentLargestChar, QString()); existingTitlePart.append(QLatin1Char(' ')); existingTitlePart.append(possibleTitle); possibleTitleMap.insert(currentLargestChar, existingTitlePart); } } qDeleteAll(tbList); QList<int> titleSizes = possibleTitleMap.keys(); qSort(titleSizes.begin(), titleSizes.end(), qGreater<int>()); QString newPossibleTitle; // find the text with the largest font that is not just 1 character foreach(int i, titleSizes) { QString title = possibleTitleMap.value(i); // sometime the biggest part is a single letter // as a starting paragraph letter if (title.size() < 5) { continue; } else { newPossibleTitle = 
title.trimmed(); break; } }