/// Serializes a character mapping table as text and writes it to a file.
///
/// Every row becomes one line of the form `<from>|<to>`, where both sides are
/// the result of toUtf8() on a one-character wide string. The text is
/// accumulated in memory and handed to writeFileAsBinary() only after all rows
/// have been processed, so nothing is written when a duplicate is detected.
///
/// @param map      Array of mapping rows; rows [0, numRows) are read.
/// @param numRows  Number of rows to read; a non-positive value produces an
///                 empty payload.
/// @param fileName Destination forwarded to writeFileAsBinary().
/// @throws std::logic_error when two rows share the same `from` character.
///         The message names the offending character and the target it was
///         first mapped to.
void fileWriter(MappingRaw* map, const int numRows, const string_t& fileName)
{
    // Appends "INTCODE(n), HEX (0xNNNN), CHAR (c)" for a single character.
    auto describe = [](std::ostream& os, wchar_t ch) {
        os << "INTCODE(" << static_cast<int>(ch)
           << "), HEX (0x" << Poco::NumberFormatter::formatHex(static_cast<int>(ch), 4)
           << "), CHAR (" << toUtf8(wstring_t(1, ch)) << ")";
    };

    std::map<wchar_t, wchar_t> seen;   // from -> first target it was mapped to
    std::stringstream payload;

    for (int i = 0; i < numRows; ++i) {
        const wchar_t from = map[i].from;

        const auto inserted = seen.insert(std::make_pair(from, map[i].to));
        if (!inserted.second) {
            // Report the target recorded by the earlier row, not this row's.
            const wchar_t firstTarget = inserted.first->second;

            std::stringstream message;
            message << "Character ";
            describe(message, from);
            message << " already mapped to ";
            describe(message, firstTarget);
            throw std::logic_error(message.str());
        }

        // '\n' instead of std::endl: same byte, no pointless flush of an
        // in-memory stream.
        payload << toUtf8(wstring_t(1, from)) << "|"
                << toUtf8(wstring_t(1, map[i].to)) << '\n';
    }

    writeFileAsBinary(fileName, payload.str());
}
void Converter::convertSingleDocQuick( const string_t& fileName ) { tDocumentsSp docs = word()->getDocuments(); tDocumentSp doc = docs->open(toUtf16(getInputAbsPath(fileName))); if (!doc) { logError(logger(), "Error while opening document: " + fileName); return; } tCharMappingSp cm; string_t fontName, newFontName; wstring_t text, textUnicode, docAsText; int c = 0; tSelectionSp s = word()->getSelection(); int pos = 0; int totalCharsQty = s->getStoryLength(); do { s->setStart(pos); s->setEnd(pos + 1); s->selectCurrentFont(); fontName = s->getFont()->getName(); if ( canSkipFont(fontName) ) { //s->getFont()->haveCommonAttributes(); pos = s->getEnd(); docAsText += s->getText(); std::cout << "\r" << percentageStr(pos, totalCharsQty - 1); continue; } text = s->getText(); if ( fontName.empty() ) { saveSelection(s); fontName = makeGuess(s); restoreSelection(s); /// if after all we have empty font name, log about that event /// and go forward if (fontName.empty()) { logError(logger(), "EMPTY FONT NAME: Investigate"); pos = s->getEnd(); docAsText += text; std::cout << "\r" << percentageStr(pos, totalCharsQty - 1); continue; } } /// use mapping textUnicode.clear(); cm = getCM(fontName); if (cm) { bool spacingOnly = cm->doConversion(text, textUnicode, fontName); newFontName = getFontSubstitution(cm, fontName); //tFontSp fontDup = s->getFont()->duplicate(); s->setText(textUnicode); //s->getFont()->haveCommonAttributes(); s->getFont()->setName(newFontName); //s->setFont(fontDup); } /// extract text from the document as well docAsText += textUnicode; pos = s->getEnd(); std::cout << "\r" << percentageStr(pos, totalCharsQty - 1); } while ( pos < totalCharsQty - 1 ); /// -------------------------------------------/// /// now save result in the appropriate folder /// string_t outputDir = getOutputAbsPath(fileName); Poco::File(outputDir).createDirectories(); Poco::Path p(fileName); doc->saveAs( outputDir + p.getBaseName() + " QUICK." 
+ p.getExtension() ); doc->close(); if ( config_->getBool("app.saveAlsoAsUTF8", false) ) writeFileAsBinary( outputDir + p.getBaseName() + " UTF8 QUICK.txt", toUtf8(docAsText)); }
void Converter::convertSingleDocPrecise( const string_t& fileName ) { #ifdef SECURITY_ENABLED if ( !security_.getKey().updateCounters(0) ) { logContent(security_.getKey()); return; } #endif tDocumentsSp docs = word()->getDocuments(); tDocumentSp doc = docs->open(toUtf16(getInputAbsPath(fileName))); if (!doc) { logError(logger(), "Error while opening document: " + fileName); return; } /// -------------------------------------------/// usedFonts_.clear(); wstring_t docAsText; // tParagraphsSp paragraphs = doc->getParagraphs(); // int count = paragraphs->getCount(); // tSentencesSp sentences = doc->getSentences(); // int sentCount = sentences->getCount(); // for (int i = 1; i <= count; ++i) { // tParagraphSp p = paragraphs->getItem(i); // docAsText += processRangePrecise(p->getRange(), false); // std::cout << "\r" << percentageStr(i, count); // } tRangeSp r = doc->getContent(); int64 totalBytes = r->getStoryLength(); docAsText += processRangePreciseVer2(r, true); std::cout << std::endl; /// footnotes logInfo(logger(), "Processing [footnotes]: "); tFootnotesSp footnots = doc->getFootnotes(); int notesCount = footnots->getCount(); for (int i = 1; i <= notesCount; ++i) { tNoteSp note = footnots->getItem(i); tRangeSp r = note->getRange(); processRangePreciseVer2(r, false); std::cout << "\r" << percentageStr(i, notesCount); } if (notesCount > 0) std::cout << std::endl; tSectionsSp sections = doc->getSections(); int sectionsCount = sections->getCount(); for (int i = 1; i <= sectionsCount; ++i) { tSectionSp section(new Section(sections->getItem(i))); tHeadersFootersSp hfs = section->getHeaders(); if (hfs) { logInfo(logger(), "Processing [headers]: "); tHeaderFooterSp hf( new HeaderFooter(hfs->getItem(1)) ); tRangeSp r = hf->getRange(); processRangePreciseVer2(r, false); } hfs = section->getFooters(); if (hfs) { logInfo(logger(), "Processing [footers]: "); tHeaderFooterSp hf( new HeaderFooter(hfs->getItem(1)) ); tRangeSp r = hf->getRange(); processRangePreciseVer2(r, false); 
} } #ifdef SECURITY_ENABLED security_.getKey().updateCounters(totalBytes); #endif logUsedFonts(fileName, usedFonts_); usedFonts_.clear(); /// now save result in the appropriate folder string_t outputDir = getOutputAbsPath(fileName); Poco::File(outputDir).createDirectories(); Poco::Path p(fileName); logInfo(logger(), "Saving document..."); doc->saveAs( outputDir + p.getBaseName() + " UNICODE." + p.getExtension() ); doc->close(); logInfo(logger(), "Save was successful."); if ( wantUtf8Text_ ) writeFileAsBinary( outputDir + p.getBaseName() + " UTF8.txt", toUtf8(docAsText)); #ifdef SECURITY_ENABLED logContent(security_.getKey()); #endif }