/**
 * Replaces the UTF-8 content of text with the same text encoded as 16-bit
 * UTF-16 code units, written in the host's native __u16 representation.
 *
 * Code points below 0x10000 are stored as a single unit, everything else as
 * a high/low surrogate pair.  Input for which getUniCharFromUTF8() yields 0
 * is skipped.
 *
 * @param text   buffer that is converted in place
 * @param key    not used by this filter
 * @param module not used by this filter
 * @return always 0
 */
char UTF8UTF16::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	// decode from a private copy so that text itself can be rebuilt from scratch
	SWBuf utf8 = text;
	const unsigned char *cursor = (const unsigned char *)utf8.c_str();

	text = "";
	while (*cursor) {
		const __u32 codePoint = getUniCharFromUTF8(&cursor);
		if (!codePoint) {
			continue;	// invalid char
		}

		if (codePoint >= 0x10000) {
			// outside the BMP: split into a surrogate pair
			const __u16 high = (__s16)((codePoint - 0x10000) / 0x400 + 0xD800);
			const __u16 low  = (__s16)((codePoint - 0x10000) % 0x400 + 0xDC00);

			text.setSize(text.size() + 4);
			__u16 *pair = (__u16 *)(text.getRawData() + (text.size() - 4));
			pair[0] = high;
			pair[1] = low;
		}
		else {
			text.setSize(text.size() + 2);
			__u16 *unit = (__u16 *)(text.getRawData() + (text.size() - 2));
			*unit = (__u16)codePoint;
		}
	}

	// write a 16-bit terminator just past the data, then shrink again so the
	// terminator is not counted in the buffer size
	text.setSize(text.size() + 2);
	__u16 *terminator = (__u16 *)(text.getRawData() + (text.size() - 2));
	*terminator = (__u16)0;
	text.setSize(text.size() - 2);

	return 0;
}
void zStr::getCompressedText(long block, long entry, char **buf) const { __u32 size = 0; if (cacheBlockIndex != block) { __u32 start = 0; zdxfd->seek(block * ZDXENTRYSIZE, SEEK_SET); zdxfd->read(&start, 4); zdxfd->read(&size, 4); start = swordtoarch32(start); size = swordtoarch32(size); SWBuf buf; buf.setSize(size + 5); zdtfd->seek(start, SEEK_SET); zdtfd->read(buf.getRawData(), size); flushCache(); unsigned long len = size; buf.setSize(size); rawZFilter(buf, 0); // 0 = decipher compressor->zBuf(&len, buf.getRawData()); char *rawBuf = compressor->Buf(0, &len); cacheBlock = new EntriesBlock(rawBuf, len); cacheBlockIndex = block; } size = cacheBlock->getEntrySize(entry); *buf = (*buf) ? (char *)realloc(*buf, size*2 + 1) : (char *)malloc(size*2 + 1); strcpy(*buf, cacheBlock->getEntry(entry)); }
void zVerse::flushCache() const { if (dirtyCache) { __u32 idxoff; __u32 start, outstart; __u32 size, outsize; __u32 zsize, outzsize; idxoff = cacheBufIdx * 12; if (cacheBuf) { size = outsize = zsize = outzsize = strlen(cacheBuf); if (size) { // if (compressor) { // delete compressor; // compressor = new LZSSCompress(); // } compressor->Buf(cacheBuf); unsigned long tmpSize; compressor->zBuf(&tmpSize); outzsize = zsize = tmpSize; SWBuf buf; buf.setSize(zsize + 5); memcpy(buf.getRawData(), compressor->zBuf(&tmpSize), tmpSize); outzsize = zsize = tmpSize; buf.setSize(zsize); rawZFilter(buf, 1); // 1 = encipher start = outstart = textfp[cacheTestament-1]->seek(0, SEEK_END); outstart = archtosword32(start); outsize = archtosword32(size); outzsize = archtosword32(zsize); textfp[cacheTestament-1]->write(buf, zsize); idxfp[cacheTestament-1]->seek(idxoff, SEEK_SET); idxfp[cacheTestament-1]->write(&outstart, 4); idxfp[cacheTestament-1]->write(&outzsize, 4); idxfp[cacheTestament-1]->write(&outsize, 4); } free(cacheBuf); cacheBuf = 0; } dirtyCache = false; } }
/**
 * Normalizes text to Unicode Normalization Form C in place.
 *
 * The text is converted to a UnicodeString through conv, normalized with
 * UNORM_NFC and converted back into text.
 *
 * @param text   buffer that is normalized in place
 * @param key    values 0 and 1 mark a decipher/encipher pass, for which the
 *               filter does nothing
 * @param module not used by this filter
 * @return -1 when called for a cipher pass, 0 otherwise
 */
char UTF8NFC::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	err = U_ZERO_ERROR;
	UnicodeString source(text.getRawData(), text.length(), conv, err);
	UnicodeString target;

	err = U_ZERO_ERROR;
	Normalizer::normalize(source, UNORM_NFC, 0, target, err);

	// first guess: twice the original size is enough for almost all text
	err = U_ZERO_ERROR;
	text.setSize(text.size()*2);
	int32_t len = target.extract(text.getRawData(), text.size(), conv, err);

	// extract() reports the length of the complete output.  When that does
	// not fit into the first guess, the buffer only holds part of the result,
	// so grow it to the reported length and convert again.
	if (len > (int32_t)text.size()) {
		err = U_ZERO_ERROR;
		text.setSize(len);
		len = target.extract(text.getRawData(), text.size(), conv, err);
	}

	text.setSize(len);

	return 0;
}
/**
 * Reads size bytes at offset start from a testament's text file into buf.
 *
 * buf is always reset and resized to size + 1 zero-filled bytes first; it is
 * only filled from disk when size is non-zero and the text file has a
 * non-negative file descriptor.
 *
 * @param testmt testament number (1 or 2); 0 selects one automatically
 * @param start  byte offset inside the text file
 * @param size   number of bytes to read
 * @param buf    receives the data
 */
void RawVerse::readText(char testmt, long start, unsigned short size, SWBuf &buf) const {
	buf = "";
	buf.setFillByte(0);
	buf.setSize(size + 1);

	if (!testmt) {
		// NOTE(review): the default is derived from idxfp[1], whereas
		// zVerse::zReadText tests idxfp[0] -- confirm which is intended
		testmt = idxfp[1] ? 1 : 2;
	}

	if (!size) {
		return;
	}

	const int testamentIdx = testmt - 1;
	if (textfp[testamentIdx]->getFd() < 0) {
		return;
	}

	textfp[testamentIdx]->seek(start, SEEK_SET);
	textfp[testamentIdx]->read(buf.getRawData(), (int)size);
}
/**
 * Applies Arabic letter shaping (and European-to-Arabic digit mapping) to
 * text in place.
 *
 * The text is converted to UChars through conv, shaped with u_shapeArabic()
 * and converted back.  When conversion or shaping fails, text is left
 * unchanged.
 *
 * @param text   buffer that is shaped in place
 * @param key    values 0 and 1 mark a decipher/encipher pass, for which the
 *               filter does nothing
 * @param module not used by this filter
 * @return -1 when called for a cipher pass, 0 otherwise
 */
char UTF8arShaping::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	// ICU functions return immediately when handed a failure code, so a
	// failure left over from an earlier call must not leak into this one
	err = U_ZERO_ERROR;

	// one UChar per input byte is always enough for the UTF-16 form
	const int32_t capacity = text.length();
	UChar *ustr = new UChar[capacity];
	UChar *ustr2 = new UChar[capacity];

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t len = ucnv_toUChars(conv, ustr, capacity, text.c_str(), -1, &err);

	// the destination capacity is the allocated size of ustr2, not the
	// length of the source string
	len = u_shapeArabic(ustr, len, ustr2, capacity, U_SHAPE_LETTERS_SHAPE | U_SHAPE_DIGITS_EN2AN, &err);

	// only replace the text when there is a shaped result to write back
	if (U_SUCCESS(err)) {
		text.setSize(text.size()*2);
		len = ucnv_fromUChars(conv, text.getRawData(), text.size(), ustr2, len, &err);
		text.setSize(len);
	}

	delete [] ustr2;
	delete [] ustr;
	return 0;
}
/**
 * Reads the entry text stored at istart, following "@LINK" entries until a
 * real entry is found.
 *
 * @param istart offset of the entry in the data file
 * @param isize  in/out: entry size; incremented for every read and replaced
 *               by findOffset() whenever a link is followed
 * @param idxbuf in/out: any previous buffer is delete[]'d and replaced by a
 *               new char array that receives the key text obtained from
 *               getIDXBufDat() for istart, when there is one
 * @param buf    receives the entry text with the leading key line removed
 */
void RawStr::readText(__u32 istart, __u16 *isize, char **idxbuf, SWBuf &buf) const {
	char *keyText = 0;
	getIDXBufDat(istart, &keyText);

	__u32 offset = istart;

	// keep reading for as long as the entry is a link to another entry
	for (;;) {
		if (*idxbuf)
			delete [] *idxbuf;

		buf = "";
		buf.setFillByte(0);
		buf.setSize(++(*isize));

		*idxbuf = new char [ (*isize) ];

		datfd->seek(offset, SEEK_SET);
		datfd->read(buf.getRawData(), (int)((*isize) - 1));

		// skip over index string: stop just past the first newline, or at
		// the terminating NUL when there is none
		unsigned int pos = 0;
		while (buf[pos]) {
			if (buf[pos++] == 10)
				break;
		}
		buf = SWBuf(buf.c_str() + pos);

		// a regular entry ends the search
		if (strncmp(buf.c_str(), "@LINK", 5))
			break;

		// resolve link: cut the target key off at the newline
		pos = 0;
		while (buf[pos] && buf[pos] != 10)
			pos++;
		if (buf[pos] == 10)
			buf[pos] = 0;

		findOffset(buf.c_str() + 6, &offset, isize);
	}

	if (keyText) {
		// copy as much of the key as fits, leaving room for the terminator
		int keyLen = strlen(keyText);
		const int room = *isize - 1;
		if (keyLen >= room)
			keyLen = room;
		strncpy(*idxbuf, keyText, keyLen);
		(*idxbuf)[keyLen] = 0;
		free(keyText);
	}
}
/**
 * Normalizes text to Unicode Normalization Form KD (compatibility
 * decomposition) in place.
 *
 * The text is converted to UChars through conv, normalized with
 * unorm_normalize() and converted back.  When conversion or normalization
 * fails, text is left unchanged.
 *
 * @param text   buffer that is normalized in place
 * @param key    values 0 and 1 mark a decipher/encipher pass, for which the
 *               filter does nothing
 * @param module not used by this filter
 * @return -1 when called for a cipher pass, 0 otherwise
 */
char UTF8NFKD::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	// ICU functions return immediately when handed a failure code, so a
	// failure left over from an earlier call must not leak into this one
	err = U_ZERO_ERROR;

	int32_t len = 5 + text.length() * 5;
	source = new UChar[len + 1]; //each char could become a surrogate pair

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t ulen = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err);
	target = new UChar[len + 1];

	//compatibility decomposition
	ulen = unorm_normalize(source, ulen, UNORM_NFKD, 0, target, len, &err);

	// only replace the text when there is a normalized result to write back
	if (!U_FAILURE(err)) {
		text.setSize(len);
		len = ucnv_fromUChars(conv, text.getRawData(), len, target, ulen, &err);
		text.setSize(len);
	}

	delete [] source;
	delete [] target;

	return 0;
}
bool TEIHTMLHREF::handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData) { // manually process if it wasn't a simple substitution if (!substituteToken(buf, token)) { MyUserData *u = (MyUserData *)userData; XMLTag tag(token); if (!strcmp(tag.getName(), "p")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { // non-empty start tag buf += "<!P><br />"; } else if (tag.isEndTag()) { // end tag buf += "<!/P><br />"; //userData->supressAdjacentWhitespace = true; } else { // empty paragraph break marker buf += "<!P><br />"; //userData->supressAdjacentWhitespace = true; } } // <hi> else if (!strcmp(tag.getName(), "hi")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { SWBuf rend = tag.getAttribute("rend"); u->lastHi = rend; if (rend == "ital") buf += "<i>"; else if (rend == "bold") buf += "<b>"; else if (rend == "sup") buf += "<small><sup>"; } else if (tag.isEndTag()) { SWBuf rend = u->lastHi; if (rend == "ital") buf += "</i>"; else if (rend == "bold") buf += "</b>"; else if (rend == "sup") buf += "</sup></small>"; } } // <entryFree> else if (!strcmp(tag.getName(), "entryFree")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { SWBuf n = tag.getAttribute("n"); if (n != "") { buf += "<b>"; buf += n; buf += "</b>"; } } } // <sense> else if (!strcmp(tag.getName(), "sense")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { SWBuf n = tag.getAttribute("n"); if (n != "") { buf += "<br /><b>"; buf += n; buf += "</b>"; } } } // <div> else if (!strcmp(tag.getName(), "div")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { buf += "<!P>"; } else if (tag.isEndTag()) { } } // <pos>, <gen>, <case>, <gram>, <number>, <mood>, <pron>, <def> else if (!strcmp(tag.getName(), "pos") || !strcmp(tag.getName(), "gen") || !strcmp(tag.getName(), "case") || !strcmp(tag.getName(), "gram") || !strcmp(tag.getName(), "number") || !strcmp(tag.getName(), "pron") /*|| !strcmp(tag.getName(), "def")*/) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { buf += "<i>"; } else if (tag.isEndTag()) { buf += 
"</i>"; } } // <tr> else if (!strcmp(tag.getName(), "tr")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { buf += "<i>"; } else if (tag.isEndTag()) { buf += "</i>"; } } // orth else if (!strcmp(tag.getName(), "orth")) { if ((!tag.isEndTag()) && (!tag.isEmpty())) { buf += "<b>"; } else if (tag.isEndTag()) { buf += "</b>"; } } // <etym>, <usg> else if (!strcmp(tag.getName(), "etym") || !strcmp(tag.getName(), "usg")) { // do nothing here } else if (!strcmp(tag.getName(), "ref")) { if (!tag.isEndTag()) { u->suspendTextPassThru = true; SWBuf target; SWBuf work; SWBuf ref; int was_osisref = false; if(tag.getAttribute("osisRef")) { target += tag.getAttribute("osisRef"); was_osisref=true; } else if(tag.getAttribute("target")) target += tag.getAttribute("target"); if(target.size()) { const char* the_ref = strchr(target, ':'); if(!the_ref) { // No work ref = target; } else { // Compensate for starting : ref = the_ref + 1; int size = target.size() - ref.size() - 1; work.setSize(size); strncpy(work.getRawData(), target, size); } if(was_osisref) { buf.appendFormatted("<a href=\"passagestudy.jsp?action=showRef&type=scripRef&value=%s&module=%s\">", (ref) ? URL::encode(ref.c_str()).c_str() : "", (work.size()) ? URL::encode(work.c_str()).c_str() : ""); } else { // Dictionary link, or something buf.appendFormatted("<a href=\"sword://%s/%s\">", (work.size()) ? URL::encode(work.c_str()).c_str() : u->version.c_str(), (ref) ? 
URL::encode(ref.c_str()).c_str() : "" ); } } else { //std::cout << "TARGET WASN'T\n"; } } else { buf += u->lastTextNode.c_str(); buf += "</a>"; u->suspendTextPassThru = false; } } // <note> tag else if (!strcmp(tag.getName(), "note")) { if (!tag.isEndTag()) { if (!tag.isEmpty()) { u->suspendTextPassThru = true; } } if (tag.isEndTag()) { SWBuf footnoteNumber = tag.getAttribute("swordFootnote"); buf.appendFormatted("<a href=\"passagestudy.jsp?action=showNote&type=n&value=%s&module=%s&passage=%s\"><small><sup>*n</sup></small></a>", URL::encode(footnoteNumber.c_str()).c_str(), URL::encode(u->version.c_str()).c_str(), URL::encode(u->key->getText()).c_str()); u->suspendTextPassThru = false; } } else { return false; // we still didn't handle token } } return true; }
int main(int argc, char **argv) { SWBuf program = argv[0]; fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]); // Let's test our command line arguments if (argc < 3) { usage(*argv); } // variables for arguments, holding defaults SWBuf path = argv[1]; SWBuf teiDoc = argv[2]; SWBuf compType = ""; SWBuf modDrv = ""; SWBuf recommendedPath = "./modules/lexdict/"; SWBuf cipherKey = ""; SWCompress *compressor = 0; for (int i = 3; i < argc; i++) { if (!strcmp(argv[i], "-z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s"); compType = "ZIP"; modDrv = "zLD"; recommendedPath += "zld/"; } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s"); compType = "LZSS"; recommendedPath += "zld/"; } else if (!strcmp(argv[i], "-s")) { if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z"); if (i+1 < argc) { int size = atoi(argv[++i]); if (size == 2) { modDrv = "RawLD"; recommendedPath += "rawld/"; continue; } if (size == 4) { modDrv = "RawLD4"; recommendedPath += "rawld4/"; continue; } } usage(*argv, "-s requires one of <2|4>"); } else if (!strcmp(argv[i], "-N")) { normalize = false; } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } if (!modDrv.size()) { modDrv = "RawLD4"; recommendedPath += "rawld4/"; } #ifndef _ICU_ if (normalize) { normalize = false; cout << program << " is not compiled with support for ICU. Setting -N flag." 
<< endl; } #endif if (compType == "ZIP") { compressor = new ZipCompress(); } else if (compType = "LZSS") { compressor = new LZSSCompress(); } #ifdef DEBUG // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n"; cout << ""; // exit(-3); #endif SWBuf modName = path; int pathlen = path.length(); char lastChar = path[pathlen - 1]; if (lastChar != '/' && lastChar != '\\') { modName += "/"; } modName += "dict"; SWBuf keyBuf; SWBuf entBuf; SWBuf lineBuf; vector<string> linkBuf; if (modDrv == "zLD") { if (zLD::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new zLD(modName, 0, 0, 30, compressor); } else if (modDrv == "RawLD") { if (RawLD::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new RawLD(modName); } else { if (RawLD4::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new RawLD4(modName); } SWFilter *cipherFilter = 0; if (cipherKey.size()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->AddRawFilter(cipherFilter); } if (!module->isWritable()) { fprintf(stderr, "The module is not writable. 
Writing text to it will not work.\nExiting.\n" ); exit(-1); } // Let's see if we can open our input file ifstream infile(teiDoc); if (infile.fail()) { fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str()); exit(-2); } currentKey = module->CreateKey(); currentKey->Persist(1); module->setKey(*currentKey); (*module) = TOP; SWBuf token; SWBuf text; bool intoken = false; char curChar = '\0'; while (infile.good()) { curChar = infile.get(); // skip the character if it is bad. infile.good() will catch the problem if (curChar == -1) { continue; } if (!intoken && curChar == '<') { intoken = true; token = "<"; continue; } if (intoken && curChar == '>') { intoken = false; token.append('>'); XMLTag *t = new XMLTag(token.c_str()); if (!handleToken(text, t)) { text.append(*t); } delete t; continue; } if (intoken) token.append(curChar); else switch (curChar) { case '>' : text.append(">"); break; case '<' : text.append("<"); break; default : text.append(curChar); break; } } // Force the last entry from the text buffer. //text = ""; //writeEntry(*currentKey, text); delete module; delete currentKey; if (cipherFilter) delete cipherFilter; infile.close(); #ifdef _ICU_ if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted); if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized); #endif /* * Suggested module name detection. * Only used for suggesting a conf. * * Various forms of path. * . and .. - no module name given, use "dict". * Or one of the following where z is the module name * and x may be . or .. 
* z * x/y/z * x/y/z/ * x/y/z/z */ SWBuf suggestedModuleName = path; if (lastChar == '/' || lastChar == '\\') { suggestedModuleName.setSize(--pathlen); } lastChar = suggestedModuleName[pathlen - 1]; if (lastChar == '.') { suggestedModuleName = "???"; } else { /* At this point the suggestion is either * what follows the last / or \ * or the entire string */ const char *m = strrchr(suggestedModuleName.c_str(), '/'); if (!m) { m = strrchr(suggestedModuleName.c_str(), '\\'); } if (m) { suggestedModuleName = m+1; } } recommendedPath += suggestedModuleName; recommendedPath += "/dict"; fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n"); fprintf(stderr, "[%s]\n", suggestedModuleName.c_str()); fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str()); fprintf(stderr, "Description=???\n"); fprintf(stderr, "SourceType=TEI\n"); fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???")); fprintf(stderr, "ModDrv=%s\n", modDrv.c_str()); if (compressor) { fprintf(stderr, "CompressType=%s\n", compType.c_str()); } if (cipherKey.size()) { fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str()); } }
void zVerse::zReadText(char testmt, long start, unsigned short size, unsigned long ulBuffNum, SWBuf &inBuf) const { __u32 ulCompOffset = 0; // compressed buffer start __u32 ulCompSize = 0; // buffer size compressed __u32 ulUnCompSize = 0; // buffer size uncompressed if (!testmt) { testmt = ((idxfp[0]) ? 1:2); } // assert we have and valid file descriptor if (compfp[testmt-1]->getFd() < 1) return; if (size && !(((long) ulBuffNum == cacheBufIdx) && (testmt == cacheTestament) && (cacheBuf))) { //fprintf(stderr, "Got buffer number{%ld} versestart{%ld} versesize{%d}\n", ulBuffNum, ulVerseStart, usVerseSize); if (idxfp[testmt-1]->seek(ulBuffNum*12, SEEK_SET)!=(long) ulBuffNum*12) { fprintf(stderr, "Error seeking compressed file index\n"); return; } if (idxfp[testmt-1]->read(&ulCompOffset, 4)<4) { fprintf(stderr, "Error reading ulCompOffset\n"); return; } if (idxfp[testmt-1]->read(&ulCompSize, 4)<4) { fprintf(stderr, "Error reading ulCompSize\n"); return; } if (idxfp[testmt-1]->read(&ulUnCompSize, 4)<4) { fprintf(stderr, "Error reading ulUnCompSize\n"); return; } ulCompOffset = swordtoarch32(ulCompOffset); ulCompSize = swordtoarch32(ulCompSize); ulUnCompSize = swordtoarch32(ulUnCompSize); if (textfp[testmt-1]->seek(ulCompOffset, SEEK_SET)!=(long)ulCompOffset) { fprintf(stderr, "Error: could not seek to right place in compressed text\n"); return; } SWBuf pcCompText; pcCompText.setSize(ulCompSize+5); if (textfp[testmt-1]->read(pcCompText.getRawData(), ulCompSize)<(long)ulCompSize) { fprintf(stderr, "Error reading compressed text\n"); return; } pcCompText.setSize(ulCompSize); rawZFilter(pcCompText, 0); // 0 = decipher unsigned long bufSize = ulCompSize; compressor->zBuf(&bufSize, pcCompText.getRawData()); if (cacheBuf) { flushCache(); free(cacheBuf); } unsigned long len = 0; compressor->Buf(0, &len); cacheBuf = (char *)calloc(len + 1, 1); memcpy(cacheBuf, compressor->Buf(), len); cacheBufSize = strlen(cacheBuf); // TODO: can we just use len? 
cacheTestament = testmt; cacheBufIdx = ulBuffNum; } inBuf = ""; if ((size > 0) && cacheBuf && ((unsigned)start < cacheBufSize)) { inBuf.setFillByte(0); inBuf.setSize(size+1); strncpy(inBuf.getRawData(), &(cacheBuf[start]), size); inBuf.setSize(strlen(inBuf.c_str())); } }
virtual void logMessage(const char *message, int level) const { SWBuf msg = message; if (msg.size() > 512) msg.setSize(512); __android_log_write(levelMapping[level], "libsword.so", msg.c_str()); }
void zStr::flushCache() const { static const char nl[] = {13, 10}; if (cacheBlock) { if (cacheDirty) { __u32 start = 0; unsigned long size = 0; __u32 outstart = 0, outsize = 0; const char *rawBuf = cacheBlock->getRawData(&size); compressor->Buf(rawBuf, &size); compressor->zBuf(&size); SWBuf buf; buf.setSize(size + 5); memcpy(buf.getRawData(), compressor->zBuf(&size), size); // 1 = encipher buf.setSize(size); rawZFilter(buf, 1); // 1 = encipher long zdxSize = zdxfd->seek(0, SEEK_END); unsigned long zdtSize = zdtfd->seek(0, SEEK_END); if ((cacheBlockIndex * ZDXENTRYSIZE) > (zdxSize - ZDXENTRYSIZE)) { // New Block start = zdtSize; } else { zdxfd->seek(cacheBlockIndex * ZDXENTRYSIZE, SEEK_SET); zdxfd->read(&start, 4); zdxfd->read(&outsize, 4); start = swordtoarch32(start); outsize = swordtoarch32(outsize); if (start + outsize >= zdtSize) { // last entry, just overwrite // start is already set } else if (size < outsize) { // middle entry, but smaller, that's fine and let's preserve bigger size size = outsize; } else { // middle and bigger-- we have serious problems, for now let's put it at the end = lots of wasted space start = zdtSize; } } outstart = archtosword32(start); outsize = archtosword32((__u32)size); zdxfd->seek(cacheBlockIndex * ZDXENTRYSIZE, SEEK_SET); zdtfd->seek(start, SEEK_SET); zdtfd->write(buf, size); // add a new line to make data file easier to read in an editor zdtfd->write(&nl, 2); zdxfd->write(&outstart, 4); zdxfd->write(&outsize, 4); } delete cacheBlock; cacheBlock = 0; } cacheBlockIndex = -1; cacheDirty = false; }