int main(int argc, char **argv) { greekAccentsFilter.setOptionValue("Off"); // off = accents off parseParams(argc, argv); // Let's see if we can open our input file FileDesc *fd = FileMgr::getSystemFileMgr()->open(inFile, FileMgr::RDONLY); if (fd->getFd() < 0) { fprintf(stderr, "error: %s: couldn't open input file: %s \n", argv[0], inFile.c_str()); exit(-2); } RawGenBook *book; // Do some initialization stuff if (!augMod) { RawGenBook::createModule(outPath); } book = new RawGenBook(outPath); SWBuf lineBuffer; SWBuf keyBuffer; SWBuf entBuffer; bool more = true; do { more = FileMgr::getLine(fd, lineBuffer)!=0; if (lineBuffer.startsWith("$$$")) { if ((keyBuffer.size()) && (entBuffer.size())) { writeEntry(book, keyBuffer, entBuffer); } keyBuffer = lineBuffer; keyBuffer << 3; keyBuffer.trim(); entBuffer.size(0); } else { if (keyBuffer.size()) { entBuffer += lineBuffer; entBuffer += "\n"; } } } while (more); if ((keyBuffer.size()) && (entBuffer.size())) { writeEntry(book, keyBuffer, entBuffer); } delete book; FileMgr::getSystemFileMgr()->close(fd); return 0; }
/**
 * Strips the "gloss" attribute from <w ...> tags when this filter's option
 * is off. When the option is on, the text is left untouched.
 *
 * The text is rebuilt character by character: everything between '<' and
 * '>' is collected as a tag, tags starting with "w " that carry a gloss
 * attribute are re-serialized without it, and every tag is written back
 * wrapped in '<' and '>'.
 *
 * @param text   buffer to filter; rewritten in place
 * @param key    unused by this filter
 * @param module unused by this filter
 * @return always 0
 */
char OSISGlosses::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	// glosses enabled: nothing to remove
	if (option) {
		return 0;
	}

	const SWBuf source = text;
	SWBuf tag;
	bool insideTag = false;

	text = "";
	for (const char *cur = source.c_str(); *cur; ++cur) {
		if (*cur == '<') {
			insideTag = true;
			tag = "";
		}
		else if (*cur == '>') {
			insideTag = false;
			if (tag.startsWith("w ")) {	// Word
				XMLTag wordTag(tag);
				if (wordTag.getAttribute("gloss")) {
					wordTag.setAttribute("gloss", 0);
					tag = wordTag;
					tag.trim();
					// drop the surrounding <> again; they are re-added below
					tag << 1;
					tag--;
				}
			}
			// the (possibly rewritten) tag always stays in the text
			text.append('<');
			text.append(tag);
			text.append('>');
		}
		else if (insideTag) {
			tag += *cur;
		}
		else {
			text.append(*cur);
		}
	}
	return 0;
}
int main(int argc, char **argv) { // handle options if (argc < 2) usage(*argv); const char *progName = argv[0]; const char *inFileName = argv[1]; SWBuf v11n = "KJV"; SWBuf outPath = "./"; SWBuf locale = "en"; bool fourByteSize = false; bool append = false; int iType = 4; SWBuf cipherKey = ""; SWCompress *compressor = 0; SWBuf compType = ""; for (int i = 2; i < argc; i++) { if (!strcmp(argv[i], "-a")) { append = true; } else if (!strcmp(argv[i], "-z")) { if (fourByteSize) usage(*argv, "Cannot specify both -z and -4"); compType = "ZIP"; if (i+1 < argc && argv[i+1][0] != '-') { switch (argv[++i][0]) { case 'l': compType = "LZSS"; break; case 'z': compType = "ZIP"; break; case 'b': compType = "BZIP2"; break; case 'x': compType = "XZ"; break; } } } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (fourByteSize) usage(*argv, "Cannot specify both -Z and -4"); compType = "LZSS"; } else if (!strcmp(argv[i], "-4")) { fourByteSize = true; } else if (!strcmp(argv[i], "-b")) { if (i+1 < argc) { iType = atoi(argv[++i]); if ((iType >= 2) && (iType <= 4)) continue; } usage(*argv, "-b requires one of <2|3|4>"); } else if (!strcmp(argv[i], "-o")) { if (i+1 < argc) outPath = argv[++i]; else usage(progName, "-o requires <output_path>"); } else if (!strcmp(argv[i], "-v")) { if (i+1 < argc) v11n = argv[++i]; else usage(progName, "-v requires <v11n>"); } else if (!strcmp(argv[i], "-l")) { if (i+1 < argc) locale = argv[++i]; else usage(progName, "-l requires <locale>"); } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); } else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } // ----------------------------------------------------- const VersificationMgr::System *v = VersificationMgr::getSystemVersificationMgr()->getVersificationSystem(v11n); if (!v) std::cout << "Warning: Versification " << v11n << " not found. 
Using KJV versification...\n"; if (compType == "LZSS") { compressor = new LZSSCompress(); } else if (compType == "ZIP") { #ifndef EXCLUDEZLIB compressor = new ZipCompress(); #else usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library"); #endif } else if (compType == "BZIP2") { #ifndef EXCLUDEBZIP2 compressor = new Bzip2Compress(); #else usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library"); #endif } else if (compType == "XZ") { #ifndef EXCLUDEXZ compressor = new XzCompress(); #else usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library"); #endif } // setup module if (!append) { if (compressor) { if (zText::createModule(outPath, iType, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", *argv, outPath.c_str()); exit(-1); } } else { if (!fourByteSize) RawText::createModule(outPath, v11n); else RawText4::createModule(outPath, v11n); } } SWModule *module = 0; if (compressor) { // Create a compressed text module allowing very large entries // Taking defaults except for first, fourth, fifth and last argument module = new zText( outPath, // ipath 0, // iname 0, // idesc iType, // iblockType compressor, // icomp 0, // idisp ENC_UNKNOWN, // enc DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // lang v11n // versification ); } else { module = (!fourByteSize) ? 
(SWModule *)new RawText(outPath, 0, 0, 0, ENC_UNKNOWN, DIRECTION_LTR, FMT_UNKNOWN, 0, v11n) : (SWModule *)new RawText4(outPath, 0, 0, 0, ENC_UNKNOWN, DIRECTION_LTR, FMT_UNKNOWN, 0, v11n); } SWFilter *cipherFilter = 0; if (cipherKey.length()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->addRawFilter(cipherFilter); } // ----------------------------------------------------- // setup locale manager LocaleMgr::getSystemLocaleMgr()->setDefaultLocaleName(locale); // setup module key to allow full range of possible values, and then some VerseKey *vkey = (VerseKey *)module->createKey(); vkey->setIntros(true); vkey->setAutoNormalize(false); vkey->setPersist(true); module->setKey(*vkey); // ----------------------------------------------------- // process input file FileDesc *fd = FileMgr::getSystemFileMgr()->open(inFileName, FileMgr::RDONLY); SWBuf lineBuffer; SWBuf keyBuffer; SWBuf entBuffer; bool more = true; do { more = FileMgr::getLine(fd, lineBuffer)!=0; if (lineBuffer.startsWith("$$$")) { if ((keyBuffer.size()) && (entBuffer.size())) { writeEntry(module, keyBuffer, entBuffer); } keyBuffer = lineBuffer; keyBuffer << 3; keyBuffer.trim(); entBuffer.size(0); } else { if (keyBuffer.size()) { entBuffer += lineBuffer; entBuffer += "\n"; } } } while (more); if ((keyBuffer.size()) && (entBuffer.size())) { writeEntry(module, keyBuffer, entBuffer); } delete module; if (cipherFilter) delete cipherFilter; delete vkey; FileMgr::getSystemFileMgr()->close(fd); return 0; }
/**
 * Removes lemma information from <w ...> tags when this filter's option is
 * off. When the option is on, the text is left untouched.
 *
 * For every tag starting with "w ":
 *   - the full "lemma" attribute is copied to "savlm" if no "savlm"
 *     attribute exists yet
 *   - each space-separated part of "lemma" that has no ':' prefix, or whose
 *     prefix starts with "lemma.", is removed
 *   - the tag is re-serialized from the modified XMLTag
 * All tags are written back to the text wrapped in '<' and '>'.
 *
 * @param text   buffer to filter; rewritten in place
 * @param key    unused by this filter
 * @param module unused by this filter
 * @return always 0
 */
char OSISLemma::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
	// lemmas enabled: nothing to remove
	if (option) {
		return 0;
	}

	const SWBuf source = text;
	SWBuf tag;
	bool insideTag = false;

	text = "";
	for (const char *cur = source.c_str(); *cur; ++cur) {
		if (*cur == '<') {
			insideTag = true;
			tag = "";
		}
		else if (*cur == '>') {
			insideTag = false;
			if (tag.startsWith("w ")) {	// Word
				XMLTag wordTag(tag);

				// always save off lemma if we haven't yet
				if (!wordTag.getAttribute("savlm")) {
					const char *fullLemma = wordTag.getAttribute("lemma");
					if (fullLemma) {
						wordTag.setAttribute("savlm", fullLemma);
					}
				}

				// Walk the lemma parts; after a removal the same index is
				// examined again because the remaining parts move down.
				int remaining = wordTag.getAttributePartCount("lemma", ' ');
				int idx = 0;
				while (idx < remaining) {
					SWBuf part = wordTag.getAttribute("lemma", idx, ' ');
					const char *prefix = part.stripPrefix(':');
					if (!prefix || SWBuf(prefix).startsWith("lemma.")) {
						// remove attribute part
						wordTag.setAttribute("lemma", 0, idx, ' ');
						--remaining;
					}
					else {
						++idx;
					}
				}

				tag = wordTag;
				tag.trim();
				// drop the surrounding <> again; they are re-added below
				tag << 1;
				tag--;
			}
			// the (possibly rewritten) tag always stays in the text
			text.append('<');
			text.append(tag);
			text.append('>');
		}
		else if (insideTag) {
			tag += *cur;
		}
		else {
			text.append(*cur);
		}
	}
	return 0;
}
char OSISWordJS::processText(SWBuf &text, const SWKey *key, const SWModule *module) { if (option) { char token[2112]; // cheese. Fix. int tokpos = 0; bool intoken = false; int wordNum = 1; char wordstr[5]; SWBuf modName = (module)?module->getName():""; // add TR to w src in KJV then remove this next line SWBuf wordSrcPrefix = (modName == "KJV")?SWBuf("TR"):modName; VerseKey *vkey = 0; if (key) { vkey = SWDYNAMIC_CAST(VerseKey, key); } const SWBuf orig = text; const char * from = orig.c_str(); for (text = ""; *from; ++from) { if (*from == '<') { intoken = true; tokpos = 0; token[0] = 0; token[1] = 0; token[2] = 0; continue; } if (*from == '>') { // process tokens intoken = false; if ((*token == 'w') && (token[1] == ' ')) { // Word XMLTag wtag(token); sprintf(wordstr, "%03d", wordNum); SWBuf lemmaClass; SWBuf lemma; SWBuf morph; SWBuf page; SWBuf src; char gh = 0; page = module->getEntryAttributes()["Word"][wordstr]["Page"].c_str(); if (page.length()) page = (SWBuf)"p:" + page; int count = atoi(module->getEntryAttributes()["Word"][wordstr]["PartCount"].c_str()); for (int i = 0; i < count; i++) { // for now, lemma class can just be equal to last lemma class in multi part word SWBuf tmp = "LemmaClass"; if (count > 1) tmp.appendFormatted(".%d", i+1); lemmaClass = module->getEntryAttributes()["Word"][wordstr][tmp]; tmp = "Lemma"; if (count > 1) tmp.appendFormatted(".%d", i+1); tmp = (module->getEntryAttributes()["Word"][wordstr][tmp].c_str()); // if we're strongs, if (lemmaClass == "strong") { gh = tmp[0]; tmp << 1; } if (lemma.size()) lemma += "|"; lemma += tmp; tmp = "Morph"; if (count > 1) tmp.appendFormatted(".%d", i+1); tmp = (module->getEntryAttributes()["Word"][wordstr][tmp].c_str()); if (morph.size()) morph += "|"; morph += tmp; tmp = "Src"; if (count > 1) tmp.appendFormatted(".%d", i+1); tmp = (module->getEntryAttributes()["Word"][wordstr][tmp].c_str()); if (!tmp.length()) tmp.appendFormatted("%d", wordNum); tmp.insert(0, wordSrcPrefix); if (src.size()) src += 
"|"; src += tmp; } SWBuf lexName = ""; // we can pass the real lex name in, but we have some // aliases in the javascript to optimize bandwidth if ((gh == 'G') && (defaultGreekLex)) { lexName = (!strcmp(defaultGreekLex->getName(), "StrongsGreek"))?"G":defaultGreekLex->getName(); } else if ((gh == 'H') && (defaultHebLex)) { lexName = (!strcmp(defaultHebLex->getName(), "StrongsHebrew"))?"H":defaultHebLex->getName(); } SWBuf xlit = wtag.getAttribute("xlit"); if ((lemmaClass != "strong") && (xlit.startsWith("betacode:"))) { lexName = "betacode"; // const char *m = strchr(xlit.c_str(), ':'); // strong = ++m; } SWBuf wordID; if (vkey) { // optimize for bandwidth and use only the verse as the unique entry id wordID.appendFormatted("%d", vkey->getVerse()); } else { wordID = key->getText(); } wordID.appendFormatted("_%s", src.c_str()); // clean up our word ID for XHTML for (unsigned int i = 0; i < wordID.size(); i++) { if ((!isdigit(wordID[i])) && (!isalpha(wordID[i]))) { wordID[i] = '_'; } } // 'p' = 'fillpop' to save bandwidth text.appendFormatted("<span class=\"clk\" onclick=\"p('%s','%s','%s','%s','%s','%s');\" >", lexName.c_str(), lemma.c_str(), wordID.c_str(), morph.c_str(), page.c_str(), modName.c_str()); wordNum++; if (wtag.isEmpty()) { text += "</w></span>"; } } if ((*token == '/') && (token[1] == 'w') && option) { // Word text += "</w></span>"; continue; } // if not a strongs token, keep token in text text.append('<'); text.append(token); text.append('>'); continue; } if (intoken) { if (tokpos < 2045) { token[tokpos++] = *from; token[tokpos+2] = 0; } } else { text.append(*from); } } } return 0; }
char OSISStrongs::processText(SWBuf &text, const SWKey *key, const SWModule *module) { SWBuf token; bool intoken = false; int wordNum = 1; char wordstr[5]; const char *wordStart = 0; SWBuf page = ""; // some modules include <seg> page info, so we add these to the words const SWBuf orig = text; const char * from = orig.c_str(); for (text = ""; *from; ++from) { if (*from == '<') { intoken = true; token = ""; continue; } if (*from == '>') { // process tokens intoken = false; // possible page seg -------------------------------- if (token.startsWith("seg ")) { XMLTag stag(token); SWBuf type = stag.getAttribute("type"); if (type == "page") { SWBuf number = stag.getAttribute("subtype"); if (number.length()) { page = number; } } } // --------------------------------------------------- if (token.startsWith("w ")) { // Word XMLTag wtag(token); if (module->isProcessEntryAttributes()) { wordStart = from+1; char gh = 0; VerseKey *vkey = 0; if (key) { vkey = SWDYNAMIC_CAST(VerseKey, key); } SWBuf lemma = ""; SWBuf morph = ""; SWBuf src = ""; SWBuf morphClass = ""; SWBuf lemmaClass = ""; const char *attrib; sprintf(wordstr, "%03d", wordNum); // why is morph entry attribute processing done in here? Well, it's faster. It makes more local sense to place this code in osismorph. // easier to keep lemma and morph in same wordstr number too maybe. if ((attrib = wtag.getAttribute("morph"))) { int count = wtag.getAttributePartCount("morph", ' '); int i = (count > 1) ? 
0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0 do { SWBuf mClass = ""; SWBuf mp = ""; attrib = wtag.getAttribute("morph", i, ' '); if (i < 0) i = 0; // to handle our -1 condition const char *m = strchr(attrib, ':'); if (m) { int len = m-attrib; mClass.append(attrib, len); attrib += (len+1); } if ((mClass == "x-Robinsons") || (mClass == "x-Robinson") || (mClass == "Robinson")) { mClass = "robinson"; } if (i) { morphClass += " "; morph += " "; } mp += attrib; morphClass += mClass; morph += mp; if (count > 1) { SWBuf tmp; tmp.setFormatted("Morph.%d", i+1); module->getEntryAttributes()["Word"][wordstr][tmp] = mp; tmp.setFormatted("MorphClass.%d", i+1); module->getEntryAttributes()["Word"][wordstr][tmp] = mClass; } } while (++i < count); } if ((attrib = wtag.getAttribute("lemma"))) { int count = wtag.getAttributePartCount("lemma", ' '); int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0 do { gh = 0; SWBuf lClass = ""; SWBuf l = ""; attrib = wtag.getAttribute("lemma", i, ' '); if (i < 0) i = 0; // to handle our -1 condition const char *m = strchr(attrib, ':'); if (m) { int len = m-attrib; lClass.append(attrib, len); attrib += (len+1); } if ((lClass == "x-Strongs") || (lClass == "strong") || (lClass == "Strong")) { if (isdigit(attrib[0])) { if (vkey) { gh = vkey->getTestament() ? 'H' : 'G'; } } else { gh = *attrib; attrib++; } lClass = "strong"; } if (gh) l += gh; l += attrib; if (i) { lemmaClass += " "; lemma += " "; } lemma += l; lemmaClass += lClass; if (count > 1) { SWBuf tmp; tmp.setFormatted("Lemma.%d", i+1); module->getEntryAttributes()["Word"][wordstr][tmp] = l; tmp.setFormatted("LemmaClass.%d", i+1); module->getEntryAttributes()["Word"][wordstr][tmp] = lClass; } } while (++i < count); module->getEntryAttributes()["Word"][wordstr]["PartCount"].setFormatted("%d", count); } if ((attrib = wtag.getAttribute("src"))) { int count = wtag.getAttributePartCount("src", ' '); int i = (count > 1) ? 
0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0 do { SWBuf mp = ""; attrib = wtag.getAttribute("src", i, ' '); if (i < 0) i = 0; // to handle our -1 condition if (i) src += " "; mp += attrib; src += mp; if (count > 1) { SWBuf tmp; tmp.setFormatted("Src.%d", i+1); module->getEntryAttributes()["Word"][wordstr][tmp] = mp; } } while (++i < count); } if (lemma.length()) module->getEntryAttributes()["Word"][wordstr]["Lemma"] = lemma; if (lemmaClass.length()) module->getEntryAttributes()["Word"][wordstr]["LemmaClass"] = lemmaClass; if (morph.length()) module->getEntryAttributes()["Word"][wordstr]["Morph"] = morph; if (morphClass.length()) module->getEntryAttributes()["Word"][wordstr]["MorphClass"] = morphClass; if (src.length()) module->getEntryAttributes()["Word"][wordstr]["Src"] = src; if (page.length()) module->getEntryAttributes()["Word"][wordstr]["Page"] = page; if (wtag.isEmpty()) { int j; for (j = token.length()-1; ((j>0) && (strchr(" /", token[j]))); j--); token.size(j+1); } token += " wn=\""; token += wordstr; token += "\""; if (wtag.isEmpty()) { token += "/"; } wordNum++; } if (!option) { /* * Code which handles multiple lemma types. Kindof works but breaks at least WEBIF filters for strongs. 
* int count = wtag.getAttributePartCount("lemma", ' '); for (int i = 0; i < count; i++) { SWBuf a = wtag.getAttribute("lemma", i, ' '); const char *prefix = a.stripPrefix(':'); if ((prefix) && (!strcmp(prefix, "x-Strongs") || !strcmp(prefix, "strong") || !strcmp(prefix, "Strong"))) { // remove attribute part wtag.setAttribute("lemma", 0, i, ' '); i--; count--; } } * Instead the codee below just removes the lemma attribute *****/ const char *l = wtag.getAttribute("lemma"); if (l) { SWBuf savlm = l; wtag.setAttribute("lemma", 0); wtag.setAttribute("savlm", savlm); token = wtag; token.trim(); // drop <> token << 1; token--; } } } if (token.startsWith("/w")) { // Word End if (module->isProcessEntryAttributes()) { if (wordStart) { SWBuf tmp; tmp.append(wordStart, (from-wordStart)-3); sprintf(wordstr, "%03d", wordNum-1); module->getEntryAttributes()["Word"][wordstr]["Text"] = tmp; } } wordStart = 0; } // keep token in text text.append('<'); text.append(token); text.append('>'); continue; } if (intoken) { token += *from; } else { text.append(*from); } } return 0; }
/**
 * Writes one entry into the module, after optionally transforming its key.
 *
 * Key transformations, driven by file-level settings:
 *   - greekFilter: the key is run through greekAccentsFilter
 *   - toUpper:     the key is upper-cased via StringMgr::upperUTF8
 *   - lexLevels (ICU builds only, skipped for keys starting with "/Intro"):
 *     '/'-separated prefix levels are inserted in front of the key, the
 *     outermost built from the upper-cased first lexLevels characters
 *
 * The final key is echoed to stdout. After setting it, the loop keeps trying
 * "<key> {2}", "<key> {3}", ... until the module key reports
 * KEYERR_OUTOFBOUNDS (presumably meaning no entry exists at that key yet --
 * TODO confirm), then stores the entry text there.
 *
 * @param book      module to write to
 * @param keyBuffer entry key; taken by value because it is modified locally
 * @param entBuffer entry text
 */
void writeEntry(SWModule *book, SWBuf keyBuffer, SWBuf entBuffer) {

	if (greekFilter) {
		greekAccentsFilter.processText(keyBuffer);
	}

	if (toUpper) {
		// grow the buffer first: upper-casing writes in place through the raw pointer
		unsigned size = (keyBuffer.size()+5)*3;
		keyBuffer.setFillByte(0);
		keyBuffer.resize(size);
		StringMgr::getSystemStringMgr()->upperUTF8(keyBuffer.getRawData(), size-2);
	}

// Added for Hesychius, but this stuff should be pushed back into new StringMgr
// functionality
#ifdef _ICU_
//	if (lexLevels) {
	if (lexLevels && !keyBuffer.startsWith("/Intro")) {
		unsigned size = (keyBuffer.size()+(lexLevels*2));
		keyBuffer.setFillByte(0);
		keyBuffer.resize(size);

		UErrorCode err = U_ZERO_ERROR;

		int max = (size+5)*3;
		UChar *ubuffer = new UChar[max+10];
		int32_t len;

		u_strFromUTF8(ubuffer, max+9, &len, keyBuffer.c_str(), -1, &err);
		if (err == U_ZERO_ERROR) {
			// upper-case the first lexLevels characters and put them,
			// followed by '/', in front of the key
			UChar *upper = new UChar[(lexLevels+1)*3];
			memcpy(upper, ubuffer, lexLevels*sizeof(UChar));
			upper[lexLevels] = 0;
			len = u_strToUpper(upper, (lexLevels+1)*3, upper, -1, 0, &err);
			memmove(ubuffer+len+1, ubuffer, (max-len)*sizeof(UChar));
			memcpy(ubuffer, upper, len*sizeof(UChar));
			ubuffer[len] = '/';
			delete [] upper;

			// insert the shorter prefix levels, each followed by '/'
			for (int i = lexLevels-1; i; i--) {
				int shift = (i < len)? i : len;
				memmove(ubuffer+(shift+1), ubuffer, (max-shift)*sizeof(UChar));
				ubuffer[shift] = '/';
			}

			// The destination must really hold `max` bytes: the buffer was
			// only resized to `size` above, which is smaller than the
			// capacity announced to u_strToUTF8.
			keyBuffer.resize(max);
			u_strToUTF8(keyBuffer.getRawData(), max, 0, ubuffer, -1, &err);
		}

/*
		u_strFromUTF8(ubuffer, max+9, &len, keyBuffer.c_str(), -1, &err);
		if (err == U_ZERO_ERROR) {
			int totalShift = 0;
			for (int i = lexLevels; i; i--) {
				int shift = (i < len)? i : len;
				memmove(ubuffer+(shift+1), ubuffer, (max-shift)*sizeof(UChar));
				ubuffer[shift] = '/';
				totalShift += (shift+1);
			}
			UChar *upper = new UChar[(totalShift+1)*3];
			memcpy(upper, ubuffer, totalShift*sizeof(UChar));
			upper[totalShift] = 0;
			len = u_strToUpper(upper, (totalShift+1)*3, upper, -1, 0, &err);
			memmove(ubuffer+len, ubuffer+totalShift, (max-totalShift)*sizeof(UChar));
			memcpy(ubuffer, upper, len*sizeof(UChar));
			delete [] upper;
			u_strToUTF8(keyBuffer.getRawData(), max, 0, ubuffer, -1, &err);
		}
*/

		delete [] ubuffer;
	}
#endif

	std::cout << keyBuffer << std::endl;

	book->setKey(keyBuffer.c_str());

	// check to see if we already have an entry
	for (int i = 2; book->getKey()->popError() != KEYERR_OUTOFBOUNDS; i++) {
		SWBuf key;
		key.setFormatted("%s {%d}", keyBuffer.c_str(), i);
		std::cout << "dup key, trying: " << key << std::endl;
		book->setKey(key.c_str());
	}

	book->setEntry(entBuffer);
}