/*
 * usGetNextChar - fetch the next character of the given block list
 *
 * The list ID selects both the read state and the list anchor that are
 * handed to usGetNextByte. The low byte is read first; when the current
 * block is marked as using Unicode a second byte is read and used as the
 * high byte. Only the first read is asked to fill in *pulFileOffset,
 * *pulCharPos and *pusPropMod; the second one gets NULL pointers.
 *
 * Returns: (USHORT)EOF for an unknown list ID or when either byte could
 *          not be read, otherwise the character
 */
static USHORT
usGetNextChar(FILE *pFile, list_id_enum eListID,
	ULONG *pulFileOffset, ULONG *pulCharPos, USHORT *pusPropMod)
{
	readinfo_type	*pState;
	list_mem_type	*pListHead;
	USHORT	usLow, usHigh;

	/* Footnotes and headers/footers have a read state of their own */
	switch (eListID) {
	case footnote_list:
		pState = &tFootnote;
		pListHead = pFootnoteAnchor;
		break;
	case hdrftr_list:
		pState = &tHdrFtr;
		pListHead = pHdrFtrAnchor;
		break;
	case text_list:
		pState = &tOthers;
		pListHead = pTextAnchor;
		break;
	case endnote_list:
		pState = &tOthers;
		pListHead = pEndnoteAnchor;
		break;
	case textbox_list:
		pState = &tOthers;
		pListHead = pTextBoxAnchor;
		break;
	case hdrtextbox_list:
		pState = &tOthers;
		pListHead = pHdrTextBoxAnchor;
		break;
	default:
		DBG_DEC(eListID);
		return (USHORT)EOF;
	}

	/* Low byte; this read also reports the position information */
	usLow = usGetNextByte(pFile, pState, pListHead,
				pulFileOffset, pulCharPos, pusPropMod);
	if (usLow == (USHORT)EOF) {
		return (USHORT)EOF;
	}

	fail(pState->pBlockCurrent == NULL);

	if (!pState->pBlockCurrent->tInfo.bUsesUnicode) {
		/* Single byte character, the high byte is zero */
		return usLow;
	}

	/* High byte of a two byte character */
	usHigh = usGetNextByte(pFile, pState, pListHead, NULL, NULL, NULL);
	if (usHigh == (USHORT)EOF) {
		DBG_MSG("usGetNextChar: Unexpected EOF");
		DBG_HEX_C(pulFileOffset != NULL, *pulFileOffset);
		DBG_HEX_C(pulCharPos != NULL, *pulCharPos);
		return (USHORT)EOF;
	}
	return (USHORT)((usHigh << 8) | usLow);
} /* end of usGetNextChar */
/* * vAdd2RowInfoList - Add an element to the Row Information List */ void vAdd2RowInfoList(const row_block_type *pRowBlock) { row_desc_type *pListMember; short *psTmp; int iIndex; fail(pRowBlock == NULL); if (pRowBlock->ulFileOffsetStart == FC_INVALID || pRowBlock->ulFileOffsetEnd == FC_INVALID || pRowBlock->ulFileOffsetStart == pRowBlock->ulFileOffsetEnd) { DBG_HEX_C(pRowBlock->ulFileOffsetStart != FC_INVALID, pRowBlock->ulFileOffsetStart); DBG_HEX_C(pRowBlock->ulFileOffsetEnd != FC_INVALID, pRowBlock->ulFileOffsetEnd); return; } NO_DBG_HEX(pRowBlock->ulFileOffsetStart); NO_DBG_HEX(pRowBlock->ulFileOffsetEnd); NO_DBG_DEC(pRowBlock->ucNumberOfColumns); /* Create the new list member */ pListMember = xmalloc(sizeof(row_desc_type)); /* Fill the new list member */ pListMember->tInfo = *pRowBlock; pListMember->pNext = NULL; /* Correct the values where needed */ for (iIndex = 0, psTmp = pListMember->tInfo.asColumnWidth; iIndex < (int)pListMember->tInfo.ucNumberOfColumns; iIndex++, psTmp++) { if (*psTmp < 0) { *psTmp = 0; DBG_MSG("The column width was negative"); } } /* Add the new member to the list */ if (pAnchor == NULL) { pAnchor = pListMember; pRowCurrent = pListMember; } else { fail(pRowLast == NULL); pRowLast->pNext = pListMember; } pRowLast = pListMember; } /* end of vAdd2RowInfoList */
/*
 * Translate a data position to an offset in the file.
 * Logical to physical offset.
 *
 * The list that starts at pAnchor is searched front to back for the first
 * block whose range [ulDataPos, ulDataPos + ulLength) contains the given
 * position.
 *
 * Returns:	FC_INVALID: in case of error (no block contains the position)
 *		otherwise: the computed file offset
 */
ULONG
ulDataPos2FileOffset(ULONG ulDataPos)
{
	data_mem_type	*pCurr;
	ULONG	ulOffsetInBlock;

	fail(ulDataPos == CP_INVALID);

	for (pCurr = pAnchor; pCurr != NULL; pCurr = pCurr->pNext) {
		if (ulDataPos < pCurr->tInfo.ulDataPos) {
			/* The data offset lies before this block */
			continue;
		}
		/*
		 * Compare the distance from the block start with the block
		 * length instead of computing "start + length": that sum
		 * can wrap around for values taken from a damaged file and
		 * would then wrongly exclude positions inside the block.
		 */
		ulOffsetInBlock = ulDataPos - pCurr->tInfo.ulDataPos;
		if (ulOffsetInBlock >= pCurr->tInfo.ulLength) {
			/* The data offset lies beyond this block */
			continue;
		}
		/* The data offset is in the current block */
		return pCurr->tInfo.ulFileOffset + ulOffsetInBlock;
	}
	/* Passed beyond the end of the list */
	DBG_HEX_C(ulDataPos != 0, ulDataPos);
	return FC_INVALID;
} /* end of ulDataPos2FileOffset */
/*
 * ulTranslateCharacters - Translate characters to local representation
 *
 * Translate all characters to local representation
 *
 * The translation is a fixed sequence of steps; each step either returns
 * a result or passes the (possibly modified) character on to the next:
 *  1. map 0x80..0x9f (and, for Word versions below 8, 0xa0..0xff) through
 *     the selected character set table
 *  2. map 0xf020..0xf0ff through usPrivateArea
 *  3. handle the characters with a special meaning in Word
 *  4. unless the encoding is UTF-8, fold 0xff01..0xff5e down by 0xfee0
 *  5. PostScript/PDF specific codes for Latin-1 and PDF specific codes
 *     for Latin-1/Latin-2
 *  6. plain US ASCII, control characters are ignored
 *  7. UTF-8: return the character unchanged
 *  8. look the character up with pGetCharTableRecord
 *  9. replace a number of "fancy" characters by simple ones
 * 10. otherwise 0xa0..0xff is returned unchanged, anything else becomes
 *     0x3f
 *
 * usChar:		the character to translate
 * ulFileOffset:	passed to eGetNotetype for footnote/endnote marks and
 *			tested against zero when translating an ellipsis
 * iWordVersion:	0 selects the usCp850 table, values below 8 enable
 *			the table translation of 0xa0..0xff
 * eConversionType:	selects the PostScript/PDF specific codes and is
 *			passed to ucGetBulletCharacter
 * eEncoding:		selects the Windows table and several of the steps
 * bUseMacCharSet:	when TRUE the usMacRoman table is used
 *
 * returns the translated character, IGNORE_CHARACTER when the character
 * must be dropped
 */
ULONG
ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
	conversion_type eConversionType, encoding_type eEncoding,
	BOOL bUseMacCharSet)
{
	const char_table_type	*pTmp;
	const USHORT	*usCharSet;

	/* Select the table for the characters 0x80 and up */
	usCharSet = NULL;
	if (bUseMacCharSet) {
		/* Macintosh character set */
		usCharSet = usMacRoman;
	} else if (iWordVersion == 0) {
		/* DOS character set */
		usCharSet = usCp850;
	} else {
		/* Windows character set */
		switch (eEncoding) {
		case encoding_latin_2:
			usCharSet = usCp1250;
			break;
		case encoding_cyrillic:
			usCharSet = usCp1251;
			break;
		case encoding_latin_1:
		default:
			usCharSet = usCp1252;
			break;
		}
	}
	fail(usCharSet == NULL);
	/* Both branches index the table relative to 0x80 */
	if (usChar >= 0x80 && usChar <= 0x9f) {
		/* Translate implementation defined characters */
		usChar = usCharSet[usChar - 0x80];
	} else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
		/* Translate old character set to Unicode */
		usChar = usCharSet[usChar - 0x80];
	}

	/* Microsoft Unicode to real Unicode */
	if (usChar >= 0xf020 && usChar <= 0xf0ff) {
		DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
		usChar = usPrivateArea[usChar - 0xf020];
	}

	/* Characters with a special meaning in Word */
	switch (usChar) {
	case IGNORE_CHARACTER:
	case FOOTNOTE_SEPARATOR:
	case FOOTNOTE_CONTINUATION:
	case ANNOTATION:
	case FRAME:
	case LINE_FEED:
	case WORD_SOFT_HYPHEN:
	case UNICODE_HYPHENATION_POINT:
		return IGNORE_CHARACTER;
	case PICTURE:
	case TABLE_SEPARATOR:
	case TAB:
	case HARD_RETURN:
	case PAGE_BREAK:
	case PAR_END:
	case COLUMN_FEED:
		/* These are passed on unchanged */
		return (ULONG)usChar;
	case FOOTNOTE_OR_ENDNOTE:
		/* The type of note is looked up by file offset */
		NO_DBG_HEX(ulFileOffset);
		switch (eGetNotetype(ulFileOffset)) {
		case notetype_is_footnote:
			return FOOTNOTE_CHAR;
		case notetype_is_endnote:
			return ENDNOTE_CHAR;
		default:
			return UNKNOWN_NOTE_CHAR;
		}
	case WORD_UNBREAKABLE_JOIN:
		return (ULONG)OUR_UNBREAKABLE_JOIN;
	default:
		break;
	}

	if (eEncoding != encoding_utf_8) {
		/* Latin characters in an oriental text */
		if (usChar >= 0xff01 && usChar <= 0xff5e) {
			/* Maps 0xff01..0xff5e onto 0x21..0x7e */
			usChar -= 0xfee0;
		}
	}

	if (eEncoding == encoding_latin_1 &&
	    (eConversionType == conversion_ps ||
	     eConversionType == conversion_pdf)) {
		/* Ugly, but it makes the PostScript and PDF look better */
		/* The codes returned here are 140..159 (0x8c..0x9f) */
		switch (usChar) {
		case UNICODE_ELLIPSIS:
			return 140;
		case UNICODE_TRADEMARK_SIGN:
			return 141;
		case UNICODE_PER_MILLE_SIGN:
			return 142;
		case UNICODE_BULLET:
		case UNICODE_BULLET_OPERATOR:
		case UNICODE_BLACK_CLUB_SUIT:
			return 143;
		case UNICODE_LEFT_SINGLE_QMARK:
			return 144;
		case UNICODE_RIGHT_SINGLE_QMARK:
			return 145;
		case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
			return 146;
		case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
			return 147;
		case UNICODE_LEFT_DOUBLE_QMARK:
			return 148;
		case UNICODE_RIGHT_DOUBLE_QMARK:
			return 149;
		case UNICODE_DOUBLE_LOW_9_QMARK:
			return 150;
		case UNICODE_EN_DASH:
			return 151;
		case UNICODE_EM_DASH:
			return 152;
		case UNICODE_MINUS_SIGN:
			return 153;
		case UNICODE_CAPITAL_LIGATURE_OE:
			return 154;
		case UNICODE_SMALL_LIGATURE_OE:
			return 155;
		case UNICODE_DAGGER:
			return 156;
		case UNICODE_DOUBLE_DAGGER:
			return 157;
		case UNICODE_SMALL_LIGATURE_FI:
			return 158;
		case UNICODE_SMALL_LIGATURE_FL:
			return 159;
		default:
			break;
		}
	}

	/* PDF specific exceptions, per encoding */
	if (eConversionType == conversion_pdf) {
		if (eEncoding == encoding_latin_1) {
			switch (usChar) {
			case UNICODE_EURO_SIGN:
				return 128;
			default:
				break;
			}
		} else if (eEncoding == encoding_latin_2) {
			switch (usChar) {
			case UNICODE_CAPITAL_D_WITH_STROKE:
			case UNICODE_SMALL_D_WITH_STROKE:
				/* 0x3f is a question mark */
				return 0x3f;
			default:
				break;
			}
		}
	}

	if (usChar < 0x80) {
		/* US ASCII */
		if (usChar < 0x20 || usChar == 0x7f) {
			/* Ignore control characters */
			DBG_HEX(usChar);
			DBG_FIXME();
			return IGNORE_CHARACTER;
		}
		return (ULONG)usChar;
	}

	if (eEncoding == encoding_utf_8) {
		/* No need to convert Unicode characters */
		return (ULONG)usChar;
	}

	/* Unicode to local representation */
	pTmp = pGetCharTableRecord(usChar);
	if (pTmp != NULL) {
		DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
		return (ULONG)pTmp->ucLocal;
	}

	/* Fancy characters to simple US ASCII */
	/* Only reached when the table lookup above found no record */
	switch (usChar) {
	case UNICODE_SMALL_F_HOOK:
		return (ULONG)'f';
	case UNICODE_GREEK_CAPITAL_CHI:
		return (ULONG)'X';
	case UNICODE_GREEK_SMALL_UPSILON:
		return (ULONG)'v';
	case UNICODE_MODIFIER_CIRCUMFLEX:
	case UNICODE_UPWARDS_ARROW:
		return (ULONG)'^';
	case UNICODE_SMALL_TILDE:
	case UNICODE_TILDE_OPERATOR:
		return (ULONG)'~';
	case UNICODE_EN_QUAD:
	case UNICODE_EM_QUAD:
	case UNICODE_EN_SPACE:
	case UNICODE_EM_SPACE:
	case UNICODE_THREE_PER_EM_SPACE:
	case UNICODE_FOUR_PER_EM_SPACE:
	case UNICODE_SIX_PER_EM_SPACE:
	case UNICODE_FIGURE_SPACE:
	case UNICODE_PUNCTUATION_SPACE:
	case UNICODE_THIN_SPACE:
	case UNICODE_NARROW_NO_BREAK_SPACE:
	case UNICODE_LIGHT_SHADE:
	case UNICODE_MEDIUM_SHADE:
	case UNICODE_DARK_SHADE:
		return (ULONG)' ';
	case UNICODE_LEFT_DOUBLE_QMARK:
	case UNICODE_RIGHT_DOUBLE_QMARK:
	case UNICODE_DOUBLE_LOW_9_QMARK:
	case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
	case UNICODE_DOUBLE_PRIME:
		return (ULONG)'"';
	case UNICODE_LEFT_SINGLE_QMARK:
	case UNICODE_RIGHT_SINGLE_QMARK:
	case UNICODE_SINGLE_LOW_9_QMARK:
	case UNICODE_SINGLE_HIGH_REV_9_QMARK:
	case UNICODE_PRIME:
		return (ULONG)'\'';
	case UNICODE_HYPHEN:
	case UNICODE_NON_BREAKING_HYPHEN:
	case UNICODE_FIGURE_DASH:
	case UNICODE_EN_DASH:
	case UNICODE_EM_DASH:
	case UNICODE_HORIZONTAL_BAR:
	case UNICODE_MINUS_SIGN:
	case UNICODE_BD_LIGHT_HORIZONTAL:
	case UNICODE_BD_DOUBLE_HORIZONTAL:
		return (ULONG)'-';
	case UNICODE_DOUBLE_VERTICAL_LINE:
	case UNICODE_BD_LIGHT_VERTICAL:
	case UNICODE_BD_DOUBLE_VERTICAL:
		return (ULONG)'|';
	case UNICODE_DOUBLE_LOW_LINE:
		return (ULONG)'_';
	case UNICODE_DAGGER:
		return (ULONG)'+';
	case UNICODE_DOUBLE_DAGGER:
		return (ULONG)'#';
	case UNICODE_BULLET:
	case UNICODE_BULLET_OPERATOR:
	case UNICODE_BLACK_CLUB_SUIT:
		return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
	case UNICODE_ONE_DOT_LEADER:
	case UNICODE_TWO_DOT_LEADER:
		return (ULONG)'.';
	case UNICODE_ELLIPSIS:
#if defined(__riscos)
		return (ULONG)OUR_ELLIPSIS;
#else
		/*
		 * With a file offset of zero our own ellipsis is returned,
		 * otherwise the Unicode value is handed back untranslated.
		 * NOTE(review): the meaning of a zero file offset is not
		 * visible here - confirm against the callers.
		 */
		if (ulFileOffset == 0) {
			return (ULONG)OUR_ELLIPSIS;
		}
		return UNICODE_ELLIPSIS;
#endif /* __riscos */
	case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
	case UNICODE_TRIANGULAR_BULLET:
	case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
	case UNICODE_LEFTWARDS_ARROW:
		return (ULONG)'<';
	case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
	case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
	case UNICODE_RIGHTWARDS_ARROW:
		return (ULONG)'>';
	case UNICODE_UNDERTIE:
		return (ULONG)'-';
	case UNICODE_N_ARY_SUMMATION:
		return (ULONG)'S';
	case UNICODE_EURO_SIGN:
		return (ULONG)'E';
	case UNICODE_CIRCLE:
	case UNICODE_SQUARE:
		return (ULONG)'O';
	case UNICODE_DIAMOND:
		return (ULONG)OUR_DIAMOND;
	case UNICODE_NUMERO_SIGN:
		return (ULONG)'N';
	case UNICODE_KELVIN_SIGN:
		return (ULONG)'K';
	case UNICODE_DOWNWARDS_ARROW:
		return (ULONG)'v';
	case UNICODE_FRACTION_SLASH:
	case UNICODE_DIVISION_SLASH:
		return (ULONG)'/';
	case UNICODE_ASTERISK_OPERATOR:
		return (ULONG)'*';
	case UNICODE_RATIO:
		return (ULONG)':';
	case UNICODE_BD_LIGHT_DOWN_RIGHT:
	case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
	case UNICODE_BD_LIGHT_UP_AND_RIGHT:
	case UNICODE_BD_LIGHT_UP_AND_LEFT:
	case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
	case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
	case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
	case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
	case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
	case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
	case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
	case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
	case UNICODE_BD_DOUBLE_UP_AND_LEFT:
	case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
	case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
	case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
	case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
	case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
	case UNICODE_BLACK_SQUARE:
		return (ULONG)'+';
	case UNICODE_HAIR_SPACE:
	case UNICODE_ZERO_WIDTH_SPACE:
	case UNICODE_ZERO_WIDTH_NON_JOINER:
	case UNICODE_ZERO_WIDTH_JOINER:
	case UNICODE_LEFT_TO_RIGHT_MARK:
	case UNICODE_RIGHT_TO_LEFT_MARK:
	case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
	case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
	case UNICODE_POP_DIRECTIONAL_FORMATTING:
	case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
	case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
	case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
		return IGNORE_CHARACTER;
	default:
		break;
	}

	if (usChar == UNICODE_TRADEMARK_SIGN) {
		/*
		 * No local representation, it doesn't look like anything in
		 * US-ASCII and a question mark does more harm than good.
		 */
		return IGNORE_CHARACTER;
	}

	if (usChar >= 0xa0 && usChar <= 0xff) {
		/* Before Word 97, Word didn't use Unicode */
		return (ULONG)usChar;
	}

	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
	DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");

	/* Untranslated Unicode character */
	/* 0x3f is a question mark */
	return 0x3f;
} /* end of ulTranslateCharacters */
/*
 * Build the lists with Paragraph Information for Word 6/7 files
 *
 * The table at header offset 0xc0 (length at 0xc4) is read and the page
 * numbers in it are collected in ausParfPage. When the header field at
 * 0x190 announces more pages than the table holds, the list is extended
 * with consecutive page numbers that follow the one stored at 0x18c.
 * Every listed page is then read as one block of BIG_BLOCK_SIZE bytes and
 * each run on it is added to the Style Information List; completed table
 * rows are added to the Row Information List.
 *
 * pFile:		the file to read from
 * ulStartBlock:	start block passed on to bReadBuffer
 * aulBBD, tBBDLen:	block depot and its length, passed on to bReadBuffer
 * aucHeader:		the document header
 *
 * All page contents come from the file and are not trusted: offsets that
 * point outside the page are rejected instead of being dereferenced.
 */
void
vGet6PapInfo(FILE *pFile, ULONG ulStartBlock,
	const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
{
	row_block_type		tRow;
	style_block_type	tStyle;
	USHORT	*ausParfPage;
	UCHAR	*aucBuffer;
	ULONG	ulCharPos, ulCharPosFirst, ulCharPosLast;
	ULONG	ulBeginParfInfo;
	size_t	tParfInfoLen, tParfPageNum, tOffset, tSize, tLenOld, tLen;
	size_t	tIndex, tIndex2, tRun, tBxOffset;
	int	iFodo, iLen;
	row_info_enum	eRowInfo;
	USHORT	usParfFirstPage, usCount, usIstd;
	UCHAR	aucFpage[BIG_BLOCK_SIZE];

	fail(pFile == NULL || aucHeader == NULL);
	fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
	fail(aulBBD == NULL);

	ulBeginParfInfo = ulGetLong(0xc0, aucHeader); /* fcPlcfbtePapx */
	NO_DBG_HEX(ulBeginParfInfo);
	tParfInfoLen = (size_t)ulGetLong(0xc4, aucHeader); /* lcbPlcfbtePapx */
	NO_DBG_DEC(tParfInfoLen);
	if (tParfInfoLen < 4) {
		DBG_DEC(tParfInfoLen);
		return;
	}

	aucBuffer = xmalloc(tParfInfoLen);
	if (!bReadBuffer(pFile, ulStartBlock,
			aulBBD, tBBDLen, BIG_BLOCK_SIZE,
			aucBuffer, ulBeginParfInfo, tParfInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tParfInfoLen);

	/*
	 * The table holds tLen + 1 four-byte values followed by tLen
	 * two-byte page numbers, so tLen * 6 + 4 bytes in total.
	 */
	tLen = (tParfInfoLen - 4) / 6;
	ausParfPage = xcalloc(tLen, sizeof(USHORT));
	for (tIndex = 0, tOffset = (tLen + 1) * 4;
	     tIndex < tLen;
	     tIndex++, tOffset += 2) {
		ausParfPage[tIndex] = usGetWord(tOffset, aucBuffer);
		NO_DBG_DEC(ausParfPage[tIndex]);
	}
	DBG_HEX(ulGetLong(0, aucBuffer));
	aucBuffer = xfree(aucBuffer);
	tParfPageNum = (size_t)usGetWord(0x190, aucHeader); /* cpnBtePap */
	DBG_DEC(tParfPageNum);
	if (tLen < tParfPageNum) {
		/* Replace ParfPage by a longer version */
		tLenOld = tLen;
		usParfFirstPage = usGetWord(0x18c, aucHeader); /* pnPapFirst */
		DBG_DEC(usParfFirstPage);
		tLen += tParfPageNum - 1;
		tSize = tLen * sizeof(USHORT);
		ausParfPage = xrealloc(ausParfPage, tSize);
		/* Add new values */
		usCount = usParfFirstPage + 1;
		for (tIndex = tLenOld; tIndex < tLen; tIndex++) {
			ausParfPage[tIndex] = usCount;
			NO_DBG_DEC(ausParfPage[tIndex]);
			usCount++;
		}
	}

	(void)memset(&tRow, 0, sizeof(tRow));
	ulCharPosFirst = CP_INVALID;
	for (tIndex = 0; tIndex < tLen; tIndex++) {
		if (!bReadBuffer(pFile, ulStartBlock,
				aulBBD, tBBDLen, BIG_BLOCK_SIZE,
				aucFpage,
				(ULONG)ausParfPage[tIndex] * BIG_BLOCK_SIZE,
				BIG_BLOCK_SIZE)) {
			break;
		}
		/* The last byte of the page holds the number of runs */
		tRun = (size_t)ucGetByte(0x1ff, aucFpage);
		NO_DBG_DEC(tRun);
		for (tIndex2 = 0; tIndex2 < tRun; tIndex2++) {
			/*
			 * The run count comes from the file; a bogus count
			 * would place this entry outside the page. The
			 * offset only grows with tIndex2, so the rest of
			 * this page can be skipped as well. An entry that
			 * passes this test also has its character position
			 * (at tIndex2 * 4) inside the page.
			 */
			tBxOffset = (tRun + 1) * 4 + tIndex2 * 7;
			if (tBxOffset >= sizeof(aucFpage)) {
				DBG_DEC(tRun);
				DBG_DEC(tBxOffset);
				break;
			}
			NO_DBG_HEX(ulGetLong(tIndex2 * 4, aucFpage));
			iFodo = 2 * (int)ucGetByte(tBxOffset, aucFpage);
			if (iFodo <= 0) {
				continue;
			}

			/* The record at iFodo starts with its own length */
			iLen = 2 * (int)ucGetByte(iFodo, aucFpage);
			if ((size_t)iFodo + (size_t)iLen > sizeof(aucFpage)) {
				/* The record claims to run off the page */
				DBG_DEC(iFodo);
				DBG_DEC(iLen);
				continue;
			}

			usIstd = (USHORT)ucGetByte(iFodo + 1, aucFpage);
			vFillStyleFromStylesheet(usIstd, &tStyle);
			vGet6StyleInfo(iFodo, aucFpage + 3, iLen - 3, &tStyle);
			ulCharPos = ulGetLong(tIndex2 * 4, aucFpage);
			NO_DBG_HEX(ulCharPos);
			tStyle.ulFileOffset = ulCharPos2FileOffsetX(
				ulCharPos, &tStyle.eListID);
			vAdd2StyleInfoList(&tStyle);

			eRowInfo = eGet6RowInfo(iFodo,
					aucFpage + 3, iLen - 3, &tRow);
			switch(eRowInfo) {
			case found_a_cell:
				/* Only the first cell marks the row start */
				if (ulCharPosFirst != CP_INVALID) {
					break;
				}
				ulCharPosFirst = ulGetLong(
						tIndex2 * 4, aucFpage);
				NO_DBG_HEX(ulCharPosFirst);
				tRow.ulCharPosStart = ulCharPosFirst;
				tRow.ulFileOffsetStart =
					ulCharPos2FileOffset(ulCharPosFirst);
				DBG_HEX_C(tRow.ulFileOffsetStart == FC_INVALID,
							ulCharPosFirst);
				break;
			case found_end_of_row:
				ulCharPosLast = ulGetLong(
						tIndex2 * 4, aucFpage);
				NO_DBG_HEX(ulCharPosLast);
				tRow.ulCharPosEnd = ulCharPosLast;
				tRow.ulFileOffsetEnd =
					ulCharPos2FileOffset(ulCharPosLast);
				DBG_HEX_C(tRow.ulFileOffsetEnd == FC_INVALID,
							ulCharPosLast);
				vAdd2RowInfoList(&tRow);
				/* Start collecting the next row from scratch */
				(void)memset(&tRow, 0, sizeof(tRow));
				ulCharPosFirst = CP_INVALID;
				break;
			case found_nothing:
				break;
			default:
				DBG_DEC(eRowInfo);
				break;
			}
		}
	}
	ausParfPage = xfree(ausParfPage);
} /* end of vGet6PapInfo */