// ------------------------------------------------------------------------------------------------
// read a length-prefixed string from the binary stream
unsigned int ReadString(const char*& sbegin_out, const char*& send_out, const char* input,
    const char*& cursor, const char* end,
    bool long_length = false,
    bool allow_null = false)
{
    const uint32_t len_len = long_length ? 4 : 1;
    if (Offset(cursor, end) < len_len) {
        TokenizeError("cannot ReadString, out of bounds reading length", input, cursor);
    }

    const uint32_t length = long_length ? ReadWord(input, cursor, end) : ReadByte(input, cursor, end);
    if (Offset(cursor, end) < length) {
        TokenizeError("cannot ReadString, length is out of bounds", input, cursor);
    }

    sbegin_out = cursor;
    cursor += length;
    send_out = cursor;

    if (!allow_null) {
        for (unsigned int i = 0; i < length; ++i) {
            if (sbegin_out[i] == '\0') {
                TokenizeError("failed ReadString, unexpected NUL character in string", input, cursor);
            }
        }
    }
    return length;
}
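// Illustrative note (not part of the tokenizer): the layout ReadString consumes is
//   [uint8  length][length bytes]   when long_length == false
//   [uint32 length][length bytes]   when long_length == true
// so, for example, the byte sequence 03 'F' 'B' 'X' decodes to the three-character string "FBX".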
// ------------------------------------------------------------------------------------------------
// read a single byte from the binary stream
uint8_t ReadByte(const char* input, const char*& cursor, const char* end)
{
    if (Offset(cursor, end) < sizeof(uint8_t)) {
        TokenizeError("cannot ReadByte, out of bounds", input, cursor);
    }

    uint8_t word;
    ::memcpy(&word, cursor, sizeof(uint8_t));
    ++cursor;

    return word;
}
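// All Read* helpers below follow the same pattern: bounds-check against `end`,
// copy the value out with memcpy, then advance `cursor` past the bytes consumed.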
// ------------------------------------------------------------------------------------------------
// read a 64-bit word from the binary stream
uint64_t ReadDoubleWord(const char* input, const char*& cursor, const char* end)
{
    const size_t k_to_read = sizeof(uint64_t);
    if (Offset(cursor, end) < k_to_read) {
        TokenizeError("cannot ReadDoubleWord, out of bounds", input, cursor);
    }

    // memcpy rather than reinterpret_cast: cursor may not be suitably aligned
    uint64_t dword;
    ::memcpy(&dword, cursor, k_to_read);
    AI_SWAP8(dword);

    cursor += k_to_read;

    return dword;
}
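// Note: AI_SWAP4/AI_SWAP8 are assumed here to be Assimp's byte-swapping macros,
// no-ops on little-endian builds; binary FBX stores all scalars little-endian.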
// ------------------------------------------------------------------------------------------------
// read a 32-bit word from the binary stream
uint32_t ReadWord(const char* input, const char*& cursor, const char* end)
{
    const size_t k_to_read = sizeof(uint32_t);
    if (Offset(cursor, end) < k_to_read) {
        TokenizeError("cannot ReadWord, out of bounds", input, cursor);
    }

    uint32_t word;
    ::memcpy(&word, cursor, k_to_read);
    AI_SWAP4(word);

    cursor += k_to_read;

    return word;
}
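// Dereferencing the input via reinterpret_cast<const uint32_t*> would be undefined
// behaviour whenever `cursor` is not suitably aligned; memcpy has no alignment
// requirement, and compilers typically lower this fixed-size copy to a single load.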
// ------------------------------------------------------------------------------------------------
void Tokenize(TokenList& output_tokens, const char* input)
{
    ai_assert(input);

    // line and column numbers are one-based
    unsigned int line = 1;
    unsigned int column = 1;

    bool comment = false;
    bool in_double_quotes = false;
    bool pending_data_token = false;

    const char* token_begin = NULL, *token_end = NULL;
    for (const char* cur = input; *cur; column += (*cur == '\t' ? ASSIMP_FBX_TAB_WIDTH : 1), ++cur) {
        const char c = *cur;

        if (IsLineEnd(c)) {
            comment = false;

            column = 0;
            ++line;
        }

        if (comment) {
            continue;
        }

        if (in_double_quotes) {
            if (c == '\"') {
                in_double_quotes = false;
                token_end = cur;

                ProcessDataToken(output_tokens, token_begin, token_end, line, column);
                pending_data_token = false;
            }
            continue;
        }

        switch (c) {
        case '\"':
            if (token_begin) {
                TokenizeError("unexpected double-quote", line, column);
            }
            token_begin = cur;
            in_double_quotes = true;
            continue;

        case ';':
            ProcessDataToken(output_tokens, token_begin, token_end, line, column);
            comment = true;
            continue;

        case '{':
            ProcessDataToken(output_tokens, token_begin, token_end, line, column);
            output_tokens.push_back(new_Token(cur, cur + 1, TokenType_OPEN_BRACKET, line, column));
            continue;

        case '}':
            ProcessDataToken(output_tokens, token_begin, token_end, line, column);
            output_tokens.push_back(new_Token(cur, cur + 1, TokenType_CLOSE_BRACKET, line, column));
            continue;

        case ',':
            if (pending_data_token) {
                ProcessDataToken(output_tokens, token_begin, token_end, line, column, TokenType_DATA, true);
            }
            output_tokens.push_back(new_Token(cur, cur + 1, TokenType_COMMA, line, column));
            continue;

        case ':':
            if (pending_data_token) {
                ProcessDataToken(output_tokens, token_begin, token_end, line, column, TokenType_KEY, true);
            }
            else {
                TokenizeError("unexpected colon", line, column);
            }
            continue;
        }

        if (IsSpaceOrNewLine(c)) {
            if (token_begin) {
                // peek ahead: skip whitespace and check whether the next non-blank
                // character is a colon, in which case this counts as a KEY token.
                // (the previous loop checked for ':' inside its IsSpaceOrNewLine
                // condition and thus could never find one)
                TokenType type = TokenType_DATA;

                const char* peek = cur;
                while (*peek && IsSpaceOrNewLine(*peek)) {
                    ++peek;
                }
                if (*peek == ':') {
                    type = TokenType_KEY;
                    cur = peek;
                }

                ProcessDataToken(output_tokens, token_begin, token_end, line, column, type);
            }

            pending_data_token = false;
        }
        else {
            token_end = cur;
            if (!token_begin) {
                token_begin = cur;
            }

            pending_data_token = true;
        }
    }
}
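// Illustrative walk-through (assuming ProcessDataToken emits the pending
// token_begin/token_end range): the ASCII fragment
//
//   Foo: 1, "bar" { ; comment
//
// yields KEY "Foo", DATA "1", COMMA, a DATA token spanning the quoted "bar",
// and OPEN_BRACKET; everything after ';' is skipped until the next line end.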
// ------------------------------------------------------------------------------------------------
bool ReadScope(TokenList& output_tokens, const char* input, const char*& cursor, const char* end, bool const is64bits)
{
    // the first word contains the offset at which this block ends
    const uint64_t end_offset = is64bits ? ReadDoubleWord(input, cursor, end) : ReadWord(input, cursor, end);

    // we may get 0 if reading reached the end of the file -
    // fbx files have a mysterious extra footer which I don't know
    // how to extract any information from, but at least it always
    // starts with a 0.
    if (!end_offset) {
        return false;
    }

    if (end_offset > Offset(input, end)) {
        TokenizeError("block offset is out of range", input, cursor);
    }
    else if (end_offset < Offset(input, cursor)) {
        TokenizeError("block offset is negative out of range", input, cursor);
    }

    // the second data word contains the number of properties in the scope
    const uint64_t prop_count = is64bits ? ReadDoubleWord(input, cursor, end) : ReadWord(input, cursor, end);

    // the third data word contains the length of the property list
    const uint64_t prop_length = is64bits ? ReadDoubleWord(input, cursor, end) : ReadWord(input, cursor, end);

    // now comes the name of the scope/key
    const char* sbeg, *send;
    ReadString(sbeg, send, input, cursor, end);

    output_tokens.push_back(new_Token(sbeg, send, TokenType_KEY, Offset(input, cursor)));

    // now come the individual properties
    const char* begin_cursor = cursor;
    for (uint64_t i = 0; i < prop_count; ++i) {
        ReadData(sbeg, send, input, cursor, begin_cursor + prop_length);

        output_tokens.push_back(new_Token(sbeg, send, TokenType_DATA, Offset(input, cursor)));

        if (i != prop_count - 1) {
            output_tokens.push_back(new_Token(cursor, cursor + 1, TokenType_COMMA, Offset(input, cursor)));
        }
    }

    if (Offset(begin_cursor, cursor) != prop_length) {
        TokenizeError("property length not reached, something is wrong", input, cursor);
    }

    // at the end of each nested block, there is a NUL record to indicate
    // that the sub-scope exists (i.e. to distinguish between P: and P : {})
    // this NUL record is 13 bytes long in the 32-bit format and 25 bytes long in the 64-bit format.
    const size_t sentinel_block_length = is64bits ? (sizeof(uint64_t) * 3 + 1) : (sizeof(uint32_t) * 3 + 1);

    if (Offset(input, cursor) < end_offset) {
        if (end_offset - Offset(input, cursor) < sentinel_block_length) {
            TokenizeError("insufficient padding bytes at block end", input, cursor);
        }

        output_tokens.push_back(new_Token(cursor, cursor + 1, TokenType_OPEN_BRACKET, Offset(input, cursor)));

        // XXX this is vulnerable to stack overflowing ..
        while (Offset(input, cursor) < end_offset - sentinel_block_length) {
            ReadScope(output_tokens, input, cursor, input + end_offset - sentinel_block_length, is64bits);
        }

        output_tokens.push_back(new_Token(cursor, cursor + 1, TokenType_CLOSE_BRACKET, Offset(input, cursor)));

        for (size_t i = 0; i < sentinel_block_length; ++i) {
            if (cursor[i] != '\0') {
                TokenizeError("failed to read nested block sentinel, expected all bytes to be 0", input, cursor);
            }
        }
        cursor += sentinel_block_length;
    }

    if (Offset(input, cursor) != end_offset) {
        TokenizeError("scope length not reached, something is wrong", input, cursor);
    }

    return true;
}
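// Illustrative layout of one binary FBX record as parsed above (field widths are
// 32-bit in the older format and 64-bit in the newer one, reportedly from FBX 7.5 on):
//
//   end_offset   (u32/u64)  absolute offset of the first byte after this record
//   prop_count   (u32/u64)  number of properties
//   prop_length  (u32/u64)  total size of the property list in bytes
//   name_len     (u8)       length of the record name
//   name         (bytes)    record name, not NUL-terminated
//   properties   (bytes)    prop_count typed values, parsed by ReadData
//   [nested records ...]    present only when the scope has a sub-block
//   [sentinel]              13 (32-bit) or 25 (64-bit) zero bytes closing the sub-block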
// ------------------------------------------------------------------------------------------------
// read a single property value and return the raw byte range it occupies
void ReadData(const char*& sbegin_out, const char*& send_out, const char* input, const char*& cursor, const char* end)
{
    if (Offset(cursor, end) < 1) {
        TokenizeError("cannot ReadData, out of bounds reading length", input, cursor);
    }

    const char type = *cursor;
    sbegin_out = cursor++;

    switch (type) {
        // 16 bit int
    case 'Y':
        cursor += 2;
        break;

        // one-byte bool flag (yes/no)
    case 'C':
        cursor += 1;
        break;

        // 32 bit int
    case 'I':
        // <- fall through

        // float
    case 'F':
        cursor += 4;
        break;

        // double
    case 'D':
        cursor += 8;
        break;

        // 64 bit int
    case 'L':
        cursor += 8;
        break;

        // note: do not write cursor += ReadWord(...cursor) as this would be UB

        // raw binary data
    case 'R':
    {
        const uint32_t length = ReadWord(input, cursor, end);
        cursor += length;
        break;
    }

    case 'b':
        // TODO: what is the 'b' type code? Right now we just skip over it /
        // take the full range we could get
        cursor = end;
        break;

        // array of *
    case 'f':
    case 'd':
    case 'l':
    case 'i':
    case 'c':
    {
        const uint32_t length = ReadWord(input, cursor, end);
        const uint32_t encoding = ReadWord(input, cursor, end);
        const uint32_t comp_len = ReadWord(input, cursor, end);

        // compute length based on type and check against the stored value
        if (encoding == 0) {
            uint32_t stride = 0;
            switch (type) {
            case 'f':
            case 'i':
                stride = 4;
                break;

            case 'd':
            case 'l':
                stride = 8;
                break;

            case 'c':
                stride = 1;
                break;

            default:
                ai_assert(false);
            }
            ai_assert(stride > 0);
            if (length * stride != comp_len) {
                TokenizeError("cannot ReadData, calculated data stride differs from what the file claims", input, cursor);
            }
        }
        // zip/deflate algorithm (encoding==1)? take given length. anything else? die
        else if (encoding != 1) {
            TokenizeError("cannot ReadData, unknown encoding", input, cursor);
        }
        cursor += comp_len;
        break;
    }

        // string
    case 'S':
    {
        const char* sb, *se;
        // 0 characters can legally happen in such strings
        ReadString(sb, se, input, cursor, end, true, true);
        break;
    }
    default:
        TokenizeError("cannot ReadData, unexpected type code: " + std::string(&type, 1), input, cursor);
    }

    if (cursor > end) {
        TokenizeError("cannot ReadData, the remaining size is too small for the data type: " + std::string(&type, 1), input, cursor);
    }

    // the type code is contained in the returned range
    send_out = cursor;
}
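// Summary of the binary property type codes handled above:
//
//   code    | meaning                | bytes consumed after the code
//   --------+------------------------+------------------------------
//   Y       | 16-bit int             | 2
//   C       | bool                   | 1
//   I, F    | 32-bit int, float      | 4
//   D, L    | double, 64-bit int     | 8
//   R       | raw binary             | 4 (length) + length
//   S       | string                 | 4 (length) + length
//   f,d,l,  | typed array            | 12 (count/encoding/comp_len
//   i,c     |                        | header) + comp_len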
// ------------------------------------------------------------------------------------------------
void TokenizeError(const std::string& message, const char* begin, const char* cursor)
{
    TokenizeError(message, Offset(begin, cursor));
}