uint32_t TBinaryProtocol::readMessageBegin(std::string& name, TMessageType& messageType, int32_t& seqid) { uint32_t result = 0; int32_t sz; result += readI32(sz); if (sz < 0) { // Check for correct version number int32_t version = sz & VERSION_MASK; if (version != VERSION_1) { throw TProtocolException(TProtocolException::BAD_VERSION, "Bad version identifier"); } messageType = (TMessageType)(sz & 0x000000ff); result += readString(name); result += readI32(seqid); } else { if (strict_read_) { throw TProtocolException(TProtocolException::BAD_VERSION, "No version identifier... old protocol client in strict mode?"); } else { // Handle pre-versioned input int8_t type; result += readStringBody(name, sz); result += readByte(type); messageType = (TMessageType)type; result += readI32(seqid); } } return result; }
/**
 * Reads the header of a set: one byte for the element type followed by a
 * 32-bit element count.
 *
 * @param elemType receives the element type (assigned before the count is
 *                 validated)
 * @param size     receives the element count once it has passed validation
 * @return the sum of the values returned by the underlying read calls
 * @throws TProtocolException NEGATIVE_SIZE when the count is negative,
 *         SIZE_LIMIT when container_limit_ is non-zero and the count
 *         exceeds it
 */
uint32_t TBinaryProtocol::readSetBegin(TType& elemType, uint32_t& size) {
  uint32_t consumed = 0;

  int8_t rawElemType;
  consumed += readByte(rawElemType);
  elemType = (TType)rawElemType;

  int32_t count;
  consumed += readI32(count);

  if (count < 0) {
    throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
  }
  // A container_limit_ of zero disables the upper bound.
  if (container_limit_ && count > container_limit_) {
    throw TProtocolException(TProtocolException::SIZE_LIMIT);
  }

  size = (uint32_t)count;
  return consumed;
}
/**
 * Reads the header of a map: one byte for the key type, one byte for the
 * value type, then a 32-bit entry count.
 *
 * @param keyType receives the key type
 * @param valType receives the value type
 * @param size    receives the entry count once it has passed validation
 * @return the sum of the values returned by the underlying read calls
 * @throws TProtocolException NEGATIVE_SIZE when the count is negative,
 *         SIZE_LIMIT when container_limit_ is non-zero and the count
 *         exceeds it
 */
uint32_t TBinaryProtocol::readMapBegin(TType& keyType, TType& valType, uint32_t& size) {
  uint32_t consumed = 0;

  int8_t rawKeyType;
  consumed += readByte(rawKeyType);
  keyType = (TType)rawKeyType;

  int8_t rawValType;
  consumed += readByte(rawValType);
  valType = (TType)rawValType;

  int32_t count;
  consumed += readI32(count);

  if (count < 0) {
    throw TProtocolException(TProtocolException::NEGATIVE_SIZE);
  }
  // A container_limit_ of zero disables the upper bound.
  if (container_limit_ && count > container_limit_) {
    throw TProtocolException(TProtocolException::SIZE_LIMIT);
  }

  size = (uint32_t)count;
  return consumed;
}
/**
 * Read column chunk metadata information.
 *
 * Walks the fields of one column chunk metadata struct until the stop
 * marker and stores the recognised ones in colChunk:
 *   1 type, 2 encodings, 3 path_in_schema, 4 codec, 5 num_values,
 *   6 total_uncompressed_size, 7 total_compressed_size, 9 data_page_offset.
 * Every other field (including 8, 10 and 11, which are deliberately
 * ignored) and every recognised field that arrives with an unexpected wire
 * type is skipped with skipType() so that the next field header is read
 * from the right position.
 *
 * @prot:		The protocol for reading (input)
 * @colChunk:	The column chunk metadata to fill in (output)
 *
 * Returns the accumulated return values of the read/skip helpers.
 * Reports an ERROR through ereport() when one of the required fields
 * (1-7, 9) is missing or when the path_in_schema list is empty.
 */
int
readColumnMetadata(
		CompactProtocol *prot,
		struct ColumnChunkMetadata_4C *colChunk)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	readStructBegin(prot);

	bool isset_type = false;
	bool isset_encodings = false;
	bool isset_path_in_schema = false;
	bool isset_codec = false;
	bool isset_num_values = false;
	bool isset_total_uncompressed_size = false;
	bool isset_total_compressed_size = false;
	bool isset_data_page_offset = false;

	while (true) {
		/* set once the payload of the current field has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			if (ftype == T_I32) {
				int32_t type;
				xfer += readI32(prot, &type);
				colChunk->type = (PrimitiveTypeName) type;
				isset_type = true;
				consumed = true;
			}
			break;
		case 2:
			if (ftype == T_LIST) {
				uint32_t encodingCount;
				uint32_t i;
				TType etype;
				xfer += readListBegin(prot, &etype, &encodingCount);
				colChunk->EncodingCount = encodingCount;
				colChunk->pEncodings = (enum Encoding *) palloc0(sizeof(enum Encoding) * encodingCount);
				for (i = 0; i < encodingCount; i++) {
					int32_t encoding;
					xfer += readI32(prot, &encoding);
					colChunk->pEncodings[i] = (enum Encoding) encoding;
				}
				isset_encodings = true;
				consumed = true;
			}
			break;
		case 3:
			if (ftype == T_LIST) {
				/* process path in schema, setting colchunk->depth and colchunk->pathInSchema */
				TType etype;
				uint32_t lsize;
				uint32_t i;
				StringInfoData colNameBuf;
				char *path_in_schema;

				xfer += readListBegin(prot, &etype, &lsize);

				/*
				 * The list size is unsigned: with an empty list the old
				 * bound "lsize - 1" wrapped around and the loop tried to
				 * read billions of strings. An empty path is rejected.
				 */
				if (lsize == 0)
					ereport(ERROR,
							(errcode(ERRCODE_GP_INTERNAL_ERROR),
									errmsg("file metadata: row group column chunk path_in_schema is empty")));

				colChunk->depth = lsize;
				initStringInfo(&colNameBuf);

				/* all components but the last are joined with ':' */
				for (i = 0; i + 1 < lsize; i++) {
					xfer += readString(prot, &path_in_schema);
					appendStringInfo(&colNameBuf, "%s:", path_in_schema);
					pfree(path_in_schema);
				}

				/* the last component is kept: it doubles as the column name */
				xfer += readString(prot, &path_in_schema);
				appendStringInfo(&colNameBuf, "%s", path_in_schema);

				colChunk->pathInSchema = colNameBuf.data;
				colChunk->colName = path_in_schema;
				isset_path_in_schema = true;
				consumed = true;
			}
			break;
		case 4:
			if (ftype == T_I32) {
				int32_t compresscode;
				xfer += readI32(prot, &compresscode);
				colChunk->codec = (enum CompressionCodecName) compresscode;
				isset_codec = true;
				consumed = true;
			}
			break;
		case 5:
			if (ftype == T_I64) {
				int64_t valCnt;
				xfer += readI64(prot, &valCnt);
				colChunk->valueCount = valCnt;
				isset_num_values = true;
				consumed = true;
			}
			break;
		case 6:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalUncompressedSize));
				isset_total_uncompressed_size = true;
				consumed = true;
			}
			break;
		case 7:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalSize));
				isset_total_compressed_size = true;
				consumed = true;
			}
			break;
		case 9:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->firstDataPage));
				isset_data_page_offset = true;
				consumed = true;
			}
			break;
		default:
			/* fields 8, 10, 11 and anything unknown: skipped below */
			break;
		}

		/*
		 * Anything not read above still has its payload in the stream;
		 * skip it, otherwise the next readFieldBegin() would interpret
		 * payload bytes as a field header.
		 */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	readStructEnd(prot);

	if (!isset_type)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk type not set")));
	if (!isset_encodings)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk encoding not set")));
	if (!isset_path_in_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk path_in_schema not set")));
	if (!isset_codec)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk compression code not set")));
	if (!isset_num_values)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk value number not set")));
	if (!isset_total_uncompressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk total uncompressed size not set")));
	if (!isset_total_compressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk total compressed size not set")));
	if (!isset_data_page_offset)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata: row group column chunk first data page not set")));

	return xfer;
}
/**
 * Read single field of schema element in parquet file.
 *
 * Recognised fields are 1 (type), 2 (type length), 3 (repetition type),
 * 4 (name) and 5 (number of children). Any other field, and any recognised
 * field that arrives with an unexpected wire type, is skipped with
 * skipType() so the reader stays aligned on the next field header.
 * Output parameters are only written when the matching field is present.
 *
 * @prot:				The protocol for reading (input)
 * @fieldType:			The type of the field
 * @type_length:		The length of the field type; the value read from
 *						the file is divided by 8 before it is stored
 * @repetition_type:	The repetition type of the field
 * @fieldName:			The name of the field
 * @num_children:		The children number of field
 *
 * Returns the accumulated return values of the read/skip helpers.
 * Reports an ERROR through ereport() when the name field is missing.
 */
int
readSchemaElement_Single(
		CompactProtocol *prot,
		PrimitiveTypeName *fieldType,
		int32_t *type_length,
		RepetitionType *repetition_type,
		char **fieldName,
		int32_t *num_children)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;
	bool isset_name = false;

	readStructBegin(prot);

	while (true) {
		/* set once the payload of the current field has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			if (ftype == T_I32) {
				int32_t val;
				xfer += readI32(prot, &val);
				*fieldType = (PrimitiveTypeName) val;
				consumed = true;
			}
			break;
		case 2:
			if (ftype == T_I32) {
				int32_t bit_length = 0;
				xfer += readI32(prot, &bit_length);
				*type_length = bit_length / 8;
				consumed = true;
			}
			break;
		case 3:
			if (ftype == T_I32) {
				int32_t ecast1;
				xfer += readI32(prot, &ecast1);
				*repetition_type = (RepetitionType) ecast1;
				consumed = true;
			}
			break;
		case 4:
			if (ftype == T_STRING) {
				isset_name = true;
				xfer += readString(prot, fieldName);
				consumed = true;
			}
			break;
		case 5:
			if (ftype == T_I32) {
				xfer += readI32(prot, num_children);
				consumed = true;
			}
			break;
		default:
			/* field 6 and anything unknown: skipped below */
			break;
		}

		/*
		 * A field that was not read above still has its payload in the
		 * stream. Without skipping it the next readFieldBegin() would
		 * decode payload bytes as a field header.
		 */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	readStructEnd(prot);

	if (!isset_name)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg( "file metadata schema element information not correct")));

	return xfer;
}
/*
 * The initialize read method, read file metadata, but just read the first 4 parts,
 * including version, schema information, number of rows, and rowgroup number, but
 * doesn't read each rowgroup metadata, and keyvalue part. Read metadata of next
 * rowgroup before reading the actual data.
 *
 * The field loop stops right after the header of the row group list has been
 * read (field 4), or at the stop marker if that field never shows up.
 *
 * @parquetMetadata		parquet metadata information (filled in)
 * @prot				footer protocol for reading
 *
 * Returns the accumulated return values of the read helpers; the return
 * value of readSchemaElement() is not included.
 * Reports an ERROR through ereport() when version, schema, num_rows or the
 * row group list is missing.
 */
int
readParquetFileMetadata(
		ParquetMetadata *parquetMetadata,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	bool isset_version = false;
	bool isset_schema = false;
	bool isset_num_rows = false;
	bool isset_row_groups = false;

	while (true) {
		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		/*
		 * NOTE(review): a field with an unexpected id or wire type is not
		 * skipped here, so its payload stays in the stream - confirm that
		 * such fields cannot precede the row group list.
		 */
		switch (fid) {
		case 1:
			/* Process version */
			if (ftype == T_I32) {
				xfer += readI32(prot, &((*parquetMetadata)->version));
				isset_version = true;
			}
			break;
		case 2:
			/* process schema - field information */
			if (ftype == T_LIST) {
				uint32_t lsize;
				TType ltype;
				xfer += readListBegin(prot, &ltype, &lsize);
				readSchemaElement(prot, lsize,
						&((*parquetMetadata)->pfield),
						&((*parquetMetadata)->fieldCount),
						&((*parquetMetadata)->colCount),
						&((*parquetMetadata)->schemaTreeNodeCount));
				isset_schema = true;
			}
			break;
		case 3:
			/* process number of rows */
			if (ftype == T_I64) {
				int64_t num_rows = 0;
				xfer += readI64(prot, &num_rows);
				(*parquetMetadata)->num_rows = num_rows;
				isset_num_rows = true;
			}
			break;
		case 4:
			/* process row group information */
			if (ftype == T_LIST) {
				/* get row group count */
				uint32_t lSize;
				TType etype;
				xfer += readListBegin(prot, &etype, &lSize);
				(*parquetMetadata)->blockCount = lSize;
				isset_row_groups = true;
			}
			break;
		default:
			break;
		}

		/* hit row groups, break out the while statement */
		if (isset_row_groups)
			break;
	}

	if (!isset_version)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata version not set")));
	if (!isset_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata schema not set")));
	if (!isset_num_rows)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata num_rows not set")));
	if (!isset_row_groups)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group information not set")));

	return xfer;
}
// Returns a new reference. static PyObject* decode_val(DecodeBuffer* input, TType type, PyObject* typeargs, long string_limit, long container_limit) { switch (type) { case T_BOOL: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } switch (v) { case 0: Py_RETURN_FALSE; case 1: Py_RETURN_TRUE; // Don't laugh. This is a potentially serious issue. default: PyErr_SetString(PyExc_TypeError, "boolean out of range"); return NULL; } break; } case T_I08: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I16: { int16_t v = readI16(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I32: { int32_t v = readI32(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I64: { int64_t v = readI64(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } // TODO(dreiss): Find out if we can take this fastpath always when // sizeof(long) == sizeof(long long). if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) { return PyInt_FromLong((long) v); } return PyLong_FromLongLong(v); } case T_DOUBLE: { double v = readDouble(input); if (v == -1.0 && PyErr_Occurred()) { return false; } return PyFloat_FromDouble(v); } case T_STRING: { Py_ssize_t len = readI32(input); char* buf; if (!readBytes(input, &buf, len)) { return NULL; } if (!check_length_limit(len, string_limit)) { return NULL; } if (is_utf8(typeargs)) return PyUnicode_DecodeUTF8(buf, len, 0); else return PyString_FromStringAndSize(buf, len); } case T_LIST: case T_SET: { SetListTypeArgs parsedargs; int32_t len; PyObject* ret = NULL; int i; bool use_tuple = false; if (!parse_set_list_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.element_type)) { return NULL; } len = readI32(input); if (!check_length_limit(len, container_limit)) { return NULL; } use_tuple = type == T_LIST && parsedargs.immutable; ret = use_tuple ? 
PyTuple_New(len) : PyList_New(len); if (!ret) { return NULL; } for (i = 0; i < len; i++) { PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs, string_limit, container_limit); if (!item) { Py_DECREF(ret); return NULL; } if (use_tuple) { PyTuple_SET_ITEM(ret, i, item); } else { PyList_SET_ITEM(ret, i, item); } } // TODO(dreiss): Consider biting the bullet and making two separate cases // for list and set, avoiding this post facto conversion. if (type == T_SET) { PyObject* setret; setret = parsedargs.immutable ? PyFrozenSet_New(ret) : PySet_New(ret); Py_DECREF(ret); return setret; } return ret; } case T_MAP: { int32_t len; int i; MapTypeArgs parsedargs; PyObject* ret = NULL; if (!parse_map_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.ktag)) { return NULL; } if (!checkTypeByte(input, parsedargs.vtag)) { return NULL; } len = readI32(input); if (!check_length_limit(len, container_limit)) { return NULL; } ret = PyDict_New(); if (!ret) { goto error; } for (i = 0; i < len; i++) { PyObject* k = NULL; PyObject* v = NULL; k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs, string_limit, container_limit); if (k == NULL) { goto loop_error; } v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs, string_limit, container_limit); if (v == NULL) { goto loop_error; } if (PyDict_SetItem(ret, k, v) == -1) { goto loop_error; } Py_DECREF(k); Py_DECREF(v); continue; // Yuck! Destructors, anyone? 
loop_error: Py_XDECREF(k); Py_XDECREF(v); goto error; } if (parsedargs.immutable) { PyObject* thrift = PyImport_ImportModule("thrift.Thrift"); PyObject* cls = NULL; PyObject* arg = NULL; if (!thrift) { goto error; } cls = PyObject_GetAttrString(thrift, "TFrozenDict"); if (!cls) { goto error; } arg = PyTuple_New(1); PyTuple_SET_ITEM(arg, 0, ret); return PyObject_CallObject(cls, arg); } return ret; error: Py_XDECREF(ret); return NULL; } case T_STRUCT: { StructTypeArgs parsedargs; if (!parse_struct_args(&parsedargs, typeargs)) { return NULL; } return decode_struct(input, Py_None, parsedargs.klass, parsedargs.spec, string_limit, container_limit); } case T_STOP: case T_VOID: case T_UTF16: case T_UTF8: case T_U64: default: PyErr_SetString(PyExc_TypeError, "Unexpected TType"); return NULL; } }
// Consumes and discards one value of wire type `type` from `input`.
//
// Fixed-width scalars and strings are reduced to a byte count that is read
// and thrown away at the bottom of the function; lists, sets, maps and
// structs recurse per element or field and return on their own.
// Returns true on success and false on failure.
static bool
skip(DecodeBuffer* input, TType type) {
  char* discard;
  int nbytes;

  switch (type) {

  case T_BOOL:
  case T_I08:
    nbytes = 1;
    break;

  case T_I16:
    nbytes = 2;
    break;

  case T_I32:
    nbytes = 4;
    break;

  case T_I64:
  case T_DOUBLE:
    nbytes = 8;
    break;

  case T_STRING:
    // TODO(dreiss): Find out if these check_ssize_t32s are really necessary.
    nbytes = readI32(input);
    if (!check_ssize_t_32(nbytes)) {
      return false;
    }
    break;

  case T_LIST:
  case T_SET: {
    TType elem_type;
    int count, idx;

    elem_type = readByte(input);
    if (elem_type == -1) {
      return false;
    }

    count = readI32(input);
    if (!check_ssize_t_32(count)) {
      return false;
    }

    for (idx = 0; idx < count; idx++) {
      if (!skip(input, elem_type)) {
        return false;
      }
    }
    return true;
  }

  case T_MAP: {
    TType key_type, val_type;
    int count, idx;

    key_type = readByte(input);
    if (key_type == -1) {
      return false;
    }

    val_type = readByte(input);
    if (val_type == -1) {
      return false;
    }

    count = readI32(input);
    if (!check_ssize_t_32(count)) {
      return false;
    }

    for (idx = 0; idx < count; idx++) {
      // The value is only skipped once its key was skipped successfully.
      if (!skip(input, key_type)) {
        return false;
      }
      if (!skip(input, val_type)) {
        return false;
      }
    }
    return true;
  }

  case T_STRUCT: {
    for (;;) {
      TType field_type;

      field_type = readByte(input);
      if (field_type == -1) {
        return false;
      }
      if (field_type == T_STOP) {
        return true;
      }

      // tag
      if (!readBytes(input, &discard, 2)) {
        return false;
      }
      if (!skip(input, field_type)) {
        return false;
      }
    }
  }

  case T_STOP:
  case T_VOID:
  case T_UTF16:
  case T_UTF8:
  case T_U64:
  default:
    PyErr_SetString(PyExc_TypeError, "Unexpected TType");
    return false;

  }

  // Scalars and strings: drop the payload bytes.
  if (!readBytes(input, &discard, nbytes)) {
    return false;
  }
  return true;
}
// Returns a new reference. static PyObject* decode_val(DecodeBuffer* input, TType type, PyObject* typeargs) { switch (type) { case T_BOOL: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } switch (v) { case 0: Py_RETURN_FALSE; case 1: Py_RETURN_TRUE; // Don't laugh. This is a potentially serious issue. default: PyErr_SetString(PyExc_TypeError, "boolean out of range"); return NULL; } break; } case T_I08: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I16: { int16_t v = readI16(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I32: { int32_t v = readI32(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I64: { int64_t v = readI64(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } // TODO(dreiss): Find out if we can take this fastpath always when // sizeof(long) == sizeof(long long). if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) { return PyInt_FromLong((long) v); } return PyLong_FromLongLong(v); } case T_DOUBLE: { double v = readDouble(input); if (v == -1.0 && PyErr_Occurred()) { return false; } return PyFloat_FromDouble(v); } case T_STRING: { Py_ssize_t len = readI32(input); char* buf; if (!readBytes(input, &buf, len)) { return NULL; } return PyString_FromStringAndSize(buf, len); } case T_LIST: case T_SET: { SetListTypeArgs parsedargs; int32_t len; PyObject* ret = NULL; int i; if (!parse_set_list_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.element_type)) { return NULL; } len = readI32(input); if (!check_ssize_t_32(len)) { return NULL; } ret = PyList_New(len); if (!ret) { return NULL; } for (i = 0; i < len; i++) { PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs); if (!item) { Py_DECREF(ret); return NULL; } PyList_SET_ITEM(ret, i, item); } // TODO(dreiss): Consider biting the bullet and making two separate cases // for 
list and set, avoiding this post facto conversion. if (type == T_SET) { PyObject* setret; #if (PY_VERSION_HEX < 0x02050000) // hack needed for older versions setret = PyObject_CallFunctionObjArgs((PyObject*)&PySet_Type, ret, NULL); #else // official version setret = PySet_New(ret); #endif Py_DECREF(ret); return setret; } return ret; } case T_MAP: { int32_t len; int i; MapTypeArgs parsedargs; PyObject* ret = NULL; if (!parse_map_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.ktag)) { return NULL; } if (!checkTypeByte(input, parsedargs.vtag)) { return NULL; } len = readI32(input); if (!check_ssize_t_32(len)) { return false; } ret = PyDict_New(); if (!ret) { goto error; } for (i = 0; i < len; i++) { PyObject* k = NULL; PyObject* v = NULL; k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs); if (k == NULL) { goto loop_error; } v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs); if (v == NULL) { goto loop_error; } if (PyDict_SetItem(ret, k, v) == -1) { goto loop_error; } Py_DECREF(k); Py_DECREF(v); continue; // Yuck! Destructors, anyone? loop_error: Py_XDECREF(k); Py_XDECREF(v); goto error; } return ret; error: Py_XDECREF(ret); return NULL; } case T_STRUCT: { StructTypeArgs parsedargs; if (!parse_struct_args(&parsedargs, typeargs)) { return NULL; } PyObject* ret = PyObject_CallObject(parsedargs.klass, NULL); if (!ret) { return NULL; } if (!decode_struct(input, ret, parsedargs.spec)) { Py_DECREF(ret); return NULL; } return ret; } case T_STOP: case T_VOID: case T_UTF16: case T_UTF8: case T_U64: default: PyErr_SetString(PyExc_TypeError, "Unexpected TType"); return NULL; } }
/**
 * Reads a string: a 32-bit length followed by the string body, which is
 * read by readStringBody().
 *
 * @param str receives the string
 * @return the sum of the values returned by readI32() and readStringBody()
 */
uint32_t TBinaryProtocol::readString(string& str) {
  int32_t length;
  const uint32_t prefixBytes = readI32(length);
  const uint32_t bodyBytes = readStringBody(str, length);
  return prefixBytes + bodyBytes;
}