/*
 * Read a 64-bit integer from the buffer and reinterpret its bits as a double.
 *
 * If the integer read equals -1 the function returns -1.0 instead of the
 * reinterpreted value; callers in this file pair that with PyErr_Occurred()
 * to detect failure (presumably -1 is readI64's error sentinel -- confirm
 * against readI64).
 */
static double readDouble(DecodeBuffer* input) {
  union {
    int64_t bits;
    double value;
  } pun;

  pun.bits = readI64(input);

  /* Pass the -1 sentinel through as -1.0 rather than as the punned bits. */
  return (pun.bits == -1) ? -1 : pun.value;
}
/**
 * Read column chunk information.
 *
 * Decodes one column chunk struct from the protocol:
 *   field 1 (string) -> colChunk->path
 *   field 2 (i64)    -> colChunk->file_offset
 *   field 3 (struct) -> delegated to readColumnMetadata()
 *
 * Any other field id, or a known field id carrying an unexpected wire type,
 * is skipped with skipType() so that the value bytes are consumed and the
 * next readFieldBegin() sees a real field header.
 *
 * Raises ERROR when the file offset field is absent.
 *
 * @prot		the reading protocol
 * @colChunk	the column chunk structure to fill in
 * @return		sum of the values returned by the individual read/skip calls
 */
int
readColumnChunk(
		CompactProtocol *prot,
		struct ColumnChunkMetadata_4C *colChunk)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;
	bool isset_file_offset = false;

	readStructBegin(prot);

	while (true) {
		/* set once the current field's value has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			if (ftype == T_STRING) {
				char *file_path;
				xfer += readString(prot, &file_path);
				colChunk->path = file_path;
				consumed = true;
			}
			break;
		case 2:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->file_offset));
				isset_file_offset = true;
				consumed = true;
			}
			break;
		case 3:
			if (ftype == T_STRUCT) {
				/*read column metadata*/
				xfer += readColumnMetadata(prot, colChunk);
				consumed = true;
			}
			break;
		default:
			break;
		}

		/* unknown field or unexpected type: discard its value */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	readStructEnd(prot);

	if (!isset_file_offset)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk fileoffset not set")));

	return xfer;
}
/**
 * Read column chunk metadata information.
 *
 * Decodes one column metadata struct from the protocol into colChunk:
 *   field 1 (i32)  -> type
 *   field 2 (list) -> EncodingCount / pEncodings (palloc0'd array)
 *   field 3 (list) -> depth, pathInSchema (elements joined with ':') and
 *                     colName (the last path element)
 *   field 4 (i32)  -> codec
 *   field 5 (i64)  -> valueCount
 *   field 6 (i64)  -> totalUncompressedSize
 *   field 7 (i64)  -> totalSize
 *   field 9 (i64)  -> firstDataPage
 *
 * Fields 8, 10 and 11 are not stored; they, any unknown field id, and any
 * known field carrying an unexpected wire type are skipped with skipType()
 * so the reader stays aligned with the next field header.
 *
 * Raises ERROR when any of the stored fields listed above is absent, or
 * when the path_in_schema list is empty.
 *
 * @prot		the reading protocol
 * @colChunk	the column chunk structure to fill in
 * @return		sum of the values returned by the individual read/skip calls
 */
int
readColumnMetadata(
		CompactProtocol *prot,
		struct ColumnChunkMetadata_4C *colChunk)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	bool isset_type = false;
	bool isset_encodings = false;
	bool isset_path_in_schema = false;
	bool isset_codec = false;
	bool isset_num_values = false;
	bool isset_total_uncompressed_size = false;
	bool isset_total_compressed_size = false;
	bool isset_data_page_offset = false;

	readStructBegin(prot);

	while (true) {
		/* set once the current field's value has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			if (ftype == T_I32) {
				int32_t type;
				xfer += readI32(prot, &type);
				colChunk->type = (PrimitiveTypeName) type;
				isset_type = true;
				consumed = true;
			}
			break;
		case 2:
			if (ftype == T_LIST) {
				uint32_t encodingCount;
				TType etype;
				xfer += readListBegin(prot, &etype, &encodingCount);
				colChunk->EncodingCount = encodingCount;
				colChunk->pEncodings =
						(enum Encoding *) palloc0(sizeof(enum Encoding) * encodingCount);
				for (uint32_t i = 0; i < encodingCount; i++) {
					int32_t encoding;
					xfer += readI32(prot, &encoding);
					colChunk->pEncodings[i] = (enum Encoding) encoding;
				}
				isset_encodings = true;
				consumed = true;
			}
			break;
		case 3:
			if (ftype == T_LIST) {
				/*process path in schema, setting colchunk->depth and colchunk->pathInSchema*/
				TType etype;
				uint32_t lsize;
				StringInfoData colNameBuf;
				char *path_in_schema;

				xfer += readListBegin(prot, &etype, &lsize);

				/*
				 * The code below always reads a final element, so an empty
				 * list cannot be decoded; reject it explicitly instead of
				 * letting an unsigned "lsize - 1" wrap around.
				 */
				if (lsize == 0)
					ereport(ERROR,
							(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk path_in_schema is empty")));

				colChunk->depth = lsize;
				initStringInfo(&colNameBuf);

				/* all but the last element are joined with a ':' separator */
				for (uint32_t i = 0; i + 1 < lsize; i++) {
					xfer += readString(prot, &path_in_schema);
					appendStringInfo(&colNameBuf, "%s:", path_in_schema);
					pfree(path_in_schema);
				}

				/* the last element is kept as the column name, so it is not freed */
				xfer += readString(prot, &path_in_schema);
				appendStringInfo(&colNameBuf, "%s", path_in_schema);

				colChunk->pathInSchema = colNameBuf.data;
				colChunk->colName = path_in_schema;
				isset_path_in_schema = true;
				consumed = true;
			}
			break;
		case 4:
			if (ftype == T_I32) {
				int32_t compresscode;
				xfer += readI32(prot, &compresscode);
				colChunk->codec = (enum CompressionCodecName) compresscode;
				isset_codec = true;
				consumed = true;
			}
			break;
		case 5:
			if (ftype == T_I64) {
				int64_t valCnt;
				xfer += readI64(prot, &valCnt);
				colChunk->valueCount = valCnt;
				isset_num_values = true;
				consumed = true;
			}
			break;
		case 6:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalUncompressedSize));
				isset_total_uncompressed_size = true;
				consumed = true;
			}
			break;
		case 7:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalSize));
				isset_total_compressed_size = true;
				consumed = true;
			}
			break;
		case 9:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->firstDataPage));
				isset_data_page_offset = true;
				consumed = true;
			}
			break;
		case 8:
		case 10:
		case 11:
			/* known but unused fields: handled by the generic skip below */
		default:
			break;
		}

		/* unused field, unknown field or unexpected type: discard its value */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	readStructEnd(prot);

	if (!isset_type)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk type not set")));
	if (!isset_encodings)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk encoding not set")));
	if (!isset_path_in_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk path_in_schema not set")));
	if (!isset_codec)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk compression code not set")));
	if (!isset_num_values)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk value number not set")));
	if (!isset_total_uncompressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk total uncompressed size not set")));
	if (!isset_total_compressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk total compressed size not set")));
	if (!isset_data_page_offset)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk first data page not set")));

	return xfer;
}
/**
 * Read parquet file rowgroup information, and convert it to hawq structure.
 *
 * Decodes one row group struct from the protocol:
 *   field 1 (list) -> ColChunkCount / columns (palloc0'd array, each element
 *                     filled by readColumnChunk())
 *   field 2 (i64)  -> totalByteSize
 *   field 3 (i64)  -> rowCount
 *
 * Any other field id, or a known field id carrying an unexpected wire type,
 * is skipped with skipType() so the reader stays aligned with the next
 * field header.  After decoding, the r and d values of the schema fields are
 * assigned to the column chunks via assignRDFromFieldToColumnChunk().
 *
 * Raises ERROR when any of the three fields above is absent.
 *
 * @prot			the reading protocol
 * @rowGroupInfo	the row group information
 * @pfields			the schema information of the file
 * @pfieldCount		the field count of schema
 * @return			sum of the values returned by the individual read/skip calls
 */
int
readRowGroupInfo(
		CompactProtocol *prot,
		struct BlockMetadata_4C* rowGroupInfo,
		struct FileField_4C* pfields,
		int pfieldCount)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	bool isset_columns = false;
	bool isset_total_byte_size = false;
	bool isset_num_rows = false;

	readStructBegin(prot);

	while (true) {
		/* set once the current field's value has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			if (ftype == T_LIST) {
				uint32_t colChunkCnt;
				TType etype;
				xfer += readListBegin(prot, &etype, &colChunkCnt);
				rowGroupInfo->ColChunkCount = colChunkCnt;
				rowGroupInfo->columns =
						(struct ColumnChunkMetadata_4C*) palloc0(
								colChunkCnt * sizeof(struct ColumnChunkMetadata_4C));
				for (uint32_t i = 0; i < colChunkCnt; i++) {
					xfer += readColumnChunk(prot, &(rowGroupInfo->columns[i]));
				}
				isset_columns = true;
				consumed = true;
			}
			break;
		case 2:
			if (ftype == T_I64) {
				int64_t val;
				xfer += readI64(prot, &val);
				rowGroupInfo->totalByteSize = val;
				isset_total_byte_size = true;
				consumed = true;
			}
			break;
		case 3:
			if (ftype == T_I64) {
				int64_t val;
				xfer += readI64(prot, &val);
				rowGroupInfo->rowCount = val;
				isset_num_rows = true;
				consumed = true;
			}
			break;
		default:
			break;
		}

		/* unknown field or unexpected type: discard its value */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	readStructEnd(prot);

	if (!isset_columns)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group column chunk not set")));
	if (!isset_total_byte_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group total byte size not set")));
	if (!isset_num_rows)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group row count not set")));

	/*assign r and d of fields to column chunks*/
	int columnIndex = 0;
	assignRDFromFieldToColumnChunk(rowGroupInfo->columns, &columnIndex, pfields,
			pfieldCount);

	return xfer;
}
/*
 * The initialize read method, read file metadata, but just read the first 4 parts,
 * including version, schema information, number of rows, and rowgroup number, but
 * doesn't read each rowgroup metadata, and keyvalue part. Read metadata of next
 * rowgroup before reading the actual data.
 *
 * Field handling:
 *   field 1 (i32)  -> version
 *   field 2 (list) -> schema, decoded by readSchemaElement() into pfield,
 *                     fieldCount, colCount and schemaTreeNodeCount
 *   field 3 (i64)  -> num_rows
 *   field 4 (list) -> only the list header is read, to obtain blockCount;
 *                     the loop then stops so the row group elements remain
 *                     unread in the protocol
 *
 * Any other field id, or a known field id carrying an unexpected wire type,
 * is skipped with skipType() so the reader stays aligned with the next
 * field header.
 *
 * Raises ERROR when any of the four fields above is absent.
 *
 * @parquetMetadata		parquet metadata information
 * @prot				footer protocol for reading
 * @return				sum of the values returned by the read/skip calls
 *						(the result of readSchemaElement() is not included)
 */
int
readParquetFileMetadata(
		ParquetMetadata *parquetMetadata,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	bool isset_version = false;
	bool isset_schema = false;
	bool isset_num_rows = false;
	bool isset_row_groups = false;

	while (true) {
		/* set once the current field's value has been read */
		bool consumed = false;

		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}

		switch (fid) {
		case 1:
			/* Process version*/
			if (ftype == T_I32) {
				xfer += readI32(prot, &((*parquetMetadata)->version));
				isset_version = true;
				consumed = true;
			}
			break;
		case 2:
			/* process schema - field information*/
			if (ftype == T_LIST) {
				uint32_t lsize;
				TType ltype;
				xfer += readListBegin(prot, &ltype, &lsize);
				readSchemaElement(prot, lsize,
						&((*parquetMetadata)->pfield),
						&((*parquetMetadata)->fieldCount),
						&((*parquetMetadata)->colCount),
						&((*parquetMetadata)->schemaTreeNodeCount));
				isset_schema = true;
				consumed = true;
			}
			break;
		case 3:
			/* process number of rows*/
			if (ftype == T_I64) {
				int64_t num_rows = 0;
				xfer += readI64(prot, &num_rows);
				(*parquetMetadata)->num_rows = num_rows;
				isset_num_rows = true;
				consumed = true;
			}
			break;
		case 4:
			/* process row group information*/
			if (ftype == T_LIST) {
				/* get row group count*/
				uint32_t lSize;
				TType etype;
				xfer += readListBegin(prot, &etype, &lSize);
				(*parquetMetadata)->blockCount = lSize;
				isset_row_groups = true;
				consumed = true;
			}
			break;
		default:
			break;
		}

		/*hit row groups, break out the while statement*/
		if (isset_row_groups)
			break;

		/* unknown field or unexpected type: discard its value */
		if (!consumed) {
			xfer += skipType(prot, ftype);
		}
	}

	if (!isset_version)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata version not set")));
	if (!isset_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata schema not set")));
	if (!isset_num_rows)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata num_rows not set")));
	if (!isset_row_groups)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group information not set")));

	return xfer;
}
// Returns a new reference. static PyObject* decode_val(DecodeBuffer* input, TType type, PyObject* typeargs, long string_limit, long container_limit) { switch (type) { case T_BOOL: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } switch (v) { case 0: Py_RETURN_FALSE; case 1: Py_RETURN_TRUE; // Don't laugh. This is a potentially serious issue. default: PyErr_SetString(PyExc_TypeError, "boolean out of range"); return NULL; } break; } case T_I08: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I16: { int16_t v = readI16(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I32: { int32_t v = readI32(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I64: { int64_t v = readI64(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } // TODO(dreiss): Find out if we can take this fastpath always when // sizeof(long) == sizeof(long long). if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) { return PyInt_FromLong((long) v); } return PyLong_FromLongLong(v); } case T_DOUBLE: { double v = readDouble(input); if (v == -1.0 && PyErr_Occurred()) { return false; } return PyFloat_FromDouble(v); } case T_STRING: { Py_ssize_t len = readI32(input); char* buf; if (!readBytes(input, &buf, len)) { return NULL; } if (!check_length_limit(len, string_limit)) { return NULL; } if (is_utf8(typeargs)) return PyUnicode_DecodeUTF8(buf, len, 0); else return PyString_FromStringAndSize(buf, len); } case T_LIST: case T_SET: { SetListTypeArgs parsedargs; int32_t len; PyObject* ret = NULL; int i; bool use_tuple = false; if (!parse_set_list_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.element_type)) { return NULL; } len = readI32(input); if (!check_length_limit(len, container_limit)) { return NULL; } use_tuple = type == T_LIST && parsedargs.immutable; ret = use_tuple ? 
PyTuple_New(len) : PyList_New(len); if (!ret) { return NULL; } for (i = 0; i < len; i++) { PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs, string_limit, container_limit); if (!item) { Py_DECREF(ret); return NULL; } if (use_tuple) { PyTuple_SET_ITEM(ret, i, item); } else { PyList_SET_ITEM(ret, i, item); } } // TODO(dreiss): Consider biting the bullet and making two separate cases // for list and set, avoiding this post facto conversion. if (type == T_SET) { PyObject* setret; setret = parsedargs.immutable ? PyFrozenSet_New(ret) : PySet_New(ret); Py_DECREF(ret); return setret; } return ret; } case T_MAP: { int32_t len; int i; MapTypeArgs parsedargs; PyObject* ret = NULL; if (!parse_map_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.ktag)) { return NULL; } if (!checkTypeByte(input, parsedargs.vtag)) { return NULL; } len = readI32(input); if (!check_length_limit(len, container_limit)) { return NULL; } ret = PyDict_New(); if (!ret) { goto error; } for (i = 0; i < len; i++) { PyObject* k = NULL; PyObject* v = NULL; k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs, string_limit, container_limit); if (k == NULL) { goto loop_error; } v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs, string_limit, container_limit); if (v == NULL) { goto loop_error; } if (PyDict_SetItem(ret, k, v) == -1) { goto loop_error; } Py_DECREF(k); Py_DECREF(v); continue; // Yuck! Destructors, anyone? 
loop_error: Py_XDECREF(k); Py_XDECREF(v); goto error; } if (parsedargs.immutable) { PyObject* thrift = PyImport_ImportModule("thrift.Thrift"); PyObject* cls = NULL; PyObject* arg = NULL; if (!thrift) { goto error; } cls = PyObject_GetAttrString(thrift, "TFrozenDict"); if (!cls) { goto error; } arg = PyTuple_New(1); PyTuple_SET_ITEM(arg, 0, ret); return PyObject_CallObject(cls, arg); } return ret; error: Py_XDECREF(ret); return NULL; } case T_STRUCT: { StructTypeArgs parsedargs; if (!parse_struct_args(&parsedargs, typeargs)) { return NULL; } return decode_struct(input, Py_None, parsedargs.klass, parsedargs.spec, string_limit, container_limit); } case T_STOP: case T_VOID: case T_UTF16: case T_UTF8: case T_U64: default: PyErr_SetString(PyExc_TypeError, "Unexpected TType"); return NULL; } }
// Returns a new reference. static PyObject* decode_val(DecodeBuffer* input, TType type, PyObject* typeargs) { switch (type) { case T_BOOL: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } switch (v) { case 0: Py_RETURN_FALSE; case 1: Py_RETURN_TRUE; // Don't laugh. This is a potentially serious issue. default: PyErr_SetString(PyExc_TypeError, "boolean out of range"); return NULL; } break; } case T_I08: { int8_t v = readByte(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I16: { int16_t v = readI16(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I32: { int32_t v = readI32(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } return PyInt_FromLong(v); } case T_I64: { int64_t v = readI64(input); if (INT_CONV_ERROR_OCCURRED(v)) { return NULL; } // TODO(dreiss): Find out if we can take this fastpath always when // sizeof(long) == sizeof(long long). if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) { return PyInt_FromLong((long) v); } return PyLong_FromLongLong(v); } case T_DOUBLE: { double v = readDouble(input); if (v == -1.0 && PyErr_Occurred()) { return false; } return PyFloat_FromDouble(v); } case T_STRING: { Py_ssize_t len = readI32(input); char* buf; if (!readBytes(input, &buf, len)) { return NULL; } return PyString_FromStringAndSize(buf, len); } case T_LIST: case T_SET: { SetListTypeArgs parsedargs; int32_t len; PyObject* ret = NULL; int i; if (!parse_set_list_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.element_type)) { return NULL; } len = readI32(input); if (!check_ssize_t_32(len)) { return NULL; } ret = PyList_New(len); if (!ret) { return NULL; } for (i = 0; i < len; i++) { PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs); if (!item) { Py_DECREF(ret); return NULL; } PyList_SET_ITEM(ret, i, item); } // TODO(dreiss): Consider biting the bullet and making two separate cases // for 
list and set, avoiding this post facto conversion. if (type == T_SET) { PyObject* setret; #if (PY_VERSION_HEX < 0x02050000) // hack needed for older versions setret = PyObject_CallFunctionObjArgs((PyObject*)&PySet_Type, ret, NULL); #else // official version setret = PySet_New(ret); #endif Py_DECREF(ret); return setret; } return ret; } case T_MAP: { int32_t len; int i; MapTypeArgs parsedargs; PyObject* ret = NULL; if (!parse_map_args(&parsedargs, typeargs)) { return NULL; } if (!checkTypeByte(input, parsedargs.ktag)) { return NULL; } if (!checkTypeByte(input, parsedargs.vtag)) { return NULL; } len = readI32(input); if (!check_ssize_t_32(len)) { return false; } ret = PyDict_New(); if (!ret) { goto error; } for (i = 0; i < len; i++) { PyObject* k = NULL; PyObject* v = NULL; k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs); if (k == NULL) { goto loop_error; } v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs); if (v == NULL) { goto loop_error; } if (PyDict_SetItem(ret, k, v) == -1) { goto loop_error; } Py_DECREF(k); Py_DECREF(v); continue; // Yuck! Destructors, anyone? loop_error: Py_XDECREF(k); Py_XDECREF(v); goto error; } return ret; error: Py_XDECREF(ret); return NULL; } case T_STRUCT: { StructTypeArgs parsedargs; if (!parse_struct_args(&parsedargs, typeargs)) { return NULL; } PyObject* ret = PyObject_CallObject(parsedargs.klass, NULL); if (!ret) { return NULL; } if (!decode_struct(input, ret, parsedargs.spec)) { Py_DECREF(ret); return NULL; } return ret; } case T_STOP: case T_VOID: case T_UTF16: case T_UTF8: case T_U64: default: PyErr_SetString(PyExc_TypeError, "Unexpected TType"); return NULL; } }