コード例 #1
0
ファイル: fastbinary.c プロジェクト: liuhg/thrift
static double readDouble(DecodeBuffer* input) {
    union {
        int64_t f;
        double t;
    } transfer;

    transfer.f = readI64(input);
    if (transfer.f == -1) {
        return -1;
    }
    return transfer.t;
}
コード例 #2
0
/**
 * read column chunk information
 */
int
readColumnChunk(
		CompactProtocol *prot,
		struct ColumnChunkMetadata_4C *colChunk)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	readStructBegin(prot);
	bool isset_file_offset = false;

	while (true) {
		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}
		switch (fid) {
		case 1:
			if (ftype == T_STRING) {
				char *file_path;
				xfer += readString(prot, &file_path);
				colChunk->path = file_path;
			}
			break;
		case 2:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->file_offset));
				isset_file_offset = true;
			}
			break;
		case 3:
			if (ftype == T_STRUCT) {
				/*read column metadata*/
				xfer += readColumnMetadata(prot, colChunk);
			}
			break;
		default:
			break;
		}
	}

	readStructEnd(prot);

	if (!isset_file_offset)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk fileoffset not set")));
	return xfer;

}
コード例 #3
0
/**
 * read column chunk metadata information
 */
int
readColumnMetadata(
		CompactProtocol *prot,
		struct ColumnChunkMetadata_4C *colChunk)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;
	readStructBegin(prot);

	bool isset_type = false;
	bool isset_encodings = false;
	bool isset_path_in_schema = false;
	bool isset_codec = false;
	bool isset_num_values = false;
	bool isset_total_uncompressed_size = false;
	bool isset_total_compressed_size = false;
	bool isset_data_page_offset = false;

	while (true) {
		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}
		switch (fid) {
		case 1:
			if (ftype == T_I32) {
				int32_t type;
				xfer += readI32(prot, &type);
				colChunk->type = (PrimitiveTypeName) type;
				isset_type = true;
			}
			break;
		case 2:
			if (ftype == T_LIST) {
				uint32_t encodingCount;
				TType etype;
				xfer += readListBegin(prot, &etype, &encodingCount);
				colChunk->EncodingCount = encodingCount;
				colChunk->pEncodings =
						(enum Encoding *) palloc0(sizeof(enum Encoding) * encodingCount);
				for (int i = 0; i < encodingCount; i++) {
					int32_t encoding;
					xfer += readI32(prot, &encoding);
					colChunk->pEncodings[i] = (enum Encoding) encoding;
				}
				isset_encodings = true;
			}
			break;
		case 3:
			if (ftype == T_LIST) {
				{
					/*process path in schema, setting colchunk->depth and colchunk->pathInSchema*/
					TType etype;
					uint32_t lsize;
					StringInfoData colNameBuf;

					xfer += readListBegin(prot, &etype, &lsize);
					colChunk->depth = lsize;
					initStringInfo(&colNameBuf);
					char *path_in_schema;
					for (int i = 0; i < lsize - 1; i++) {
						xfer += readString(prot, &path_in_schema);
						appendStringInfo(&colNameBuf, "%s:", path_in_schema);
						pfree(path_in_schema);
					}
					xfer += readString(prot, &path_in_schema);
					appendStringInfo(&colNameBuf, "%s", path_in_schema);

					colChunk->pathInSchema = colNameBuf.data;
					colChunk->colName = path_in_schema;
				}
				isset_path_in_schema = true;
			}
			break;
		case 4:
			if (ftype == T_I32) {
				int32_t compresscode;
				xfer += readI32(prot, &compresscode);
				colChunk->codec = (enum CompressionCodecName) compresscode;
				isset_codec = true;
			}
			break;
		case 5:
			if (ftype == T_I64) {
				int64_t valCnt;
				xfer += readI64(prot, &valCnt);
				colChunk->valueCount = valCnt;
				isset_num_values = true;
			}
			break;
		case 6:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalUncompressedSize));
				isset_total_uncompressed_size = true;
			}
			break;
		case 7:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->totalSize));
				isset_total_compressed_size = true;
			}
			break;
		case 8:
			if (ftype == T_LIST) {
				xfer += skipType(prot, ftype);
			}
			break;
		case 9:
			if (ftype == T_I64) {
				xfer += readI64(prot, &(colChunk->firstDataPage));
				isset_data_page_offset = true;
			}
			break;
		case 10:
			if (ftype == T_I64) {
				xfer += skipType(prot, ftype);
			}
			break;
		case 11:
			if (ftype == T_I64) {
				xfer += skipType(prot, ftype);
			}
			break;
		default:
			break;
		}
	}
	readStructEnd(prot);

	if (!isset_type)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk type not set")));
	if (!isset_encodings)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk encoding not set")));
	if (!isset_path_in_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk path_in_schema not set")));
	if (!isset_codec)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk compression code not set")));
	if (!isset_num_values)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk value number not set")));
	if (!isset_total_uncompressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk total uncompressed size not set")));
	if (!isset_total_compressed_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk total compressed size not set")));
	if (!isset_data_page_offset)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata: row group column chunk first data page not set")));
	return xfer;
}
コード例 #4
0
/**
 * Read parquet file rowgroup information, and convert it to hawq structure
 *
 * @prot			the reading protocol
 * @rowGroupInfo	the row group information
 * @pfields			the schema information of the file
 * @pfieldCount		the field count of schema
 */
int
readRowGroupInfo(
		CompactProtocol *prot,
		struct BlockMetadata_4C* rowGroupInfo,
		struct FileField_4C* pfields,
		int pfieldCount)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	readStructBegin(prot);

	bool isset_columns = false;
	bool isset_total_byte_size = false;
	bool isset_num_rows = false;

	while (true) {
		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}
		switch (fid) {
		case 1:
			if (ftype == T_LIST)
			{
				uint32_t colChunkCnt;
				TType etype;
				xfer += readListBegin(prot, &etype, &colChunkCnt);
				rowGroupInfo->ColChunkCount = colChunkCnt;
				rowGroupInfo->columns =
						(struct ColumnChunkMetadata_4C*) palloc0(
								colChunkCnt * sizeof(struct ColumnChunkMetadata_4C));
				for (int i = 0; i < colChunkCnt; i++) {
					xfer += readColumnChunk(prot, &(rowGroupInfo->columns[i]));
				}
				isset_columns = true;
			}
			break;
		case 2:
			if (ftype == T_I64) {
				int64_t val;
				xfer += readI64(prot, &val);
				rowGroupInfo->totalByteSize = val;
				isset_total_byte_size = true;
			}
			break;
		case 3:
			if (ftype == T_I64) {
				int64_t val;
				xfer += readI64(prot, &val);
				rowGroupInfo->rowCount = val;
				isset_num_rows = true;
			}
			break;
		default:
			break;
		}
	}

	readStructEnd(prot);

	if (!isset_columns)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group column chunk not set")));
	if (!isset_total_byte_size)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group total byte size not set")));
	if (!isset_num_rows)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata row group row count not set")));

	/*assign r and d of fields to column chunks*/
	int columnIndex = 0;
	assignRDFromFieldToColumnChunk(rowGroupInfo->columns, &columnIndex, pfields,
			pfieldCount);

	return xfer;
}
コード例 #5
0
/*
 *  The initialize read method, read file metadata, but just read the first 4 parts,
 *  including version, schema information, number of rows, and rowgroup number, but
 *  doesn't read each rowgroup metadata, and keyvalue part. Read metadata of next
 *  rowgroup before reading the actual data.
 *
 *  @parquetMetadata			parquet metadata information
 *  @prot						footer protocol for reading
 */
int
readParquetFileMetadata(
		ParquetMetadata *parquetMetadata,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	TType ftype;
	int16_t fid;

	bool isset_version = false;
	bool isset_schema = false;
	bool isset_num_rows = false;
	bool isset_row_groups = false;


	while (true) {
		xfer += readFieldBegin(prot, &ftype, &fid);
		if (ftype == T_STOP) {
			break;
		}
		switch (fid) {
		case 1:
			/* Process version*/
			if (ftype == T_I32) {
				xfer += readI32(prot, &((*parquetMetadata)->version));
				isset_version = true;
			}
			break;
		case 2:
			/* process schema - field information*/
			if (ftype == T_LIST) {
				{
					uint32_t lsize;
					TType ltype;
					xfer += readListBegin(prot, &ltype, &lsize);

					readSchemaElement(prot, lsize,
							&((*parquetMetadata)->pfield),
							&((*parquetMetadata)->fieldCount),
							&((*parquetMetadata)->colCount),
							&((*parquetMetadata)->schemaTreeNodeCount));
				}
				isset_schema = true;
			}
			break;
		case 3:
			/* process number of rows*/
			if (ftype == T_I64) {
				int64_t num_rows = 0;
				xfer += readI64(prot, &num_rows);
				(*parquetMetadata)->num_rows = num_rows;
				isset_num_rows = true;
			}
			break;
		case 4:
			/* process row group information*/
			if (ftype == T_LIST) {
				/* get row group count*/
				uint32_t lSize;
				TType etype;
				xfer += readListBegin(prot, &etype, &lSize);
				(*parquetMetadata)->blockCount = lSize;
				isset_row_groups = true;
				break;
			}
			break;
		default:
			break;
		}

		/*hit row groups, break out the while statement*/
		if(isset_row_groups)
			break;
	}

	if (!isset_version)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata version not set")));
	if (!isset_schema)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata schema not set")));
	if (!isset_num_rows)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("file metadata num_rows not set")));
	if (!isset_row_groups)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata row group information not set")));
	return xfer;
}
コード例 #6
0
ファイル: fastbinary.c プロジェクト: liuhg/thrift
// Returns a new reference.
static PyObject*
decode_val(DecodeBuffer* input, TType type, PyObject* typeargs, long string_limit, long container_limit) {
    switch (type) {

    case T_BOOL: {
        int8_t v = readByte(input);
        if (INT_CONV_ERROR_OCCURRED(v)) {
            return NULL;
        }

        switch (v) {
        case 0:
            Py_RETURN_FALSE;
        case 1:
            Py_RETURN_TRUE;
        // Don't laugh.  This is a potentially serious issue.
        default:
            PyErr_SetString(PyExc_TypeError, "boolean out of range");
            return NULL;
        }
        break;
    }
    case T_I08: {
        int8_t v = readByte(input);
        if (INT_CONV_ERROR_OCCURRED(v)) {
            return NULL;
        }

        return PyInt_FromLong(v);
    }
    case T_I16: {
        int16_t v = readI16(input);
        if (INT_CONV_ERROR_OCCURRED(v)) {
            return NULL;
        }
        return PyInt_FromLong(v);
    }
    case T_I32: {
        int32_t v = readI32(input);
        if (INT_CONV_ERROR_OCCURRED(v)) {
            return NULL;
        }
        return PyInt_FromLong(v);
    }

    case T_I64: {
        int64_t v = readI64(input);
        if (INT_CONV_ERROR_OCCURRED(v)) {
            return NULL;
        }
        // TODO(dreiss): Find out if we can take this fastpath always when
        //               sizeof(long) == sizeof(long long).
        if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) {
            return PyInt_FromLong((long) v);
        }

        return PyLong_FromLongLong(v);
    }

    case T_DOUBLE: {
        double v = readDouble(input);
        if (v == -1.0 && PyErr_Occurred()) {
            return false;
        }
        return PyFloat_FromDouble(v);
    }

    case T_STRING: {
        Py_ssize_t len = readI32(input);
        char* buf;
        if (!readBytes(input, &buf, len)) {
            return NULL;
        }
        if (!check_length_limit(len, string_limit)) {
            return NULL;
        }

        if (is_utf8(typeargs))
            return PyUnicode_DecodeUTF8(buf, len, 0);
        else
            return PyString_FromStringAndSize(buf, len);
    }

    case T_LIST:
    case T_SET: {
        SetListTypeArgs parsedargs;
        int32_t len;
        PyObject* ret = NULL;
        int i;
        bool use_tuple = false;

        if (!parse_set_list_args(&parsedargs, typeargs)) {
            return NULL;
        }

        if (!checkTypeByte(input, parsedargs.element_type)) {
            return NULL;
        }

        len = readI32(input);
        if (!check_length_limit(len, container_limit)) {
            return NULL;
        }

        use_tuple = type == T_LIST && parsedargs.immutable;
        ret = use_tuple ? PyTuple_New(len) : PyList_New(len);
        if (!ret) {
            return NULL;
        }

        for (i = 0; i < len; i++) {
            PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs, string_limit, container_limit);
            if (!item) {
                Py_DECREF(ret);
                return NULL;
            }
            if (use_tuple) {
                PyTuple_SET_ITEM(ret, i, item);
            } else  {
                PyList_SET_ITEM(ret, i, item);
            }
        }

        // TODO(dreiss): Consider biting the bullet and making two separate cases
        //               for list and set, avoiding this post facto conversion.
        if (type == T_SET) {
            PyObject* setret;
            setret = parsedargs.immutable ? PyFrozenSet_New(ret) : PySet_New(ret);
            Py_DECREF(ret);
            return setret;
        }
        return ret;
    }

    case T_MAP: {
        int32_t len;
        int i;
        MapTypeArgs parsedargs;
        PyObject* ret = NULL;

        if (!parse_map_args(&parsedargs, typeargs)) {
            return NULL;
        }

        if (!checkTypeByte(input, parsedargs.ktag)) {
            return NULL;
        }
        if (!checkTypeByte(input, parsedargs.vtag)) {
            return NULL;
        }

        len = readI32(input);
        if (!check_length_limit(len, container_limit)) {
            return NULL;
        }

        ret = PyDict_New();
        if (!ret) {
            goto error;
        }

        for (i = 0; i < len; i++) {
            PyObject* k = NULL;
            PyObject* v = NULL;
            k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs, string_limit, container_limit);
            if (k == NULL) {
                goto loop_error;
            }
            v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs, string_limit, container_limit);
            if (v == NULL) {
                goto loop_error;
            }
            if (PyDict_SetItem(ret, k, v) == -1) {
                goto loop_error;
            }

            Py_DECREF(k);
            Py_DECREF(v);
            continue;

            // Yuck!  Destructors, anyone?
loop_error:
            Py_XDECREF(k);
            Py_XDECREF(v);
            goto error;
        }

        if (parsedargs.immutable) {
            PyObject* thrift = PyImport_ImportModule("thrift.Thrift");
            PyObject* cls = NULL;
            PyObject* arg = NULL;
            if (!thrift) {
                goto error;
            }
            cls = PyObject_GetAttrString(thrift, "TFrozenDict");
            if (!cls) {
                goto error;
            }
            arg = PyTuple_New(1);
            PyTuple_SET_ITEM(arg, 0, ret);
            return PyObject_CallObject(cls, arg);
        }

        return ret;

error:
        Py_XDECREF(ret);
        return NULL;
    }

    case T_STRUCT: {
        StructTypeArgs parsedargs;
        if (!parse_struct_args(&parsedargs, typeargs)) {
            return NULL;
        }

        return decode_struct(input, Py_None, parsedargs.klass, parsedargs.spec, string_limit, container_limit);
    }

    case T_STOP:
    case T_VOID:
    case T_UTF16:
    case T_UTF8:
    case T_U64:
    default:
        PyErr_SetString(PyExc_TypeError, "Unexpected TType");
        return NULL;
    }
}
コード例 #7
0
// Returns a new reference.
static PyObject*
decode_val(DecodeBuffer* input, TType type, PyObject* typeargs) {
  switch (type) {

  case T_BOOL: {
    int8_t v = readByte(input);
    if (INT_CONV_ERROR_OCCURRED(v)) {
      return NULL;
    }

    switch (v) {
    case 0: Py_RETURN_FALSE;
    case 1: Py_RETURN_TRUE;
    // Don't laugh.  This is a potentially serious issue.
    default: PyErr_SetString(PyExc_TypeError, "boolean out of range"); return NULL;
    }
    break;
  }
  case T_I08: {
    int8_t v = readByte(input);
    if (INT_CONV_ERROR_OCCURRED(v)) {
      return NULL;
    }

    return PyInt_FromLong(v);
  }
  case T_I16: {
    int16_t v = readI16(input);
    if (INT_CONV_ERROR_OCCURRED(v)) {
      return NULL;
    }
    return PyInt_FromLong(v);
  }
  case T_I32: {
    int32_t v = readI32(input);
    if (INT_CONV_ERROR_OCCURRED(v)) {
      return NULL;
    }
    return PyInt_FromLong(v);
  }

  case T_I64: {
    int64_t v = readI64(input);
    if (INT_CONV_ERROR_OCCURRED(v)) {
      return NULL;
    }
    // TODO(dreiss): Find out if we can take this fastpath always when
    //               sizeof(long) == sizeof(long long).
    if (CHECK_RANGE(v, LONG_MIN, LONG_MAX)) {
      return PyInt_FromLong((long) v);
    }

    return PyLong_FromLongLong(v);
  }

  case T_DOUBLE: {
    double v = readDouble(input);
    if (v == -1.0 && PyErr_Occurred()) {
      return false;
    }
    return PyFloat_FromDouble(v);
  }

  case T_STRING: {
    Py_ssize_t len = readI32(input);
    char* buf;
    if (!readBytes(input, &buf, len)) {
      return NULL;
    }

    return PyString_FromStringAndSize(buf, len);
  }

  case T_LIST:
  case T_SET: {
    SetListTypeArgs parsedargs;
    int32_t len;
    PyObject* ret = NULL;
    int i;

    if (!parse_set_list_args(&parsedargs, typeargs)) {
      return NULL;
    }

    if (!checkTypeByte(input, parsedargs.element_type)) {
      return NULL;
    }

    len = readI32(input);
    if (!check_ssize_t_32(len)) {
      return NULL;
    }

    ret = PyList_New(len);
    if (!ret) {
      return NULL;
    }

    for (i = 0; i < len; i++) {
      PyObject* item = decode_val(input, parsedargs.element_type, parsedargs.typeargs);
      if (!item) {
        Py_DECREF(ret);
        return NULL;
      }
      PyList_SET_ITEM(ret, i, item);
    }

    // TODO(dreiss): Consider biting the bullet and making two separate cases
    //               for list and set, avoiding this post facto conversion.
    if (type == T_SET) {
      PyObject* setret;
#if (PY_VERSION_HEX < 0x02050000)
      // hack needed for older versions
      setret = PyObject_CallFunctionObjArgs((PyObject*)&PySet_Type, ret, NULL);
#else
      // official version
      setret = PySet_New(ret);
#endif
      Py_DECREF(ret);
      return setret;
    }
    return ret;
  }

  case T_MAP: {
    int32_t len;
    int i;
    MapTypeArgs parsedargs;
    PyObject* ret = NULL;

    if (!parse_map_args(&parsedargs, typeargs)) {
      return NULL;
    }

    if (!checkTypeByte(input, parsedargs.ktag)) {
      return NULL;
    }
    if (!checkTypeByte(input, parsedargs.vtag)) {
      return NULL;
    }

    len = readI32(input);
    if (!check_ssize_t_32(len)) {
      return false;
    }

    ret = PyDict_New();
    if (!ret) {
      goto error;
    }

    for (i = 0; i < len; i++) {
      PyObject* k = NULL;
      PyObject* v = NULL;
      k = decode_val(input, parsedargs.ktag, parsedargs.ktypeargs);
      if (k == NULL) {
        goto loop_error;
      }
      v = decode_val(input, parsedargs.vtag, parsedargs.vtypeargs);
      if (v == NULL) {
        goto loop_error;
      }
      if (PyDict_SetItem(ret, k, v) == -1) {
        goto loop_error;
      }

      Py_DECREF(k);
      Py_DECREF(v);
      continue;

      // Yuck!  Destructors, anyone?
      loop_error:
      Py_XDECREF(k);
      Py_XDECREF(v);
      goto error;
    }

    return ret;

    error:
    Py_XDECREF(ret);
    return NULL;
  }

  case T_STRUCT: {
    StructTypeArgs parsedargs;
    if (!parse_struct_args(&parsedargs, typeargs)) {
      return NULL;
    }

    PyObject* ret = PyObject_CallObject(parsedargs.klass, NULL);
    if (!ret) {
      return NULL;
    }

    if (!decode_struct(input, ret, parsedargs.spec)) {
      Py_DECREF(ret);
      return NULL;
    }

    return ret;
  }

  case T_STOP:
  case T_VOID:
  case T_UTF16:
  case T_UTF8:
  case T_U64:
  default:
    PyErr_SetString(PyExc_TypeError, "Unexpected TType");
    return NULL;
  }
}