/**
 * Serialize the metadata of one row group as a struct on the given protocol.
 *
 * Emitted fields, in order:
 *   1 (list of struct) - one entry per column chunk, ColChunkCount in total
 *   2 (i64)            - totalByteSize
 *   3 (i64)            - rowCount
 * followed by the field-stop marker and the struct end.
 *
 * Returns the sum of the values returned by the individual write calls.
 */
int
writeRowGroupInfo(
		struct BlockMetadata_4C* rowGroupInfo,
		CompactProtocol *prot)
{
	uint32_t	written = 0;
	int			chunkIdx = 0;

	written += writeStructBegin(prot);

	/* field 1: the column chunk list, header first and then every element */
	written += writeFieldBegin(prot, T_LIST, 1);
	written += writeListBegin(prot, T_STRUCT, rowGroupInfo->ColChunkCount);
	while (chunkIdx < rowGroupInfo->ColChunkCount)
	{
		written += writeColumnChunk(rowGroupInfo->columns + chunkIdx, prot);
		chunkIdx++;
	}

	/* field 2: total byte size of the row group */
	written += writeFieldBegin(prot, T_I64, 2);
	written += writeI64(prot, rowGroupInfo->totalByteSize);

	/* field 3: number of rows in the row group */
	written += writeFieldBegin(prot, T_I64, 3);
	written += writeI64(prot, rowGroupInfo->rowCount);

	/* close the struct */
	written += writeFieldStop(prot);
	written += writeStructEnd(prot);

	return written;
}
/**
 * Serialize one column chunk as a struct on the given protocol.
 *
 * Emitted fields, in order:
 *   1 (string) - path; skipped entirely when columnInfo->path is NULL
 *   2 (i64)    - file_offset
 *   3 (struct) - the column metadata, produced by writeColumnMetadata()
 * followed by the field-stop marker and the struct end.
 *
 * Returns the sum of the values returned by the individual write calls.
 */
int
writeColumnChunk(
		struct ColumnChunkMetadata_4C *columnInfo,
		CompactProtocol *prot)
{
	uint32_t	written = 0;

	written += writeStructBegin(prot);

	/* field 1: the path is optional and only emitted when present */
	if (columnInfo->path)
	{
		written += writeFieldBegin(prot, T_STRING, 1);
		written += writeString(prot, columnInfo->path, strlen(columnInfo->path));
	}

	/* field 2: file offset */
	written += writeFieldBegin(prot, T_I64, 2);
	written += writeI64(prot, columnInfo->file_offset);

	/* field 3: nested column metadata struct */
	written += writeFieldBegin(prot, T_STRUCT, 3);
	written += writeColumnMetadata(columnInfo, prot);

	/* close the struct */
	written += writeFieldStop(prot);
	written += writeStructEnd(prot);

	return written;
}
/*
 * Emit a double by handing its 64-bit object representation to writeI64().
 *
 * The bits are obtained by storing the double into one union member and
 * reading the int64_t member back, so no numeric conversion takes place.
 */
static void
writeDouble(PyObject* outbuf, double dub) {
  union {
    double as_double;
    int64_t as_bits;
  } pun = { .as_double = dub };

  writeI64(outbuf, pun.as_bits);
}
/**
 * Write out the leading part of the parquet file metadata (everything up to
 * and including the already existing row groups): version, schema, number
 * of rows and the header of the row group list.
 *
 * A temporary write protocol is allocated for this, initialised in
 * PARQUET_FOOTER_BUFFERMODE_WRITE mode, and released again before returning.
 *
 * parquetMetadata - source of version, schema, num_rows and blockCount
 * fileName, file  - handed to initCompactProtocol(); file is also the target
 *                   of appendFooterBufferTempData()
 * rowgroupCnt     - forwarded to writePerviousRowGroupMetadata(); when it is
 *                   non-zero the footer deserializer is ended afterwards
 * read_prot       - read protocol; *read_prot is passed to
 *                   writePerviousRowGroupMetadata()
 *
 * Returns the value reported by appendFooterBufferTempData().
 *
 * NOTE(review): the per-call byte counts are accumulated and then replaced
 * by the appendFooterBufferTempData() result, so they never reach the
 * caller - confirm that this is intended.
 */
int
writePreviousParquetFileMetadata(
		ParquetMetadata parquetMetadata,
		char *fileName,
		File file,
		int rowgroupCnt,
		CompactProtocol **read_prot)
{
	uint32_t		bytes = 0;
	CompactProtocol *footerOut;

	footerOut = (struct CompactProtocol *) palloc0(sizeof(struct CompactProtocol));
	initCompactProtocol(footerOut, file, fileName, -1,
						PARQUET_FOOTER_BUFFERMODE_WRITE);

	bytes += writeStructBegin(footerOut);

	/* field 1: format version */
	bytes += writeFieldBegin(footerOut, T_I32, 1);
	bytes += writeI32(footerOut, (int32_t)parquetMetadata->version);

	/* field 2: schema; the list header announces schemaTreeNodeCount + 1 elements */
	bytes += writeFieldBegin(footerOut, T_LIST, 2);
	bytes += writeListBegin(footerOut, T_STRUCT,
							parquetMetadata->schemaTreeNodeCount + 1);
	bytes += writeSchemaElement(parquetMetadata->pfield,
								parquetMetadata->fieldCount,
								parquetMetadata->schemaTreeNodeCount,
								footerOut);

	/* field 3: total number of rows */
	bytes += writeFieldBegin(footerOut, T_I64, 3);
	bytes += writeI64(footerOut, (int64_t)parquetMetadata->num_rows);

	/* field 4: row group list header, sized by blockCount */
	bytes += writeFieldBegin(footerOut, T_LIST, 4);
	bytes += writeListBegin(footerOut, T_STRUCT, parquetMetadata->blockCount);

	/* copy over the row group metadata that existed before deserialization */
	writePerviousRowGroupMetadata(rowgroupCnt, parquetMetadata, *read_prot,
								  footerOut);

	/* append this first part of the footer to the file */
	bytes = appendFooterBufferTempData(file, footerOut->footerProcessor);

	/* the temporary write protocol is no longer needed */
	freeCompactProtocol(footerOut);
	pfree(footerOut);

	/* previous metadata was being read: finish the footer deserializer */
	if (rowgroupCnt != 0)
	{
		endDeserializerFooter(parquetMetadata, read_prot);
	}

	return bytes;
}
/**
 * Write part functions
 */

/*
 * Return the next non-empty element of a delimiter-separated path, or NULL
 * when no element is left.
 *
 * Leading delimiters are skipped, the element is NUL-terminated in place and
 * *cursor is advanced past it. This is the same splitting rule strtok()
 * applies, but all scanning state lives in the caller's cursor instead of in
 * hidden static storage.
 */
static char *
nextPathInSchemaElement(char **cursor, const char *delim)
{
	char	   *elem = *cursor + strspn(*cursor, delim);
	char	   *end;

	if (*elem == '\0')
	{
		*cursor = elem;
		return NULL;
	}

	end = elem + strcspn(elem, delim);
	if (*end != '\0')
		*end++ = '\0';
	*cursor = end;

	return elem;
}

/**
 * Serialize the column metadata of one column chunk as a struct.
 *
 * Emitted fields, in order:
 *   1 (i32)            - type
 *   2 (list of i32)    - the EncodingCount entries of pEncodings
 *   3 (list of string) - path_in_schema: the first `depth` elements of
 *                        pathInSchema, split on ':'
 *   4 (i32)            - codec
 *   5 (i64)            - valueCount
 *   6 (i64)            - totalUncompressedSize
 *   7 (i64)            - totalSize
 *   9 (i64)            - firstDataPage
 * followed by the field-stop marker and the struct end. Field 8 (key value
 * metadata) and the index/dictionary page offsets are not written.
 *
 * columnInfo->pathInSchema must not be NULL. An ERROR is reported when it
 * holds fewer than `depth` elements.
 *
 * Returns the sum of the values returned by the individual write calls.
 */
int
writeColumnMetadata(
		struct ColumnChunkMetadata_4C *columnInfo,
		CompactProtocol *prot)
{
	uint32_t	xfer = 0;
	char	   *elemPath = NULL;
	char	   *cursor = NULL;
	const char *delim = ":";

	Assert(NULL != columnInfo->pathInSchema);
	/* scratch copy: splitting writes NUL bytes into the string */
	char		path[strlen(columnInfo->pathInSchema) + 1];

	xfer += writeStructBegin(prot);

	/*write out type*/
	xfer += writeFieldBegin(prot, T_I32, 1);
	xfer += writeI32(prot, columnInfo->type);

	/*write out encoding*/
	xfer += writeFieldBegin(prot, T_LIST, 2);
	xfer += writeListBegin(prot, T_I32, columnInfo->EncodingCount);
	for (int i = 0; i < columnInfo->EncodingCount; i++)
	{
		xfer += writeI32(prot, (int32_t)(columnInfo->pEncodings[i]));
	}

	/*write out path_in_schema*/
	xfer += writeFieldBegin(prot, T_LIST, 3);
	xfer += writeListBegin(prot, T_STRING, columnInfo->depth);
	strcpy(path, columnInfo->pathInSchema);
	cursor = path;

	/*
	 * Write exactly `depth` elements so that the list body always matches
	 * the element count announced in the list header above.
	 */
	for (int i = 0; i < columnInfo->depth; i++)
	{
		elemPath = nextPathInSchemaElement(&cursor, delim);
		if (elemPath == NULL)
		{
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERNAL_ERROR),
					 errmsg("file metadata column metadata(path_in_schema) not correct")));
		}
		xfer += writeString(prot, elemPath, strlen(elemPath));
	}

	/*write out codec*/
	xfer += writeFieldBegin(prot, T_I32, 4);
	xfer += writeI32(prot, (int32_t)columnInfo->codec);

	/*write out num of values*/
	xfer += writeFieldBegin(prot, T_I64, 5);
	xfer += writeI64(prot, (int64_t)columnInfo->valueCount);

	/*write total uncompressed size*/
	xfer += writeFieldBegin(prot, T_I64, 6);
	xfer += writeI64(prot, columnInfo->totalUncompressedSize);

	/*write out total compressed size*/
	xfer += writeFieldBegin(prot, T_I64, 7);
	xfer += writeI64(prot, columnInfo->totalSize);

	/*write out key value metadata.*/
	/*There's no key value metadata for parquet storage, don't need to write it out*/

	/*write out data page offset*/
	xfer += writeFieldBegin(prot, T_I64, 9);
	xfer += writeI64(prot, columnInfo->firstDataPage);

	/*write out index page offset and dictionary page offset. No need to write currently*/

	/*write out field stop identifier*/
	xfer += writeFieldStop(prot);
	xfer += writeStructEnd(prot);

	return xfer;
}
/*
 * Serialize `value` as the thrift type `type` into `output`.
 *
 * `typeargs` carries the extra type information for the string and container
 * cases (UTF-8 flag, element/key/value types, struct spec); it is only
 * consulted for T_STRING, T_LIST, T_SET, T_MAP and T_STRUCT.
 *
 * Returns true on success and false on failure. For an unsupported TType a
 * TypeError is set; on the other failure paths the exception is either set
 * here or is expected to have been set by the call that failed.
 */
static bool
output_val(PyObject* output, PyObject* value, TType type, PyObject* typeargs) {
  /*
   * Refcounting Strategy:
   *
   * We assume that elements of the thrift_spec tuple are not going to be
   * mutated, so we don't ref count those at all. Other than that, we try to
   * keep a reference to all the user-created objects while we work with them.
   * output_val assumes that a reference is already held. The *caller* is
   * responsible for handling references
   */
  switch (type) {

  case T_BOOL: {
    int v = PyObject_IsTrue(value);
    if (v == -1) {
      return false;
    }

    writeByte(output, (int8_t) v);
    break;
  }
  case T_I08: {
    int32_t val;

    if (!parse_pyint(value, &val, INT8_MIN, INT8_MAX)) {
      return false;
    }

    writeByte(output, (int8_t) val);
    break;
  }
  case T_I16: {
    int32_t val;

    if (!parse_pyint(value, &val, INT16_MIN, INT16_MAX)) {
      return false;
    }

    writeI16(output, (int16_t) val);
    break;
  }
  case T_I32: {
    int32_t val;

    if (!parse_pyint(value, &val, INT32_MIN, INT32_MAX)) {
      return false;
    }

    writeI32(output, val);
    break;
  }
  case T_I64: {
    int64_t nval = PyLong_AsLongLong(value);

    if (INT_CONV_ERROR_OCCURRED(nval)) {
      return false;
    }

    if (!CHECK_RANGE(nval, INT64_MIN, INT64_MAX)) {
      PyErr_SetString(PyExc_OverflowError, "int out of range");
      return false;
    }

    writeI64(output, nval);
    break;
  }

  case T_DOUBLE: {
    double nval = PyFloat_AsDouble(value);
    /* -1.0 is also a legitimate value, so the error indicator decides */
    if (nval == -1.0 && PyErr_Occurred()) {
      return false;
    }

    writeDouble(output, nval);
    break;
  }

  case T_STRING: {
    Py_ssize_t len = 0;
    /* UTF-8 encoded copy of a unicode value; owned by this block when set */
    PyObject* encoded = NULL;

    if (is_utf8(typeargs) && PyUnicode_Check(value)) {
      encoded = PyUnicode_AsUTF8String(value);
      if (encoded == NULL) {
        /* encoding failed; the exception is already set */
        return false;
      }
      value = encoded;
    }

    len = PyString_Size(value);

    if (!check_ssize_t_32(len)) {
      Py_XDECREF(encoded);
      return false;
    }

    /* length prefix followed by the raw bytes */
    writeI32(output, (int32_t) len);
    PycStringIO->cwrite(output, PyString_AsString(value), (int32_t) len);

    /* PyUnicode_AsUTF8String handed us a new reference; release it */
    Py_XDECREF(encoded);
    break;
  }

  case T_LIST:
  case T_SET: {
    Py_ssize_t len;
    SetListTypeArgs parsedargs;
    PyObject *item;
    PyObject *iterator;

    if (!parse_set_list_args(&parsedargs, typeargs)) {
      return false;
    }

    len = PyObject_Length(value);

    if (!check_ssize_t_32(len)) {
      return false;
    }

    /* container header: element type, then element count */
    writeByte(output, parsedargs.element_type);
    writeI32(output, (int32_t) len);

    iterator = PyObject_GetIter(value);
    if (iterator == NULL) {
      return false;
    }

    while ((item = PyIter_Next(iterator))) {
      if (!output_val(output, item, parsedargs.element_type, parsedargs.typeargs)) {
        Py_DECREF(item);
        Py_DECREF(iterator);
        return false;
      }
      Py_DECREF(item);
    }

    Py_DECREF(iterator);

    /* PyIter_Next returns NULL both at exhaustion and on error */
    if (PyErr_Occurred()) {
      return false;
    }

    break;
  }

  case T_MAP: {
    PyObject *k, *v;
    Py_ssize_t pos = 0;
    Py_ssize_t len;

    MapTypeArgs parsedargs;

    len = PyDict_Size(value);
    if (!check_ssize_t_32(len)) {
      return false;
    }

    if (!parse_map_args(&parsedargs, typeargs)) {
      return false;
    }

    /* container header: key type, value type, then entry count */
    writeByte(output, parsedargs.ktag);
    writeByte(output, parsedargs.vtag);
    writeI32(output, len);

    // TODO(bmaurer): should support any mapping, not just dicts
    while (PyDict_Next(value, &pos, &k, &v)) {
      // TODO(dreiss): Think hard about whether these INCREFs actually
      //               turn any unsafe scenarios into safe scenarios.
      Py_INCREF(k);
      Py_INCREF(v);

      if (!output_val(output, k, parsedargs.ktag, parsedargs.ktypeargs)
          || !output_val(output, v, parsedargs.vtag, parsedargs.vtypeargs)) {
        Py_DECREF(k);
        Py_DECREF(v);
        return false;
      }
      Py_DECREF(k);
      Py_DECREF(v);
    }
    break;
  }

  // TODO(dreiss): Consider breaking this out as a function
  //               the way we did for decode_struct.
  case T_STRUCT: {
    StructTypeArgs parsedargs;
    Py_ssize_t nspec;
    Py_ssize_t i;

    if (!parse_struct_args(&parsedargs, typeargs)) {
      return false;
    }

    nspec = PyTuple_Size(parsedargs.spec);

    if (nspec == -1) {
      return false;
    }

    for (i = 0; i < nspec; i++) {
      StructItemSpec parsedspec;
      PyObject* spec_tuple;
      PyObject* instval = NULL;

      /* None entries in the spec are skipped */
      spec_tuple = PyTuple_GET_ITEM(parsedargs.spec, i);
      if (spec_tuple == Py_None) {
        continue;
      }

      if (!parse_struct_item_spec (&parsedspec, spec_tuple)) {
        return false;
      }

      instval = PyObject_GetAttr(value, parsedspec.attrname);

      if (!instval) {
        return false;
      }

      /* attributes that are None are not written at all */
      if (instval == Py_None) {
        Py_DECREF(instval);
        continue;
      }

      /* field header: type byte and 16-bit tag, then the field value */
      writeByte(output, (int8_t) parsedspec.type);
      writeI16(output, parsedspec.tag);

      if (!output_val(output, instval, parsedspec.type, parsedspec.typeargs)) {
        Py_DECREF(instval);
        return false;
      }

      Py_DECREF(instval);
    }

    writeByte(output, (int8_t)T_STOP);
    break;
  }

  case T_STOP:
  case T_VOID:
  case T_UTF16:
  case T_UTF8:
  case T_U64:
  default:
    PyErr_SetString(PyExc_TypeError, "Unexpected TType");
    return false;

  }

  return true;
}