int
writeRowGroupInfo(
		struct BlockMetadata_4C* rowGroupInfo,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	xfer += writeStructBegin(prot);

	/*write out the column chunk metadata*/
	xfer += writeFieldBegin(prot, T_LIST, 1);
	xfer += writeListBegin(prot, T_STRUCT, rowGroupInfo->ColChunkCount);
	for(int i = 0; i < rowGroupInfo->ColChunkCount; i++){
		/*write out each column chunk metadata*/
		xfer += writeColumnChunk(&(rowGroupInfo->columns[i]), prot);
	}

	/*write out total byte size*/
	xfer += writeFieldBegin(prot, T_I64, 2);
	xfer += writeI64(prot, rowGroupInfo->totalByteSize);

	/*write out num_rows*/
	xfer += writeFieldBegin(prot, T_I64, 3);
	xfer += writeI64(prot, rowGroupInfo->rowCount);

	xfer += writeFieldStop(prot);
	xfer += writeStructEnd(prot);
	return xfer;
}
int
writeColumnChunk(
		struct ColumnChunkMetadata_4C *columnInfo,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	xfer += writeStructBegin(prot);

	/*write out column path*/
	if(columnInfo->path != NULL)
	{
		xfer += writeFieldBegin(prot, T_STRING, 1);
		xfer += writeString(prot, columnInfo->path, strlen(columnInfo->path));
	}

	/*write out file offset*/
	xfer += writeFieldBegin(prot, T_I64, 2);
	xfer += writeI64(prot, columnInfo->file_offset);

	/*write out column metadata*/
	xfer += writeFieldBegin(prot, T_STRUCT, 3);
	xfer += writeColumnMetadata(columnInfo, prot);

	xfer += writeFieldStop(prot);
	xfer += writeStructEnd(prot);

	return xfer;
}
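
For reference, the field IDs written above follow the RowGroup and ColumnChunk definitions in the Apache Parquet format's parquet.thrift. The relevant fields are summarized below as a C comment; this is a paraphrase for orientation, not part of the original source.

/*
 * parquet.thrift (paraphrased):
 *   RowGroup:    1: list<ColumnChunk> columns
 *                2: i64 total_byte_size
 *                3: i64 num_rows
 *   ColumnChunk: 1: optional string file_path
 *                2: i64 file_offset
 *                3: optional ColumnMetaData meta_data
 */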
Example #3
static void writeDouble(PyObject* outbuf, double dub) {
    // Unfortunately, bitwise_cast doesn't work in C.  Bad C!
    union {
        double f;
        int64_t t;
    } transfer;
    transfer.f = dub;
    writeI64(outbuf, transfer.t);
}
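
The union above is the conventional C idiom for reinterpreting a double's bit pattern; memcpy is a widely used alternative that avoids type punning through a union. A minimal standalone sketch, not part of the original code, using a hypothetical helper name and assuming sizeof(double) == sizeof(int64_t):

#include <stdint.h>
#include <string.h>

/* Copy the double's bit pattern into an int64_t without union type punning.
 * Assumes sizeof(double) == sizeof(int64_t), as the union version does. */
static int64_t doubleBitsToI64(double d) {
    int64_t bits;
    memcpy(&bits, &d, sizeof(bits));
    return bits;
}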
/**
 * Write out the beginning of the parquet file metadata (the part before the
 * row groups), including the version, schema, and num_rows.
 */
int
writePreviousParquetFileMetadata(
		ParquetMetadata parquetMetadata,
		char *fileName,
		File file,
		int rowgroupCnt,
		CompactProtocol **read_prot)
{
	uint32_t xfer = 0;
	CompactProtocol *write_prot = (struct CompactProtocol *) palloc0(sizeof(struct CompactProtocol));

	initCompactProtocol(write_prot, file, fileName, -1,
			PARQUET_FOOTER_BUFFERMODE_WRITE);

	xfer += writeStructBegin(write_prot);

	/*write out version*/
	xfer += writeFieldBegin(write_prot, T_I32, 1);
	xfer += writeI32(write_prot, (int32_t)parquetMetadata->version);

	/*write out schema*/
	xfer += writeFieldBegin(write_prot, T_LIST, 2);
	xfer += writeListBegin(write_prot, T_STRUCT, parquetMetadata->schemaTreeNodeCount + 1);
	xfer += writeSchemaElement(parquetMetadata->pfield, parquetMetadata->fieldCount, parquetMetadata->schemaTreeNodeCount, write_prot);

	/*write out number of rows*/
	xfer += writeFieldBegin(write_prot, T_I64, 3);
	xfer += writeI64(write_prot, (int64_t)parquetMetadata->num_rows);

	/*write out rowgroup size*/
	xfer += writeFieldBegin(write_prot, T_LIST, 4);
	xfer += writeListBegin(write_prot, T_STRUCT, parquetMetadata->blockCount);

	/*write out the previous row group metadata, read back through the deserializer*/
	writePerviousRowGroupMetadata(rowgroupCnt, parquetMetadata, *read_prot, write_prot);

	/*append the first part of the footer to the file*/
	xfer = appendFooterBufferTempData(file, write_prot->footerProcessor);

	/*free the write protocol for the first part of the file*/
	freeCompactProtocol(write_prot);
	pfree(write_prot);

	/*if there is previous metadata, end the footer deserializer*/
	if(rowgroupCnt != 0)
		endDeserializerFooter(parquetMetadata, read_prot);

	return xfer;
}
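
For reference, the field IDs 1 through 4 used above correspond to the FileMetaData definition in parquet.thrift, summarized below as a C comment; this is a paraphrase for orientation, not part of the original source.

/*
 * parquet.thrift (paraphrased):
 *   FileMetaData: 1: i32 version
 *                 2: list<SchemaElement> schema
 *                 3: i64 num_rows
 *                 4: list<RowGroup> row_groups
 *                 5: optional list<KeyValue> key_value_metadata
 *                 6: optional string created_by
 */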
/**
 * Write part functions
 */
int
writeColumnMetadata(
		struct ColumnChunkMetadata_4C *columnInfo,
		CompactProtocol *prot)
{
	uint32_t xfer = 0;
	char *elemPath = NULL;
	const char *delim = ":";
	Assert(NULL != columnInfo->pathInSchema);
	char path[strlen(columnInfo->pathInSchema) + 1];

	xfer += writeStructBegin(prot);

	/*write out type*/
	xfer += writeFieldBegin(prot, T_I32, 1);
	xfer += writeI32(prot, columnInfo->type);

	/*write out encoding*/
	xfer += writeFieldBegin(prot, T_LIST, 2);
	xfer += writeListBegin(prot, T_I32, columnInfo->EncodingCount);
	for (int i = 0; i < columnInfo->EncodingCount; i++) {
		xfer += writeI32(prot, (int32_t)(columnInfo->pEncodings[i]));
	}

	/*write out path_in_schema*/
	xfer += writeFieldBegin(prot, T_LIST, 3);
	xfer += writeListBegin(prot, T_STRING, columnInfo->depth);
	strcpy(path, columnInfo->pathInSchema);

	elemPath = strtok(path, delim);
	if (elemPath == NULL) {
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
						errmsg("file metadata column metadata(path_in_schema) not correct")));
	}
	xfer += writeString(prot, elemPath, strlen(elemPath));
	for (int i = 1; i < columnInfo->depth; i++) {
		elemPath = strtok(NULL, delim);
		if (elemPath == NULL) {
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERNAL_ERROR),
							errmsg("file metadata column metadata(path_in_schema) not correct")));
		}
		xfer += writeString(prot, elemPath, strlen(elemPath));
	}

	/*write out codec*/
	xfer += writeFieldBegin(prot, T_I32, 4);
	xfer += writeI32(prot, (int32_t)columnInfo->codec);

	/*write out num of values*/
	xfer += writeFieldBegin(prot, T_I64, 5);
	xfer += writeI64(prot, (int64_t)columnInfo->valueCount);

	/*write total uncompressed size*/
	xfer += writeFieldBegin(prot, T_I64, 6);
	xfer += writeI64(prot, columnInfo->totalUncompressedSize);

	/*write out total compressed size*/
	xfer += writeFieldBegin(prot, T_I64, 7);
	xfer += writeI64(prot, columnInfo->totalSize);

	/*write out key value metadata*/
	/*There's no key value metadata for parquet storage, so we don't need to write it out*/

	/*write out data page offset*/
	xfer += writeFieldBegin(prot, T_I64, 9);
	xfer += writeI64(prot, columnInfo->firstDataPage);

	/*index page offset and dictionary page offset are not written currently*/

	/*write out field stop identifier*/
	xfer += writeFieldStop(prot);
	xfer += writeStructEnd(prot);

	return xfer;
}
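
For reference, the field IDs written in writeColumnMetadata, including the skipped field 8 and the unwritten fields 10 and 11, follow the ColumnMetaData definition in parquet.thrift, summarized below as a C comment; this is a paraphrase for orientation, not part of the original source.

/*
 * parquet.thrift (paraphrased):
 *   ColumnMetaData: 1:  Type type
 *                   2:  list<Encoding> encodings
 *                   3:  list<string> path_in_schema
 *                   4:  CompressionCodec codec
 *                   5:  i64 num_values
 *                   6:  i64 total_uncompressed_size
 *                   7:  i64 total_compressed_size
 *                   8:  optional list<KeyValue> key_value_metadata
 *                   9:  i64 data_page_offset
 *                   10: optional i64 index_page_offset
 *                   11: optional i64 dictionary_page_offset
 */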
Example #6
static bool
output_val(PyObject* output, PyObject* value, TType type, PyObject* typeargs) {
    /*
     * Refcounting Strategy:
     *
     * We assume that elements of the thrift_spec tuple are not going to be
     * mutated, so we don't ref count those at all. Other than that, we try to
     * keep a reference to all the user-created objects while we work with them.
     * output_val assumes that a reference is already held. The *caller* is
     * responsible for handling references
     */

    switch (type) {

    case T_BOOL: {
        int v = PyObject_IsTrue(value);
        if (v == -1) {
            return false;
        }

        writeByte(output, (int8_t) v);
        break;
    }
    case T_I08: {
        int32_t val;

        if (!parse_pyint(value, &val, INT8_MIN, INT8_MAX)) {
            return false;
        }

        writeByte(output, (int8_t) val);
        break;
    }
    case T_I16: {
        int32_t val;

        if (!parse_pyint(value, &val, INT16_MIN, INT16_MAX)) {
            return false;
        }

        writeI16(output, (int16_t) val);
        break;
    }
    case T_I32: {
        int32_t val;

        if (!parse_pyint(value, &val, INT32_MIN, INT32_MAX)) {
            return false;
        }

        writeI32(output, val);
        break;
    }
    case T_I64: {
        int64_t nval = PyLong_AsLongLong(value);

        if (INT_CONV_ERROR_OCCURRED(nval)) {
            return false;
        }

        if (!CHECK_RANGE(nval, INT64_MIN, INT64_MAX)) {
            PyErr_SetString(PyExc_OverflowError, "int out of range");
            return false;
        }

        writeI64(output, nval);
        break;
    }

    case T_DOUBLE: {
        double nval = PyFloat_AsDouble(value);
        if (nval == -1.0 && PyErr_Occurred()) {
            return false;
        }

        writeDouble(output, nval);
        break;
    }

    case T_STRING: {
        Py_ssize_t len = 0;
        if (is_utf8(typeargs) && PyUnicode_Check(value))
            value = PyUnicode_AsUTF8String(value);
        len = PyString_Size(value);

        if (!check_ssize_t_32(len)) {
            return false;
        }

        writeI32(output, (int32_t) len);
        PycStringIO->cwrite(output, PyString_AsString(value), (int32_t) len);
        break;
    }

    case T_LIST:
    case T_SET: {
        Py_ssize_t len;
        SetListTypeArgs parsedargs;
        PyObject *item;
        PyObject *iterator;

        if (!parse_set_list_args(&parsedargs, typeargs)) {
            return false;
        }

        len = PyObject_Length(value);

        if (!check_ssize_t_32(len)) {
            return false;
        }

        writeByte(output, parsedargs.element_type);
        writeI32(output, (int32_t) len);

        iterator =  PyObject_GetIter(value);
        if (iterator == NULL) {
            return false;
        }

        while ((item = PyIter_Next(iterator))) {
            if (!output_val(output, item, parsedargs.element_type, parsedargs.typeargs)) {
                Py_DECREF(item);
                Py_DECREF(iterator);
                return false;
            }
            Py_DECREF(item);
        }

        Py_DECREF(iterator);

        if (PyErr_Occurred()) {
            return false;
        }

        break;
    }

    case T_MAP: {
        PyObject *k, *v;
        Py_ssize_t pos = 0;
        Py_ssize_t len;

        MapTypeArgs parsedargs;

        len = PyDict_Size(value);
        if (!check_ssize_t_32(len)) {
            return false;
        }

        if (!parse_map_args(&parsedargs, typeargs)) {
            return false;
        }

        writeByte(output, parsedargs.ktag);
        writeByte(output, parsedargs.vtag);
        writeI32(output, len);

        // TODO(bmaurer): should support any mapping, not just dicts
        while (PyDict_Next(value, &pos, &k, &v)) {
            // TODO(dreiss): Think hard about whether these INCREFs actually
            //               turn any unsafe scenarios into safe scenarios.
            Py_INCREF(k);
            Py_INCREF(v);

            if (!output_val(output, k, parsedargs.ktag, parsedargs.ktypeargs)
                    || !output_val(output, v, parsedargs.vtag, parsedargs.vtypeargs)) {
                Py_DECREF(k);
                Py_DECREF(v);
                return false;
            }
            Py_DECREF(k);
            Py_DECREF(v);
        }
        break;
    }

    // TODO(dreiss): Consider breaking this out as a function
    //               the way we did for decode_struct.
    case T_STRUCT: {
        StructTypeArgs parsedargs;
        Py_ssize_t nspec;
        Py_ssize_t i;

        if (!parse_struct_args(&parsedargs, typeargs)) {
            return false;
        }

        nspec = PyTuple_Size(parsedargs.spec);

        if (nspec == -1) {
            return false;
        }

        for (i = 0; i < nspec; i++) {
            StructItemSpec parsedspec;
            PyObject* spec_tuple;
            PyObject* instval = NULL;

            spec_tuple = PyTuple_GET_ITEM(parsedargs.spec, i);
            if (spec_tuple == Py_None) {
                continue;
            }

            if (!parse_struct_item_spec (&parsedspec, spec_tuple)) {
                return false;
            }

            instval = PyObject_GetAttr(value, parsedspec.attrname);

            if (!instval) {
                return false;
            }

            if (instval == Py_None) {
                Py_DECREF(instval);
                continue;
            }

            writeByte(output, (int8_t) parsedspec.type);
            writeI16(output, parsedspec.tag);

            if (!output_val(output, instval, parsedspec.type, parsedspec.typeargs)) {
                Py_DECREF(instval);
                return false;
            }

            Py_DECREF(instval);
        }

        writeByte(output, (int8_t)T_STOP);
        break;
    }

    case T_STOP:
    case T_VOID:
    case T_UTF16:
    case T_UTF8:
    case T_U64:
    default:
        PyErr_SetString(PyExc_TypeError, "Unexpected TType");
        return false;

    }

    return true;
}