Example #1
0
/*
 * Map the external table's data format to a short format name.
 *
 * The relation is taken from the external-protocol call context and its
 * external table entry supplies the format code.  Text, CSV, Avro and
 * Parquet map to "txt", "csv", "avro" and "parquet"; any other format code
 * yields S3_DEFAULT_FORMAT.
 *
 * Used to set the file extension on S3 in gpwriter.
 */
const char *get_format_str(FunctionCallInfo fcinfo) {
    Relation relation = EXTPROTOCOL_GET_RELATION(fcinfo);
    ExtTableEntry *entry = GetExtTableEntry(relation->rd_id);
    char code = entry->fmtcode;
    const char *name = S3_DEFAULT_FORMAT;

    /* Checked in the same order as before: text, csv, avro, parquet. */
    if (fmttype_is_text(code)) {
        name = "txt";
    } else if (fmttype_is_csv(code)) {
        name = "csv";
    } else if (fmttype_is_avro(code)) {
        name = "avro";
    } else if (fmttype_is_parquet(code)) {
        name = "parquet";
    }

    return name;
}
Example #2
0
/*
 * Export data out of GPDB.
 *
 * External-protocol write callback.  Depending on the call state it does
 * one of three things:
 *
 *   - last call:  closes the URL_FILE stored in the user context (if any)
 *                 and returns 0;
 *   - first call: opens the connector through gphdfs_fopen(), stores the
 *                 handle in the user context and, for AVRO/PARQUET tables,
 *                 writes a schema header (relation name plus, per column:
 *                 name, type oid, not-null flag, ndims, type delimiter);
 *   - every non-last call: writes the current data buffer.
 *
 * Returns the number of bytes written as an int32.  Raises ERROR when not
 * invoked through the protocol manager or when url_ferror() reports a
 * failure after the write.
 */
Datum
gphdfsprotocol_export(PG_FUNCTION_ARGS)
{
	URL_FILE *myData;
	char     *data;
	int       datlen;
	size_t    wrote = 0;
	/*
	 * Per-call error buffer.  It is zeroed on every call so that the
	 * "is there a message?" test below can never pick up text left behind
	 * by an earlier, unrelated failure.
	 */
	char      ebuf[512] = {0};

	/* Must be called via the external table format manager */
	if (!CALLED_AS_EXTPROTOCOL(fcinfo))
		elog(ERROR, "cannot execute gphdfsprotocol_export outside protocol manager");

	/* Get our internal description of the protocol */
	myData = (URL_FILE *) EXTPROTOCOL_GET_USER_CTX(fcinfo);

	/* =======================================================================
	 *                            DO CLOSE
	 * ======================================================================= */
	if (EXTPROTOCOL_IS_LAST_CALL(fcinfo))
	{
		if (myData)
			url_fclose(myData, true, "gphdfs protocol");
		PG_RETURN_INT32(0);
	}

	/* =======================================================================
	 *                            DO OPEN
	 * ======================================================================= */
	if (myData == NULL)
	{
		myData = gphdfs_fopen(fcinfo, true);
		EXTPROTOCOL_SET_USER_CTX(fcinfo, myData);

		/*
		 * NOTE(review): this is a protocol call (checked above), yet the
		 * relation is fetched with the formatter accessor.  Other protocol
		 * code uses EXTPROTOCOL_GET_RELATION -- confirm the two are
		 * interchangeable here or switch to the protocol accessor.
		 */
		Relation relation = FORMATTER_GET_RELATION(fcinfo);
		ExtTableEntry *exttbl = GetExtTableEntry(relation->rd_id);

		/* Only AVRO and PARQUET get the schema header on the pipe. */
		if (fmttype_is_avro(exttbl->fmtcode) || fmttype_is_parquet(exttbl->fmtcode))
		{
			/* Allocated only when needed, released in full below. */
			StringInfo schema_data = makeStringInfo();
			StringInfo schema_head;

			/* relation name: length followed by the raw bytes */
			int relNameLen = strlen(relation->rd_rel->relname.data);
			appendIntToBuffer(schema_data, relNameLen);
			appendBinaryStringInfo(schema_data, relation->rd_rel->relname.data, relNameLen);

			int ncolumns = relation->rd_att->natts;
			appendIntToBuffer(schema_data, ncolumns);

			for (int i = 0; i < ncolumns; i++)
			{
				Oid type = relation->rd_att->attrs[i]->atttypid;

				/* add attname,atttypid,attnotnull,attndims to schema_data field */
				int attNameLen = strlen(relation->rd_att->attrs[i]->attname.data);
				appendIntToBuffer(schema_data, attNameLen);
				appendBinaryStringInfo(schema_data, relation->rd_att->attrs[i]->attname.data, attNameLen);

				appendIntToBuffer(schema_data, type);

				bool notNull = relation->rd_att->attrs[i]->attnotnull;
				appendInt1ToBuffer(schema_data, notNull ? 1 : 0);

				appendIntToBuffer(schema_data, relation->rd_att->attrs[i]->attndims);

				/*
				 * add type delimiter, for udt, it can be any char.
				 * Only delim is used; the other outputs are required by
				 * the get_type_io_data() signature.
				 */
				char delim = 0;
				int16 typlen;
				bool typbyval;
				char typalign;
				Oid typioparam;
				Oid func;
				get_type_io_data(type, IOFunc_input, &typlen, &typbyval, &typalign, &delim, &typioparam, &func);
				appendInt1ToBuffer(schema_data, delim);
			}

			/* header: 4-byte value (payload length + 2) then a 2-byte value 2 */
			schema_head = makeStringInfo();
			appendIntToBuffer(schema_head, schema_data->len + 2);
			appendInt2ToBuffer(schema_head, 2);

			/* The results of these two writes are not inspected here. */
			url_execute_fwrite(schema_head->data, schema_head->len, myData, NULL);
			url_execute_fwrite(schema_data->data, schema_data->len, myData, NULL);

			/* Release both the buffers and the StringInfo headers. */
			pfree(schema_head->data);
			pfree(schema_head);
			pfree(schema_data->data);
			pfree(schema_data);
		}
	}


	/* =======================================================================
	 *                            DO THE EXPORT
	 * ======================================================================= */
	data   = EXTPROTOCOL_GET_DATABUF(fcinfo);
	datlen = EXTPROTOCOL_GET_DATALEN(fcinfo);

	if (datlen > 0)
		wrote = url_execute_fwrite(data, datlen, myData, NULL);

	if (url_ferror(myData, wrote, ebuf, (int) sizeof(ebuf)))
	{
		/* Prefer the text placed in ebuf; otherwise fall back to errno (%m). */
		ereport(ERROR,
				(errcode_for_file_access(),
				 strlen(ebuf) > 0 ? errmsg("could not write to external resource:\n%s",ebuf) :
				 errmsg("could not write to external resource: %m")));
	}

	PG_RETURN_INT32((int)wrote);
}
Example #3
0
/**
 * Open/Init of the gphdfs protocol
 *
 * It sets up the Hadoop env vars by sourcing hadoop_env.sh, then launches
 * the corresponding java program (HDFSWriter when forwrite is true,
 * HDFSReader otherwise) to do the actual read/write.
 *
 * The command line passed to url_execute_fopen() is:
 *   <env setup>;<java command> <format> <connector version> <quoted url>
 * and, for reads only, two extra single-quoted arguments: the encoded
 * column schema and the comma-terminated list of column names.
 *
 * Raises ERROR if the URL contains characters rejected by
 * hasIllegalCharacters().  Returns the URL_FILE produced by
 * url_execute_fopen().
 */
static URL_FILE
*gphdfs_fopen(PG_FUNCTION_ARGS, bool forwrite)
{
	URL_FILE      *myData;
	StringInfoData cmd;
	StringInfoData env_cmd;
	StringInfoData table_schema;
	StringInfoData table_attr_names;
	const char    *java_cmd;
	extvar_t       extvar;
	char          *url_user;
	char          *url;
	Relation       rel;
	ExtTableEntry *exttbl;
	const char    *format;

	/* Before we start, make sure that all the GUCs are set properly.
	 * This will also set the gp_hadoop_connector_version global var.
	 */
	checkHadoopGUCs();

	/* The env setup script */
	initStringInfo(&env_cmd);
	appendStringInfo(&env_cmd, "source $GPHOME/%s/hadoop_env.sh;", gp_hadoop_connector_jardir);

	/* The java program. See the java program for details */
	if (forwrite)
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter $GP_SEGMENT_ID $GP_XID";
	else
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSReader $GP_SEGMENT_ID $GP_SEGMENT_COUNT";

	/* NOTE: I've to assume that if it's not TEXT, it's going to be the RIGHT
	 * custom format. There's no easy way to find out the name of the formatter here.
	 * If the wrong formatter is used, we'll see some error in the protocol.
	 * No big deal.
	 */
	rel    = EXTPROTOCOL_GET_RELATION(fcinfo);
	exttbl = GetExtTableEntry(rel->rd_id);
	if (fmttype_is_avro(exttbl->fmtcode))
		format = "AVRO";
	else if (fmttype_is_parquet(exttbl->fmtcode))
		format = "PARQUET";
	else if (fmttype_is_text(exttbl->fmtcode) || fmttype_is_csv(exttbl->fmtcode))
		format = "TEXT";
	else
		format = "GPDBWritable";

	/* we transfer table's schema info together with its url */
	if (!forwrite)
	{
		initStringInfo(&table_schema);
		initStringInfo(&table_attr_names);

		int colnum = rel->rd_att->natts;
		for (int i = 0; i < colnum; i++)
		{
			int typid = rel->rd_att->attrs[i]->atttypid;

			/*
			 * add type delimiter, for udt, it can be any char.
			 * Only delim is used; the other outputs are required by the
			 * get_type_io_data() signature.
			 */
			char delim = 0;
			int16 typlen;
			bool typbyval;
			char typalign;
			Oid typioparam;
			Oid func;
			get_type_io_data(typid, IOFunc_input, &typlen, &typbyval, &typalign, &delim, &typioparam, &func);

			/*
			 * Per column: 10-digit type oid, not-null flag, ndims and a
			 * 3-digit delimiter code.  Formatting straight into the
			 * StringInfo lets the buffer grow as needed, so an unusually
			 * wide field cannot overrun a fixed-size stack array.
			 */
			appendStringInfo(&table_schema, "%010d%d%d%03d", typid,
				rel->rd_att->attrs[i]->attnotnull,
				rel->rd_att->attrs[i]->attndims, delim);

			/* column name followed by a comma (also after the last one) */
			appendStringInfo(&table_attr_names, "%s,", rel->rd_att->attrs[i]->attname.data);
		}
	}

	/* Form the actual command
	 *
	 * 1. calls the env setup script
	 * 2. append the remaining arguments: <format>, <conn ver> and <url> to the java command
	 *
	 * Note: "url" has to be quoted because it's an unverified user input
	 * Note: gp_hadoop_connector_version does not need to be quoted
	 *       because we've verified it in checkHadoopGUCs().
	 */

	/* Note: if the url is passed with an E prefix, simply quoting it has no
	 * effect, so we filter some dangerous characters right now. */
	url_user = EXTPROTOCOL_GET_URL(fcinfo);
	if (hasIllegalCharacters(url_user))
	{
		ereport(ERROR, (0, errmsg("illegal char in url")));
	}

	url = quoteArgument(url_user);
	initStringInfo(&cmd);

	appendStringInfo(&cmd, EXEC_URL_PREFIX "%s%s %s %s %s", env_cmd.data, java_cmd, format,
			gp_hadoop_connector_version, url);

	/* The env setup text has been copied into cmd; release it. */
	pfree(env_cmd.data);

	if (!forwrite)
	{
		appendStringInfo(&cmd, " '%s'", table_schema.data);
		pfree(table_schema.data);

		/*
		 * NOTE(review): attribute names are placed inside single quotes
		 * without any escaping; a name containing a single quote would end
		 * the shell quoting early.  Confirm whether this argument should be
		 * escaped the same way the url is.
		 */
		appendStringInfo(&cmd, " '%s'", table_attr_names.data);
		pfree(table_attr_names.data);
	}

	/* Setup the env and run the script..
	 *
	 * NOTE: the last argument to external_set_env_vars is set to ZERO because we
	 * don't have access to the scan counter at all. It's ok because we don't need it.
	 */
	external_set_env_vars(&extvar, url, false, NULL, NULL, false, 0);
	myData = url_execute_fopen(cmd.data, forwrite, &extvar, NULL);

	/* Free the command string */
	pfree(cmd.data);

	return myData;
}
Example #4
0
/*
 * Fill the connection headers with key/value pairs describing the query
 * context; these values are consumed by the remote component.
 *
 * Appended in this order: table format and tuple description (only when a
 * relation is present), projection description (only when usable), segment
 * id/count and xid, alignment size, URI host/port/data, location options,
 * the full URI, the filter flag (plus the filter string when one exists),
 * delegation token headers and remote credentials.
 */
void build_http_header(PxfInputData *input)
{
	CHURL_HEADERS hdrs = input->headers;
	GPHDUri *uri = input->gphduri;
	Relation relation = input->rel;
	char *filter = input->filterstr;
	ProjectionInfo *projection = input->proj_info;
	extvar_t env;

	if (relation != NULL)
	{
		/* format -- pxf treats CSV as TEXT */
		ExtTableEntry *entry = GetExtTableEntry(relation->rd_id);
		churl_headers_append(hdrs, "X-GP-FORMAT", get_format_name(entry->fmtcode));

		/* Record fields - name and type of each field */
		add_tuple_desc_httpheader(hdrs, relation);
	}

	if (projection != NULL && projection->pi_isVarList)
	{
		List *qual_attrs = extractPxfAttributes(input->quals);

		/*
		 * Projection information is incomplete when the query has quals
		 * but no columns could be extracted from them.
		 */
		if (qual_attrs == NIL && list_length(input->quals) != 0)
		{
			elog(DEBUG2, "Query will not be optimized to use projection information");
		}
		else
		{
			add_projection_desc_httpheader(hdrs, projection, qual_attrs);
		}
	}

	/* GP cluster configuration */
	external_set_env_vars(&env, uri->uri, false, NULL, NULL, false, 0);

	churl_headers_append(hdrs, "X-GP-SEGMENT-ID", env.GP_SEGMENT_ID);
	churl_headers_append(hdrs, "X-GP-SEGMENT-COUNT", env.GP_SEGMENT_COUNT);
	churl_headers_append(hdrs, "X-GP-XID", env.GP_XID);

	/*
	 * Report alignment size to the remote component.  GPDBWritable uses an
	 * alignment that has to be the same as in the C code, and since the C
	 * code can be compiled for both 32 and 64 bits, it can be either 4 or 8.
	 */
	add_alignment_size_httpheader(hdrs);

	/* headers for uri data */
	churl_headers_append(hdrs, "X-GP-URL-HOST", uri->host);
	churl_headers_append(hdrs, "X-GP-URL-PORT", uri->port);
	churl_headers_append(hdrs, "X-GP-DATA-DIR", uri->data);

	/* location options */
	add_location_options_httpheader(hdrs, uri);

	/* full uri */
	churl_headers_append(hdrs, "X-GP-URI", uri->uri);

	/* filters: the flag is always sent, the filter text only when present */
	churl_headers_append(hdrs, "X-GP-HAS-FILTER", filter ? "1" : "0");
	if (filter)
	{
		churl_headers_append(hdrs, "X-GP-FILTER", filter);
	}

	add_delegation_token_headers(hdrs, input);
	add_remote_credentials(hdrs);
}