void gpbridge_export_start(PG_FUNCTION_ARGS)
{
	gphadoop_context* context = create_context(fcinfo);

	parse_gphd_uri(context, false, fcinfo);

	/* get rest servers list and choose one */
	Relation rel = EXTPROTOCOL_GET_RELATION(fcinfo);
	PxfServer* rest_server = get_pxf_server(context->gphd_uri, rel);

	if (!rest_server)
		ereport(ERROR,
				(errcode(ERRCODE_CONNECTION_FAILURE),
						errmsg("No REST servers were found (by accessing PXF URI %s)",
							   context->gphd_uri->uri)));

	elog(DEBUG2, "chosen pxf rest_server = %s:%d", rest_server->host, rest_server->port);

	build_file_name_for_write(context);
	build_uri_for_write(context, rest_server);
	free_datanode_rest_server(rest_server);

	context->churl_headers = churl_headers_init();
	add_querydata_to_http_header(context, fcinfo);

	context->churl_handle = churl_init_upload(context->uri.data,
											  context->churl_headers);
}
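/*
 * For orientation, a rough sketch of the matching write callback that would
 * push data through the upload handle opened in gpbridge_export_start().
 * The callback name and the churl_write() signature are assumptions, not
 * taken from this file.
 */
static size_t gpbridge_export_sketch(PG_FUNCTION_ARGS)
{
	gphadoop_context* context = (gphadoop_context*) EXTPROTOCOL_GET_USER_CTX(fcinfo);
	char*  databuf = EXTPROTOCOL_GET_DATABUF(fcinfo);
	size_t datalen = EXTPROTOCOL_GET_DATALEN(fcinfo);

	/* stream the formatted tuples to the PXF REST endpoint opened in _start() */
	return churl_write(context->churl_handle, databuf, datalen);
}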
Example #2
/*
 * Detect the data format.
 * Used to set the file extension on S3 in gpwriter.
 */
const char *get_format_str(FunctionCallInfo fcinfo) {
    Relation rel = EXTPROTOCOL_GET_RELATION(fcinfo);
    ExtTableEntry *exttbl = GetExtTableEntry(rel->rd_id);
    char fmtcode = exttbl->fmtcode;

    if (fmttype_is_text(fmtcode)) return "txt";
    if (fmttype_is_csv(fmtcode)) return "csv";
    if (fmttype_is_avro(fmtcode)) return "avro";
    if (fmttype_is_parquet(fmtcode)) return "parquet";
    return S3_DEFAULT_FORMAT;
}
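/*
 * A minimal usage sketch, assuming the writer composes the S3 object key
 * from a base name plus the extension returned above; the helper name and
 * key layout are illustrative, not part of the original source.
 */
static void append_format_extension(StringInfo object_key, FunctionCallInfo fcinfo)
{
    const char *ext = get_format_str(fcinfo);

    /* e.g. "seg0_batch0001" becomes "seg0_batch0001.csv" */
    appendStringInfo(object_key, ".%s", ext);
}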
/*
 * Add key/value pairs to the connection header.
 * These values describe the context of the query and are used
 * by the remote component.
 */
void add_querydata_to_http_header(gphadoop_context* context, PG_FUNCTION_ARGS)
{
	PxfInputData inputData = {0};
	inputData.headers = context->churl_headers;
	inputData.gphduri = context->gphd_uri;
	inputData.rel = EXTPROTOCOL_GET_RELATION(fcinfo);
	inputData.filterstr = serializePxfFilterQuals(EXTPROTOCOL_GET_SCANQUALS(fcinfo));
	add_delegation_token(&inputData);
	
	build_http_header(&inputData);
	free_token_resources(&inputData);
}
/*
 * Add key/value pairs to the connection header.
 * These values describe the context of the query and are used
 * by the remote component.
 */
void add_querydata_to_http_header(gphadoop_context* context, PG_FUNCTION_ARGS)
{
	PxfInputData inputData = {0};
	inputData.headers = context->churl_headers;
	inputData.gphduri = context->gphd_uri;
	inputData.rel = EXTPROTOCOL_GET_RELATION(fcinfo);
	inputData.quals = EXTPROTOCOL_GET_SCANQUALS(fcinfo);
	inputData.filterstr = serializePxfFilterQuals(EXTPROTOCOL_GET_SCANQUALS(fcinfo));
	if (EXTPROTOCOL_GET_SELECTDESC(fcinfo)) {
		inputData.proj_info = EXTPROTOCOL_GET_PROJINFO(fcinfo);
		int agg_type = EXTPROTOCOL_GET_AGG_TYPE(fcinfo);
		if (agg_type) {
			inputData.agg_type = agg_type;
		}
	}
	add_delegation_token(&inputData);
	
	build_http_header(&inputData);
	free_token_resources(&inputData);
}
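/*
 * Illustrative only: the kind of key/value pairs build_http_header() is
 * expected to emit for the filter string above. The exact X-GP-* header
 * names and the churl_headers_append() usage are assumptions modeled on the
 * PXF convention, not taken from this file.
 */
static void add_filter_headers_sketch(PxfInputData *inputData)
{
	if (inputData->filterstr != NULL)
	{
		churl_headers_append(inputData->headers, "X-GP-HAS-FILTER", "1");
		churl_headers_append(inputData->headers, "X-GP-FILTER", inputData->filterstr);
	}
	else
		churl_headers_append(inputData->headers, "X-GP-HAS-FILTER", "0");
}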
Example #5
/**
 * Open/Init of the gphdfs protocol.
 *
 * It sets up the Hadoop environment variables by sourcing hadoop_env.sh,
 * then invokes the corresponding Java program to do the actual
 * read/write.
 */
static URL_FILE
*gphdfs_fopen(PG_FUNCTION_ARGS, bool forwrite)
{
	URL_FILE      *myData;
	StringInfoData cmd;
	StringInfoData env_cmd;
	StringInfoData table_schema;
	StringInfoData table_attr_names;
	char          *java_cmd;
	extvar_t       extvar;
	char          *url;
	Relation       rel;
	ExtTableEntry *exttbl;
	char          *format;

	/* Before we start, make sure that all the GUCs are set properly.
	 * This will also set the gp_hadoop_connector_version global var.
	 */
	checkHadoopGUCs();

	/* The env setup script */
	initStringInfo(&env_cmd);
	appendStringInfo(&env_cmd, "source $GPHOME/%s/hadoop_env.sh;", gp_hadoop_connector_jardir);

	/* The java program. See the java program for details */
	if (forwrite)
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter $GP_SEGMENT_ID $GP_XID\0";
	else
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSReader $GP_SEGMENT_ID $GP_SEGMENT_COUNT\0";

	/* NOTE: we have to assume that if it's not TEXT, it's the right
	 * custom format. There's no easy way to find out the name of the formatter here.
	 * If the wrong formatter is used, the protocol will report an error.
	 * No big deal.
	 */
	rel    = EXTPROTOCOL_GET_RELATION(fcinfo);
	exttbl = GetExtTableEntry(rel->rd_id);
	if (fmttype_is_text(exttbl->fmtcode) || fmttype_is_csv(exttbl->fmtcode))
		format = "TEXT";
	else if (fmttype_is_avro(exttbl->fmtcode))
		format = "AVRO";
	else if (fmttype_is_parquet(exttbl->fmtcode))
		format = "PARQUET";
	else
		format = "GPDBWritable";

	/* we transfer the table's schema info together with its url */
	if (!forwrite)
	{
		initStringInfo(&table_schema);
		initStringInfo(&table_attr_names);

		int colnum = rel->rd_att->natts;
		for (int i = 0; i < colnum; i++)
		{
			int typid = rel->rd_att->attrs[i]->atttypid;

			/* get the type delimiter; for a UDT it can be any character */
			char delim = 0;
			int16 typlen;
			bool typbyval;
			char typalign;
			Oid typioparam;
			Oid func;
			get_type_io_data(typid, IOFunc_input, &typlen, &typbyval, &typalign, &delim, &typioparam, &func);

			char out[20] = {0};
			sprintf(out, "%010d%d%d%03d", typid, rel->rd_att->attrs[i]->attnotnull,
				rel->rd_att->attrs[i]->attndims, delim);

			appendStringInfoString(&table_schema, out);

			char name[70] = {0};
			sprintf(name, "%s%c", rel->rd_att->attrs[i]->attname.data, ',');
			appendStringInfoString(&table_attr_names, name);
		}
	}
	/* Form the actual command:
	 *
	 * 1. call the env setup script
	 * 2. append the remaining arguments <format>, <conn ver> and <url> to the java command
	 *
	 * Note: "url" has to be quoted because it is unverified user input.
	 * Note: gp_hadoop_connector_version does not need to be quoted
	 *       because we've verified it in checkHadoopGUCs().
	 */

	/* Note: if the url is passed with an E prefix, quoting alone has no effect,
	 * so we also filter out some dangerous characters here. */
	char* url_user = EXTPROTOCOL_GET_URL(fcinfo);
	if (hasIllegalCharacters(url_user))
	{
		ereport(ERROR, (0, errmsg("illegal char in url")));
	}

	url = quoteArgument(EXTPROTOCOL_GET_URL(fcinfo));
	initStringInfo(&cmd);

	appendStringInfo(&cmd, EXEC_URL_PREFIX "%s%s %s %s %s", env_cmd.data, java_cmd, format,
			gp_hadoop_connector_version, url);

	if (!forwrite)
	{
		appendStringInfo(&cmd, " '%s'", table_schema.data);
		pfree(table_schema.data);

		appendStringInfo(&cmd, " '%s'", table_attr_names.data);
		pfree(table_attr_names.data);
	}

	/* Set up the env and run the script.
	 *
	 * NOTE: the last argument to external_set_env_vars is set to ZERO because we
	 * don't have access to the scan counter at all. It's ok because we don't need it.
	 */
	external_set_env_vars(&extvar, url, false, NULL, NULL, false, 0);
	myData = url_execute_fopen(cmd.data, forwrite, &extvar, NULL);

	/* Free the command string */
	pfree(cmd.data);

	return myData;
}
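/*
 * A minimal sketch of the hasIllegalCharacters() check used above, assuming
 * it rejects shell metacharacters that could escape the quoted url argument;
 * the exact character set filtered by gphdfs is an assumption.
 */
static bool hasIllegalCharacters_sketch(const char *str)
{
	if (str == NULL)
		return false;

	for (const char *p = str; *p != '\0'; p++)
	{
		/* characters that could break out of the quoted shell argument */
		if (*p == ';' || *p == '|' || *p == '<' || *p == '>' ||
			*p == '`' || *p == '\\')
			return true;
	}
	return false;
}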