void
gpbridge_export_start(PG_FUNCTION_ARGS)
{
    gphadoop_context *context = create_context(fcinfo);

    parse_gphd_uri(context, false, fcinfo);

    /* get the REST servers list and choose one */
    Relation   rel = EXTPROTOCOL_GET_RELATION(fcinfo);
    PxfServer *rest_server = get_pxf_server(context->gphd_uri, rel);

    if (!rest_server)
        ereport(ERROR,
                (errcode(ERRCODE_CONNECTION_FAILURE),
                 errmsg("No REST servers were found (by accessing PXF URI %s)",
                        context->gphd_uri->uri)));

    elog(DEBUG2, "chosen pxf rest_server = %s:%d",
         rest_server->host, rest_server->port);

    build_file_name_for_write(context);
    build_uri_for_write(context, rest_server);
    free_datanode_rest_server(rest_server);

    context->churl_headers = churl_headers_init();
    add_querydata_to_http_header(context, fcinfo);

    context->churl_handle = churl_init_upload(context->uri.data,
                                              context->churl_headers);
}
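/*
 * For context, a minimal sketch of the write step that typically pairs with
 * the start routine above. This is an illustration, not this file's actual
 * implementation: it assumes create_context() stored the context via
 * EXTPROTOCOL_SET_USER_CTX, and that the EXTPROTOCOL_GET_DATABUF /
 * EXTPROTOCOL_GET_DATALEN accessors and churl_write() are available.
 */
size_t
gpbridge_export_write(PG_FUNCTION_ARGS)
{
    gphadoop_context *context = EXTPROTOCOL_GET_USER_CTX(fcinfo);
    char             *databuf = EXTPROTOCOL_GET_DATABUF(fcinfo);
    size_t            datalen = EXTPROTOCOL_GET_DATALEN(fcinfo);

    /* stream one buffer of formatted tuples over the upload handle */
    return churl_write(context->churl_handle, databuf, datalen);
}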
/*
 * Detect the data format.
 * Used to set the file extension on S3 in gpwriter.
 */
const char *
get_format_str(FunctionCallInfo fcinfo)
{
    Relation       rel     = EXTPROTOCOL_GET_RELATION(fcinfo);
    ExtTableEntry *exttbl  = GetExtTableEntry(rel->rd_id);
    char           fmtcode = exttbl->fmtcode;

    if (fmttype_is_text(fmtcode))
        return "txt";
    if (fmttype_is_csv(fmtcode))
        return "csv";
    if (fmttype_is_avro(fmtcode))
        return "avro";
    if (fmttype_is_parquet(fmtcode))
        return "parquet";
    return S3_DEFAULT_FORMAT;
}
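/*
 * Hypothetical usage sketch: append the detected format as the file
 * extension when composing an S3 object name for a write. The s3_prefix
 * and base_name variables are illustrative assumptions, not names from
 * this codebase.
 */
static void
build_s3_object_name(StringInfo object_name, const char *s3_prefix,
                     const char *base_name, FunctionCallInfo fcinfo)
{
    appendStringInfo(object_name, "%s/%s.%s",
                     s3_prefix,               /* assumed bucket/path prefix */
                     base_name,               /* assumed per-segment file stem */
                     get_format_str(fcinfo)); /* "txt", "csv", "avro", ... */
}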
/*
 * Add key/value pairs to the connection header.
 * These values are the context of the query and are used
 * by the remote component.
 */
void
add_querydata_to_http_header(gphadoop_context *context, PG_FUNCTION_ARGS)
{
    PxfInputData inputData = {0};

    inputData.headers   = context->churl_headers;
    inputData.gphduri   = context->gphd_uri;
    inputData.rel       = EXTPROTOCOL_GET_RELATION(fcinfo);
    inputData.quals     = EXTPROTOCOL_GET_SCANQUALS(fcinfo);
    inputData.filterstr = serializePxfFilterQuals(EXTPROTOCOL_GET_SCANQUALS(fcinfo));

    if (EXTPROTOCOL_GET_SELECTDESC(fcinfo))
    {
        inputData.proj_info = EXTPROTOCOL_GET_PROJINFO(fcinfo);

        int agg_type = EXTPROTOCOL_GET_AGG_TYPE(fcinfo);

        if (agg_type)
            inputData.agg_type = agg_type;
    }

    add_delegation_token(&inputData);
    build_http_header(&inputData);
    free_token_resources(&inputData);
}
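/*
 * For a sense of what build_http_header() produces, a hypothetical sketch
 * of attaching such key/value pairs. churl_headers_append() follows the
 * header API used above; the exact X-GP-* key names are assumptions based
 * on the PXF wire convention, not a verbatim excerpt.
 */
static void
append_filter_headers(PxfInputData *inputData)
{
    /* X-GP-* key names are illustrative assumptions */
    churl_headers_append(inputData->headers, "X-GP-HAS-FILTER",
                         inputData->filterstr ? "1" : "0");
    if (inputData->filterstr)
        churl_headers_append(inputData->headers, "X-GP-FILTER",
                             inputData->filterstr);
}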
/**
 * Open/Init of the gphdfs protocol.
 *
 * It sets up the Hadoop env vars by calling hadoop_env.sh.
 * Then it calls the corresponding java program to do the actual
 * read/write.
 */
static URL_FILE *
gphdfs_fopen(PG_FUNCTION_ARGS, bool forwrite)
{
    URL_FILE       *myData;
    StringInfoData  cmd;
    StringInfoData  env_cmd;
    StringInfoData  table_schema;
    StringInfoData  table_attr_names;
    char           *java_cmd;
    extvar_t        extvar;
    char           *url;
    Relation        rel;
    ExtTableEntry  *exttbl;
    char           *format;

    /*
     * Before we start, make sure that all the GUCs are set properly.
     * This will also set the gp_hadoop_connector_version global var.
     */
    checkHadoopGUCs();

    /* The env setup script */
    initStringInfo(&env_cmd);
    appendStringInfo(&env_cmd, "source $GPHOME/%s/hadoop_env.sh;",
                     gp_hadoop_connector_jardir);

    /* The java program. See the java program for details. */
    if (forwrite)
        java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter $GP_SEGMENT_ID $GP_XID";
    else
        java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSReader $GP_SEGMENT_ID $GP_SEGMENT_COUNT";

    /*
     * NOTE: I have to assume that if it's not TEXT, it's going to be the
     * RIGHT custom format. There's no easy way to find out the name of the
     * formatter here. If the wrong formatter is used, we'll see some error
     * in the protocol. No big deal.
     */
    rel    = EXTPROTOCOL_GET_RELATION(fcinfo);
    exttbl = GetExtTableEntry(rel->rd_id);
    format = (fmttype_is_text(exttbl->fmtcode) || fmttype_is_csv(exttbl->fmtcode))
             ? "TEXT" : "GPDBWritable";
    if (fmttype_is_avro(exttbl->fmtcode))
        format = "AVRO";
    else if (fmttype_is_parquet(exttbl->fmtcode))
        format = "PARQUET";

    /* we transfer the table's schema info together with its url */
    if (!forwrite)
    {
        initStringInfo(&table_schema);
        initStringInfo(&table_attr_names);

        int colnum = rel->rd_att->natts;

        for (int i = 0; i < colnum; i++)
        {
            int typid = rel->rd_att->attrs[i]->atttypid;

            /* get the type delimiter; for a UDT it can be any char */
            char  delim = 0;
            int16 typlen;
            bool  typbyval;
            char  typalign;
            Oid   typioparam;
            Oid   func;

            get_type_io_data(typid, IOFunc_input, &typlen, &typbyval,
                             &typalign, &delim, &typioparam, &func);

            char out[20] = {0};

            sprintf(out, "%010d%d%d%03d", typid,
                    rel->rd_att->attrs[i]->attnotnull,
                    rel->rd_att->attrs[i]->attndims, delim);
            appendStringInfoString(&table_schema, out);

            char name[70] = {0};

            sprintf(name, "%s%c", rel->rd_att->attrs[i]->attname.data, ',');
            appendStringInfoString(&table_attr_names, name);
        }
    }

    /*
     * Form the actual command:
     *
     * 1. call the env setup script
     * 2. append the remaining arguments: <format>, <conn ver> and <url>
     *    to the java command
     *
     * Note: "url" has to be quoted because it's unverified user input.
     * Note: gp_hadoop_connector_version does not need to be quoted
     * because we've verified it in checkHadoopGUCs().
     */

    /*
     * Note: if the url is passed with an E prefix, quoting alone has no
     * effect, so we also filter out some dangerous characters here.
     */
    char *url_user = EXTPROTOCOL_GET_URL(fcinfo);

    if (hasIllegalCharacters(url_user))
        ereport(ERROR, (0, errmsg("illegal char in url")));

    url = quoteArgument(EXTPROTOCOL_GET_URL(fcinfo));
    initStringInfo(&cmd);

    appendStringInfo(&cmd, EXEC_URL_PREFIX "%s%s %s %s %s",
                     env_cmd.data, java_cmd, format,
                     gp_hadoop_connector_version, url);

    if (!forwrite)
    {
        appendStringInfo(&cmd, " '%s'", table_schema.data);
        pfree(table_schema.data);
        appendStringInfo(&cmd, " '%s'", table_attr_names.data);
        pfree(table_attr_names.data);
    }

    /*
     * Set up the env and run the script.
     *
     * NOTE: the last argument to external_set_env_vars is set to ZERO
     * because we don't have access to the scan counter at all. That's ok
     * because we don't need it.
     */
    external_set_env_vars(&extvar, url, false, NULL, NULL, false, 0);
    myData = url_execute_fopen(cmd.data, forwrite, &extvar, NULL);

    /* Free the command string */
    pfree(cmd.data);

    return myData;
}
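/*
 * The "%010d%d%d%03d" per-column schema encoding above is easy to misread,
 * so here is a standalone illustration of what one column contributes to
 * table_schema. The values model a "text NOT NULL" column and are an
 * example, not output from a real run.
 */
#include <stdio.h>

int
main(void)
{
    int  typid      = 25;  /* e.g. TEXTOID */
    int  attnotnull = 1;   /* column declared NOT NULL */
    int  attndims   = 0;   /* not an array type */
    char delim      = ','; /* type delimiter, ASCII 44 */

    char out[20] = {0};

    sprintf(out, "%010d%d%d%03d", typid, attnotnull, attndims, delim);
    printf("%s\n", out);   /* prints 000000002510044 */
    return 0;
}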