/*
 * Detect the data format;
 * used to set the file extension on S3 in gpwriter.
 */
const char *
get_format_str(FunctionCallInfo fcinfo)
{
	Relation	rel = EXTPROTOCOL_GET_RELATION(fcinfo);
	ExtTableEntry *exttbl = GetExtTableEntry(rel->rd_id);
	char		fmtcode = exttbl->fmtcode;

	if (fmttype_is_text(fmtcode))
		return "txt";
	if (fmttype_is_csv(fmtcode))
		return "csv";
	if (fmttype_is_avro(fmtcode))
		return "avro";
	if (fmttype_is_parquet(fmtcode))
		return "parquet";
	return S3_DEFAULT_FORMAT;
}
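/*
 * Illustrative sketch (not from the original source): one plausible way a
 * writer path could use get_format_str() to derive an S3 object name.  The
 * helper name build_s3_object_name() and the "<prefix>.<segid>.<ext>"
 * naming scheme are assumptions for illustration only.
 */
#ifdef GPWRITER_EXAMPLES
static void
build_s3_object_name(FunctionCallInfo fcinfo, const char *prefix, int segid,
					 char *buf, size_t buflen)
{
	/* e.g. "data/part.0.csv" for a CSV-formatted external table */
	snprintf(buf, buflen, "%s.%d.%s", prefix, segid, get_format_str(fcinfo));
}
#endif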
/*
 * Export data out of GPDB.
 */
Datum
gphdfsprotocol_export(PG_FUNCTION_ARGS)
{
	URL_FILE   *myData;
	char	   *data;
	int			datlen;
	size_t		wrote = 0;
	static char ebuf[512] = {0};
	int			ebuflen = 512;

	/* Must be called via the external table protocol manager */
	if (!CALLED_AS_EXTPROTOCOL(fcinfo))
		elog(ERROR, "cannot execute gphdfsprotocol_export outside protocol manager");

	/* Get our internal description of the protocol */
	myData = (URL_FILE *) EXTPROTOCOL_GET_USER_CTX(fcinfo);

	/* =======================================================================
	 *                            DO CLOSE
	 * ======================================================================= */
	if (EXTPROTOCOL_IS_LAST_CALL(fcinfo))
	{
		if (myData)
			url_fclose(myData, true, "gphdfs protocol");
		PG_RETURN_INT32(0);
	}

	/* =======================================================================
	 *                            DO OPEN
	 * ======================================================================= */
	if (myData == NULL)
	{
		myData = gphdfs_fopen(fcinfo, true);
		EXTPROTOCOL_SET_USER_CTX(fcinfo, myData);

		/* add schema info to the pipe */
		StringInfo	schema_data = makeStringInfo();
		Relation	relation = EXTPROTOCOL_GET_RELATION(fcinfo);
		ExtTableEntry *exttbl = GetExtTableEntry(relation->rd_id);

		if (fmttype_is_avro(exttbl->fmtcode) || fmttype_is_parquet(exttbl->fmtcode))
		{
			int			relNameLen = strlen(relation->rd_rel->relname.data);

			appendIntToBuffer(schema_data, relNameLen);
			appendBinaryStringInfo(schema_data, relation->rd_rel->relname.data, relNameLen);

			int			ncolumns = relation->rd_att->natts;

			appendIntToBuffer(schema_data, ncolumns);

			for (int i = 0; i < ncolumns; i++)
			{
				Oid			type = relation->rd_att->attrs[i]->atttypid;

				/* add attname, atttypid, attnotnull, attndims to the schema_data field */
				int			attNameLen = strlen(relation->rd_att->attrs[i]->attname.data);

				appendIntToBuffer(schema_data, attNameLen);
				appendBinaryStringInfo(schema_data, relation->rd_att->attrs[i]->attname.data, attNameLen);
				appendIntToBuffer(schema_data, type);

				bool		notNull = relation->rd_att->attrs[i]->attnotnull;

				appendInt1ToBuffer(schema_data, notNull ? 1 : 0);
				appendIntToBuffer(schema_data, relation->rd_att->attrs[i]->attndims);

				/* add the type delimiter; for a UDT it can be any char */
				char		delim = 0;
				int16		typlen;
				bool		typbyval;
				char		typalign;
				Oid			typioparam;
				Oid			func;

				get_type_io_data(type, IOFunc_input, &typlen, &typbyval,
								 &typalign, &delim, &typioparam, &func);
				appendInt1ToBuffer(schema_data, delim);
			}

			StringInfo	schema_head = makeStringInfo();

			appendIntToBuffer(schema_head, schema_data->len + 2);
			appendInt2ToBuffer(schema_head, 2);

			url_execute_fwrite(schema_head->data, schema_head->len, myData, NULL);
			url_execute_fwrite(schema_data->data, schema_data->len, myData, NULL);

			pfree(schema_head->data);
			pfree(schema_data->data);
		}
	}

	/* =======================================================================
	 *                            DO THE EXPORT
	 * ======================================================================= */
	data = EXTPROTOCOL_GET_DATABUF(fcinfo);
	datlen = EXTPROTOCOL_GET_DATALEN(fcinfo);

	if (datlen > 0)
		wrote = url_execute_fwrite(data, datlen, myData, NULL);

	if (url_ferror(myData, wrote, ebuf, ebuflen))
	{
		ereport(ERROR,
				(errcode_for_file_access(),
				 strlen(ebuf) > 0 ?
				 errmsg("could not write to external resource:\n%s", ebuf) :
				 errmsg("could not write to external resource: %m")));
	}

	PG_RETURN_INT32((int) wrote);
}
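/*
 * For reference, a sketch of the schema block that gphdfsprotocol_export()
 * writes to the pipe before any data rows, reconstructed from the code above.
 * Integers are emitted by appendIntToBuffer/appendInt2ToBuffer/
 * appendInt1ToBuffer; the exact byte order is whatever those helpers use
 * (network byte order in the GPDBWritable conventions; verify against the
 * helpers themselves):
 *
 *   int32  payload length (schema_data->len + 2)
 *   int16  schema version (2)
 *   int32  relname length, then relname bytes
 *   int32  ncolumns
 *   per column:
 *     int32  attname length, then attname bytes
 *     int32  atttypid
 *     int8   attnotnull (0 or 1)
 *     int32  attndims
 *     int8   type delimiter
 *
 * Below is a minimal consumer-side sketch that walks the per-column records,
 * assuming network byte order; skip_column_records() is a hypothetical
 * helper, not part of the connector.
 */
#ifdef GPHDFS_EXAMPLES
#include <arpa/inet.h>			/* ntohl */
#include <stdint.h>
#include <string.h>

static const char *
skip_column_records(const char *buf, int ncolumns)
{
	for (int i = 0; i < ncolumns; i++)
	{
		uint32_t	namelen;

		memcpy(&namelen, buf, 4);	/* attname length */
		buf += 4 + ntohl(namelen);	/* ... and the attname bytes */
		buf += 4;					/* atttypid */
		buf += 1;					/* attnotnull */
		buf += 4;					/* attndims */
		buf += 1;					/* type delimiter */
	}
	return buf;
}
#endif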
/**
 * Open/Init of the gphdfs protocol
 *
 * It sets up the Hadoop environment variables by calling hadoop_env.sh,
 * then invokes the corresponding java program to do the actual
 * read/write.
 */
static URL_FILE *
gphdfs_fopen(PG_FUNCTION_ARGS, bool forwrite)
{
	URL_FILE   *myData;
	StringInfoData cmd;
	StringInfoData env_cmd;
	StringInfoData table_schema;
	StringInfoData table_attr_names;
	char	   *java_cmd;
	extvar_t	extvar;
	char	   *url;
	Relation	rel;
	ExtTableEntry *exttbl;
	char	   *format;

	/*
	 * Before we start, make sure that all the GUCs are set properly.
	 * This also sets the gp_hadoop_connector_version global var.
	 */
	checkHadoopGUCs();

	/* The env setup script */
	initStringInfo(&env_cmd);
	appendStringInfo(&env_cmd, "source $GPHOME/%s/hadoop_env.sh;",
					 gp_hadoop_connector_jardir);

	/* The java program. See the java program for details */
	if (forwrite)
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter $GP_SEGMENT_ID $GP_XID";
	else
		java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSReader $GP_SEGMENT_ID $GP_SEGMENT_COUNT";

	/*
	 * NOTE: we have to assume that if it's not TEXT, it's going to be the
	 * RIGHT custom format.  There's no easy way to find out the name of the
	 * formatter here.  If the wrong formatter is used, we'll see an error in
	 * the protocol.  No big deal.
	 */
	rel = EXTPROTOCOL_GET_RELATION(fcinfo);
	exttbl = GetExtTableEntry(rel->rd_id);
	format = (fmttype_is_text(exttbl->fmtcode) || fmttype_is_csv(exttbl->fmtcode)) ? "TEXT" : "GPDBWritable";
	if (fmttype_is_avro(exttbl->fmtcode))
		format = "AVRO";
	else if (fmttype_is_parquet(exttbl->fmtcode))
		format = "PARQUET";

	/* we transfer the table's schema info together with its url */
	if (!forwrite)
	{
		initStringInfo(&table_schema);
		initStringInfo(&table_attr_names);

		int			colnum = rel->rd_att->natts;

		for (int i = 0; i < colnum; i++)
		{
			int			typid = rel->rd_att->attrs[i]->atttypid;

			/* add the type delimiter; for a UDT it can be any char */
			char		delim = 0;
			int16		typlen;
			bool		typbyval;
			char		typalign;
			Oid			typioparam;
			Oid			func;

			get_type_io_data(typid, IOFunc_input, &typlen, &typbyval,
							 &typalign, &delim, &typioparam, &func);

			char		out[20] = {0};

			sprintf(out, "%010d%d%d%03d", typid,
					rel->rd_att->attrs[i]->attnotnull,
					rel->rd_att->attrs[i]->attndims, delim);
			appendStringInfoString(&table_schema, out);

			char		name[70] = {0};

			sprintf(name, "%s%c", rel->rd_att->attrs[i]->attname.data, ',');
			appendStringInfoString(&table_attr_names, name);
		}
	}

	/*
	 * Form the actual command
	 *
	 * 1. call the env setup script
	 * 2. append the remaining arguments: <format>, <conn ver> and <url> to
	 *    the java command
	 *
	 * Note: "url" has to be quoted because it's unverified user input.
	 * Note: gp_hadoop_connector_version does not need to be quoted
	 *       because we've verified it in checkHadoopGUCs().
	 *
	 * Note: if the url is passed with an E'' prefix, quoting alone has no
	 * effect, so we also filter out dangerous characters here.
	 */
	char	   *url_user = EXTPROTOCOL_GET_URL(fcinfo);

	if (hasIllegalCharacters(url_user))
		ereport(ERROR, (0, errmsg("illegal char in url")));

	url = quoteArgument(EXTPROTOCOL_GET_URL(fcinfo));
	initStringInfo(&cmd);

	appendStringInfo(&cmd, EXEC_URL_PREFIX "%s%s %s %s %s",
					 env_cmd.data, java_cmd, format,
					 gp_hadoop_connector_version, url);

	if (!forwrite)
	{
		appendStringInfo(&cmd, " '%s'", table_schema.data);
		pfree(table_schema.data);

		appendStringInfo(&cmd, " '%s'", table_attr_names.data);
		pfree(table_attr_names.data);
	}

	/*
	 * Setup the env and run the script.
	 *
	 * NOTE: the last argument to external_set_env_vars is set to ZERO
	 * because we don't have access to the scan counter at all.  It's ok
	 * because we don't need it.
	 */
	external_set_env_vars(&extvar, url, false, NULL, NULL, false, 0);
	myData = url_execute_fopen(cmd.data, forwrite, &extvar, NULL);

	/* Free the command string */
	pfree(cmd.data);

	return myData;
}
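/*
 * For illustration, with forwrite=true and a GPDBWritable format the command
 * assembled above comes out roughly as the sketch below.  The jar directory,
 * the connector version and the url are placeholders that depend on the
 * actual GUC settings, and EXEC_URL_PREFIX is prepended so that
 * url_execute_fopen() treats the string as an external command:
 *
 *   source $GPHOME/lib/hadoop/hadoop_env.sh; java $GP_JAVA_OPT \
 *       -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter \
 *       $GP_SEGMENT_ID $GP_XID GPDBWritable <connector-version> \
 *       'gphdfs://namenode:8020/path'
 */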
/*
 * Add key/value pairs to the connection header.
 * These values describe the context of the query and are used
 * by the remote component.
 */
void
build_http_header(PxfInputData *input)
{
	extvar_t	ev;
	CHURL_HEADERS headers = input->headers;
	GPHDUri    *gphduri = input->gphduri;
	Relation	rel = input->rel;
	char	   *filterstr = input->filterstr;
	ProjectionInfo *proj_info = input->proj_info;

	if (rel != NULL)
	{
		/* format */
		ExtTableEntry *exttbl = GetExtTableEntry(rel->rd_id);
		/* pxf treats CSV as TEXT */
		char	   *format = get_format_name(exttbl->fmtcode);

		churl_headers_append(headers, "X-GP-FORMAT", format);

		/* Record fields - name and type of each field */
		add_tuple_desc_httpheader(headers, rel);
	}

	if (proj_info != NULL && proj_info->pi_isVarList)
	{
		List	   *qualsAttributes = extractPxfAttributes(input->quals);

		/*
		 * The projection information is incomplete if columns from the WHERE
		 * clause could not be extracted.
		 */
		if (qualsAttributes != NIL || list_length(input->quals) == 0)
			add_projection_desc_httpheader(headers, proj_info, qualsAttributes);
		else
			elog(DEBUG2, "Query will not be optimized to use projection information");
	}

	/* GP cluster configuration */
	external_set_env_vars(&ev, gphduri->uri, false, NULL, NULL, false, 0);

	churl_headers_append(headers, "X-GP-SEGMENT-ID", ev.GP_SEGMENT_ID);
	churl_headers_append(headers, "X-GP-SEGMENT-COUNT", ev.GP_SEGMENT_COUNT);
	churl_headers_append(headers, "X-GP-XID", ev.GP_XID);

	/*
	 * Report the alignment size to the remote component.
	 * GPDBWritable uses an alignment that has to be the same as
	 * in the C code.  Since the C code can be compiled for both
	 * 32 and 64 bits, the alignment can be either 4 or 8.
	 */
	add_alignment_size_httpheader(headers);

	/* headers for uri data */
	churl_headers_append(headers, "X-GP-URL-HOST", gphduri->host);
	churl_headers_append(headers, "X-GP-URL-PORT", gphduri->port);
	churl_headers_append(headers, "X-GP-DATA-DIR", gphduri->data);

	/* location options */
	add_location_options_httpheader(headers, gphduri);

	/* full uri */
	churl_headers_append(headers, "X-GP-URI", gphduri->uri);

	/* filters */
	if (filterstr)
	{
		churl_headers_append(headers, "X-GP-HAS-FILTER", "1");
		churl_headers_append(headers, "X-GP-FILTER", filterstr);
	}
	else
		churl_headers_append(headers, "X-GP-HAS-FILTER", "0");

	add_delegation_token_headers(headers, input);
	add_remote_credentials(headers);
}
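/*
 * Minimal usage sketch (not part of the original source): how a caller might
 * populate PxfInputData before invoking build_http_header().  The
 * churl_headers_init() call is assumed to come from the churl API, and the
 * rel/gphduri values from the surrounding bridge code; example_build_headers()
 * itself is a hypothetical helper.
 */
#ifdef PXF_EXAMPLES
static CHURL_HEADERS
example_build_headers(GPHDUri *uri, Relation rel)
{
	PxfInputData input;

	memset(&input, 0, sizeof(input));
	input.headers = churl_headers_init();
	input.gphduri = uri;
	input.rel = rel;

	/*
	 * filterstr, proj_info and quals stay NULL/NIL: no filter push-down and
	 * no projection information in this sketch.
	 */
	build_http_header(&input);
	return input.headers;
}
#endif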