/*
 * Export data out of GPDB.
 * Invoked by GPDB, be careful with C++ exceptions.
 */
Datum
s3_export(PG_FUNCTION_ARGS)
{
    /* Must be called via the external table format manager */
    if (!CALLED_AS_EXTPROTOCOL(fcinfo))
        elog(ERROR, "s3_export: not called by external protocol manager");

    /* Get our internal description of the protocol */
    GPWriter *gpwriter = (GPWriter *) EXTPROTOCOL_GET_USER_CTX(fcinfo);

    /* Last call: destroy the writer. */
    if (EXTPROTOCOL_IS_LAST_CALL(fcinfo))
    {
        thread_cleanup();

        if (!writer_cleanup(&gpwriter))
        {
            ereport(ERROR,
                    (0, errmsg("Failed to cleanup S3 extension: %s",
                               s3extErrorMessage.c_str())));
        }

        EXTPROTOCOL_SET_USER_CTX(fcinfo, NULL);
        PG_RETURN_INT32(0);
    }

    /* First call: do any desired initialization. */
    if (gpwriter == NULL)
    {
        const char *url_with_options = EXTPROTOCOL_GET_URL(fcinfo);
        const char *format = get_format_str(fcinfo);

        thread_setup();

        gpwriter = writer_init(url_with_options, format);
        if (!gpwriter)
        {
            ereport(ERROR,
                    (0, errmsg("Failed to init S3 extension, segid = %d, "
                               "segnum = %d, please check your "
                               "configurations and net connection: %s",
                               s3ext_segid, s3ext_segnum,
                               s3extErrorMessage.c_str())));
        }

        EXTPROTOCOL_SET_USER_CTX(fcinfo, gpwriter);
    }

    char *data_buf = EXTPROTOCOL_GET_DATABUF(fcinfo);
    int32 data_len = EXTPROTOCOL_GET_DATALEN(fcinfo);

    if (!writer_transfer_data(gpwriter, data_buf, data_len))
    {
        ereport(ERROR,
                (0, errmsg("s3_export: could not write data: %s",
                           s3extErrorMessage.c_str())));
    }

    PG_RETURN_INT32(data_len);
}
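/*
 * Illustration only (not part of the extension): the writer lifecycle that
 * GPDB drives through s3_export() above, collapsed into one stand-alone
 * driver to make the call order explicit.  gpwriter_demo() and its
 * arguments are hypothetical; thread_setup(), writer_init(),
 * writer_transfer_data(), thread_cleanup() and writer_cleanup() are the
 * same entry points used above.  In reality GPDB makes one
 * writer_transfer_data() call per data buffer, not just one.
 */
static bool
gpwriter_demo(const char *url_with_options, const char *format,
              char *data_buf, int32 data_len)
{
    GPWriter *gpwriter;

    /* first call: set up threading and create the writer */
    thread_setup();
    gpwriter = writer_init(url_with_options, format);
    if (gpwriter == NULL)
        return false;

    /* data call(s): push one buffer of tuples to S3 */
    if (!writer_transfer_data(gpwriter, data_buf, data_len))
        return false;

    /* last call: tear down threading, flush and destroy the writer */
    thread_cleanup();
    return writer_cleanup(&gpwriter);
}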
/**
 * Open/Init of the gphdfs protocol.
 *
 * It sets up the Hadoop environment variables by calling hadoop_env.sh,
 * then invokes the corresponding Java program to do the actual
 * read/write.
 */
static URL_FILE *
gphdfs_fopen(PG_FUNCTION_ARGS, bool forwrite)
{
    URL_FILE       *myData;
    StringInfoData  cmd;
    StringInfoData  env_cmd;
    StringInfoData  table_schema;
    StringInfoData  table_attr_names;
    char           *java_cmd;
    extvar_t        extvar;
    char           *url;
    Relation        rel;
    ExtTableEntry  *exttbl;
    char           *format;

    /*
     * Before we start, make sure that all the GUCs are set properly.
     * This also sets the gp_hadoop_connector_version global var.
     */
    checkHadoopGUCs();

    /* The env setup script */
    initStringInfo(&env_cmd);
    appendStringInfo(&env_cmd, "source $GPHOME/%s/hadoop_env.sh;",
                     gp_hadoop_connector_jardir);

    /* The java program. See the java program for details. */
    if (forwrite)
        java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSWriter $GP_SEGMENT_ID $GP_XID";
    else
        java_cmd = "java $GP_JAVA_OPT -classpath $CLASSPATH com.emc.greenplum.gpdb.hdfsconnector.HDFSReader $GP_SEGMENT_ID $GP_SEGMENT_COUNT";

    /*
     * NOTE: we have to assume that if it's not TEXT, it's going to be the
     * right custom format.  There's no easy way to find out the name of
     * the formatter here.  If the wrong formatter is used, we'll see an
     * error in the protocol.  No big deal.
     */
    rel = EXTPROTOCOL_GET_RELATION(fcinfo);
    exttbl = GetExtTableEntry(rel->rd_id);
    format = (fmttype_is_text(exttbl->fmtcode) || fmttype_is_csv(exttbl->fmtcode))
             ? "TEXT" : "GPDBWritable";
    if (fmttype_is_avro(exttbl->fmtcode))
        format = "AVRO";
    else if (fmttype_is_parquet(exttbl->fmtcode))
        format = "PARQUET";

    /* We transfer the table's schema info together with its url. */
    if (!forwrite)
    {
        initStringInfo(&table_schema);
        initStringInfo(&table_attr_names);

        int colnum = rel->rd_att->natts;
        for (int i = 0; i < colnum; i++)
        {
            int typid = rel->rd_att->attrs[i]->atttypid;

            /* add the type delimiter; for a UDT it can be any char */
            char  delim = 0;
            int16 typlen;
            bool  typbyval;
            char  typalign;
            Oid   typioparam;
            Oid   func;

            get_type_io_data(typid, IOFunc_input, &typlen, &typbyval,
                             &typalign, &delim, &typioparam, &func);

            char out[20] = {0};
            sprintf(out, "%010d%d%d%03d", typid,
                    rel->rd_att->attrs[i]->attnotnull,
                    rel->rd_att->attrs[i]->attndims, delim);
            appendStringInfoString(&table_schema, out);

            char name[70] = {0};
            sprintf(name, "%s%c", rel->rd_att->attrs[i]->attname.data, ',');
            appendStringInfoString(&table_attr_names, name);
        }
    }

    /*
     * Form the actual command:
     *
     * 1. call the env setup script
     * 2. append the remaining arguments <format>, <conn ver> and <url>
     *    to the java command
     *
     * Note: "url" has to be quoted because it's unverified user input.
     * Note: gp_hadoop_connector_version does not need to be quoted
     * because we've verified it in checkHadoopGUCs().
     *
     * Note: if the url is passed with an E prefix, quoting alone has no
     * effect, so we also filter out some dangerous characters here.
     */
    char *url_user = EXTPROTOCOL_GET_URL(fcinfo);
    if (hasIllegalCharacters(url_user))
        ereport(ERROR, (0, errmsg("illegal char in url")));

    url = quoteArgument(EXTPROTOCOL_GET_URL(fcinfo));
    initStringInfo(&cmd);

    appendStringInfo(&cmd, EXEC_URL_PREFIX "%s%s %s %s %s", env_cmd.data,
                     java_cmd, format, gp_hadoop_connector_version, url);

    if (!forwrite)
    {
        appendStringInfo(&cmd, " '%s'", table_schema.data);
        pfree(table_schema.data);

        appendStringInfo(&cmd, " '%s'", table_attr_names.data);
        pfree(table_attr_names.data);
    }

    /*
     * Set up the env and run the script.
     *
     * NOTE: the last argument to external_set_env_vars is set to ZERO
     * because we don't have access to the scan counter at all.  That's
     * ok because we don't need it.
     */
    external_set_env_vars(&extvar, url, false, NULL, NULL, false, 0);

    myData = url_execute_fopen(cmd.data, forwrite, &extvar, NULL);

    /* Free the command string */
    pfree(cmd.data);

    return myData;
}
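/*
 * Sketch only: one plausible implementation of the metacharacter check
 * that gphdfs_fopen() relies on before interpolating the URL into a
 * shell command.  The real hasIllegalCharacters() lives elsewhere in
 * this extension and may reject a different set of characters; the list
 * below is an assumption, chosen as characters that could break out of
 * a single-quoted shell argument.
 */
static bool
hasIllegalCharacters_sketch(const char *str)
{
    if (str == NULL)
        return false;

    for (; *str != '\0'; str++)
    {
        /* reject quote, escape, command separator and expansion chars */
        if (*str == '\'' || *str == '\\' || *str == ';' ||
            *str == '`'  || *str == '$')
            return true;
    }
    return false;
}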
/*
 * Import data into GPDB.
 */
Datum
demoprot_import(PG_FUNCTION_ARGS)
{
    extprotocol_t  *myData;
    char           *data;
    int             datlen;
    size_t          nread = 0;

    /* Must be called via the external table format manager */
    if (!CALLED_AS_EXTPROTOCOL(fcinfo))
        elog(ERROR, "extprotocol_import: not called by external protocol manager");

    /* Get our internal description of the protocol */
    myData = (extprotocol_t *) EXTPROTOCOL_GET_USER_CTX(fcinfo);

    if (EXTPROTOCOL_IS_LAST_CALL(fcinfo))
    {
        /* we're done receiving data. close our connection */
        if (myData && myData->file)
            if (fclose(myData->file))
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not close file \"%s\": %m",
                                myData->filename)));

        PG_RETURN_INT32(0);
    }

    if (myData == NULL)
    {
        /* first call. do any desired init */
        const char *p_name = "demoprot";
        DemoUri    *parsed_url;
        char       *url = EXTPROTOCOL_GET_URL(fcinfo);

        myData = palloc(sizeof(extprotocol_t));

        myData->url = pstrdup(url);
        parsed_url = ParseDemoUri(myData->url);
        myData->filename = pstrdup(parsed_url->path);

        if (strcasecmp(parsed_url->protocol, p_name) != 0)
            elog(ERROR, "internal error: demoprot called with a different protocol (%s)",
                 parsed_url->protocol);

        FreeDemoUri(parsed_url);

        /* open the source file (or connect to a remote server in other cases) */
        myData->file = fopen(myData->filename, "r");

        if (myData->file == NULL)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("demoprot_import: could not open file \"%s\" for reading: %m",
                            myData->filename),
                     errOmitLocation(true)));

        EXTPROTOCOL_SET_USER_CTX(fcinfo, myData);
    }

    /* =======================================================================
     *                            DO THE IMPORT
     * ======================================================================= */

    data = EXTPROTOCOL_GET_DATABUF(fcinfo);
    datlen = EXTPROTOCOL_GET_DATALEN(fcinfo);

    if (datlen > 0)
    {
        nread = fread(data, 1, datlen, myData->file);

        if (ferror(myData->file))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("demoprot_import: could not read from file \"%s\": %m",
                            myData->filename)));
    }

    PG_RETURN_INT32((int) nread);
}
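/*
 * Minimal sketch of the URI handling demoprot_import() depends on.  The
 * real DemoUri/ParseDemoUri()/FreeDemoUri() live elsewhere in the demo
 * protocol source; this version is an assumption that the struct carries
 * only the two fields used above and that URLs look like
 * "demoprot://some/path".  It relies on the file's existing includes.
 */
typedef struct DemoUriSketch
{
    char *protocol;     /* e.g. "demoprot" */
    char *path;         /* e.g. "some/path" */
} DemoUriSketch;

static DemoUriSketch *
parse_demo_uri_sketch(const char *url)
{
    DemoUriSketch *uri = palloc0(sizeof(DemoUriSketch));
    const char    *sep = strstr(url, "://");

    if (sep == NULL)
        elog(ERROR, "invalid demoprot url \"%s\"", url);

    /* everything before "://" is the protocol, everything after the path */
    uri->protocol = pnstrdup(url, sep - url);
    uri->path = pstrdup(sep + 3);
    return uri;
}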
/*
 * Import data into GPDB.
 * Invoked by GPDB, be careful with C++ exceptions.
 */
Datum
s3_import(PG_FUNCTION_ARGS)
{
    S3ExtBase *myData;
    char      *data;
    int        data_len;
    size_t     nread = 0;

    /* Must be called via the external table format manager */
    if (!CALLED_AS_EXTPROTOCOL(fcinfo))
        elog(ERROR, "extprotocol_import: not called by external protocol manager");

    /* Get our internal description of the protocol */
    myData = (S3ExtBase *) EXTPROTOCOL_GET_USER_CTX(fcinfo);

    if (EXTPROTOCOL_IS_LAST_CALL(fcinfo))
    {
        if (myData)
        {
            thread_cleanup();
            if (!myData->Destroy())
            {
                ereport(ERROR, (0, errmsg("Failed to cleanup S3 extension")));
            }
            delete myData;
        }

        /*
         * Cleanup function for the XML library.
         */
        xmlCleanupParser();
        PG_RETURN_INT32(0);
    }

    if (myData == NULL)
    {
        /* first call. do any desired init */
        curl_global_init(CURL_GLOBAL_ALL);
        thread_setup();

        const char *p_name = "s3";
        char *url_with_options = EXTPROTOCOL_GET_URL(fcinfo);
        char *url = truncate_options(url_with_options);

        char *config_path = get_opt_s3(url_with_options, "config");
        if (!config_path)
        {
            /*
             * No config path in the url, use the default value:
             * data_folder/gpseg0/s3/s3.conf
             */
            config_path = strdup("s3/s3.conf");
        }

        bool result = InitConfig(config_path, "");
        if (!result)
        {
            free(config_path);
            ereport(ERROR, (0, errmsg("Can't find config file, please check")));
        }
        else
        {
            ClearConfig();
            free(config_path);
        }

        InitLog();

        if (s3ext_accessid == "")
        {
            ereport(ERROR, (0, errmsg("ERROR: access id is empty")));
        }
        if (s3ext_secret == "")
        {
            ereport(ERROR, (0, errmsg("ERROR: secret is empty")));
        }
        if ((s3ext_segnum == -1) || (s3ext_segid == -1))
        {
            ereport(ERROR, (0, errmsg("ERROR: segment id is invalid")));
        }

        myData = CreateExtWrapper(url);

        if (!myData ||
            !myData->Init(s3ext_segid, s3ext_segnum, s3ext_chunksize))
        {
            if (myData)
                delete myData;
            ereport(ERROR,
                    (0, errmsg("Failed to init S3 extension, segid = "
                               "%d, segnum = %d, please check your "
                               "configurations and net connection",
                               s3ext_segid, s3ext_segnum)));
        }

        EXTPROTOCOL_SET_USER_CTX(fcinfo, myData);
        free(url);
    }

    /* =======================================================================
     *                            DO THE IMPORT
     * ======================================================================= */

    data = EXTPROTOCOL_GET_DATABUF(fcinfo);
    data_len = EXTPROTOCOL_GET_DATALEN(fcinfo);

    uint64_t readlen = 0;
    if (data_len > 0)
    {
        readlen = data_len;
        if (!myData->TransferData(data, readlen))
            ereport(ERROR, (0, errmsg("s3_import: could not read data")));
        nread = (size_t) readlen;
        // S3DEBUG("read %d data from S3", nread);
    }

    PG_RETURN_INT32((int) nread);
}
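/*
 * Illustration of the option handling s3_import() performs on its URL.
 * truncate_options() and get_opt_s3() are the real helpers used above;
 * the URL below is a made-up example of the expected shape, and the
 * comments show the assumed split.  Both helpers return malloc'd strings
 * that the caller frees, matching the free() calls in s3_import().
 */
static void
s3_url_options_demo(void)
{
    char url_with_options[] =
        "s3://s3.example.com/bucket/prefix config=/home/gpadmin/s3.conf";

    /* strip trailing options: "s3://s3.example.com/bucket/prefix" */
    char *url = truncate_options(url_with_options);

    /* extract one named option: "/home/gpadmin/s3.conf" */
    char *config_path = get_opt_s3(url_with_options, "config");

    free(config_path);
    free(url);
}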