size_t streaming_readfunc(void* ptr, size_t size, size_t nmemb, void* stream) {
   LOG(LOG_INFO, "entry\n");

   IOBuf*        b  = (IOBuf*)stream;
   ObjectStream* os = (ObjectStream*)b->user_data;
   size_t total = (size * nmemb);
   LOG(LOG_INFO, "(%08lx) curl buff %ld\n", (size_t)os, total);

   // wait for producer to fill buffers
   WAIT(&os->iob_full);
   LOG(LOG_INFO, "(%08lx) avail-data: %ld\n", (size_t)os, b->avail);

   // maybe we were requested to quit or abort?
   if (b->write_count == 0) {            // called by stream_sync()
      LOG(LOG_INFO, "(%08lx) got EOF\n", (size_t)os);
      POST(&os->iob_empty);              // polite
      return 0;
   }
   else if (b->first->buf == (char*)1) { // called by stream_abort()
      LOG(LOG_INFO, "(%08lx) got ABORT\n", (size_t)os);
      POST(&os->iob_empty);              // polite
      return CURL_READFUNC_ABORT;
   }

   // move producer's data into curl buffers.
   // (Might take more than one callback)
   size_t move_req = ((total <= b->avail) ? total : b->avail);
   size_t moved    = aws_iobuf_get_raw(b, (char*)ptr, move_req);

   // track total size
   os->written += moved;
   LOG(LOG_INFO, "(%08lx) moved %ld (total: %ld)\n", (size_t)os, moved, os->written);

   if (b->avail) {
      LOG(LOG_INFO, "(%08lx) iterating (avail: %ld)\n", (size_t)os, b->avail);
      POST(&os->iob_full);               // next callback is pre-approved
   }
   else {
      LOG(LOG_INFO, "(%08lx) done with buffer (total written %ld)\n", (size_t)os, os->written);
      POST(&os->iob_empty);              // tell producer that buffer is used
   }

   return moved;
}

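/*
 * Illustration only -- not part of the S3 backend.  A minimal, self-contained
 * sketch of the full/empty handshake that streaming_readfunc() relies on,
 * using a POSIX semaphore pair and a toy buffer instead of the aws4c
 * IOBuf/ObjectStream machinery (WAIT/POST are assumed here to be thin
 * wrappers over such semaphores).  All names below (toy_stream_t,
 * toy_stream_init, toy_producer, toy_consumer) are hypothetical and exist
 * only for this sketch; producer and consumer are meant to run in separate
 * threads.
 */
#include <semaphore.h>
#include <string.h>

typedef struct {
   sem_t  full;    /* posted by the producer when data is staged        */
   sem_t  empty;   /* posted by the consumer when the buffer is drained */
   char   buf[256];
   size_t avail;   /* bytes currently staged in buf                     */
} toy_stream_t;

static void toy_stream_init(toy_stream_t* s) {
   sem_init(&s->full,  0, 0);
   sem_init(&s->empty, 0, 0);
   s->avail = 0;
}

/* producer side: stage one chunk, then block until the consumer drains it */
static void toy_producer(toy_stream_t* s, const char* data, size_t len) {
   memcpy(s->buf, data, len);
   s->avail = len;
   sem_post(&s->full);    /* like POST(&os->iob_full)  */
   sem_wait(&s->empty);   /* like WAIT(&os->iob_empty) */
}

/* consumer side: the shape of streaming_readfunc(), minus curl and IOBuf */
static size_t toy_consumer(toy_stream_t* s, char* dst, size_t max) {
   sem_wait(&s->full);    /* like WAIT(&os->iob_full) */
   size_t n = (s->avail <= max) ? s->avail : max;
   memcpy(dst, s->buf, n);
   memmove(s->buf, s->buf + n, s->avail - n);   /* keep any leftover at the front */
   s->avail -= n;
   if (s->avail)
      sem_post(&s->full);    /* data remains: pre-approve the next callback */
   else
      sem_post(&s->empty);   /* drained: let the producer refill            */
   return n;
}
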
static void S3_Close_internal( void*        fd,
                               IOR_param_t* param,
                               int          multi_part_upload_p ) {
   char* fname = (char*)fd;   /* see NOTE above S3_Create_Or_Open() */

   // easier to think
   int n_to_n    = param->filePerProc;
   int n_to_1    = (! n_to_n);
   int segmented = (param->segmentCount == 1);

   if (param->verbose >= VERBOSE_2) {
      printf("-> S3_Close('%s', ,%d) %s\n",
             fname, multi_part_upload_p,
             ((n_to_n) ? "N:N" : ((segmented) ? "N:1(seg)" : "N:1(str)")));
   }

   if (param->open == WRITE) {

      // finalizing Multi-Part Upload (for N:1 or N:N)
      if (multi_part_upload_p) {

         size_t etag_data_size = param->etags->write_count;   /* local ETag data (bytes) */
         size_t etags_per_rank = etag_data_size / ETAG_SIZE;  /* number of local etags */

         // --- create XML containing ETags in an IOBuf for "close" request
         IOBuf* xml = NULL;

         if (n_to_1) {

            // for N:1, gather all ETags at Rank0
            MPI_Datatype mpi_size_t;
            if (sizeof(size_t) == sizeof(int))
               mpi_size_t = MPI_INT;
            else if (sizeof(size_t) == sizeof(long))
               mpi_size_t = MPI_LONG;
            else
               mpi_size_t = MPI_LONG_LONG;

            // Everybody should have the same number of ETags (?)
            size_t etag_count_max = 0;   /* highest number on any proc */
            MPI_Allreduce(&etags_per_rank, &etag_count_max,
                          1, mpi_size_t, MPI_MAX, param->testComm);
            if (etags_per_rank != etag_count_max) {
               printf("Rank %d: etag count mismatch: max:%zu, mine:%zu\n",
                      rank, etag_count_max, etags_per_rank);
               MPI_Abort(param->testComm, 1);
            }

            // collect ETag data at Rank0
            aws_iobuf_realloc(param->etags);              /* force single contiguous buffer */
            char* etag_data = param->etags->first->buf;   /* per-rank data, contiguous */

            if (rank == 0) {
               char* etag_ptr;
               int   i;
               int   j;
               int   rnk;

               char* etag_vec = (char*)malloc((param->numTasks * etag_data_size) +1);
               if (! etag_vec) {
                  fprintf(stderr, "rank 0 failed to malloc %zu bytes\n",
                          param->numTasks * etag_data_size);
                  MPI_Abort(param->testComm, 1);
               }
               MPI_Gather(etag_data, etag_data_size, MPI_BYTE,
                          etag_vec,  etag_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);

               // --- debugging: show the gathered etag data
               //     (This shows the raw concatenated etag-data from each node.)
               if (param->verbose >= VERBOSE_4) {
                  printf("rank 0: gathered %zu etags from all ranks:\n", etags_per_rank);
                  etag_ptr = etag_vec;
                  for (rnk=0; rnk<param->numTasks; ++rnk) {
                     printf("\t[%d]: '", rnk);
                     int ii;
                     for (ii=0; ii<etag_data_size; ++ii)   /* NOT null-terminated! */
                        printf("%c", etag_ptr[ii]);
                     printf("'\n");
                     etag_ptr += etag_data_size;
                  }
               }

               // add XML for *all* the parts.  The XML must be ordered by
               // part-number.  Each rank wrote <etags_per_rank> parts,
               // locally.  At rank0, the etags for each rank are now
               // stored as a contiguous block of text, with the blocks
               // stored in rank order in etag_vec.  In other words, our
               // internal rep at rank 0 matches the "segmented" format.
               // From this, we must select etags in an order matching how
               // they appear in the actual object, and give sequential
               // part-numbers to the resulting sequence.
               //
               // That ordering of parts in the actual written object
               // varies according to whether we wrote in the "segmented"
               // or "strided" format.
               //
               //     supposing N ranks, and P parts per rank:
               //
               //     segmented:
               //
               //        all parts for a given rank are consecutive.
               //        rank r writes these parts:
               //
               //           rP, rP+1, ... (r+1)P -1
               //
               //        i.e. rank0 writes parts 0,1,2,3 ... P-1
               //
               //
               //     strided:
               //
               //        rank r writes every N-th part, starting with r.
               //
               //           r, N+r, 2N+r, ... (P-1)N + r
               //
               //        i.e. rank0 writes parts 0, N, 2N, ... (P-1)N
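               //
               // Worked example (illustrative only): with N=2 ranks and
               // P=3 parts per rank, the object's part order is
               //
               //     segmented:  r0p0 r0p1 r0p2 r1p0 r1p1 r1p2
               //                 (rank0 owns parts 0,1,2; rank1 owns 3,4,5)
               //
               //     strided:    r0p0 r1p0 r0p1 r1p1 r0p2 r1p2
               //                 (rank0 owns parts 0,2,4; rank1 owns 1,3,5)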
               //
               // NOTE: If we knew ahead of time how many parts each rank was
               //       going to write, we could assign part-number ranges, per
               //       rank, and then have nice locality here.
               //
               //       Alternatively, we could have everyone format their own
               //       XML text and send that, instead of just the tags.  This
               //       would increase the amount of data being sent, but would
               //       reduce the work for rank0 to format everything.

               size_t i_max;              // outer loop
               size_t j_max;              // inner loop
               size_t start_multiplier;   // initial offset in collected data
               size_t stride;             // in etag_vec

               if (segmented) {           // segmented
                  i_max            = param->numTasks;
                  j_max            = etags_per_rank;
                  start_multiplier = etag_data_size;   /* one rank's-worth of ETag data */
                  stride           = ETAG_SIZE;        /* one ETag */
               }
               else {                     // strided
                  i_max            = etags_per_rank;
                  j_max            = param->numTasks;
                  start_multiplier = ETAG_SIZE;        /* one ETag */
                  stride           = etag_data_size;   /* one rank's-worth of ETag data */
               }

               xml = aws_iobuf_new();
               aws_iobuf_growth_size(xml, 1024 * 8);

               // write XML header ...
               aws_iobuf_append_str(xml, "<CompleteMultipartUpload>\n");

               int part = 0;
               for (i=0; i<i_max; ++i) {
                  etag_ptr = etag_vec + (i * start_multiplier);

                  for (j=0; j<j_max; ++j) {
                     // etags were saved as contiguous text.  Extract the next one.
                     char etag[ETAG_SIZE +1];
                     memcpy(etag, etag_ptr, ETAG_SIZE);
                     etag[ETAG_SIZE] = 0;

                     // write XML for next part, with ETag ...
                     snprintf(buff, BUFF_SIZE,
                              " <Part>\n"
                              " <PartNumber>%d</PartNumber>\n"
                              " <ETag>%s</ETag>\n"
                              " </Part>\n",
                              part, etag);
                     aws_iobuf_append_str(xml, buff);

                     etag_ptr += stride;
                     ++ part;
                  }
               }

               // write XML tail ...
               aws_iobuf_append_str(xml, "</CompleteMultipartUpload>\n");
            }
            else {
               MPI_Gather(etag_data, etag_data_size, MPI_BYTE,
                          NULL,      etag_data_size, MPI_BYTE, 0, MPI_COMM_WORLD);
            }
         }
         else {   /* N:N */

            xml = aws_iobuf_new();
            aws_iobuf_growth_size(xml, 1024 * 8);

            // write XML header ...
            aws_iobuf_append_str(xml, "<CompleteMultipartUpload>\n");

            // all parts of our object were written from this rank.
            char etag[ETAG_SIZE +1];
            int  part = 0;
            int  i;
            for (i=0; i<etags_per_rank; ++i) {

               // TBD: Instead of reading into etag, then sprintf'ing, then
               //      copying into xml, we could just read directly into xml
               int sz = aws_iobuf_get_raw(param->etags, etag, ETAG_SIZE);
               if (sz != ETAG_SIZE) {
                  snprintf(buff, BUFF_SIZE,
                           "rank %d: read of ETag %d had length %d (not %d)\n",
                           rank, i, sz, ETAG_SIZE);
                  ERR_SIMPLE(buff);
               }
               etag[ETAG_SIZE] = 0;

               // write XML for next part, with ETag ...
               snprintf(buff, BUFF_SIZE,
                        " <Part>\n"
                        " <PartNumber>%d</PartNumber>\n"
                        " <ETag>%s</ETag>\n"
                        " </Part>\n",
                        part, etag);
               aws_iobuf_append_str(xml, buff);

               ++ part;
            }

            // write XML tail ...
            aws_iobuf_append_str(xml, "</CompleteMultipartUpload>\n");
         }
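
         // For reference (ETag values below are made up, not from a real run):
         // whichever branch built it, the IOBuf now holds XML of this shape,
         // with part numbers assigned sequentially by the loops above:
         //
         //     <CompleteMultipartUpload>
         //      <Part>
         //      <PartNumber>0</PartNumber>
         //      <ETag>0123456789abcdef0123456789abcdef</ETag>
         //      </Part>
         //      <Part>
         //      <PartNumber>1</PartNumber>
         //      <ETag>fedcba9876543210fedcba9876543210</ETag>
         //      </Part>
         //     </CompleteMultipartUpload>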

         // send request to finalize MPU
         if (n_to_n || (rank == 0)) {

            // DEBUGGING: show the XML we constructed
            if (param->verbose >= VERBOSE_3)
               debug_iobuf(xml, 1, 1);

            // --- POST our XML to the server.
            snprintf(buff, BUFF_SIZE, "%s?uploadId=%s", fname, param->UploadId);
            AWS4C_CHECK   ( s3_post(xml, buff) );
            AWS4C_CHECK_OK( xml );

            aws_iobuf_free(xml);
         }

         // everybody reset MPU info.  Allows another MPU, and frees memory.
         s3_MPU_reset(param);

         // Everybody meetup, so non-zero ranks won't go trying to stat the
         // N:1 file until rank0 has finished the S3 multi-part finalize.
         // The object will not appear to exist, until then.
         if (n_to_1)
            MPI_CHECK(MPI_Barrier(param->testComm), "barrier error");
      }
      else {
         // No finalization is needed, when using EMC's byte-range writing
         // support.  However, we do need to make sure everyone has
         // finished writing, before anyone starts reading.
         if (n_to_1) {
            MPI_CHECK(MPI_Barrier(param->testComm), "barrier error");
            if (param->verbose >= VERBOSE_2)
               printf("rank %d: passed barrier\n", rank);
         }
      }

      // After writing, reset the CURL connection, so that caches won't be
      // used for reads.
      aws_reset_connection();
   }

   if (param->verbose >= VERBOSE_2) {
      printf("<- S3_Close\n");
   }
}
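
/*
 * For reference (generic S3 multi-part-upload semantics; the request line and
 * values are illustrative, not captured from a run): the finalize step above
 * amounts to a single POST of the assembled XML to the object's URL with the
 * uploadId query parameter, e.g.
 *
 *    POST /bucket/object?uploadId=<UploadId-from-initiate> HTTP/1.1
 *
 *    <CompleteMultipartUpload>
 *     <Part><PartNumber>0</PartNumber><ETag>...</ETag></Part>
 *     ...
 *    </CompleteMultipartUpload>
 *
 * which is what the s3_post(xml, "<fname>?uploadId=<UploadId>") call above
 * is asked to issue.
 */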