/* Wait until every request in CBP[0..NENT-1] has completed.
   NULL entries are skipped; each slot is reset to NULL once its request
   has been retired with aio_return64.  Failures are reported with
   error(3) unless the failure code equals ALLOWED_ERR (0 = none allowed).
   Returns 0 if all requests succeeded, 1 if any failed.

   Fix: the original read aio_error64() *after* aio_return64() had retired
   the request.  POSIX leaves the error status unspecified once
   aio_return() has been called, so the allowed-error test and the
   reported errno were taken from indeterminate state.  The error code is
   now captured exactly once, before the request is retired.  */
static int
do_wait (struct aiocb64 **cbp, size_t nent, int allowed_err)
{
  int go_on;
  size_t cnt;
  int result = 0;

  do
    {
      /* Block until at least one of the outstanding requests finishes.  */
      aio_suspend64 ((const struct aiocb64 *const *) cbp, nent, NULL);
      go_on = 0;
      for (cnt = 0; cnt < nent; ++cnt)
        if (cbp[cnt] != NULL)
          {
            /* Capture the completion status before aio_return64 retires
               the request; afterwards aio_error64 is unspecified.  */
            int err = aio_error64 (cbp[cnt]);
            if (err == EINPROGRESS)
              go_on = 1;
            else
              {
                if (aio_return64 (cbp[cnt]) == -1
                    && (allowed_err == 0 || err != allowed_err))
                  {
                    error (0, err, "Operation failed\n");
                    result = 1;
                  }
                /* Retired: remove it from the wait set.  */
                cbp[cnt] = NULL;
              }
          }
    }
  while (go_on);

  return result;
}
/* Complete a pending asynchronous read on an XFS file.
 *
 * If the request is still queued, blocks in aio_suspend64 until the
 * operation finishes, retires it with aio_return64, and records the byte
 * count in the request.  *error_code is set to MPI_SUCCESS on success or
 * to an MPI I/O error class built from errno on failure.  Unless the
 * request carries the queued == -1 "complete but do not free" marker
 * (see below), the request object is removed from the async list and
 * freed, and *request is reset to ADIO_REQUEST_NULL.
 */
void ADIOI_XFS_ReadComplete(ADIO_Request *request, ADIO_Status *status, int *error_code)
{
    int err;
    static char myname[] = "ADIOI_XFS_READCOMPLETE";

    /* Nothing to do for the null request. */
    if (*request == ADIO_REQUEST_NULL) {
	*error_code = MPI_SUCCESS;
	return;
    }

    if ((*request)->queued) {
	/* Wait for this one request; retry if interrupted by a signal. */
	do {
	    err = aio_suspend64((const aiocb64_t **) &((*request)->handle), 1, 0);
	} while ((err == -1) && (errno == EINTR));

	if (err != -1) {
	    /* Retire the request; aio_return64 yields the byte count
	       (or -1 on failure). */
	    err = aio_return64((aiocb64_t *) (*request)->handle);
	    (*request)->nbytes = err;
	    /* Stash the completion status in errno so the error path
	       below can report it via strerror(errno).
	       NOTE(review): aio_error64 is called after aio_return64
	       here; POSIX leaves that value unspecified — confirm
	       against the target platform's behavior. */
	    errno = aio_error64((aiocb64_t *) (*request)->handle);
	} else (*request)->nbytes = -1;

	/* err is -1 if either aio_suspend64 failed or the transfer
	   itself failed. */
	if (err == -1) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno));
	} else *error_code = MPI_SUCCESS;
    } /* if ((*request)->queued) */
    else *error_code = MPI_SUCCESS;

#ifdef HAVE_STATUS_SET_BYTES
    /* Record the transfer size in the MPI status, if supported. */
    if ((*request)->nbytes != -1)
	MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes);
#endif

    if ((*request)->queued != -1) {

	/* queued = -1 is an internal hack used when the request must
	   be completed, but the request object should not be
	   freed. This is used in ADIOI_Complete_async, because the
	   user will call MPI_Wait later, which would require status
	   to be filled. Ugly but works. queued = -1 should be used
	   only in ADIOI_Complete_async.
	   This should not affect the user in any way. */

	/* if request is still queued in the system, it is also there
	   on ADIOI_Async_list. Delete it from there. */
	if ((*request)->queued) ADIOI_Del_req_from_list(request);

	(*request)->fd->async_count--;
	if ((*request)->handle) ADIOI_Free((*request)->handle);
	ADIOI_Free_request((ADIOI_Req_node *) (*request));
	*request = ADIO_REQUEST_NULL;
    }
}
//
// CloseHandle() has a slightly different interface from the NT call. It takes an
// additional input parameter to determine the object type. The object can be either
// a FILE_ELEMENT or a CQ_ELEMENT.
//
// CloseHandle() closes the file handle or the completion queue handle and frees all
// the allocated memory. It does an additional check to ensure that all the queued
// Asynch I/Os that have not yet completed are cancelled before actually closing the
// handle. This helps clean up the kernel queues of any pending requests.
//
// Although it takes longer, this is acceptable since the code is not in the performance
// critical region.
//
BOOL CloseHandle(HANDLE object, int object_type)
{
	struct File *filep;
	struct IOCQ *cqid;
	int retval, i;

#ifdef _DEBUG
	cout << "CloseHandle() freeing : handle = " << object << " objecttype = " << object_type << endl;
#endif

	switch (object_type) {
	case FILE_ELEMENT:
		filep = (struct File *)object;
		cqid = filep->iocq;

		// cancel any pending aio requests; keep retrying until the
		// kernel reports none are left in the NOTCANCELED state.
		retval = aio_cancel64(filep->fd, NULL);
		while (retval == AIO_NOTCANCELED) {
			retval = aio_cancel64(filep->fd, NULL);
		}

		// Drain completion state for every request of this file that
		// is still tracked in the associated completion queue.
		if (cqid != NULL && cqid->element_list != NULL && cqid->aiocb_list != NULL) {
			for (i = 0; i < cqid->size; i++) {
				if (cqid->element_list[i].aiocbp.aio_fildes != filep->fd)
					continue;
				// We are not interested in the return values of aio_error() and aio_return().
				// only have to dequeue all the requests.
				if (!cqid->aiocb_list[i])
					continue;
				retval = aio_error64(cqid->aiocb_list[i]);
				retval = aio_return64(cqid->aiocb_list[i]);
			}
		}
#if defined(IOMTR_OS_LINUX) || defined(IOMTR_OS_OSX) || defined(IOMTR_OS_SOLARIS)
		close(filep->fd);
#elif defined(IOMTR_OS_NETWARE)
		// NetWare uses different close paths for logical vs physical disks.
		if (IsType(filep->type, LogicalDiskType))
			NXClose(filep->fd);
		else if (IsType(filep->type, PhysicalDiskType))
			MM_ReleaseIOObject(filep->fd);
#else
#warning ===> WARNING: You have to do some coding here to get the port done! 
#endif
		break;
	case CQ_ELEMENT:
		cqid = (struct IOCQ *)object;

		// cancel any pending aio requests still tracked by this queue.
		for (i = 0; i < cqid->size; i++) {
			if (!cqid->aiocb_list[i])
				continue;
#if defined(IOMTR_OS_LINUX) || defined(IOMTR_OS_OSX) || defined(IOMTR_OS_NETWARE)
			/*
			 * In Linux, you crash (!) if the aiocpb isn't in your queue. :-(
			 * This code seems to occasionally do this...so I just cancel all
			 * AIOs for the queue, thus avoiding the problem of cancelling a
			 * message not in the queue.
			 */
			retval = aio_cancel64(cqid->element_list[i].aiocbp.aio_fildes, NULL);
#elif defined(IOMTR_OS_SOLARIS)
			retval = aio_cancel64(cqid->element_list[i].aiocbp.aio_fildes, cqid->aiocb_list[i]);
#else
#warning ===> WARNING: You have to do some coding here to get the port done!
#endif
			// Requests that could not be cancelled have completed (or are
			// about to); retire them so the kernel can release resources.
			if (retval == AIO_NOTCANCELED) {
				retval = aio_error64(cqid->aiocb_list[i]);
				retval = aio_return64(cqid->aiocb_list[i]);
			}
		}
#if defined(IOMTR_OS_LINUX) || defined(IOMTR_OS_OSX) || defined(IOMTR_OS_SOLARIS)
		free(cqid->aiocb_list);
#elif defined(IOMTR_OS_NETWARE)
		NXMemFree(cqid->aiocb_list);
#else
#warning ===> WARNING: You have to do some coding here to get the port done!
#endif
		// Something strange here. If I free the element_list, the next round
		// of aio_write() and aio_read() calls fail. If I dont free this, then they
		// succeed. But then, there is a memory leak equal to the max number of outstanding
		// I/Os * sizeof(CQ_Element). Does that mean that the above aio_cancel() calls
		// are broken ??? It should be mentioned here that the element_list holds the
		// actual aiocb structures.
		//
		// It suddenly seems to be working now.
		// Remember to turn this "free" off when you hit the problem again.
		// NEED TO LOOK INTO THIS.
#if defined(IOMTR_OS_LINUX) || defined(IOMTR_OS_OSX) || defined(IOMTR_OS_SOLARIS)
		free(cqid->element_list);
		free(cqid);
#elif defined(IOMTR_OS_NETWARE)
		NXMemFree(cqid->element_list);
		NXMemFree(cqid);
#else
#warning ===> WARNING: You have to do some coding here to get the port done! 
#endif
		break;
	default:
		break;
	}
	return (TRUE);
}
//
// WriteFile() writes "bytes_to_write" bytes from the buffer into the file pointed to by
// the file handle.
// The call uses asynch I/O routine aio_write().
//
// WriteFile() checks the Overlapped structure to determine the write offset in the file handle.
// It also determines from the Overlapped structure if the I/O completion status should be
// posted on the event queue or the completion queue associated with the file handle.
//
BOOL WriteFile(HANDLE file_handle, void *buffer, DWORD bytes_to_write, LPDWORD bytes_written, LPOVERLAPPED lpOverlapped)
{
	struct File *filep;
	struct IOCQ *this_cq;
	struct aiocb64 *aiocbp;
	int i, free_index = -1;

#ifdef IMMEDIATE_AIO_COMPLETION
	int aio_error_return;
#endif

	filep = (struct File *)file_handle;

	//
	// At this point we have to decide whether to place this in the Completion queue
	// or the event queue.  The low bit of hEvent is used as the tag: set means the
	// caller forces the event queue, clear means use the file's completion queue.
	if ((ULONG_PTR) lpOverlapped->hEvent & 0x00000001) {
		// forcibly place this on the event queue even though a completion queue is associated
		// with the file. Well, thats what you asked for.
		this_cq = (IOCQ *) ((ULONG_PTR) lpOverlapped->hEvent ^ 0x1);
	} else
		this_cq = filep->iocq;

	if (this_cq == NULL) {
		cout << "event or completion queue not allocated " << endl;
		return (FALSE);
	}
	// First locate an empty slot in the queue; try the cached last-freed
	// slot before scanning.
	if (this_cq->last_freed != -1) {
		free_index = this_cq->last_freed;
		this_cq->last_freed = -1;	// the slot is taken. Thanks
	} else
		// search for a free index.
		for (i = 0; i < this_cq->size; i++) {
			if (this_cq->aiocb_list[i] == NULL) {
				free_index = i;
				break;
			}
		}
	// either free_index holds the free index or there is no free space in the Q.
	if (free_index == -1)
		return (FALSE);

	// Fill in the aio control block for this slot.  The 64-bit offset is
	// reassembled from the OVERLAPPED's OffsetHigh/Offset pair.
	aiocbp = &this_cq->element_list[free_index].aiocbp;
	aiocbp->aio_buf = buffer;
	aiocbp->aio_fildes = filep->fd;
	aiocbp->aio_nbytes = bytes_to_write;
	aiocbp->aio_offset = (off64_t) lpOverlapped->OffsetHigh;
	aiocbp->aio_offset = aiocbp->aio_offset << 32;
	aiocbp->aio_offset += lpOverlapped->Offset;
	aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;	// completion is polled, not signalled
	this_cq->element_list[free_index].data = lpOverlapped;
	this_cq->element_list[free_index].bytes_transferred = 0;
	this_cq->element_list[free_index].completion_key = filep->completion_key;
	*bytes_written = 0;

#if defined(IOMTR_OS_LINUX) || defined(IOMTR_OS_OSX) || defined(IOMTR_OS_SOLARIS)
	if (aio_write64(&this_cq->element_list[free_index].aiocbp) < 0)
#elif defined(IOMTR_OS_NETWARE)
	if (aio_write64(&this_cq->element_list[free_index].aiocbp, filep->type) < 0)
#else
#warning ===> WARNING: You have to do some coding here to get the port done!
#endif
	{
		cout << "queuing for write failed with error " << errno << endl;
		// Note that we have not set aiocb_list[] with the correct pointers.
		// So, this slot will get grabbed in the next loop.
		SetLastError(errno);
		return (FALSE);
	}
#ifdef IMMEDIATE_AIO_COMPLETION
	// Check if the aio_write completed successfully.
	if ((aio_error_return = aio_error64(&this_cq->element_list[free_index].aiocbp)) != EINPROGRESS) {
		*bytes_written = (DWORD) aio_return64(&this_cq->element_list[free_index].aiocbp);
		if ((long)*bytes_written < 0) {
			*bytes_written = 0;
			if (aio_error_return)
				SetLastError(aio_error_return);
			else if (errno)
				SetLastError(errno);
			return (FALSE);
		} else {
			SetLastError(0);
			return (TRUE);
		}
	}
#endif
	else
		// NOTE(review): this `else` binds to a DIFFERENT `if` depending on
		// IMMEDIATE_AIO_COMPLETION.  When the macro is defined it pairs with
		// the EINPROGRESS check above; when undefined it pairs with the
		// aio_write64 failure check.  Both pairings yield the intended
		// behavior (record the pending aiocb), but edit with care.
		// aio_write is in progress. We have to set the aiocb_list[] to point correctly.
		this_cq->aiocb_list[free_index] = aiocbp;

	SetLastError(ERROR_IO_PENDING);
	return (FALSE);
}
//
// This call is very similar to GetQueuedCompletionStatus().
//
// The difference is that it searches for results on the event queue associated with
// a file handle. The call does one quick scan of the event queue and returns an I/O
// completion.
// Depending on a wait value being TRUE or FALSE, the call either blocks indefinitely
// scans the event queue just once waiting for an I/O completion.
//
BOOL GetOverlappedResult(HANDLE file_handle, LPOVERLAPPED lpOverlapped, LPDWORD bytes_transferred, BOOL wait)
{
	struct timespec *timeoutp;
	struct timespec timeout;
	struct File *filep;
	int this_fd;
	int i, j;
	int aio_error_return;
	IOCQ *eventqid;

	//
	// This function either blocks for ever or scans the AIO list just once for completions.
	//
	filep = (struct File *)file_handle;
	// wait == TRUE -> NULL timeout (block indefinitely in aio_suspend);
	// otherwise use a zero timeout for a single non-blocking poll.
	if (wait == TRUE)
		timeoutp = NULL;
	else {
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;
		timeoutp = &timeout;
	}
	//
	// GetOverlappedResult() function blocks until one or more AIOs complete and then
	// returns a completion status.
	// The search for an AIO completion is very similar to the GetQueuedCompletionStatus()
	// function.
	//
	// get a handle to the current event queue.  hEvent carries the queue
	// pointer with its low bit set as a tag (see WriteFile); clear it.
	eventqid = (IOCQ *) ((ULONG_PTR) lpOverlapped->hEvent ^ 0x1);

	// Call aio_suspend() now.
	if (aio_suspend64(eventqid->aiocb_list, eventqid->size, timeoutp) < 0) {
		if ((errno == EAGAIN) || (errno == EINVAL)) {
			// No operations completed in the given timeout.
			// Note that changing lpOverlapped has no effect. Its a local copy.
			lpOverlapped = NULL;
			*bytes_transferred = 0;
			SetLastError(WAIT_TIMEOUT);
			return (FALSE);
		}
#ifdef _DEBUG
		cout << "aio_suspend returned error " << errno << endl;
#endif
		SetLastError(errno);
		return (FALSE);
	}
	// aio_suspend() returned successfully. Check if atleast one of the
	// I/Os completed. must have!.
	// Round-robin scan starting at the saved position so slots are
	// serviced fairly across calls.
	i = eventqid->position;
	for (j = 0; j < eventqid->size; j++) {
		if (i == eventqid->size)
			i = 0;
		errno = 0;
		if (eventqid->aiocb_list[i]) {
			if ((aio_error_return = aio_error64(eventqid->aiocb_list[i])) != EINPROGRESS) {
				// Only report completions that belong to this file handle.
				this_fd = eventqid->element_list[i].aiocbp.aio_fildes;
				if (this_fd == filep->fd) {
					DWORD bytes_expected;

					*bytes_transferred = (DWORD) aio_return64(eventqid->aiocb_list[i]);
					bytes_expected = eventqid->aiocb_list[i]->aio_nbytes;
					// We have done an aio_return() on this element. So anull it.
					eventqid->aiocb_list[i] = 0;
					eventqid->last_freed = i;
					eventqid->position = i + 1;
					// Note that changing lpOverlapped has no effect. Its a local copy.
					lpOverlapped = (LPOVERLAPPED) eventqid->element_list[i].data;
					// Short transfer: treat out-of-space as a hard failure.
					if (*bytes_transferred != bytes_expected) {
						if (aio_error_return == ENOSPC) {
							SetLastError(aio_error_return);
							return (FALSE);
						}
					}
					// NOTE(review): DWORD is unsigned, so this `< 0` test can
					// never be true — the failure branch below looks dead.
					// Presumably a signed cast (as in WriteFile's `(long)`)
					// was intended; confirm before changing.
					if ((DWORD) * bytes_transferred < 0) {
						*bytes_transferred = 0;
						if (aio_error_return)
							SetLastError(aio_error_return);
						else if (errno)
							SetLastError(errno);
						return (FALSE);
					} else
						return (TRUE);
				}
			}
		}
		i++;
	}			// end of for() loop

	// At this point NO I/O has completed. Return WAIT_TIMEOUT and FALSE.
	SetLastError(WAIT_TIMEOUT);
	return (FALSE);
}
// // Again, another NT-like call. // // The GetQueuedCompletionStatus() returns the status of exactly one completed // (with or without errors) I/O. It searches the specified completion queue to // see if any asynch I/O operation has completed. If not, then depending on // the timeout value it blocks the caller until one or more I/Os complete // using the aio_suspend() call. // // Note that it is the callers responsibility to call this function with the // correct completion queue handle and timeout value. // // The function returns the #bytes transferred and the completion key (tied to // the file handle and a pointer to void data. // BOOL GetQueuedCompletionStatus(HANDLE cq, LPDWORD bytes_transferred, LPDWORD completion_key, LPOVERLAPPED * lpOverlapped, DWORD tmout) { struct timespec *timeoutp; struct timespec timeout; struct IOCQ *cqid; int i, j; int aio_error_return; cqid = (struct IOCQ *)cq; // // We have two arrays of completion queue information. // The first one is the element_list[] which holds all info about each of the async // I/O request including the all important aiocb structure (for aio control block). // // The second array is the aiocb_list[], which is an array of pointers // to aiocb structures (see any of the aio_...() man pages for details on aiocb) // in the element_list[] and they are required since aio_suspend() requires the start // address of a list of all aiocb pointers to be monitored and the size of the list. // // The way we search for AIO completion is very simple. // Start with current position (initially 0) in and then scan the list for completion. // If atleast one AIO is done return the status and set position to the next element // in the list. (that is where the next search begin - effectively simulating a round // -robin search. // // If none of the AIO is done, block with aio_suspend() until one or more AIOs // complete. When aio_suspend() returns, tag *all* of the completed AIOs and return. 
// This helps in avoiding two problems. One, aio_suspend() is not called as often as // it otherwise would have been. So, it improves performance. Two, the number of // outstanding I/Os queued to the kernel is not very large. That keeps the kernel happy. // // The next call to GetQueuedCompletionStatus() will pick up the completed AIOs. // // Network AIOs can be speeded up a bit by having an initial tagging loop that // will tag and break when an AIO completion is detected. // // IO Return Loop - return one AIO completion. i = cqid->position; for (j = 0; j < cqid->size; j++) { if (i == cqid->size) { i = 0; } if (cqid->element_list[i].done == TRUE) { // IO operation completed with either success or failure. *completion_key = cqid->element_list[i].completion_key; // Always set completion key *lpOverlapped = (LPOVERLAPPED) cqid->element_list[i].data; // Always set overlap data cqid->element_list[i].done = FALSE; cqid->last_freed = i; cqid->position = i + 1; *bytes_transferred = cqid->element_list[i].bytes_transferred; // We are returning the status of this aio. Set it to NULL to free the slot. cqid->aiocb_list[i] = 0; if ((DWORD) * bytes_transferred < (DWORD) 0) { *bytes_transferred = 0; // TODO: Here - and in the other locations where SetLastError() // is called in this method - we have the problem, that it is // set to // a.) defines defined by us - like WAIT_TIMEOUT // b.) whatever is in the errno variable // We can not realy be shure what each one is and if there is // maybe an overlaps, so we have to consolidate that in some // way. // As this method is called by CQAIO::GetStatus() (only?), we // have to considere changes there as well. SetLastError(cqid->element_list[i].error); return (FALSE); } else { return (TRUE); } } i++; } // end of IO Return loop. // // Beyond this point return FALSE. No I/O has completed yet. // aio_suspend() till atleast one completes. But do not return // a completion. Only mark them. 
// if (tmout == INFINITE) timeoutp = NULL; else { timeout.tv_sec = tmout / 1000; timeout.tv_nsec = (tmout % 1000) * 1000000; timeoutp = &timeout; } if (aio_suspend64(cqid->aiocb_list, cqid->size, timeoutp) < 0) { *lpOverlapped = NULL; *bytes_transferred = 0; *completion_key = 0; if ((errno == EAGAIN) || (errno == EINVAL)) { #if defined(IOMTR_OS_LINUX) assert(errno == EAGAIN); #endif SetLastError(WAIT_TIMEOUT); } else { SetLastError(errno); } return (FALSE); } // Tagging loop - to tag completed AIOs. for (j = 0; j < cqid->size; j++) { errno = 0; if (cqid->aiocb_list[j]) { if ((aio_error_return = aio_error64(cqid->aiocb_list[j])) != EINPROGRESS) { cqid->element_list[j].bytes_transferred = aio_return64(cqid->aiocb_list[j]); // // We have done an aio_return() on this element. Anull it. // The slot will be picked up by the next request. // But do not do it here. It should be done when the status // of this transaction is actually returned. // Else, ReadFile() and WriteFile() will pick up the slot // thinking it is empty. // // cqid->aiocb_list[j] = 0; cqid->element_list[j].done = TRUE; if (aio_error_return) cqid->element_list[j].error = aio_error_return; else if (errno) cqid->element_list[j].error = errno; } } } SetLastError(WAIT_TIMEOUT); return (FALSE); }
/*
** DI_force - flush a file's data to stable storage.
**
** If a slave process is configured (Di_slave), the sync is delegated to
** the slave via DI_slave_send and its status/error are copied back.
** Otherwise the sync is done locally: with async I/O enabled
** (xCL_ASYNC_IO and Di_async_io) an aio_fsync variant is issued and the
** session suspends until it completes; failing that, a plain FSYNC()
** is used.
**
** Inputs:
**	f	 - DI file control block (file properties forwarded to slave).
**	diop	 - per-operation control block holding the fd / slave event cb.
** Outputs:
**	err_code - CL error description filled in on failure.
** Returns:
**	OK on success, otherwise the failing status (FAIL or a CSsuspend /
**	slave status), after being registered via DIlru_set_di_error.
**
** The do { } while (FALSE) wrapper exists only so error paths can
** "break" to the common status-consolidation code at the bottom.
*/
static STATUS DI_force( DI_IO *f, DI_OP *diop, CL_ERR_DESC *err_code)
{
    STATUS big_status = OK, small_status = OK, intern_status = OK;
    register DI_SLAVE_CB *disl;

    do
    {
	if (Di_slave)
	{
	    disl = diop->di_evcb;
	    disl->file_op = DI_SL_SYNC;

	    /* Send file properties to slave */
	    FPROP_COPY(f->io_fprop,disl->io_fprop);

	    DI_slave_send( disl->dest_slave_no, diop, &big_status, &small_status, &intern_status);

	    if (big_status != OK )
		break;

	    /* Propagate the slave's own status and error description. */
	    if ( small_status == OK )
	    {
		if ((small_status = disl->status) != OK )
		{
		    STRUCT_ASSIGN_MACRO(disl->errcode, *err_code);
		}
	    }
	}
	else
	{
	    /*
	    ** put code in here for fsync issues
	    */
#ifdef xCL_ASYNC_IO
	    if( Di_async_io)
	    {
		DI_AIOCB *aio;

		aio=DI_get_aiocb();
		/* dr6_us5 spells the aiocb fd member "aio_filedes". */
#ifdef dr6_us5
		aio->aio.aio_filedes=diop->di_fd;
#else
		aio->aio.aio_fildes=diop->di_fd;
#endif /* dr6_us5 */
		/* Issue the async sync; AIX falls back to synchronous fsync. */
#ifdef LARGEFILE64
		if(aio_fsync64( O_SYNC, &aio->aio))
#elif defined(any_aix)
		if(fsync( aio->aio.aio_fildes ))
#else
		if(aio_fsync( O_SYNC, &aio->aio))
#endif /* LARGEFILE64 */
		{
		    SETCLERR(err_code, 0, ER_fsync);
		    small_status = FAIL;
		    break;
		}
		else
		{
		    /* Suspend this session until the disk-write event fires. */
		    if( (small_status=CSsuspend( CS_DIOW_MASK, 0, 0) ) != OK)
		    {
			DIlru_set_di_error( &small_status, err_code, DI_LRU_CSSUSPEND_ERR, DI_GENERAL_ERR);
			break;
		    }
		    /* Check the completion status of the aio_fsync itself. */
#if defined(axp_osf)
		    if ( (aio_error(&aio->aio)) != 0 )
#else
#ifdef LARGEFILE64
		    if ( (aio_error64(&aio->aio)) != 0 )
#else /* LARGEFILE64 */
		    if ( (aio_error(&aio->aio)) != 0 )
#endif /* LARGEFILE64 */
#endif
		    {
			SETCLERR(err_code, 0, ER_fsync);
			small_status = FAIL;
			break;
		    }
		}
	    }
	    else
#endif /* xCL_ASYNC_IO */
	    if (FSYNC(diop->di_fd) < 0)
	    {
#ifdef xCL_092_NO_RAW_FSYNC
		/* AIX returns EINVAL on character special files */
		if (errno != EINVAL)
#endif /* xCL_092_NO_RAW_FSYNC */
		{
		    SETCLERR(err_code, 0, ER_fsync);
		    small_status = FAIL;
		}
	    }
	}
    } while (FALSE);

    /* Consolidate: a send-level failure overrides, and any failure is
    ** registered with the DI error machinery before returning. */
    if ( big_status != OK )
	small_status = big_status;
    if ( small_status != OK )
	DIlru_set_di_error( &small_status, err_code, intern_status, DI_GENERAL_ERR);

    return( small_status );
}