Example no. 1
/* wait for multiple requests to complete */
int ADIOI_PVFS2_aio_wait_fn(int count, void ** array_of_states, 
		double timeout, MPI_Status *status)
{

    ADIOI_AIO_Request **aio_reqlist;
    PVFS_sys_op_id *op_id_array;
    int i,j, greq_count, completed_count=0;
    int *error_array;

    aio_reqlist = (ADIOI_AIO_Request **)array_of_states;

    op_id_array = (PVFS_sys_op_id*)ADIOI_Calloc(count, sizeof(PVFS_sys_op_id));
    error_array = (int *)ADIOI_Calloc(count, sizeof(int));
    greq_count = count;


    /* PVFS-2.6: testsome actually tests all requests and fills in op_id_array
     * with the ones that have completed.  count is an in/out parameter.
     * returns with the number of completed operations.  what a mess! */
    while (completed_count < greq_count ) {
	count = greq_count;
	PVFS_sys_testsome(op_id_array, &count, NULL, error_array, INT_MAX);
	completed_count += count;
	for (i=0; i< count; i++) {
	    for (j=0; j<greq_count; j++) {
		if (op_id_array[i] == aio_reqlist[j]->op_id) {
		    aio_reqlist[j]->nbytes = 
			aio_reqlist[j]->resp_io.total_completed;
		    MPI_Grequest_complete(aio_reqlist[j]->req);
		}
	    }
	}
    }
    ADIOI_Free(op_id_array);
    ADIOI_Free(error_array);
    return MPI_SUCCESS; /* TODO: no idea how to deal with errors */
}
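
For reference, a wait_fn like this one is installed as the wait callback of an MPI generalized-request class. A minimal registration sketch, modeled on the NFS code in Example no. 13 below; the helper name and the ADIOI_GEN_* query/free/poll callbacks are assumptions borrowed from there:

/* Hypothetical sketch: register ADIOI_PVFS2_aio_wait_fn with ROMIO's
 * generalized-request class extension (names other than the wait_fn are
 * assumptions). */
static MPIX_Grequest_class pvfs2_greq_class = 0;

static int pvfs2_start_request(ADIOI_AIO_Request *aio_req, MPI_Request *request)
{
    if (pvfs2_greq_class == 0) {
	MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn,
			ADIOI_GEN_aio_free_fn, MPIU_Greq_cancel_fn,
			ADIOI_GEN_aio_poll_fn, ADIOI_PVFS2_aio_wait_fn,
			&pvfs2_greq_class);
    }
    return MPIX_Grequest_class_allocate(pvfs2_greq_class, aio_req, request);
}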
Example no. 2
MPI_File MPIO_File_create(int size)
{
    MPI_File mpi_fh;

    mpi_fh = (MPI_File) ADIOI_Calloc(size,1);
    return mpi_fh;
}
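
The handle allocated above is released with ADIOI_Free; a minimal teardown sketch (ROMIO does ship an MPIO_File_free, but this body is an assumption):

void MPIO_File_free(MPI_File *mpi_fh)
{
    /* release the handle allocated by MPIO_File_create */
    ADIOI_Free(*mpi_fh);
    *mpi_fh = MPI_FILE_NULL;
}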
Example no. 3
int ADIOI_Heap_create(heap_t *heap, int size) {
    heap->size = size;
    heap->nodes = (heap_node_t *) ADIOI_Calloc (size, sizeof(heap_node_t));
    if (heap->nodes == NULL)
	return 1;
    else
	return 0;
}
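
A matching teardown would release the node array; a minimal sketch (the function name mirrors the allocation above and is an assumption):

void ADIOI_Heap_free(heap_t *heap)
{
    /* free the node array allocated in ADIOI_Heap_create */
    ADIOI_Free(heap->nodes);
    heap->nodes = NULL;
    heap->size = 0;
}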
Example no. 4
/* Wait for completion of one of the outstanding AIO requests */
int ADIOI_NTFS_aio_wait_fn(int count, void **array_of_states,
		double timeout, MPI_Status *status)
{
	int i, mpi_errno = MPI_SUCCESS;
	ADIOI_AIO_Request **aio_reqlist;
    LPHANDLE lpHandles;
    DWORD retObject = 0, dwTimeout;

    /* FIXME: Validate the args -- has it already been done by the 
       caller ? */
	aio_reqlist = (ADIOI_AIO_Request **)array_of_states;
    lpHandles = (LPHANDLE) ADIOI_Calloc(count, sizeof(HANDLE));
    if (lpHandles == NULL)
    {
	mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    "ADIOI_NTFS_aio_wait_fn", __LINE__, MPI_ERR_IO,
	    "**nomem", "**nomem %s", "Event handles");
	return mpi_errno;
    }
	/* XXX: set-up arrays of outstanding requests */
    for(i=0; i<count; i++){
        lpHandles[i] = (aio_reqlist[i])->lpOvl->hEvent;
    }

	/* XXX: wait for one request to complete */
    /* FIXME: Is the timeout in seconds ? */
    /* convert the double timeout (seconds) to milliseconds for Win32 */
    dwTimeout = (timeout <= 0) ? INFINITE : (DWORD)(timeout * 1000);

    if ((retObject = WaitForMultipleObjects(count, lpHandles,
                    FALSE, dwTimeout)) != WAIT_FAILED) {
        retObject = retObject - WAIT_OBJECT_0;
        if(GetOverlappedResult( aio_reqlist[retObject]->fd, 
                aio_reqlist[retObject]->lpOvl, &(aio_reqlist[retObject]->nbytes), 
                FALSE)){
        	/* XXX: mark completed requests as 'done'*/
            mpi_errno = MPI_Grequest_complete(aio_reqlist[retObject]->req);
    	    if (mpi_errno != MPI_SUCCESS) {
	    	    mpi_errno = MPIO_Err_create_code(MPI_SUCCESS,
				    MPIR_ERR_RECOVERABLE,
				    "ADIOI_NTFS_aio_wait_fn", __LINE__,
				    MPI_ERR_IO, "**mpi_grequest_complete",
				    0);
            }
        } else {
            if (GetLastError() == ERROR_IO_INCOMPLETE) {
                /* I/O still in progress */
                /* TODO: need to diddle with status somehow */
            } else {
                /* Error occurred */
                /* TODO: not sure how to handle this */
            }
        }
    } else {
        /* TODO: How to handle error while waiting ? */
    }
    ADIOI_Free(lpHandles);
	return mpi_errno;
}
Example no. 5
int ADIOI_NTFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		  int wr, void *handle)
{
	DWORD dwNumWritten=0, dwNumRead=0;
	BOOL ret_val = FALSE;
	FDTYPE fd_sys;

	OVERLAPPED *pOvl;

    fd_sys = fd->fd_sys;

    pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
	pOvl->hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
	pOvl->Offset = DWORDLOW(offset);
	pOvl->OffsetHigh = DWORDHIGH(offset);

	if (wr)
	{
		ret_val = WriteFile(fd_sys, buf, len, &dwNumWritten, pOvl);
		//ret_val = WriteFile(fd_sys, buf, len, &dwNumWritten, NULL);
		//if (ret_val && dwNumWritten) printf("written immediately: %d\n", dwNumWritten);
	}
	else
	{
		ret_val = ReadFile(fd_sys, buf, len, &dwNumRead, pOvl);
		//ret_val = ReadFile(fd_sys, buf, len, &dwNumRead, NULL);
	}

    if (ret_val == FALSE) 
	{
		errno = GetLastError();
		if (errno != ERROR_IO_PENDING)
		{
			if (wr)
				FPRINTF(stderr, "WriteFile error: len %d, dwNumWritten %d\n", len, dwNumWritten);
			else
				FPRINTF(stderr, "ReadFile error: len %d, dwNumRead %d\n", len, dwNumRead);
			FPRINTF(stderr, "Unknown errno %d in ADIOI_NTFS_aio\n", errno);
		    MPI_Abort(MPI_COMM_WORLD, 1);
		}
		ret_val = TRUE;
	}

    *((OVERLAPPED **) handle) = pOvl;

    return ret_val;
}
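
DWORDLOW and DWORDHIGH split the 64-bit ADIO_Offset across the two 32-bit OVERLAPPED offset fields. Presumed definitions of those macros, plus a hypothetical caller showing how the handle returned above is completed (error handling omitted):

/* Presumed shape of the offset-splitting macros used above: */
#define DWORDLOW(x)  ((DWORD)((x) & 0xFFFFFFFF))
#define DWORDHIGH(x) ((DWORD)(((x) >> 32) & 0xFFFFFFFF))

/* Caller sketch (assumption): start an asynchronous write, block on its
 * completion, then release the event and the OVERLAPPED structure. */
OVERLAPPED *pOvl;
DWORD nbytes;
ADIOI_NTFS_aio(fd, buf, len, offset, 1 /* write */, &pOvl);
GetOverlappedResult(fd->fd_sys, pOvl, &nbytes, TRUE /* wait */);
CloseHandle(pOvl->hEvent);
ADIOI_Free(pOvl);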
Example no. 6
static void ADIOI_GEN_IreadStridedColl_read(ADIOI_NBC_Request *nbc_req,
                                            int *error_code)
{
    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
    ADIOI_Iread_and_exch_vars *rae_vars = NULL;
    ADIOI_Access *my_req = vars->my_req;
    int nprocs = vars->nprocs;
    int i;

    /* my_req[] and count_my_req_per_proc aren't needed at this point, so
     * let's free the memory
     */
    ADIOI_Free(vars->count_my_req_per_proc);
    for (i = 0; i < nprocs; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);

    /* read data in sizes of no more than ADIOI_Coll_bufsize,
     * communicate, and fill user buf.
     */
    rae_vars = (ADIOI_Iread_and_exch_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_Iread_and_exch_vars));
    nbc_req->data.rd.rae_vars = rae_vars;
    rae_vars->fd = vars->fd;
    rae_vars->buf = vars->buf;
    rae_vars->datatype = vars->datatype;
    rae_vars->nprocs = vars->nprocs;
    rae_vars->myrank = vars->myrank;
    rae_vars->others_req = vars->others_req;
    rae_vars->offset_list = vars->offset_list;
    rae_vars->len_list = vars->len_list;
    rae_vars->contig_access_count = vars->contig_access_count;
    rae_vars->min_st_offset = vars->min_st_offset;
    rae_vars->fd_size = vars->fd_size;
    rae_vars->fd_start = vars->fd_start;
    rae_vars->fd_end = vars->fd_end;
    rae_vars->buf_idx = vars->buf_idx;
    rae_vars->next_fn = ADIOI_GEN_IreadStridedColl_free;

    ADIOI_Iread_and_exch(nbc_req, error_code);
}
Example no. 7
/* Parse the file-of-hints.  Format is zero or more lines of "<key> <value>\n".
 * A # in column zero is a comment and the line will be ignored.  Do our best
 * to ignore badly formed lines too.
 *
 * The caller provides an 'info' object.  Each key-value pair found by the
 * parser will get added to the info object.  Any keys already set will be left
 * alone on the assumption that the caller knows best.
 *
 * Because MPI-IO hints are optional, we can get away with limited error
 * reporting.  */
static int file_to_info(int fd, MPI_Info info)
{
    char *buffer, *token, *key, *val, *garbage;
    char *pos1=NULL, *pos2=NULL;
    int flag, ret;
    char dummy;
    struct stat statbuf;

    /* assumption: config files will be small (less than 1MB) */
    fstat(fd, &statbuf);
    /* add 1 to size to make room for NULL termination */
    buffer = (char *)ADIOI_Calloc(statbuf.st_size + 1, sizeof (char));
    if (buffer == NULL) return -1;

    ret = read(fd, buffer, statbuf.st_size);
    if (ret < 0) {
	ADIOI_Free(buffer);
	return -1;
    }
    token = strtok_r(buffer, "\n", &pos1);
    if (token == NULL) {
	/* empty file: nothing to parse */
	ADIOI_Free(buffer);
	return 0;
    }
    do {
	if ( (key = strtok_r(token, " \t", &pos2)) == NULL) 
	    /* malformed line: found no items */
	    continue;
	if (token[0] == '#') 
	    /* ignore '#'-delimited comments */
	    continue;
	if ( (val = strtok_r(NULL, " \t", &pos2))  == NULL) 
	    /* malformed line: found key without value */
	    continue;
	if ( (garbage = strtok_r(NULL, " \t", &pos2)) != NULL) 
	    /* malformed line: more than two items */
	    continue;
	    
#ifdef SYSHINT_DEBUG
	printf("found: key=%s val=%s\n", key, val);
#endif
	/* We don't care what the value is, only whether the key already
	 * exists: if it does, we leave it alone. */
	ADIOI_Info_get(info, key, 1, &dummy, &flag);
	if (flag == 1) continue;
	ADIOI_Info_set(info, key, val);
    } while ((token = strtok_r(NULL, "\n", &pos1)) != NULL);
    ADIOI_Free(buffer);
    return 0;
}
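
The header comment pins down the accepted format; here is a concrete hints file and a hypothetical caller (the path and the particular keys are illustrative only):

/* Illustrative hints file contents:
 *
 *     # site-wide MPI-IO defaults
 *     cb_nodes 8
 *     romio_cb_read enable
 *
 * and a caller sketch: */
int hfd = open("/etc/romio-hints", O_RDONLY);	/* path is an example */
if (hfd >= 0) {
    file_to_info(hfd, info);
    close(hfd);
}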
Example no. 8
static void ADIOI_Iread_and_exch_l2_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIOI_R_Iexchange_data_vars *red_vars = NULL;

    /* loop exit condition */
    if (vars->m >= vars->max_ntimes) {
        ADIOI_Iread_and_exch_fini(nbc_req, error_code);
        return;
    }

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;
    red_vars->fd = vars->fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l2_end;

    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
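
ADIOI_Iread_and_exch_l2_begin and its l2_end counterpart implement a loop over vars->m via continuations: each pass sets red_vars->next_fn and the exchange calls it back. Rendered as straight-line control flow, the logic reads roughly like this sketch (not code that appears in ROMIO):

/* Sketch: synchronous equivalent of the l2_begin/l2_end callback pair. */
for (vars->m = 0; vars->m < vars->max_ntimes; vars->m++) {
    /* l2_begin: allocate and fill red_vars, then start the exchange.
     * When it completes, ADIOI_R_Iexchange_data() invokes l2_end via
     * red_vars->next_fn, which advances vars->m and re-enters l2_begin. */
    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
ADIOI_Iread_and_exch_fini(nbc_req, error_code);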
Example no. 9
int ADIOI_SCI_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		  int wr, void *handle)
{
    int err=-1, fd_sys;

#ifndef NO_AIO
    int error_code;
#ifdef AIO_SUN 
    aio_result_t *result;
#else
    struct aiocb *aiocbp;
#endif
#endif

    fd_sys = fd->fd_sys;

#ifdef AIO_SUN
    result = (aio_result_t *) ADIOI_Malloc(sizeof(aio_result_t));
    result->aio_return = AIO_INPROGRESS;
    if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); 
    else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result);

    if (err == -1) {
	if (errno == EAGAIN) { 
       /* the man pages say EPROCLIM, but in reality errno is set to EAGAIN! */

        /* exceeded the max. no. of outstanding requests.
           complete all previous async. requests and try again.*/

	    ADIOI_Complete_async(&error_code);
	    if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); 
	    else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result);

	    while (err == -1) {
		if (errno == EAGAIN) {
                    /* sleep and try again */
                    sleep(1);
		    if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); 
		    else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result);
		}
                else {
                    FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
	    }
	}
        else {
            FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    *((aio_result_t **) handle) = result;
#endif

#ifdef NO_FD_IN_AIOCB
/* IBM */
    aiocbp = (struct aiocb *) ADIOI_Malloc(sizeof(struct aiocb));
    aiocbp->aio_whence = SEEK_SET;
    aiocbp->aio_offset = offset;
    aiocbp->aio_buf = buf;
    aiocbp->aio_nbytes = len;
    if (wr) err = aio_write(fd_sys, aiocbp);
    else err = aio_read(fd_sys, aiocbp);

    if (err == -1) {
	if (errno == EAGAIN) {
        /* exceeded the max. no. of outstanding requests.
          complete all previous async. requests and try again. */

	    ADIOI_Complete_async(&error_code);
	    if (wr) err = aio_write(fd_sys, aiocbp);
	    else err = aio_read(fd_sys, aiocbp);

            while (err == -1) {
                if (errno == EAGAIN) {
                    /* sleep and try again */
                    sleep(1);
		    if (wr) err = aio_write(fd_sys, aiocbp);
		    else err = aio_read(fd_sys, aiocbp);
		}
                else {
                    FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
                    MPI_Abort(MPI_COMM_WORLD, 1);
                }
            }
	}
        else {
            FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    *((struct aiocb **) handle) = aiocbp;

#elif (!defined(NO_AIO) && !defined(AIO_SUN))
/* DEC, SGI IRIX 5 and 6 */

    aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1);
    aiocbp->aio_fildes = fd_sys;
    aiocbp->aio_offset = offset;
    aiocbp->aio_buf = buf;
    aiocbp->aio_nbytes = len;

#ifdef AIO_PRIORITY_DEFAULT
/* DEC */
    aiocbp->aio_reqprio = AIO_PRIO_DFL;   /* not needed in DEC Unix 4.0 */
    aiocbp->aio_sigevent.sigev_signo = 0;
#else
    aiocbp->aio_reqprio = 0;
#endif

#ifdef AIO_SIGNOTIFY_NONE
/* SGI IRIX 6 */
    aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
#else
    aiocbp->aio_sigevent.sigev_signo = 0;
#endif

    if (wr) err = aio_write(aiocbp);
    else err = aio_read(aiocbp);

    if (err == -1) {
	if (errno == EAGAIN) {
        /* exceeded the max. no. of outstanding requests.
           complete all previous async. requests and try again. */

	    ADIOI_Complete_async(&error_code);
	    if (wr) err = aio_write(aiocbp);
	    else err = aio_read(aiocbp);

	    while (err == -1) {
		if (errno == EAGAIN) {
		    /* sleep and try again */
		    sleep(1);
		    if (wr) err = aio_write(aiocbp);
		    else err = aio_read(aiocbp);
		}
		else {
		    FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
		    MPI_Abort(MPI_COMM_WORLD, 1);
		}
	    }
        }
	else {
	    FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
	    MPI_Abort(MPI_COMM_WORLD, 1);
	}
    }

    *((struct aiocb **) handle) = aiocbp;
#endif

    return err;
}
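
The same EAGAIN recovery appears three times in the function above: drain all outstanding requests once, then retry with a one-second backoff. A hypothetical helper for the POSIX-aiocb variants that makes the policy explicit:

/* Hypothetical refactoring of the retry policy used above. */
static int ADIOI_SCI_aio_retry(int (*op)(struct aiocb *), struct aiocb *aiocbp)
{
    int err, error_code;

    if ((err = op(aiocbp)) != -1) return err;
    if (errno != EAGAIN) {
	FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
	MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* too many outstanding requests: complete them all, then retry */
    ADIOI_Complete_async(&error_code);
    while ((err = op(aiocbp)) == -1) {
	if (errno != EAGAIN) {
	    FPRINTF(stderr, "Unknown errno %d in ADIOI_SCI_aio\n", errno);
	    MPI_Abort(MPI_COMM_WORLD, 1);
	}
	sleep(1);	/* sleep and try again */
    }
    return err;
}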
Example no. 10
void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status,
                           int *error_code)
{
    LONG dwTemp;
    DWORD dwNumRead = 0;
    int err=-1, datatype_size, len;
    static char myname[] = "ADIOI_NTFS_ReadContig";
    OVERLAPPED *pOvl;

    MPI_Type_size(datatype, &datatype_size);
    len = datatype_size * count;

    pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (pOvl == NULL)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**nomem", "**nomem %s", "OVERLAPPED");
        return;
    }
    pOvl->hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
    if (pOvl->hEvent == NULL)
    {
        err = GetLastError();
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", ADIOI_NTFS_Strerror(err));
        ADIOI_Free(pOvl);
        return;
    }
    pOvl->Offset = DWORDLOW(offset);
    pOvl->OffsetHigh = DWORDHIGH(offset);

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
    {
        if (fd->fp_sys_posn != offset)
        {
            dwTemp = DWORDHIGH(offset);
            if (SetFilePointer(fd->fd_sys, DWORDLOW(offset), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
            {
                err = GetLastError();
                if (err != NO_ERROR)
                {
                    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                       myname, __LINE__, MPI_ERR_IO,
                                                       "**io", "**io %s", ADIOI_NTFS_Strerror(err));
                    CloseHandle(pOvl->hEvent);
                    ADIOI_Free(pOvl);
                    return;
                }
            }
        }
        /*
        {
            ADIO_Fcntl_t fcntl_struct;
            int error_code;
            ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, &fcntl_struct, &error_code);
            printf("File size b: %d\n", fcntl_struct.fsize);
        }
        printf("ReadFile(%d bytes)\n", len);fflush(stdout);
        */
        err = ReadFile(fd->fd_sys, buf, len, &dwNumRead, pOvl);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            err = GetLastError();
            switch (err)
            {
            case ERROR_IO_PENDING:
                break;
            case ERROR_HANDLE_EOF:
                /*printf("EOF error\n");fflush(stdout);*/
                SetEvent(pOvl->hEvent);
                break;
            default:
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io",
                                                   "**io %s", ADIOI_NTFS_Strerror(err));
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumRead, TRUE);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            err = GetLastError();
            if (err != ERROR_HANDLE_EOF) /* Ignore EOF errors */
            {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**io",
                                                   "**io %s", ADIOI_NTFS_Strerror(err));
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        if (!CloseHandle(pOvl->hEvent))
        {
            err = GetLastError();
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io", "**io %s", ADIOI_NTFS_Strerror(err));
            ADIOI_Free(pOvl);
            return;
        }
        ADIOI_Free(pOvl);

        fd->fp_sys_posn = offset + (ADIO_Offset)dwNumRead;
        /* individual file pointer not updated */
    }
    else
    {
        /* read from curr. location of ind. file pointer */
        if (fd->fp_sys_posn != fd->fp_ind)
        {
            dwTemp = DWORDHIGH(fd->fp_ind);
            if (SetFilePointer(fd->fd_sys, DWORDLOW(fd->fp_ind), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
            {
                err = GetLastError();
                if (err != NO_ERROR)
                {
                    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                       myname, __LINE__, MPI_ERR_IO,
                                                       "**io", "**io %s", ADIOI_NTFS_Strerror(err));
                    CloseHandle(pOvl->hEvent);
                    ADIOI_Free(pOvl);
                    return;
                }
            }
        }
        /*
        {
            ADIO_Fcntl_t fcntl_struct;
            int error_code;
            ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, &fcntl_struct, &error_code);
            printf("File size c: %d\n", fcntl_struct.fsize);
        }
        printf("ReadFile(%d bytes)\n", len);fflush(stdout);
        */
        err = ReadFile(fd->fd_sys, buf, len, &dwNumRead, pOvl);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            err = GetLastError();
            switch (err)
            {
            case ERROR_IO_PENDING:
                break;
            case ERROR_HANDLE_EOF:
                /*printf("EOF error\n");fflush(stdout);*/
                SetEvent(pOvl->hEvent);
                break;
            default:
                *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   myname, __LINE__, MPI_ERR_IO,
                                                   "**io",
                                                   "**io %s", ADIOI_NTFS_Strerror(err));
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumRead, TRUE);
        /* --BEGIN ERROR HANDLING-- */
        if (err == FALSE)
        {
            err = GetLastError();
            if (err != ERROR_HANDLE_EOF) /* Ignore EOF errors */
            {
                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO, "**io",
                                                   "**io %s", ADIOI_NTFS_Strerror(err));
                CloseHandle(pOvl->hEvent);
                ADIOI_Free(pOvl);
                return;
            }
        }
        /* --END ERROR HANDLING-- */
        if (!CloseHandle(pOvl->hEvent))
        {
            err = GetLastError();
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__, MPI_ERR_IO,
                                               "**io", "**io %s", ADIOI_NTFS_Strerror(err));
            ADIOI_Free(pOvl);
            return;
        }
        ADIOI_Free(pOvl);

        fd->fp_ind = fd->fp_ind + (ADIO_Offset)dwNumRead;
        fd->fp_sys_posn = fd->fp_ind;
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (err != FALSE)
    {
        MPIR_Status_set_bytes(status, datatype, dwNumRead);
    }
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == FALSE)
    {
        err = GetLastError();
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", ADIOI_NTFS_Strerror(err));
        return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}
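
With HAVE_STATUS_SET_BYTES the transferred byte count is recorded in the status object, so a caller can recover it with MPI_Get_count; a usage sketch (variable names are illustrative):

/* Usage sketch: read at an explicit offset, then recover the byte count. */
ADIO_Status status;
int error_code, nread;
ADIOI_NTFS_ReadContig(fd, buf, count, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                      offset, &status, &error_code);
MPI_Get_count(&status, MPI_BYTE, &nread);	/* bytes actually read */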
Example no. 11
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
					MPI_Datatype datatype, int nprocs,
					int myrank, ADIOI_Access *others_req,
                                        ADIOI_Access *my_req,
					ADIO_Offset *offset_list,
                                        ADIO_Offset *len_list, 
					int contig_access_count,
                                        int *striping_info, int **buf_idx,
                                        int *error_code)
{
    /* Send data to appropriate processes and write in sizes of no more
     * than lustre stripe_size.
     * The idea is to reduce the amount of extra memory required for
     * collective I/O. If all data were written all at once, which is much
     * easier, it would require temp space more than the size of user_buf,
     * which is often unacceptable. For example, to write a distributed
     * array to a file, where each local array is 8Mbytes, requiring
     * at least another 8Mbytes of temp space is unacceptable.
     */

    int hole, i, j, m, flag, ntimes = 1, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
    ADIO_Offset max_size, step_size = 0;
    int real_size, req_len, send_len;
    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
    int *send_curr_offlen_ptr, *send_size;
    int *sent_to_proc, *recv_start_pos;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    int *this_buf_idx;
    char *write_buf = NULL;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
    int data_sieving = 0;
    ADIO_Offset *srt_off = NULL;
    int *srt_len = NULL;
    int srt_num = 0;
    ADIO_Offset block_offset;
    int block_len;

    *error_code = MPI_SUCCESS;	/* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of writes of stripe size to be done.
     * That gives the no. of communication phases as well.
     * Note:
     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
     *   each process has the same no. of communication phases.
     */

    for (i = 0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }
    for (i = 0; i < nprocs; i++) {
	for (j = 0; j < others_req[i].count; j++) {
	    st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
	}
    }
    /* this process does no writing. */
    if ((st_loc == -1) && (end_loc == -1))
	ntimes = 0;
    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
    /* avoid min_st_loc be -1 */
    if (st_loc == -1)
        st_loc = max_end_loc;
    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
    /* align downward */
    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;

    /* Each time, only avail_cb_nodes number of IO clients perform IO,
     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
     * and ntimes=whole_file_portion/step_size
     */
    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
    max_ntimes = (max_end_loc - min_st_loc + 1) / step_size
        + (((max_end_loc - min_st_loc + 1) % step_size) ? 1 : 0);
/*     max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); */
    if (ntimes)
	write_buf = (char *) ADIOI_Malloc(stripe_size);

    /* calculate the start offset for each iteration */
    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
    for (m = 0; m < max_ntimes; m ++)
        off_list[m] = max_end_loc;
    for (i = 0; i < nprocs; i++) {
        for (j = 0; j < others_req[i].count; j ++) {
            req_off = others_req[i].offsets[j];
            m = (int)((req_off - min_st_loc) / step_size);
            off_list[m] = MPL_MIN(off_list[m], req_off);
        }
    }

    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* their use is explained below. calloc initializes to 0. */

    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer */

    this_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));

    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* used to store the starting value of recv_curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);
    /* I need to check if there are any outstanding nonblocking writes to
     * the file, which could potentially interfere with the writes taking
     * place in this collective write call. Since this is not likely to be
     * common, let me do the simplest thing possible here: Each process
     * completes all pending nonblocking operations before completing.
     */
    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    iter_st_off = min_st_loc;

    /* Although we have reorganized the data according to OST index,
     * a read-modify-write will be done if there is a hole between the data.
     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
     * then rank0 will collect data [0, 30] and [60, 90] then write. There
     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
     *
     * To reduce its impact on performance, data sieving can be disabled
     * with the hint "ds_in_coll".
     */
    /* check the hint for data sieving */
    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;

    for (m = 0; m < max_ntimes; m++) {
	/* go through all others_req and my_req to check which will be received
         * and sent in this iteration.
         */

	/* Note that MPI guarantees that displacements in filetypes are in
	   monotonically nondecreasing order and that, for writes, the
	   filetypes cannot specify overlapping regions in the file. This
	   simplifies implementation a bit compared to reads. */

	/*
           off         = start offset in the file for the data to be written in
                         this iteration
           iter_st_off = start offset of this iteration
           real_size   = size of data written (bytes) corresponding to off
           max_size    = possible maximum size of data written in this iteration
           req_off     = offset in the file for a particular contiguous request minus
                         what was satisfied in previous iteration
           send_off    = offset the request needed by other processes in this iteration
           req_len     = size corresponding to req_off
           send_len    = size corresponding to send_off
         */

	/* first calculate what should be communicated */
	for (i = 0; i < nprocs; i++)
	    recv_count[i] = recv_size[i] = send_size[i] = 0;

        off = off_list[m];
        max_size = MPL_MIN(step_size, max_end_loc - iter_st_off + 1);
        real_size = (int) MPL_MIN((off / stripe_size + 1) * stripe_size -
                                    off,
                                    end_loc - off + 1);

	for (i = 0; i < nprocs; i++) {
            if (my_req[i].count) {
                this_buf_idx[i] = buf_idx[i][send_curr_offlen_ptr[i]];
                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
                    send_off = my_req[i].offsets[j];
                    send_len = my_req[i].lens[j];
                    if (send_off < iter_st_off + max_size) {
                        send_size[i] += send_len;
                    } else {
                        break;
                    }
                }
                send_curr_offlen_ptr[i] = j;
            }
	    if (others_req[i].count) {
		recv_start_pos[i] = recv_curr_offlen_ptr[i];
		for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
		    if (req_off < iter_st_off + max_size) {
			recv_count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIU_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIU_Upint)(write_buf+req_off-off));
			MPI_Address(write_buf + req_off - off,
				    &(others_req[i].mem_ptrs[j]));
                        recv_size[i] += req_len;
		    } else {
			break;
                    }
		}
		recv_curr_offlen_ptr[i] = j;
	    }
	}
        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
        hole = data_sieving;
	ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                                     len_list, send_size, recv_size, off, real_size,
                                     recv_count, recv_start_pos,
                                     sent_to_proc, nprocs, myrank,
                                     buftype_is_contig, contig_access_count,
                                     striping_info, others_req, send_buf_idx,
                                     curr_to_proc, done_to_proc, &hole, m,
                                  buftype_extent, this_buf_idx,
                                  &srt_off, &srt_len, &srt_num, error_code);

	if (*error_code != MPI_SUCCESS)
            goto over;

	flag = 0;
	for (i = 0; i < nprocs; i++)
	    if (recv_count[i]) {
		flag = 1;
		break;
	    }
	if (flag) {
            /* check whether to do data sieving */
            if(data_sieving == ADIOI_HINT_ENABLE) {
	        ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
			         ADIO_EXPLICIT_OFFSET, off, &status,
			         error_code);
            } else {
                /* if there is no hole, write data in one time;
                 * otherwise, write data in several times */
                if (!hole) {
                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, off, &status,
                                     error_code);
                } else {
                    block_offset = -1;
                    block_len = 0;
                    for (i = 0; i < srt_num; ++i) {
                        if (srt_off[i] < off + real_size &&
                            srt_off[i] >= off) {
                            if (block_offset == -1) {
                                block_offset = srt_off[i];
                                block_len = srt_len[i];
                            } else {
                                if (srt_off[i] == block_offset + block_len) {
                                    block_len += srt_len[i];
                                } else {
                                    ADIO_WriteContig(fd,
                                                     write_buf + block_offset - off,
                                                     block_len,
                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                                     block_offset, &status,
                                                     error_code);
	                            if (*error_code != MPI_SUCCESS)
		                        goto over;
                                    block_offset = srt_off[i];
                                    block_len = srt_len[i];
                                }
                            }
                        }
                    }
                    if (block_offset != -1) {
                        ADIO_WriteContig(fd,
                                         write_buf + block_offset - off,
                                         block_len,
                                         MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                         block_offset, &status,
                                         error_code);
                        if (*error_code != MPI_SUCCESS)
                            goto over;
                    }
                }
            }
	    if (*error_code != MPI_SUCCESS)
		goto over;
	}
        iter_st_off += max_size;
    }
over:
    if (srt_off)
        ADIOI_Free(srt_off);
    if (srt_len)
        ADIOI_Free(srt_len);
    if (ntimes)
	ADIOI_Free(write_buf);
    ADIOI_Free(recv_curr_offlen_ptr);
    ADIOI_Free(send_curr_offlen_ptr);
    ADIOI_Free(recv_count);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(recv_start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
    ADIOI_Free(this_buf_idx);
    ADIOI_Free(off_list);
}
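
A worked instance of the phase arithmetic above (all numbers invented): with stripe_size = 1 MiB and avail_cb_nodes = 4, step_size is 4 MiB, so an aggregate extent of 9 MiB needs ceil(9/4) = 3 communication/write phases:

/* Worked instance of the max_ntimes computation (values invented). */
ADIO_Offset stripe_size = 1 << 20;		/* 1 MiB stripes             */
ADIO_Offset step_size = 4 * stripe_size;	/* avail_cb_nodes = 4        */
ADIO_Offset extent = 9 * stripe_size;		/* max_end_loc-min_st_loc+1  */
int max_ntimes = (int)(extent / step_size
	+ ((extent % step_size) ? 1 : 0));	/* == 3 */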
Example no. 12
/* ADIOI_Exchange_file_views - Sends all the aggregators the file
 * views and file view states of the clients.  It fills in the
 * client_file_view_state_arr for the aggregators and the
 * my_mem_view_state for the client.  It also initializes the
 * agg_file_view_state for all clients, which is the view for each
 * aggregator of a client's filetype. */
void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
                           ADIO_File fd, int count,
                           MPI_Datatype datatype, ADIO_Offset off,
                           view_state * my_mem_view_state_arr,
                           view_state * agg_file_view_state_arr,
                           view_state * client_file_view_state_arr)
{
    /* Convert my own fileview to an ADIOI_Flattened type and a
     * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
     * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
     * to/from each of the aggregators with the rest of the file view
     * state. */

    int i = -1, j = -1;
    amount_and_extra_data_t *send_count_arr = NULL;
    amount_and_extra_data_t *recv_count_arr = NULL;
    int send_req_arr_sz = 0;
    int recv_req_arr_sz = 0;
    MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
    MPI_Status *statuses = NULL;
    ADIO_Offset disp_off_sz_ext_typesz[6];
    MPI_Aint memtype_extent, filetype_extent;
    int ret = -1;

    /* parameters for datatypes */
    ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
    MPI_Count memtype_sz = -1;
    int memtype_is_contig = -1;
    ADIO_Offset filetype_sz = -1;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5014, 0, NULL);
#endif
    /* The memtype will be freed after the call.  The filetype will be
     * freed in the close and should have been flattened in the file
     * view. */
    MPI_Type_size_x(datatype, &memtype_sz);
    MPI_Type_extent(datatype, &memtype_extent);
    if (memtype_sz == memtype_extent) {
        memtype_is_contig = 1;
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
        flat_mem_p->blocklens[0] = memtype_sz * count;
    } else {
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(fd->filetype, &filetype_sz);
    flat_file_p = ADIOI_Flatten_and_find(fd->filetype);
    if (filetype_extent == filetype_sz) {
        flat_file_p->blocklens[0] = memtype_sz * count;
        filetype_extent = memtype_sz * count;
        filetype_sz = filetype_extent;
    }

    disp_off_sz_ext_typesz[0] = fd->fp_ind;
    disp_off_sz_ext_typesz[1] = fd->disp;
    disp_off_sz_ext_typesz[2] = off;
    disp_off_sz_ext_typesz[3] = memtype_sz * count;
    disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
    disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
        send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
    } else {
        send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes, sizeof(amount_and_extra_data_t));

        /* only aggregators receive data */
        if (fd->is_agg) {
            recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
            recv_req_arr = ADIOI_Malloc(nprocs * sizeof(MPI_Request));
            for (i = 0; i < nprocs; i++)
                MPI_Irecv(&recv_count_arr[i], sizeof(amount_and_extra_data_t),
                          MPI_BYTE, i, COUNT_EXCH, fd->comm, &recv_req_arr[i]);
        }

        /* only send data to aggregators */
        send_req_arr = ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            send_count_arr[i].count = flat_file_p->count;
            send_count_arr[i].fp_ind = disp_off_sz_ext_typesz[0];
            send_count_arr[i].disp = disp_off_sz_ext_typesz[1];
            send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[i].sz = disp_off_sz_ext_typesz[3];
            send_count_arr[i].ext = disp_off_sz_ext_typesz[4];
            send_count_arr[i].type_sz = disp_off_sz_ext_typesz[5];
            MPI_Isend(&send_count_arr[i], sizeof(amount_and_extra_data_t),
                      MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH, fd->comm, &send_req_arr[i]);
        }
    }


    /* Every client has to build mem and file view_states for each aggregator.
     * We initialize their values here, and we also initialize
     * send_count_arr */

    if (memtype_is_contig) {
        /* if memory is contiguous, we now replace memtype_sz and
         * memtype_extent with the full access size */
        memtype_sz *= count;
        memtype_extent = memtype_sz;
    }

    for (i = 0; i < fd->hints->cb_nodes; i++) {
        int tmp_agg_idx = fd->hints->ranklist[i];
        memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        my_mem_view_state_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
        my_mem_view_state_arr[tmp_agg_idx].ext = (ADIO_Offset) memtype_extent;
        my_mem_view_state_arr[tmp_agg_idx].type_sz = (ADIO_Offset) memtype_sz;
        my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
        ADIOI_init_view_state(file_ptr_type, 1, &(my_mem_view_state_arr[tmp_agg_idx]), TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1, &(my_mem_view_state_arr[tmp_agg_idx]), REAL_OFF);

        memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        agg_file_view_state_arr[tmp_agg_idx].fp_ind = disp_off_sz_ext_typesz[0];
        agg_file_view_state_arr[tmp_agg_idx].disp = disp_off_sz_ext_typesz[1];
        agg_file_view_state_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
        agg_file_view_state_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
        agg_file_view_state_arr[tmp_agg_idx].ext = disp_off_sz_ext_typesz[4];
        agg_file_view_state_arr[tmp_agg_idx].type_sz = disp_off_sz_ext_typesz[5];
        agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;

        ADIOI_init_view_state(file_ptr_type, 1, &(agg_file_view_state_arr[tmp_agg_idx]), TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1, &(agg_file_view_state_arr[tmp_agg_idx]), REAL_OFF);

        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            send_count_arr[tmp_agg_idx].count = flat_file_p->count;
            send_count_arr[tmp_agg_idx].fp_ind = disp_off_sz_ext_typesz[0];
            send_count_arr[tmp_agg_idx].disp = disp_off_sz_ext_typesz[1];
            send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
            send_count_arr[tmp_agg_idx].ext = disp_off_sz_ext_typesz[4];
            send_count_arr[tmp_agg_idx].type_sz = disp_off_sz_ext_typesz[5];
        }
    }

#ifdef DEBUG2
    fprintf(stderr, "my own flattened memtype: ");
    ADIOI_Print_flatlist_node(flat_mem_p);
    fprintf(stderr, "my own flattened filetype: ");
    ADIOI_Print_flatlist_node(flat_file_p);
#endif

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE,
                           recv_count_arr, sizeof(amount_and_extra_data_t), MPI_BYTE, fd->comm);
        if (ret != MPI_SUCCESS) {
            fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed " "with error %d", ret);
            return;
        }
    } else {
#ifdef MPI_STATUSES_IGNORE
        statuses = MPI_STATUSES_IGNORE;
#else
        statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
#endif
        if (fd->is_agg) {
            MPI_Waitall(nprocs, recv_req_arr, statuses);
            ADIOI_Free(recv_req_arr);
        }
        MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
#ifndef MPI_STATUSES_IGNORE
        ADIOI_Free(statuses);
#endif
        ADIOI_Free(send_req_arr);
    }
#ifdef DEBUG2
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        fprintf(stderr, "recv_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
        }
        fprintf(stderr, "\n");
    } else {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        if (fd->is_agg) {
            fprintf(stderr, "recv_count_arr:");
            for (i = 0; i < nprocs; i++) {
                fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
            }
            fprintf(stderr, "\n");
        }
    }
#endif

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        for (i = 0; i < fd->hints->cb_nodes; i++)
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
    }
    /* Figure out how many counts to send/recv */
    for (i = 0; i < nprocs; i++) {
        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
        }
        /* Only aggregators should recv */
        if (fd->is_agg) {
            if (recv_count_arr[i].count > 0) {
                if ((client_file_view_state_arr[i].flat_type_p =
                     (ADIOI_Flatlist_node *) ADIOI_Malloc(sizeof(ADIOI_Flatlist_node))) == NULL) {
                    fprintf(stderr, "ADIOI_Exchange_file_views: malloc " "flat_type_p failed\n");
                }
                client_file_view_state_arr[i].flat_type_p->count = recv_count_arr[i].count;
                client_file_view_state_arr[i].flat_type_p->indices =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count, sizeof(ADIO_Offset));
                client_file_view_state_arr[i].flat_type_p->blocklens =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count, sizeof(ADIO_Offset));

                /* Copy the extra data out of the stuff we Alltoall'd */
                memcpy(&client_file_view_state_arr[i].fp_ind,
                       &recv_count_arr[i].fp_ind, 6 * sizeof(ADIO_Offset));

                recv_req_arr_sz++;
            }
        }
    }

    /* Since ADIOI_Calloc may do other things we add the +1
     * to avoid a 0-size malloc */
    send_req_arr = (MPI_Request *) ADIOI_Calloc(2 * (send_req_arr_sz) + 1, sizeof(MPI_Request));

    j = 0;
    if (recv_req_arr_sz > 0) {
        assert(fd->is_agg);
        recv_req_arr = (MPI_Request *) ADIOI_Calloc(2 * (recv_req_arr_sz), sizeof(MPI_Request));
        for (i = 0; i < nprocs; i++) {
            if (recv_count_arr[i].count > 0) {
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &recv_req_arr[j]);
                j++;
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &recv_req_arr[j]);
                j++;
            }
        }
    }

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &send_req_arr[j]);
                j++;
            }
        }
    } else {
        j = 0;
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], INDICES, fd->comm, &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], BLOCK_LENS, fd->comm, &send_req_arr[j]);
                j++;
            }
        }
    }

    /* Since ADIOI_Malloc may do other things we add the +1
     * to avoid a 0-size malloc */
#ifdef MPI_STATUSES_IGNORE
    statuses = MPI_STATUSES_IGNORE;
#else
    statuses = (MPI_Status *)
        ADIOI_Malloc(1 + 2 * MPL_MAX(send_req_arr_sz, recv_req_arr_sz)
                     * sizeof(MPI_Status));
#endif

    if (send_req_arr_sz > 0) {
        MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
        ADIOI_Free(send_count_arr);
        ADIOI_Free(send_req_arr);
    }
    if (recv_req_arr_sz > 0) {
        MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
        ADIOI_Free(recv_count_arr);
        ADIOI_Free(recv_req_arr);
    }
#ifndef MPI_STATUSES_IGNORE
    ADIOI_Free(statuses);
#endif

    if (fd->is_agg == 1) {
        ADIOI_init_view_state(file_ptr_type, nprocs, client_file_view_state_arr, TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, nprocs, client_file_view_state_arr, REAL_OFF);
    }
#ifdef DEBUG
    if (fd->is_agg == 1) {
        ADIOI_Flatlist_node *fr_node_p;
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
                    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
                    client_file_view_state_arr[i].fp_ind,
                    client_file_view_state_arr[i].disp,
                    client_file_view_state_arr[i].byte_off,
                    client_file_view_state_arr[i].sz, client_file_view_state_arr[i].ext);
        }

        fr_node_p = ADIOI_Flatten_and_find(fd->file_realm_types[fd->my_cb_nodes_index]);
        assert(fr_node_p != NULL);

        fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
                fd->my_cb_nodes_index, fd->file_realm_st_offs[fd->my_cb_nodes_index]);
        ADIOI_Print_flatlist_node(fr_node_p);
    }
#endif

#ifdef DEBUG2
    if (fd->is_agg == 1) {
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
            ADIOI_Print_flatlist_node(client_file_view_state_arr[i].flat_type_p);
        }
    }
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5015, 0, NULL);
#endif
}
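
The memcpy of six ADIO_Offsets starting at fp_ind pins down the field order of amount_and_extra_data_t; a presumed matching definition (the authoritative one lives in a ROMIO header):

/* Presumed layout implied by the six-offset memcpy above. */
typedef struct {
    int count;			/* number of flattened filetype entries */
    ADIO_Offset fp_ind;		/* individual file pointer              */
    ADIO_Offset disp;		/* file-view displacement               */
    ADIO_Offset byte_off;	/* byte offset of this access           */
    ADIO_Offset sz;		/* total bytes in this access           */
    ADIO_Offset ext;		/* filetype extent                      */
    ADIO_Offset type_sz;	/* filetype size                        */
} amount_and_extra_data_t;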
Example no. 13
int ADIOI_NFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		  int wr, MPI_Request *request)
{
    int err=-1, fd_sys;
    int error_code, this_errno;

    struct aiocb *aiocbp;
    ADIOI_AIO_Request *aio_req;

    fd_sys = fd->fd_sys;

    aio_req = (ADIOI_AIO_Request*)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
    aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1);
    aiocbp->aio_offset = offset;
    aiocbp->aio_buf    = buf;
    aiocbp->aio_nbytes = len;

#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_WHENCE
    aiocbp->aio_whence = SEEK_SET;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_FILDES
    aiocbp->aio_fildes = fd_sys;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_SIGEVENT
# ifdef AIO_SIGNOTIFY_NONE
    aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
# endif
    aiocbp->aio_sigevent.sigev_signo = 0;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_REQPRIO
# ifdef AIO_PRIO_DFL
    aiocbp->aio_reqprio = AIO_PRIO_DFL;   /* not needed in DEC Unix 4.0 */
# else
    aiocbp->aio_reqprio = 0;
# endif
#endif

    if (wr) ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
    else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);

#ifndef ROMIO_HAVE_AIO_CALLS_NEED_FILEDES
    if (wr) err = aio_write(aiocbp);
    else err = aio_read(aiocbp);
#else
    /* Broken IBM interface */
    if (wr) err = aio_write(fd_sys, aiocbp);
    else err = aio_read(fd_sys, aiocbp);
#endif

    this_errno = errno;
    ADIOI_UNLOCK(fd, offset, SEEK_SET, len);

    if (err == -1) {
	if (this_errno == EAGAIN) {
        /* exceeded the max. no. of outstanding requests.
           complete all previous async. requests and try again. */
	    if (wr)
		ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
				offset, NULL, &error_code);
	    else
		ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
				offset, NULL, &error_code);
	    MPIO_Completed_request_create(&fd, len, &error_code, request);
	    return 0;
	} else {
	    return -this_errno;
	}
    }
    aio_req->aiocbp = aiocbp;
    if (ADIOI_GEN_greq_class == 0) {
	    MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn, 
			    ADIOI_GEN_aio_free_fn, MPIU_Greq_cancel_fn, 
			    ADIOI_GEN_aio_poll_fn, ADIOI_GEN_aio_wait_fn, 
			    &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, aio_req, request);
    memcpy(&(aio_req->req), request, sizeof(MPI_Request));
    return 0;
}
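
Because the request comes from a generalized-request class, it completes through the registered poll/wait callbacks and the caller treats it like any other MPI request; a usage sketch:

/* Usage sketch: wait on the generalized request returned above. */
MPI_Request request;
MPI_Status status;
if (ADIOI_NFS_aio(fd, buf, len, offset, 1 /* write */, &request) == 0)
    MPI_Wait(&request, &status);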
Example no. 14
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen, myrank, flag, set_layout=0, err;
    struct lov_user_md *lum = NULL;
    char *value;
    ADIO_Offset str_factor = -1, str_unit=0, start_iodev=-1;
    size_t value_sz = (MPI_MAX_INFO_VAL+1)*sizeof(char);

#if defined(MPICH) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif

    MPI_Comm_rank(fd->comm, &myrank);

    if (fd->perm == ADIO_PERM_NULL) {
	old_mask = umask(022);
	umask(old_mask);
	perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
	amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	amode = amode | O_EXCL;

    amode_direct = amode | O_DIRECT;

    /* odd length here because lov_user_md contains some fixed data and
     * then a list of 'lmm_objects' representing the stripes */
    lumlen = sizeof(struct lov_user_md) +
	    MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
    lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);

    value = (char *) ADIOI_Malloc(value_sz);
    /* we already validated in LUSTRE_SetInfo that these are going to be the same */
    if (fd->info != MPI_INFO_NULL) {
	/* striping information */
	ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL,
		value, &flag);
	if (flag)
	    str_unit=atoll(value);

	ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL,
		value, &flag);
	if (flag)
	    str_factor=atoll(value);

	ADIOI_Info_get(fd->info, "romio_lustre_start_iodevice",
		MPI_MAX_INFO_VAL, value, &flag);
	if (flag)
	    start_iodev=atoll(value);
    }
    if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0))
	set_layout = 1;

    /* if hints were set, we need to delay creation of any lustre objects.
     * However, if we open the file with O_LOV_DELAY_CREATE and don't call the
     * follow-up ioctl, subsequent writes will fail */
    if (myrank == 0 && set_layout)
	amode = amode | O_LOV_DELAY_CREATE;

    fd->fd_sys = open(fd->filename, amode, perm);
    if (fd->fd_sys == -1) goto fn_exit;

    /* we can only set these hints on new files */
    /* It was strange and buggy to open the file in the hint path.  Instead,
     * we'll apply the file tunings at open time */
    if ((amode & O_CREAT) && set_layout ) {
	/* if user has specified striping info, first aggregator tries to set
	 * it */
	if (myrank == fd->hints->ranklist[0] || fd->comm == MPI_COMM_SELF) {
	    lum->lmm_magic = LOV_USER_MAGIC;
	    lum->lmm_pattern = 0;
	    /* crude check for overflow of lustre internal datatypes.
	     * Silently cap to a large value if the user provides a value
	     * larger than lustre supports */
	    if (str_unit > UINT_MAX)
	            lum->lmm_stripe_size = UINT_MAX;
	    else
	            lum->lmm_stripe_size = str_unit;

	    if (str_factor > USHRT_MAX)
	            lum->lmm_stripe_count = USHRT_MAX;
	    else
	            lum->lmm_stripe_count = str_factor;

	    if (start_iodev > USHRT_MAX)
	             lum->lmm_stripe_offset = USHRT_MAX;
	    else
	            lum->lmm_stripe_offset = start_iodev;
	    err = ioctl(fd->fd_sys, LL_IOC_LOV_SETSTRIPE, lum);
	    if (err == -1 && errno != EEXIST) {
		fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
		/* not a fatal error, but user might care to know */
	    }
	} /* End of striping parameters validation */
    }

    /* Pascal Deveze reports that, even though we pass a
     * "GETSTRIPE" (read) flag to the ioctl, the call can give an error if
     * some of the values of this struct are uninitialized.  Zero it out in
     * case there are other members that must be initialized and in case the
     * lov_user_md struct changes in the future */
    memset(lum, 0, lumlen);
    lum->lmm_magic = LOV_USER_MAGIC;
    err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
    if (!err) {

	fd->hints->striping_unit = lum->lmm_stripe_size;
	MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_size);
	ADIOI_Info_set(fd->info, "striping_unit", value);

	fd->hints->striping_factor = lum->lmm_stripe_count;
	MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_count);
	ADIOI_Info_set(fd->info, "striping_factor", value);

	fd->hints->start_iodevice = lum->lmm_stripe_offset;
	MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_offset);
	ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

    }

    if (fd->access_mode & ADIO_APPEND)
	fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
	fd->fd_direct = open(fd->filename, amode_direct, perm);
	if (fd->fd_direct != -1) {
	    fd->d_mem = fd->d_miniosz = (1<<12);
	} else {
	    perror("cannot open file with O_Direct");
	    fd->direct_write = fd->direct_read = 0;
	}
    }

fn_exit:
    ADIOI_Free(lum);
    ADIOI_Free(value);
    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
		(fd->direct_write || fd->direct_read))) {
	*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    /* --END ERROR HANDLING-- */
    else *error_code = MPI_SUCCESS;

}
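A minimal usage sketch for the open path above, with a hypothetical Lustre file name: the three hint keys are exactly the ones ADIOI_LUSTRE_Open reads ("striping_unit", "striping_factor", "romio_lustre_start_iodevice"), and they only take effect when the file is created.

#include <mpi.h>

void open_with_striping(MPI_Comm comm, MPI_File *fh)
{
    MPI_Info info;
    MPI_Info_create(&info);
    MPI_Info_set(info, "striping_unit", "1048576");          /* 1 MiB stripes */
    MPI_Info_set(info, "striping_factor", "8");              /* 8 OSTs */
    MPI_Info_set(info, "romio_lustre_start_iodevice", "0");  /* first OST */
    MPI_File_open(comm, "/lustre/scratch/output.dat",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, info, fh);
    MPI_Info_free(&info);
}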
Example n. 15
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
			 datatype, int nprocs,
			 int myrank, ADIOI_Access
			 *others_req, ADIO_Offset *offset_list,
			 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                         min_st_offset, ADIO_Offset fd_size,
			 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                         int *buf_idx, int *error_code)
{
/* Read in sizes of no more than coll_bufsize, an info parameter.
   Send data to appropriate processes. 
   Place recd. data in user buf.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were read all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to read a distributed
   array from a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */
    
/* calculate the number of reads of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well.
   coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
	if (others_req[i].count) {
	    st_loc = others_req[i].offsets[0];
	    end_loc = others_req[i].offsets[0];
	    break;
	}
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
	for (j=0; j<others_req[i].count; j++) {
	    st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
	    end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
					  + others_req[i].lens[j] - 1));
	}

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
	/* this process does no I/O. */
	ntimes = 0;
    }
    else {
	/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
	ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); 

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); 
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process 
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in 
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
       /* read buf of size coll_bufsize (or less) */
       /* go through all others_req and check if any are satisfied
          by the current read */

       /* since MPI guarantees that displacements in filetypes are in 
          monotonically nondecreasing order, I can maintain a pointer
	  (curr_offlen_ptr) to 
          current off-len pair for each process in others_req and scan
          further only from there. There is still a problem of filetypes
          such as:  (1, 2, 3 are not process nos. They are just numbers for
          three chunks of data, specified by a filetype.)

                   1  -------!--
                   2    -----!----
                   3       --!-----

          where ! indicates where the current read_size limitation cuts 
          through the filetype.  I resolve this by reading up to !, but
          filling the communication buffer only for 1. I copy the portion
          left over for 2 into a tmp_buf for use in the next
	  iteration. i.e., 2 and 3 will be satisfied in the next
	  iteration. This simplifies filling in the user's buf at the
	  other end, as only one off-len pair with incomplete data
	  will be sent. I also don't need to send the individual
	  offsets and lens along with the data, as the data is being
	  sent in a particular order. */ 

          /* off = start offset in the file for the data actually read in 
                   this iteration 
             size = size of data read corresponding to off
             real_off = off minus whatever data was retained in memory from
                  previous iteration for cases like 2, 3 illustrated above
             real_size = size plus the extra corresponding to real_off
             req_off = off in file for a particular contiguous request 
                       minus what was satisfied in previous iteration
             req_size = size corresponding to req_off */

	size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done); 
	real_off = off - for_curr_iter;
	real_size = size + for_curr_iter;

	for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
	for_next_iter = 0;

	for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
	    DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n", rank, i, others_req[i].count); 
#endif
	    if (others_req[i].count) {
		start_pos[i] = curr_offlen_ptr[i];
		for (j=curr_offlen_ptr[i]; j<others_req[i].count;
		     j++) {
		    if (partial_send[i]) {
			/* this request may have been partially
			   satisfied in the previous iteration. */
			req_off = others_req[i].offsets[j] +
			    partial_send[i]; 
                        req_len = others_req[i].lens[j] -
			    partial_send[i];
			partial_send[i] = 0;
			/* modify the off-len pair to reflect this change */
			others_req[i].offsets[j] = req_off;
			others_req[i].lens[j] = req_len;
		    }
		    else {
			req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
		    }
		    if (req_off < real_off + real_size) {
			count[i]++;
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
			MPI_Address(read_buf+req_off-real_off, 
                               &(others_req[i].mem_ptrs[j]));
      ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
			send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off, 
                                      (ADIO_Offset)(unsigned)req_len)); 

			if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
			    partial_send[i] = (int) (real_off + real_size - req_off);
			    if ((j+1 < others_req[i].count) && 
                                 (others_req[i].offsets[j+1] < 
                                     real_off+real_size)) { 
				/* this is the case illustrated in the
				   figure above. */
				for_next_iter = ADIOI_MAX(for_next_iter,
					  real_off + real_size - others_req[i].offsets[j+1]); 
				/* max because it must cover requests 
				   from different processes */
			    }
			    break;
			}
		    }
		    else break;
		}
		curr_offlen_ptr[i] = j;
	    }
	}

	flag = 0;
	for (i=0; i<nprocs; i++)
	    if (count[i]) flag = 1;

	if (flag) {
      ADIOI_Assert(size == (int)size);
	    ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
			    ADIO_EXPLICIT_OFFSET, off, &status, error_code);
	    if (*error_code != MPI_SUCCESS) return;
	}
	
	for_curr_iter = for_next_iter;
	
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
       			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, 
                            m, buftype_extent, buf_idx); 


	if (for_next_iter) {
	    tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
      ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
      ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
	    memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
	    ADIOI_Free(fd->io_buf);
	    fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
	    memcpy(fd->io_buf, tmp_buf, for_next_iter);
	    read_buf = fd->io_buf;
	    ADIOI_Free(tmp_buf);
	}

	off += size;
	done += size;
    }

    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++) 
/* nothing to send, but check for recv. */
	ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
			    send_size, recv_size, count, 
			    start_pos, partial_send, recd_from_proc, nprocs,
			    myrank, 
			    buftype_is_contig, contig_access_count,
			    min_st_offset, fd_size, fd_start, fd_end,
			    others_req, m,
                            buftype_extent, buf_idx); 

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
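A standalone sketch (assumed sizes) of the phase-count arithmetic used above: ntimes is a ceiling division of this process's read region by the collective buffer size, and the MPI_Allreduce with MPI_MAX keeps every process running the same number of communication phases.

static int read_exch_phases(long long st_loc, long long end_loc,
                            int coll_bufsize)
{
    /* same arithmetic as the ntimes computation above:
     * ceiling_div(end_loc - st_loc + 1, coll_bufsize) */
    return (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
}

/* e.g. a 10 MiB region with a 4 MiB collective buffer needs 3 phases:
 * read_exch_phases(0, 10*1048576 - 1, 4*1048576) == 3 */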
Example n. 16
void ADIOI_Calc_my_req(ADIO_Offset *offset_list, int *len_list, int
			    contig_access_count, ADIO_Offset 
			    min_st_offset, ADIO_Offset *fd_start,
			    ADIO_Offset *fd_end, ADIO_Offset fd_size,
                            int nprocs, int nprocs_for_coll, 
                            int *count_my_req_procs_ptr,
			    int **count_my_req_per_proc_ptr,
			    ADIOI_Access **my_req_ptr,
			    int **buf_idx_ptr)
{
/* calculate what portions of the access requests of this process are
   located in the file domains of other processes */

    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc, len, rem_len, curr_idx;
    ADIO_Offset off;
    ADIOI_Access *my_req;

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
   process in process i's file domain. calloc initializes to zero.
   I'm allocating memory of size nprocs, so that I can do an 
   MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs_for_coll*sizeof(int));
/* buf_idx is relevant only if buftype_is_contig.
   buf_idx[i] gives the index into user_buf where data received
   from proc. i should be placed. This allows receives to be done
   without extra buffer. This can't be done if buftype is not contig. */
   
/* initialize buf_idx to -1 */
    for (i=0; i<nprocs_for_coll; i++) buf_idx[i] = -1;

/* one pass just to calculate how much space to allocate for
   my_req */
    for (i=0; i<contig_access_count; i++) { 

/* proc_no = CD(offset_list[i]-min_st_offset+1, fd_size) - 1 */
/* CD = ceiling division. CD(j,k) = (j+k-1)/k */

	proc = (int) ((offset_list[i] - min_st_offset + fd_size)/fd_size - 1);
        /* sanity check */
	if (proc >= nprocs_for_coll) {
	    FPRINTF(stderr, "Error: proc >= nprocs_for_coll, file %s, line %d\n", __FILE__, __LINE__);
	    MPI_Abort(MPI_COMM_WORLD, 1);
	}

	off = offset_list[i];
	len = (int) (((off+len_list[i]-1) <= fd_end[proc]) ? len_list[i] : 
	                           (fd_end[proc] - off + 1));
	rem_len = len_list[i] - len;
	count_my_req_per_proc[proc]++;

	while (rem_len != 0) {
	    proc++;
	    off = fd_start[proc];
	    len = (int) (((off+rem_len-1) <= fd_end[proc]) ? rem_len : 
	                           (fd_end[proc] - off + 1));
	    rem_len -= len;
	    count_my_req_per_proc[proc]++;
	}
    }

/* now allocate space for my_req, offset, and len */

    *my_req_ptr = (ADIOI_Access *)
	ADIOI_Malloc(nprocs_for_coll*sizeof(ADIOI_Access)); 
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i<nprocs_for_coll; i++) {
	if (count_my_req_per_proc[i]) {
	    my_req[i].offsets = (ADIO_Offset *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
	    my_req[i].lens = (int *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
	    count_my_req_procs++;
	}	    
	my_req[i].count = 0;  /* will be incremented where needed
				      later */
    }

/* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) { 

	/* for each separate contiguous request from this process */

	proc = (int) ((offset_list[i] - min_st_offset + fd_size)/fd_size - 1);
	if (buf_idx[proc] == -1) buf_idx[proc] = curr_idx;

	l = my_req[proc].count;
	off = offset_list[i];
	len = (int) (((off+len_list[i]-1) <= fd_end[proc]) ? len_list[i] : 
	                           (fd_end[proc] - off + 1));
	curr_idx += len;
	/* the length may go beyond proc's file domain */
	rem_len = len_list[i] - len;

	/* store the proc, offset, and len information in an array
         of structures, my_req. Each structure contains the 
         offsets and lengths located in that process's FD, 
	 and the associated count. */

	my_req[proc].offsets[l] = off;
	my_req[proc].lens[l] = len;
	my_req[proc].count++;

	/* this request may span the file domains of more than one
	   process */
	while (rem_len != 0) {
	    proc++;
	    if (buf_idx[proc] == -1) buf_idx[proc] = curr_idx;
	    l = my_req[proc].count;
	    off = fd_start[proc];
	    len = (int) (((off+rem_len-1) <= fd_end[proc]) ? rem_len : 
	                           (fd_end[proc] - off + 1));
	    curr_idx += len;
	    rem_len -= len;
	    my_req[proc].offsets[l] = off;
	    my_req[proc].lens[l] = len;
	    my_req[proc].count++;
	}
    }
    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
}
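A worked instance (assumed numbers) of the ceiling-division mapping used above to find the file domain holding a given offset:

static int offset_to_domain(long long offset, long long min_st_offset,
                            long long fd_size)
{
    /* CD(offset - min_st_offset + 1, fd_size) - 1, as in the comment above */
    return (int)((offset - min_st_offset + fd_size) / fd_size - 1);
}

/* with min_st_offset = 0 and fd_size = 100, an access starting at offset
 * 250 maps to domain 2, which covers bytes 200..299:
 * offset_to_domain(250, 0, 100) == 2 */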
Example n. 17
MPI_File ADIO_Open(MPI_Comm orig_comm,
		   MPI_Comm comm, const char *filename, int file_system,
		   ADIOI_Fns *ops,
		   int access_mode, ADIO_Offset disp, MPI_Datatype etype, 
		   MPI_Datatype filetype,
		   MPI_Info info, int perm, int *error_code)
{
    MPI_File mpi_fh;
    ADIO_File fd;
    int err, rank, procs;
    static char myname[] = "ADIO_OPEN";
    int  max_error_code;
    MPI_Info dupinfo;
    int syshints_processed, can_skip;
    char *p;

    *error_code = MPI_SUCCESS;

    /* obtain MPI_File handle */
    mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD));
    if (mpi_fh == MPI_FILE_NULL) {
	fd = MPI_FILE_NULL;
	*error_code = MPIO_Err_create_code(*error_code,
					   MPIR_ERR_RECOVERABLE,
					   myname,
					   __LINE__,
					   MPI_ERR_OTHER,
					   "**nomem2",0);
	goto fn_exit;

    }
    fd = MPIO_File_resolve(mpi_fh);

    fd->cookie = ADIOI_FILE_COOKIE;
    fd->fp_ind = disp;
    fd->fp_sys_posn = 0;
    fd->comm = comm;       /* dup'ed in MPI_File_open */
    fd->filename = ADIOI_Strdup(filename);
    fd->file_system = file_system;
    fd->fs_ptr = NULL;

    fd->fns = ops;

    fd->disp = disp;
    fd->split_coll_count = 0;
    fd->shared_fp_fd = ADIO_FILE_NULL;
    fd->atomicity = 0;
    fd->etype = etype;          /* MPI_BYTE by default */
    fd->filetype = filetype;    /* MPI_BYTE by default */
    fd->etype_size = 1;  /* default etype is MPI_BYTE */

    fd->file_realm_st_offs = NULL;
    fd->file_realm_types = NULL;

    fd->perm = perm;

    fd->async_count = 0;

    fd->fortran_handle = -1;

    fd->err_handler = ADIOI_DFLT_ERR_HANDLER;

    fd->io_buf_window = MPI_WIN_NULL;
    fd->io_buf_put_amounts_window = MPI_WIN_NULL;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &procs);
/* create and initialize info object */
    fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct));
    if (fd->hints == NULL) {
	*error_code = MPIO_Err_create_code(*error_code,
					   MPIR_ERR_RECOVERABLE,
					   myname,
					   __LINE__,
					   MPI_ERR_OTHER,
					   "**nomem2",0);
	goto fn_exit;
    }
    fd->hints->cb_config_list = NULL;
    fd->hints->ranklist = NULL;
    fd->hints->initialized = 0;
    fd->info = MPI_INFO_NULL;

    /* move system-wide hint processing *back* into open, but this time the
     * hintfile reader will do a scalable read-and-broadcast.  The global
     * ADIOI_syshints will get initialized at first open.  subsequent open
     * calls will just use result from first open.
     *
     * We have two goals here:
     * 1: avoid processing the hintfile multiple times
     * 2: have all processes participate in hintfile processing (so we can read-and-broadcast)
     *
     * a code might do an "initialize from 0", so we can only skip hint
     * processing once everyone has participated in hint processing */
    if (ADIOI_syshints == MPI_INFO_NULL)
	syshints_processed = 0;
    else
	syshints_processed = 1;

    MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm);
    if (!can_skip) {
	if (ADIOI_syshints == MPI_INFO_NULL)
	    MPI_Info_create(&ADIOI_syshints);
	ADIOI_process_system_hints(fd, ADIOI_syshints);
    }

    ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo);
    ADIO_SetInfo(fd, dupinfo, &err);
    if (dupinfo != MPI_INFO_NULL) {
	*error_code = MPI_Info_free(&dupinfo);
	if (*error_code != MPI_SUCCESS)
	    goto fn_exit;
    }
    ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname);

    /* Instead of repeatedly allocating this buffer in collective read/write,
     * allocating up-front might make memory management on small platforms
     * (e.g. Blue Gene) more efficient */

    fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
     /* deferred open: 
     * we can only do this optimization if 'fd->hints->deferred_open' is set
     * (which means the user hinted 'no_indep_rw' and collective buffering).
     * Furthermore, we only do this if our collective read/write routines use
     * our generic function, and not an fs-specific routine (we can defer opens
     * only if we use our aggregation code). */
    if (fd->hints->deferred_open &&
		    !(uses_generic_read(fd)
			    && uses_generic_write(fd))) {
	    fd->hints->deferred_open = 0;
    }
    if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN))
	    /* disable deferred open on these fs so that scalable broadcast
	     * will always use the proper communicator */
	    fd->hints->deferred_open = 0;


    /* on BlueGene, the cb_config_list is built when hints are processed. No
     * one else does that right now */
    if (fd->hints->ranklist == NULL) {
	build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code);
	if (*error_code != MPI_SUCCESS) 
	    goto fn_exit;
    }
    fd->is_open = 0;
    fd->my_cb_nodes_index = -2;
    fd->is_agg = is_aggregator(rank, fd);
    /* deferred open used to split the communicator to create an "aggregator
     * communicator", but we only used it as a way to indicate that deferred
     * open happened.  fd->is_open and fd->is_agg are sufficient */

    /* actual opens start here */
    /* generic open: one process opens to create the file, all others open */
    /* nfs open: everybody opens or else you'll end up with "file not found"
     * due to stupid nfs consistency semantics */
    /* scalable open: one process opens and broadcasts results to everyone */

    ADIOI_OpenColl(fd, rank, access_mode, error_code);

    /* for debugging, it can be helpful to see the hints selected. Some file
     * systems set up the hints in the open call (e.g. lustre) */
    p = getenv("ROMIO_PRINT_HINTS");
    if (rank == 0 && p != NULL ) {
	ADIOI_Info_print_keyvals(fd->info);
    }

 fn_exit:
    MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm);
    if (max_error_code != MPI_SUCCESS) {

        /* If the file was successfully opened, close it */
        if (*error_code == MPI_SUCCESS) {
        
            /* in the deferred open case, only those who have actually
               opened the file should close it */
            if (fd->hints->deferred_open)  {
                if (fd->is_agg) {
                    (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
                }
            }
            else {
                (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
            }
        }
	ADIOI_Free(fd->filename);
	ADIOI_Free(fd->hints->ranklist);
	ADIOI_Free(fd->hints->cb_config_list);
	ADIOI_Free(fd->hints);
	if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
	ADIOI_Free(fd->io_buf);
	ADIOI_Free(fd);
        fd = ADIO_FILE_NULL;
	if (*error_code == MPI_SUCCESS)
	{
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO,
					       "**oremote_fail", 0);
	}
    }

    return fd;
}
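A usage sketch for the deferred-open logic above, with a hypothetical file name: "romio_no_indep_rw" is the user-visible hint that (together with collective buffering) lets ROMIO set fd->hints->deferred_open so that only aggregators actually open the file. Running with ROMIO_PRINT_HINTS set in the environment makes rank 0 print the hints finally selected, as the code above shows.

#include <mpi.h>

void open_deferred(MPI_Comm comm, MPI_File *fh)
{
    MPI_Info info;
    MPI_Info_create(&info);
    MPI_Info_set(info, "romio_no_indep_rw", "true");  /* allow deferred open */
    MPI_Info_set(info, "cb_buffer_size", "16777216"); /* 16 MiB collective buffer */
    MPI_File_open(comm, "datafile", MPI_MODE_CREATE | MPI_MODE_RDWR, info, fh);
    MPI_Info_free(&info);
}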
Example n. 18
/* This function is for implementation convenience.
 * It takes care of the differences in the interface for nonblocking I/O
 * on various Unix machines! If wr==1 write, wr==0 read.
 *
 * Returns 0 on success, -errno on failure.
 */
int ADIOI_GEN_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		  int wr, MPI_Request *request)
{
    int err=-1, fd_sys;

    int error_code;
    struct aiocb *aiocbp=NULL;
    ADIOI_AIO_Request *aio_req=NULL;
    MPI_Status status;
#if defined(ROMIO_XFS)
    unsigned maxiosz = wr ? fd->hints->fs_hints.xfs.write_chunk_sz :
	    fd->hints->fs_hints.xfs.read_chunk_sz;
#endif /* ROMIO_XFS */

    fd_sys = fd->fd_sys;

#if defined(ROMIO_XFS)
    /* Use Direct I/O if desired and properly aligned */
    if (fd->fns == &ADIO_XFS_operations &&
	 ((wr && fd->direct_write) || (!wr && fd->direct_read)) &&
	 !(((long) buf) % fd->d_mem) && !(offset % fd->d_miniosz) && 
	 !(len % fd->d_miniosz) && (len >= fd->d_miniosz) && 
	 (len <= maxiosz)) {
	    fd_sys = fd->fd_direct;
    }
#endif /* ROMIO_XFS */

    aio_req = (ADIOI_AIO_Request*)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
    aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1);
    aiocbp->aio_offset = offset;
    aiocbp->aio_buf    = buf;
    aiocbp->aio_nbytes = len;

#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_WHENCE
    aiocbp->aio_whence = SEEK_SET;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_FILDES
    aiocbp->aio_fildes = fd_sys;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_SIGEVENT
# ifdef AIO_SIGNOTIFY_NONE
    aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE;
# endif
    aiocbp->aio_sigevent.sigev_signo = 0;
#endif
#ifdef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_REQPRIO
# ifdef AIO_PRIO_DFL
    aiocbp->aio_reqprio = AIO_PRIO_DFL;   /* not needed in DEC Unix 4.0 */
# else
    aiocbp->aio_reqprio = 0;
# endif
#endif

#ifndef ROMIO_HAVE_AIO_CALLS_NEED_FILEDES
#ifndef ROMIO_HAVE_STRUCT_AIOCB_WITH_AIO_FILDES
#error 'No fildes set for aio structure'
#endif
    if (wr) err = aio_write(aiocbp);
    else err = aio_read(aiocbp);
#else
    /* Broken IBM interface */
    if (wr) err = aio_write(fd_sys, aiocbp);
    else err = aio_read(fd_sys, aiocbp);
#endif

    if (err == -1) {
	if (errno == EAGAIN || errno == ENOSYS) { 
	    /* we exceeded the max. no. of outstanding requests, or the aio
	       routines are not actually implemented; treat this as a
	       blocking request and return. */
	    if (wr) 
		ADIO_WriteContig(fd, buf, len, MPI_BYTE, 
			    ADIO_EXPLICIT_OFFSET, offset, &status, &error_code);  
	    else
		ADIO_ReadContig(fd, buf, len, MPI_BYTE,
			    ADIO_EXPLICIT_OFFSET, offset, &status, &error_code);  
		    
	    MPIO_Completed_request_create(&fd, len, &error_code, request);
	    if (aiocbp != NULL) ADIOI_Free(aiocbp);
	    if (aio_req != NULL) ADIOI_Free(aio_req);
	    return 0;
	} else {
	    /* match the documented contract: return -errno on failure,
	     * and don't leak the structures allocated above */
	    ADIOI_Free(aiocbp);
	    ADIOI_Free(aio_req);
	    return -errno;
	}
    }
    aio_req->aiocbp = aiocbp;
    if (ADIOI_GEN_greq_class == 0) {
	    MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn, 
			    ADIOI_GEN_aio_free_fn, MPIU_Greq_cancel_fn, 
			    ADIOI_GEN_aio_poll_fn, ADIOI_GEN_aio_wait_fn, 
			    &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, aio_req, request);
    memcpy(&(aio_req->req), request, sizeof(MPI_Request));
    return 0;
}
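For reference, a self-contained POSIX AIO sketch of the same submit-and-harvest cycle that ADIOI_GEN_aio sets up, with a synchronous wait in place of the generalized request and error handling reduced to a single return code:

#include <aio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>

/* returns bytes written, or -1 on error (check errno) */
ssize_t aio_write_once(int fd, const void *buf, size_t len, off_t offset)
{
    struct aiocb cb;
    const struct aiocb *list[1];

    memset(&cb, 0, sizeof(cb));
    cb.aio_fildes = fd;
    cb.aio_buf    = (void *) buf;
    cb.aio_nbytes = len;
    cb.aio_offset = offset;

    if (aio_write(&cb) == -1)
        return -1;                   /* e.g. EAGAIN: caller falls back to write() */

    list[0] = &cb;
    while (aio_error(&cb) == EINPROGRESS)
        aio_suspend(list, 1, NULL);  /* block until the request finishes */

    return aio_return(&cb);          /* harvest the result exactly once */
}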
Example n. 19
/* wait for multiple requests to complete */
int ADIOI_GEN_aio_wait_fn(int count, void ** array_of_states, 
		double timeout, MPI_Status *status)
{
	const struct aiocb **cblist;
	int err, errcode=MPI_SUCCESS;
	int nr_complete=0;
	double starttime;
	struct timespec aio_timer;
	struct timespec *aio_timer_p = NULL;

	ADIOI_AIO_Request **aio_reqlist;
	int i;

	aio_reqlist = (ADIOI_AIO_Request **)array_of_states;

	cblist = (const struct aiocb**) ADIOI_Calloc(count, sizeof(struct aiocb*));

	starttime = MPI_Wtime();
	if (timeout > 0) {
	    aio_timer.tv_sec = (time_t)timeout;
	    /* tv_nsec holds nanoseconds, so scale the fractional part */
	    aio_timer.tv_nsec = (long)((timeout - aio_timer.tv_sec) * 1e9);
	    aio_timer_p = &aio_timer;
	}
	for (i=0; i< count; i++)
	{
		cblist[i] = aio_reqlist[i]->aiocbp;
	}

	while(nr_complete < count) {
	    do {
		err = aio_suspend(cblist, count, aio_timer_p);
	    } while (err < 0 && errno == EINTR);
	    if (err == 0) 
	    { /* run through the list of requests, and mark all the completed
		 ones as done */
		for (i=0; i< count; i++)
		{
		    /* aio_error returns an ERRNO value */
		    if (aio_reqlist[i]->aiocbp == NULL) 
			continue;
		    errno = aio_error(aio_reqlist[i]->aiocbp);
		    if (errno == 0) {
			ssize_t n = aio_return(aio_reqlist[i]->aiocbp);
			aio_reqlist[i]->nbytes = n;
			errcode = MPI_Grequest_complete(aio_reqlist[i]->req);
			if (errcode != MPI_SUCCESS) {
			    errcode = MPIO_Err_create_code(MPI_SUCCESS,
				    MPIR_ERR_RECOVERABLE,
				    "ADIOI_GEN_aio_wait_fn", 
				    __LINE__, MPI_ERR_IO, 
				    "**mpi_grequest_complete", 0);
			}
			ADIOI_Free(aio_reqlist[i]->aiocbp);
			aio_reqlist[i]->aiocbp = NULL;
			cblist[i] = NULL;
			nr_complete++;
		    } 
		    /* TODO: need to handle error conditions somehow*/
		}
	    } /* TODO: also need to handle errors here  */
	    if ( (timeout > 0) && (timeout < (MPI_Wtime() - starttime) ))
		break;
	}

	if (cblist != NULL) ADIOI_Free(cblist);
        return errcode;
}
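The timeout conversion above is easy to get wrong because tv_nsec holds nanoseconds; a small helper (a sketch, not ROMIO code) makes the scaling explicit:

#include <time.h>

/* convert a timeout given in (possibly fractional) seconds to a timespec */
static struct timespec seconds_to_timespec(double timeout)
{
    struct timespec ts;
    ts.tv_sec  = (time_t) timeout;
    ts.tv_nsec = (long) ((timeout - (double) ts.tv_sec) * 1e9);
    return ts;
}

/* seconds_to_timespec(1.5) yields tv_sec = 1, tv_nsec = 500000000 */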
Example n. 20
/* ADIOI_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one)
 */
void ADIOI_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list, 
		       int contig_access_count, ADIO_Offset 
		       min_st_offset, ADIO_Offset *fd_start,
		       ADIO_Offset *fd_end, ADIO_Offset fd_size,
                       int nprocs,
                       int *count_my_req_procs_ptr,
		       int **count_my_req_per_proc_ptr,
		       ADIOI_Access **my_req_ptr,
		       int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? 
   They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc;
    ADIO_Offset fd_len, rem_len, curr_idx, off;
    ADIOI_Access *my_req;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5024, 0, NULL);
#endif

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int)); 
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
   process in process i's file domain. calloc initializes to zero.
   I'm allocating memory of size nprocs, so that I can do an 
   MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* buf_idx is relevant only if buftype_is_contig.
   buf_idx[i] gives the index into user_buf where data received
   from proc. i should be placed. This allows receives to be done
   without extra buffer. This can't be done if buftype is not contig. */
   
    /* initialize buf_idx to -1 */
    for (i=0; i < nprocs; i++) buf_idx[i] = -1;

    /* one pass just to calculate how much space to allocate for my_req;
     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
     */
    for (i=0; i < contig_access_count; i++) {
	/* short circuit offset/len processing if len == 0
	 *	(zero-byte read/write) */
	if (len_list[i] == 0) 
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	/* note: we set fd_len to be the total size of the access.  then
	 * ADIOI_Calc_aggregator() will modify the value to return the 
	 * amount that was available from the file domain that holds the
	 * first part of the access.
	 */
	proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);
	count_my_req_per_proc[proc]++;

	/* figure out how much data is remaining in the access (i.e. wasn't 
	 * part of the file domain that had the starting byte); we'll take 
	 * care of this data (if there is any) in the while loop below.
	 */
	rem_len = len_list[i] - fd_len;

	while (rem_len != 0) {
	    off += fd_len; /* point to first remaining byte */
	    fd_len = rem_len; /* save remaining size, pass to calc */
	    proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    count_my_req_per_proc[proc]++;
	    rem_len -= fd_len; /* reduce remaining length by amount from fd */
	}
    }

/* now allocate space for my_req, offset, and len */

    *my_req_ptr = (ADIOI_Access *)
	ADIOI_Malloc(nprocs*sizeof(ADIOI_Access)); 
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i < nprocs; i++) {
	if (count_my_req_per_proc[i]) {
	    my_req[i].offsets = (ADIO_Offset *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
	    my_req[i].lens = (int *)
		ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
	    count_my_req_procs++;
	}	    
	my_req[i].count = 0;  /* will be incremented where needed
				      later */
    }

/* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) { 
	/* short circuit offset/len processing if len == 0
	 *	(zero-byte read/write) */
	if (len_list[i] == 0)
		continue;
	off = offset_list[i];
	fd_len = len_list[i];
	proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, 
				     fd_start, fd_end);

	/* for each separate contiguous access from this process */
	if (buf_idx[proc] == -1) 
  {
    ADIOI_Assert(curr_idx == (int) curr_idx);
    buf_idx[proc] = (int) curr_idx;
  }

	l = my_req[proc].count;
	curr_idx += fd_len; 

	rem_len = len_list[i] - fd_len;

	/* store the proc, offset, and len information in an array
         * of structures, my_req. Each structure contains the 
         * offsets and lengths located in that process's FD, 
	 * and the associated count. 
	 */
	my_req[proc].offsets[l] = off;
  ADIOI_Assert(fd_len == (int) fd_len);
	my_req[proc].lens[l] = (int) fd_len;
	my_req[proc].count++;

	while (rem_len != 0) {
	    off += fd_len;
	    fd_len = rem_len;
	    proc = ADIOI_Calc_aggregator(fd, off, min_st_offset, &fd_len, 
					 fd_size, fd_start, fd_end);

	    if (buf_idx[proc] == -1) 
      {
        ADIOI_Assert(curr_idx == (int) curr_idx);
        buf_idx[proc] = (int) curr_idx;
      }

	    l = my_req[proc].count;
	    curr_idx += fd_len;
	    rem_len -= fd_len;

	    my_req[proc].offsets[l] = off;
      ADIOI_Assert(fd_len == (int) fd_len);
	    my_req[proc].lens[l] = (int) fd_len;
	    my_req[proc].count++;
	}
    }

#ifdef AGG_DEBUG
    for (i=0; i<nprocs; i++) {
	if (count_my_req_per_proc[i] > 0) {
	    FPRINTF(stdout, "data needed from %d (count = %d):\n", i, 
		    my_req[i].count);
	    for (l=0; l < my_req[i].count; l++) {
		FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n", l,
			my_req[i].offsets[l], l, my_req[i].lens[l]);
	    }
	FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
	}
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5025, 0, NULL);
#endif
}
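A toy illustration (two assumed 100-byte file domains, no real aggregator lookup) of how the while loop above splits one contiguous request across consecutive domains:

#include <stdio.h>

int main(void)
{
    long long fd_start[2] = {0, 100};   /* two 100-byte file domains */
    long long fd_end[2]   = {99, 199};
    long long off = 80, rem = 50;       /* request: bytes 80..129 */
    int proc = 0;                       /* domain holding the first byte */

    while (rem > 0) {
        long long avail = fd_end[proc] - off + 1;
        long long len = (rem < avail) ? rem : avail;
        printf("proc %d: off=%lld len=%lld\n", proc, off, len);
        rem -= len;
        if (rem > 0) { proc++; off = fd_start[proc]; }
    }
    return 0;  /* prints: proc 0: off=80 len=20, then proc 1: off=100 len=30 */
}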
Example n. 21
static void ADIOI_Iread_and_exch_l1_begin(ADIOI_NBC_Request *nbc_req,
                                          int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd;
    int nprocs;
    ADIOI_Access *others_req;

    int i, j;
    ADIO_Offset real_off, req_off;
    char *read_buf;
    int *curr_offlen_ptr, *count, *send_size;
    int *partial_send, *start_pos;
    ADIO_Offset size, real_size, for_next_iter;
    int req_len, flag;

    ADIOI_R_Iexchange_data_vars *red_vars = NULL;

    /* loop exit condition */
    if (vars->m >= vars->ntimes) {
        ADIOI_Iread_and_exch_reset(nbc_req, error_code);
        return;
    }

    fd = vars->fd;
    nprocs = vars->nprocs;
    others_req = vars->others_req;

    read_buf = vars->read_buf;
    curr_offlen_ptr = vars->curr_offlen_ptr;
    count = vars->count;
    send_size = vars->send_size;
    partial_send = vars->partial_send;
    start_pos = vars->start_pos;

    /* read buf of size coll_bufsize (or less) */
    /* go through all others_req and check if any are satisfied
       by the current read */

    /* since MPI guarantees that displacements in filetypes are in
       monotonically nondecreasing order, I can maintain a pointer
       (curr_offlen_ptr) to
       current off-len pair for each process in others_req and scan
       further only from there. There is still a problem of filetypes
       such as:  (1, 2, 3 are not process nos. They are just numbers for
       three chunks of data, specified by a filetype.)

       1  -------!--
       2    -----!----
       3       --!-----

       where ! indicates where the current read_size limitation cuts
       through the filetype.  I resolve this by reading up to !, but
       filling the communication buffer only for 1. I copy the portion
       left over for 2 into a tmp_buf for use in the next
       iteration. i.e., 2 and 3 will be satisfied in the next
       iteration. This simplifies filling in the user's buf at the
       other end, as only one off-len pair with incomplete data
       will be sent. I also don't need to send the individual
       offsets and lens along with the data, as the data is being
       sent in a particular order. */

    /* off = start offset in the file for the data actually read in
             this iteration
       size = size of data read corresponding to off
       real_off = off minus whatever data was retained in memory from
             previous iteration for cases like 2, 3 illustrated above
       real_size = size plus the extra corresponding to real_off
       req_off = off in file for a particular contiguous request
                 minus what was satisfied in previous iteration
       req_size = size corresponding to req_off */

    size = ADIOI_MIN((unsigned)vars->coll_bufsize,
                     vars->end_loc - vars->st_loc + 1 - vars->done);
    real_off = vars->off - vars->for_curr_iter;
    real_size = size + vars->for_curr_iter;

    vars->size = size;
    vars->real_size = real_size;

    for (i = 0; i < nprocs; i++) count[i] = send_size[i] = 0;
    for_next_iter = 0;

    for (i = 0; i < nprocs; i++) {
#ifdef RDCOLL_DEBUG
        DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                    vars->myrank, i, others_req[i].count);
#endif
        if (others_req[i].count) {
            start_pos[i] = curr_offlen_ptr[i];
            for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                if (partial_send[i]) {
                    /* this request may have been partially
                       satisfied in the previous iteration. */
                    req_off = others_req[i].offsets[j] + partial_send[i];
                    req_len = others_req[i].lens[j] - partial_send[i];
                    partial_send[i] = 0;
                    /* modify the off-len pair to reflect this change */
                    others_req[i].offsets[j] = req_off;
                    others_req[i].lens[j] = req_len;
                }
                else {
                    req_off = others_req[i].offsets[j];
                    req_len = others_req[i].lens[j];
                }
                if (req_off < real_off + real_size) {
                    count[i]++;
                    ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf) + req_off - real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf + req_off - real_off));
                    MPI_Address(read_buf + req_off - real_off,
                                &(others_req[i].mem_ptrs[j]));
                    ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                    send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                    (ADIO_Offset)(unsigned)req_len));

                    if (real_off + real_size - req_off < (ADIO_Offset)(unsigned)req_len) {
                        partial_send[i] = (int)(real_off + real_size - req_off);
                        if ((j+1 < others_req[i].count) &&
                            (others_req[i].offsets[j+1] < real_off + real_size)) {
                            /* this is the case illustrated in the
                               figure above. */
                            for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size - others_req[i].offsets[j+1]);
                            /* max because it must cover requests
                               from different processes */
                        }
                        break;
                    }
                }
                else break;
            }
            curr_offlen_ptr[i] = j;
        }
    }
    vars->for_next_iter = for_next_iter;

    flag = 0;
    for (i = 0; i < nprocs; i++)
        if (count[i]) flag = 1;

    /* create a struct for ADIOI_R_Iexchange_data() */
    red_vars = (ADIOI_R_Iexchange_data_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_R_Iexchange_data_vars));
    nbc_req->data.rd.red_vars = red_vars;
    red_vars->fd = vars->fd;
    red_vars->buf = vars->buf;
    red_vars->flat_buf = vars->flat_buf;
    red_vars->offset_list = vars->offset_list;
    red_vars->len_list = vars->len_list;
    red_vars->send_size = vars->send_size;
    red_vars->recv_size = vars->recv_size;
    red_vars->count = vars->count;
    red_vars->start_pos = vars->start_pos;
    red_vars->partial_send = vars->partial_send;
    red_vars->recd_from_proc = vars->recd_from_proc;
    red_vars->nprocs = vars->nprocs;
    red_vars->myrank = vars->myrank;
    red_vars->buftype_is_contig = vars->buftype_is_contig;
    red_vars->contig_access_count = vars->contig_access_count;
    red_vars->min_st_offset = vars->min_st_offset;
    red_vars->fd_size = vars->fd_size;
    red_vars->fd_start = vars->fd_start;
    red_vars->fd_end = vars->fd_end;
    red_vars->others_req = vars->others_req;
    red_vars->iter = vars->m;
    red_vars->buftype_extent = vars->buftype_extent;
    red_vars->buf_idx = vars->buf_idx;
    red_vars->next_fn = ADIOI_Iread_and_exch_l1_end;

    if (flag) {
        ADIOI_Assert(size == (int)size);
        ADIO_IreadContig(fd, read_buf+vars->for_curr_iter, (int)size,
                         MPI_BYTE, ADIO_EXPLICIT_OFFSET, vars->off,
                         &vars->req2, error_code);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH_L1_BEGIN;
        return;
    }

    ADIOI_R_Iexchange_data(nbc_req, error_code);
}
Example n. 22
static void ADIOI_Iread_and_exch(ADIOI_NBC_Request *nbc_req, int *error_code)
{
    ADIOI_Iread_and_exch_vars *vars = nbc_req->data.rd.rae_vars;
    ADIO_File fd = vars->fd;
    MPI_Datatype datatype = vars->datatype;
    int nprocs = vars->nprocs;
    ADIOI_Access *others_req = vars->others_req;

    /* Read in sizes of no more than coll_bufsize, an info parameter.
       Send data to appropriate processes.
       Place recd. data in user buf.
       The idea is to reduce the amount of extra memory required for
       collective I/O. If all data were read all at once, which is much
       easier, it would require temp space more than the size of user_buf,
       which is often unacceptable. For example, to read a distributed
       array from a file, where each local array is 8Mbytes, requiring
       at least another 8Mbytes of temp space is unacceptable. */

    int i, j;
    ADIO_Offset st_loc = -1, end_loc = -1;
    ADIOI_Flatlist_node *flat_buf = NULL;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of reads of size coll_bufsize
       to be done by each process and the max among all processes.
       That gives the no. of communication phases as well.
       coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;
    vars->coll_bufsize = coll_bufsize;

    /* grab some initial values for st_loc and end_loc */
    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* now find the real values */
    for (i = 0; i < nprocs; i++)
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                          + others_req[i].lens[j] - 1));
        }

    vars->st_loc = st_loc;
    vars->end_loc = end_loc;

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc == -1) && (end_loc == -1)) {
        /* this process does no I/O. */
        vars->ntimes = 0;
    }
    else {
        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
        vars->ntimes = (int)((end_loc - st_loc + coll_bufsize) / coll_bufsize);
    }

    *error_code = MPI_Iallreduce(&vars->ntimes, &vars->max_ntimes, 1, MPI_INT,
                                 MPI_MAX, fd->comm, &vars->req1);

    vars->read_buf = fd->io_buf;  /* Allocated at open time */

    vars->curr_offlen_ptr = (int *)ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    vars->count = (int *)ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    vars->partial_send = (int *)ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    vars->send_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    vars->recv_size = (int *)ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    vars->recd_from_proc = (int *)ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    vars->start_pos = (int *)ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);
    if (!vars->buftype_is_contig) {
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
        vars->flat_buf = flat_buf;
    }
    MPI_Type_extent(datatype, &vars->buftype_extent);

    vars->done = 0;
    vars->off = st_loc;
    vars->for_curr_iter = vars->for_next_iter = 0;

    /* set the state to wait until MPI_Iallreduce finishes. */
    nbc_req->data.rd.state = ADIOI_IRC_STATE_IREAD_AND_EXCH;
}
Example n. 23
/* Nonblocking version of ADIOI_GEN_ReadStridedColl() */
void ADIOI_GEN_IreadStridedColl(ADIO_File fd, void *buf, int count,
                   MPI_Datatype datatype, int file_ptr_type,
                   ADIO_Offset offset, MPI_Request *request,
                   int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
       in "An Extended Two-Phase Method for Accessing Sections of
       Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
       Scientific Programming, (5)4:301--317, Winter 1996.
       http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_NBC_Request *nbc_req = NULL;
    ADIOI_GEN_IreadStridedColl_vars *vars = NULL;
    int nprocs, myrank;
#ifdef RDCOLL_DEBUG
    int i;
#endif

    /* FIXME: need an implementation of ADIOI_IOIstridedColl
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOIstridedColl(fd, buf, count, ADIOI_READ, datatype,
                             file_ptr_type, offset, request, error_code);
        return;
    }
    */

    /* top-level struct keeping the status of function progress */
    nbc_req = (ADIOI_NBC_Request *)ADIOI_Calloc(1, sizeof(ADIOI_NBC_Request));
    nbc_req->rdwr = ADIOI_READ;

    /* create a generalized request */
    if (ADIOI_GEN_greq_class == 0) {
        MPIX_Grequest_class_create(ADIOI_GEN_irc_query_fn,
                ADIOI_GEN_irc_free_fn, MPIU_Greq_cancel_fn,
                ADIOI_GEN_irc_poll_fn, ADIOI_GEN_irc_wait_fn,
                &ADIOI_GEN_greq_class);
    }
    MPIX_Grequest_class_allocate(ADIOI_GEN_greq_class, nbc_req, request);
    memcpy(&nbc_req->req, request, sizeof(MPI_Request));

    /* create a struct for parameters and variables */
    vars = (ADIOI_GEN_IreadStridedColl_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_GEN_IreadStridedColl_vars));
    nbc_req->data.rd.rsc_vars = vars;

    /* save the parameters */
    vars->fd = fd;
    vars->buf = buf;
    vars->count = count;
    vars->datatype = datatype;
    vars->file_ptr_type = file_ptr_type;
    vars->offset = offset;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
    vars->nprocs = nprocs;
    vars->myrank = myrank;

    /* number of aggregators, cb_nodes, is stored in the hints */
    vars->nprocs_for_coll = fd->hints->cb_nodes;
    vars->orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &vars->offset_list, &vars->len_list,
                              &vars->start_offset, &vars->end_offset,
                              &vars->contig_access_count);

#ifdef RDCOLL_DEBUG
        for (i = 0; i < vars->contig_access_count; i++) {
            DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
                        myrank, vars->offset_list[i], vars->len_list[i]);
        }
#endif

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */

        vars->st_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        vars->end_offsets = (ADIO_Offset *)ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        *error_code = MPI_Iallgather(&vars->start_offset, 1, ADIO_OFFSET,
                                     vars->st_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[0]);
        if (*error_code != MPI_SUCCESS) return;
        *error_code = MPI_Iallgather(&vars->end_offset, 1, ADIO_OFFSET,
                                     vars->end_offsets, 1, ADIO_OFFSET,
                                     fd->comm, &vars->req_offset[1]);

        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL;
        return;
    }

    ADIOI_GEN_IreadStridedColl_indio(nbc_req, error_code);
}
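At the user level this routine sits behind the MPI-3.1 nonblocking collective read; a minimal caller sketch (hypothetical buffer and count):

#include <mpi.h>

void iread_all_example(MPI_File fh, double *local, int n)
{
    MPI_Request req;
    MPI_Status status;

    /* nonblocking collective read; ROMIO services it through the
     * generalized-request state machine shown above */
    MPI_File_iread_all(fh, local, n, MPI_DOUBLE, &req);

    /* ... overlap independent computation ... */

    MPI_Wait(&req, &status);
}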
Example n. 24
static void ADIOI_GEN_IreadStridedColl_indio(ADIOI_NBC_Request *nbc_req,
                                             int *error_code)
{
    ADIOI_GEN_IreadStridedColl_vars *vars = nbc_req->data.rd.rsc_vars;
    ADIOI_Icalc_others_req_vars *cor_vars = NULL;
    ADIO_File fd = vars->fd;
    void *buf;
    int count, file_ptr_type;
    MPI_Datatype datatype = vars->datatype;
    ADIO_Offset offset;
    int filetype_is_contig;
    ADIO_Offset off;
    int nprocs;

    ADIOI_Datatype_iscontig(datatype, &vars->buftype_is_contig);

    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
    || (!vars->interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO)))
    {
        buf = vars->buf;
        count = vars->count;
        file_ptr_type = vars->file_ptr_type;
        offset = vars->offset;

        /* don't do aggregation */
        if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
            ADIOI_Free(vars->offset_list);
            ADIOI_Free(vars->len_list);
            ADIOI_Free(vars->st_offsets);
            ADIOI_Free(vars->end_offsets);
        }

        fd->fp_ind = vars->orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (vars->buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                ADIO_IreadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                                 off, &vars->req_ind_io, error_code);
            }
            else ADIO_IreadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                  0, &vars->req_ind_io, error_code);
        }
        else {
            ADIO_IreadStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, &vars->req_ind_io, error_code);
        }

        nbc_req->data.rd.state = ADIOI_IRC_STATE_GEN_IREADSTRIDEDCOLL_INDIO;
        return;
    }

    nprocs = vars->nprocs;

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     *
     */
    ADIOI_Calc_file_domains(vars->st_offsets, vars->end_offsets, nprocs,
                vars->nprocs_for_coll, &vars->min_st_offset,
                &vars->fd_start, &vars->fd_end,
                fd->hints->min_fdomain_size, &vars->fd_size,
                fd->hints->striping_unit);

    /* calculate where the portions of the access requests of this process
     * are located in terms of the file domains.  this could be on the same
     * process or on other processes.  this function fills in:
     * count_my_req_procs - number of processes (including this one) for which
     *     this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self).  indexed by rank.
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, vars->offset_list, vars->len_list,
              vars->contig_access_count, vars->min_st_offset,
              vars->fd_start, vars->fd_end, vars->fd_size,
              nprocs, &vars->count_my_req_procs,
              &vars->count_my_req_per_proc, &vars->my_req,
              &vars->buf_idx);

    /* perform a collective communication in order to distribute the
     * data calculated above.  fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain.
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i lie in this process's file domain.
     */

    cor_vars = (ADIOI_Icalc_others_req_vars *)ADIOI_Calloc(
            1, sizeof(ADIOI_Icalc_others_req_vars));
    nbc_req->cor_vars = cor_vars;
    cor_vars->fd = vars->fd;
    cor_vars->count_my_req_procs = vars->count_my_req_procs;
    cor_vars->count_my_req_per_proc = vars->count_my_req_per_proc;
    cor_vars->my_req = vars->my_req;
    cor_vars->nprocs = vars->nprocs;
    cor_vars->myrank = vars->myrank;
    cor_vars->count_others_req_procs_ptr = &vars->count_others_req_procs;
    cor_vars->others_req_ptr = &vars->others_req;
    cor_vars->next_fn = ADIOI_GEN_IreadStridedColl_read;

    ADIOI_Icalc_others_req(nbc_req, error_code);
}
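
The next_fn field stored above is what chains the nonblocking phases together: when the communication posted by ADIOI_Icalc_others_req finishes, the saved callback (here ADIOI_GEN_IreadStridedColl_read) resumes the collective read. A minimal sketch of this continuation pattern, with hypothetical names and types for illustration only:

typedef struct sketch_vars sketch_vars;
typedef void (*next_fn_t)(sketch_vars *vars, int *error_code);

struct sketch_vars {
    next_fn_t next_fn;  /* phase to resume when the async exchange completes */
};

/* called from a progress/poll routine once the communication posted by
 * the current phase has completed */
static void sketch_phase_done(sketch_vars *vars, int *error_code)
{
    if (vars->next_fn != NULL)
        vars->next_fn(vars, error_code);  /* hand control to the next phase */
}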
Example n. 25
0
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
                                 datatype, int nprocs,
                                 int myrank,
                                 ADIOI_Access
                                 * others_req, ADIO_Offset * offset_list,
                                 ADIO_Offset * len_list, int contig_access_count,
                                 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                                 ADIO_Offset * fd_start, ADIO_Offset * fd_end,
                                 MPI_Aint * buf_idx, int *error_code)
{
/* Send data to the appropriate processes and write in chunks of no more
   than coll_bufsize bytes.
   The idea is to reduce the amount of extra memory required for
   collective I/O. Writing all the data at once would be much simpler,
   but could require temp space larger than user_buf itself, which is
   often unacceptable. For example, when writing a distributed array
   where each local array is 8 MB, requiring at least another 8 MB of
   temp space is unacceptable. */

    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */
    ADIO_Offset size = 0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc = -1, end_loc = -1, off, done, req_off;
    char *write_buf = NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf = NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

/* calculate the number of writes of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);


    for (i = 0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i = 0; i < nprocs; i++)
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = MPL_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = MPL_MAX(end_loc, (others_req[i].offsets[j]
                                        + others_req[i].lens[j] - 1));
        }

/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/

    ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize);
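    /* e.g. (illustrative): end_loc - st_loc + 1 = 40 MiB with a 16 MiB
     * coll_bufsize gives ntimes = ceil(40/16) = 3 exchange+write phases */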

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;     /* this process does no writing. */
    }

    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs * 10, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = curr_offlen_ptr + nprocs;
    /* to store count of how many off-len pairs per proc are satisfied
     * in an iteration. */

    partial_recv = count + nprocs;
    /* if only a portion of the last off-len pair is recd. from a process
     * in a particular iteration, the length recd. is stored here.
     * calloc initializes to 0. */

    send_size = partial_recv + nprocs;
    /* total size of data to be sent to each proc. in an iteration.
     * Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = send_size + nprocs;
    /* total size of data to be recd. from each proc. in an iteration. */

    sent_to_proc = recv_size + nprocs;
    /* amount of data sent to each proc so far. Used in
     * ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = sent_to_proc + nprocs;
    curr_to_proc = send_buf_idx + nprocs;
    done_to_proc = curr_to_proc + nprocs;
    /* Above three are used in ADIOI_Fill_send_buffer */

    start_pos = done_to_proc + nprocs;
    /* used to store the starting value of curr_offlen_ptr[i] in
     * this iteration */
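    /* recap of the single nprocs*10 allocation above, one nprocs-sized
     * array each: curr_offlen_ptr, count, partial_recv, send_size,
     * recv_size, sent_to_proc, send_buf_idx, curr_to_proc, done_to_proc,
     * start_pos */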

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        flat_buf = ADIOI_Flatten_and_find(datatype);
    }
    MPI_Type_extent(datatype, &buftype_extent);


/* I need to check if there are any outstanding nonblocking writes to
   the file, which could potentially interfere with the writes taking
   place in this collective write call. Since this is not likely to be
   common, let me do the simplest thing possible here: Each process
   completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
     * if (*error_code != MPI_SUCCESS) return;
     * MPI_Barrier(fd->comm);
     */

    done = 0;
    off = st_loc;

    for (m = 0; m < ntimes; m++) {
        /* go through all others_req and check which will be satisfied
         * by the current write */

        /* Note that MPI guarantees that displacements in filetypes are in
         * monotonically nondecreasing order and that, for writes, the
         * filetypes cannot specify overlapping regions in the file. This
         * simplifies implementation a bit compared to reads. */

        /* off = start offset in the file for the data to be written in
         * this iteration
         * size = size of data written (bytes) corresponding to off
         * req_off = off in file for a particular contiguous request
         * minus what was satisfied in previous iteration
         * req_size = size corresponding to req_off */

        /* first calculate what should be communicated */

        for (i = 0; i < nprocs; i++)
            count[i] = recv_size[i] = 0;

        size = MPL_MIN((unsigned) coll_bufsize, end_loc - st_loc + 1 - done);

        for (i = 0; i < nprocs; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                         * satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] + partial_recv[i];
                        req_len = others_req[i].lens[j] - partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    } else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset) (uintptr_t) write_buf) + req_off - off) ==
                                     (ADIO_Offset) (uintptr_t) (write_buf + req_off - off));
                        MPI_Address(write_buf + req_off - off, &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((off + size - req_off) == (int) (off + size - req_off));
                        recv_size[i] += (int) (MPL_MIN(off + size - req_off, (unsigned) req_len));

                        if (off + size - req_off < (unsigned) req_len) {
                            partial_recv[i] = (int) (off + size - req_off);

                            /* --BEGIN ERROR HANDLING-- */
                            if ((j + 1 < others_req[i].count) &&
                                (others_req[i].offsets[j + 1] < off + size)) {
                                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                                   MPIR_ERR_RECOVERABLE,
                                                                   myname,
                                                                   __LINE__,
                                                                   MPI_ERR_ARG,
                                                                   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)",
                                                                   0);
                                /* allow to continue since additional
                                 * communication might have to occur
                                 */
                            }
                            /* --END ERROR HANDLING-- */
                            break;
                        }
                    } else
                        break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx, curr_to_proc,
                              done_to_proc, &hole, m, buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;

        flag = 0;
        for (i = 0; i < nprocs; i++)
            if (count[i])
                flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int) size);
            ADIO_WriteContig(fd, write_buf, (int) size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             off, &status, error_code);
            if (*error_code != MPI_SUCCESS)
                return;
        }

        off += size;
        done += size;
    }

    for (i = 0; i < nprocs; i++)
        count[i] = recv_size[i] = 0;
    for (m = ntimes; m < max_ntimes; m++) {
        ADIOI_Assert(size == (int) size);
        /* nothing to recv, but check for send. */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, (int) size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx,
                              curr_to_proc, done_to_proc, &hole, m,
                              buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS)
            return;
    }

    ADIOI_Free(curr_offlen_ptr);
}
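
The staging pattern above, bounding the temporary buffer at coll_bufsize and looping, is the heart of two-phase collective I/O. A stripped-down sketch of the loop structure (hypothetical names; the exchange and write steps are elided as comments):

/* minimal shape of a staged collective write over a shared region
 * [st_loc, end_loc], processed coll_bufsize bytes per phase */
static void staged_write_sketch(ADIO_Offset st_loc, ADIO_Offset end_loc,
                                ADIO_Offset coll_bufsize)
{
    ADIO_Offset total = end_loc - st_loc + 1;
    ADIO_Offset ntimes = (total + coll_bufsize - 1) / coll_bufsize; /* ceiling */
    ADIO_Offset off = st_loc, done = 0, size, m;

    for (m = 0; m < ntimes; m++) {
        size = (total - done < coll_bufsize) ? total - done : coll_bufsize;
        /* phase 1: gather the bytes for [off, off+size) from their owners */
        /* phase 2: one contiguous write of size bytes at file offset off  */
        off += size;
        done += size;
    }
}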
Example n. 26
0
void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count, 
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, ADIO_Status *status,
			    int *error_code)
{
    static char myname[] = "ADIOI_NTFS_WriteContig";
    LONG dwTemp;
    DWORD dwNumWritten = 0;
    int err=-1, datatype_size, len;
    OVERLAPPED *pOvl;

    MPI_Type_size(datatype, &datatype_size);
    len = datatype_size * count;

    pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (pOvl == NULL)
    {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    myname, __LINE__, MPI_ERR_IO,
	    "**nomem", "**nomem %s", "OVERLAPPED");
	return;
    }
    pOvl->hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
    if (pOvl->hEvent == NULL)
    {
	err = GetLastError();
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    myname, __LINE__, MPI_ERR_IO,
	    "**io", "**io %s", ADIOI_NTFS_Strerror(err));
	ADIOI_Free(pOvl);
	return;
    }
    pOvl->Offset = DWORDLOW(offset);
    pOvl->OffsetHigh = DWORDHIGH(offset);
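    /* the 64-bit file offset is split across the two 32-bit OVERLAPPED
     * fields; e.g. offset 0x123456789 gives Offset = 0x23456789 and
     * OffsetHigh = 0x1 */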

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET)
    {
	if (fd->fp_sys_posn != offset)
	{
	    dwTemp = DWORDHIGH(offset);
	    if (SetFilePointer(fd->fd_sys, DWORDLOW(offset), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
	    {
		err = GetLastError();
		if (err != NO_ERROR)
		{
		    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
			myname, __LINE__, MPI_ERR_IO,
			"**io", "**io %s", ADIOI_NTFS_Strerror(err));
		    CloseHandle(pOvl->hEvent);
		    ADIOI_Free(pOvl);
		    return;
		}
	    }
	}
	/*printf("WriteFile(%d bytes)\n", len);fflush(stdout);*/
	err = WriteFile(fd->fd_sys, buf, len, &dwNumWritten, pOvl);
	/* --BEGIN ERROR HANDLING-- */
	if (err == FALSE)
	{
	    err = GetLastError();
	    if (err != ERROR_IO_PENDING)
	    {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
		    myname, __LINE__, MPI_ERR_IO,
		    "**io",
		    "**io %s", ADIOI_NTFS_Strerror(err));
		CloseHandle(pOvl->hEvent);
		ADIOI_Free(pOvl);
		return;
	    }
	}
	/* --END ERROR HANDLING-- */
	err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumWritten, TRUE);
	/* --BEGIN ERROR HANDLING-- */
	if (err == FALSE)
	{
	    err = GetLastError();
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
		MPIR_ERR_RECOVERABLE, myname,
		__LINE__, MPI_ERR_IO, "**io",
		"**io %s", ADIOI_NTFS_Strerror(err));
	    CloseHandle(pOvl->hEvent);
	    ADIOI_Free(pOvl);
	    return;
	}
	/* --END ERROR HANDLING-- */
	if (!CloseHandle(pOvl->hEvent))
	{
	    err = GetLastError();
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
		myname, __LINE__, MPI_ERR_IO,
		"**io", "**io %s", ADIOI_NTFS_Strerror(err));
	    /* don't retry the failed CloseHandle; just release the memory,
	     * as the individual-file-pointer branch below does */
	    ADIOI_Free(pOvl);
	    return;
	}
	ADIOI_Free(pOvl);

	fd->fp_sys_posn = offset + dwNumWritten;
	/* individual file pointer not updated */        
    }
    else
    {
	/* write from curr. location of ind. file pointer */
	if (fd->fp_sys_posn != fd->fp_ind)
	{
	    dwTemp = DWORDHIGH(fd->fp_ind);
	    if (SetFilePointer(fd->fd_sys, DWORDLOW(fd->fp_ind), &dwTemp, FILE_BEGIN) == INVALID_SET_FILE_POINTER)
	    {
		err = GetLastError();
		if (err != NO_ERROR)
		{
		    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
			myname, __LINE__, MPI_ERR_IO,
			"**io", "**io %s", ADIOI_NTFS_Strerror(err));
		    CloseHandle(pOvl->hEvent);
		    ADIOI_Free(pOvl);
		    return;
		}
	    }
	}
	/*printf("WriteFile(%d bytes)\n", len);fflush(stdout);*/
	err = WriteFile(fd->fd_sys, buf, len, &dwNumWritten, pOvl);
	/* --BEGIN ERROR HANDLING-- */
	if (err == FALSE)
	{
	    err = GetLastError();
	    if (err != ERROR_IO_PENDING)
	    {
		*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
		    myname, __LINE__, MPI_ERR_IO,
		    "**io",
		    "**io %s", ADIOI_NTFS_Strerror(err));
		CloseHandle(pOvl->hEvent);
		ADIOI_Free(pOvl);
		return;
	    }
	}
	/* --END ERROR HANDLING-- */
	err = GetOverlappedResult(fd->fd_sys, pOvl, &dwNumWritten, TRUE);
	/* --BEGIN ERROR HANDLING-- */
	if (err == FALSE)
	{
	    err = GetLastError();
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
		MPIR_ERR_RECOVERABLE, myname,
		__LINE__, MPI_ERR_IO, "**io",
		"**io %s", ADIOI_NTFS_Strerror(err));
	    CloseHandle(pOvl->hEvent);
	    ADIOI_Free(pOvl);
	    return;
	}
	/* --END ERROR HANDLING-- */
	if (!CloseHandle(pOvl->hEvent))
	{
	    err = GetLastError();
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
		myname, __LINE__, MPI_ERR_IO,
		"**io", "**io %s", ADIOI_NTFS_Strerror(err));
	    ADIOI_Free(pOvl);
	    return;
	}
	ADIOI_Free(pOvl);

	fd->fp_ind = fd->fp_ind + dwNumWritten;
	fd->fp_sys_posn = fd->fp_ind;
    }

#ifdef HAVE_STATUS_SET_BYTES
    if (err != FALSE)
    {
	MPIR_Status_set_bytes(status, datatype, dwNumWritten);
    }
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == FALSE)
    {
	err = GetLastError();
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", ADIOI_NTFS_Strerror(err));
	return;
    }
    /* --END ERROR HANDLING-- */
    *error_code = MPI_SUCCESS;
}
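
The function above repeats the same overlapped-write sequence for both file-pointer modes. The core pattern, condensed into a sketch (assuming <windows.h>; error handling elided; this is an illustration, not the driver's actual helper):

/* start an overlapped write at an explicit 64-bit offset, then block
 * until it finishes: GetOverlappedResult with bWait=TRUE makes the
 * nominally asynchronous write behave synchronously */
static BOOL blocking_overlapped_write(HANDLE h, const void *buf, DWORD len,
                                      LONGLONG offset, DWORD *written)
{
    BOOL ok;
    OVERLAPPED ovl = {0};
    ovl.hEvent = CreateEvent(NULL, TRUE, TRUE, NULL);
    ovl.Offset = (DWORD)(offset & 0xFFFFFFFF);
    ovl.OffsetHigh = (DWORD)(offset >> 32);

    /* FALSE + ERROR_IO_PENDING simply means the write is in flight */
    if (!WriteFile(h, buf, len, written, &ovl) &&
        GetLastError() != ERROR_IO_PENDING) {
        CloseHandle(ovl.hEvent);
        return FALSE;
    }
    ok = GetOverlappedResult(h, &ovl, written, TRUE);
    CloseHandle(ovl.hEvent);
    return ok;
}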
Example n. 27
0
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen;
    struct lov_user_md *lum = NULL;
    char *value;

#if defined(MPICH2) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif

    if (fd->perm == ADIO_PERM_NULL) {
	old_mask = umask(022);
	umask(old_mask);
	perm = old_mask ^ 0666;
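	/* umask() has no read-only getter, so set it and immediately restore
	 * it; XORing the mask against 0666 clears the masked bits of rw-rw-rw- */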
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
	amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	amode = amode | O_EXCL;

    amode_direct = amode | O_DIRECT;

    fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);

    if (fd->fd_sys != -1) {
        int err;

        /* get file striping information and set it in info */
	/* odd malloc here because lov_user_md contains some fixed data and
	 * then a list of 'lmm_objects' representing stripes */
        lumlen = sizeof(struct lov_user_md) +
                 MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
	/* furthermore, Pascal Deveze reports that, even though we pass a
	 * "GETSTRIPE" (read) flag to the ioctl, the call can fail if some
	 * fields of this struct are uninitialized.  calloc in case there
	 * are other members that must be initialized and in case the
	 * lov_user_md struct changes in future */
	lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);
        lum->lmm_magic = LOV_USER_MAGIC;
        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
        if (!err) {
            value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

            fd->hints->striping_unit = lum->lmm_stripe_size;
            sprintf(value, "%d", lum->lmm_stripe_size);
            ADIOI_Info_set(fd->info, "striping_unit", value);

            fd->hints->striping_factor = lum->lmm_stripe_count;
            sprintf(value, "%d", lum->lmm_stripe_count);
            ADIOI_Info_set(fd->info, "striping_factor", value);

            fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset;
            sprintf(value, "%d", lum->lmm_stripe_offset);
            ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);

            ADIOI_Free(value);
        }
        ADIOI_Free(lum);

    }

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
	fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
	fd->fd_direct = open(fd->filename, amode_direct, perm);
	if (fd->fd_direct != -1) {
	    fd->d_mem = fd->d_miniosz = (1<<12);
	} else {
	    perror("cannot open file with O_Direct");
	    fd->direct_write = fd->direct_read = 0;
	}
    }

    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
		(fd->direct_write || fd->direct_read))) {
	if (errno == ENAMETOOLONG)
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_BAD_FILE,
					       "**filenamelong",
					       "**filenamelong %s %d",
					       fd->filename,
					       strlen(fd->filename));
	else if (errno == ENOENT)
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_NO_SUCH_FILE,
					       "**filenoexist",
					       "**filenoexist %s",
					       fd->filename);
	else if (errno == ENOTDIR || errno == ELOOP)
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE,
					       myname, __LINE__,
					       MPI_ERR_BAD_FILE,
					       "**filenamedir",
					       "**filenamedir %s",
					       fd->filename);
	else if (errno == EACCES) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_ACCESS,
					       "**fileaccess",
					       "**fileaccess %s", 
					       fd->filename );
	}
	else if (errno == EROFS) {
	    /* Read only file or file system and write access requested */
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_READ_ONLY,
					       "**ioneedrd", 0 );
	}
	else {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
    }
    /* --END ERROR HANDLING-- */
    else *error_code = MPI_SUCCESS;

}
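
The long errno-to-MPI-error chain at the end of the open routine can also be expressed as a lookup table. A table-driven sketch of the same mapping (illustrative only; the real driver builds richer, parameterized messages via MPIO_Err_create_code):

#include <errno.h>
#include "mpi.h"

struct errno_map { int unix_err; int mpi_class; };

static const struct errno_map open_err_map[] = {
    { ENAMETOOLONG, MPI_ERR_BAD_FILE },
    { ENOENT,       MPI_ERR_NO_SUCH_FILE },
    { ENOTDIR,      MPI_ERR_BAD_FILE },
    { ELOOP,        MPI_ERR_BAD_FILE },
    { EACCES,       MPI_ERR_ACCESS },
    { EROFS,        MPI_ERR_READ_ONLY },
};

static int map_open_errno(int e)
{
    size_t i;
    for (i = 0; i < sizeof(open_err_map) / sizeof(open_err_map[0]); i++)
        if (open_err_map[i].unix_err == e)
            return open_err_map[i].mpi_class;
    return MPI_ERR_IO;  /* default, as in the fallback branch above */
}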
Example n. 28
0
/* This function is for implementation convenience. It is not user-visible.
 * If wr==1 write, wr==0 read.
 *
 * Returns MPI_SUCCESS on success, mpi_errno on failure.
 */
int ADIOI_NTFS_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		   int wr, MPI_Request *request)
{
    static char myname[] = "ADIOI_NTFS_aio";

    ADIOI_AIO_Request *aio_req;
    static DWORD dwNumWritten, dwNumRead;
    BOOL ret_val = FALSE;
    FDTYPE fd_sys;
    int mpi_errno = MPI_SUCCESS;
    DWORD err;

    fd_sys = fd->fd_sys;

    aio_req = (ADIOI_AIO_Request *)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);
    if (aio_req == NULL)
    {
	mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    myname, __LINE__, MPI_ERR_IO,
	    "**nomem", "**nomem %s", "AIO_REQ");
	return mpi_errno;
    }
    aio_req->lpOvl = (LPOVERLAPPED) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
    if (aio_req->lpOvl == NULL)
    {
	mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    myname, __LINE__, MPI_ERR_IO,
	    "**nomem", "**nomem %s", "OVERLAPPED");
	ADIOI_Free(aio_req);
	return mpi_errno;
    }
    aio_req->lpOvl->hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (aio_req->lpOvl->hEvent == NULL)
    {
	char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
	err = GetLastError();
	ADIOI_NTFS_Strerror(err, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
	mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
	    myname, __LINE__, MPI_ERR_IO,
	    "**io", "**io %s", errMsg);
	ADIOI_Free(aio_req->lpOvl);
	ADIOI_Free(aio_req);
	return mpi_errno;
    }
    aio_req->lpOvl->Offset = DWORDLOW(offset);
    aio_req->lpOvl->OffsetHigh = DWORDHIGH(offset);
    aio_req->fd = fd_sys;
    
    /* XXX: initiate async I/O  */
    if (wr)
    {
	ret_val = WriteFile(fd_sys, buf, len, &dwNumWritten, aio_req->lpOvl);
    }
    else
    {
	ret_val = ReadFile(fd_sys, buf, len, &dwNumRead, aio_req->lpOvl);
    }

    /* --BEGIN ERROR HANDLING-- */
    if (ret_val == FALSE)
    {
	mpi_errno = GetLastError();
	if (mpi_errno != ERROR_IO_PENDING)
	{
	    char errMsg[ADIOI_NTFS_ERR_MSG_MAX];
	    ADIOI_NTFS_Strerror(mpi_errno, errMsg, ADIOI_NTFS_ERR_MSG_MAX);
	    mpi_errno = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
		myname, __LINE__, MPI_ERR_IO,
		"**io",
		"**io %s", errMsg);
	    /* free the partially constructed request before returning */
	    CloseHandle(aio_req->lpOvl->hEvent);
	    ADIOI_Free(aio_req->lpOvl);
	    ADIOI_Free(aio_req);
	    return mpi_errno;
	}
	mpi_errno = MPI_SUCCESS;
    }
    /* --END ERROR HANDLING-- */

    /* XXX: set up generalized request class and request */
    if (ADIOI_NTFS_greq_class == 0) {
	mpi_errno = MPIX_Grequest_class_create(ADIOI_NTFS_aio_query_fn,
			ADIOI_NTFS_aio_free_fn, MPIU_Greq_cancel_fn,
			ADIOI_NTFS_aio_poll_fn, ADIOI_NTFS_aio_wait_fn,
			&ADIOI_NTFS_greq_class);
	if (mpi_errno != MPI_SUCCESS) {
	    /* FIXME: Pass appropriate error code to user */
	}
    }
    mpi_errno = MPIX_Grequest_class_allocate(ADIOI_NTFS_greq_class, aio_req, request);
    if (mpi_errno != MPI_SUCCESS) {
	/* FIXME: Pass appropriate error code to user */
    }
    memcpy(&(aio_req->req), request, sizeof(MPI_Request));
    return mpi_errno;
}
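
The MPIX_Grequest_class_* calls used above are MPICH extensions that amortize callback registration across many requests. With only the standard MPI-2 interface, the same lifecycle looks like the sketch below (hypothetical helper names; not the driver's actual code):

#include "mpi.h"

/* invoked when the user calls MPI_Test/MPI_Wait on a completed request */
static int sketch_query_fn(void *extra_state, MPI_Status *status)
{
    MPI_Status_set_cancelled(status, 0);
    MPI_Status_set_elements(status, MPI_BYTE, 0 /* bytes transferred */);
    return MPI_SUCCESS;
}

static int sketch_free_fn(void *extra_state)
{
    return MPI_SUCCESS;  /* release extra_state here if it was allocated */
}

static int sketch_cancel_fn(void *extra_state, int complete)
{
    return MPI_SUCCESS;  /* async file I/O is typically not cancellable */
}

/* post:  MPI_Grequest_start(sketch_query_fn, sketch_free_fn,
 *                           sketch_cancel_fn, aio_state, &request);
 * later: MPI_Grequest_complete(request) when the OS signals completion */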
Example n. 29
0
void ADIOI_PVFS2_AIO_contig(ADIO_File fd, void *buf, int count, 
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, MPI_Request *request,
			    int flag, int *error_code)
{

    int ret;
    MPI_Count datatype_size, len;
    ADIOI_PVFS2_fs *pvfs_fs;
    ADIOI_AIO_Request *aio_req;
    static char myname[] = "ADIOI_PVFS2_AIO_contig";

    pvfs_fs = (ADIOI_PVFS2_fs*)fd->fs_ptr;

    aio_req = (ADIOI_AIO_Request*)ADIOI_Calloc(sizeof(ADIOI_AIO_Request), 1);

    MPI_Type_size_x(datatype, &datatype_size);
    len = datatype_size * count;

    ret = PVFS_Request_contiguous(len, PVFS_BYTE, &(aio_req->mem_req));
    /* --BEGIN ERROR HANDLING-- */
    if (ret != 0) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   ADIOI_PVFS2_error_convert(ret),
					   "Error in pvfs_request_contig (memory)", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    ret = PVFS_Request_contiguous(len, PVFS_BYTE, &(aio_req->file_req));
    /* --BEGIN ERROR HANDLING-- */
    if (ret != 0) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   ADIOI_PVFS2_error_convert(ret),
					   "Error in pvfs_request_contig (file)", 0);
	return;
    }
    /* --END ERROR HANDLING-- */

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	/* copy individual file pointer into offset variable, continue */
	offset = fd->fp_ind;
    } 
    if (flag == READ) {
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_iread_a, 0, NULL );
#endif
	ret = PVFS_isys_read(pvfs_fs->object_ref, aio_req->file_req, offset, 
		buf, aio_req->mem_req, &(pvfs_fs->credentials), 
		&(aio_req->resp_io), &(aio_req->op_id), NULL);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_iread_b, 0, NULL );
#endif
    } else if (flag == WRITE) {
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_iwrite_a, 0, NULL );
#endif
	ret = PVFS_isys_write(pvfs_fs->object_ref, aio_req->file_req, offset, 
		buf, aio_req->mem_req, &(pvfs_fs->credentials), 
		&(aio_req->resp_io), &(aio_req->op_id), NULL);
#ifdef ADIOI_MPE_LOGGING
	MPE_Log_event( ADIOI_MPE_iwrite_b, 0, NULL );
#endif 
    } 

    /* --BEGIN ERROR HANDLING-- */
    if (ret < 0 ) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
					   MPIR_ERR_RECOVERABLE,
					   myname, __LINE__,
					   ADIOI_PVFS2_error_convert(ret),
					   "Error in PVFS_isys_io", 0);
	goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_MPI_GREQUEST_EXTENSIONS
    /* posted. deferred completion */
    if (ret == 0) { 
	if (ADIOI_PVFS2_greq_class == 0) {
	    MPIX_Grequest_class_create(ADIOI_GEN_aio_query_fn, 
		    ADIOI_PVFS2_aio_free_fn, MPIU_Greq_cancel_fn,
		    ADIOI_PVFS2_aio_poll_fn, ADIOI_PVFS2_aio_wait_fn,
		    &ADIOI_PVFS2_greq_class);
	}
	MPIX_Grequest_class_allocate(ADIOI_PVFS2_greq_class, aio_req, request);
	memcpy(&(aio_req->req), request, sizeof(*request));
    }
#else
    /* if generalized request extensions not available, we will have to process
     * this operation right here */
    int error;
    ret = PVFS_sys_wait(aio_req->op_id, "ADIOI_PVFS2_AIO_Contig", &error);
    if (ret == 0) {
	MPIO_Completed_request_create(&fd, len, error_code, request);
    }
#endif

    /* immediate completion */
    if (ret == 1) {
	MPIO_Completed_request_create(&fd, len, error_code, request);
    }

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	fd->fp_ind += len;
    }
    fd->fp_sys_posn = offset + len;

    *error_code = MPI_SUCCESS;
fn_exit:
    return;
}
Example n. 30
0
/* Avery Ching and Kenin Coloma's reworked two-phase algorithm.  Key features
 * - persistent file domains
 * - an option to use alltoall instead of point-to-point
 */
void ADIOI_IOStridedColl(ADIO_File fd, void *buf, int count, int rdwr,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status * status, int *error_code)
{
    ADIO_Offset min_st_offset = 0, max_end_offset = 0;
    ADIO_Offset st_end_offset[2];
    ADIO_Offset *all_st_end_offsets = NULL;
    int filetype_is_contig, buftype_is_contig, is_contig;
    ADIO_Offset off;
    int interleave_count = 0, i, nprocs, myrank, nprocs_for_coll;
    int cb_enable;
    ADIO_Offset bufsize;
    MPI_Aint extent;
#ifdef DEBUG2
    MPI_Aint bufextent;
#endif
    MPI_Count size;
    int agg_rank;

    ADIO_Offset agg_disp;       /* aggregated file offset */
    MPI_Datatype agg_dtype;     /* aggregated file datatype */

    int aggregators_done = 0;
    ADIO_Offset buffered_io_size = 0;

    int *alltoallw_disps;

    int *alltoallw_counts;
    int *client_alltoallw_counts;
    int *agg_alltoallw_counts;

    char *cb_buf = NULL;

    MPI_Datatype *client_comm_dtype_arr;        /* aggregator perspective */
    MPI_Datatype *agg_comm_dtype_arr;   /* client perspective */
    ADIO_Offset *client_comm_sz_arr;    /* aggregator perspective */
    ADIO_Offset *agg_comm_sz_arr;       /* client perspective */

    /* file views for each client and aggregator */
    view_state *client_file_view_state_arr = NULL;
    view_state *agg_file_view_state_arr = NULL;
    /* mem views for local process */
    view_state *my_mem_view_state_arr = NULL;

    MPI_Status *agg_comm_statuses = NULL;
    MPI_Request *agg_comm_requests = NULL;
    MPI_Status *client_comm_statuses = NULL;
    MPI_Request *client_comm_requests = NULL;
    int aggs_client_count = 0;
    int clients_agg_count = 0;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);
#ifdef DEBUG
    fprintf(stderr, "p%d: entering ADIOI_IOStridedColl\n", myrank);
#endif
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5010, 0, NULL);
    else
        MPE_Log_event(5012, 0, NULL);
#endif

    /* I need to check if there are any outstanding nonblocking writes
     * to the file, which could potentially interfere with the writes
     * taking place in this collective write call. Since this is not
     * likely to be common, let me do the simplest thing possible here:
     * Each process completes all pending nonblocking operations before
     * completing. */

    nprocs_for_coll = fd->hints->cb_nodes;

    if (rdwr == ADIOI_READ)
        cb_enable = fd->hints->cb_read;
    else
        cb_enable = fd->hints->cb_write;

    /* only check for interleaving if cb_read isn't disabled */
    if (cb_enable != ADIOI_HINT_DISABLE) {
        /* find the starting and ending byte of my I/O access */
        ADIOI_Calc_bounds(fd, count, datatype, file_ptr_type, offset,
                          &st_end_offset[0], &st_end_offset[1]);

        /* allocate an array of start/end pairs */
        all_st_end_offsets = (ADIO_Offset *)
            ADIOI_Malloc(2 * nprocs * sizeof(ADIO_Offset));
        MPI_Allgather(st_end_offset, 2, ADIO_OFFSET, all_st_end_offsets, 2, ADIO_OFFSET, fd->comm);

        min_st_offset = all_st_end_offsets[0];
        max_end_offset = all_st_end_offsets[1];

        for (i = 1; i < nprocs; i++) {
            /* are the accesses of different processes interleaved? */
            if ((all_st_end_offsets[i * 2] < all_st_end_offsets[i * 2 - 1]) &&
                (all_st_end_offsets[i * 2] <= all_st_end_offsets[i * 2 + 1]))
                interleave_count++;
            /* This is a rudimentary check for interleaving, but should
             * suffice for the moment. */

            min_st_offset = MPL_MIN(all_st_end_offsets[i * 2], min_st_offset);
            max_end_offset = MPL_MAX(all_st_end_offsets[i * 2 + 1], max_end_offset);
        }
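        /* e.g. (illustrative): ranges [0, 99] on rank 0 and [50, 149] on
         * rank 1 give start 50 < previous end 99 at i == 1, so the
         * accesses interleave and interleave_count becomes 1 */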
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    if ((cb_enable == ADIOI_HINT_DISABLE || (!interleave_count && (cb_enable == ADIOI_HINT_AUTO)))
        && (fd->hints->cb_pfr != ADIOI_HINT_ENABLE)) {
        if (cb_enable != ADIOI_HINT_DISABLE) {
            ADIOI_Free(all_st_end_offsets);
        }

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype,
                                    ADIO_EXPLICIT_OFFSET, off, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype,
                                     ADIO_EXPLICIT_OFFSET, off, status, error_code);
            } else {
                if (rdwr == ADIOI_READ)
                    ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                    0, status, error_code);
                else
                    ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                     0, status, error_code);
            }
        } else {
            if (rdwr == ADIOI_READ)
                ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                                 offset, status, error_code);
            else
                ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                                  offset, status, error_code);
        }
        return;
    }

    MPI_Type_extent(datatype, &extent);
#ifdef DEBUG2
    bufextent = extent * count;
#endif
    MPI_Type_size_x(datatype, &size);
    bufsize = size * (MPI_Count) count;

    /* Calculate file realms */
    if ((fd->hints->cb_pfr != ADIOI_HINT_ENABLE) || (fd->file_realm_types == NULL))
        ADIOI_Calc_file_realms(fd, min_st_offset, max_end_offset);

    my_mem_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    agg_file_view_state_arr = (view_state *)
        ADIOI_Calloc(1, nprocs * sizeof(view_state));
    client_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));

    if (fd->is_agg) {
        client_file_view_state_arr = (view_state *)
            ADIOI_Calloc(1, nprocs * sizeof(view_state));
    } else {
        client_file_view_state_arr = NULL;
    }

    /* Alltoallw doesn't like a null array even if the counts are
     * zero.  If you do not include this code, it will fail. */
    client_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Calloc(1, nprocs * sizeof(MPI_Datatype));
    if (!fd->is_agg)
        for (i = 0; i < nprocs; i++)
            client_comm_dtype_arr[i] = MPI_BYTE;
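    /* every entry must be a valid datatype even where the count is zero,
     * since MPI_Alltoallw inspects all nprocs (count, disp, type) triples */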

    ADIOI_Exch_file_views(myrank, nprocs, file_ptr_type, fd, count,
                          datatype, offset, my_mem_view_state_arr,
                          agg_file_view_state_arr, client_file_view_state_arr);

    agg_comm_sz_arr = (ADIO_Offset *)
        ADIOI_Calloc(1, nprocs * sizeof(ADIO_Offset));
    agg_comm_dtype_arr = (MPI_Datatype *)
        ADIOI_Malloc(nprocs * sizeof(MPI_Datatype));
    if (fd->is_agg) {
        ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                             client_file_view_state_arr,
                             client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
        buffered_io_size = 0;
        for (i = 0; i < nprocs; i++) {
            if (client_comm_sz_arr[i] > 0)
                buffered_io_size += client_comm_sz_arr[i];
        }
    }
#ifdef USE_PRE_REQ
    else {
        /* Example use of ADIOI_Build_client_pre_req; move this to an
         * appropriate section if pre-building client requests is desired */

        for (i = 0; i < fd->hints->cb_nodes; i++) {
            agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5040, 0, NULL);
#endif
            ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                       &(my_mem_view_state_arr[agg_rank]),
                                       &(agg_file_view_state_arr[agg_rank]),
                                       2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
            MPE_Log_event(5041, 0, NULL);
#endif
        }
    }
#endif


    if (fd->is_agg)
        cb_buf = (char *) ADIOI_Malloc(fd->hints->cb_buffer_size);
    alltoallw_disps = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    alltoallw_counts = client_alltoallw_counts = (int *)
        ADIOI_Calloc(2 * nprocs, sizeof(int));
    agg_alltoallw_counts = &alltoallw_counts[nprocs];

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        /* aggregators pre-post all Irecv's for incoming data from clients */
        if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
            post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                 client_comm_dtype_arr,
                                 client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
    }
    /* Aggregators send amounts for data requested to clients */
    Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                      client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

#ifdef DEBUG
    fprintf(stderr, "client_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", client_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
    fprintf(stderr, "agg_alltoallw_counts[ ");
    for (i = 0; i < nprocs; i++) {
        fprintf(stderr, "%d ", agg_alltoallw_counts[i]);
    }
    fprintf(stderr, "]\n");
#endif

    /* keep looping while aggregators still have I/O to do */
    while (aggregators_done != nprocs_for_coll) {
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            /* clients should build datatypes for local memory locations
             * for data communication with aggregators and post
             * communication as the datatypes are built */

            client_comm_requests = (MPI_Request *)
                ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));

            /* accumulate across aggregators so each posted request gets a
             * distinct slot and is waited on below */
            clients_agg_count = 0;
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
                if (agg_comm_sz_arr[agg_rank] > 0) {
                    ADIOI_Build_client_req(fd, agg_rank,
                                           (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           agg_comm_sz_arr[agg_rank],
                                           &(agg_comm_dtype_arr[agg_rank]));

#ifdef AGGREGATION_PROFILE
                    if (i == 0)
                        MPE_Log_event(5038, 0, NULL);
#endif
                    post_client_comm(fd, rdwr, agg_rank, buf,
                                     agg_comm_dtype_arr[agg_rank],
                                     agg_alltoallw_counts[agg_rank],
                                     &client_comm_requests[clients_agg_count]);
                    clients_agg_count++;
                }
            }
#ifdef AGGREGATION_PROFILE
            if (!clients_agg_count)
                MPE_Log_event(5039, 0, NULL);
#endif

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef DEBUG
                fprintf(stderr, "expecting from [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                if (fd->is_agg) {
                    fprintf(stderr, "sending to [client](disp,size,cnt)=");
                    for (i = 0; i < nprocs; i++) {
                        if (fd->is_agg)
                            MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                        else
                            size = -1;

                        fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                                size, client_alltoallw_counts[i]);
                        if (i != nprocs - 1)
                            fprintf(stderr, ",");
                    }
                    fprintf(stderr, "\n");
                }
                fflush(NULL);
#endif
                /* aggregators post all Isends for outgoing data to clients */
                if (fd->is_agg)
                    post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                         client_comm_dtype_arr,
                                         client_comm_sz_arr,
                                         &agg_comm_requests, &aggs_client_count);

                if (fd->is_agg && aggs_client_count) {
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
                }

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
                if (fd->is_agg && buffered_io_size) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
                }
#endif
            } else {    /* Write Case */
#ifdef DEBUG
                fprintf(stderr, "sending to [agg](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    MPI_Type_size_x(agg_comm_dtype_arr[i], &size);
                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, agg_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "]\n");
                fprintf(stderr, "expecting from [client](disp,size,cnt)=");
                for (i = 0; i < nprocs; i++) {
                    if (fd->is_agg)
                        MPI_Type_size_x(client_comm_dtype_arr[i], &size);
                    else
                        size = -1;

                    fprintf(stderr, "[%d](%d,%d,%d)", i, alltoallw_disps[i],
                            size, client_alltoallw_counts[i]);
                    if (i != nprocs - 1)
                        fprintf(stderr, ",");
                }
                fprintf(stderr, "\n");
                fflush(NULL);
#endif
#ifdef DEBUG
                fprintf(stderr, "buffered_io_size = %lld\n", buffered_io_size);
#endif

                if (clients_agg_count) {
#ifdef MPI_STATUSES_IGNORE
                    client_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    client_comm_statuses = ADIOI_Malloc(clients_agg_count * sizeof(MPI_Status));
#endif
                    MPI_Waitall(clients_agg_count, client_comm_requests, client_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5039, 0, NULL);
#endif
                    ADIOI_Free(client_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(client_comm_statuses);
#endif
                }
#ifdef DEBUG2
                if (bufextent) {
                    fprintf(stderr, "buf = [");
                    for (i = 0; i < bufextent; i++)
                        fprintf(stderr, "%c", ((char *) buf)[i]);
                    fprintf(stderr, "]\n");
                }
#endif

                if (fd->is_agg && buffered_io_size) {
                    ADIOI_Assert(aggs_client_count != 0);
                    /* make sure we actually have the data to write out */
#ifdef MPI_STATUSES_IGNORE
                    agg_comm_statuses = MPI_STATUSES_IGNORE;
#else
                    agg_comm_statuses = (MPI_Status *)
                        ADIOI_Malloc(aggs_client_count * sizeof(MPI_Status));
#endif

                    MPI_Waitall(aggs_client_count, agg_comm_requests, agg_comm_statuses);
#ifdef AGGREGATION_PROFILE
                    MPE_Log_event(5033, 0, NULL);
#endif
                    ADIOI_Free(agg_comm_requests);
#ifndef MPI_STATUSES_IGNORE
                    ADIOI_Free(agg_comm_statuses);
#endif
#ifdef DEBUG2
                    fprintf(stderr, "cb_buf = [");
                    for (i = 0; i < buffered_io_size; i++)
                        fprintf(stderr, "%c", cb_buf[i]);
                    fprintf(stderr, "]\n");
                    fflush(NULL);
#endif
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }

            }
        } else {
            /* Alltoallw version of everything */
            ADIOI_Build_client_reqs(fd, nprocs, my_mem_view_state_arr,
                                    agg_file_view_state_arr, agg_comm_sz_arr, agg_comm_dtype_arr);

            if (rdwr == ADIOI_READ) {
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_READ, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr,
                              buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
            } else {    /* Write Case */
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5032, 0, NULL);
#endif
                MPI_Alltoallw(buf, agg_alltoallw_counts, alltoallw_disps,
                              agg_comm_dtype_arr,
                              cb_buf, client_alltoallw_counts, alltoallw_disps,
                              client_comm_dtype_arr, fd->comm);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5033, 0, NULL);
#endif
                if (fd->is_agg && buffered_io_size) {
                    ADIOI_IOFiletype(fd, cb_buf, buffered_io_size, MPI_BYTE,
                                     ADIO_EXPLICIT_OFFSET, agg_disp, agg_dtype,
                                     ADIOI_WRITE, status, error_code);
                    if (*error_code != MPI_SUCCESS)
                        return;
                    MPI_Type_free(&agg_dtype);
                }
            }
        }

        /* Free (uncommit) datatypes for reuse */
        if (fd->is_agg) {
            if (buffered_io_size > 0) {
                for (i = 0; i < nprocs; i++) {
                    if (client_comm_sz_arr[i] > 0)
                        MPI_Type_free(&client_comm_dtype_arr[i]);
                }
            }
        }
        for (i = 0; i < nprocs; i++) {
            if (agg_comm_sz_arr[i] > 0)
                MPI_Type_free(&agg_comm_dtype_arr[i]);
        }

        /* figure out next set up requests */
        if (fd->is_agg) {
            ADIOI_Build_agg_reqs(fd, rdwr, nprocs,
                                 client_file_view_state_arr,
                                 client_comm_dtype_arr, client_comm_sz_arr, &agg_disp, &agg_dtype);
            buffered_io_size = 0;
            for (i = 0; i < nprocs; i++) {
                if (client_comm_sz_arr[i] > 0)
                    buffered_io_size += client_comm_sz_arr[i];
            }
        }
#ifdef USE_PRE_REQ
        else {
            /* Example use of ADIOI_Build_client_pre_req; move this to an
             * appropriate section if pre-building client requests is desired */
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                agg_rank = fd->hints->ranklist[(i + myrank) % fd->hints->cb_nodes];
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5040, 0, NULL);
#endif
                ADIOI_Build_client_pre_req(fd, agg_rank, (i + myrank) % fd->hints->cb_nodes,
                                           &(my_mem_view_state_arr[agg_rank]),
                                           &(agg_file_view_state_arr[agg_rank]),
                                           2 * 1024 * 1024, 64 * 1024);
#ifdef AGGREGATION_PROFILE
                MPE_Log_event(5041, 0, NULL);
#endif
            }
        }
#endif

        /* aggregators pre-post all Irecv's for incoming data from
         * clients.  if nothing is needed, agg_comm_requests is not
         * allocated */
        if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
            if ((fd->is_agg) && (rdwr == ADIOI_WRITE))
                post_aggregator_comm(fd->comm, rdwr, nprocs, cb_buf,
                                     client_comm_dtype_arr,
                                     client_comm_sz_arr, &agg_comm_requests, &aggs_client_count);
        }

        /* Aggregators send amounts for data requested to clients */
        Exch_data_amounts(fd, nprocs, client_comm_sz_arr, agg_comm_sz_arr,
                          client_alltoallw_counts, agg_alltoallw_counts, &aggregators_done);

    }

    /* Clean up */

    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        /* AAR, FSIZE, and User provided uniform File realms */
        if (1) {
            MPI_Type_free(&fd->file_realm_types[0]);
        } else {
            for (i = 0; i < fd->hints->cb_nodes; i++) {
                ADIOI_Datatype_iscontig(fd->file_realm_types[i], &is_contig);
                MPI_Type_free(&fd->file_realm_types[i]);
            }
        }
        ADIOI_Free(fd->file_realm_types);
        ADIOI_Free(fd->file_realm_st_offs);
    }


    if (fd->is_agg) {
        if (buffered_io_size > 0)
            MPI_Type_free(&agg_dtype);
        for (i = 0; i < nprocs; i++) {
            MPI_Type_free(&client_comm_dtype_arr[i]);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->indices);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p->blocklens);
            ADIOI_Free(client_file_view_state_arr[i].flat_type_p);
        }
        ADIOI_Free(client_file_view_state_arr);
        ADIOI_Free(cb_buf);
    }
    for (i = 0; i < nprocs; i++)
        if (agg_comm_sz_arr[i] > 0)
            MPI_Type_free(&agg_comm_dtype_arr[i]);

    ADIOI_Free(client_comm_sz_arr);
    ADIOI_Free(client_comm_dtype_arr);
    ADIOI_Free(my_mem_view_state_arr);
    ADIOI_Free(agg_file_view_state_arr);
    ADIOI_Free(agg_comm_sz_arr);
    ADIOI_Free(agg_comm_dtype_arr);
    ADIOI_Free(alltoallw_disps);
    ADIOI_Free(alltoallw_counts);
    ADIOI_Free(all_st_end_offsets);

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status.  The right way is
     * to keep track of how much data was actually read and placed in
     * buf during collective I/O. */
#endif
    fd->fp_sys_posn = -1;       /* set it to null. */
#ifdef AGGREGATION_PROFILE
    if (rdwr == ADIOI_READ)
        MPE_Log_event(5011, 0, NULL);
    else
        MPE_Log_event(5013, 0, NULL);
#endif
}