Esempio n. 1
0
/*
** Name: DI_galloc - Guarantee (zero-fill) allocation of n pages at EOF.
**
** Description:
**	Extends the file described by "f" by "n" pages.  When DI slave
**	processes are in use (Di_slave), the zero-fill is delegated to a
**	slave via a DI_SL_ZALLOC request; otherwise the space is written
**	directly from the shared zeroed buffer Di_zero_buffer.  On
**	filesystems whose allocation strategy is FPROP_ALLOCSTRATEGY_RESV,
**	space may first be reserved (fallocate-style) to help keep the
**	file contiguous.
**
** Inputs:
**	f		Pointer to the DI file context.
**	n		Number of pages to allocate.
**	diop		Pointer to dilru file context (fd / slave event cb).
**
** Outputs:
**	end_of_file	Set to the page number of the last page that
**			existed BEFORE this allocation (see note below).
**	err_code	Operating-system error detail on failure.
**
** Returns:
**	OK, or a DI error status (DI_BADEXTEND, DI_BADINFO, ...) via
**	DIlru_set_di_error().
**
** NOTE(review): the do { ... } while (FALSE) construct executes exactly
** once; "break" is used as a forward goto to the common exit path, so
** the io_sem locked at the top is always released at the bottom.
*/
static STATUS
DI_galloc(
    DI_IO	*f,
    i4	n,
    DI_OP	*diop,
    i4		*end_of_file,
    CL_ERR_DESC *err_code)
{
    STATUS                      big_status = OK, small_status =OK;
    STATUS                      intern_status = OK;
    register DI_SLAVE_CB        *disl;
    i4				last_page;
    OFFSET_TYPE			lseek_ret;

    do
    {
# ifdef OS_THREADS_USED
	/* Seek/write must be semaphore protected */
	if ((f->io_fprop & FPROP_PRIVATE) == 0)
	    CS_synch_lock( &f->io_sem );
# endif /* OS_THREADS_USED */

        if (Di_slave)
        {
	    disl = diop->di_evcb;

	    /* Ask the slave to zero-allocate "n" pages worth of bytes. */
	    disl->file_op = DI_SL_ZALLOC;
	    /* NOTE(review): i4 multiply — could overflow for very large
	    ** n * page size; presumably bounded by callers. Verify.
	    */
	    disl->length = n * f->io_bytes_per_page;
	    /* Pass file properties to slave */
	    FPROP_COPY(f->io_fprop,disl->io_fprop);

	    DI_slave_send( disl->dest_slave_no, diop,
			   &big_status, &small_status, &intern_status );

	    /* Communication failure: exit with status(es) as set. */
	    if (( big_status != OK ) || ( small_status != OK ))
		break;

	    if ( disl->status != OK )
	    {
	        STRUCT_ASSIGN_MACRO(disl->errcode, *err_code);
	        small_status = DI_BADEXTEND;
	        break;
	     }
	     else
	     {
		/* On success the slave returns the new end-of-file byte
		** offset in disl->length — presumably; confirm against
		** the slave-side DI_SL_ZALLOC handler.
		*/
	        lseek_ret = disl->length;
	     }

	}
    	else
    	{
	    /* 
	    ** Running without slaves 
	    */
	    OFFSET_TYPE	lseek_offset;
	    i8		reservation;
	    i4		buf_size;
	    i4		bytes_written;
	    i4		pages_remaining = n;
	    i4		pages_at_a_time = Di_zero_bufsize /
					  f->io_bytes_per_page;

	    /* find current end-of-file */

	    lseek_ret = IIdio_get_file_eof(diop->di_fd, f->io_fprop);
	    if ( lseek_ret == (OFFSET_TYPE)-1L )
	    {
	    	SETCLERR(err_code, 0, ER_lseek);
	    	small_status = DI_BADINFO;
		break;
	    }
	    else
	    {
		lseek_offset = lseek_ret;
		/* If this filesystem can do reservations, see if we
		** should reserve more space.
		** Even though we have to write the zeros anyway, the
		** reservation may well be larger than the zeroing
		** buffer, and this way helps maintain contiguity.
		** Not worth it for tiny writes.
		*/
		if (pages_remaining > 2
		  && FPROP_ALLOCSTRATEGY_GET(f->io_fprop) == FPROP_ALLOCSTRATEGY_RESV)
		{
		    /* Desired reserved size = current EOF + new pages. */
		    reservation = lseek_offset + (pages_remaining * f->io_bytes_per_page);
		    if (reservation > f->io_reserved_bytes)
		    {
			/* Re-check in case some other server reserved */
			small_status = IIdio_get_reserved(diop->di_fd,
				&f->io_reserved_bytes, err_code);
			if (small_status == OK && reservation > f->io_reserved_bytes)
			{
			    small_status = IIdio_reserve(diop->di_fd,
					f->io_reserved_bytes,
					reservation - f->io_reserved_bytes,
					err_code);
			    if (small_status == OK)
			    {
				f->io_reserved_bytes = reservation;
			    }
			    else
			    {
				if (small_status != DI_BADFILE)
				    break;
				/* Fallocate not supported, turn off
				** "reserve" strategy, continue without.
				*/
				small_status = OK;
				FPROP_ALLOCSTRATEGY_SET(f->io_fprop, FPROP_ALLOCSTRATEGY_VIRT);
			    }
			}
		    }
		} /* end reservations */

		/* Zero-fill the new space, Di_zero_bufsize bytes at a
		** time; the final write may be a partial buffer.
		*/
		while ( pages_remaining > 0 )
		{
		    if ( pages_remaining < pages_at_a_time )
			buf_size = pages_remaining *
				    f->io_bytes_per_page;
		    else
			buf_size = Di_zero_bufsize;

# if  defined(OS_THREADS_USED) && !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
		    /* pwrite(64) seeks+writes atomically. */
		    bytes_written =
#ifdef LARGEFILE64
			pwrite64( diop->di_fd, Di_zero_buffer, 
				    buf_size, lseek_offset );
#else /*  LARGEFILE64 */
			pwrite( diop->di_fd, Di_zero_buffer, 
				    buf_size, lseek_offset );
#endif /* LARGEFILE64 */
# else /* OS_THREADS_USED  !xCL_NO_ATOMIC_READ_WRITE_IO */
		    bytes_written =
			IIdio_write( diop->di_fd, Di_zero_buffer, 
				    buf_size, 
				    lseek_offset, 
				    &lseek_offset, 
				    f->io_fprop,
				    err_code );
# endif /* OS_THREADS_USED */

		    if ( bytes_written != buf_size )
		    {
			SETCLERR(err_code, 0, ER_write);
			small_status = DI_BADEXTEND;
			break;
		    }

		    lseek_offset += buf_size;
		    /* May drive pages_remaining negative on the last
		    ** (partial) buffer; loop condition handles that.
		    */
		    pages_remaining -= pages_at_a_time;
		}

		/* Propagate a failure from the zero-fill loop. */
		if ( small_status != OK )
		    break;
	    }
	}

	/* lseek_ret is the new EOF in bytes (all success paths set it);
	** convert to the 0-based number of the last allocated page.
	*/
	*end_of_file = ( lseek_ret / f->io_bytes_per_page) - 1;

    } while (FALSE);

    if (big_status == OK && small_status == OK)
    {
	/*
	** Update the current allocated end-of-file under mutex protection
	*/
	last_page = *end_of_file + n;
	if (last_page > f->io_alloc_eof)
	    f->io_alloc_eof = last_page;
    }

# ifdef OS_THREADS_USED
    if ((f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_unlock( &f->io_sem );
# endif /* OS_THREADS_USED */

    /* big_status (communication/internal error) takes precedence. */
    if ( big_status != OK )
	small_status = big_status;

    if ( small_status != OK )
	DIlru_set_di_error( &small_status, err_code, intern_status,
			    DI_GENERAL_ERR);

    return(small_status);

}
Esempio n. 2
0
/*{
** Name: DI_inproc_write -   writes page(s) to a file on disk.
**
** Description:
**	This routine was created to make DIwrite more readable once
**	error checking had been added. See DIwrite for comments.
**
** Inputs:
**      f                    Pointer to the DI file
**                           context needed to do I/O.
**	diop		     Pointer to dilru file context.
**      buf                  Pointer to page(s) to write.
**      page                 Value indicating page(s) to write.
**	num_of_pages	     number of pages to write
**      
** Outputs:
**      err_code             Pointer to a variable used
**                           to return operating system 
**                           errors.
**    Returns:
**          OK
**	    other errors.
**    Exceptions:
**        none
**
** Side Effects:
**        none
**
** History:
**    30-nov-1992 (rmuth)
**	    Created.
**    03-jun-1996 (canor01)
**	    Note in the scb that this is a DI wait.
**    05-May-1997 (merja01)
**      Changed preprocessor stmt for pwrite.  Not all platforms
**      using OS_THREADS have a pwrite function.  This function
**      seems to only be available on Solaris 2.4 where async IO
**      is not yet supported.
**    14-July-1997 (schte01)
**      For those platforms that do direct i/o (where the
**      seek and the write are separate functions), do not release and
**      reaquire the semaphore on the DI_IO block. This will protect
**      against i/o being done by a different thread in between the 
**      lseek and the write.
**    14-Aug-1997 (schte01)    
**      Add xCL_DIRECT_IO as a condition to the 14-July-1997 change
**      instead of the test for !xCL_ASYNCH_IO.
**	22-Dec-1998 (jenjo02)
**	    If DI_FD_PER_THREAD is defined, call IIdio_write() instead of
**	    pwrite().
**	01-oct-1998 (somsa01)
**	    Return DI_NODISKSPACE when we are out of disk space.
**  01-Apr-2004 (fanch01)
**      Add O_DIRECT support on Linux depending on the filesystem
**      properties, pagesize.  Fixups for misaligned buffers on read()
**      and write() operations.
**    13-apr-04 (toumi01)
**	Move stack variable declaration to support "standard" C compilers.
**	29-Jan-2005 (schka24)
**	    Ditch attempt to gather diow timing stats, not useful in
**	    the real world and generates excess syscalls on some platforms.
**	15-Mar-2006 (jenjo02)
**	    io_sem is not needed with thread affinity.
**	6-Nov-2009 (kschendel) SIR 122757
**	    Make io-sem a SYNCH, avoid entirely if PRIVATE.
**	    Delete copy-to-align, caller is supposed to do it now.
**	    Don't attempt SCB updating if not backend.
*/
static STATUS
DI_inproc_write(
    DI_IO	*f,
    DI_OP	*diop,
    char        *buf,
    i4	page,
    i4	num_of_pages,
    CL_ERR_DESC *err_code )
{
    STATUS	status = OK;
    CS_SCB	*scb;		/* only set/used when Di_backend is true */
    i4		saved_state;	/* valid only when scb != NULL */

    /* unix variables */
    int		bytes_written;
    int		bytes_to_write;
    OFFSET_TYPE lseek_offset;
    /* 
    ** seek to place to write 
    */
    /* Widen before multiplying so offsets past 2GB don't overflow. */
    lseek_offset = 
	(OFFSET_TYPE)(f->io_bytes_per_page) * (OFFSET_TYPE)page;

    bytes_to_write = (f->io_bytes_per_page * (num_of_pages));

    if (Di_backend)
    {
	/* Mark the session as waiting on I/O and account the write in
	** the server-wide statistics (log-file vs ordinary DI write).
	*/
	CSget_scb(&scb);
	if ( scb )
	{
	    saved_state = scb->cs_state;
	    scb->cs_state = CS_EVENT_WAIT;

	    if (f->io_open_flags & DI_O_LOG_FILE_MASK)
	    {
		scb->cs_memory = CS_LIOW_MASK;
		scb->cs_liow++;
		Cs_srv_block.cs_wtstatistics.cs_liow_done++;
		Cs_srv_block.cs_wtstatistics.cs_liow_waits++;
		Cs_srv_block.cs_wtstatistics.cs_liow_kbytes
		    += bytes_to_write / 1024;
	    }
	    else
	    {
		scb->cs_memory = CS_DIOW_MASK;
		scb->cs_diow++;
		Cs_srv_block.cs_wtstatistics.cs_diow_done++;
		Cs_srv_block.cs_wtstatistics.cs_diow_waits++;
		Cs_srv_block.cs_wtstatistics.cs_diow_kbytes
		    += bytes_to_write / 1024;
	    }
	}
    }

    /* Without atomic seek+write (no pwrite), the separate seek/write in
    ** IIdio_write must be serialized against other threads on this fd.
    */
# if  defined(OS_THREADS_USED) && defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    if ( !Di_thread_affinity && (f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_lock( &f->io_sem );
# endif /* OS_THREADS_USED && xCL_NO_ATOMIC_READ_WRITE_IO */

# if  defined(OS_THREADS_USED) && !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    bytes_written =
#ifdef LARGEFILE64
 	 pwrite64( (int)diop->di_fd, buf, bytes_to_write, lseek_offset );
#else /*  LARGEFILE64 */
 	 pwrite( (int)diop->di_fd, buf, bytes_to_write, lseek_offset );
#endif /* LARGEFILE64 */
# else /* OS_THREADS_USED  !xCL_NO_ATOMIC_READ_WRITE_IO */
    bytes_written =
 	 IIdio_write( (int)diop->di_fd, buf, bytes_to_write,
 	 	       lseek_offset, 0, 
		       f->io_fprop,
		       err_code );
# endif /* OS_THREADS_USED */

    if ( bytes_written != bytes_to_write )
    {
	SETCLERR(err_code, 0, ER_write);

	/* Map the OS errno to the closest DI status. */
	switch( err_code->errnum )
	{
	case EFBIG:
	    status = DI_BADEXTEND;
	    break;
	case ENOSPC:
	    status = DI_NODISKSPACE;
	    break;
#ifdef EDQUOTA
	case EDQUOT:
	    status = DI_EXCEED_LIMIT;
	    break;
#endif
	default:
	    /* errnum == 0 means a short write, not an OS error. */
	    if (err_code->errnum == 0)
		status = DI_ENDFILE;
	    else
		status = DI_BADWRITE;
	    break;
	}
    }

# if  defined(OS_THREADS_USED) && defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    if ( !Di_thread_affinity && (f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_unlock( &f->io_sem );
# endif /* OS_THREADS_USED && xCL_NO_ATOMIC_READ_WRITE_IO */

    /* Restore session state; scb is only read when Di_backend is set
    ** (short-circuit keeps the otherwise-uninitialized scb untouched).
    */
    if ( Di_backend && scb )
    {

	scb->cs_memory &= ~(CS_DIOW_MASK | CS_LIOW_MASK);
	scb->cs_state = saved_state;
    }

    return( status );
}
Esempio n. 3
0
/*{
** Name: do_writev -  Perform writev() call.
**
** Description:
**	This function collects the queued write requests, 
**	chooses the optimum function to perform the write(s),
**	and invokes the completion handler for each request.
**
** Inputs:
**	DI_TGIO * tgio  - Control block for current thread. 
**      
** Outputs:
**    None.
**
** Returns:
**    OK
**    FAIL - One of more of the write requests failed.
**
**    Exceptions:
**        none
**
** Side Effects:
**	  The completion handler for each I/O request is invoked.
**
** History:
**	19-May-1999 (jenjo02)
**	    Created.
**	09-Jul-1999 (jenjo02)
**	    If queued list is ordered, skip the quicksort.
**	09-Apr-2001 (jenjo02)
**	    Increment first gio's io_count stat for each physical I/O,
**	    gw_pages for multi-page writes.
**	05-Nov-2002 (jenjo02)
**	    Cleaned up use of io_sem: only write() and writev() need
**	    the mutex to protect the (separate) seek. pwrite(64)
**	    atomically seeks and does not need the mutex.
**	25-Aug-2005 (schka24)
**	    Don't bother with IO timing, too slow on some platforms (Linux)
**	    and the results aren't all that interesting.
**	14-Oct-2005 (jenjo02)
**	    Chris's file descriptor properties now cached in io_fprop
**	    (file properties) and established on the first open, 
**	    not every open.
**	24-Jan-2006 (jenjo02)
**	    Break on change in file ("f"), then lru-open to get an
**	    FD, do the write(v), and lru_release the FD. This keeps
**	    gather_write from hogging FDs while waiting for the
**	    signal to actually do something.
**	15-Mar-2006 (jenjo02)
**	    f->io_sem not needed if running with thread affinity,
**	    the fd is not shared by multiple threads.
*/
static STATUS
do_writev( DI_TGIO * tgio, CL_ERR_DESC *err_code )
{
    CL_ERR_DESC lerr_code;
    STATUS 	big_status = OK, small_status = OK;
    i4 		i, j, k;
    DI_GIO 	*gio, *first_gio;
    DI_IO	*f;
    DI_OP	*diop;
    OFFSET_TYPE next_offset, lseek_offset;
    i4		bytes_to_write, bytes_written;
    i4		saved_state;

    i4		num_writev = 0, num_write = 0;
    i4		num_writev_gio = 0, num_write_gio = 0;

    /* Establish iov_max: on sgi it is discovered once via sysconf and
    ** clamped to [16, 2048]; elsewhere the compile-time IOV_MAX is used.
    */
#if defined(sgi_us5)
    if( iov_max == 0 )
    {
	iov_max = sysconf(_SC_IOV_MAX);
	if( iov_max <= 0 )
        {
	    iov_max = 16;	/* arbitrary minimum value */
#ifdef DEBUG_THIS_PUPPY
	    /* NOTE(review): "f" is not yet assigned here (set at the top
	    ** of the main loop below) — this debug TRdisplay reads an
	    ** uninitialized pointer. Debug-only, sgi-only, but a bug.
	    */
	    TRdisplay("%@ %x do_writev: %t ERROR sysconf failed with %d\n",
		    tgio->tgio_scb->cs_self, 
		    f->io_l_filename, f->io_filename,
		    iov_max);
#endif /* DEBUG_THIS_PUPPY */
        }
        else if( iov_max > 2048 )
        {
	    iov_max = 2048;	/* arbitrary maximum value */
        }
    }
#else
    iov_max = IOV_MAX;
#endif

    /* If unordered, sort the queued list into file,offset order */
    if ( tgio->tgio_state & TGIO_UNORDERED )
    {
	gio_sort( tgio->tgio_queue, 0, tgio->tgio_queued-1 );
	tgio->tgio_state &= ~(TGIO_UNORDERED);
    }


    /*
    ** Method:
    **
    **	Collect requests by file/offset into an iovec until
    **	the next file offset becomes discontiguous. Additionally, if
    **	the buffer addresses are contiguous, colaesce those requests.
    **
    **  Up to IOV_MAX iovecs can be written by a single writev().
    **
    **	If but a single iovec results, the probably-more-efficient
    **	function (p)write() is called instead of writev().
    */
    k = 0;

    while ( (j = k) < tgio->tgio_queued )
    {
	#if defined(sgi_us5)
		struct iovec iov[iov_max];
	#else
		struct iovec iov[IOV_MAX];
	#endif

	/*
	** "i" indexes the current iovec element
	** "j" is the first GIO used in this iovec array
	** "k" is the current GIO in the queue
	*/
	i = 0;
	
	gio = first_gio = tgio->tgio_queue[j];
	f = gio->gio_f;
	lseek_offset = next_offset = gio->gio_offset;
	small_status = OK;

	/* iov[0] starts zero-length at the first buffer so the
	** coalesce test below absorbs the first GIO naturally.
	*/
	iov[0].iov_base = gio->gio_buf;
	iov[0].iov_len  = 0;

	do
	{
	    /* If this buffer is contiguous with previous, coalesce it */
	    if ( (char *)iov[i].iov_base + iov[i].iov_len == gio->gio_buf )
	    {
		iov[i].iov_len += gio->gio_len;
	    }
	    /* Initialize next iovec if any remain */
	    else if ( i < iov_max - 1 )
	    {
		i++;
		iov[i].iov_base = gio->gio_buf;
		iov[i].iov_len  = gio->gio_len;
	    }
	    else
		break;

	    next_offset += gio->gio_len;

	} while ( ++k < tgio->tgio_queued
		    && (gio = tgio->tgio_queue[k])
		    && gio->gio_f == f
		    && gio->gio_offset == next_offset );

	/* "k" indexes the next, unprocessed GIO */

	bytes_to_write = next_offset - lseek_offset;
	
	saved_state = tgio->tgio_scb->cs_state;
	tgio->tgio_scb->cs_state = CS_EVENT_WAIT;
	tgio->tgio_scb->cs_memory = CS_DIOW_MASK;

	/* Accumulate multi-page write stats */
	if ( k - j > 1 )
	{
	    /*
	    ** Using the first gio, count
	    ** the number of multi-pages written (k-j)
	    ** and a single I/O.
	    */
	    if ( first_gio->gio_io_count )
		++*first_gio->gio_io_count;
	    if ( first_gio->gio_gw_pages )
		*first_gio->gio_gw_pages += k - j;
	}

	/* Count a single I/O write for server */
	tgio->tgio_scb->cs_diow++;
	Cs_srv_block.cs_wtstatistics.cs_diow_done++;

	/* Count a single I/O wait for server */
	Cs_srv_block.cs_wtstatistics.cs_diow_waits++;

	/* Accumulate number of KB written by this I/O */
	Cs_srv_block.cs_wtstatistics.cs_diow_kbytes
	    += bytes_to_write / 1024;
	
	/* Now get an FD to do the write(v) */
	diop = (DI_OP*)&first_gio->gio_diop;
	/* NOTE(review): this early return leaves cs_state/cs_memory
	** modified (CS_EVENT_WAIT/CS_DIOW_MASK) — saved_state is not
	** restored on this path. Confirm whether DIlru_open failure
	** here is fatal to the session anyway.
	*/
	if ( big_status = DIlru_open(f, FALSE, diop, err_code) )
	    return(big_status);

#ifdef DEBUG_THIS_PUPPY
	{
	    i4	x;
	    i8	offset = lseek_offset;

	    TRdisplay("%@ %p do_writev: %~t doing %d todo %d fd %d lseek from %ld\n",
		    tgio->tgio_scb->cs_self, 
		    f->io_l_filename, f->io_filename,
		    i+1, tgio->tgio_queued - j,
		    diop->di_fd, offset);
	    for (x = 0; x <= i; x++)
	    {
	TRdisplay("%@ do_writev: iovec[%d] base %p bytes %d (page %d for %d)\n",
			x,
			iov[x].iov_base, iov[x].iov_len,
			(i4)(offset/f->io_bytes_per_page),
			iov[x].iov_len/f->io_bytes_per_page);
		offset += iov[x].iov_len;
	    }
	}
#endif /* DEBUG_THIS_PUPPY */

	/* If more than one iovec, seek and use writev */
	/* (i was the last used index; post-increment makes it a count) */
	if ( i++ )
	{
	    /* writev needs seek mutex protection */
	    if ( !Di_thread_affinity )
		CS_synch_lock( &f->io_sem );
	    
	    num_writev++;
	    num_writev_gio += k - j;

	    bytes_written = 
		IIdio_writev( diop->di_fd, 
				(char *)iov,
				i,
				lseek_offset, 0, 
				f->io_fprop,
				err_code);
	    if ( !Di_thread_affinity )
		CS_synch_unlock( &f->io_sem );
	}
	else
	{
	    num_write++;
	    num_write_gio += k - j;

# if  !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
	    /* pwrite(64) needs no seek mutex protection */
	    bytes_written =
#ifdef LARGEFILE64
	     pwrite64( diop->di_fd, 
			iov[0].iov_base, 
			bytes_to_write, 
			lseek_offset );
#else /*  LARGEFILE64 */
	     pwrite( diop->di_fd,
			iov[0].iov_base, 
			bytes_to_write, 
			lseek_offset );
#endif /* LARGEFILE64 */
	    if (bytes_written != bytes_to_write)
		SETCLERR(err_code, 0, ER_write);
# else /* !xCL_NO_ATOMIC_READ_WRITE_IO */
	    /* write() needs seek mutex protection */
	    if ( !Di_thread_affinity )
		CS_synch_lock( &f->io_sem );

	    bytes_written =
	     IIdio_write( diop->di_fd,
			    iov[0].iov_base, 
			    bytes_to_write, 
			    lseek_offset, 0, 
			    f->io_fprop,
			    err_code );
	    if ( !Di_thread_affinity )
		CS_synch_unlock( &f->io_sem );

# endif /* !xCL_NO_ATOMIC_READ_WRITE_IO */
	}

	/* Release the FD */
	(VOID)DIlru_release( diop, &lerr_code );
	    
	tgio->tgio_scb->cs_memory &= ~(CS_DIOW_MASK);
	tgio->tgio_scb->cs_state = saved_state;

	if (bytes_written != bytes_to_write)
	{
	    /* Map errno to a DI status for this batch's completions. */
	    switch ( err_code->errnum )
	    {
		case EFBIG:
		    small_status = DI_BADEXTEND;
		    break;
		case ENOSPC:
		    small_status = DI_EXCEED_LIMIT;
		    break;
#ifdef EDQUOTA
		case EDQUOT:
		    small_status = DI_EXCEED_LIMIT;
		    break;
#endif
		default:
		    /* errnum == 0 means a short write, not an OS error. */
		    if (err_code->errnum == 0)
			small_status = DI_ENDFILE;
		    else
			small_status = DI_BADWRITE;
		    break;
	    }
	    /* Preserve the worst status from all the writes */
	    big_status = (big_status) ? big_status : small_status;
	}

	/* Invoke completion handler for each GIO written */
	do 
	{
	    gio = tgio->tgio_queue[j];
	    (gio->gio_evcomp)( gio->gio_data, small_status, err_code );

	} while ( ++j < k );
    }

#ifdef DEBUG_THIS_PUPPY
    TRdisplay("%@ %p do_writev: %d write requests completed using %d(%d) writev, %d(%d) write\n",
		tgio->tgio_scb->cs_self, 
		tgio->tgio_queued, 
		num_writev, num_writev_gio,
		num_write, num_write_gio);
#endif /* DEBUG_THIS_PUPPY */

    /* Clear the queued count(s) */
    tgio->tgio_queued = *tgio->tgio_uqueued = 0;

    return( big_status );
}