static STATUS
DI_galloc(
    DI_IO	*f,
    i4		n,
    DI_OP	*diop,
    i4		*end_of_file,
    CL_ERR_DESC	*err_code)
{
    STATUS	big_status = OK, small_status = OK;
    STATUS	intern_status = OK;
    register DI_SLAVE_CB *disl;
    i4		last_page;
    OFFSET_TYPE	lseek_ret;

    do
    {
# ifdef OS_THREADS_USED
	/* Seek/write must be semaphore protected */
	if ((f->io_fprop & FPROP_PRIVATE) == 0)
	    CS_synch_lock( &f->io_sem );
# endif /* OS_THREADS_USED */

	if (Di_slave)
	{
	    disl = diop->di_evcb;
	    disl->file_op = DI_SL_ZALLOC;
	    disl->length = n * f->io_bytes_per_page;

	    /* Pass file properties to slave */
	    FPROP_COPY(f->io_fprop, disl->io_fprop);

	    DI_slave_send( disl->dest_slave_no, diop,
			   &big_status, &small_status, &intern_status );

	    if (( big_status != OK ) || ( small_status != OK ))
		break;

	    if ( disl->status != OK )
	    {
		STRUCT_ASSIGN_MACRO(disl->errcode, *err_code);
		small_status = DI_BADEXTEND;
		break;
	    }
	    else
	    {
		lseek_ret = disl->length;
	    }
	}
	else
	{
	    /*
	    ** Running without slaves
	    */
	    OFFSET_TYPE	lseek_offset;
	    i8		reservation;
	    i4		buf_size;
	    i4		bytes_written;
	    i4		pages_remaining = n;
	    i4		pages_at_a_time = Di_zero_bufsize /
					  f->io_bytes_per_page;

	    /* find current end-of-file */
	    lseek_ret = IIdio_get_file_eof(diop->di_fd, f->io_fprop);

	    if ( lseek_ret == (OFFSET_TYPE)-1L )
	    {
		SETCLERR(err_code, 0, ER_lseek);
		small_status = DI_BADINFO;
		break;
	    }
	    else
	    {
		lseek_offset = lseek_ret;

		/* If this filesystem can do reservations, see if we
		** should reserve more space.
		** Even though we have to write the zeros anyway, the
		** reservation may well be larger than the zeroing
		** buffer, and this way helps maintain contiguity.
		** Not worth it for tiny writes.
		*/
		if (pages_remaining > 2
		  && FPROP_ALLOCSTRATEGY_GET(f->io_fprop) == FPROP_ALLOCSTRATEGY_RESV)
		{
		    reservation = lseek_offset +
				  (pages_remaining * f->io_bytes_per_page);
		    if (reservation > f->io_reserved_bytes)
		    {
			/* Re-check in case some other server reserved */
			small_status = IIdio_get_reserved(diop->di_fd,
					&f->io_reserved_bytes, err_code);
			if (small_status == OK
			  && reservation > f->io_reserved_bytes)
			{
			    small_status = IIdio_reserve(diop->di_fd,
					f->io_reserved_bytes,
					reservation - f->io_reserved_bytes,
					err_code);
			    if (small_status == OK)
			    {
				f->io_reserved_bytes = reservation;
			    }
			    else
			    {
				if (small_status != DI_BADFILE)
				    break;
				/* Fallocate not supported, turn off
				** "reserve" strategy, continue without.
				*/
				small_status = OK;
				FPROP_ALLOCSTRATEGY_SET(f->io_fprop,
					FPROP_ALLOCSTRATEGY_VIRT);
			    }
			}
		    }
		} /* end reservations */

		while ( pages_remaining > 0 )
		{
		    if ( pages_remaining < pages_at_a_time )
			buf_size = pages_remaining * f->io_bytes_per_page;
		    else
			buf_size = Di_zero_bufsize;

# if defined(OS_THREADS_USED) && !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
		    bytes_written =
#ifdef LARGEFILE64
			pwrite64( diop->di_fd, Di_zero_buffer,
				  buf_size, lseek_offset );
#else /* LARGEFILE64 */
			pwrite( diop->di_fd, Di_zero_buffer,
				buf_size, lseek_offset );
#endif /* LARGEFILE64 */
# else /* OS_THREADS_USED && !xCL_NO_ATOMIC_READ_WRITE_IO */
		    bytes_written =
			IIdio_write( diop->di_fd, Di_zero_buffer,
				     buf_size, lseek_offset,
				     &lseek_offset,
				     f->io_fprop,
				     err_code );
# endif /* OS_THREADS_USED */

		    if ( bytes_written != buf_size )
		    {
			SETCLERR(err_code, 0, ER_write);
			small_status = DI_BADEXTEND;
			break;
		    }

		    lseek_offset += buf_size;
		    pages_remaining -= pages_at_a_time;
		}

		if ( small_status != OK )
		    break;
	    }
	}

	*end_of_file = ( lseek_ret / f->io_bytes_per_page ) - 1;

    } while (FALSE);

    if (big_status == OK && small_status == OK)
    {
	/*
	** Update the current allocated end-of-file under mutex protection
	*/
	last_page = *end_of_file + n;
	if (last_page > f->io_alloc_eof)
	    f->io_alloc_eof = last_page;
    }

# ifdef OS_THREADS_USED
    if ((f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_unlock( &f->io_sem );
# endif /* OS_THREADS_USED */

    if ( big_status != OK )
	small_status = big_status;

    if ( small_status != OK )
	DIlru_set_di_error( &small_status, err_code, intern_status,
			    DI_GENERAL_ERR);

    return(small_status);
}
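
/*
** Illustrative sketch (not part of the build): the zero-fill extension
** technique used by DI_galloc above, reduced to plain POSIX calls.
** The names di_extend_with_zeros, fd and page_size are hypothetical;
** the real code uses Di_zero_buffer/Di_zero_bufsize and pwrite64 where
** LARGEFILE64 is defined, and maps errno to DI_xxx statuses.
*/
#if 0
#include <unistd.h>
#include <sys/types.h>

static int
di_extend_with_zeros( int fd, off_t eof, int pages, int page_size )
{
    static char	zeros[65536];	/* zeroing buffer; static, so all-zero */
    off_t	offset = eof;
    ssize_t	written;
    int		chunk;
    /* assumes page_size <= sizeof(zeros) */
    int		pages_at_a_time = (int)(sizeof(zeros) / page_size);

    while (pages > 0)
    {
	/* The final chunk may be smaller than the zeroing buffer */
	chunk = (pages < pages_at_a_time)
		    ? pages * page_size
		    : (int)sizeof(zeros);

	/* pwrite() seeks and writes in one atomic call, so a shared
	** file descriptor needs no seek mutex here. */
	written = pwrite(fd, zeros, chunk, offset);
	if (written != chunk)
	    return -1;		/* caller inspects errno */

	offset += chunk;
	pages -= pages_at_a_time;
    }
    return 0;
}
#endif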
/*{
** Name: DI_inproc_write - writes page(s) to a file on disk.
**
** Description:
**	This routine was created to make DIwrite more readable once
**	error checking had been added. See DIwrite for comments.
**
** Inputs:
**	f		Pointer to the DI file context needed to do I/O.
**	diop		Pointer to dilru file context.
**	buf		Pointer to page(s) to write.
**	page		Value indicating page(s) to write.
**	num_of_pages	Number of pages to write.
**
** Outputs:
**	err_code	Pointer to a variable used to return
**			operating system errors.
**
** Returns:
**	OK
**	other errors.
**
** Exceptions:
**	none
**
** Side Effects:
**	none
**
** History:
**	30-nov-1992 (rmuth)
**	    Created.
**	03-jun-1996 (canor01)
**	    Note in the scb that this is a DI wait.
**	05-May-1997 (merja01)
**	    Changed preprocessor stmt for pwrite. Not all platforms
**	    using OS_THREADS have a pwrite function. This function
**	    seems to only be available on Solaris 2.4 where async IO
**	    is not yet supported.
**	14-July-1997 (schte01)
**	    For those platforms that do direct i/o (where the
**	    seek and the write are separate functions), do not release and
**	    reacquire the semaphore on the DI_IO block. This will protect
**	    against i/o being done by a different thread in between the
**	    lseek and the write.
**	14-Aug-1997 (schte01)
**	    Add xCL_DIRECT_IO as a condition to the 14-July-1997 change
**	    instead of the test for !xCL_ASYNCH_IO.
**	22-Dec-1998 (jenjo02)
**	    If DI_FD_PER_THREAD is defined, call IIdio_write() instead of
**	    pwrite().
**	01-oct-1998 (somsa01)
**	    Return DI_NODISKSPACE when we are out of disk space.
**	01-Apr-2004 (fanch01)
**	    Add O_DIRECT support on Linux depending on the filesystem
**	    properties, pagesize. Fixups for misaligned buffers on read()
**	    and write() operations.
**	13-apr-04 (toumi01)
**	    Move stack variable declaration to support "standard" C compilers.
**	29-Jan-2005 (schka24)
**	    Ditch attempt to gather diow timing stats, not useful in
**	    the real world and generates excess syscalls on some platforms.
**	15-Mar-2006 (jenjo02)
**	    io_sem is not needed with thread affinity.
**	6-Nov-2009 (kschendel) SIR 122757
**	    Make io-sem a SYNCH, avoid entirely if PRIVATE.
**	    Delete copy-to-align, caller is supposed to do it now.
**	    Don't attempt SCB updating if not backend.
*/
static STATUS
DI_inproc_write(
    DI_IO	*f,
    DI_OP	*diop,
    char	*buf,
    i4		page,
    i4		num_of_pages,
    CL_ERR_DESC	*err_code )
{
    STATUS	status = OK;
    CS_SCB	*scb = NULL;
    i4		saved_state;

    /* unix variables */
    int		bytes_written;
    int		bytes_to_write;
    OFFSET_TYPE	lseek_offset;

    /*
    ** seek to place to write
    */
    lseek_offset = (OFFSET_TYPE)(f->io_bytes_per_page) * (OFFSET_TYPE)page;
    bytes_to_write = (f->io_bytes_per_page * (num_of_pages));

    if (Di_backend)
    {
	CSget_scb(&scb);
	if ( scb )
	{
	    saved_state = scb->cs_state;
	    scb->cs_state = CS_EVENT_WAIT;

	    if (f->io_open_flags & DI_O_LOG_FILE_MASK)
	    {
		scb->cs_memory = CS_LIOW_MASK;
		scb->cs_liow++;
		Cs_srv_block.cs_wtstatistics.cs_liow_done++;
		Cs_srv_block.cs_wtstatistics.cs_liow_waits++;
		Cs_srv_block.cs_wtstatistics.cs_liow_kbytes
		    += bytes_to_write / 1024;
	    }
	    else
	    {
		scb->cs_memory = CS_DIOW_MASK;
		scb->cs_diow++;
		Cs_srv_block.cs_wtstatistics.cs_diow_done++;
		Cs_srv_block.cs_wtstatistics.cs_diow_waits++;
		Cs_srv_block.cs_wtstatistics.cs_diow_kbytes
		    += bytes_to_write / 1024;
	    }
	}
    }

# if defined(OS_THREADS_USED) && defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    if ( !Di_thread_affinity && (f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_lock( &f->io_sem );
# endif /* OS_THREADS_USED && xCL_NO_ATOMIC_READ_WRITE_IO */

# if defined(OS_THREADS_USED) && !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    bytes_written =
#ifdef LARGEFILE64
	pwrite64( (int)diop->di_fd, buf, bytes_to_write, lseek_offset );
#else /* LARGEFILE64 */
	pwrite( (int)diop->di_fd, buf, bytes_to_write, lseek_offset );
#endif /* LARGEFILE64 */
# else /* OS_THREADS_USED && !xCL_NO_ATOMIC_READ_WRITE_IO */
    bytes_written =
	IIdio_write( (int)diop->di_fd, buf,
		     bytes_to_write,
		     lseek_offset, 0,
		     f->io_fprop,
		     err_code );
# endif /* OS_THREADS_USED */

    if ( bytes_written != bytes_to_write )
    {
	SETCLERR(err_code, 0, ER_write);

	switch( err_code->errnum )
	{
	case EFBIG:
	    status = DI_BADEXTEND;
	    break;
	case ENOSPC:
	    status = DI_NODISKSPACE;
	    break;
#ifdef EDQUOT
	case EDQUOT:
	    status = DI_EXCEED_LIMIT;
	    break;
#endif
	default:
	    if (err_code->errnum == 0)
		status = DI_ENDFILE;
	    else
		status = DI_BADWRITE;
	    break;
	}
    }

# if defined(OS_THREADS_USED) && defined(xCL_NO_ATOMIC_READ_WRITE_IO)
    if ( !Di_thread_affinity && (f->io_fprop & FPROP_PRIVATE) == 0)
	CS_synch_unlock( &f->io_sem );
# endif /* OS_THREADS_USED && xCL_NO_ATOMIC_READ_WRITE_IO */

    if ( Di_backend && scb )
    {
	scb->cs_memory &= ~(CS_DIOW_MASK | CS_LIOW_MASK);
	scb->cs_state = saved_state;
    }

    return( status );
}
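
/*
** Illustrative sketch (not part of the build): why io_sem is taken only
** when positioned writes are not atomic. With plain lseek()+write(),
** two threads sharing one fd can interleave between the seek and the
** write; pwrite() carries the offset in the call itself. The macro
** HAVE_ATOMIC_PWRITE, the mutex name and di_positioned_write() are
** hypothetical stand-ins for !xCL_NO_ATOMIC_READ_WRITE_IO, f->io_sem
** and IIdio_write().
*/
#if 0
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>

static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;

static ssize_t
di_positioned_write( int fd, const void *buf, size_t len, off_t offset )
{
#if defined(HAVE_ATOMIC_PWRITE)
    /* One syscall: the offset travels with the request, no lock needed */
    return pwrite(fd, buf, len, offset);
#else
    /* Two syscalls: hold the mutex across seek + write so another
    ** thread cannot move the shared file position in between. */
    ssize_t n = -1;

    pthread_mutex_lock(&io_mutex);
    if (lseek(fd, offset, SEEK_SET) == offset)
	n = write(fd, buf, len);
    pthread_mutex_unlock(&io_mutex);
    return n;
#endif
}
#endif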
/*{
** Name: do_writev - Perform writev() call.
**
** Description:
**	This function collects the queued write requests,
**	chooses the optimum function to perform the write(s),
**	and invokes the completion handler for each request.
**
** Inputs:
**	DI_TGIO * tgio	- Control block for current thread.
**
** Outputs:
**	None.
**
** Returns:
**	OK
**	FAIL - One or more of the write requests failed.
**
** Exceptions:
**	none
**
** Side Effects:
**	The completion handler for each I/O request is invoked.
**
** History:
**	19-May-1999 (jenjo02)
**	    Created.
**	09-Jul-1999 (jenjo02)
**	    If queued list is ordered, skip the quicksort.
**	09-Apr-2001 (jenjo02)
**	    Increment first gio's io_count stat for each physical I/O,
**	    gw_pages for multi-page writes.
**	05-Nov-2002 (jenjo02)
**	    Cleaned up use of io_sem: only write() and writev() need
**	    the mutex to protect the (separate) seek. pwrite(64)
**	    atomically seeks and does not need the mutex.
**	25-Aug-2005 (schka24)
**	    Don't bother with IO timing, too slow on some platforms (Linux)
**	    and the results aren't all that interesting.
**	14-Oct-2005 (jenjo02)
**	    Chris's file descriptor properties now cached in io_fprop
**	    (file properties) and established on the first open,
**	    not every open.
**	24-Jan-2006 (jenjo02)
**	    Break on change in file ("f"), then lru-open to get an
**	    FD, do the write(v), and lru_release the FD. This keeps
**	    gather_write from hogging FDs while waiting for the
**	    signal to actually do something.
**	15-Mar-2006 (jenjo02)
**	    f->io_sem not needed if running with thread affinity,
**	    the fd is not shared by multiple threads.
*/
static STATUS
do_writev( DI_TGIO * tgio, CL_ERR_DESC *err_code )
{
    CL_ERR_DESC	lerr_code;
    STATUS	big_status = OK, small_status = OK;
    i4		i, j, k;
    DI_GIO	*gio, *first_gio;
    DI_IO	*f;
    DI_OP	*diop;
    OFFSET_TYPE	next_offset, lseek_offset;
    i4		bytes_to_write, bytes_written;
    i4		saved_state;
    i4		num_writev = 0, num_write = 0;
    i4		num_writev_gio = 0, num_write_gio = 0;

#if defined(sgi_us5)
    if( iov_max == 0 )
    {
	iov_max = sysconf(_SC_IOV_MAX);
	if( iov_max <= 0 )
	{
	    iov_max = 16;	/* arbitrary minimum value */
#ifdef DEBUG_THIS_PUPPY
	    TRdisplay("%@ %x do_writev: ERROR sysconf failed with %d\n",
		tgio->tgio_scb->cs_self, iov_max);
#endif /* DEBUG_THIS_PUPPY */
	}
	else if( iov_max > 2048 )
	{
	    iov_max = 2048;	/* arbitrary maximum value */
	}
    }
#else
    iov_max = IOV_MAX;
#endif

    /* If unordered, sort the queued list into file,offset order */
    if ( tgio->tgio_state & TGIO_UNORDERED )
    {
	gio_sort( tgio->tgio_queue, 0, tgio->tgio_queued-1 );
	tgio->tgio_state &= ~(TGIO_UNORDERED);
    }

    /*
    ** Method:
    **
    ** Collect requests by file/offset into an iovec until
    ** the next file offset becomes discontiguous. Additionally, if
    ** the buffer addresses are contiguous, coalesce those requests.
    **
    ** Up to IOV_MAX iovecs can be written by a single writev().
    **
    ** If but a single iovec results, the probably-more-efficient
    ** function (p)write() is called instead of writev().
    */
    k = 0;
    while ( (j = k) < tgio->tgio_queued )
    {
#if defined(sgi_us5)
	struct iovec iov[iov_max];
#else
	struct iovec iov[IOV_MAX];
#endif

	/*
	** "i" indexes the current iovec element
	** "j" is the first GIO used in this iovec array
	** "k" is the current GIO in the queue
	*/
	i = 0;

	gio = first_gio = tgio->tgio_queue[j];
	f = gio->gio_f;
	lseek_offset = next_offset = gio->gio_offset;
	small_status = OK;

	iov[0].iov_base = gio->gio_buf;
	iov[0].iov_len = 0;

	do
	{
	    /* If this buffer is contiguous with previous, coalesce it */
	    if ( (char *)iov[i].iov_base + iov[i].iov_len == gio->gio_buf )
	    {
		iov[i].iov_len += gio->gio_len;
	    }
	    /* Initialize next iovec if any remain */
	    else if ( i < iov_max - 1 )
	    {
		i++;
		iov[i].iov_base = gio->gio_buf;
		iov[i].iov_len = gio->gio_len;
	    }
	    else
		break;

	    next_offset += gio->gio_len;

	} while ( ++k < tgio->tgio_queued
	      &&  (gio = tgio->tgio_queue[k])
	      &&  gio->gio_f == f
	      &&  gio->gio_offset == next_offset );

	/* "k" indexes the next, unprocessed GIO */

	bytes_to_write = next_offset - lseek_offset;

	saved_state = tgio->tgio_scb->cs_state;
	tgio->tgio_scb->cs_state = CS_EVENT_WAIT;
	tgio->tgio_scb->cs_memory = CS_DIOW_MASK;

	/* Accumulate multi-page write stats */
	if ( k - j > 1 )
	{
	    /*
	    ** Using the first gio, count
	    ** the number of multi-pages written (k-j)
	    ** and a single I/O.
	    */
	    if ( first_gio->gio_io_count )
		++*first_gio->gio_io_count;
	    if ( first_gio->gio_gw_pages )
		*first_gio->gio_gw_pages += k - j;
	}

	/* Count a single I/O write for server */
	tgio->tgio_scb->cs_diow++;
	Cs_srv_block.cs_wtstatistics.cs_diow_done++;
	/* Count a single I/O wait for server */
	Cs_srv_block.cs_wtstatistics.cs_diow_waits++;
	/* Accumulate number of KB written by this I/O */
	Cs_srv_block.cs_wtstatistics.cs_diow_kbytes
	    += bytes_to_write / 1024;

	/* Now get an FD to do the write(v) */
	diop = (DI_OP*)&first_gio->gio_diop;
	if ( big_status = DIlru_open(f, FALSE, diop, err_code) )
	    return(big_status);

#ifdef DEBUG_THIS_PUPPY
	{
	    i4	x;
	    i8	offset = lseek_offset;

	    TRdisplay("%@ %p do_writev: %~t doing %d todo %d fd %d lseek from %ld\n",
		tgio->tgio_scb->cs_self,
		f->io_l_filename, f->io_filename,
		i+1, tgio->tgio_queued - j,
		diop->di_fd, offset);
	    for (x = 0; x <= i; x++)
	    {
		TRdisplay("%@ do_writev: iovec[%d] base %p bytes %d (page %d for %d)\n",
		    x, iov[x].iov_base, iov[x].iov_len,
		    (i4)(offset/f->io_bytes_per_page),
		    iov[x].iov_len/f->io_bytes_per_page);
		offset += iov[x].iov_len;
	    }
	}
#endif /* DEBUG_THIS_PUPPY */

	/* If more than one iovec, seek and use writev */
	if ( i++ )
	{
	    /* writev needs seek mutex protection */
	    if ( !Di_thread_affinity )
		CS_synch_lock( &f->io_sem );

	    num_writev++;
	    num_writev_gio += k - j;

	    bytes_written =
		IIdio_writev( diop->di_fd,
			      (char *)iov,
			      i,
			      lseek_offset, 0,
			      f->io_fprop,
			      err_code);

	    if ( !Di_thread_affinity )
		CS_synch_unlock( &f->io_sem );
	}
	else
	{
	    num_write++;
	    num_write_gio += k - j;

# if !defined(xCL_NO_ATOMIC_READ_WRITE_IO)
	    /* pwrite(64) needs no seek mutex protection */
	    bytes_written =
#ifdef LARGEFILE64
		pwrite64( diop->di_fd, iov[0].iov_base,
			  bytes_to_write, lseek_offset );
#else /* LARGEFILE64 */
		pwrite( diop->di_fd, iov[0].iov_base,
			bytes_to_write, lseek_offset );
#endif /* LARGEFILE64 */
	    if (bytes_written != bytes_to_write)
		SETCLERR(err_code, 0, ER_write);
# else /* !xCL_NO_ATOMIC_READ_WRITE_IO */
	    /* write() needs seek mutex protection */
	    if ( !Di_thread_affinity )
		CS_synch_lock( &f->io_sem );

	    bytes_written =
		IIdio_write( diop->di_fd, iov[0].iov_base,
			     bytes_to_write,
			     lseek_offset, 0,
			     f->io_fprop,
			     err_code );

	    if ( !Di_thread_affinity )
		CS_synch_unlock( &f->io_sem );
# endif /* !xCL_NO_ATOMIC_READ_WRITE_IO */
	}

	/* Release the FD */
	(VOID)DIlru_release( diop, &lerr_code );

	tgio->tgio_scb->cs_memory &= ~(CS_DIOW_MASK);
	tgio->tgio_scb->cs_state = saved_state;

	if (bytes_written != bytes_to_write)
	{
	    switch ( err_code->errnum )
	    {
	    case EFBIG:
		small_status = DI_BADEXTEND;
		break;
	    case ENOSPC:
		small_status = DI_EXCEED_LIMIT;
		break;
#ifdef EDQUOT
	    case EDQUOT:
		small_status = DI_EXCEED_LIMIT;
		break;
#endif
	    default:
		if (err_code->errnum == 0)
		    small_status = DI_ENDFILE;
		else
		    small_status = DI_BADWRITE;
		break;
	    }
	    /* Preserve the worst status from all the writes */
	    big_status = (big_status) ? big_status : small_status;
	}

	/* Invoke completion handler for each GIO written */
	do
	{
	    gio = tgio->tgio_queue[j];
	    (gio->gio_evcomp)( gio->gio_data, small_status, err_code );
	} while ( ++j < k );
    }

#ifdef DEBUG_THIS_PUPPY
    TRdisplay("%@ %p do_writev: %d write requests completed using %d(%d) writev, %d(%d) write\n",
	tgio->tgio_scb->cs_self,
	tgio->tgio_queued,
	num_writev, num_writev_gio,
	num_write, num_write_gio);
#endif /* DEBUG_THIS_PUPPY */

    /* Clear the queued count(s) */
    tgio->tgio_queued = *tgio->tgio_uqueued = 0;

    return( big_status );
}
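
/*
** Illustrative sketch (not part of the build): the gather-write method
** described in do_writev() above, reduced to plain POSIX calls. Queued
** requests contiguous in the file are batched into one iovec array;
** requests whose buffers are also adjacent in memory are coalesced into
** a single iovec entry, as the do_writev() inner loop does. The wreq
** struct and batch_contiguous_writes() are hypothetical; pwritev() is a
** Linux/BSD extension, not available everywhere.
*/
#if 0
#include <unistd.h>
#include <limits.h>
#include <sys/uio.h>

struct wreq {
    char	*buf;		/* caller's page buffer */
    size_t	len;		/* bytes to write */
    off_t	offset;		/* file offset; queue sorted by offset */
};

/* Write the first contiguous run of queue[0..n-1] (one file, sorted by
** offset); returns the byte count written for that run, or -1. */
static ssize_t
batch_contiguous_writes( int fd, struct wreq *queue, int n )
{
    struct iovec iov[IOV_MAX];
    int		i = 0, k;
    off_t	start = queue[0].offset;
    off_t	next = start;

    iov[0].iov_base = queue[0].buf;
    iov[0].iov_len  = 0;

    for (k = 0; k < n && queue[k].offset == next; k++)
    {
	if ((char *)iov[i].iov_base + iov[i].iov_len == queue[k].buf)
	    iov[i].iov_len += queue[k].len;	/* adjacent buffers: coalesce */
	else if (i < IOV_MAX - 1)
	{
	    ++i;				/* start a new iovec entry */
	    iov[i].iov_base = queue[k].buf;
	    iov[i].iov_len  = queue[k].len;
	}
	else
	    break;				/* iovec array full */
	next += queue[k].len;
    }

    /* A single entry: plain pwrite is cheaper than writev */
    if (i == 0)
	return pwrite(fd, iov[0].iov_base, iov[0].iov_len, start);
    return pwritev(fd, iov, i + 1, start);
}
#endif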