/*
 * Write out the superblock buffer.
 *
 * The superblock buffer was not locked during any earlier log push, so it
 * may have become pinned in the meantime.  If it is pinned, force the log
 * first so that the write below does not get stuck waiting for someone
 * (possibly this very thread) to flush the log.
 *
 * Returns the value reported by xfs_bwrite().
 */
STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*sb_bp = xfs_getsb(mp, 0);
	int			rc;

	if (xfs_buf_ispinned(sb_bp))
		xfs_log_force(mp, 0);

	rc = xfs_bwrite(sb_bp);
	xfs_buf_relse(sb_bp);

	return rc;
}
STATIC void xfs_sync_worker( struct work_struct *work) { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_sync_work); int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); xfs_ail_push_all(mp->m_ail); } xfs_syncd_queue_sync(mp); }
/*
 * Wait for the given dquot to be unpinned.  The caller must hold the dquot
 * lock (asserted).  Like most of the dquot pin/unpin routines, this is
 * modelled on the inode code.
 */
void
xfs_qm_dqunpin_wait(
	xfs_dquot_t	*dqp)
{
	SPLDECL(s);

	ASSERT(XFS_DQ_IS_LOCKED(dqp));

	if (dqp->q_pincount != 0) {
		/*
		 * Push the log so that we do not end up waiting here
		 * for too long.
		 */
		xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);

		/* Look again, this time under the pin lock. */
		s = XFS_DQ_PINLOCK(dqp);
		if (dqp->q_pincount != 0) {
			/*
			 * Still pinned: sleep on q_pinwait.  The pin lock
			 * and the saved state are handed to sv_wait(); no
			 * explicit unlock follows it.
			 */
			sv_wait(&(dqp->q_pinwait), PINOD,
				&(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
		} else {
			XFS_DQ_PINUNLOCK(dqp, s);
		}
	}
}
/*
 * Log the core of the root inode in a dummy transaction and then force the
 * log.  The dummy transaction tells log recovery that all earlier
 * transactions are OK.
 *
 * If SYNC_WAIT is set in @flags the log force is issued with XFS_LOG_SYNC
 * in addition to XFS_LOG_FORCE.
 *
 * Returns the error from xfs_trans_reserve() (in which case nothing is
 * logged and no log force is issued) or the result of xfs_trans_commit().
 */
STATIC int
xfs_commit_dummy_trans(
	struct xfs_mount	*mp,
	uint			flags)
{
	struct xfs_inode	*rootip = mp->m_rootip;
	struct xfs_trans	*trans;
	int			force_flags;
	int			error;

	force_flags = (flags & SYNC_WAIT) ?
			(XFS_LOG_FORCE | XFS_LOG_SYNC) : XFS_LOG_FORCE;

	trans = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
	error = xfs_trans_reserve(trans, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (!error) {
		xfs_ilock(rootip, XFS_ILOCK_EXCL);

		xfs_trans_ijoin(trans, rootip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(trans, rootip);
		xfs_trans_log_inode(trans, rootip, XFS_ILOG_CORE);
		error = xfs_trans_commit(trans, 0);

		/* the inode was held across the commit; unlock it here */
		xfs_iunlock(rootip, XFS_ILOCK_EXCL);

		/* the log force ensures this transaction is pushed to disk */
		xfs_log_force(mp, 0, force_flags);
	} else {
		xfs_trans_cancel(trans, 0);
	}

	return error;
}
/* * Every sync period we need to unpin all items, reclaim inodes and sync * disk quotas. We might need to cover the log to indicate that the * filesystem is idle and not frozen. */ STATIC void xfs_sync_worker( struct work_struct *work) { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_sync_work); int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); error = xfs_qm_sync(mp, SYNC_TRYLOCK); /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); } /* queue us up again */ xfs_syncd_queue_sync(mp); }
/* PRIVATE, debugging */ int xfs_qm_internalqcheck( xfs_mount_t *mp) { xfs_ino_t lastino; int done, count; int i; xfs_dqtest_t *d, *e; xfs_dqhash_t *h1; int error; lastino = 0; qmtest_hashmask = 32; count = 5; done = 0; qmtest_nfails = 0; if (! XFS_IS_QUOTA_ON(mp)) return XFS_ERROR(ESRCH); xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); mutex_lock(&qcheck_lock); /* There should be absolutely no quota activity while this is going on. */ qmtest_udqtab = kmem_zalloc(qmtest_hashmask * sizeof(xfs_dqhash_t), KM_SLEEP); qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * sizeof(xfs_dqhash_t), KM_SLEEP); do { /* * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters */ if ((error = xfs_bulkstat(mp, &lastino, &count, xfs_qm_internalqcheck_adjust, NULL, 0, NULL, BULKSTAT_FG_IGET, &done))) { break; } } while (! done); if (error) { cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); } cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { h1 = &qmtest_udqtab[i]; for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d, sizeof(xfs_dqtest_t)); d = e; } h1 = &qmtest_gdqtab[i]; for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { xfs_dqtest_cmp(d); e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d, sizeof(xfs_dqtest_t)); d = e; } } if (qmtest_nfails) { cmn_err(CE_DEBUG, "******** quotacheck failed ********"); cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); } else { cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); } kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); mutex_unlock(&qcheck_lock); return (qmtest_nfails); }
/*
 * Write @size bytes from the user buffer @buf to the file behind @bdp,
 * starting at *@offset.
 *
 * Outline:
 *   1. validate the request (zero length, forced shutdown, direct I/O
 *      alignment) and choose the I/O lock mode;
 *   2. take the ILOCK plus the I/O lock, apply O_APPEND and the maximum
 *      file offset limit, and record a prospective new size;
 *   3. send the DMAPI write event if enabled (at most once);
 *   4. zero the range between the old EOF and the write offset for
 *      buffered writes, then drop the ILOCK;
 *   5. clear setuid/setgid if required;
 *   6. perform the direct or buffered write, retrying after a DMAPI
 *      NOSPACE event;
 *   7. update the in-core file size and handle O_SYNC/IS_SYNC semantics;
 *   8. drop the I/O lock.
 *
 * @ioflags bits used here: IO_ISDIRECT, IO_ISLOCKED, IO_INVIS.
 * @credp is not referenced in this function.
 *
 * Returns the number of bytes written, or a negative error.
 */
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t	*bdp,
	struct file	*file,
	const char	*buf,
	size_t		size,
	loff_t		*offset,
	int		ioflags,
	cred_t		*credp)
{
	xfs_inode_t	*xip;
	xfs_mount_t	*mp;
	ssize_t		ret;
	int		error = 0;
	xfs_fsize_t	isize, new_size;
	xfs_fsize_t	n, limit;
	xfs_iocore_t	*io;
	vnode_t		*vp;
	int		iolock;
	int		eventsent = 0;
	vrwlock_t	locktype;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	/* A zero-length write succeeds without taking any locks. */
	if (size == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
		return -EIO;
	}

	/*
	 * Direct I/O: the user buffer must be aligned to BBMASK and the
	 * offset and size to the mount's block mask.  Direct writes take
	 * the I/O lock shared, buffered writes take it exclusive.
	 */
	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (((__psint_t)buf & BBMASK) ||
		    (*offset & mp->m_blockmask) ||
		    (size  & mp->m_blockmask)) {
			return XFS_ERROR(-EINVAL);
		}
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;
	}

	/*
	 * IO_ISLOCKED: no I/O lock bits are passed to the lock/unlock
	 * calls below; presumably the caller already holds the I/O lock
	 * -- TODO confirm against callers.
	 */
	if (ioflags & IO_ISLOCKED)
		iolock = 0;

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = xip->i_d.di_size;
	limit = XFS_MAXIOFFSET(mp);

	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	/* Clamp the write so that it does not go past the maximum offset. */
	n = limit - *offset;
	if (n <= 0) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		return -EFBIG;
	}
	if (n < size)
		size = n;

	/* Record the prospective new size if the write extends the file. */
	new_size = *offset + size;
	if (new_size > isize) {
		io->io_new_size = new_size;
	}

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = *offset;
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

		/* The ILOCK is dropped around the event; see below. */
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      *offset, size,
				      dmflags, &locktype);
		if (error) {
			if (iolock)
				xfs_iunlock(xip, iolock);
			return -error;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) &&
		    savedsize != xip->i_d.di_size) {
			*offset = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 *
	 * This is only done for buffered writes to a non-empty file.
	 */
	if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
			isize, *offset + size);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			return(-error);
		}
	}
	/* From here on only the I/O lock (if any) is held. */
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 *
	 * NOTE(review): on failure the unlock below is called with iolock
	 * unguarded, which is 0 under IO_ISLOCKED; other paths test iolock
	 * first.  Confirm this is intended.
	 */
	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (error) {
			xfs_iunlock(xip, iolock);
			return -error;
		}
	}

	/* Reject sizes that would be negative as an ssize_t. */
	if ((ssize_t) size < 0) {
		ret = -EINVAL;
		goto error;
	}

	if (!access_ok(VERIFY_READ, buf, size)) {
		ret = -EINVAL;
		goto error;
	}

retry:
	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_inval_cached_pages(vp, io, *offset, 1, 1);
		xfs_rw_enter_trace(XFS_DIOWR_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_direct_write(file, buf, size, offset);
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER,
					io, buf, size, *offset, ioflags);
		ret = do_generic_file_write(file, buf, size, offset);
	}

	if (unlikely(ioflags & IO_INVIS)) {
		/* generic_file_write updates the mtime/ctime but we need
		 * to undo that because this I/O was supposed to be
		 * invisible.
		 */
		struct inode	*inode = LINVFS_GET_IP(vp);
		inode->i_mtime = xip->i_d.di_mtime.t_sec;
		inode->i_ctime = xip->i_d.di_ctime.t_sec;
	} else {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	/*
	 * Out of space with a DMAPI NOSPACE event enabled: drop the rw
	 * lock, send the event, and retry the write from the current
	 * on-disk size.
	 *
	 * NOTE(review): xfs_rwunlock()/xfs_rwlock() are called here
	 * without testing iolock, unlike the exit paths below -- confirm
	 * the IO_ISLOCKED case.
	 */
	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			return -error;
		xfs_rwlock(bdp, locktype);
		*offset = xip->i_d.di_size;
		goto retry;
	}

error:
	/* Nothing was written, or the write failed: unlock and bail. */
	if (ret <= 0) {
		if (iolock)
			xfs_rwunlock(bdp, locktype);
		return ret;
	}

	XFS_STATS_ADD(xs_write_bytes, ret);

	/*
	 * Grow the in-core size if the write went past it.  The size is
	 * tested once without the ILOCK and again with it held.
	 */
	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			struct inode	*inode = LINVFS_GET_IP(vp);

			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
			mark_inode_dirty_sync(inode);
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {

		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */

		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */

			xfs_inode_log_item_t	*iip;
			xfs_lsn_t		lsn;

			iip = xip->i_itemp;
			if (iip && iip->ili_last_lsn) {
				lsn = iip->ili_last_lsn;
				xfs_log_force(mp, lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 *
			 * NOTE(review): an error from the reserve or the
			 * commit is stored in 'error' but the function still
			 * returns 'ret' below.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
			}
		}
	} /* end of O_SYNC / IS_SYNC handling */

	/*
	 * If we are coming from an nfsd thread then insert into the
	 * reference cache.
	 */

	if (!strcmp(current->comm, "nfsd"))
		xfs_refcache_insert(xip);

	/* Drop lock this way - the old refcache release is in here */
	if (iolock)
		xfs_rwunlock(bdp, locktype);

	return(ret);
}
/*
 * Write a modified dquot to disk.
 *
 * The caller must hold both the dquot lock and the dquot flush lock.  On
 * success the flush lock is not released here; the iodone routine attached
 * to the buffer (xfs_qm_dqflush_done) takes over once the dquot reaches the
 * disk.  The dquot itself may be unlocked and modified by the caller in the
 * interim.  The dquot is still locked on return.  This behavior is
 * identical to that of inodes.
 *
 * @flags selects the write type: XFS_QMOPT_DELWRI for a delayed write,
 * XFS_QMOPT_ASYNC for an asynchronous write, otherwise xfs_bwrite().
 *
 * Returns 0 if the dquot was clean or the write was issued, the error from
 * xfs_qm_dqtobp()/xfs_bwrite(), or EIO on forced shutdown or when the
 * in-core dquot fails xfs_qm_dqcheck().  Every early return, both the
 * clean-dquot case and the error cases, drops the flush lock before
 * returning.
 */
int
xfs_qm_dqflush(
	xfs_dquot_t		*dqp,
	uint			flags)
{
	xfs_mount_t		*mp;
	xfs_buf_t		*bp;
	xfs_disk_dquot_t	*ddqp;
	int			error;
	SPLDECL(s);

	ASSERT(XFS_DQ_IS_LOCKED(dqp));
	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
	xfs_dqtrace_entry(dqp, "DQFLUSH");

	/*
	 * If not dirty, there is nothing to do.
	 */
	if (!XFS_DQ_IS_DIRTY(dqp)) {
		xfs_dqfunlock(dqp);
		return (0);
	}

	/*
	 * Can't flush a pinned dquot. Wait for it.
	 */
	xfs_qm_dqunpin_wait(dqp);

	/*
	 * This may have been unpinned because the filesystem is shutting
	 * down forcibly. If that's the case we must not write this dquot
	 * to disk, because the log record didn't make it to disk!
	 */
	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
		xfs_dqfunlock(dqp);
		return XFS_ERROR(EIO);
	}

	/*
	 * Get the buffer containing the on-disk dquot.
	 * We don't need a transaction envelope because we know that the
	 * on-disk dquot has already been allocated.
	 */
	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
		xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
		ASSERT(error != ENOENT);
		/*
		 * Quotas could have gotten turned off (ESRCH)
		 */
		xfs_dqfunlock(dqp);
		return (error);
	}

	/*
	 * The in-core copy is corrupt: shut the filesystem down.  We still
	 * own the buffer returned by xfs_qm_dqtobp() and the flush lock at
	 * this point, so release both before returning.  Without this the
	 * buffer and the flush lock would be leaked, unlike on every other
	 * error path in this function.
	 */
	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
		xfs_buf_relse(bp);
		xfs_dqfunlock(dqp);
		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
		return XFS_ERROR(EIO);
	}

	/* This is the only portion of data that needs to persist */
	memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));

	/*
	 * Clear the dirty field and remember the flush lsn for later use.
	 */
	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
	mp = dqp->q_mount;

	/* lsn is 64 bits, so copy it under the AIL lock */
	AIL_LOCK(mp, s);
	dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
	AIL_UNLOCK(mp, s);

	/*
	 * Attach an iodone routine so that we can remove this dquot from the
	 * AIL and release the flush lock once the dquot is synced to disk.
	 */
	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
			      xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
	/*
	 * If the buffer is pinned then push on the log so we won't
	 * get stuck waiting in the write for too long.
	 */
	if (XFS_BUF_ISPINNED(bp)) {
		xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
	}

	if (flags & XFS_QMOPT_DELWRI) {
		xfs_bdwrite(mp, bp);
	} else if (flags & XFS_QMOPT_ASYNC) {
		xfs_bawrite(mp, bp);
	} else {
		error = xfs_bwrite(mp, bp);
	}
	xfs_dqtrace_entry(dqp, "DQFLUSH END");
	/*
	 * dqp is still locked, but caller is free to unlock it now.
	 */
	return (error);

}
/*
 * Called after IOP_TRYLOCK returned XFS_ITEM_PUSHBUF: we hold the dquot
 * lock but could not get the flush lock.  Look for the dquot's buffer in
 * core; if it is there and marked delayed-write, push it out so the item
 * can leave the AIL as soon as possible.
 *
 * The AIL lock must not be held here: searching the buffer cache with
 * incore() can take a while and the AIL lock is a spinlock.
 *
 * On every path the pushbuf flag is cleared and the dquot is unlocked
 * before returning.
 */
STATIC void
xfs_qm_dquot_logitem_pushbuf(
	xfs_dq_logitem_t    *qip)
{
	xfs_dquot_t	*dqp = qip->qli_dquot;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		dopush;
	int		error;

	ASSERT(XFS_DQ_IS_LOCKED(dqp));

	/*
	 * qli_pushbuf_flag stops anyone else from duplicating this work,
	 * and we must be the registered push owner.
	 */
	ASSERT(qip->qli_pushbuf_flag != 0);
	ASSERT(qip->qli_push_owner == current_pid());

	/*
	 * If the flush lock is no longer held, or the item has already
	 * left the AIL, the flush has most likely completed.  Nothing
	 * to push.
	 */
	if (!issemalocked(&(dqp->q_flock)) ||
	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0))
		goto out_unlock;

	mp = dqp->q_mount;
	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
			XFS_QI_DQCHUNKLEN(mp),
			XFS_INCORE_TRYLOCK);
	if (bp == NULL)
		goto out_unlock;

	if (!XFS_BUF_ISDELAYWRITE(bp)) {
		/* found the buffer, but it is not waiting on a delwri */
		qip->qli_pushbuf_flag = 0;
		xfs_dqunlock(dqp);
		xfs_buf_relse(bp);
		return;
	}

	/* Decide whether to push before giving up the dquot lock. */
	dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
		  issemalocked(&(dqp->q_flock)));
	qip->qli_pushbuf_flag = 0;
	xfs_dqunlock(dqp);

	if (XFS_BUF_ISPINNED(bp)) {
		xfs_log_force(mp, (xfs_lsn_t)0,
			      XFS_LOG_FORCE);
	}

	if (!dopush) {
		xfs_buf_relse(bp);
		return;
	}

#ifdef XFSRACEDEBUG
	delay_for_intr();
	delay(300);
#endif
	error = xfs_bawrite(mp, bp);
	if (error)
		xfs_fs_cmn_err(CE_WARN, mp,
	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
				error, qip, bp);
	return;

out_unlock:
	qip->qli_pushbuf_flag = 0;
	xfs_dqunlock(dqp);
}
/*
 * Vectored write entry point: write the @nsegs segments described by @iovp
 * to the file behind @bdp on behalf of @iocb, starting at *@offset.
 *
 * Outline:
 *   1. validate the iovec and total it up (ocount/count);
 *   2. for direct I/O, check alignment against the target device and decide
 *      whether i_sem is needed and whether cached pages must be flushed;
 *   3. take i_sem (if needed), the ILOCK and the I/O lock; run
 *      generic_write_checks();
 *   4. send the DMAPI write event if enabled (at most once);
 *   5. update timestamps, zero between old EOF and the write position,
 *      drop the ILOCK, clear setuid/setgid if required;
 *   6. perform the direct or buffered write; a short direct write falls
 *      back to buffered I/O for the remainder; a DMAPI NOSPACE event may
 *      trigger a retry;
 *   7. update the in-core size and handle O_SYNC/IS_SYNC semantics;
 *   8. drop the locks.
 *
 * @ioflags bits used here: IO_ISDIRECT, IO_INVIS.
 * @credp is not referenced in this function.
 *
 * Returns the number of bytes written, or a negative error.
 */
ssize_t				/* bytes written, or (-) error */
xfs_write(
	bhv_desc_t		*bdp,
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned int		nsegs,
	loff_t			*offset,
	int			ioflags,
	cred_t			*credp)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	unsigned long		segs = nsegs;
	xfs_inode_t		*xip;
	xfs_mount_t		*mp;
	ssize_t			ret = 0, error = 0;
	xfs_fsize_t		isize, new_size;
	xfs_iocore_t		*io;
	vnode_t			*vp;
	unsigned long		seg;
	int			iolock;
	int			eventsent = 0;
	vrwlock_t		locktype;
	size_t			ocount = 0, count;
	loff_t			pos;
	int			need_isem = 1, need_flush = 0;

	XFS_STATS_INC(xs_write_calls);

	vp = BHV_TO_VNODE(bdp);
	xip = XFS_BHVTOI(bdp);

	for (seg = 0; seg < segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		/*
		 * Inaccessible segment: fail outright if it is the first
		 * one, otherwise truncate the request to the segments
		 * before it.
		 */
		if (seg == 0)
			return -EFAULT;
		segs = seg;
		ocount -= iv->iov_len;  /* This segment is no good */
		break;
	}

	count = ocount;
	pos = *offset;

	/* A zero-length write succeeds without taking any locks. */
	if (count == 0)
		return 0;

	io = &xip->i_iocore;
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (ioflags & IO_ISDIRECT) {
		/* Alignment is checked against the realtime or data device. */
		xfs_buftarg_t	*target =
			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->pbr_smask) || (count & target->pbr_smask))
			return XFS_ERROR(-EINVAL);

		/*
		 * A direct write that starts inside the file, with nothing
		 * cached on the vnode, skips i_sem and takes the I/O lock
		 * shared (see relock below).
		 */
		if (!VN_CACHED(vp) && pos < i_size_read(inode))
			need_isem = 0;

		/* Cached pages must be flushed/invalidated before the I/O. */
		if (VN_CACHED(vp))
			need_flush = 1;
	}

relock:
	if (need_isem) {
		iolock = XFS_IOLOCK_EXCL;
		locktype = VRWLOCK_WRITE;

		down(&inode->i_sem);
	} else {
		iolock = XFS_IOLOCK_SHARED;
		locktype = VRWLOCK_WRITE_DIRECT;
	}

	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);

	isize = i_size_read(inode);

	/*
	 * NOTE(review): this updates *offset, but 'pos' was copied from
	 * *offset earlier and is what generic_write_checks() receives --
	 * confirm that O_APPEND positioning is handled there.
	 */
	if (file->f_flags & O_APPEND)
		*offset = isize;

start:
	/* generic_write_checks() may adjust pos and count. */
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_isem;
	}

	/* Record the prospective new size if the write extends the file. */
	new_size = pos + count;
	if (new_size > isize)
		io->io_new_size = new_size;

	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		loff_t		savedsize = pos;
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_isem)
			dmflags |= DM_FLAGS_ISEM;

		/* The ILOCK is dropped around the event; see below. */
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				      pos, count,
				      dmflags, &locktype);
		if (error) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 *
		 * NOTE(review): the comparison below uses the local 'isize'
		 * sampled before the ILOCK was dropped, not the current
		 * xip->i_d.di_size -- confirm this is the intended check.
		 */
		if ((file->f_flags & O_APPEND) && savedsize != isize) {
			pos = isize = xip->i_d.di_size;
			goto start;
		}
	}

	/*
	 * On Linux, generic_file_write updates the times even if
	 * no data is copied in so long as the write had a size.
	 *
	 * We must update xfs' times since revalidate will overcopy xfs.
	 */
	if (!(ioflags & IO_INVIS)) {
		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		inode_update_time(inode, 1);
	}

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */

	if (pos > isize) {
		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
					isize, pos + count);
		if (error) {
			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
			goto out_unlock_isem;
		}
	}
	/* From here on only the I/O lock (and possibly i_sem) is held. */
	xfs_iunlock(xip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */

	if (((xip->i_d.di_mode & S_ISUID) ||
	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
		(S_ISGID | S_IXGRP))) &&
	     !capable(CAP_FSETID)) {
		error = xfs_write_clear_setuid(xip);
		if (likely(!error))
			error = -remove_suid(file->f_dentry);
		if (unlikely(error)) {
			xfs_iunlock(xip, iolock);
			goto out_unlock_isem;
		}
	}

retry:
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (need_flush) {
			xfs_inval_cached_trace(io, pos, -1,
					ctooff(offtoct(pos)), -1);
			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
					-1, FI_REMAPF_LOCKED);
		}

		if (need_isem) {
			/* demote the lock now the cached pages are gone */
			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			up(&inode->i_sem);

			iolock = XFS_IOLOCK_SHARED;
			locktype = VRWLOCK_WRITE_DIRECT;
			need_isem = 0;
		}

		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&segs, pos, offset, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.  The bytes
		 * already written stay in 'ret' and are passed on to the
		 * buffered write below once the locks have been retaken
		 * in exclusive mode.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			need_isem = 1;
			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(xip, iolock);
			goto relock;
		}
	} else {
		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				*offset, ioflags);
		ret = generic_file_buffered_write(iocb, iovp, segs,
				pos, offset, count, ret);
	}

	current->backing_dev_info = NULL;

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(iocb);

	/*
	 * Out of space with a DMAPI NOSPACE event enabled: drop the rw
	 * lock, send the event, and retry the write from the current
	 * on-disk size.
	 */
	if ((ret == -ENOSPC) &&
	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
	    !(ioflags & IO_INVIS)) {

		xfs_rwunlock(bdp, locktype);
		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally  unused */
		if (error)
			goto out_unlock_isem;
		xfs_rwlock(bdp, locktype);
		pos = xip->i_d.di_size;
		goto retry;
	}

	/*
	 * Grow the in-core size if the write went past it.  The size is
	 * tested once without the ILOCK and again with it held.
	 */
	if (*offset > xip->i_d.di_size) {
		xfs_ilock(xip, XFS_ILOCK_EXCL);
		if (*offset > xip->i_d.di_size) {
			xip->i_d.di_size = *offset;
			i_size_write(inode, *offset);
			xip->i_update_core = 1;
			xip->i_update_size = 1;
		}
		xfs_iunlock(xip, XFS_ILOCK_EXCL);
	}

	/*
	 * 'error' holds -ret from here on, so the common exit's
	 * "return -error" yields ret: the failure code when ret <= 0, or
	 * the byte count for a successful non-sync write.
	 */
	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
		/*
		 * If we're treating this as O_DSYNC and we have not updated the
		 * size, force the log.
		 */
		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
		    !(xip->i_update_size)) {
			xfs_inode_log_item_t	*iip = xip->i_itemp;

			/*
			 * If an allocation transaction occurred
			 * without extending the size, then we have to force
			 * the log up the proper point to ensure that the
			 * allocation is permanent.  We can't count on
			 * the fact that buffered writes lock out direct I/O
			 * writes - the direct I/O write could have extended
			 * the size nontransactionally, then finished before
			 * we started.  xfs_write_file will think that the file
			 * didn't grow but the update isn't safe unless the
			 * size change is logged.
			 *
			 * Force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (iip && iip->ili_last_lsn) {
				xfs_log_force(mp, iip->ili_last_lsn,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			} else if (xfs_ipincount(xip) > 0) {
				xfs_log_force(mp, (xfs_lsn_t)0,
						XFS_LOG_FORCE | XFS_LOG_SYNC);
			}

		} else {
			xfs_trans_t	*tp;

			/*
			 * O_SYNC or O_DSYNC _with_ a size update are handled
			 * the same way.
			 *
			 * If the write was synchronous then we need to make
			 * sure that the inode modification time is permanent.
			 * We'll have updated the timestamp above, so here
			 * we use a synchronous transaction to log the inode.
			 * It's not fast, but it's necessary.
			 *
			 * If this a dsync write and the size got changed
			 * non-transactionally, then we need to ensure that
			 * the size change gets logged in a synchronous
			 * transaction.
			 *
			 * NOTE(review): a reserve failure leaves its code in
			 * 'error', but that value is overwritten by the
			 * sync_page_range() result below; only a commit
			 * failure takes the error exit.
			 */

			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if ((error = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0))) {
				/* Transaction reserve failed */
				xfs_trans_cancel(tp, 0);
			} else {
				/* Transaction reserve successful */
				xfs_ilock(xip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, xip);
				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				xfs_trans_set_sync(tp);
				error = xfs_trans_commit(tp, 0, NULL);
				xfs_iunlock(xip, XFS_ILOCK_EXCL);
				if (error)
					goto out_unlock_internal;
			}
		}

		/*
		 * The sync path drops its locks first and then syncs the
		 * page range; it returns the sync error if there is one,
		 * otherwise the byte count.
		 */
		xfs_rwunlock(bdp, locktype);
		if (need_isem)
			up(&inode->i_sem);

		error = sync_page_range(inode, mapping, pos, ret);
		if (!error)
			error = ret;
		return error;
	}

	/* Common exit: 'error' is a positive errno or -ret (see above). */
 out_unlock_internal:
	xfs_rwunlock(bdp, locktype);
 out_unlock_isem:
	if (need_isem)
		up(&inode->i_sem);
	return -error;
}
/*
 * Called by xfs_trans_push_ail() when IOP_TRYLOCK could not get the inode
 * flush lock but did get the inode locked SHARED.  See whether the inode
 * buffer is in core and marked delayed-write; if so, start an async write
 * on it to speed things up.
 *
 * Neither the AIL lock nor the flush lock is held here, so this is
 * inherently racy.
 *
 * On every path the pushbuf flag is cleared and the inode is unlocked
 * before returning.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip = iip->ili_inode;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		dopush;
	int		error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));

	/*
	 * ili_pushbuf_flag stops anyone else from duplicating this work,
	 * and we must be the registered push owner.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == current_pid());

	/*
	 * No flush in progress any more, or the item already left the AIL:
	 * there is nothing for us to push.
	 */
	if (completion_done(&ip->i_flush) ||
	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0))
		goto out_unlock;

	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	/*
	 * No buffer obtained.  The pushbuf flag is deliberately not reset
	 * any earlier than this: resetting it sooner would let others come
	 * here thinking the push is their job, only to find the buffer
	 * still locked by someone doing the same thing.
	 */
	if (bp == NULL)
		goto out_unlock;

	if (!XFS_BUF_ISDELAYWRITE(bp)) {
		/* found the buffer, but it is not a delayed write */
		iip->ili_pushbuf_flag = 0;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		xfs_buf_relse(bp);
		return;
	}

	/*
	 * We hold the buffer and know it is dirty, but iflush may have
	 * raced with us (we hold neither the AIL lock nor the flush lock)
	 * and already taken the item off the AIL.  Only push if the item
	 * is still in the AIL with a flush in progress; decide that before
	 * dropping the inode lock.
	 */
	dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
		  !completion_done(&ip->i_flush));
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_buftrace("INODE ITEM PUSH", bp);

	if (XFS_BUF_ISPINNED(bp)) {
		xfs_log_force(mp, (xfs_lsn_t)0,
			      XFS_LOG_FORCE);
	}

	if (!dopush) {
		xfs_buf_relse(bp);
		return;
	}

	error = xfs_bawrite(mp, bp);
	if (error)
		xfs_fs_cmn_err(CE_WARN, mp,
	"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
				error, iip, bp);
	return;

out_unlock:
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return;
}
STATIC int xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, xfs_fsblock_t len, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; struct xfs_perag *pag; int error; int i; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error || !agbp) goto out_put_perag; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Force out the log. This means any transactions that might have freed * space before we took the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); /* * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, XFS_BUF_TO_AGF(agbp)->agf_longest, &i); if (error) goto out_del_cursor; /* * Loop until we are done with all extents that are large * enough to be worth discarding. */ while (i) { xfs_agblock_t fbno; xfs_extlen_t flen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); /* * Too small? Give up. */ if (flen < minlen) { trace_xfs_discard_toosmall(mp, agno, fbno, flen); goto out_del_cursor; } /* * If the extent is entirely outside of the range we are * supposed to discard skip it. Do not bother to trim * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } /* * If any blocks in the range are still busy, skip the * discard and try again the next time. 
*/ if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } trace_xfs_discard_extent(mp, agno, fbno, flen); error = -blkdev_issue_discard(bdev, XFS_AGB_TO_DADDR(mp, agno, fbno), XFS_FSB_TO_BB(mp, flen), GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) goto out_del_cursor; } out_del_cursor: xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); out_put_perag: xfs_perag_put(pag); return error; }
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; size_t size = 0; xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret; int error = 0; xfs_fsize_t isize, new_size; xfs_fsize_t n, limit; xfs_iocore_t *io; vnode_t *vp; unsigned long seg; int iolock; int eventsent = 0; vrwlock_t locktype; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address); xip = XFS_BHVTOI(bdp); /* START copy & waste from filemap.c */ for (seg = 0; seg < segs; seg++) { const struct iovec *iv = &iovp[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ size += iv->iov_len; if (unlikely((ssize_t)(size|iv->iov_len) < 0)) return XFS_ERROR(-EINVAL); } /* END copy & waste from filemap.c */ if (size == 0) return 0; io = &(xip->i_iocore); mp = io->io_mount; xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) { return -EIO; } if (ioflags & IO_ISDIRECT) { pb_target_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; if ((*offset & target->pbr_smask) || (size & target->pbr_smask)) { return XFS_ERROR(-EINVAL); } iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } else { iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); if (file->f_flags & O_APPEND) *offset = isize; start: n = limit - *offset; if (n <= 0) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return -EFBIG; } if (n < size) size = n; new_size = *offset + size; if (new_size > isize) { io->io_new_size = new_size; } if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, *offset, size, FILP_DELAY_FLAG(file), &locktype); if (error) { xfs_iunlock(xip, iolock); return -error; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != xip->i_d.di_size) { *offset = isize = xip->i_d.di_size; goto start; } } /* * On Linux, generic_file_write updates the times even if * no data is copied in so long as the write had a size. * * We must update xfs' times since revalidate will overcopy xfs. */ if (size && !(ioflags & IO_INVIS)) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. 
*/ if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return(-error); } } xfs_iunlock(xip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. */ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (error) { xfs_iunlock(xip, iolock); return -error; } } retry: if (ioflags & IO_ISDIRECT) { xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1); } ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset); if ((ret == -ENOSPC) && DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) return -error; xfs_rwlock(bdp, locktype); *offset = xip->i_d.di_size; goto retry; } if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { struct inode *inode = LINVFS_GET_IP(vp); xip->i_d.di_size = *offset; i_size_write(inode, *offset); xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (ret <= 0) { xfs_rwunlock(bdp, locktype); return ret; } XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. 
*/ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ xfs_inode_log_item_t *iip; xfs_lsn_t lsn; iip = xip->i_itemp; if (iip && iip->ili_last_lsn) { lsn = iip->ili_last_lsn; xfs_log_force(mp, lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. 
*/ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0); xfs_iunlock(xip, XFS_ILOCK_EXCL); } } } /* (ioflags & O_SYNC) */ xfs_rwunlock(bdp, locktype); return(ret); }
/*
 * xfs_trans_commit
 *
 * Commit the given transaction to the log.
 *
 * tp           - transaction to commit; it must not be referenced by the
 *                caller afterwards (it is freed here or by the log callback).
 * flags        - XFS_TRANS_RELEASE_LOG_RES asks for the permanent log
 *                reservation to be released (XFS_LOG_REL_PERM_RESERV is
 *                passed to xfs_log_done()).
 * commit_lsn_p - optional; when non-NULL it receives the commit LSN obtained
 *                from xfs_log_done() (-1 if none was obtained).
 *
 * Returns:
 *   - the local "shutdown" value (0, or EIO) when the transaction is clean
 *     or the filesystem has been forcibly shut down;
 *   - XFS_ERROR(EIO) when xfs_log_write() fails or no commit LSN was
 *     obtained;
 *   - otherwise the result of xfs_log_release_iclog(), or, for synchronous
 *     transactions where that succeeded, the result of xfs_log_force().
 */
/*ARGSUSED*/
int
xfs_trans_commit(
	xfs_trans_t	*tp,
	uint		flags,
	xfs_lsn_t	*commit_lsn_p)
{
	xfs_log_iovec_t		*log_vector;
	int			nvec;
	xfs_mount_t		*mp;
	xfs_lsn_t		commit_lsn;
	/* REFERENCED */
	int			error;
	int			log_flags;
	int			sync;
	/* Number of log vectors that fit in the on-stack fast-path array. */
#define XFS_TRANS_LOGVEC_COUNT 16
	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
#if defined(XLOG_NOLOG) || defined(DEBUG)
	static xfs_lsn_t	trans_lsn = 1;
#endif
	void			*commit_iclog;
	int			shutdown;

	commit_lsn = -1;

	/*
	 * Determine whether this commit is releasing a permanent
	 * log reservation or not.
	 */
	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
		log_flags = XFS_LOG_REL_PERM_RESERV;
	} else {
		log_flags = 0;
	}
	mp = tp->t_mountp;

	/*
	 * If there is nothing to be logged by the transaction,
	 * then unlock all of the items associated with the
	 * transaction and free the transaction structure.
	 * Also make sure to return any reserved blocks to
	 * the free pool.
	 *
	 * This label is also the target used further down when the
	 * transaction turns out to have no log vectors and the filesystem
	 * is shut down.
	 */
shut_us_down:
	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
		xfs_trans_unreserve_and_mod_sb(tp);
		/*
		 * It is indeed possible for the transaction to be
		 * not dirty but the dqinfo portion to be. All that
		 * means is that we have some (non-persistent) quota
		 * reservations that need to be unreserved.
		 */
		XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
		if (tp->t_ticket) {
			commit_lsn = xfs_log_done(mp, tp->t_ticket,
							NULL, log_flags);
			/* A failed xfs_log_done() is reported as EIO. */
			if (commit_lsn == -1 && !shutdown)
				shutdown = XFS_ERROR(EIO);
		}
		PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
		xfs_trans_free_busy(tp);
		xfs_trans_free(tp);
		XFS_STATS_INC(xs_trans_empty);
		if (commit_lsn_p)
			*commit_lsn_p = commit_lsn;
		return (shutdown);
	}
#if defined(XLOG_NOLOG) || defined(DEBUG)
	ASSERT(!xlog_debug || tp->t_ticket != NULL);
#else
	ASSERT(tp->t_ticket != NULL);
#endif

	/*
	 * If we need to update the superblock, then do it now.
	 */
	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
		xfs_trans_apply_sb_deltas(tp);
	}
	XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);

	/*
	 * Ask each log item how many log_vector entries it will
	 * need so we can figure out how many to allocate.
	 * Try to avoid the kmem_alloc() call in the common case
	 * by using a vector from the stack when it fits.
	 */
	nvec = xfs_trans_count_vecs(tp);

	/* A dirty transaction with nothing to log: shut down and bail out. */
	if (nvec == 0) {
		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
		goto shut_us_down;
	}

	if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
		log_vector = log_vector_fast;
	} else {
		log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
						   sizeof(xfs_log_iovec_t),
						   KM_SLEEP);
	}

	/*
	 * Fill in the log_vector and pin the logged items, and
	 * then write the transaction to the log.
	 */
	xfs_trans_fill_vecs(tp, log_vector);

	/*
	 * Ignore errors here. xfs_log_done would do the right thing.
	 * We need to put the ticket, etc. away.
	 */
	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket,
			     &(tp->t_lsn));

#if defined(XLOG_NOLOG) || defined(DEBUG)
	if (xlog_debug) {
		commit_lsn = xfs_log_done(mp, tp->t_ticket,
					  &commit_iclog, log_flags);
	} else {
		/* Logging disabled: hand out LSNs from a local counter. */
		commit_lsn = 0;
		tp->t_lsn = trans_lsn++;
	}
#else
	/*
	 * This is the regular case.  At this point (after the call finishes),
	 * the transaction is committed incore and could go out to disk at
	 * any time.  However, all the items associated with the transaction
	 * are still locked and pinned in memory.
	 */
	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
#endif

	tp->t_commit_lsn = commit_lsn;
	/* Only a heap-allocated vector needs freeing. */
	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
		kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
	}

	if (commit_lsn_p)
		*commit_lsn_p = commit_lsn;

	/*
	 * If we got a log write error. Unpin the logitems that we
	 * had pinned, clean up, free trans structure, and return error.
	 */
	if (error || commit_lsn == -1) {
		PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
		return XFS_ERROR(EIO);
	}

	/*
	 * Once the transaction has committed, unused
	 * reservations need to be released and changes to
	 * the superblock need to be reflected in the in-core
	 * version.  Do that now.
	 */
	xfs_trans_unreserve_and_mod_sb(tp);

	/* Sample the flag now; tp may be freed before it is tested below. */
	sync = tp->t_flags & XFS_TRANS_SYNC;

	/*
	 * Tell the LM to call the transaction completion routine
	 * when the log write with LSN commit_lsn completes (e.g.
	 * when the transaction commit really hits the on-disk log).
	 * After this call we cannot reference tp, because the call
	 * can happen at any time and the call will free the transaction
	 * structure pointed to by tp.  The only case where we call
	 * the completion routine (xfs_trans_committed) directly is
	 * if the log is turned off on a debug kernel or we're
	 * running in simulation mode (the log is explicitly turned
	 * off).
	 */
	tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
	tp->t_logcb.cb_arg = tp;

	/*
	 * We need to pass the iclog buffer which was used for the
	 * transaction commit record into this function, and attach
	 * the callback to it. The callback must be attached before
	 * the items are unlocked to avoid racing with other threads
	 * waiting for an item to unlock.
	 */
	shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));

	/*
	 * Mark this thread as no longer being in a transaction
	 */
	PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);

	/*
	 * Once all the items of the transaction have been copied
	 * to the in core log and the callback is attached, the
	 * items can be unlocked.
	 *
	 * This will free descriptors pointing to items which were
	 * not logged since there is nothing more to do with them.
	 * For items which were logged, we will keep pointers to them
	 * so they can be unpinned after the transaction commits to disk.
	 * This will also stamp each modified meta-data item with
	 * the commit lsn of this transaction for dependency tracking
	 * purposes.
	 */
	xfs_trans_unlock_items(tp, commit_lsn);

	/*
	 * If we detected a log error earlier, finish committing
	 * the transaction now (unpin log items, etc).
	 *
	 * Order is critical here, to avoid using the transaction
	 * pointer after it has been freed (by xfs_trans_committed
	 * either here now, or as a callback).  We cannot do this
	 * step inside xfs_log_notify as was done earlier because
	 * of this issue.
	 */
	if (shutdown)
		xfs_trans_committed(tp, XFS_LI_ABORTED);

	/*
	 * Now that the xfs_trans_committed callback has been attached,
	 * and the items are released we can finally allow the iclog to
	 * go to disk.
	 */
	error = xfs_log_release_iclog(mp, commit_iclog);

	/*
	 * If the transaction needs to be synchronous, then force the
	 * log out now and wait for it.
	 */
	if (sync) {
		if (!error)
			error = xfs_log_force(mp, commit_lsn,
					      XFS_LOG_FORCE | XFS_LOG_SYNC);
		XFS_STATS_INC(xs_trans_sync);
	} else {
		XFS_STATS_INC(xs_trans_async);
	}

	return (error);
}
/* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, * but the dquot is free to be unlocked and modified by the caller * in the interim. Dquot is still locked on return. This behavior is * identical to that of inodes. */ int xfs_qm_dqflush( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; struct xfs_disk_dquot *ddqp; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); trace_xfs_dqflush(dqp); *bpp = NULL; xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot * to disk, because the log record didn't make it to disk. * * We also have to remove the log item from the AIL in this case, * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; spin_lock(&mp->m_ail->xa_lock); if (lip->li_flags & XFS_LI_IN_AIL) xfs_trans_ail_delete(mp->m_ail, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&mp->m_ail->xa_lock); error = XFS_ERROR(EIO); goto out_unlock; } /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; /* * Calculate the location of the dquot inside the buffer. */ ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. 
*/ error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)"); if (error) { xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return XFS_ERROR(EIO); } /* This is the only portion of data that needs to persist */ memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); /* * Clear the dirty field and remember the flush lsn for later use. */ dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. * * We also calculate the CRC here so that the on-disk dquot in the * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, &dqp->q_logitem.qli_item); /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } trace_xfs_dqflush_done(dqp); *bpp = bp; return 0; out_unlock: xfs_dqfunlock(dqp); return XFS_ERROR(EIO); }
/*
 * xfs_trans_push_ail
 *
 * This routine is called to move the tail of the AIL
 * forward.  It does this by trying to flush items in the AIL
 * whose lsns are below the given threshold_lsn.
 *
 * mp            - mount whose AIL is pushed.
 * threshold_lsn - items with li_lsn below this value are candidates.
 *
 * Returns 0 when the AIL is empty or the filesystem has been forcibly
 * shut down; otherwise the li_lsn of the item returned by xfs_ail_min()
 * after the push (0 if that returns NULL).
 */
xfs_lsn_t
xfs_trans_push_ail(
	xfs_mount_t		*mp,
	xfs_lsn_t		threshold_lsn)
{
	xfs_lsn_t		lsn;
	xfs_log_item_t		*lip;
	int			gen;
	int			restarts;
	int			lock_result;
	int			flush_log;
	SPLDECL(s);

	/* Upper bound on how often the AIL scan may restart from the head. */
#define XFS_TRANS_PUSH_AIL_RESTARTS 10

	AIL_LOCK(mp,s);
	lip = xfs_trans_first_ail(mp, &gen);
	if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) {
		/*
		 * Just return if the AIL is empty (or we are shut down).
		 */
		AIL_UNLOCK(mp, s);
		return (xfs_lsn_t)0;
	}

	XFS_STATS_INC(xs_push_ail);

	/*
	 * While the item we are looking at is below the given threshold
	 * try to flush it out. Make sure to limit the number of times
	 * we allow xfs_trans_next_ail() to restart scanning from the
	 * beginning of the list. We'd like not to stop until we've at least
	 * tried to push on everything in the AIL with an LSN less than
	 * the given threshold. However, we may give up before that if
	 * we realize that we've been holding the AIL_LOCK for 'too long',
	 * blocking interrupts. Currently, too long is < 500us roughly.
	 */
	flush_log = 0;
	restarts = 0;
	while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
		(XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
		/*
		 * If we can lock the item without sleeping, unlock
		 * the AIL lock and flush the item.  Then re-grab the
		 * AIL lock so we can look for the next item on the
		 * AIL.  Since we unlock the AIL while we flush the
		 * item, the next routine may start over again at
		 * the beginning of the list if anything has changed.
		 * That is what the generation count is for.
		 *
		 * If we can't lock the item, either its holder will flush
		 * it or it is already being flushed or it is being relogged.
		 * In any of these cases it is being taken care of and we
		 * can just skip to the next item in the list.
		 */
		lock_result = IOP_TRYLOCK(lip);
		switch (lock_result) {
		case XFS_ITEM_SUCCESS:
			AIL_UNLOCK(mp, s);
			XFS_STATS_INC(xs_push_ail_success);
			IOP_PUSH(lip);
			AIL_LOCK(mp,s);
			break;

		case XFS_ITEM_PUSHBUF:
			AIL_UNLOCK(mp, s);
			XFS_STATS_INC(xs_push_ail_pushbuf);
#ifdef XFSRACEDEBUG
			delay_for_intr();
			delay(300);
#endif
			/*
			 * NOTE(review): ASSERT(lip) comes after lip has
			 * already been dereferenced, so it cannot catch a
			 * NULL item here.
			 */
			ASSERT(lip->li_ops->iop_pushbuf);
			ASSERT(lip);
			IOP_PUSHBUF(lip);
			AIL_LOCK(mp,s);
			break;

		case XFS_ITEM_PINNED:
			/* Remember to force the log once the scan is done. */
			XFS_STATS_INC(xs_push_ail_pinned);
			flush_log = 1;
			break;

		case XFS_ITEM_LOCKED:
			XFS_STATS_INC(xs_push_ail_locked);
			break;

		case XFS_ITEM_FLUSHING:
			XFS_STATS_INC(xs_push_ail_flushing);
			break;

		default:
			ASSERT(0);
			break;
		}

		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
		if (lip == NULL) {
			break;
		}
		if (XFS_FORCED_SHUTDOWN(mp)) {
			/*
			 * Just return if we shut down during the last try.
			 */
			AIL_UNLOCK(mp, s);
			return (xfs_lsn_t)0;
		}

	}

	if (flush_log) {
		/*
		 * If something we need to push out was pinned, then
		 * push out the log so it will become unpinned and
		 * move forward in the AIL.
		 */
		AIL_UNLOCK(mp, s);
		XFS_STATS_INC(xs_push_ail_flush);
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
		AIL_LOCK(mp, s);
	}

	/* Report the lsn of the current minimum item, if any. */
	lip = xfs_ail_min(&(mp->m_ail));
	if (lip == NULL) {
		lsn = (xfs_lsn_t)0;
	} else {
		lsn = lip->li_lsn;
	}

	AIL_UNLOCK(mp, s);
	return lsn;
}	/* xfs_trans_push_ail */
/*
 * Flush a dirty dquot into its on-disk buffer and start the write.
 *
 * The caller holds both the dquot lock and the dquot flush lock.  The
 * dquot lock is still held when we return, and the caller may drop it
 * at that point.  On every early-exit path below the flush lock is
 * released here; once the buffer has been handed off, the attached
 * xfs_qm_dqflush_done callback is what runs when the buffer I/O finishes.
 *
 * flags: SYNC_TRYLOCK - do nothing if the dquot is currently pinned.
 *        SYNC_WAIT    - write the buffer with xfs_bwrite() instead of
 *                       queueing it as a delayed write.
 *
 * Returns 0, EIO (forced shutdown or failed sanity check), or the error
 * from xfs_trans_read_buf() / xfs_bwrite().
 */
int
xfs_qm_dqflush(
	xfs_dquot_t		*dqp,
	uint			flags)
{
	struct xfs_mount	*mount = dqp->q_mount;
	struct xfs_buf		*dqbuf;
	struct xfs_disk_dquot	*ondisk;
	int			rc;

	ASSERT(XFS_DQ_IS_LOCKED(dqp));
	ASSERT(!completion_done(&dqp->q_flush));

	trace_xfs_dqflush(dqp);

	/* Nothing to write back. */
	if (!XFS_DQ_IS_DIRTY(dqp)) {
		xfs_dqfunlock(dqp);
		return 0;
	}

	/* Pinned, and the caller asked us not to block. */
	if ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0) {
		xfs_dqfunlock(dqp);
		return 0;
	}

	xfs_qm_dqunpin_wait(dqp);

	/*
	 * A forced shutdown can be the reason the dquot got unpinned.  The
	 * log record never reached the disk in that case, so the dquot must
	 * not be written either.
	 */
	if (XFS_FORCED_SHUTDOWN(mount)) {
		dqp->dq_flags &= ~XFS_DQ_DIRTY;
		xfs_dqfunlock(dqp);
		return XFS_ERROR(EIO);
	}

	/* Read in the buffer that holds the on-disk copy of this dquot. */
	rc = xfs_trans_read_buf(mount, NULL, mount->m_ddev_targp,
				dqp->q_blkno,
				mount->m_quotainfo->qi_dqchunklen,
				0, &dqbuf);
	if (rc) {
		ASSERT(rc != ENOENT);
		xfs_dqfunlock(dqp);
		return rc;
	}

	/* Locate the dquot within that buffer. */
	ondisk = dqbuf->b_addr + dqp->q_bufoffset;

	/* Refuse to write out an incore dquot that looks corrupted. */
	rc = xfs_qm_dqcheck(mount, &dqp->q_core, be32_to_cpu(ondisk->d_id),
			    0, XFS_QMOPT_DOWARN, "dqflush (incore copy)");
	if (rc) {
		xfs_buf_relse(dqbuf);
		xfs_dqfunlock(dqp);
		xfs_force_shutdown(mount, SHUTDOWN_CORRUPT_INCORE);
		return XFS_ERROR(EIO);
	}

	/* Only the core needs to be persisted. */
	memcpy(ondisk, &dqp->q_core, sizeof(xfs_disk_dquot_t));

	/* The dquot is clean now; record the lsn it was flushed at. */
	dqp->dq_flags &= ~XFS_DQ_DIRTY;
	xfs_trans_ail_copy_lsn(mount->m_ail,
			       &dqp->q_logitem.qli_flush_lsn,
			       &dqp->q_logitem.qli_item.li_lsn);

	/*
	 * The iodone callback takes the dquot off the AIL and drops the
	 * flush lock when the buffer write completes.
	 */
	xfs_buf_attach_iodone(dqbuf, xfs_qm_dqflush_done,
			      &dqp->q_logitem.qli_item);

	/* Kick the log if the buffer is pinned so the write is not held up. */
	if (xfs_buf_ispinned(dqbuf)) {
		trace_xfs_dqflush_force(dqp);
		xfs_log_force(mount, 0);
	}

	if (!(flags & SYNC_WAIT))
		xfs_buf_delwri_queue(dqbuf);
	else
		rc = xfs_bwrite(dqbuf);

	xfs_buf_relse(dqbuf);

	trace_xfs_dqflush_done(dqp);

	/* dqp remains locked; the caller may unlock it from here on. */
	return rc;
}
/*
 * Function that does the work of pushing on the AIL
 *
 * mp       - mount whose AIL is pushed towards mp->m_ail.xa_target.
 * last_lsn - in/out: on entry the lsn handed to xfs_trans_first_push_ail()
 *            to pick the starting item; on return the lsn of the last item
 *            this pass pushed or skipped, or 0 to make the next pass start
 *            from the beginning.
 *
 * Returns the timeout (milliseconds, per the declaration of tout) the
 * caller should wait before calling again: 1000 when there was nothing to
 * do, otherwise 10 plus an optional back-off of 20 (target reached) or 10
 * (too many restarts or more than 90% of the items were stuck).
 */
long
xfsaild_push(
	xfs_mount_t	*mp,
	xfs_lsn_t	*last_lsn)
{
	long		tout = 1000; /* milliseconds */
	xfs_lsn_t	last_pushed_lsn = *last_lsn;
	xfs_lsn_t	target = mp->m_ail.xa_target;
	xfs_lsn_t	lsn;
	xfs_log_item_t	*lip;
	int		gen;
	int		restarts;
	int		flush_log, count, stuck;

	/* Give up on this pass after this many AIL traversal restarts. */
#define XFS_TRANS_PUSH_AIL_RESTARTS 10

	spin_lock(&mp->m_ail_lock);
	lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
	if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
		/*
		 * AIL is empty or our push has reached the end.
		 */
		spin_unlock(&mp->m_ail_lock);
		last_pushed_lsn = 0;
		goto out;
	}

	XFS_STATS_INC(xs_push_ail);

	/*
	 * While the item we are looking at is below the given threshold
	 * try to flush it out. We'd like not to stop until we've at least
	 * tried to push on everything in the AIL with an LSN less than
	 * the given threshold.
	 *
	 * However, we will stop after a certain number of pushes and wait
	 * for a reduced timeout to fire before pushing further. This
	 * prevents us from spinning when we can't do anything or there is
	 * lots of contention on the AIL lists.
	 */
	tout = 10;
	lsn = lip->li_lsn;
	flush_log = stuck = count = restarts = 0;
	while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
		int	lock_result;
		/*
		 * If we can lock the item without sleeping, unlock the AIL
		 * lock and flush the item.  Then re-grab the AIL lock so we
		 * can look for the next item on the AIL. List changes are
		 * handled by the AIL lookup functions internally
		 *
		 * If we can't lock the item, either its holder will flush it
		 * or it is already being flushed or it is being relogged.  In
		 * any of these cases it is being taken care of and we can just
		 * skip to the next item in the list.
		 */
		lock_result = IOP_TRYLOCK(lip);
		spin_unlock(&mp->m_ail_lock);
		switch (lock_result) {
		case XFS_ITEM_SUCCESS:
			XFS_STATS_INC(xs_push_ail_success);
			IOP_PUSH(lip);
			last_pushed_lsn = lsn;
			break;

		case XFS_ITEM_PUSHBUF:
			XFS_STATS_INC(xs_push_ail_pushbuf);
			IOP_PUSHBUF(lip);
			last_pushed_lsn = lsn;
			break;

		case XFS_ITEM_PINNED:
			/*
			 * Pinned items count as stuck and request a log
			 * force after the scan; last_pushed_lsn is left
			 * unchanged for them.
			 */
			XFS_STATS_INC(xs_push_ail_pinned);
			stuck++;
			flush_log = 1;
			break;

		case XFS_ITEM_LOCKED:
			XFS_STATS_INC(xs_push_ail_locked);
			last_pushed_lsn = lsn;
			stuck++;
			break;

		case XFS_ITEM_FLUSHING:
			XFS_STATS_INC(xs_push_ail_flushing);
			last_pushed_lsn = lsn;
			stuck++;
			break;

		default:
			ASSERT(0);
			break;
		}

		spin_lock(&mp->m_ail_lock);
		/* should we bother continuing? */
		if (XFS_FORCED_SHUTDOWN(mp))
			break;
		ASSERT(mp->m_log);

		count++;

		/*
		 * Are there too many items we can't do anything with?
		 * If we are skipping too many items because we can't flush
		 * them or they are already being flushed, we back off and
		 * give them time to complete whatever operation is being
		 * done. i.e. remove pressure from the AIL while we can't make
		 * progress so traversals don't slow down further inserts and
		 * removals to/from the AIL.
		 *
		 * The value of 100 is an arbitrary magic number based on
		 * observation.
		 */
		if (stuck > 100)
			break;

		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
		if (lip == NULL)
			break;
		if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
			break;
		lsn = lip->li_lsn;
	}
	spin_unlock(&mp->m_ail_lock);

	if (flush_log) {
		/*
		 * If something we need to push out was pinned, then
		 * push out the log so it will become unpinned and
		 * move forward in the AIL.
		 */
		XFS_STATS_INC(xs_push_ail_flush);
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
	}

	if (!count) {
		/* We're past our target or empty, so idle */
		tout = 1000;
	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
		/*
		 * We reached the target so wait a bit longer for I/O to
		 * complete and remove pushed items from the AIL before we
		 * start the next scan from the start of the AIL.
		 */
		tout += 20;
		last_pushed_lsn = 0;
	} else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) ||
		   ((stuck * 100) / count > 90)) {
		/*
		 * Either there is a lot of contention on the AIL or we
		 * are stuck due to operations in progress. "Stuck" in this
		 * case is defined as >90% of the items we tried to push
		 * were stuck.
		 *
		 * Backoff a bit more to allow some I/O to complete before
		 * continuing from where we were.
		 */
		tout += 10;
	}
out:
	/* Tell the caller where the next pass should resume. */
	*last_lsn = last_pushed_lsn;
	return tout;
}	/* xfsaild_push */
/*
 * xfs_write (uio variant)
 *
 * Write uio->uio_resid bytes at uio->uio_offset to the file behind bdp,
 * using xfs_write_file() for the buffered path.  The direct-I/O body, the
 * DMAPI write event and the setuid/setgid clearing are compiled out here
 * (#ifdef RMC / #if 0), and several labels are commented out with "//".
 * The code text below is left exactly as found.
 *
 * Observations from the code, for whoever touches this next:
 *  - After the IO_ISDIRECT/buffered branch, iolock and locktype are
 *    unconditionally reset to XFS_IOLOCK_EXCL / VRWLOCK_WRITE, so the
 *    values chosen inside that branch are never used, and the later
 *    "if (iolock)" test is always true.
 *  - NOTE(review): the early failures return positive values (EIO, EINVAL,
 *    EFBIG) while the xfs_zero_eof() failure and the final return hand back
 *    -error — confirm which sign convention the callers expect.
 *  - NOTE(review): when IO_SYNC is not set, control falls through to
 *    out_unlock_internal and the function returns -error (0 if nothing
 *    failed) rather than ret; only the IO_SYNC path returns ret — confirm
 *    this is intended.
 *  - NOTE(review): "buf" and "ioflags" are referenced but not declared in
 *    this function; presumably they resolve to something outside it or the
 *    trace macro discards its arguments — verify.
 *  - The return values of xfs_log_force() in the O_DSYNC branch are not
 *    checked.
 */
ssize_t /* bytes written, or (-) error */ xfs_write( bhv_desc_t *bdp, uio_t *uio, int ioflag, cred_t *credp) { xfs_inode_t *xip; xfs_mount_t *mp; ssize_t ret = 0; int error = 0; xfs_fsize_t isize, new_size; xfs_fsize_t n, limit; xfs_fsize_t size; xfs_iocore_t *io; xfs_vnode_t *vp; int iolock; //int eventsent = 0; vrwlock_t locktype; xfs_off_t offset_c; xfs_off_t *offset; xfs_off_t pos; XFS_STATS_INC(xs_write_calls); vp = BHV_TO_VNODE(bdp); xip = XFS_BHVTOI(bdp); io = &xip->i_iocore; mp = io->io_mount; if (XFS_FORCED_SHUTDOWN(xip->i_mount)) { return EIO; } size = uio->uio_resid; pos = offset_c = uio->uio_offset; offset = &offset_c; if (unlikely(ioflag & IO_ISDIRECT)) { if (((__psint_t)buf & BBMASK) || (*offset & mp->m_blockmask) || (size & mp->m_blockmask)) { return EINVAL; } iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; } else { if (io->io_flags & XFS_IOCORE_RT) return EINVAL; iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; } iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); if (ioflag & O_APPEND) *offset = isize; //start: n = limit - *offset; if (n <= 0) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return EFBIG; } if (n < size) size = n; new_size = *offset + size; if (new_size > isize) { io->io_new_size = new_size; } #ifdef RMC /* probably be a long time before if ever that we do dmapi */ if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, *offset, size, dmflags, &locktype); if (error) { if (iolock) xfs_iunlock(xip, iolock); return -error; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reaquired in XFS_SEND_DATA * so we have to recheck the size when appending. 
* We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && savedsize != xip->i_d.di_size) { *offset = isize = xip->i_d.di_size; goto start; } } #endif /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (!(ioflag & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); return(-error); } } xfs_iunlock(xip, XFS_ILOCK_EXCL); #if 0 /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. 
*/ if (((xip->i_d.di_mode & S_ISUID) || ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) error = -remove_suid(file->f_dentry); if (unlikely(error)) { xfs_iunlock(xip, iolock); goto out_unlock_mutex; } } #endif //retry: if (unlikely(ioflag & IO_ISDIRECT)) { #ifdef RMC xfs_off_t pos = *offset; struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; ret = precheck_file_write(file, inode, &size, &pos); if (ret || size == 0) goto error; xfs_inval_cached_pages(vp, io, pos, 1, 1); inode->i_ctime = inode->i_mtime = CURRENT_TIME; /* mark_inode_dirty_sync(inode); - we do this later */ xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, buf, size, pos, ioflags); ret = generic_file_direct_IO(WRITE, file, (char *)buf, size, pos); xfs_inval_cached_pages(vp, io, pos, 1, 1); if (ret > 0) *offset += ret; #endif } else { xfs_rw_enter_trace(XFS_WRITE_ENTER, io, buf, size, *offset, ioflags); ret = xfs_write_file(xip,uio,ioflag); } xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); //error: if (ret <= 0) { if (iolock) xfs_rwunlock(bdp, locktype); return ret; } XFS_STATS_ADD(xs_write_bytes, ret); if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { printf("xfs_write look at doing more here %s:%d\n",__FILE__,__LINE__); #ifdef RMC struct inode *inode = LINVFS_GET_IP(vp); i_size_write(inode, *offset); mark_inode_dirty_sync(inode); #endif xip->i_d.di_size = *offset; xip->i_update_core = 1; xip->i_update_size = 1; } xfs_iunlock(xip, XFS_ILOCK_EXCL); } /* Handle various SYNC-type writes */ #if 0 // if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { #endif if (ioflag & IO_SYNC) { /* * If we're treating this as O_DSYNC and we have not updated the * size, force the log. 
*/ if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && !(xip->i_update_size)) { xfs_inode_log_item_t *iip = xip->i_itemp; /* * If an allocation transaction occurred * without extending the size, then we have to force * the log up the proper point to ensure that the * allocation is permanent. We can't count on * the fact that buffered writes lock out direct I/O * writes - the direct I/O write could have extended * the size nontransactionally, then finished before * we started. xfs_write_file will think that the file * didn't grow but the update isn't safe unless the * size change is logged. * * Force the log if we've committed a transaction * against the inode or if someone else has and * the commit record hasn't gone to disk (e.g. * the inode is pinned). This guarantees that * all changes affecting the inode are permanent * when we return. */ if (iip && iip->ili_last_lsn) { xfs_log_force(mp, iip->ili_last_lsn, XFS_LOG_FORCE | XFS_LOG_SYNC); } else if (xfs_ipincount(xip) > 0) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); } } else { xfs_trans_t *tp; /* * O_SYNC or O_DSYNC _with_ a size update are handled * the same way. * * If the write was synchronous then we need to make * sure that the inode modification time is permanent. * We'll have updated the timestamp above, so here * we use a synchronous transaction to log the inode. * It's not fast, but it's necessary. * * If this a dsync write and the size got changed * non-transactionally, then we need to ensure that * the size change gets logged in a synchronous * transaction. 
*/ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); if ((error = xfs_trans_reserve(tp, 0, XFS_SWRITE_LOG_RES(mp), 0, 0, 0))) { /* Transaction reserve failed */ xfs_trans_cancel(tp, 0); } else { /* Transaction reserve successful */ xfs_ilock(xip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, xip); xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0, NULL); xfs_iunlock(xip, XFS_ILOCK_EXCL); } if (error) goto out_unlock_internal; } xfs_rwunlock(bdp, locktype); return ret; } /* (ioflags & O_SYNC) */ out_unlock_internal: xfs_rwunlock(bdp, locktype); #if 0 out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); #endif //out_nounlocks: return -error; }
/*
 * Make a just-completed synchronous write durable in the log.
 *
 * Two cases are distinguished:
 *
 *  - The mount treats O_SYNC as true O_SYNC (XFS_MOUNT_OSYNCISOSYNC), or
 *    the inode size was updated (ip->i_update_size): log the inode core
 *    in a synchronous transaction so that the timestamp and any
 *    non-transactional size change are logged synchronously.
 *
 *  - Otherwise (O_DSYNC semantics, size untouched): just force the log,
 *    either up to the inode's last logged LSN or, if the inode is merely
 *    pinned by someone else's transaction, the whole log.
 *
 * Returns 0, or the error from xfs_trans_reserve()/xfs_trans_commit().
 */
int
xfs_write_sync_logforce(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip)
{
	xfs_inode_log_item_t	*log_item;
	xfs_trans_t		*trans;
	int			rc;

	if ((mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) || ip->i_update_size) {
		/*
		 * Full O_SYNC, or O_DSYNC with a size update: both are
		 * handled by logging the inode core synchronously.  It is
		 * not fast, but it is necessary.
		 */
		trans = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
		rc = xfs_trans_reserve(trans, 0, XFS_SWRITE_LOG_RES(mp),
				       0, 0, 0);
		if (rc) {
			/* Could not get log space; give the transaction up. */
			xfs_trans_cancel(trans, 0);
			return rc;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(trans, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(trans, ip);
		xfs_trans_log_inode(trans, ip, XFS_ILOG_CORE);
		xfs_trans_set_sync(trans);
		rc = xfs_trans_commit(trans, 0, NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		return rc;
	}

	/*
	 * O_DSYNC without a size change.  An allocation transaction may
	 * still have run, and a direct I/O write may have grown the file
	 * non-transactionally and finished before we started, so buffered
	 * writes locking out direct I/O is not something to rely on.
	 * Force the log if this inode has a transaction committed against
	 * it, or if it is pinned because someone else's commit record has
	 * not reached the disk yet.  That way every change affecting the
	 * inode is permanent by the time we return.
	 */
	log_item = ip->i_itemp;
	if (log_item && log_item->ili_last_lsn) {
		xfs_log_force(mp, log_item->ili_last_lsn,
			      XFS_LOG_FORCE | XFS_LOG_SYNC);
	} else if (xfs_ipincount(ip) > 0) {
		xfs_log_force(mp, (xfs_lsn_t)0,
			      XFS_LOG_FORCE | XFS_LOG_SYNC);
	}

	return 0;
}
/*
 * Called from xfs_trans_push_ail() when IOP_TRYLOCK could not obtain the
 * inode flush lock but did obtain the inode lock SHARED.
 *
 * Check whether the inode's backing buffer is in core and marked delayed
 * write; if it is, start a bawrite on it to hurry things along.
 *
 * Neither the AIL_LOCK nor the flush lock is held while this runs, so
 * the whole routine is inherently racy.
 *
 * On every path the pushbuf flag is cleared and the SHARED inode lock is
 * dropped before returning.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t		*ip = iip->ili_inode;
	xfs_mount_t		*mp;
	xfs_buf_t		*bp = NULL;
	uint			write_it;

	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));

	/*
	 * ili_pushbuf_flag stops anybody else from duplicating the work
	 * we are doing here.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == get_thread_id());

	/*
	 * If the flush lock is no longer held, the inode flush has most
	 * likely completed and the inode has left the AIL.  Nothing to do.
	 */
	if ((valusema(&(ip->i_flock)) > 0) ||
	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
		goto out_unlock;
	}

	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targ, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	/*
	 * No buffer obtained, or the buffer is not delayed write: just
	 * give up our pushbuf ownership (and the buffer, if we got one).
	 */
	if (bp == NULL || !(XFS_BUF_ISDELAYWRITE(bp))) {
		goto out_unlock;
	}

	/*
	 * We were racing with iflush because we hold neither the AIL_LOCK
	 * nor the flush lock.  At this point, though, we own the buffer
	 * and know it is dirty.  iflush may still have beaten us, in which
	 * case the item is already off the AIL; if not, the buffer can be
	 * flushed asynchronously.  Sample that state before letting go of
	 * the inode.
	 */
	write_it = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
		    (valusema(&(ip->i_flock)) <= 0));

	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	xfs_buftrace("INODE ITEM PUSH", bp);

	/* Force the log first if the buffer is pinned. */
	if (XFS_BUF_ISPINNED(bp)) {
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
	}

	if (write_it) {
		xfs_bawrite(mp, bp);
	} else {
		xfs_buf_relse(bp);
	}
	return;

out_unlock:
	/*
	 * Be careful not to reset the pushbuf flag too early.  Even though
	 * in theory it could be cleared as soon as we hold the buffer
	 * lock, we do not want others doing needless work: they would
	 * enter this function believing that pushing the buffer is their
	 * responsibility, only to find the buffer still locked by someone
	 * else doing the very same thing. XXX
	 */
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	if (bp != NULL) {
		xfs_buf_relse(bp);
	}
}