asmlinkage void do_page_fault(unsigned long address, struct pt_regs *regs, int protection, int writeaccess) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; siginfo_t info; D(printk("Page fault for %lX on %X at %lX, prot %d write %d\n", address, smp_processor_id(), instruction_pointer(regs), protection, writeaccess)); tsk = current; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * NOTE2: This is done so that, when updating the vmalloc * mappings we don't have to walk all processes pgdirs and * add the high mappings all at once. Instead we do it as they * are used. However vmalloc'ed page entries have the PAGE_GLOBAL * bit set so sometimes the TLB can use a lingering entry. * * This verifies that the fault happens in kernel space * and that the fault was not a protection error (error_code & 1). */ if (address >= VMALLOC_START && !protection && !user_mode(regs)) goto vmalloc_fault; /* When stack execution is not allowed we store the signal * trampolines in the reserved cris_signal_return_page. * Handle this in the exact same way as vmalloc (we know * that the mapping is there and is valid so no need to * call handle_mm_fault). */ if (cris_signal_return_page && address == cris_signal_return_page && !protection && user_mode(regs)) goto vmalloc_fault; /* we can and should enable interrupts at this point */ local_irq_enable(); mm = tsk->mm; info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_atomic() || !mm) goto no_context; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (user_mode(regs)) { /* * accessing the stack below usp is always a bug. * we get page-aligned addresses so we can only check * if we're within a page from usp, but that might be * enough to catch brutal errors at least. */ if (address + PAGE_SIZE < rdusp()) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; /* first do some preliminary protection checks */ if (writeaccess == 2){ if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else if (writeaccess == 1) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, writeaccess & 1)) { case VM_FAULT_MINOR: tsk->min_flt++; break; case VM_FAULT_MAJOR: tsk->maj_flt++; break; case VM_FAULT_SIGBUS: goto do_sigbus; default: goto out_of_memory; } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: DPG(show_registers(regs)); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, tsk); return; } no_context: /* Are we prepared to handle this kernel fault? * * (The kernel has valid exception-points in the source * when it acesses user-memory. When it fails in one * of those points, we find it in a table and do a jump * to some fixup code that loads an appropriate error * code) */ if (find_fixup_code(regs)) return; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ if ((unsigned long) (address) < PAGE_SIZE) raw_printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else raw_printk(KERN_ALERT "Unable to handle kernel access"); raw_printk(" at virtual address %08lx\n",address); die_if_kernel("Oops", regs, (writeaccess << 1) | protection); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); printk("VM: killing process %s\n", tsk->comm); if (user_mode(regs)) do_exit(SIGKILL); goto no_context; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *)address; force_sig_info(SIGBUS, &info, tsk); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Use current_pgd instead of tsk->active_mm->pgd * since the latter might be unavailable if this * code is executed in a misfortunately run irq * (like inside schedule() between switch_mm and * switch_to...). */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = (pgd_t *)per_cpu(current_pgd, smp_processor_id()) + offset; pgd_k = init_mm.pgd + offset; /* Since we're two-level, we don't need to do both * set_pgd and set_pmd (they do the same thing). If * we go three-level at some point, do the right thing * with pgd_present and set_pgd here. * * Also, since the vmalloc area is global, we don't * need to copy individual PTE's, it is enough to * copy the pgd pointer into the pte page of the * root task. If that is there, we'll find our pte if * it exists. */ pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto bad_area_nosemaphore; set_pmd(pmd, *pmd_k); /* Make sure the actual PTE exists as well to * catch kernel vmalloc-area accesses to non-mapped * addresses. If we don't do this, this will just * silently loop forever. */ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } }
/* * Prepare the write for the inline data. * If the the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret, inline_size; handle_t *handle; struct page *page; struct ext4_iloc iloc; int retries; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } inline_size = ext4_get_max_inline_size(inode); ret = -ENOSPC; if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_journal; } /* * We cannot recurse into the filesystem as the transaction * is already started. */ flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_page; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out_release_page; } up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); page_cache_release(page); out_journal: ext4_journal_stop(handle); out: brelse(iloc.bh); return ret; }
/** * nilfs_get_block() - get a file block on the filesystem (callback function) * @inode - inode struct of the target file * @blkoff - file block number * @bh_result - buffer head to be mapped on * @create - indicate whether allocating the block or not when it has not * been allocated yet. * * This function does not issue actual read request of the specified data * block. It is done by VFS. */ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (ret >= 0) { /* found */ map_bh(bh_result, inode->i_sb, blknum); if (ret > 0) bh_result->b_size = (ret << inode->i_blkbits); goto out; } /* data block was not found */ if (ret == -ENOENT && create) { struct nilfs_transaction_info ti; bh_result->b_blocknr = 0; err = nilfs_transaction_begin(inode->i_sb, &ti, 1); if (unlikely(err)) goto out; err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, (unsigned long)bh_result); if (unlikely(err != 0)) { if (err == -EEXIST) { /* * The get_block() function could be called * from multiple callers for an inode. * However, the page having this block must * be locked in this case. */ printk(KERN_WARNING "nilfs_get_block: a race condition " "while inserting a data block. " "(inode number=%lu, file block " "offset=%llu)\n", inode->i_ino, (unsigned long long)blkoff); err = 0; } nilfs_transaction_abort(inode->i_sb); goto out; } nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); set_buffer_delay(bh_result); map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed to proper value */ } else if (ret == -ENOENT) { /* not found is not error (e.g. hole); must return without the mapped state flag. */ ; } else { err = ret; } out: return err; }
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; bool need_cp = false; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } /* if the inode is dirty, let's recover all the time */ if (!datasync) { f2fs_write_inode(inode, NULL); goto go_write; } /* * if there is no written data, don't waste time to write recovery info. */ if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ down_read(&fi->i_sem); need_cp = need_do_checkpoint(inode); up_read(&fi->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); /* * We've secured consistency through sync_fs. Following pino * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); clear_inode_flag(fi, FI_APPEND_WRITE); clear_inode_flag(fi, FI_UPDATE_WRITE); goto out; } sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) goto out; if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } ret = wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; /* once recovery info is written, don't need to tack this */ remove_dirty_inode(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: remove_dirty_inode(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { struct pagevec pvec; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, int whence) { switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) return true; break; case SEEK_HOLE: if (blkaddr == NULL_ADDR) return true; break; } return false; } static inline int unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } static loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; struct dnode_of_data dn; pgoff_t pgofs, end_offset, dirty; loff_t data_ofs = offset; loff_t isize; int err = 0; mutex_lock(&inode->i_mutex); isize = i_size_read(inode); if (offset >= isize) goto fail; /* handle inline data case */ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { if (whence == SEEK_HOLE) data_ofs = isize; goto found; } pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { pgofs = PGOFS_OF_NEXT_DNODE(pgofs, F2FS_I(inode)); continue; } else { goto found; } } end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; } } f2fs_put_dnode(&dn); } if (whence == SEEK_DATA) goto fail; found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; mutex_unlock(&inode->i_mutex); return vfs_setpos(file, data_ofs, maxbytes); fail: mutex_unlock(&inode->i_mutex); return -ENXIO; } static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_DATA: case SEEK_HOLE: if (offset < 0) return -ENXIO; return f2fs_seek_block(file, offset, whence); } return -EINVAL; } static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); if (f2fs_encrypted_inode(inode)) { int err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ if (f2fs_has_inline_data(inode)) { int err = f2fs_convert_inline_inode(inode); if (err) return err; } file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; return 0; } static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); if (!ret && f2fs_encrypted_inode(inode)) { ret = f2fs_get_encryption_info(inode); if (ret) ret = -EACCES; } return ret; }
/* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) { int error = 0; unsigned int offset, parent_ino; int i, stored; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(filp); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = filp->f_pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = EXT4_DIR_REC_LEN(1); dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; while (!error && !stored && filp->f_pos < extra_size) { revalidate: /* * If the version has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (filp->f_version != inode->i_version) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; filp->f_pos = offset; filp->f_version = inode->i_version; } while (!error && filp->f_pos < extra_size) { if (filp->f_pos == 0) { error = filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR); if (error) break; stored++; filp->f_pos = dotdot_offset; continue; } if (filp->f_pos == dotdot_offset) { error = filldir(dirent, "..", 2, dotdot_offset, parent_ino, DT_DIR); if (error) break; stored++; filp->f_pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + filp->f_pos - extra_offset); if (ext4_check_dir_entry(inode, filp, de, iloc.bh, dir_buf, extra_size, filp->f_pos)) { ret = stored; goto out; } if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is * currently swapped out. So, use a * version stamp to detect whether or * not the directory has been modified * during the copy operation. */ u64 version = filp->f_version; error = filldir(dirent, de->name, de->name_len, filp->f_pos, le32_to_cpu(de->inode), get_dtype(sb, de->file_type)); if (error) break; if (version != filp->f_version) goto revalidate; stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } } out: kfree(dir_buf); brelse(iloc.bh); return ret; }
int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; __u32 goal; uid_t owner[2]; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_INCOMPAT_EXTENTS) || (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; /* * Worst case we can touch the allocation bitmaps, a bgd * block, and a block to link in the orphan list. We do need * need to worry about credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); return retval; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have committed the * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_mutex we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated * with i_data_sem held to prevent racing with block * allocation. */ down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { /* * It is impossible to update on-disk structures without * a handle, so just rollback in-core changes and live other * work to orphan_list_cleanup() */ ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); goto out; } ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; } else lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ if (ext4_journal_extend(handle, 1) != 0) ext4_journal_restart(handle, 1); /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_delete_inode does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); ext4_journal_stop(handle); out: unlock_new_inode(tmp_inode); iput(tmp_inode); return retval; }
/* * Get the appropriate destination keyring for the request. * * The keyring selected is returned with an extra reference upon it which the * caller must release. */ static void construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; kenter("%p", dest_keyring); /* find the appropriate keyring */ if (dest_keyring) { /* the caller supplied one */ key_get(dest_keyring); } else { /* use a default keyring; falling through the cases until we * find one that we actually have */ switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); rka = authkey->payload.data; if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = key_get(rka->dest_keyring); up_read(&authkey->sem); if (dest_keyring) break; } case KEY_REQKEY_DEFL_THREAD_KEYRING: dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: dest_keyring = key_get(cred->tgcred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( rcu_dereference(cred->tgcred->session_keyring)); rcu_read_unlock(); if (dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: dest_keyring = key_get(cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: dest_keyring = key_get(cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: BUG(); } } *_dest_keyring = dest_keyring; kleave(" [dk %d]", key_serial(dest_keyring)); return; }
/** * nilfs_fill_super() - initialize a super block instance * @sb: super_block * @data: mount options * @silent: silent mode flag * @nilfs: the_nilfs struct * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. */ static int nilfs_fill_super(struct super_block *sb, void *data, int silent, struct the_nilfs *nilfs) { struct nilfs_sb_info *sbi; struct inode *root; __u64 cno; int err; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; get_nilfs(nilfs); sbi->s_nilfs = nilfs; sbi->s_super = sb; atomic_set(&sbi->s_count, 1); err = init_nilfs(nilfs, sbi, (char *)data); if (err) goto failed_sbi; spin_lock_init(&sbi->s_inode_lock); INIT_LIST_HEAD(&sbi->s_dirty_files); INIT_LIST_HEAD(&sbi->s_list); /* * Following initialization is overlapped because * nilfs_sb_info structure has been cleared at the beginning. * But we reserve them to keep our interest and make ready * for the future change. */ get_random_bytes(&sbi->s_next_generation, sizeof(sbi->s_next_generation)); spin_lock_init(&sbi->s_next_gen_lock); sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; sb->s_time_gran = 1; sb->s_bdi = nilfs->ns_bdi; err = load_nilfs(nilfs, sbi); if (err) goto failed_sbi; cno = nilfs_last_cno(nilfs); if (sb->s_flags & MS_RDONLY) { if (nilfs_test_opt(sbi, SNAPSHOT)) { down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, sbi->s_snapshot_cno); up_read(&nilfs->ns_segctor_sem); if (err < 0) { if (err == -ENOENT) err = -EINVAL; goto failed_sbi; } if (!err) { printk(KERN_ERR "NILFS: The specified checkpoint is " "not a snapshot " "(checkpoint number=%llu).\n", (unsigned long long)sbi->s_snapshot_cno); err = -EINVAL; goto failed_sbi; } cno = sbi->s_snapshot_cno; } } err = nilfs_attach_checkpoint(sbi, cno); if (err) { printk(KERN_ERR "NILFS: error loading a checkpoint" " (checkpoint number=%llu).\n", (unsigned long long)cno); goto failed_sbi; } if (!(sb->s_flags & MS_RDONLY)) { err = nilfs_attach_segment_constructor(sbi); if (err) goto failed_checkpoint; } root = nilfs_iget(sb, NILFS_ROOT_INO); if (IS_ERR(root)) { printk(KERN_ERR "NILFS: get root inode failed\n"); err = PTR_ERR(root); goto failed_segctor; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); printk(KERN_ERR "NILFS: corrupt root inode.\n"); err = -EINVAL; goto failed_segctor; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { iput(root); printk(KERN_ERR "NILFS: get root dentry failed\n"); err = -ENOMEM; goto failed_segctor; } if (!(sb->s_flags & MS_RDONLY)) { down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); up_write(&nilfs->ns_sem); } down_write(&nilfs->ns_super_sem); if (!nilfs_test_opt(sbi, SNAPSHOT)) nilfs->ns_current = sbi; up_write(&nilfs->ns_super_sem); return 0; failed_segctor: nilfs_detach_segment_constructor(sbi); failed_checkpoint: nilfs_detach_checkpoint(sbi); failed_sbi: put_nilfs(nilfs); sb->s_fs_info = NULL; nilfs_put_sbinfo(sbi); return err; }
/* * Canonical page fault handler */ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; siginfo_t info; int si_code = SEGV_MAPERR; int fault; const struct exception_table_entry *fixup; /* * If we're in an interrupt or have no user context, * then must not take the fault. */ if (unlikely(in_interrupt() || !mm)) goto no_context; local_irq_enable(); down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; good_area: /* Address space is OK. Now check access rights. */ si_code = SEGV_ACCERR; switch (cause) { case FLT_IFETCH: if (!(vma->vm_flags & VM_EXEC)) goto bad_area; break; case FLT_LOAD: if (!(vma->vm_flags & VM_READ)) goto bad_area; break; case FLT_STORE: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; break; } fault = handle_mm_fault(mm, vma, address, (cause > 0)); /* The most common case -- we are done. */ if (likely(!(fault & VM_FAULT_ERROR))) { if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; up_read(&mm->mmap_sem); return; } up_read(&mm->mmap_sem); /* Handle copyin/out exception cases */ if (!user_mode(regs)) goto no_context; if (fault & VM_FAULT_OOM) { pagefault_out_of_memory(); return; } /* User-mode address is in the memory map, but we are * unable to fix up the page fault. */ if (fault & VM_FAULT_SIGBUS) { info.si_signo = SIGBUS; info.si_code = BUS_ADRERR; } /* Address is not in the memory map */ else { info.si_signo = SIGSEGV; info.si_code = SEGV_ACCERR; } info.si_errno = 0; info.si_addr = (void __user *)address; force_sig_info(info.si_code, &info, current); return; bad_area: up_read(&mm->mmap_sem); if (user_mode(regs)) { info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, current); return; } /* Kernel-mode fault falls through */ no_context: fixup = search_exception_tables(pt_elr(regs)); if (fixup) { pt_set_elr(regs, fixup->fixup); return; } /* Things are looking very, very bad now */ bust_spinlocks(1); printk(KERN_EMERG "Unable to handle kernel paging request at " "virtual address 0x%08lx, regs %p\n", address, regs); die("Bad Kernel VA", regs, SIGKILL); }
/* * MS_SYNC syncs the entire file - including mappings. * * MS_ASYNC does not start I/O (it used to, up to 2.5.67). * Nor does it marks the relevant pages dirty (it used to up to 2.6.17). * Now it doesn't do anything, since dirty pages are properly tracked. * * The application may now run fsync() to * write out the dirty pages and wait on the writeout and check the result. * Or the application may run fadvise(FADV_DONTNEED) against the fd to start * async writeout immediately. * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) { unsigned long end; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int unmapped_error = 0; int error = -EINVAL; if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (start & ~PAGE_MASK) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; error = -ENOMEM; len = (len + ~PAGE_MASK) & PAGE_MASK; end = start + len; if (end < start) goto out; error = 0; if (end == start) goto out; /* * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ down_read(&mm->mmap_sem); vma = find_vma(mm, start); for (;;) { struct file *file; /* Still start < end. */ error = -ENOMEM; if (!vma) goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { start = vma->vm_start; if (start >= end) goto out_unlock; unmapped_error = -ENOMEM; } /* Here vma->vm_start <= start < vma->vm_end. */ if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) { error = -EBUSY; goto out_unlock; } file = vma->vm_file; start = vma->vm_end; if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); error = do_fsync(file, 0); fput(file); if (error || start >= end) goto out; down_read(&mm->mmap_sem); vma = find_vma(mm, start); } else { if (start >= end) { error = 0; goto out_unlock; } vma = vma->vm_next; } } out_unlock: up_read(&mm->mmap_sem); out: return error ? : unmapped_error; }
static int nilfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { struct nilfs_super_data sd; struct super_block *s; fmode_t mode = FMODE_READ; struct the_nilfs *nilfs; int err, need_to_close = 1; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return PTR_ERR(sd.bdev); /* * To get mount instance using sget() vfs-routine, NILFS needs * much more information than normal filesystems to identify mount * instance. For snapshot mounts, not only a mount type (ro-mount * or rw-mount) but also a checkpoint number is required. */ sd.cno = 0; sd.flags = flags; if (nilfs_identify((char *)data, &sd)) { err = -EINVAL; goto failed; } nilfs = find_or_create_nilfs(sd.bdev); if (!nilfs) { err = -ENOMEM; goto failed; } mutex_lock(&nilfs->ns_mount_mutex); if (!sd.cno) { /* * Check if an exclusive mount exists or not. * Snapshot mounts coexist with a current mount * (i.e. rw-mount or ro-mount), whereas rw-mount and * ro-mount are mutually exclusive. */ down_read(&nilfs->ns_super_sem); if (nilfs->ns_current && ((nilfs->ns_current->s_super->s_flags ^ flags) & MS_RDONLY)) { up_read(&nilfs->ns_super_sem); err = -EBUSY; goto failed_unlock; } up_read(&nilfs->ns_super_sem); } /* * Find existing nilfs_sb_info struct */ sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); /* * Get super block instance holding the nilfs_sb_info struct. * A new instance is allocated if no existing mount is present or * existing instance has been unmounted. */ s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); if (sd.sbi) nilfs_put_sbinfo(sd.sbi); if (IS_ERR(s)) { err = PTR_ERR(s); goto failed_unlock; } if (!s->s_root) { char b[BDEVNAME_SIZE]; /* New superblock instance created */ s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, nilfs); if (err) goto cancel_new; s->s_flags |= MS_ACTIVE; need_to_close = 0; } mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); if (need_to_close) close_bdev_exclusive(sd.bdev, mode); simple_set_mnt(mnt, s); return 0; failed_unlock: mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); failed: close_bdev_exclusive(sd.bdev, mode); return err; cancel_new: /* Abandoning the newly allocated superblock */ mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); deactivate_locked_super(s); /* * deactivate_locked_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; * put_nilfs() needs the block device. */ return err; }
/* * inode->i_mutex: down */ int reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, size_t buffer_size) { ssize_t err = 0; struct dentry *dentry; size_t isize; size_t file_pos = 0; size_t buffer_pos = 0; struct page *page; __u32 hash = 0; if (name == NULL) return -EINVAL; /* We can't have xattrs attached to v1 items since they don't have * generation numbers */ if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; dentry = xattr_lookup(inode, name, XATTR_REPLACE); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; } down_read(&REISERFS_I(inode)->i_xattr_sem); isize = i_size_read(dentry->d_inode); /* Just return the size needed */ if (buffer == NULL) { err = isize - sizeof(struct reiserfs_xattr_header); goto out_unlock; } if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { err = -ERANGE; goto out_unlock; } while (file_pos < isize) { size_t chunk; char *data; size_t skip = 0; if (isize - file_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else chunk = isize - file_pos; page = reiserfs_get_page(dentry->d_inode, file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; } lock_page(page); data = page_address(page); if (file_pos == 0) { struct reiserfs_xattr_header *rxh = (struct reiserfs_xattr_header *)data; skip = file_pos = sizeof(struct reiserfs_xattr_header); chunk -= skip; /* Magic doesn't match up.. */ if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { unlock_page(page); reiserfs_put_page(page); reiserfs_warning(inode->i_sb, "jdm-20001", "Invalid magic for xattr (%s) " "associated with %k", name, INODE_PKEY(inode)); err = -EIO; goto out_unlock; } hash = le32_to_cpu(rxh->h_hash); } memcpy(buffer + buffer_pos, data + skip, chunk); unlock_page(page); reiserfs_put_page(page); file_pos += chunk; buffer_pos += chunk; skip = 0; } err = isize - sizeof(struct reiserfs_xattr_header); if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != hash) { reiserfs_warning(inode->i_sb, "jdm-20002", "Invalid hash for xattr (%s) associated " "with %k", name, INODE_PKEY(inode)); err = -EIO; } out_unlock: up_read(&REISERFS_I(inode)->i_xattr_sem); dput(dentry); out: return err; }
asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) { struct msg_queue *msq; int err, version; struct ipc_namespace *ns; if (msqid < 0 || cmd < 0) return -EINVAL; version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; int max_id; if (!buf) return -EFAULT; /* * We must not return kernel stack data. * due to padding, it's not enough * to set all member fields. */ err = security_msg_queue_msgctl(NULL, cmd); if (err) return err; memset(&msginfo, 0, sizeof(msginfo)); msginfo.msgmni = ns->msg_ctlmni; msginfo.msgmax = ns->msg_ctlmax; msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; down_read(&msg_ids(ns).rw_mutex); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); msginfo.msgtql = atomic_read(&ns->msg_bytes); } else { msginfo.msgmap = MSGMAP; msginfo.msgpool = MSGPOOL; msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); up_read(&msg_ids(ns).rw_mutex); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ case IPC_STAT: { struct msqid64_ds tbuf; int success_return; if (!buf) return -EFAULT; if (cmd == MSG_STAT) { msq = msg_lock(ns, msqid); if (IS_ERR(msq)) return PTR_ERR(msq); success_return = msq->q_perm.id; } else { msq = msg_lock_check(ns, msqid); if (IS_ERR(msq)) return PTR_ERR(msq); success_return = 0; } err = -EACCES; if (ipcperms(&msq->q_perm, S_IRUGO)) goto out_unlock; err = security_msg_queue_msgctl(msq, cmd); if (err) goto out_unlock; memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm); tbuf.msg_stime = msq->q_stime; tbuf.msg_rtime = msq->q_rtime; tbuf.msg_ctime = msq->q_ctime; tbuf.msg_cbytes = msq->q_cbytes; tbuf.msg_qnum = msq->q_qnum; tbuf.msg_qbytes = msq->q_qbytes; tbuf.msg_lspid = msq->q_lspid; tbuf.msg_lrpid = msq->q_lrpid; msg_unlock(msq); if (copy_msqid_to_user(buf, &tbuf, version)) return -EFAULT; return success_return; } case IPC_SET: case IPC_RMID: err = msgctl_down(ns, msqid, cmd, buf, version); return err; default: return -EINVAL; } out_unlock: msg_unlock(msq); return err; }
static int ext4_ioctl_setproject(struct file *filp, __u32 projid) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; handle_t *handle; kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else return 0; } if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) return -EOPNOTSUPP; kprojid = make_kprojid(&init_user_ns, (projid_t)projid); if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; err = -EPERM; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) return err; err = ext4_get_inode_loc(inode, &iloc); if (err) return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { err = ext4_expand_extra_isize(inode, EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) return err; } else { brelse(iloc.bh); } err = dquot_initialize(inode); if (err) return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_stop; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { /* __dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; } EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; out_stop: ext4_journal_stop(handle); return err; }
static int nilfs_remount(struct super_block *sb, int *flags, char *data) { struct the_nilfs *nilfs = sb->s_fs_info; unsigned long old_sb_flags; unsigned long old_mount_opt; int err; old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; if (!parse_options(data, sb, 1)) { err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL); err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because the filesystem is in an " "incomplete recovery state.\n", sb->s_id); goto restore_opts; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) goto out; if (*flags & MS_RDONLY) { /* Shutting down log writer */ nilfs_detach_log_writer(sb); sb->s_flags |= MS_RDONLY; /* * Remounting a valid RW partition RDONLY, so set * the RDONLY flag and then mark the partition as valid again. */ down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); } else { __u64 features; struct nilfs_root *root; /* * Mounting a RDONLY partition read-write, so reread and * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ down_read(&nilfs->ns_sem); features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; up_read(&nilfs->ns_sem); if (features) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount RDWR because of unsupported optional " "features (%llx)\n", sb->s_id, (unsigned long long)features); err = -EROFS; goto restore_opts; } sb->s_flags &= ~MS_RDONLY; root = NILFS_I(sb->s_root->d_inode)->i_root; err = nilfs_attach_log_writer(sb, root); if (err) goto restore_opts; down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } out: return 0; restore_opts: sb->s_flags = old_sb_flags; nilfs->ns_mount_opt = old_mount_opt; return err; }
int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; ext4_lblk_t blk_count = 0; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct list_blocks_struct lb; unsigned long max_entries; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_INCOMPAT_EXTENTS) || (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* * We don't want the inode to be reclaimed * if we got interrupted in between. We have * this tmp inode carrying reference to the * data blocks of the original file. We set * the i_nlink to zero at the last stage after * switching the original file to extent format */ tmp_inode->i_nlink = 1; ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have commited the * trascation that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_mutex we can still cause block allocation * via mmap write to holes. If we have allocated new blocks we fail * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. * The flag is updated with i_data_sem held to prevent racing with * block allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); ei = EXT4_I(inode); i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), blk_count, &lb); if (retval) goto err_out; } } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &blk_count, &lb); if (retval) goto err_out; } else blk_count += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &blk_count, &lb); if (retval) goto err_out; } else blk_count += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &blk_count, &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ if (ext4_journal_extend(handle, 1) != 0) ext4_journal_restart(handle, 1); /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_delete_inode does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); /* * Set the i_nlink to zero so that * generic_drop_inode really deletes the * inode */ tmp_inode->i_nlink = 0; ext4_journal_stop(handle); iput(tmp_inode); return retval; }
static int load_image(struct pil_device *pil) { int i, ret; char fw_name[30]; struct elf32_hdr *ehdr; const struct elf32_phdr *phdr; const struct firmware *fw; unsigned long proxy_timeout = pil->desc->proxy_timeout; down_read(&pil_pm_rwsem); snprintf(fw_name, sizeof(fw_name), "%s.mdt", pil->desc->name); ret = request_firmware(&fw, fw_name, &pil->dev); if (ret) { dev_err(&pil->dev, "%s: Failed to locate %s\n", pil->desc->name, fw_name); goto out; } if (fw->size < sizeof(*ehdr)) { dev_err(&pil->dev, "%s: Not big enough to be an elf header\n", pil->desc->name); ret = -EIO; goto release_fw; } ehdr = (struct elf32_hdr *)fw->data; if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG)) { dev_err(&pil->dev, "%s: Not an elf header\n", pil->desc->name); ret = -EIO; goto release_fw; } if (ehdr->e_phnum == 0) { dev_err(&pil->dev, "%s: No loadable segments\n", pil->desc->name); ret = -EIO; goto release_fw; } if (sizeof(struct elf32_phdr) * ehdr->e_phnum + sizeof(struct elf32_hdr) > fw->size) { dev_err(&pil->dev, "%s: Program headers not within mdt\n", pil->desc->name); ret = -EIO; goto release_fw; } ret = pil->desc->ops->init_image(pil->desc, fw->data, fw->size); if (ret) { dev_err(&pil->dev, "%s: Invalid firmware metadata\n", pil->desc->name); goto release_fw; } phdr = (const struct elf32_phdr *)(fw->data + sizeof(struct elf32_hdr)); for (i = 0; i < ehdr->e_phnum; i++, phdr++) { if (!segment_is_loadable(phdr)) continue; ret = load_segment(phdr, i, pil); if (ret) { dev_err(&pil->dev, "%s: Failed to load segment %d\n", pil->desc->name, i); goto release_fw; } } ret = pil_proxy_vote(pil); if (ret) { dev_err(&pil->dev, "%s: Failed to proxy vote\n", pil->desc->name); goto release_fw; } ret = pil->desc->ops->auth_and_reset(pil->desc); if (ret) { dev_err(&pil->dev, "%s: Failed to bring out of reset\n", pil->desc->name); proxy_timeout = 0; goto err_boot; } dev_info(&pil->dev, "%s: Brought out of reset\n", pil->desc->name); err_boot: pil_proxy_unvote(pil, proxy_timeout); release_fw: release_firmware(fw); out: up_read(&pil_pm_rwsem); return ret; }
int do_page_fault(unsigned long addr, int error_code, struct pt_regs *regs) { struct task_struct *tsk; struct mm_struct *mm; int fault; tsk = current; mm = tsk->mm; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_interrupt() || !mm) goto no_context; down_read(&mm->mmap_sem); fault = __do_page_fault(mm, addr, error_code, tsk); up_read(&mm->mmap_sem); /* * Handle the "normal" case first */ if (fault > 0) return 0; /* * We had some memory, but were unable to * successfully fix up this page fault. */ if (fault == 0) goto do_sigbus; /* * If we are in kernel mode at this point, we * have no context to handle this fault with. */ if (!user_mode(regs)) goto no_context; if (fault == -3) { /* * We ran out of memory, or some other thing happened to * us that made us unable to handle the page fault gracefully. */ printk("VM: killing process %s\n", tsk->comm); do_exit(SIGKILL); } else __do_user_fault(tsk, addr, error_code, fault == -1 ? SEGV_ACCERR : SEGV_MAPERR, regs); return 0; /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ do_sigbus: /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.address = addr; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; force_sig(SIGBUS, tsk); #ifdef CONFIG_DEBUG_USER printk(KERN_DEBUG "%s: sigbus at 0x%08lx, pc=0x%08lx\n", current->comm, addr, instruction_pointer(regs)); #endif /* Kernel mode? Handle exceptions or die */ if (user_mode(regs)) return 0; no_context: __do_kernel_fault(mm, addr, error_code, regs); return 0; }
/* XXX Note, this fuction does not currently handle faults from * vmalloc/vmaped'd memory. That should probably be in a separate * function anyway. */ int l4_do_page_fault(unsigned long address, long access, struct pt_regs *regs) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; int fault, si_code = SEGV_MAPERR; siginfo_t info; /* If we're in an interrupt context, or have no user context, we must not take the fault. */ if (!mm) /* || in_interrupt()) */ goto bad_area_nosemaphore; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* Ok, we have a good vm_area for this memory access, so we can handle it. */ good_area: si_code = SEGV_ACCERR; if (/* LOAD */ access & 0x4) { /* Allow reads even for write/execute-only mappings */ if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) goto bad_area; } else if (/* FETCH */ access & 0x1) { if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } survive: /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ fault = handle_mm_fault(mm, vma, address, access & 0x2); up_read(&mm->mmap_sem); switch (fault) { case VM_FAULT_MINOR: current->min_flt++; break; case VM_FAULT_MAJOR: current->maj_flt++; break; case VM_FAULT_SIGBUS: goto do_sigbus; case VM_FAULT_OOM: goto out_of_memory; #if 0 /* * Well, it's a good idea to have this here, but apparently * handle_mm_fault() can return all sorts of weird stuff, which * makes it unsuitable to put BUG() here. -gl */ default: BUG(); #endif } return 0; /* Something tried to access memory that isn't in our memory map. Fix it, but check if it's kernel or user first. */ bad_area: up_read(&mm->mmap_sem); /* Check if it is at TASK_SIG_BASE */ #ifdef CONFIG_ARCH_ARM /* * Binary patching for NPTL * * XXX ??? Better place this thing? */ if (user_mode(regs) && ((address & PAGE_MASK) == 0xffff0000)) { #if 0 printk("Fault at address 0x%lx pc = 0x%lx, " "need rewrite\n", address, L4_MsgWord(¤t_regs()->msg, 1)); #endif if (address == 0xffff0fe0) { L4_Msg_t msg; unsigned long pc = L4_MsgWord(¤t_regs()->msg, 1); unsigned long lr, fpc; unsigned long instr, r; long offs; if (pc != 0xffff0fe0) goto bad_area_nosemaphore; L4_Copy_regs_to_mrs(task_thread_info(current)->user_tid); L4_StoreMRs(0, 16, &msg.msg[0]); lr = msg.msg[14]; fpc = lr - 4; L4_CacheFlushAll(); instr = get_instr(fpc); if (instr == -1UL) goto bad_area_nosemaphore; if ((instr & 0x0f000000) == 0x0b000000) { offs = instr << 8; offs = offs >> 6; /* ASR */ fpc = (fpc + 8) + offs; instr = get_instr(fpc); if (instr == -1UL) goto bad_area_nosemaphore; if ((instr & 0xffffffff) == 0xe3e00a0f) { /* mvn r0, 0xf000 */ /* * Rewrite to load the * kernel_reserved[0] from the * utcb. * * This requires L4 to cooperate * with the ExReg() syscall. */ /* mov r0, #0xff000000 */ r = set_instr(fpc, 0xe3a004ff); if (r == -1UL) goto bad_area_nosemaphore; fpc += 4; /* ldr r0, [r0, #0xff0] */ r = set_instr(fpc, 0xe5900ff0); if (r == -1UL) goto bad_area_nosemaphore; fpc += 4; /* ldr r0, [r0, #56] */ r = set_instr(fpc, 0xe5900038); if (r == -1UL) goto bad_area_nosemaphore; fpc += 4; /* mov pc, lr */ r = set_instr(fpc, 0xe1a0f00e); if (r == -1UL) goto bad_area_nosemaphore; L4_CacheFlushAll(); msg.msg[0] = current_thread_info()->tp_value; msg.msg[15] = lr; L4_LoadMRs(0, 16, &msg.msg[0]); L4_Copy_mrs_to_regs( task_thread_info(current)->user_tid); L4_MsgPutWord(¤t_regs()->msg, 1, lr); return 0; } } else if (instr == 0xe240f01f) {
int videobuf_qbuf(struct videobuf_queue *q, struct v4l2_buffer *b) { struct videobuf_buffer *buf; enum v4l2_field field; unsigned long flags = 0; int retval; MAGIC_CHECK(q->int_ops->magic, MAGIC_QTYPE_OPS); if (b->memory == V4L2_MEMORY_MMAP) down_read(¤t->mm->mmap_sem); mutex_lock(&q->vb_lock); retval = -EBUSY; if (q->reading) { dprintk(1, "qbuf: Reading running...\n"); goto done; } retval = -EINVAL; if (b->type != q->type) { dprintk(1, "qbuf: Wrong type.\n"); goto done; } if (b->index >= VIDEO_MAX_FRAME) { dprintk(1, "qbuf: index out of range.\n"); goto done; } buf = q->bufs[b->index]; if (NULL == buf) { dprintk(1, "qbuf: buffer is null.\n"); goto done; } MAGIC_CHECK(buf->magic, MAGIC_BUFFER); if (buf->memory != b->memory) { dprintk(1, "qbuf: memory type is wrong.\n"); goto done; } if (buf->state != VIDEOBUF_NEEDS_INIT && buf->state != VIDEOBUF_IDLE) { dprintk(1, "qbuf: buffer is already queued or active.\n"); goto done; } if (b->flags & V4L2_BUF_FLAG_INPUT) { if (b->input >= q->inputs) { dprintk(1, "qbuf: wrong input.\n"); goto done; } buf->input = b->input; } else { buf->input = UNSET; } switch (b->memory) { case V4L2_MEMORY_MMAP: if (0 == buf->baddr) { dprintk(1, "qbuf: mmap requested " "but buffer addr is zero!\n"); goto done; } break; case V4L2_MEMORY_USERPTR: if (b->length < buf->bsize) { dprintk(1, "qbuf: buffer length is not enough\n"); goto done; } if (VIDEOBUF_NEEDS_INIT != buf->state && buf->baddr != b->m.userptr) q->ops->buf_release(q, buf); buf->baddr = b->m.userptr; break; case V4L2_MEMORY_OVERLAY: buf->boff = b->m.offset; break; default: dprintk(1, "qbuf: wrong memory type\n"); goto done; } dprintk(1, "qbuf: requesting next field\n"); field = videobuf_next_field(q); retval = q->ops->buf_prepare(q, buf, field); if (0 != retval) { dprintk(1, "qbuf: buffer_prepare returned %d\n", retval); goto done; } list_add_tail(&buf->stream, &q->stream); if (q->streaming) { spin_lock_irqsave(q->irqlock, flags); q->ops->buf_queue(q, buf); spin_unlock_irqrestore(q->irqlock, flags); } dprintk(1, "qbuf: succeded\n"); retval = 0; wake_up_interruptible_sync(&q->wait); done: mutex_unlock(&q->vb_lock); if (b->memory == V4L2_MEMORY_MMAP) up_read(¤t->mm->mmap_sem); return retval; }
/* * Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by * segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; int err = -EFAULT; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; *code_out = SEGV_MAPERR; /* * If the fault was during atomic operation, don't take the fault, just * fail. */ if (in_atomic()) goto out_nosemaphore; if (is_user) flags |= FAULT_FLAG_USER; down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto out; else if (vma->vm_start <= address) goto good_area; else if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; else if (is_user && !ARCH_IS_STACKGROW(address)) goto out; else if (expand_stack(vma, address)) goto out; good_area: *code_out = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto out; flags |= FAULT_FLAG_WRITE; } else { /* Don't require VM_READ|VM_EXEC for write faults! */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto out; } do { int fault; fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; goto out; } BUG(); } if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; pgd = pgd_offset(mm, address); pud = pud_offset(pgd, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); } while (!pte_present(*pte)); err = 0; /* * The below warning was added in place of * pte_mkyoung(); if (is_write) pte_mkdirty(); * If it's triggered, we'd see normally a hang here (a clean pte is * marked read-only to emulate the dirty bit). * However, the generic code can mark a PTE writable but clean on a * concurrent read fault, triggering this harmlessly. So comment it out. */ #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif flush_tlb_page(vma, address); out: up_read(&mm->mmap_sem); out_nosemaphore: return err; out_of_memory: /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ up_read(&mm->mmap_sem); if (!is_user) goto out_nosemaphore; pagefault_out_of_memory(); return 0; }
/** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. See description of the same argument of get_user_pages(). * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * * This function maps virtual addresses from @start and fills @vec structure * with page frame numbers or page pointers to corresponding pages (choice * depends on the type of the vma underlying the virtual address). If @start * belongs to a normal vma, the function grabs reference to each of the pages * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't * touch page structures and the caller must make sure pfns aren't reused for * anything else while he is using them. * * The function returns number of pages mapped which may be less than * @nr_frames. In particular we stop mapping if there are more vmas of * different type underlying the specified range of virtual addresses. * When the function isn't able to map a single page, it returns error. * * This function takes care of grabbing mmap_sem as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, bool write, bool force, struct frame_vector *vec) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret = 0; int err; int locked; if (nr_frames == 0) return 0; if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; down_read(&mm->mmap_sem); locked = 1; vma = find_vma_intersection(mm, start, start + 1); if (!vma) { ret = -EFAULT; goto out; } if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; ret = get_user_pages_locked(current, mm, start, nr_frames, write, force, (struct page **)(vec->ptrs), &locked); goto out; } vec->got_ref = false; vec->is_pfns = true; do { unsigned long *nums = frame_vector_pfns(vec); while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { err = follow_pfn(vma, start, &nums[ret]); if (err) { if (ret == 0) ret = err; goto out; } start += PAGE_SIZE; ret++; } /* * We stop if we have enough pages or if VMA doesn't completely * cover the tail page. */ if (ret >= nr_frames || start < vma->vm_end) break; vma = find_vma_intersection(mm, start, start + 1); } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); out: if (locked) up_read(&mm->mmap_sem); if (!ret) ret = -EFAULT; if (ret > 0) vec->nr_frames = ret; return ret; }
/* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int htree_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( EXT4_DIR_REC_LEN(fake.name_len), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( EXT4_DIR_REC_LEN(fake.name_len), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } ext4fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de); if (err) { count = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; }
unsigned long oxnas_copy_from_user(void *to, const void __user *from, unsigned long count) { int pages_mapped, i; unsigned long transfered = 0; struct page *pages[2]; oxnas_dma_channel_t *channel; unsigned long uaddr = (unsigned long)from; unsigned long end_page = (uaddr + count + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start_page = uaddr >> PAGE_SHIFT; int nr_pages = end_page - start_page; //printk("F 0x%08lx 0x%08lx %lu -> %lu, %lu, %d\n", (unsigned long)to, (unsigned long)from, count, start_page, end_page, nr_pages); might_sleep(); BUG_ON(nr_pages > 2); // Only support a single concurrent copy operation, as only using a single // DMA channel for now while (down_interruptible(©_mutex)); // Get kernel mappings for the user pages down_read(¤t->mm->mmap_sem); pages_mapped = get_user_pages(current, current->mm, uaddr, nr_pages, 0, 0, pages, NULL); up_read(¤t->mm->mmap_sem); if (pages_mapped != nr_pages) { // Didn't get mappings for all pages requested, so release any we did get for (i=0; i < pages_mapped; ++i) { page_cache_release(pages[i]); } pages_mapped = 0; } if (pages_mapped) { int i; struct scatterlist sl; struct scatterlist gl[2]; // Fill gathering DMA descriptors gl[0].page = pages[0]; gl[0].offset = uaddr & ~PAGE_MASK; if (pages_mapped > 1) { gl[0].length = PAGE_SIZE - gl[0].offset; gl[1].offset = 0; gl[1].page = pages[1]; gl[1].length = count - gl[0].length; } else { gl[0].length = count; } // Create DMA mappings for all the user pages for (i=0; i < pages_mapped; ++i) { gl[i].dma_address = dma_map_single(0, page_address(gl[i].page) + gl[i].offset, gl[i].length, DMA_TO_DEVICE); } // Create a DMA mapping for the kernel memory range sl.dma_address = dma_map_single(0, to, count, DMA_FROM_DEVICE); sl.length = count; // Allocate a DMA channel channel = oxnas_dma_request(1); BUG_ON(channel == OXNAS_DMA_CHANNEL_NUL); // Do DMA from user to kernel memory oxnas_dma_set_sg( channel, gl, pages_mapped, &sl, 1, OXNAS_DMA_MODE_INC, OXNAS_DMA_MODE_INC, 0); // Using notification callback oxnas_dma_set_callback(channel, dma_callback, OXNAS_DMA_CALLBACK_ARG_NUL); oxnas_dma_start(channel); // Sleep until transfer complete while (down_interruptible(&callback_semaphore)); oxnas_dma_set_callback(channel, OXNAS_DMA_CALLBACK_NUL, OXNAS_DMA_CALLBACK_ARG_NUL); transfered += count; // Release the DMA channel oxnas_dma_free(channel); // Release kernel DMA mapping dma_unmap_single(0, sl.length, count, DMA_FROM_DEVICE); // Release user DMA mappings for (i=0; i < pages_mapped; ++i) { dma_unmap_single(0, gl[i].dma_address, gl[i].length, DMA_TO_DEVICE); } // Release user pages for (i=0; i < pages_mapped; ++i) { page_cache_release(pages[i]); } } up(©_mutex); return count - transfered; }
/* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep) { int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) goto convert; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; /* * The possible write could happen in the inode, * so try to reserve the space in inode first. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out; /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); brelse(iloc.bh); goto convert; } flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out; } *pagep = page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; unlock_page(page); page_cache_release(page); goto out_up_read; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out_up_read; } ret = 1; handle = NULL; out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; convert: return ext4_convert_inline_data_to_extent(mapping, inode, flags); }
void spu_acquire(struct spu_context *ctx) { down_read(&ctx->state_sema); }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, unsigned long address) { struct vm_area_struct *vma = NULL; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; int code = SEGV_MAPERR; vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; cause >>= 2; /* Restart the instruction */ regs->ea -= 4; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) { if (user_mode(regs)) goto bad_area_nosemaphore; else goto vmalloc_fault; } if (unlikely(address >= TASK_SIZE)) goto bad_area_nosemaphore; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (faulthandler_disabled() || !mm) goto bad_area_nosemaphore; if (user_mode(regs)) flags |= FAULT_FLAG_USER; if (!down_read_trylock(&mm->mmap_sem)) { if (!user_mode(regs) && !search_exception_tables(regs->ea)) goto bad_area_nosemaphore; retry: down_read(&mm->mmap_sem); } vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: code = SEGV_ACCERR; switch (cause) { case EXC_SUPERV_INSN_ACCESS: goto bad_area; case EXC_SUPERV_DATA_ACCESS: goto bad_area; case EXC_X_PROTECTION_FAULT: if (!(vma->vm_flags & VM_EXEC)) goto bad_area; break; case EXC_R_PROTECTION_FAULT: if (!(vma->vm_flags & VM_READ)) goto bad_area; break; case EXC_W_PROTECTION_FAULT: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags = FAULT_FLAG_WRITE; break; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely * likely that the page will be found in page cache at that point. */ if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) current->maj_flt++; else current->min_flt++; if (fault & VM_FAULT_RETRY) { /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ goto retry; } } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { if (unhandled_signal(current, SIGSEGV) && printk_ratelimit()) { pr_info("%s: unhandled page fault (%d) at 0x%08lx, " "cause %ld\n", current->comm, SIGSEGV, address, cause); show_regs(regs); } _exception(SIGSEGV, regs, code, address); return; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ bust_spinlocks(1); pr_alert("Unable to handle kernel %s at virtual address %08lx", address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", address); pr_alert("ea = %08lx, ra = %08lx, cause = %ld\n", regs->ea, regs->ra, cause); panic("Oops"); return; /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; _exception(SIGBUS, regs, BUS_ADRERR, address); return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "tsk" here. We might be inside * an interrupt in the middle of a task switch.. */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; pgd = pgd_current + offset; pgd_k = init_mm.pgd + offset; if (!pgd_present(*pgd_k)) goto no_context; set_pgd(pgd, *pgd_k); pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; flush_tlb_one(address); return; } }
/* * Pin down all the iovec pages needed for len bytes. * Return a struct dma_pinned_list to keep track of pages pinned down. * * We are allocating a single chunk of memory, and then carving it up into * 3 sections, the latter 2 whose size depends on the number of iovecs and the * total number of pages, respectively. */ struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) { struct dma_pinned_list *local_list; struct page **pages; int i; int ret; int nr_iovecs = 0; int iovec_len_used = 0; int iovec_pages_used = 0; /* don't pin down non-user-based iovecs */ if (segment_eq(get_fs(), KERNEL_DS)) return NULL; /* determine how many iovecs/pages there are, up front */ do { iovec_len_used += iov[nr_iovecs].iov_len; iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); nr_iovecs++; } while (iovec_len_used < len); /* single kmalloc for pinned list, page_list[], and the page arrays */ local_list = kmalloc(sizeof(*local_list) + (nr_iovecs * sizeof (struct dma_page_list)) + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); if (!local_list) goto out; /* list of pages starts right after the page list array */ pages = (struct page **) &local_list->page_list[nr_iovecs]; local_list->nr_iovecs = 0; for (i = 0; i < nr_iovecs; i++) { struct dma_page_list *page_list = &local_list->page_list[i]; len -= iov[i].iov_len; if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) goto unpin; page_list->nr_pages = num_pages_spanned(&iov[i]); page_list->base_address = iov[i].iov_base; page_list->pages = pages; pages += page_list->nr_pages; /* pin pages down */ down_read(¤t->mm->mmap_sem); ret = get_user_pages( current, current->mm, (unsigned long) iov[i].iov_base, page_list->nr_pages, 1, /* write */ 0, /* force */ page_list->pages, NULL); up_read(¤t->mm->mmap_sem); if (ret != page_list->nr_pages) goto unpin; local_list->nr_iovecs = i + 1; } return local_list; unpin: dma_unpin_iovec_pages(local_list); out: return NULL; }
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 logical = 0, phys = 0, size = 0; __u32 flags = 0; loff_t isize; sector_t blkoff, end_blkoff; sector_t delalloc_blkoff; unsigned long delalloc_blklen; unsigned int blkbits = inode->i_blkbits; int ret, n; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) return ret; mutex_lock(&inode->i_mutex); isize = i_size_read(inode); blkoff = start >> blkbits; end_blkoff = (start + len - 1) >> blkbits; delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, &delalloc_blkoff); do { __u64 blkphy; unsigned int maxblocks; if (delalloc_blklen && blkoff == delalloc_blkoff) { if (size) { /* End of the current extent */ ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret) break; } if (blkoff > end_blkoff) break; flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; logical = blkoff << blkbits; phys = 0; size = delalloc_blklen << blkbits; blkoff = delalloc_blkoff + delalloc_blklen; delalloc_blklen = nilfs_find_uncommitted_extent( inode, blkoff, &delalloc_blkoff); continue; } /* * Limit the number of blocks that we look up so as * not to get into the next delayed allocation extent. */ maxblocks = INT_MAX; if (delalloc_blklen) maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, maxblocks); blkphy = 0; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); n = nilfs_bmap_lookup_contig( NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (n < 0) { int past_eof; if (unlikely(n != -ENOENT)) break; /* error */ /* HOLE */ blkoff++; past_eof = ((blkoff << blkbits) >= isize); if (size) { /* End of the current extent */ if (past_eof) flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret) break; size = 0; } if (blkoff > end_blkoff || past_eof) break; } else { if (size) { if (phys && blkphy << blkbits == phys + size) { /* The current extent goes on */ size += n << blkbits; } else { /* Terminate the current extent */ ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, flags); if (ret || blkoff > end_blkoff) break; /* Start another extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; size = n << blkbits; } } else { /* Start a new extent */ flags = FIEMAP_EXTENT_MERGED; logical = blkoff << blkbits; phys = blkphy << blkbits; size = n << blkbits; } blkoff += n; } cond_resched(); } while (true); /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; mutex_unlock(&inode->i_mutex); return ret; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * * error_code: * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ extern inline void do_exception(struct pt_regs *regs, unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; int user_address; unsigned long fixup; int si_code = SEGV_MAPERR; tsk = current; mm = tsk->mm; /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ if (error_code == 4 && !(S390_lowcore.trans_exc_code & 4)) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { address = 0; user_address = 0; goto no_context; } /* Low-address protection hit in user mode 'cannot happen'. */ die ("Low-address protection", regs, error_code); do_exit(SIGKILL); } /* * get the failing address * more specific the segment and page table portion of * the address */ address = S390_lowcore.trans_exc_code & -4096L; user_address = check_user_space(regs, error_code); /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ if (user_address == 0 || in_interrupt() || !mm) goto no_context; /* * When we get here, the fault happened in the current * task's user address space, so we can switch on the * interrupts again and then search the VMAs */ __sti(); down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: si_code = SEGV_ACCERR; if (error_code != 4) { /* page not present, check vm flags */ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, error_code == 4)) { case 1: tsk->min_flt++; break; case 2: tsk->maj_flt++; break; case 0: goto do_sigbus; default: goto out_of_memory; } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); /* User mode accesses just cause a SIGSEGV */ if (regs->psw.mask & PSW_PROBLEM_STATE) { tsk->thread.prot_addr = address; tsk->thread.trap_no = error_code; force_sigsegv(regs, error_code, si_code, address); return; } no_context: /* Are we prepared to handle this kernel fault? */ if ((fixup = search_exception_table(regs->psw.addr)) != 0) { regs->psw.addr = fixup; return; } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ if (user_address == 0) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %016lx\n", address); else printk(KERN_ALERT "Unable to handle kernel paging request" " at virtual user address %016lx\n", address); die("Oops", regs, error_code); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: if (tsk->pid == 1) { yield(); goto survive; } up_read(&mm->mmap_sem); printk("VM: killing process %s\n", tsk->comm); if (regs->psw.mask & PSW_PROBLEM_STATE) do_exit(SIGKILL); goto no_context; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.prot_addr = address; tsk->thread.trap_no = error_code; force_sig(SIGBUS, tsk); /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) goto no_context; }