/*
 * Prepare the write for the inline data.
 * If the the data can be written into the inode, we just read
 * the page and make it uptodate, and start the journal.
 * Otherwise read the page, makes it dirty so that it can be
 * handle in writepages(the i_disksize update is left to the
 * normal ext4_da_write_end).
 */
int ext4_da_write_inline_data_begin(struct address_space *mapping,
				    struct inode *inode,
				    loff_t pos, unsigned len,
				    unsigned flags,
				    struct page **pagep,
				    void **fsdata)
{
	int ret, inline_size;
	handle_t *handle;
	struct page *page;
	struct ext4_iloc iloc;
	int retries;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	inline_size = ext4_get_max_inline_size(inode);

	ret = -ENOSPC;
	if (inline_size >= pos + len) {
		ret = ext4_prepare_inline_data(handle, inode, pos + len);
		if (ret && ret != -ENOSPC)
			goto out_journal;
	}

	/*
	 * We cannot recurse into the filesystem as the transaction
	 * is already started.
	 */
	flags |= AOP_FLAG_NOFS;

	if (ret == -ENOSPC) {
		ret = ext4_da_convert_inline_data_to_extent(mapping,
							    inode,
							    flags,
							    fsdata);
		ext4_journal_stop(handle);
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;
		goto out;
	}


	page = grab_cache_page_write_begin(mapping, 0, flags);
	if (!page) {
		ret = -ENOMEM;
		goto out_journal;
	}

	down_read(&EXT4_I(inode)->xattr_sem);
	if (!ext4_has_inline_data(inode)) {
		ret = 0;
		goto out_release_page;
	}

	if (!PageUptodate(page)) {
		ret = ext4_read_inline_page(inode, page);
		if (ret < 0)
			goto out_release_page;
	}

	up_read(&EXT4_I(inode)->xattr_sem);
	*pagep = page;
	brelse(iloc.bh);
	return 1;
out_release_page:
	up_read(&EXT4_I(inode)->xattr_sem);
	unlock_page(page);
	page_cache_release(page);
out_journal:
	ext4_journal_stop(handle);
out:
	brelse(iloc.bh);
	return ret;
}
Example #2
0
/* Read the environment variable of current process specified by @key. */
int cfs_get_environ(const char *key, char *value, int *val_len)
{
	struct mm_struct *mm;
	char *buffer;
	int buf_len = PAGE_CACHE_SIZE;
	int key_len = strlen(key);
	unsigned long addr;
	int rc;
	ENTRY;

	buffer = kmalloc(buf_len, GFP_USER);
	if (!buffer)
		RETURN(-ENOMEM);

	mm = get_task_mm(current);
	if (!mm) {
		kfree(buffer);
		RETURN(-EINVAL);
	}

	/* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
	 * which is already holding mmap_sem for writes.  If some other
	 * thread gets the write lock in the meantime, this thread will
	 * block, but at least it won't deadlock on itself.  LU-1735 */
	if (down_read_trylock(&mm->mmap_sem) == 0) {
		kfree(buffer);
		return -EDEADLK;
	}
	up_read(&mm->mmap_sem);

	addr = mm->env_start;
	while (addr < mm->env_end) {
		int this_len, retval, scan_len;
		char *env_start, *env_end;

		memset(buffer, 0, buf_len);

		this_len = min_t(int, mm->env_end - addr, buf_len);
		retval = cfs_access_process_vm(current, addr, buffer,
					       this_len, 0);
		if (retval != this_len)
			break;

		addr += retval;

		/* Parse the buffer to find out the specified key/value pair.
		 * The "key=value" entries are separated by '\0'. */
		env_start = buffer;
		scan_len = this_len;
		while (scan_len) {
			char *entry;
			int entry_len;

			env_end = memscan(env_start, '\0', scan_len);
			LASSERT(env_end >= env_start &&
				env_end <= env_start + scan_len);

			/* The last entry of this buffer cross the buffer
			 * boundary, reread it in next cycle. */
			if (unlikely(env_end - env_start == scan_len)) {
				/* This entry is too large to fit in buffer */
				if (unlikely(scan_len == this_len)) {
					CERROR("Too long env variable.\n");
					GOTO(out, rc = -EINVAL);
				}
				addr -= scan_len;
				break;
			}

			entry = env_start;
			entry_len = env_end - env_start;

			/* Key length + length of '=' */
			if (entry_len > key_len + 1 &&
			    !memcmp(entry, key, key_len)) {
				entry += key_len + 1;
				entry_len -= key_len + 1;
				/* The 'value' buffer passed in is too small.*/
				if (entry_len >= *val_len)
					GOTO(out, rc = -EOVERFLOW);

				memcpy(value, entry, entry_len);
				*val_len = entry_len;
				GOTO(out, rc = 0);
			}

			scan_len -= (env_end - env_start + 1);
			env_start = env_end + 1;
		}
	}
	GOTO(out, rc = -ENOENT);

out:
	mmput(mm);
	kfree((void *)buffer);
	return rc;
}
Example #3
0
/*
 * Select an allocation group to look for a free inode in, based on the parent
 * inode and then mode.  Return the allocation group buffer.
 */
STATIC xfs_buf_t *			/* allocation group buffer */
xfs_ialloc_ag_select(
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_ino_t	parent,		/* parent directory inode number */
	mode_t		mode,		/* bits set to indicate file type */
	int		okalloc)	/* ok to allocate more space */
{
	xfs_buf_t	*agbp;		/* allocation group header buffer */
	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
	xfs_agnumber_t	agno;		/* current ag number */
	int		flags;		/* alloc buffer locking flags */
	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
	xfs_extlen_t	longest = 0;	/* longest extent available */
	xfs_mount_t	*mp;		/* mount point structure */
	int		needspace;	/* file mode implies space allocated */
	xfs_perag_t	*pag;		/* per allocation group data */
	xfs_agnumber_t	pagno;		/* parent (starting) ag number */

	/*
	 * Files of these types need at least one block if length > 0
	 * (and they won't fit in the inode, but that's hard to figure out).
	 */
	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
	mp = tp->t_mountp;
	agcount = mp->m_maxagi;
	if (S_ISDIR(mode))
		pagno = xfs_ialloc_next_ag(mp);
	else {
		pagno = XFS_INO_TO_AGNO(mp, parent);
		if (pagno >= agcount)
			pagno = 0;
	}
	ASSERT(pagno < agcount);
	/*
	 * Loop through allocation groups, looking for one with a little
	 * free space in it.  Note we don't look for free inodes, exactly.
	 * Instead, we include whether there is a need to allocate inodes
	 * to mean that blocks must be allocated for them,
	 * if none are currently free.
	 */
	agno = pagno;
	flags = XFS_ALLOC_FLAG_TRYLOCK;
	down_read(&mp->m_peraglock);
	for (;;) {
		pag = &mp->m_perag[agno];
		if (!pag->pagi_init) {
			if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
				agbp = NULL;
				goto nextag;
			}
		} else
			agbp = NULL;

		if (!pag->pagi_inodeok) {
			xfs_ialloc_next_ag(mp);
			goto unlock_nextag;
		}

		/*
		 * Is there enough free space for the file plus a block
		 * of inodes (if we need to allocate some)?
		 */
		ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
		if (ineed && !pag->pagf_init) {
			if (agbp == NULL &&
			    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
				agbp = NULL;
				goto nextag;
			}
			(void)xfs_alloc_pagf_init(mp, tp, agno, flags);
		}
		if (!ineed || pag->pagf_init) {
			if (ineed && !(longest = pag->pagf_longest))
				longest = pag->pagf_flcount > 0;
			if (!ineed ||
			    (pag->pagf_freeblks >= needspace + ineed &&
			     longest >= ineed &&
			     okalloc)) {
				if (agbp == NULL &&
				    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
					agbp = NULL;
					goto nextag;
				}
				up_read(&mp->m_peraglock);
				return agbp;
			}
		}
unlock_nextag:
		if (agbp)
			xfs_trans_brelse(tp, agbp);
nextag:
		/*
		 * No point in iterating over the rest, if we're shutting
		 * down.
		 */
		if (XFS_FORCED_SHUTDOWN(mp)) {
			up_read(&mp->m_peraglock);
			return NULL;
		}
		agno++;
		if (agno >= agcount)
			agno = 0;
		if (agno == pagno) {
			if (flags == 0) {
				up_read(&mp->m_peraglock);
				return NULL;
			}
			flags = 0;
		}
	}
}
Example #4
0
RMstatus xwcMemTranslateUserVirtual(uint32_t    entity,
                                    void       *desc,
                                    void       *auxmap)
{
    unsigned       i;
    linkEntry     *hwsgl;
    size_t         count;
    int            pageEst;
    T2DPD         *pd = desc;
    T2DESC_AUXMAP *pdmap = auxmap;

    /* Estimate the number of needed page pointers */
    pageEst = (((unsigned long)pd->pair[entity].ptr & ~PAGE_MASK) +
               pd->pair[entity].size + ~PAGE_MASK) >> PAGE_SHIFT;

    /* Allocate list of pointers to pages for this user buffer reference */
    pdmap->pair[entity].pages = vmalloc(pageEst * sizeof(pdmap->pair[entity].pages));
    if (pdmap->pair[entity].pages == NULL)
        return RM_NO_MEMORY;

    /* Lock this process' pages and map them. The descriptor pair pointer */
    /* still references the user's buffer at this point                   */
    down_read(&current->mm->mmap_sem);
    pdmap->pair[entity].pageCt =
        get_user_pages(current,
                       current->mm,
                       (unsigned long)pd->pair[entity].ptr,
                       pageEst,
                       WRITE, 1,
                       pdmap->pair[entity].pages,
                       NULL);
    up_read(&current->mm->mmap_sem);

    /* here for development, remove once stabilized */
    if (pageEst != pdmap->pair[entity].pageCt)
        printk("t23xwc: user page estimate = %d, actual = %d\n", pageEst, pdmap->pair[entity].pageCt);

    if (pdmap->pair[entity].pageCt > pageEst)
        panic("t23xwc - user pages mapped exceeds estimate\n");

    /* Needed user pages are now mapped. If data element fits in 1 page, then */
    /* we can just do a physical pointer, no scatterlist is needed. If it     */
    /* exceeds one page, we must have a scatterlist                           */

    if (pdmap->pair[entity].pageCt > 1) /* Does entry span pages? */
    {
        /* Allocate "hardware" scatterlist */
        hwsgl = kmalloc(pageEst * sizeof(linkEntry), GFP_KERNEL | GFP_DMA);
        if (hwsgl == NULL)
        {
            /* Out of kmalloc() space, gotta bail. Release mapped pages */
            for (i = 0; i < pdmap->pair[entity].pageCt; i++)
                page_cache_release(pdmap->pair[entity].pages[i]);

            /* Free allocated page list */
            vfree(pdmap->pair[entity].pages);

            return RM_NO_MEMORY;
        }

        count = pd->pair[entity].size;

        hwsgl[0].segAddr =
            (unsigned char *)virt_to_phys(lowmem_page_address(pdmap->pair[entity].pages[0]) +
                                          ((unsigned long)pd->pair[entity].ptr & ~PAGE_MASK));

        hwsgl[0].chainCtrl = 0;
        hwsgl[0].extAddr   = 0;

        if (pdmap->pair[entity].pageCt > 1)
        {
            hwsgl[0].segLen = PAGE_SIZE - ((unsigned long)pd->pair[entity].ptr & ~PAGE_MASK);
            count -= hwsgl[0].segLen;
            for (i = 1; i < pdmap->pair[entity].pageCt; i++)
            {
                hwsgl[i].segLen    = count < PAGE_SIZE ? count : PAGE_SIZE;
                hwsgl[i].segAddr   = (unsigned char *)
                                     virt_to_phys(lowmem_page_address(pdmap->pair[entity].pages[i]));
                hwsgl[i].extAddr   = 0;
                hwsgl[i].chainCtrl = 0;
                count -= PAGE_SIZE;
            }
        }
        else
            hwsgl[0].segLen = pd->pair[entity].size;

        /* mark the last entry in the Talitos scatterlist */
        hwsgl[pdmap->pair[entity].pageCt - 1].chainCtrl = LAST_ENTRY;

        /* Point to descriptor pair to the Talitos scatterlist */
        pd->pair[entity].ptr     = (unsigned char *)virt_to_phys(hwsgl);
        pd->pair[entity].extent |= JUMPTABLE;
    }
    else
        pd->pair[entity].ptr =
            (unsigned char *)virt_to_phys(lowmem_page_address(pdmap->pair[entity].pages[0]) +
                                          ((unsigned long)pd->pair[entity].ptr & ~PAGE_MASK));

    return RM_OK;
}
Example #5
0
static void c_stop(struct seq_file *m, void *p)
{
	up_read(&crypto_alg_sem);
}
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 __u64 start, __u64 len)
{
	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
	__u64 logical = 0, phys = 0, size = 0;
	__u32 flags = 0;
	loff_t isize;
	sector_t blkoff, end_blkoff;
	sector_t delalloc_blkoff;
	unsigned long delalloc_blklen;
	unsigned int blkbits = inode->i_blkbits;
	int ret, n;

	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	isize = i_size_read(inode);

	blkoff = start >> blkbits;
	end_blkoff = (start + len - 1) >> blkbits;

	delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
							&delalloc_blkoff);

	do {
		__u64 blkphy;
		unsigned int maxblocks;

		if (delalloc_blklen && blkoff == delalloc_blkoff) {
			if (size) {
				
				ret = fiemap_fill_next_extent(
					fieinfo, logical, phys, size, flags);
				if (ret)
					break;
			}
			if (blkoff > end_blkoff)
				break;

			flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
			logical = blkoff << blkbits;
			phys = 0;
			size = delalloc_blklen << blkbits;

			blkoff = delalloc_blkoff + delalloc_blklen;
			delalloc_blklen = nilfs_find_uncommitted_extent(
				inode, blkoff, &delalloc_blkoff);
			continue;
		}

		maxblocks = INT_MAX;
		if (delalloc_blklen)
			maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
					  maxblocks);
		blkphy = 0;

		down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
		n = nilfs_bmap_lookup_contig(
			NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
		up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);

		if (n < 0) {
			int past_eof;

			if (unlikely(n != -ENOENT))
				break; 

			
			blkoff++;
			past_eof = ((blkoff << blkbits) >= isize);

			if (size) {
				

				if (past_eof)
					flags |= FIEMAP_EXTENT_LAST;

				ret = fiemap_fill_next_extent(
					fieinfo, logical, phys, size, flags);
				if (ret)
					break;
				size = 0;
			}
			if (blkoff > end_blkoff || past_eof)
				break;
		} else {
			if (size) {
				if (phys && blkphy << blkbits == phys + size) {
					
					size += n << blkbits;
				} else {
					
					ret = fiemap_fill_next_extent(
						fieinfo, logical, phys, size,
						flags);
					if (ret || blkoff > end_blkoff)
						break;

					
					flags = FIEMAP_EXTENT_MERGED;
					logical = blkoff << blkbits;
					phys = blkphy << blkbits;
					size = n << blkbits;
				}
			} else {
				
				flags = FIEMAP_EXTENT_MERGED;
				logical = blkoff << blkbits;
				phys = blkphy << blkbits;
				size = n << blkbits;
			}
			blkoff += n;
		}
		cond_resched();
	} while (true);

	
	if (ret == 1)
		ret = 0;

	mutex_unlock(&inode->i_mutex);
	return ret;
}
Example #7
0
File: fault.c Project: 08opt/linux
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suprression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline int do_exception(struct pt_regs *regs, int access,
			       unsigned long trans_exc_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	unsigned int flags;
	int fault;

	if (notify_page_fault(regs))
		return 0;

	tsk = current;
	mm = tsk->mm;

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a 
	 * user context.
	 */
	fault = VM_FAULT_BADCONTEXT;
	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
		goto out;

	address = trans_exc_code & __FAIL_ADDR_MASK;
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
	flags = FAULT_FLAG_ALLOW_RETRY;
	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
		flags |= FAULT_FLAG_WRITE;
	down_read(&mm->mmap_sem);

#ifdef CONFIG_PGSTE
	if (test_tsk_thread_flag(current, TIF_SIE) && S390_lowcore.gmap) {
		address = __gmap_fault(address,
				     (struct gmap *) S390_lowcore.gmap);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (address == -ENOMEM) {
			fault = VM_FAULT_OOM;
			goto out_up;
		}
	}
#endif

retry:
	fault = VM_FAULT_BADMAP;
	vma = find_vma(mm, address);
	if (!vma)
		goto out_up;

	if (unlikely(vma->vm_start > address)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out_up;
		if (expand_stack(vma, address))
			goto out_up;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
	fault = VM_FAULT_BADACCESS;
	if (unlikely(!(vma->vm_flags & access)))
		goto out_up;

	if (is_vm_hugetlb_page(vma))
		address &= HPAGE_MASK;
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, flags);
	if (unlikely(fault & VM_FAULT_ERROR))
		goto out_up;

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
				      regs, address);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
				      regs, address);
		}
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			down_read(&mm->mmap_sem);
			goto retry;
		}
	}
	/*
	 * The instruction that caused the program check will
	 * be repeated. Don't signal single step via SIGTRAP.
	 */
	clear_tsk_thread_flag(tsk, TIF_PER_TRAP);
	fault = 0;
out_up:
	up_read(&mm->mmap_sem);
out:
	return fault;
}
Example #8
0
static inline void lov_conf_thaw(struct lov_object *lov)
{
	if (lov->lo_owner != current)
		up_read(&lov->lo_type_guard);
}
int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	int fault;

	tsk = current;
	mm  = tsk->mm;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down_read(&mm->mmap_sem);
	fault = __do_page_fault(mm, addr, fsr, tsk);
	up_read(&mm->mmap_sem);

	/*
	 * Handle the "normal" case first
	 */
	if (fault > 0)
		return 0;

	/*
	 * We had some memory, but were unable to
	 * successfully fix up this page fault.
	 */
	if (fault == 0)
		goto do_sigbus;

	/*
	 * If we are in kernel mode at this point, we
	 * have no context to handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault == -3) {
		/*
		 * We ran out of memory, or some other thing happened to
		 * us that made us unable to handle the page fault gracefully.
		 */
		printk("VM: killing process %s\n", tsk->comm);
		do_exit(SIGKILL);
	} else
		__do_user_fault(tsk, addr, fsr, fault == -1 ?
				SEGV_ACCERR : SEGV_MAPERR, regs);
	return 0;


/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
do_sigbus:
	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.address = addr;
	tsk->thread.error_code = fsr;
	tsk->thread.trap_no = 14;
	force_sig(SIGBUS, tsk);
#ifdef CONFIG_DEBUG_USER
	printk("%s: sigbus at 0x%08lx, pc=0x%08lx\n",
		current->comm, addr, instruction_pointer(regs));
#endif

	/* Kernel mode? Handle exceptions or die */
	if (user_mode(regs))
		return 0;

no_context:
	__do_kernel_fault(mm, addr, fsr, regs);
	return 0;
}
Example #10
0
File: fault.c Project: E-LLP/n900
void do_page_fault(struct pt_regs *regs)
{
	struct vm_area_struct * vma;
	struct mm_struct *mm = current->mm;
	unsigned int exccause = regs->exccause;
	unsigned int address = regs->excvaddr;
	siginfo_t info;

	int is_write, is_exec;
	int fault;

	info.si_code = SEGV_MAPERR;

	/* We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 */
	if (address >= TASK_SIZE && !user_mode(regs))
		goto vmalloc_fault;

	/* If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_atomic() || !mm) {
		bad_page_fault(regs, address, SIGSEGV);
		return;
	}

	is_write = (exccause == EXCCAUSE_STORE_CACHE_ATTRIBUTE) ? 1 : 0;
	is_exec =  (exccause == EXCCAUSE_ITLB_PRIVILEGE ||
		    exccause == EXCCAUSE_ITLB_MISS ||
		    exccause == EXCCAUSE_FETCH_CACHE_ATTRIBUTE) ? 1 : 0;

#ifdef DEBUG_PAGE_FAULT
	printk("[%s:%d:%08x:%d:%08x:%s%s]\n", current->comm, current->pid,
	       address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
#endif

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);

	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;

	/* Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */

good_area:
	info.si_code = SEGV_ACCERR;

	if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	} else if (is_exec) {
		if (!(vma->vm_flags & VM_EXEC))
			goto bad_area;
	} else	/* Allow read even from write-only pages. */
		if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
			goto bad_area;

	/* If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
survive:
	fault = handle_mm_fault(mm, vma, address, is_write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		current->maj_flt++;
	else
		current->min_flt++;

	up_read(&mm->mmap_sem);
	return;

	/* Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);
	if (user_mode(regs)) {
		current->thread.bad_vaddr = address;
		current->thread.error_code = is_write;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void *) address;
		force_sig_info(SIGSEGV, &info, current);
		return;
	}
	bad_page_fault(regs, address, SIGSEGV);
	return;


	/* We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		down_read(&mm->mmap_sem);
		goto survive;
	}
	printk("VM: killing process %s\n", current->comm);
	if (user_mode(regs))
		do_group_exit(SIGKILL);
	bad_page_fault(regs, address, SIGKILL);
	return;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	current->thread.bad_vaddr = address;
	info.si_code = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void *) address;
	force_sig_info(SIGBUS, &info, current);

	/* Kernel mode? Handle exceptions or die */
	if (!user_mode(regs))
		bad_page_fault(regs, address, SIGBUS);

vmalloc_fault:
	{
		/* Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 */
		struct mm_struct *act_mm = current->active_mm;
		int index = pgd_index(address);
		pgd_t *pgd, *pgd_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		if (act_mm == NULL)
			goto bad_page_fault;

		pgd = act_mm->pgd + index;
		pgd_k = init_mm.pgd + index;

		if (!pgd_present(*pgd_k))
			goto bad_page_fault;

		pgd_val(*pgd) = pgd_val(*pgd_k);

		pmd = pmd_offset(pgd, address);
		pmd_k = pmd_offset(pgd_k, address);
		if (!pmd_present(*pmd) || !pmd_present(*pmd_k))
			goto bad_page_fault;

		pmd_val(*pmd) = pmd_val(*pmd_k);
		pte_k = pte_offset_kernel(pmd_k, address);

		if (!pte_present(*pte_k))
			goto bad_page_fault;
		return;
	}
bad_page_fault:
	bad_page_fault(regs, address, SIGKILL);
	return;
}
Example #11
0
/*
 * This routine handles page faults.  It determines the problem, and
 * then passes it off to one of the appropriate routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *
 * If this routine detects a bad access, it returns 1, otherwise it
 * returns 0.
 */
int do_page_fault(struct pt_regs *regs, unsigned long address,
			      unsigned long error_code)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct * vma;
	int write, fault;

#ifdef DEBUG
	printk ("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n",
		regs->sr, regs->pc, address, error_code,
		current->mm->pgd);
#endif

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (!mm || pagefault_disabled())
		goto no_context;

	down_read(&mm->mmap_sem);

	vma = find_vma(mm, address);
	if (!vma)
		goto map_err;
	if (vma->vm_flags & VM_IO)
		goto acc_err;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto map_err;
	if (user_mode(regs)) {
		/* Accessing the stack below usp is always a bug.  The
		   "+ 256" is there due to some instructions doing
		   pre-decrement on the stack and that doesn't show up
		   until later.  */
		if (address + 256 < rdusp())
			goto map_err;
	}
	if (expand_stack(vma, address))
		goto map_err;

/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
#ifdef DEBUG
	printk("do_page_fault: good_area\n");
#endif
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto acc_err;
			write++;
			break;
		case 1:		/* read, present */
			goto acc_err;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
				goto acc_err;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */

	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
#ifdef DEBUG
	printk("handle_mm_fault returns %d\n",fault);
#endif
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto bus_err;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		current->maj_flt++;
	else
		current->min_flt++;

	up_read(&mm->mmap_sem);
	return 0;

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (!user_mode(regs))
		goto no_context;
	pagefault_out_of_memory();
	return 0;

no_context:
	current->thread.signo = SIGBUS;
	current->thread.faddr = address;
	return send_fault_sig(regs);

bus_err:
	current->thread.signo = SIGBUS;
	current->thread.code = BUS_ADRERR;
	current->thread.faddr = address;
	goto send_sig;

map_err:
	current->thread.signo = SIGSEGV;
	current->thread.code = SEGV_MAPERR;
	current->thread.faddr = address;
	goto send_sig;

acc_err:
	current->thread.signo = SIGSEGV;
	current->thread.code = SEGV_ACCERR;
	current->thread.faddr = address;

send_sig:
	up_read(&mm->mmap_sem);
	return send_fault_sig(regs);
}
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
				   struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	int fault, sig, code;
	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	tsk = current;
	mm  = tsk->mm;

	
	if (interrupts_enabled(regs))
		local_irq_enable();

	if (in_atomic() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	if (esr & ESR_LNX_EXEC) {
		vm_flags = VM_EXEC;
	} else if (esr & ESR_EL1_WRITE) {
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	}

	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
retry:
		down_read(&mm->mmap_sem);
	} else {
		might_sleep();
#ifdef CONFIG_DEBUG_VM
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
#endif
	}

	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);

	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return 0;


	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
	if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
				      addr);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
				      addr);
		}
		if (fault & VM_FAULT_RETRY) {
			mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			mm_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	up_read(&mm->mmap_sem);

	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
			      VM_FAULT_BADACCESS))))
		return 0;

	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		pagefault_out_of_memory();
		return 0;
	}

	if (fault & VM_FAULT_SIGBUS) {
		sig = SIGBUS;
		code = BUS_ADRERR;
	} else {
		sig = SIGSEGV;
		code = fault == VM_FAULT_BADACCESS ?
			SEGV_ACCERR : SEGV_MAPERR;
	}

	__do_user_fault(tsk, addr, esr, sig, code, regs);
	return 0;

no_context:
	__do_kernel_fault(mm, addr, esr, regs);
	return 0;
}
Example #13
0
/**
 * nilfs_get_block() - get a file block on the filesystem (callback function)
 * @inode - inode struct of the target file
 * @blkoff - file block number
 * @bh_result - buffer head to be mapped on
 * @create - indicate whether allocating the block or not when it has not
 *      been allocated yet.
 *
 * This function does not issue actual read request of the specified data
 * block. It is done by VFS.
 */
int nilfs_get_block(struct inode *inode, sector_t blkoff,
		    struct buffer_head *bh_result, int create)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
	__u64 blknum = 0;
	int err = 0, ret;
	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;

	down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
	if (ret >= 0) {	/* found */
		map_bh(bh_result, inode->i_sb, blknum);
		if (ret > 0)
			bh_result->b_size = (ret << inode->i_blkbits);
		goto out;
	}
	/* data block was not found */
	if (ret == -ENOENT && create) {
		struct nilfs_transaction_info ti;

		bh_result->b_blocknr = 0;
		err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
		if (unlikely(err))
			goto out;
		err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
					(unsigned long)bh_result);
		if (unlikely(err != 0)) {
			if (err == -EEXIST) {
				/*
				 * The get_block() function could be called
				 * from multiple callers for an inode.
				 * However, the page having this block must
				 * be locked in this case.
				 */
				printk(KERN_WARNING
				       "nilfs_get_block: a race condition "
				       "while inserting a data block. "
				       "(inode number=%lu, file block "
				       "offset=%llu)\n",
				       inode->i_ino,
				       (unsigned long long)blkoff);
				err = 0;
			}
			nilfs_transaction_abort(inode->i_sb);
			goto out;
		}
		nilfs_mark_inode_dirty(inode);
		nilfs_transaction_commit(inode->i_sb); /* never fails */
		/* Error handling should be detailed */
		set_buffer_new(bh_result);
		set_buffer_delay(bh_result);
		map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
						      to proper value */
	} else if (ret == -ENOENT) {
		/* not found is not error (e.g. hole); must return without
		   the mapped state flag. */
		;
	} else {
		err = ret;
	}

 out:
	return err;
}
Example #14
0
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
				unsigned long address)
{
	struct vm_area_struct *vma = NULL;
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = SEGV_MAPERR;
	vm_fault_t fault;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	cause >>= 2;

	/* Restart the instruction */
	regs->ea -= 4;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) {
		if (user_mode(regs))
			goto bad_area_nosemaphore;
		else
			goto vmalloc_fault;
	}

	if (unlikely(address >= TASK_SIZE))
		goto bad_area_nosemaphore;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (faulthandler_disabled() || !mm)
		goto bad_area_nosemaphore;

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->ea))
			goto bad_area_nosemaphore;
retry:
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	code = SEGV_ACCERR;

	switch (cause) {
	case EXC_SUPERV_INSN_ACCESS:
		goto bad_area;
	case EXC_SUPERV_DATA_ACCESS:
		goto bad_area;
	case EXC_X_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_EXEC))
			goto bad_area;
		break;
	case EXC_R_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_READ))
			goto bad_area;
		break;
	case EXC_W_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags = FAULT_FLAG_WRITE;
		break;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags);

	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return;

	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGSEGV)
			goto bad_area;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR)
			current->maj_flt++;
		else
			current->min_flt++;
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;

			/*
			 * No need to up_read(&mm->mmap_sem) as we would
			 * have already released it in __lock_page_or_retry
			 * in mm/filemap.c.
			 */

			goto retry;
		}
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs)) {
		if (unhandled_signal(current, SIGSEGV) && printk_ratelimit()) {
			pr_info("%s: unhandled page fault (%d) at 0x%08lx, "
				"cause %ld\n", current->comm, SIGSEGV, address, cause);
			show_regs(regs);
		}
		_exception(SIGSEGV, regs, code, address);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %08lx",
		address < PAGE_SIZE ? "NULL pointer dereference" :
		"paging request", address);
	pr_alert("ea = %08lx, ra = %08lx, cause = %ld\n", regs->ea, regs->ra,
		cause);
	panic("Oops");
	return;

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (!user_mode(regs))
		goto no_context;
	pagefault_out_of_memory();
	return;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!user_mode(regs))
		goto no_context;

	_exception(SIGBUS, regs, BUS_ADRERR, address);
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Do _not_ use "tsk" here. We might be inside
		 * an interrupt in the middle of a task switch..
		 */
		int offset = pgd_index(address);
		pgd_t *pgd, *pgd_k;
		pud_t *pud, *pud_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		pgd = pgd_current + offset;
		pgd_k = init_mm.pgd + offset;

		if (!pgd_present(*pgd_k))
			goto no_context;
		set_pgd(pgd, *pgd_k);

		pud = pud_offset(pgd, address);
		pud_k = pud_offset(pgd_k, address);
		if (!pud_present(*pud_k))
			goto no_context;
		pmd = pmd_offset(pud, address);
		pmd_k = pmd_offset(pud_k, address);
		if (!pmd_present(*pmd_k))
			goto no_context;
		set_pmd(pmd, *pmd_k);

		pte_k = pte_offset_kernel(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;

		flush_tlb_one(address);
		return;
	}
}
Example #15
0
static void
__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
{
	struct get_pages_work *work = container_of(_work, typeof(*work), work);
	struct drm_i915_gem_object *obj = work->obj;
	const int npages = obj->base.size >> PAGE_SHIFT;
	struct vm_page **pvec;
	int pinned, ret;

	ret = -ENOMEM;
	pinned = 0;

	pvec = kvmalloc_array(npages, sizeof(struct vm_page *), GFP_KERNEL);
	if (pvec != NULL) {
		struct mm_struct *mm = obj->userptr.mm->mm;
		unsigned int flags = 0;

		if (!i915_gem_object_is_readonly(obj))
			flags |= FOLL_WRITE;

		ret = -EFAULT;
		if (mmget_not_zero(mm)) {
			down_read(&mm->mmap_sem);
			while (pinned < npages) {
				ret = get_user_pages_remote
					(work->task, mm,
					 obj->userptr.ptr + pinned * PAGE_SIZE,
					 npages - pinned,
					 flags,
					 pvec + pinned, NULL, NULL);
				if (ret < 0)
					break;

				pinned += ret;
			}
			up_read(&mm->mmap_sem);
			mmput(mm);
		}
	}

	mutex_lock(&obj->mm.lock);
	if (obj->userptr.work == &work->work) {
		struct sg_table *pages = ERR_PTR(ret);

		if (pinned == npages) {
			pages = __i915_gem_userptr_alloc_pages(obj, pvec,
							       npages);
			if (!IS_ERR(pages)) {
				pinned = 0;
				pages = NULL;
			}
		}

		obj->userptr.work = ERR_CAST(pages);
		if (IS_ERR(pages))
			__i915_gem_userptr_set_active(obj, false);
	}
	mutex_unlock(&obj->mm.lock);

	release_pages(pvec, pinned);
	kvfree(pvec);

	i915_gem_object_put(obj);
	put_task_struct(work->task);
	kfree(work);
}
Example #16
0
/*
 * This routine is responsible for faulting in user pages.
 * It passes the work off to one of the appropriate routines.
 * It returns true if the fault was successfully handled.
 */
static int handle_page_fault(struct pt_regs *regs,
			     int fault_num,
			     int is_page_fault,
			     unsigned long address,
			     int write)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long stack_offset;
	int fault;
	int si_code;
	int is_kernel_mode;
	pgd_t *pgd;
	unsigned int flags;

	/* on TILE, protection faults are always writes */
	if (!is_page_fault)
		write = 1;

	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	is_kernel_mode = !user_mode(regs);

	tsk = validate_current();

	/*
	 * Check to see if we might be overwriting the stack, and bail
	 * out if so.  The page fault code is a relatively likely
	 * place to get trapped in an infinite regress, and once we
	 * overwrite the whole stack, it becomes very hard to recover.
	 */
	stack_offset = stack_pointer & (THREAD_SIZE-1);
	if (stack_offset < THREAD_SIZE / 8) {
		pr_alert("Potential stack overrun: sp %#lx\n", stack_pointer);
		show_regs(regs);
		pr_alert("Killing current process %d/%s\n",
			 tsk->pid, tsk->comm);
		do_group_exit(SIGKILL);
	}

	/*
	 * Early on, we need to check for migrating PTE entries;
	 * see homecache.c.  If we find a migrating PTE, we wait until
	 * the backing page claims to be done migrating, then we proceed.
	 * For kernel PTEs, we rewrite the PTE and return and retry.
	 * Otherwise, we treat the fault like a normal "no PTE" fault,
	 * rather than trying to patch up the existing PTE.
	 */
	pgd = get_current_pgd();
	if (handle_migrating_pte(pgd, fault_num, address, regs->pc,
				 is_kernel_mode, write))
		return 1;

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * and that the fault was not a protection fault.
	 */
	if (unlikely(address >= TASK_SIZE &&
		     !is_arch_mappable_range(address, 0))) {
		if (is_kernel_mode && is_page_fault &&
		    vmalloc_fault(pgd, address) >= 0)
			return 1;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		mm = NULL;  /* happy compiler */
		vma = NULL;
		goto bad_area_nosemaphore;
	}

	/*
	 * If we're trying to touch user-space addresses, we must
	 * be either at PL0, or else with interrupts enabled in the
	 * kernel, so either way we can re-enable interrupts here
	 * unless we are doing atomic access to user space with
	 * interrupts disabled.
	 */
	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
		local_irq_enable();

	mm = tsk->mm;

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * region with pagefaults disabled then we must not take the fault.
	 */
	if (pagefault_disabled() || !mm) {
		vma = NULL;  /* happy compiler */
		goto bad_area_nosemaphore;
	}

	if (!is_kernel_mode)
		flags |= FAULT_FLAG_USER;

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (is_kernel_mode &&
		    !search_exception_tables(regs->pc)) {
			vma = NULL;  /* happy compiler */
			goto bad_area_nosemaphore;
		}

retry:
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (regs->sp < PAGE_OFFSET) {
		/*
		 * accessing the stack below sp is always a bug.
		 */
		if (address < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;

/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	if (fault_num == INT_ITLB_MISS) {
		if (!(vma->vm_flags & VM_EXEC))
			goto bad_area;
	} else if (write) {
#ifdef TEST_VERIFY_AREA
		if (!is_page_fault && regs->cs == KERNEL_CS)
			pr_err("WP fault at " REGFMT "\n", regs->eip);
#endif
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags |= FAULT_FLAG_WRITE;
	} else {
		if (!is_page_fault || !(vma->vm_flags & VM_READ))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, flags);

	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return 0;

	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGSEGV)
			goto bad_area;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
		if (fault & VM_FAULT_RETRY) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;

			 /*
			  * No need to up_read(&mm->mmap_sem) as we would
			  * have already released it in __lock_page_or_retry
			  * in mm/filemap.c.
			  */
			goto retry;
		}
	}

#if CHIP_HAS_TILE_DMA()
	/* If this was a DMA TLB fault, restart the DMA engine. */
	switch (fault_num) {
	case INT_DMATLB_MISS:
	case INT_DMATLB_MISS_DWNCL:
	case INT_DMATLB_ACCESS:
	case INT_DMATLB_ACCESS_DWNCL:
		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
		break;
	}
#endif

	up_read(&mm->mmap_sem);
	return 1;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (!is_kernel_mode) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		force_sig_info_fault("segfault", SIGSEGV, si_code, address,
				     fault_num, tsk, regs);
		return 0;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return 0;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

	/* FIXME: no lookup_address() yet */
#ifdef SUPPORT_LOOKUP_ADDRESS
	if (fault_num == INT_ITLB_MISS) {
		pte_t *pte = lookup_address(address);

		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
			pr_crit("kernel tried to execute non-executable page - exploit attempt? (uid: %d)\n",
				current->uid);
	}
#endif
	if (address < PAGE_SIZE)
		pr_alert("Unable to handle kernel NULL pointer dereference\n");
	else
		pr_alert("Unable to handle kernel paging request\n");
	pr_alert(" at virtual address " REGFMT ", pc " REGFMT "\n",
		 address, regs->pc);

	show_regs(regs);

	if (unlikely(tsk->pid < 2)) {
		panic("Kernel page fault running %s!",
		      is_idle_task(tsk) ? "the idle task" : "init");
	}

	/*
	 * More FIXME: we should probably copy the i386 here and
	 * implement a generic die() routine.  Not today.
	 */
#ifdef SUPPORT_DIE
	die("Oops", regs);
#endif
	bust_spinlocks(1);

	do_group_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_kernel_mode)
		goto no_context;
	pagefault_out_of_memory();
	return 0;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (is_kernel_mode)
		goto no_context;

	force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address,
			     fault_num, tsk, regs);
	return 0;
}
Example #17
0
void do_page_fault(struct pt_regs *regs, int write, unsigned long address,
		   unsigned long cause_code)
{
	struct vm_area_struct *vma = NULL;
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	siginfo_t info;
	int fault, ret;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= VMALLOC_START && address <= VMALLOC_END) {
		ret = handle_vmalloc_fault(address);
		if (unlikely(ret))
			goto bad_area_nosemaphore;
		else
			return;
	}

	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_atomic() || !mm)
		goto no_context;

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;
retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	info.si_code = SEGV_ACCERR;

	/* Handle protection violation, execute on heap or stack */

	if (cause_code == ((ECR_V_PROTV << 16) | ECR_C_PROTV_INST_FETCH))
		goto bad_area;

	if (write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags |= FAULT_FLAG_WRITE;
	} else {
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}

survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, flags);

	/* If Pagefault was interrupted by SIGKILL, exit page fault "early" */
	if (unlikely(fatal_signal_pending(current))) {
		if ((fault & VM_FAULT_ERROR) && !(fault & VM_FAULT_RETRY))
			up_read(&mm->mmap_sem);
		if (user_mode(regs))
			return;
	}

	if (likely(!(fault & VM_FAULT_ERROR))) {
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			/* To avoid updating stats twice for retry case */
			if (fault & VM_FAULT_MAJOR)
				tsk->maj_flt++;
			else
				tsk->min_flt++;

			if (fault & VM_FAULT_RETRY) {
				flags &= ~FAULT_FLAG_ALLOW_RETRY;
				flags |= FAULT_FLAG_TRIED;
				goto retry;
			}
		}

		/* Fault Handled Gracefully */
		up_read(&mm->mmap_sem);
		return;
	}

	/* TBD: switch to pagefault_out_of_memory() */
	if (fault & VM_FAULT_OOM)
		goto out_of_memory;
	else if (fault & VM_FAULT_SIGSEGV)
		goto bad_area;
	else if (fault & VM_FAULT_SIGBUS)
		goto do_sigbus;

	/* no man's land */
	BUG();

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs)) {
		tsk->thread.fault_address = address;
		tsk->thread.cause_code = cause_code;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?
	 *
	 * (The kernel has valid exception-points in the source
	 *  when it acesses user-memory. When it fails in one
	 *  of those points, we find it in a table and do a jump
	 *  to some fixup code that loads an appropriate error
	 *  code)
	 */
	if (fixup_exception(regs))
		return;

	die("Oops", regs, address, cause_code);

out_of_memory:
	if (is_global_init(tsk)) {
		yield();
		goto survive;
	}
	up_read(&mm->mmap_sem);

	if (user_mode(regs))
		do_group_exit(SIGKILL);	/* This will never return */

	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	if (!user_mode(regs))
		goto no_context;

	tsk->thread.fault_address = address;
	tsk->thread.cause_code = cause_code;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
}
static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
					  void *buf, unsigned cisz, size_t nci)
{
	struct buffer_head *bh;
	struct nilfs_cpfile_header *header;
	struct nilfs_checkpoint *cp;
	struct nilfs_cpinfo *ci = buf;
	__u64 curr = *cnop, next;
	unsigned long curr_blkoff, next_blkoff;
	void *kaddr;
	int n = 0, ret;

	down_read(&NILFS_MDT(cpfile)->mi_sem);

	if (curr == 0) {
		ret = nilfs_cpfile_get_header_block(cpfile, &bh);
		if (ret < 0)
			goto out;
		kaddr = kmap_atomic(bh->b_page);
		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
		kunmap_atomic(kaddr);
		brelse(bh);
		if (curr == 0) {
			ret = 0;
			goto out;
		}
	} else if (unlikely(curr == ~(__u64)0)) {
		ret = 0;
		goto out;
	}

	curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
	ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
	if (unlikely(ret < 0)) {
		if (ret == -ENOENT)
			ret = 0; 
		goto out;
	}
	kaddr = kmap_atomic(bh->b_page);
	while (n < nci) {
		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
		curr = ~(__u64)0; 
		if (unlikely(nilfs_checkpoint_invalid(cp) ||
			     !nilfs_checkpoint_snapshot(cp)))
			break;
		nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
		ci = (void *)ci + cisz;
		n++;
		next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
		if (next == 0)
			break; 

		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
		if (curr_blkoff != next_blkoff) {
			kunmap_atomic(kaddr);
			brelse(bh);
			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
								0, &bh);
			if (unlikely(ret < 0)) {
				WARN_ON(ret == -ENOENT);
				goto out;
			}
			kaddr = kmap_atomic(bh->b_page);
		}
		curr = next;
		curr_blkoff = next_blkoff;
	}
	kunmap_atomic(kaddr);
	brelse(bh);
	*cnop = curr;
	ret = n;

 out:
	up_read(&NILFS_MDT(cpfile)->mi_sem);
	return ret;
}
Example #19
0
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct f2fs_inode_info *fi = F2FS_I(inode);
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	nid_t ino = inode->i_ino;
	int ret = 0;
	bool need_cp = false;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.for_reclaim = 0,
	};

	if (unlikely(f2fs_readonly(inode->i_sb)))
		return 0;

	trace_f2fs_sync_file_enter(inode);

	/* if fdatasync is triggered, let's do in-place-update */
	if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
		set_inode_flag(fi, FI_NEED_IPU);
	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	clear_inode_flag(fi, FI_NEED_IPU);

	if (ret) {
		trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
		return ret;
	}

	/* if the inode is dirty, let's recover all the time */
	if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) {
		update_inode_page(inode);
		goto go_write;
	}

	/*
	 * if there is no written data, don't waste time to write recovery info.
	 */
	if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
			!exist_written_data(sbi, ino, APPEND_INO)) {

		/* it may call write_inode just prior to fsync */
		if (need_inode_page_update(sbi, ino))
			goto go_write;

		if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
				exist_written_data(sbi, ino, UPDATE_INO))
			goto flush_out;
		goto out;
	}
go_write:
	/* guarantee free sections for fsync */
	f2fs_balance_fs(sbi);

	/*
	 * Both of fdatasync() and fsync() are able to be recovered from
	 * sudden-power-off.
	 */
	down_read(&fi->i_sem);
	need_cp = need_do_checkpoint(inode);
	up_read(&fi->i_sem);

	if (need_cp) {
		/* all the dirty node pages should be flushed for POR */
		ret = f2fs_sync_fs(inode->i_sb, 1);

		/*
		 * We've secured consistency through sync_fs. Following pino
		 * will be used only for fsynced inodes after checkpoint.
		 */
		try_to_fix_pino(inode);
		clear_inode_flag(fi, FI_APPEND_WRITE);
		clear_inode_flag(fi, FI_UPDATE_WRITE);
		goto out;
	}
sync_nodes:
	sync_node_pages(sbi, ino, &wbc);

	/* if cp_error was enabled, we should avoid infinite loop */
	if (unlikely(f2fs_cp_error(sbi)))
		goto out;

	if (need_inode_block_update(sbi, ino)) {
		mark_inode_dirty_sync(inode);
		f2fs_write_inode(inode, NULL);
		goto sync_nodes;
	}

	ret = wait_on_node_pages_writeback(sbi, ino);
	if (ret)
		goto out;

	/* once recovery info is written, don't need to tack this */
	remove_dirty_inode(sbi, ino, APPEND_INO);
	clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
	remove_dirty_inode(sbi, ino, UPDATE_INO);
	clear_inode_flag(fi, FI_UPDATE_WRITE);
	ret = f2fs_issue_flush(sbi);
out:
	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
	f2fs_trace_ios(NULL, 1);
	return ret;
}

static pgoff_t __get_first_dirty_index(struct address_space *mapping,
						pgoff_t pgofs, int whence)
{
	struct pagevec pvec;
	int nr_pages;

	if (whence != SEEK_DATA)
		return 0;

	/* find first dirty page index */
	pagevec_init(&pvec, 0);
	nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
					PAGECACHE_TAG_DIRTY, 1);
	pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
	pagevec_release(&pvec);
	return pgofs;
}

static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs,
							int whence)
{
	switch (whence) {
	case SEEK_DATA:
		if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
			(blkaddr != NEW_ADDR && blkaddr != NULL_ADDR))
			return true;
		break;
	case SEEK_HOLE:
		if (blkaddr == NULL_ADDR)
			return true;
		break;
	}
	return false;
}

static inline int unsigned_offsets(struct file *file)
{
	return file->f_mode & FMODE_UNSIGNED_OFFSET;
}

static loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 && !unsigned_offsets(file))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}

static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes = inode->i_sb->s_maxbytes;
	struct dnode_of_data dn;
	pgoff_t pgofs, end_offset, dirty;
	loff_t data_ofs = offset;
	loff_t isize;
	int err = 0;

	mutex_lock(&inode->i_mutex);

	isize = i_size_read(inode);
	if (offset >= isize)
		goto fail;

	/* handle inline data case */
	if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
		if (whence == SEEK_HOLE)
			data_ofs = isize;
		goto found;
	}

	pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);

	dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);

	for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
		set_new_dnode(&dn, inode, NULL, NULL, 0);
		err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
		if (err && err != -ENOENT) {
			goto fail;
		} else if (err == -ENOENT) {
			/* direct node does not exists */
			if (whence == SEEK_DATA) {
				pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
							F2FS_I(inode));
				continue;
			} else {
				goto found;
			}
		}

		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));

		/* find data/hole in dnode block */
		for (; dn.ofs_in_node < end_offset;
				dn.ofs_in_node++, pgofs++,
				data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
			block_t blkaddr;
			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);

			if (__found_offset(blkaddr, dirty, pgofs, whence)) {
				f2fs_put_dnode(&dn);
				goto found;
			}
		}
		f2fs_put_dnode(&dn);
	}

	if (whence == SEEK_DATA)
		goto fail;
found:
	if (whence == SEEK_HOLE && data_ofs > isize)
		data_ofs = isize;
	mutex_unlock(&inode->i_mutex);
	return vfs_setpos(file, data_ofs, maxbytes);
fail:
	mutex_unlock(&inode->i_mutex);
	return -ENXIO;
}

static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
	case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes);
	case SEEK_DATA:
	case SEEK_HOLE:
		if (offset < 0)
			return -ENXIO;
		return f2fs_seek_block(file, offset, whence);
	}

	return -EINVAL;
}

static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);

	if (f2fs_encrypted_inode(inode)) {
		int err = f2fs_get_encryption_info(inode);
		if (err)
			return 0;
	}

	/* we don't need to use inline_data strictly */
	if (f2fs_has_inline_data(inode)) {
		int err = f2fs_convert_inline_inode(inode);
		if (err)
			return err;
	}

	file_accessed(file);
	vma->vm_ops = &f2fs_file_vm_ops;
	return 0;
}

static int f2fs_file_open(struct inode *inode, struct file *filp)
{
	int ret = generic_file_open(inode, filp);

	if (!ret && f2fs_encrypted_inode(inode)) {
		ret = f2fs_get_encryption_info(inode);
		if (ret)
			ret = -EACCES;
	}
	return ret;
}
Example #20
0
/*
 * siw_post_receive()
 *
 * Post a list of R-WR's to a RQ.
 *
 * @ofa_qp:	OFA QP contained in siw QP
 * @wr:		Null terminated list of user WR's
 * @bad_wr:	Points to failing WR in case of synchronous failure.
 */
int siw_post_receive(struct ib_qp *ofa_qp, struct ib_recv_wr *wr,
		     struct ib_recv_wr **bad_wr)
{
	struct siw_wqe	*wqe = NULL;
	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
	unsigned long	flags;
	int rv = 0;

	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n", QP_ID(qp),
		qp->attrs.state);

	if (unlikely(qp->srq)) {
		*bad_wr = wr;
		return -EOPNOTSUPP; /* what else from errno.h? */
	}
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}

	if (qp->attrs.state > SIW_QP_STATE_RTS) {
		up_read(&qp->state_lock);
		dprint(DBG_ON, " (QP%d): state=%d\n", QP_ID(qp),
			qp->attrs.state);
		*bad_wr = wr;
		return -EINVAL;
	}
	while (wr) {
		wqe = siw_wqe_alloc(qp, SIW_WR_RECEIVE);
		if (!wqe) {
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.rq_max_sges) {
			dprint(DBG_WR|DBG_ON, "(QP%d): Num SGE: %d\n",
				QP_ID(qp), wr->num_sge);
			rv = -EINVAL;
			break;
		}
		wr_type(wqe) = SIW_WR_RECEIVE;
		wr_id(wqe) = wr->wr_id;

		rv = siw_copy_sgl(wr->sg_list, wqe->wr.recv.sge, wr->num_sge);
		if (rv < 0) {
			/*
			 * XXX tentatively allow zero length receive
			 */
			rv = -EINVAL;
			break;
		}
		wqe->wr.recv.num_sge = wr->num_sge;
		wqe->bytes = rv;
		wqe->wr_status = SR_WR_QUEUED;

		lock_rq_rxsave(qp, flags);
		list_add_tail(&wqe->list, &qp->rq);
		unlock_rq_rxsave(qp, flags);

		wr = wr->next;
	}
	if (rv <= 0) {
		dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);
		if (wqe != NULL)
			siw_wqe_put(wqe);
		*bad_wr = wr;
	}
	dprint(DBG_WR|DBG_RX, "(QP%d): rq_space=%d\n", QP_ID(qp),
		atomic_read(&qp->rq_space));

	up_read(&qp->state_lock);

	return rv > 0 ? 0 : rv;
}
/*
 * xfs_filestream_associate() should only be called to associate a regular file
 * with its parent directory.  Calling it with a child directory isn't
 * appropriate because filestreams don't apply to entire directory hierarchies.
 * Creating a file in a child directory of an existing filestream directory
 * starts a new filestream with its own allocation group association.
 *
 * Returns < 0 on error, 0 if successful association occurred, > 0 if
 * we failed to get an association because of locking issues.
 */
int
xfs_filestream_associate(
	xfs_inode_t	*pip,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	fstrm_item_t	*item;
	xfs_agnumber_t	ag, rotorstep, startag;
	int		err = 0;

	ASSERT(pip->i_d.di_mode & S_IFDIR);
	ASSERT(ip->i_d.di_mode & S_IFREG);
	if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG))
		return -EINVAL;

	mp = pip->i_mount;
	cache = mp->m_filestream;
	down_read(&mp->m_peraglock);

	/*
	 * We have a problem, Houston.
	 *
	 * Taking the iolock here violates inode locking order - we already
	 * hold the ilock. Hence if we block getting this lock we may never
	 * wake. Unfortunately, that means if we can't get the lock, we're
	 * screwed in terms of getting a stream association - we can't spin
	 * waiting for the lock because someone else is waiting on the lock we
	 * hold and we cannot drop that as we are in a transaction here.
	 *
	 * Lucky for us, this inversion is not a problem because it's a
	 * directory inode that we are trying to lock here.
	 *
	 * So, if we can't get the iolock without sleeping then just give up
	 */
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
		up_read(&mp->m_peraglock);
		return 1;
	}

	/* If the parent directory is already in the cache, use its AG. */
	item = xfs_mru_cache_lookup(cache, pip->i_ino);
	if (item) {
		ASSERT(item->ip == pip);
		ag = item->ag;
		xfs_mru_cache_done(cache);

		TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
		err = _xfs_filestream_update_ag(ip, pip, ag);

		goto exit;
	}

	/*
	 * Set the starting AG using the rotor for inode32, otherwise
	 * use the directory inode's AG.
	 */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		rotorstep = xfs_rotorstep;
		startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
		mp->m_agfrotor = (mp->m_agfrotor + 1) %
		                 (mp->m_sb.sb_agcount * rotorstep);
	} else
		startag = XFS_INO_TO_AGNO(mp, pip->i_ino);

	/* Pick a new AG for the parent inode starting at startag. */
	err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
	if (err || ag == NULLAGNUMBER)
		goto exit_did_pick;

	/* Associate the parent inode with the AG. */
	err = _xfs_filestream_update_ag(pip, NULL, ag);
	if (err)
		goto exit_did_pick;

	/* Associate the file inode with the AG. */
	err = _xfs_filestream_update_ag(ip, pip, ag);
	if (err)
		goto exit_did_pick;

	TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));

exit_did_pick:
	/*
	 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
	 * reference it took on it, since the file and directory will have taken
	 * their own now if they were successfully cached.
	 */
	if (ag != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, ag);

exit:
	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	up_read(&mp->m_peraglock);
	return -err;
}
Example #22
0
/*
 * siw_post_send()
 *
 * Post a list of S-WR's to a SQ.
 *
 * @ofa_qp:	OFA QP contained in siw QP
 * @wr:		Null terminated list of user WR's
 * @bad_wr:	Points to failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
		  struct ib_send_wr **bad_wr)
{
	struct siw_wqe	*wqe = NULL;
	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);

	unsigned long flags;
	int rv = 0;

	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
		QP_ID(qp), qp->attrs.state);

	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}

	if (qp->attrs.state != SIW_QP_STATE_RTS) {
		dprint(DBG_WR|DBG_ON, "(QP%d): state=%d\n",
			QP_ID(qp), qp->attrs.state);
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -ENOTCONN;
	}
	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#1)=%d\n",
		QP_ID(qp), atomic_read(&qp->sq_space));

	while (wr) {
		wqe = siw_wqe_alloc(qp, opcode_ofa2siw(wr->opcode));
		if (!wqe) {
			dprint(DBG_ON, " siw_wqe_alloc\n");
			rv = -ENOMEM;
			break;
		}
		wr_type(wqe) = opcode_ofa2siw(wr->opcode);
		wr_id(wqe) = wr->wr_id;

		wr_flags(wqe) = wr->send_flags;
		if (qp->attrs.flags & SIW_SIGNAL_ALL_WR)
			wr_flags(wqe) |= IB_SEND_SIGNALED;

		if (wr->num_sge > qp->attrs.sq_max_sges) {
			/*
			 * NOTE: we allow for zero length wr's here.
			 */
			dprint(DBG_WR, "(QP%d): Num SGE: %d\n",
				QP_ID(qp), wr->num_sge);
			rv = -EINVAL;
			break;
		}

		switch (wr->opcode) {

		case IB_WR_SEND:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.send.sge,
						  wr->num_sge);
				wqe->wr.send.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);

			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * OFED WR restricts RREAD sink to SGL containing
			 * 1 SGE only. we could relax to SGL with multiple
			 * elements referring the SAME ltag or even sending
			 * a private per-rreq tag referring to a checked
			 * local sgl with MULTIPLE ltag's. would be easy
			 * to do...
			 */
			if (wr->num_sge != 1) {
				rv = -EINVAL;
				break;
			}
			rv = siw_copy_sgl(wr->sg_list, wqe->wr.rread.sge, 1);
			/*
			 * NOTE: zero length RREAD is allowed!
			 */
			wqe->wr.rread.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.rread.rtag = wr->wr.rdma.rkey;
			wqe->wr.rread.num_sge = 1;
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_WRITE:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.send.sge,
						  wr->num_sge);
				wqe->wr.write.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);
			/*
			 * NOTE: zero length WRITE is allowed!
			 */
			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->wr.write.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.write.rtag = wr->wr.rdma.rkey;
			wqe->bytes = rv;
			break;

		default:
			dprint(DBG_WR|DBG_TX|DBG_ON,
				"(QP%d): Opcode %d not yet implemented\n",
				QP_ID(qp), wr->opcode);
			wqe->wr.sgl.num_sge = 0;
			rv = -ENOSYS;
			break;
		}
		dprint(DBG_WR|DBG_TX, "(QP%d): opcode %d, bytes %d, "
				"flags 0x%x\n",
				QP_ID(qp), wr_type(wqe), wqe->bytes,
				wr_flags(wqe));
		if (rv < 0)
			break;

		wqe->wr_status = SR_WR_QUEUED;

		lock_sq_rxsave(qp, flags);
		list_add_tail(&wqe->list, &qp->sq);
		unlock_sq_rxsave(qp, flags);

		wr = wr->next;
	}
	/*
	 * Send directly if SQ processing is not in progress.
	 * Eventual immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
	 * processing, if new work is already pending. But rv must be passed
	 * to caller.
	 */
	lock_sq_rxsave(qp, flags);
	if (tx_wqe(qp) == NULL) {
		struct siw_wqe	*next = siw_next_tx_wqe(qp);
		if (next != NULL) {
			if (wr_type(next) != SIW_WR_RDMA_READ_REQ ||
			    !ORD_SUSPEND_SQ(qp)) {
				tx_wqe(qp) = next;
				if (wr_type(next) != SIW_WR_RDMA_READ_REQ)
					list_del_init(&next->list);
				else
					siw_rreq_queue(next, qp);

				unlock_sq_rxsave(qp, flags);

				dprint(DBG_WR|DBG_TX,
					"(QP%d): Direct sending...\n",
					QP_ID(qp));

				if (qp->attrs.flags & SIW_KERNEL_VERBS)
					siw_sq_queue_work(qp);
				else if (siw_qp_sq_process(qp, 1) != 0 &&
				    !(qp->tx_ctx.tx_suspend))
					siw_qp_cm_drop(qp, 0);
			} else
				unlock_sq_rxsave(qp, flags);
		} else
			unlock_sq_rxsave(qp, flags);
	} else
		unlock_sq_rxsave(qp, flags);

	up_read(&qp->state_lock);

	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#2)=%d\n", QP_ID(qp),
		atomic_read(&qp->sq_space));
	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);

	if (wqe != NULL)
		siw_wqe_put(wqe);
	*bad_wr = wr;
	return rv;
}
Example #23
0
asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf)
{
	struct msg_queue *msq;
	int err, version;
	struct ipc_namespace *ns;

	if (msqid < 0 || cmd < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case MSG_INFO:
	{
		struct msginfo msginfo;
		int max_id;

		if (!buf)
			return -EFAULT;
		/*
		 * We must not return kernel stack data.
		 * due to padding, it's not enough
		 * to set all member fields.
		 */
		err = security_msg_queue_msgctl(NULL, cmd);
		if (err)
			return err;

		memset(&msginfo, 0, sizeof(msginfo));
		msginfo.msgmni = ns->msg_ctlmni;
		msginfo.msgmax = ns->msg_ctlmax;
		msginfo.msgmnb = ns->msg_ctlmnb;
		msginfo.msgssz = MSGSSZ;
		msginfo.msgseg = MSGSEG;
		down_read(&msg_ids(ns).rw_mutex);
		if (cmd == MSG_INFO) {
			msginfo.msgpool = msg_ids(ns).in_use;
			msginfo.msgmap = atomic_read(&ns->msg_hdrs);
			msginfo.msgtql = atomic_read(&ns->msg_bytes);
		} else {
			msginfo.msgmap = MSGMAP;
			msginfo.msgpool = MSGPOOL;
			msginfo.msgtql = MSGTQL;
		}
		max_id = ipc_get_maxid(&msg_ids(ns));
		up_read(&msg_ids(ns).rw_mutex);
		if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))
			return -EFAULT;
		return (max_id < 0) ? 0 : max_id;
	}
	case MSG_STAT:	/* msqid is an index rather than a msg queue id */
	case IPC_STAT:
	{
		struct msqid64_ds tbuf;
		int success_return;

		if (!buf)
			return -EFAULT;

		if (cmd == MSG_STAT) {
			msq = msg_lock(ns, msqid);
			if (IS_ERR(msq))
				return PTR_ERR(msq);
			success_return = msq->q_perm.id;
		} else {
			msq = msg_lock_check(ns, msqid);
			if (IS_ERR(msq))
				return PTR_ERR(msq);
			success_return = 0;
		}
		err = -EACCES;
		if (ipcperms(&msq->q_perm, S_IRUGO))
			goto out_unlock;

		err = security_msg_queue_msgctl(msq, cmd);
		if (err)
			goto out_unlock;

		memset(&tbuf, 0, sizeof(tbuf));

		kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);
		tbuf.msg_stime  = msq->q_stime;
		tbuf.msg_rtime  = msq->q_rtime;
		tbuf.msg_ctime  = msq->q_ctime;
		tbuf.msg_cbytes = msq->q_cbytes;
		tbuf.msg_qnum   = msq->q_qnum;
		tbuf.msg_qbytes = msq->q_qbytes;
		tbuf.msg_lspid  = msq->q_lspid;
		tbuf.msg_lrpid  = msq->q_lrpid;
		msg_unlock(msq);
		if (copy_msqid_to_user(buf, &tbuf, version))
			return -EFAULT;
		return success_return;
	}
	case IPC_SET:
	case IPC_RMID:
		err = msgctl_down(ns, msqid, cmd, buf, version);
		return err;
	default:
		return  -EINVAL;
	}

out_unlock:
	msg_unlock(msq);
	return err;
}
Example #24
0
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                                   struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, sig, code;
    unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
    unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    if (esr & ESR_LNX_EXEC) {
        vm_flags = VM_EXEC;
    } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
        vm_flags = VM_WRITE;
        mm_flags |= FAULT_FLAG_WRITE;
    }

    tsk = current;
    mm  = tsk->mm;

    /* Enable interrupts if they were enabled in the parent context. */
    if (interrupts_enabled(regs))
        local_irq_enable();

    /*
     * If we're in an interrupt or have no user context, we must not take
     * the fault.
     */
    if (in_atomic() || !mm)
        goto no_context;

    /*
     * As per x86, we may deadlock here. However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    if (!down_read_trylock(&mm->mmap_sem)) {
        if (!user_mode(regs) && !search_exception_tables(regs->pc))
            goto no_context;
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in which
         * case, we'll have missed the might_sleep() from down_read().
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) && !search_exception_tables(regs->pc))
            goto no_context;
#endif
    }

    fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);

    /*
     * If we need to retry but a fatal signal is pending, handle the
     * signal first. We do not need to release the mmap_sem because it
     * would already be released in __lock_page_or_retry in mm/filemap.c.
     */
    if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
        return 0;

    /*
     * Major/minor page fault accounting is only done on the initial
     * attempt. If we go through a retry, it is extremely likely that the
     * page will be found in page cache at that point.
     */

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                          addr);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                          addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /*
             * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
             * starvation.
             */
            mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
            goto retry;
        }
    }

    up_read(&mm->mmap_sem);

    /*
     * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
                          VM_FAULT_BADACCESS))))
        return 0;

    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we got
         * oom-killed).
         */
        pagefault_out_of_memory();
        return 0;
    }

    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (!user_mode(regs))
        goto no_context;

    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to successfully fix up
         * this page fault.
         */
        sig = SIGBUS;
        code = BUS_ADRERR;
    } else {
        /*
         * Something tried to access memory that isn't in our memory
         * map.
         */
        sig = SIGSEGV;
        code = fault == VM_FAULT_BADACCESS ?
               SEGV_ACCERR : SEGV_MAPERR;
    }

    __do_user_fault(tsk, addr, esr, sig, code, regs);
    return 0;

no_context:
    __do_kernel_fault(mm, addr, esr, regs);
    return 0;
}
int videobuf_qbuf(struct videobuf_queue *q,
	      struct v4l2_buffer *b)
{
	struct videobuf_buffer *buf;
	enum v4l2_field field;
	unsigned long flags = 0;
	int retval;

	MAGIC_CHECK(q->int_ops->magic, MAGIC_QTYPE_OPS);

	if (b->memory == V4L2_MEMORY_MMAP)
		down_read(&current->mm->mmap_sem);

	mutex_lock(&q->vb_lock);
	retval = -EBUSY;
	if (q->reading) {
		dprintk(1, "qbuf: Reading running...\n");
		goto done;
	}
	retval = -EINVAL;
	if (b->type != q->type) {
		dprintk(1, "qbuf: Wrong type.\n");
		goto done;
	}
	if (b->index >= VIDEO_MAX_FRAME) {
		dprintk(1, "qbuf: index out of range.\n");
		goto done;
	}
	buf = q->bufs[b->index];
	if (NULL == buf) {
		dprintk(1, "qbuf: buffer is null.\n");
		goto done;
	}
	MAGIC_CHECK(buf->magic, MAGIC_BUFFER);
	if (buf->memory != b->memory) {
		dprintk(1, "qbuf: memory type is wrong.\n");
		goto done;
	}
	if (buf->state != VIDEOBUF_NEEDS_INIT && buf->state != VIDEOBUF_IDLE) {
		dprintk(1, "qbuf: buffer is already queued or active.\n");
		goto done;
	}

	if (b->flags & V4L2_BUF_FLAG_INPUT) {
		if (b->input >= q->inputs) {
			dprintk(1, "qbuf: wrong input.\n");
			goto done;
		}
		buf->input = b->input;
	} else {
		buf->input = UNSET;
	}

	switch (b->memory) {
	case V4L2_MEMORY_MMAP:
		if (0 == buf->baddr) {
			dprintk(1, "qbuf: mmap requested "
				   "but buffer addr is zero!\n");
			goto done;
		}
		break;
	case V4L2_MEMORY_USERPTR:
		if (b->length < buf->bsize) {
			dprintk(1, "qbuf: buffer length is not enough\n");
			goto done;
		}
		if (VIDEOBUF_NEEDS_INIT != buf->state &&
		    buf->baddr != b->m.userptr)
			q->ops->buf_release(q, buf);
		buf->baddr = b->m.userptr;
		break;
	case V4L2_MEMORY_OVERLAY:
		buf->boff = b->m.offset;
		break;
	default:
		dprintk(1, "qbuf: wrong memory type\n");
		goto done;
	}

	dprintk(1, "qbuf: requesting next field\n");
	field = videobuf_next_field(q);
	retval = q->ops->buf_prepare(q, buf, field);
	if (0 != retval) {
		dprintk(1, "qbuf: buffer_prepare returned %d\n", retval);
		goto done;
	}

	list_add_tail(&buf->stream, &q->stream);
	if (q->streaming) {
		spin_lock_irqsave(q->irqlock, flags);
		q->ops->buf_queue(q, buf);
		spin_unlock_irqrestore(q->irqlock, flags);
	}
	dprintk(1, "qbuf: succeded\n");
	retval = 0;
	wake_up_interruptible_sync(&q->wait);

 done:
	mutex_unlock(&q->vb_lock);

	if (b->memory == V4L2_MEMORY_MMAP)
		up_read(&current->mm->mmap_sem);

	return retval;
}
/*
 * Logic: we've got two memory sums for each process, "shared", and
 * "non-shared". Shared memory may get counted more than once, for
 * each process that owns it. Non-shared memory is counted
 * accurately.
 */
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	struct vm_region *region;
	struct rb_node *p;
	unsigned long bytes = 0, sbytes = 0, slack = 0, size;
        
	down_read(&mm->mmap_sem);
	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
		vma = rb_entry(p, struct vm_area_struct, vm_rb);

		bytes += kobjsize(vma);

		region = vma->vm_region;
		if (region) {
			size = kobjsize(region);
			size += region->vm_end - region->vm_start;
		} else {
			size = vma->vm_end - vma->vm_start;
		}

		if (atomic_read(&mm->mm_count) > 1 ||
		    vma->vm_flags & VM_MAYSHARE) {
			sbytes += size;
		} else {
			bytes += size;
			if (region)
				slack = region->vm_end - vma->vm_end;
		}
	}

	if (atomic_read(&mm->mm_count) > 1)
		sbytes += kobjsize(mm);
	else
		bytes += kobjsize(mm);
	
	if (current->fs && current->fs->users > 1)
		sbytes += kobjsize(current->fs);
	else
		bytes += kobjsize(current->fs);

	if (current->files && atomic_read(&current->files->count) > 1)
		sbytes += kobjsize(current->files);
	else
		bytes += kobjsize(current->files);

	if (current->sighand && atomic_read(&current->sighand->count) > 1)
		sbytes += kobjsize(current->sighand);
	else
		bytes += kobjsize(current->sighand);

	bytes += kobjsize(current); /* includes kernel stack */

	seq_printf(m,
		"Mem:\t%8lu bytes\n"
		"Slack:\t%8lu bytes\n"
		"Shared:\t%8lu bytes\n",
		bytes, slack, sbytes);

	up_read(&mm->mmap_sem);
}
Example #27
0
/*
 * Allocate new inodes in the allocation group specified by agbp.
 * Return 0 for success, else error code.
 */
STATIC int				/* error code or 0 */
xfs_ialloc_ag_alloc(
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_buf_t	*agbp,		/* alloc group buffer */
	int		*alloc)
{
	xfs_agi_t	*agi;		/* allocation group header */
	xfs_alloc_arg_t	args;		/* allocation argument structure */
	int		blks_per_cluster;  /* fs blocks per inode cluster */
	xfs_btree_cur_t	*cur;		/* inode btree cursor */
	xfs_daddr_t	d;		/* disk addr of buffer */
	int		error;
	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
	xfs_dinode_t	*free;		/* new free inode structure */
	int		i;		/* inode counter */
	int		j;		/* block counter */
	int		nbufs;		/* num bufs of new inodes */
	xfs_agino_t	newino;		/* new first inode's number */
	xfs_agino_t	newlen;		/* new number of inodes */
	int		ninodes;	/* num inodes per buf */
	xfs_agino_t	thisino;	/* current inode number, for loop */
	int		version;	/* inode version number to use */
	int		isaligned = 0;	/* inode allocation at stripe unit */
					/* boundary */

	args.tp = tp;
	args.mp = tp->t_mountp;

	/*
	 * Locking will ensure that we don't have two callers in here
	 * at one time.
	 */
	newlen = XFS_IALLOC_INODES(args.mp);
	if (args.mp->m_maxicount &&
	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
		return XFS_ERROR(ENOSPC);
	args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
	/*
	 * First try to allocate inodes contiguous with the last-allocated
	 * chunk of inodes.  If the filesystem is striped, this will fill
	 * an entire stripe unit with inodes.
 	 */
	agi = XFS_BUF_TO_AGI(agbp);
	newino = be32_to_cpu(agi->agi_newino);
	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
			XFS_IALLOC_BLOCKS(args.mp);
	if (likely(newino != NULLAGINO &&
		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		args.type = XFS_ALLOCTYPE_THIS_BNO;
		args.mod = args.total = args.wasdel = args.isfl =
			args.userdata = args.minalignslop = 0;
		args.prod = 1;
		args.alignment = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	} else
		args.fsbno = NULLFSBLOCK;

	if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (args.mp->m_sinoalign) {
			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
			   args.mp->m_sb.sb_inoalignmt >=
			   XFS_B_TO_FSBT(args.mp,
			  	XFS_INODE_CLUSTER_SIZE(args.mp)))
				args.alignment = args.mp->m_sb.sb_inoalignmt;
		else
			args.alignment = 1;
		/*
		 * Need to figure out where to allocate the inode blocks.
		 * Ideally they should be spaced out through the a.g.
		 * For now, just allocate blocks up front.
		 */
		args.agbno = be32_to_cpu(agi->agi_root);
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.mod = args.total = args.wasdel = args.isfl =
			args.userdata = args.minalignslop = 0;
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}

	/*
	 * If stripe alignment is turned on, then try again with cluster
	 * alignment.
	 */
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.agbno = be32_to_cpu(agi->agi_root);
		args.fsbno = XFS_AGB_TO_FSB(args.mp,
				be32_to_cpu(agi->agi_seqno), args.agbno);
		if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
			args.mp->m_sb.sb_inoalignmt >=
			XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
				args.alignment = args.mp->m_sb.sb_inoalignmt;
		else
			args.alignment = 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}

	if (args.fsbno == NULLFSBLOCK) {
		*alloc = 0;
		return 0;
	}
	ASSERT(args.len == args.minlen);
	/*
	 * Convert the results.
	 */
	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
	/*
	 * Loop over the new block(s), filling in the inodes.
	 * For small block sizes, manipulate the inodes in buffers
	 * which are multiples of the blocks size.
	 */
	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
		blks_per_cluster = 1;
		nbufs = (int)args.len;
		ninodes = args.mp->m_sb.sb_inopblock;
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
				   args.mp->m_sb.sb_blocksize;
		nbufs = (int)args.len / blks_per_cluster;
		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
	}
	/*
	 * Figure out what version number to use in the inodes we create.
	 * If the superblock version has caught up to the one that supports
	 * the new inode format, then use the new inode version.  Otherwise
	 * use the old version so that old kernels will continue to be
	 * able to use the file system.
	 */
	if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
		version = XFS_DINODE_VERSION_2;
	else
		version = XFS_DINODE_VERSION_1;

	for (j = 0; j < nbufs; j++) {
		/*
		 * Get the block.
		 */
		d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
				     args.agbno + (j * blks_per_cluster));
		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
					 args.mp->m_bsize * blks_per_cluster,
					 XFS_BUF_LOCK);
		ASSERT(fbuf);
		ASSERT(!XFS_BUF_GETERROR(fbuf));
		/*
		 * Set initial values for the inodes in this buffer.
		 */
		xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
		for (i = 0; i < ninodes; i++) {
			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
			INT_SET(free->di_core.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
			INT_SET(free->di_core.di_version, ARCH_CONVERT, version);
			INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
			xfs_ialloc_log_di(tp, fbuf, i,
				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
		}
		xfs_trans_inode_alloc_buf(tp, fbuf);
	}
	be32_add(&agi->agi_count, newlen);
	be32_add(&agi->agi_freecount, newlen);
	down_read(&args.mp->m_peraglock);
	args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen;
	up_read(&args.mp->m_peraglock);
	agi->agi_newino = cpu_to_be32(newino);
	/*
	 * Insert records describing the new inode chunk into the btree.
	 */
	cur = xfs_btree_init_cursor(args.mp, tp, agbp,
			be32_to_cpu(agi->agi_seqno),
			XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		if ((error = xfs_inobt_lookup_eq(cur, thisino,
				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 0);
		if ((error = xfs_inobt_insert(cur, &i))) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	/*
	 * Log allocation group header fields
	 */
	xfs_ialloc_log_agi(tp, agbp,
		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
	/*
	 * Modify/log superblock values for inode count and inode free count.
	 */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
	*alloc = 1;
	return 0;
}
Example #28
0
/*
 * inode->i_mutex: down
 */
int
reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
		   size_t buffer_size)
{
	ssize_t err = 0;
	struct dentry *dentry;
	size_t isize;
	size_t file_pos = 0;
	size_t buffer_pos = 0;
	struct page *page;
	__u32 hash = 0;

	if (name == NULL)
		return -EINVAL;

	/* We can't have xattrs attached to v1 items since they don't have
	 * generation numbers */
	if (get_inode_sd_version(inode) == STAT_DATA_V1)
		return -EOPNOTSUPP;

	dentry = xattr_lookup(inode, name, XATTR_REPLACE);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	down_read(&REISERFS_I(inode)->i_xattr_sem);

	isize = i_size_read(dentry->d_inode);

	/* Just return the size needed */
	if (buffer == NULL) {
		err = isize - sizeof(struct reiserfs_xattr_header);
		goto out_unlock;
	}

	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
		err = -ERANGE;
		goto out_unlock;
	}

	while (file_pos < isize) {
		size_t chunk;
		char *data;
		size_t skip = 0;
		if (isize - file_pos > PAGE_CACHE_SIZE)
			chunk = PAGE_CACHE_SIZE;
		else
			chunk = isize - file_pos;

		page = reiserfs_get_page(dentry->d_inode, file_pos);
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto out_unlock;
		}

		lock_page(page);
		data = page_address(page);
		if (file_pos == 0) {
			struct reiserfs_xattr_header *rxh =
			    (struct reiserfs_xattr_header *)data;
			skip = file_pos = sizeof(struct reiserfs_xattr_header);
			chunk -= skip;
			/* Magic doesn't match up.. */
			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
				unlock_page(page);
				reiserfs_put_page(page);
				reiserfs_warning(inode->i_sb, "jdm-20001",
						 "Invalid magic for xattr (%s) "
						 "associated with %k", name,
						 INODE_PKEY(inode));
				err = -EIO;
				goto out_unlock;
			}
			hash = le32_to_cpu(rxh->h_hash);
		}
		memcpy(buffer + buffer_pos, data + skip, chunk);
		unlock_page(page);
		reiserfs_put_page(page);
		file_pos += chunk;
		buffer_pos += chunk;
		skip = 0;
	}
	err = isize - sizeof(struct reiserfs_xattr_header);

	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
	    hash) {
		reiserfs_warning(inode->i_sb, "jdm-20002",
				 "Invalid hash for xattr (%s) associated "
				 "with %k", name, INODE_PKEY(inode));
		err = -EIO;
	}

out_unlock:
	up_read(&REISERFS_I(inode)->i_xattr_sem);
	dput(dentry);

out:
	return err;
}
Example #29
0
/*
 * Allocate an inode on disk.
 * Mode is used to tell whether the new inode will need space, and whether
 * it is a directory.
 *
 * The arguments IO_agbp and alloc_done are defined to work within
 * the constraint of one allocation per transaction.
 * xfs_dialloc() is designed to be called twice if it has to do an
 * allocation to make more free inodes.  On the first call,
 * IO_agbp should be set to NULL. If an inode is available,
 * i.e., xfs_dialloc() did not need to do an allocation, an inode
 * number is returned.  In this case, IO_agbp would be set to the
 * current ag_buf and alloc_done set to false.
 * If an allocation needed to be done, xfs_dialloc would return
 * the current ag_buf in IO_agbp and set alloc_done to true.
 * The caller should then commit the current transaction, allocate a new
 * transaction, and call xfs_dialloc() again, passing in the previous
 * value of IO_agbp.  IO_agbp should be held across the transactions.
 * Since the agbp is locked across the two calls, the second call is
 * guaranteed to have a free inode available.
 *
 * Once we successfully pick an inode its number is returned and the
 * on-disk data structures are updated.  The inode itself is not read
 * in, since doing so would break ordering constraints with xfs_reclaim.
 */
int
xfs_dialloc(
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_ino_t	parent,		/* parent inode (directory) */
	mode_t		mode,		/* mode bits for new inode */
	int		okalloc,	/* ok to allocate more space */
	xfs_buf_t	**IO_agbp,	/* in/out ag header's buffer */
	boolean_t	*alloc_done,	/* true if we needed to replenish
					   inode freelist */
	xfs_ino_t	*inop)		/* inode number allocated */
{
	xfs_agnumber_t	agcount;	/* number of allocation groups */
	xfs_buf_t	*agbp;		/* allocation group header's buffer */
	xfs_agnumber_t	agno;		/* allocation group number */
	xfs_agi_t	*agi;		/* allocation group header structure */
	xfs_btree_cur_t	*cur;		/* inode allocation btree cursor */
	int		error;		/* error return value */
	int		i;		/* result code */
	int		ialloced;	/* inode allocation status */
	int		noroom = 0;	/* no space for inode blk allocation */
	xfs_ino_t	ino;		/* fs-relative inode to be returned */
	/* REFERENCED */
	int		j;		/* result code */
	xfs_mount_t	*mp;		/* file system mount structure */
	int		offset;		/* index of inode in chunk */
	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
	xfs_agnumber_t	pagno;		/* parent's allocation group number */
	xfs_inobt_rec_incore_t rec;	/* inode allocation record */
	xfs_agnumber_t	tagno;		/* testing allocation group number */
	xfs_btree_cur_t	*tcur;		/* temp cursor */
	xfs_inobt_rec_incore_t trec;	/* temp inode allocation record */


	if (*IO_agbp == NULL) {
		/*
		 * We do not have an agbp, so select an initial allocation
		 * group for inode allocation.
		 */
		agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
		/*
		 * Couldn't find an allocation group satisfying the
		 * criteria, give up.
		 */
		if (!agbp) {
			*inop = NULLFSINO;
			return 0;
		}
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
	} else {
		/*
		 * Continue where we left off before.  In this case, we
		 * know that the allocation group has free inodes.
		 */
		agbp = *IO_agbp;
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
		ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
	}
	mp = tp->t_mountp;
	agcount = mp->m_sb.sb_agcount;
	agno = be32_to_cpu(agi->agi_seqno);
	tagno = agno;
	pagno = XFS_INO_TO_AGNO(mp, parent);
	pagino = XFS_INO_TO_AGINO(mp, parent);

	/*
	 * If we have already hit the ceiling of inode blocks then clear
	 * okalloc so we scan all available agi structures for a free
	 * inode.
	 */

	if (mp->m_maxicount &&
	    mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
		noroom = 1;
		okalloc = 0;
	}

	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes.  Iterate through the
	 * allocation groups upward, wrapping at the end.
	 */
	*alloc_done = B_FALSE;
	while (!agi->agi_freecount) {
		/*
		 * Don't do anything if we're not supposed to allocate
		 * any blocks, just go on to the next ag.
		 */
		if (okalloc) {
			/*
			 * Try to allocate some new inodes in the allocation
			 * group.
			 */
			if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
				xfs_trans_brelse(tp, agbp);
				if (error == ENOSPC) {
					*inop = NULLFSINO;
					return 0;
				} else
					return error;
			}
			if (ialloced) {
				/*
				 * We successfully allocated some inodes, return
				 * the current context to the caller so that it
				 * can commit the current transaction and call
				 * us again where we left off.
				 */
				ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
				*alloc_done = B_TRUE;
				*IO_agbp = agbp;
				*inop = NULLFSINO;
				return 0;
			}
		}
		/*
		 * If it failed, give up on this ag.
		 */
		xfs_trans_brelse(tp, agbp);
		/*
		 * Go on to the next ag: get its ag header.
		 */
nextag:
		if (++tagno == agcount)
			tagno = 0;
		if (tagno == agno) {
			*inop = NULLFSINO;
			return noroom ? ENOSPC : 0;
		}
		down_read(&mp->m_peraglock);
		if (mp->m_perag[tagno].pagi_inodeok == 0) {
			up_read(&mp->m_peraglock);
			goto nextag;
		}
		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
		up_read(&mp->m_peraglock);
		if (error)
			goto nextag;
		agi = XFS_BUF_TO_AGI(agbp);
		ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
	}
	/*
	 * Here with an allocation group that has a free inode.
	 * Reset agno since we may have chosen a new ag in the
	 * loop above.
	 */
	agno = tagno;
	*IO_agbp = NULL;
	cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
				    XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);
#ifdef DEBUG
	if (cur->bc_nlevels == 1) {
		int	freecount = 0;

		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
			goto error0;
		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
		do {
			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
					&rec.ir_freecount, &rec.ir_free, &i)))
				goto error0;
			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			freecount += rec.ir_freecount;
			if ((error = xfs_inobt_increment(cur, 0, &i)))
				goto error0;
		} while (i == 1);

		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
		       XFS_FORCED_SHUTDOWN(mp));
	}
#endif
	/*
	 * If in the same a.g. as the parent, try to get near the parent.
	 */
	if (pagno == agno) {
		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
			goto error0;
		if (i != 0 &&
		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
		    j == 1 &&
		    rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as parent, done.
			 */
		}
		/*
		 * In the same a.g. as parent, but parent's chunk is full.
		 */
		else {
			int	doneleft;	/* done, to the left */
			int	doneright;	/* done, to the right */

			if (error)
				goto error0;
			ASSERT(i == 1);
			ASSERT(j == 1);
			/*
			 * Duplicate the cursor, search left & right
			 * simultaneously.
			 */
			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
				goto error0;
			/*
			 * Search left with tcur, back up 1 record.
			 */
			if ((error = xfs_inobt_decrement(tcur, 0, &i)))
				goto error1;
			doneleft = !i;
			if (!doneleft) {
				if ((error = xfs_inobt_get_rec(tcur,
						&trec.ir_startino,
						&trec.ir_freecount,
						&trec.ir_free, &i)))
					goto error1;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
			}
			/*
			 * Search right with cur, go forward 1 record.
			 */
			if ((error = xfs_inobt_increment(cur, 0, &i)))
				goto error1;
			doneright = !i;
			if (!doneright) {
				if ((error = xfs_inobt_get_rec(cur,
						&rec.ir_startino,
						&rec.ir_freecount,
						&rec.ir_free, &i)))
					goto error1;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
			}
			/*
			 * Loop until we find the closest inode chunk
			 * with a free one.
			 */
			while (!doneleft || !doneright) {
				int	useleft;  /* using left inode
						     chunk this time */

				/*
				 * Figure out which block is closer,
				 * if both are valid.
				 */
				if (!doneleft && !doneright)
					useleft =
						pagino -
						(trec.ir_startino +
						 XFS_INODES_PER_CHUNK - 1) <
						 rec.ir_startino - pagino;
				else
					useleft = !doneleft;
				/*
				 * If checking the left, does it have
				 * free inodes?
				 */
				if (useleft && trec.ir_freecount) {
					/*
					 * Yes, set it up as the chunk to use.
					 */
					rec = trec;
					xfs_btree_del_cursor(cur,
						XFS_BTREE_NOERROR);
					cur = tcur;
					break;
				}
				/*
				 * If checking the right, does it have
				 * free inodes?
				 */
				if (!useleft && rec.ir_freecount) {
					/*
					 * Yes, it's already set up.
					 */
					xfs_btree_del_cursor(tcur,
						XFS_BTREE_NOERROR);
					break;
				}
				/*
				 * If used the left, get another one
				 * further left.
				 */
				if (useleft) {
					if ((error = xfs_inobt_decrement(tcur, 0,
							&i)))
						goto error1;
					doneleft = !i;
					if (!doneleft) {
						if ((error = xfs_inobt_get_rec(
							    tcur,
							    &trec.ir_startino,
							    &trec.ir_freecount,
							    &trec.ir_free, &i)))
							goto error1;
						XFS_WANT_CORRUPTED_GOTO(i == 1,
							error1);
					}
				}
				/*
				 * If used the right, get another one
				 * further right.
				 */
				else {
					if ((error = xfs_inobt_increment(cur, 0,
							&i)))
						goto error1;
					doneright = !i;
					if (!doneright) {
						if ((error = xfs_inobt_get_rec(
							    cur,
							    &rec.ir_startino,
							    &rec.ir_freecount,
							    &rec.ir_free, &i)))
							goto error1;
						XFS_WANT_CORRUPTED_GOTO(i == 1,
							error1);
					}
				}
			}
			ASSERT(!doneleft || !doneright);
		}
	}
	/*
	 * In a different a.g. from the parent.
	 * See if the most recently allocated block has any free.
	 */
	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
		if ((error = xfs_inobt_lookup_eq(cur,
				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
			goto error0;
		if (i == 1 &&
		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
		    j == 1 &&
		    rec.ir_freecount > 0) {
			/*
			 * The last chunk allocated in the group still has
			 * a free inode.
			 */
		}
		/*
		 * None left in the last group, search the whole a.g.
		 */
		else {
			if (error)
				goto error0;
			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
				goto error0;
			ASSERT(i == 1);
			for (;;) {
				if ((error = xfs_inobt_get_rec(cur,
						&rec.ir_startino,
						&rec.ir_freecount, &rec.ir_free,
						&i)))
					goto error0;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
				if (rec.ir_freecount > 0)
					break;
				if ((error = xfs_inobt_increment(cur, 0, &i)))
					goto error0;
				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			}
		}
	}
	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
	XFS_INOBT_CLR_FREE(&rec, offset);
	rec.ir_freecount--;
	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
			rec.ir_free)))
		goto error0;
	be32_add(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	down_read(&mp->m_peraglock);
	mp->m_perag[tagno].pagi_freecount--;
	up_read(&mp->m_peraglock);
#ifdef DEBUG
	if (cur->bc_nlevels == 1) {
		int	freecount = 0;

		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
			goto error0;
		do {
			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
					&rec.ir_freecount, &rec.ir_free, &i)))
				goto error0;
			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
			freecount += rec.ir_freecount;
			if ((error = xfs_inobt_increment(cur, 0, &i)))
				goto error0;
		} while (i == 1);
		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
		       XFS_FORCED_SHUTDOWN(mp));
	}
#endif
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
/*
 * Try to write data in the inode.
 * If the inode has inline data, check whether the new write can be
 * in the inode also. If not, create the page the handle, move the data
 * to the page make it update and let the later codes create extent for it.
 */
int ext4_try_to_write_inline_data(struct address_space *mapping,
				  struct inode *inode,
				  loff_t pos, unsigned len,
				  unsigned flags,
				  struct page **pagep)
{
	int ret;
	handle_t *handle;
	struct page *page;
	struct ext4_iloc iloc;

	if (pos + len > ext4_get_max_inline_size(inode))
		goto convert;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/*
	 * The possible write could happen in the inode,
	 * so try to reserve the space in inode first.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		goto out;
	}

	ret = ext4_prepare_inline_data(handle, inode, pos + len);
	if (ret && ret != -ENOSPC)
		goto out;

	/* We don't have space in inline inode, so convert it to extent. */
	if (ret == -ENOSPC) {
		ext4_journal_stop(handle);
		brelse(iloc.bh);
		goto convert;
	}

	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, 0, flags);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	*pagep = page;
	down_read(&EXT4_I(inode)->xattr_sem);
	if (!ext4_has_inline_data(inode)) {
		ret = 0;
		unlock_page(page);
		page_cache_release(page);
		goto out_up_read;
	}

	if (!PageUptodate(page)) {
		ret = ext4_read_inline_page(inode, page);
		if (ret < 0)
			goto out_up_read;
	}

	ret = 1;
	handle = NULL;
out_up_read:
	up_read(&EXT4_I(inode)->xattr_sem);
out:
	if (handle)
		ext4_journal_stop(handle);
	brelse(iloc.bh);
	return ret;
convert:
	return ext4_convert_inline_data_to_extent(mapping,
						  inode, flags);
}