Example No. 1
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
	unsigned i, j, limit, count;
	int do_clear = 0;
	char c;
	int error;

	error = check_syslog_permissions(type, from_file);
	if (error)
		goto out;

	error = security_syslog(type);
	if (error)
		return error;

	switch (type) {
	case SYSLOG_ACTION_CLOSE:	/* Close log */
		break;
	case SYSLOG_ACTION_OPEN:	/* Open log */
		break;
	case SYSLOG_ACTION_READ:	/* Read from log */
		error = -EINVAL;
		if (!buf || len < 0)
			goto out;
		error = 0;
		if (!len)
			goto out;
		if (!access_ok(VERIFY_WRITE, buf, len)) {
			error = -EFAULT;
			goto out;
		}
		error = wait_event_interruptible(log_wait,
							(log_start - log_end));
		if (error)
			goto out;
		i = 0;
		spin_lock_irq(&logbuf_lock);
		while (!error && (log_start != log_end) && i < len) {
			c = LOG_BUF(log_start);
			log_start++;
			spin_unlock_irq(&logbuf_lock);
			error = __put_user(c,buf);
			buf++;
			i++;
			cond_resched();
			spin_lock_irq(&logbuf_lock);
		}
		spin_unlock_irq(&logbuf_lock);
		if (!error)
			error = i;
		break;
	/* Read/clear last kernel messages */
	case SYSLOG_ACTION_READ_CLEAR:
		do_clear = 1;
		/* FALL THRU */
	/* Read last kernel messages */
	case SYSLOG_ACTION_READ_ALL:
		error = -EINVAL;
		if (!buf || len < 0)
			goto out;
		error = 0;
		if (!len)
			goto out;
		if (!access_ok(VERIFY_WRITE, buf, len)) {
			error = -EFAULT;
			goto out;
		}
		count = len;
		if (count > log_buf_len)
			count = log_buf_len;
		spin_lock_irq(&logbuf_lock);
		if (count > logged_chars)
			count = logged_chars;
		if (do_clear)
			logged_chars = 0;
		limit = log_end;
		/*
		 * __put_user() could sleep, and while we sleep
		 * printk() could overwrite the messages
		 * we try to copy to user space. Therefore
		 * the messages are copied in reverse. <manfreds>
		 */
		for (i = 0; i < count && !error; i++) {
			j = limit-1-i;
			if (j + log_buf_len < log_end)
				break;
			c = LOG_BUF(j);
			spin_unlock_irq(&logbuf_lock);
			error = __put_user(c,&buf[count-1-i]);
			cond_resched();
			spin_lock_irq(&logbuf_lock);
		}
		spin_unlock_irq(&logbuf_lock);
		if (error)
			break;
		error = i;
		if (i != count) {
			int offset = count-error;
			/* buffer overflow during copy, correct user buffer. */
			for (i = 0; i < error; i++) {
				if (__get_user(c,&buf[i+offset]) ||
				    __put_user(c,&buf[i])) {
					error = -EFAULT;
					break;
				}
				cond_resched();
			}
		}
		break;
	/* Clear ring buffer */
	case SYSLOG_ACTION_CLEAR:
		logged_chars = 0;
		break;
	/* Disable logging to console */
	case SYSLOG_ACTION_CONSOLE_OFF:
		if (saved_console_loglevel == -1)
			saved_console_loglevel = console_loglevel;
		console_loglevel = minimum_console_loglevel;
		break;
	/* Enable logging to console */
	case SYSLOG_ACTION_CONSOLE_ON:
		if (saved_console_loglevel != -1) {
			console_loglevel = saved_console_loglevel;
			saved_console_loglevel = -1;
		}
		break;
	/* Set level of messages printed to console */
	case SYSLOG_ACTION_CONSOLE_LEVEL:
		error = -EINVAL;
		if (len < 1 || len > 8)
			goto out;
		if (len < minimum_console_loglevel)
			len = minimum_console_loglevel;
		console_loglevel = len;
		/* Implicitly re-enable logging to console */
		saved_console_loglevel = -1;
		error = 0;
		break;
	/* Number of chars in the log buffer */
	case SYSLOG_ACTION_SIZE_UNREAD:
		error = log_end - log_start;
		break;
	/* Size of the log buffer */
	case SYSLOG_ACTION_SIZE_BUFFER:
		error = log_buf_len;
		break;
	default:
		error = -EINVAL;
		break;
	}
out:
	return error;
}
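
The SYSLOG_ACTION_READ branch above shows a pattern that recurs throughout these examples: the spinlock protecting the ring buffer is dropped before each __put_user(), because copying to user space may fault and sleep, and cond_resched() is called before the lock is retaken so other tasks can run during a long copy. The snippet below is a minimal stand-alone sketch of that pattern only, not code from the example; the ring buffer, its lock, and the helper name are invented for illustration.

#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/sched.h>

static DEFINE_SPINLOCK(ring_lock);
static char ring[1024];
static unsigned int ring_head, ring_tail;

/* Copy up to @len bytes out of the ring buffer into user memory. */
static int copy_ring_to_user(char __user *buf, int len)
{
	int copied = 0;
	char c;

	spin_lock_irq(&ring_lock);
	while (ring_tail != ring_head && copied < len) {
		c = ring[ring_tail++ % sizeof(ring)];
		/* put_user() may sleep, so drop the lock around it. */
		spin_unlock_irq(&ring_lock);
		if (put_user(c, buf + copied))
			return -EFAULT;
		copied++;
		cond_resched();		/* give other tasks a chance to run */
		spin_lock_irq(&ring_lock);
	}
	spin_unlock_irq(&ring_lock);
	return copied;
}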
Example No. 2
static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
				struct jffs2_raw_summary *summary, uint32_t *pseudo_random)
{
	struct jffs2_inode_cache *ic;
	struct jffs2_full_dirent *fd;
	void *sp;
	int i, ino;
	int err;

	sp = summary->sum;

	for (i=0; i<je32_to_cpu(summary->sum_num); i++) {
		dbg_summary("processing summary index %d\n", i);

		cond_resched();

		/* Make sure there's a spare ref for dirty space */
		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
		if (err)
			return err;

		switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) {
			case JFFS2_NODETYPE_INODE: {
				struct jffs2_sum_inode_flash *spi;
				spi = sp;

				ino = je32_to_cpu(spi->inode);

				dbg_summary("Inode at 0x%08x-0x%08x\n",
					    jeb->offset + je32_to_cpu(spi->offset),
					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen));

				ic = jffs2_scan_make_ino_cache(c, ino);
				if (!ic) {
					JFFS2_NOTICE("scan_make_ino_cache failed\n");
					return -ENOMEM;
				}

				sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED,
						  PAD(je32_to_cpu(spi->totlen)), ic);

				*pseudo_random += je32_to_cpu(spi->version);

				sp += JFFS2_SUMMARY_INODE_SIZE;

				break;
			}

			case JFFS2_NODETYPE_DIRENT: {
				struct jffs2_sum_dirent_flash *spd;
				spd = sp;

				dbg_summary("Dirent at 0x%08x-0x%08x\n",
					    jeb->offset + je32_to_cpu(spd->offset),
					    jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));


				fd = jffs2_alloc_full_dirent(spd->nsize+1);
				if (!fd)
					return -ENOMEM;

				memcpy(&fd->name, spd->name, spd->nsize);
				fd->name[spd->nsize] = 0;

				ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino));
				if (!ic) {
					jffs2_free_full_dirent(fd);
					return -ENOMEM;
				}

				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_UNCHECKED,
							    PAD(je32_to_cpu(spd->totlen)), ic);

				fd->next = NULL;
				fd->version = je32_to_cpu(spd->version);
				fd->ino = je32_to_cpu(spd->ino);
				fd->nhash = full_name_hash(fd->name, spd->nsize);
				fd->type = spd->type;

				jffs2_add_fd_to_list(c, fd, &ic->scan_dents);

				*pseudo_random += je32_to_cpu(spd->version);

				sp += JFFS2_SUMMARY_DIRENT_SIZE(spd->nsize);

				break;
			}
#ifdef CONFIG_JFFS2_FS_XATTR
			case JFFS2_NODETYPE_XATTR: {
				struct jffs2_xattr_datum *xd;
				struct jffs2_sum_xattr_flash *spx;

				spx = (struct jffs2_sum_xattr_flash *)sp;
				dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", 
					    jeb->offset + je32_to_cpu(spx->offset),
					    jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));

				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
								je32_to_cpu(spx->version));
				if (IS_ERR(xd))
					return PTR_ERR(xd);
				if (xd->version > je32_to_cpu(spx->version)) {
					/* node is not the newest one */
					struct jffs2_raw_node_ref *raw
						= sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
								    PAD(je32_to_cpu(spx->totlen)), NULL);
					raw->next_in_ino = xd->node->next_in_ino;
					xd->node->next_in_ino = raw;
				} else {
					xd->version = je32_to_cpu(spx->version);
					sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
							  PAD(je32_to_cpu(spx->totlen)), (void *)xd);
				}
				*pseudo_random += je32_to_cpu(spx->xid);
				sp += JFFS2_SUMMARY_XATTR_SIZE;

				break;
			}
			case JFFS2_NODETYPE_XREF: {
				struct jffs2_xattr_ref *ref;
				struct jffs2_sum_xref_flash *spr;

				spr = (struct jffs2_sum_xref_flash *)sp;
				dbg_summary("xref at %#08x-%#08x\n",
					    jeb->offset + je32_to_cpu(spr->offset),
					    jeb->offset + je32_to_cpu(spr->offset) + 
					    (uint32_t)PAD(sizeof(struct jffs2_raw_xref)));

				ref = jffs2_alloc_xattr_ref();
				if (!ref) {
					JFFS2_NOTICE("allocation of xattr_datum failed\n");
					return -ENOMEM;
				}
				ref->next = c->xref_temp;
				c->xref_temp = ref;

				sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
						  PAD(sizeof(struct jffs2_raw_xref)), (void *)ref);

				*pseudo_random += ref->node->flash_offset;
				sp += JFFS2_SUMMARY_XREF_SIZE;

				break;
			}
#endif
			default : {
				uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype);
				JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype);
				if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT)
					return -EIO;

				/* For compatible node types, just fall back to the full scan */
				c->wasted_size -= jeb->wasted_size;
				c->free_size += c->sector_size - jeb->free_size;
				c->used_size -= jeb->used_size;
				c->dirty_size -= jeb->dirty_size;
				jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
				jeb->free_size = c->sector_size;

				jffs2_free_jeb_node_refs(c, jeb);
				return -ENOTRECOVERABLE;
			}
		}
	}
	return 0;
}
Example No. 3
int __generic_block_fiemap(struct inode *inode,
			   struct fiemap_extent_info *fieinfo, u64 start,
			   u64 len, get_block_t *get_block)
{
	struct buffer_head tmp;
	unsigned int start_blk;
	long long length = 0, map_len = 0;
	u64 logical = 0, phys = 0, size = 0;
	u32 flags = FIEMAP_EXTENT_MERGED;
	int ret = 0, past_eof = 0, whole_file = 0;

	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
		return ret;

	start_blk = logical_to_blk(inode, start);

	length = (long long)min_t(u64, len, i_size_read(inode));
	if (length < len)
		whole_file = 1;

	map_len = length;

	do {
		/*
		 * we set b_size to the total size we want so it will map as
		 * many contiguous blocks as possible at once
		 */
		memset(&tmp, 0, sizeof(struct buffer_head));
		tmp.b_size = map_len;

		ret = get_block(inode, start_blk, &tmp, 0);
		if (ret)
			break;

		/* HOLE */
		if (!buffer_mapped(&tmp)) {
			length -= blk_to_logical(inode, 1);
			start_blk++;

			/*
			 * we want to handle the case where there is an
			 * allocated block at the front of the file, and then
			 * nothing but holes up to the end of the file properly,
			 * to make sure that extent at the front gets properly
			 * marked with FIEMAP_EXTENT_LAST
			 */
			if (!past_eof &&
			    blk_to_logical(inode, start_blk) >=
			    blk_to_logical(inode, 0)+i_size_read(inode))
				past_eof = 1;

			/*
			 * first hole after going past the EOF, this is our
			 * last extent
			 */
			if (past_eof && size) {
				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				break;
			}

			/* if we have holes up to/past EOF then we're done */
			if (length <= 0 || past_eof)
				break;
		} else {
			/*
			 * we have gone over the length of what we wanted to
			 * map, and it wasn't the entire file, so add the extent
			 * we got last time and exit.
			 *
			 * This is for the case where say we want to map all the
			 * way up to the second to the last block in a file, but
			 * the last block is a hole, making the second to last
			 * block FIEMAP_EXTENT_LAST.  In this case we want to
			 * see if there is a hole after the second to last block
			 * so we can mark it properly.  If we found data after
			 * we exceeded the length we were requesting, then we
			 * are good to go, just add the extent to the fieinfo
			 * and break
			 */
			if (length <= 0 && !whole_file) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				break;
			}

			/*
			 * if size != 0 then we know we already have an extent
			 * to add, so add it.
			 */
			if (size) {
				ret = fiemap_fill_next_extent(fieinfo, logical,
							      phys, size,
							      flags);
				if (ret)
					break;
			}

			logical = blk_to_logical(inode, start_blk);
			phys = blk_to_logical(inode, tmp.b_blocknr);
			size = tmp.b_size;
			flags = FIEMAP_EXTENT_MERGED;

			length -= tmp.b_size;
			start_blk += logical_to_blk(inode, size);

			/*
			 * If we are past the EOF, then we need to make sure as
			 * soon as we find a hole that the last extent we found
			 * is marked with FIEMAP_EXTENT_LAST
			 */
			if (!past_eof &&
			    logical+size >=
			    blk_to_logical(inode, 0)+i_size_read(inode))
				past_eof = 1;
		}
		cond_resched();
	} while (1);

	/* if ret is 1 then we just hit the end of the extent array */
	if (ret == 1)
		ret = 0;

	return ret;
}
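
In practice __generic_block_fiemap() is usually reached through its wrapper generic_block_fiemap(), which a block-based filesystem wires into its ->fiemap inode operation together with its own get_block_t routine. A hypothetical caller might look like the sketch below; myfs_get_block and myfs_fiemap are invented names and not part of the example.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Placeholder for the filesystem's own block-mapping routine. */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	/* Delegate extent walking to the generic helper above. */
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}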
Example No. 4
static int __init mtd_readtest_init(void)
{
	uint64_t tmp;
	int err, i;

	printk(KERN_INFO "\n");
	printk(KERN_INFO "=================================================\n");

	if (dev < 0) {
		printk(PRINT_PREF "Please specify a valid mtd-device via module paramter\n");
		return -EINVAL;
	}

	printk(PRINT_PREF "MTD device: %d\n", dev);

	mtd = get_mtd_device(NULL, dev);
	if (IS_ERR(mtd)) {
		err = PTR_ERR(mtd);
		printk(PRINT_PREF "error: Cannot get MTD device\n");
		return err;
	}

	if (mtd->writesize == 1) {
		printk(PRINT_PREF "not NAND flash, assume page size is 512 "
		       "bytes.\n");
		pgsize = 512;
	} else
		pgsize = mtd->writesize;

	tmp = mtd->size;
	do_div(tmp, mtd->erasesize);
	ebcnt = tmp;
	pgcnt = mtd->erasesize / pgsize;

	printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, "
	       "page size %u, count of eraseblocks %u, pages per "
	       "eraseblock %u, OOB size %u\n",
	       (unsigned long long)mtd->size, mtd->erasesize,
	       pgsize, ebcnt, pgcnt, mtd->oobsize);

	err = -ENOMEM;
	iobuf = kmalloc(mtd->erasesize, GFP_KERNEL);
	if (!iobuf) {
		printk(PRINT_PREF "error: cannot allocate memory\n");
		goto out;
	}
	iobuf1 = kmalloc(mtd->erasesize, GFP_KERNEL);
	if (!iobuf1) {
		printk(PRINT_PREF "error: cannot allocate memory\n");
		goto out;
	}

	err = scan_for_bad_eraseblocks();
	if (err)
		goto out;

	/* Read all eraseblocks 1 page at a time */
	printk(PRINT_PREF "testing page read\n");
	for (i = 0; i < ebcnt; ++i) {
		int ret;

		if (bbt[i])
			continue;
		ret = read_eraseblock_by_page(i);
		if (ret) {
			dump_eraseblock(i);
			if (!err)
				err = ret;
		}
		cond_resched();
	}

	if (err)
		printk(PRINT_PREF "finished with errors\n");
	else
		printk(PRINT_PREF "finished\n");

out:

	kfree(iobuf);
	kfree(iobuf1);
	kfree(bbt);
	put_mtd_device(mtd);
	if (err)
		printk(PRINT_PREF "error %d occurred\n", err);
	printk(KERN_INFO "=================================================\n");
	return err;
}
Example No. 5
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed. As far as the VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges. This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
						src_start, len, zeropage);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, zeropage);
		cond_resched();

		if (unlikely(err == -EFAULT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
Example No. 6
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage,
					      bool *mmap_changing)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
						src_start, len, zeropage);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, zeropage);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
Example No. 7
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 __u64 start, __u64 len)
{
	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
	__u64 logical = 0, phys = 0, size = 0;
	__u32 flags = 0;
	loff_t isize;
	sector_t blkoff, end_blkoff;
	sector_t delalloc_blkoff;
	unsigned long delalloc_blklen;
	unsigned int blkbits = inode->i_blkbits;
	int ret, n;

	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	isize = i_size_read(inode);

	blkoff = start >> blkbits;
	end_blkoff = (start + len - 1) >> blkbits;

	delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
							&delalloc_blkoff);

	do {
		__u64 blkphy;
		unsigned int maxblocks;

		if (delalloc_blklen && blkoff == delalloc_blkoff) {
			if (size) {
				/* End of the current extent */
				ret = fiemap_fill_next_extent(
					fieinfo, logical, phys, size, flags);
				if (ret)
					break;
			}
			if (blkoff > end_blkoff)
				break;

			flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
			logical = blkoff << blkbits;
			phys = 0;
			size = delalloc_blklen << blkbits;

			blkoff = delalloc_blkoff + delalloc_blklen;
			delalloc_blklen = nilfs_find_uncommitted_extent(
				inode, blkoff, &delalloc_blkoff);
			continue;
		}

		/*
		 * Limit the number of blocks that we look up so as
		 * not to get into the next delayed allocation extent.
		 */
		maxblocks = INT_MAX;
		if (delalloc_blklen)
			maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
					  maxblocks);
		blkphy = 0;

		down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
		n = nilfs_bmap_lookup_contig(
			NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
		up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);

		if (n < 0) {
			int past_eof;

			if (unlikely(n != -ENOENT))
				break; /* error */

			/* HOLE */
			blkoff++;
			past_eof = ((blkoff << blkbits) >= isize);

			if (size) {
				/* End of the current extent */

				if (past_eof)
					flags |= FIEMAP_EXTENT_LAST;

				ret = fiemap_fill_next_extent(
					fieinfo, logical, phys, size, flags);
				if (ret)
					break;
				size = 0;
			}
			if (blkoff > end_blkoff || past_eof)
				break;
		} else {
			if (size) {
				if (phys && blkphy << blkbits == phys + size) {
					/* The current extent goes on */
					size += n << blkbits;
				} else {
					/* Terminate the current extent */
					ret = fiemap_fill_next_extent(
						fieinfo, logical, phys, size,
						flags);
					if (ret || blkoff > end_blkoff)
						break;

					/* Start another extent */
					flags = FIEMAP_EXTENT_MERGED;
					logical = blkoff << blkbits;
					phys = blkphy << blkbits;
					size = n << blkbits;
				}
			} else {
				/* Start a new extent */
				flags = FIEMAP_EXTENT_MERGED;
				logical = blkoff << blkbits;
				phys = blkphy << blkbits;
				size = n << blkbits;
			}
			blkoff += n;
		}
		cond_resched();
	} while (true);

	/* If ret is 1 then we just hit the end of the extent array */
	if (ret == 1)
		ret = 0;

	mutex_unlock(&inode->i_mutex);
	return ret;
}
Example No. 8
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet, while
			 * the other end is scheduled away waiting on discard
			 * I/O completion at scan_swap_map().
			 *
			 * In order to avoid turning this transitory state
			 * into a permanent loop around this -EEXIST case
			 * if !CONFIG_PREEMPT and the I/O completion happens
			 * to be waiting on the CPU waitqueue where we are now
			 * busy looping, we just conditionally invoke the
			 * scheduler here, if there are some more important
			 * tasks to run.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			swap_readpage(new_page);
			return new_page;
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
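
The -EEXIST handling above is worth calling out: the swap entry can sit in a transient SWAP_HAS_CACHE state that only another task can clear, so retrying in a tight loop on a non-preemptible kernel could spin forever on the very CPU that task needs. Calling cond_resched() before retrying breaks that livelock. A generic sketch of the same idiom follows; all names are invented and not taken from the example.

#include <linux/sched.h>
#include <linux/errno.h>

/*
 * Retry a claim that can transiently fail with -EEXIST while another
 * task finishes its part of the work.  @try_claim is a placeholder.
 */
static int claim_slot_retry(int (*try_claim)(void))
{
	int err;

	do {
		err = try_claim();
		if (err == -EEXIST)
			cond_resched();	/* let the other task make progress */
	} while (err == -EEXIST);

	return err;
}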
Example No. 9
/**
 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
 * @dmap: destination page cache
 * @smap: source page cache
 *
 * No pages must be added to the cache during this process.
 * This must be ensured by the caller.
 */
void nilfs_copy_back_pages(struct address_space *dmap,
			   struct address_space *smap)
{
	struct pagevec pvec;
	unsigned int i, n;
	pgoff_t index = 0;
	int err;

	pagevec_init(&pvec, 0);
repeat:
	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
	if (!n)
		return;
	index = pvec.pages[n - 1]->index + 1;

	for (i = 0; i < pagevec_count(&pvec); i++) {
		struct page *page = pvec.pages[i], *dpage;
		pgoff_t offset = page->index;

		lock_page(page);
		dpage = find_lock_page(dmap, offset);
		if (dpage) {
			/* override existing page on the destination cache */
			WARN_ON(PageDirty(dpage));
			nilfs_copy_page(dpage, page, 0);
			unlock_page(dpage);
			page_cache_release(dpage);
		} else {
			struct page *page2;

			/* move the page to the destination cache */
			spin_lock_irq(&smap->tree_lock);
			page2 = radix_tree_delete(&smap->page_tree, offset);
			WARN_ON(page2 != page);

			smap->nrpages--;
			spin_unlock_irq(&smap->tree_lock);

			spin_lock_irq(&dmap->tree_lock);
			err = radix_tree_insert(&dmap->page_tree, offset, page);
			if (unlikely(err < 0)) {
				WARN_ON(err == -EEXIST);
				page->mapping = NULL;
				page_cache_release(page); /* for cache */
			} else {
				page->mapping = dmap;
				dmap->nrpages++;
				if (PageDirty(page))
					radix_tree_tag_set(&dmap->page_tree,
							   offset,
							   PAGECACHE_TAG_DIRTY);
			}
			spin_unlock_irq(&dmap->tree_lock);
		}
		unlock_page(page);
	}
	pagevec_release(&pvec);
	cond_resched();

	goto repeat;
}
Example No. 10
/**
 * nilfs_find_uncommitted_extent - find extent of uncommitted data
 * @inode: inode
 * @start_blk: start block offset (in)
 * @blkoff: start offset of the found extent (out)
 *
 * This function searches an extent of buffers marked "delayed" which
 * starts from a block offset equal to or larger than @start_blk.  If
 * such an extent was found, this will store the start offset in
 * @blkoff and return its length in blocks.  Otherwise, zero is
 * returned.
 */
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
					    sector_t start_blk,
					    sector_t *blkoff)
{
	unsigned int i;
	pgoff_t index;
	unsigned int nblocks_in_page;
	unsigned long length = 0;
	sector_t b;
	struct pagevec pvec;
	struct page *page;

	if (inode->i_mapping->nrpages == 0)
		return 0;

	index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	pagevec_init(&pvec, 0);

repeat:
	pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
					pvec.pages);
	if (pvec.nr == 0)
		return length;

	if (length > 0 && pvec.pages[0]->index > index)
		goto out;

	b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	i = 0;
	do {
		page = pvec.pages[i];

		lock_page(page);
		if (page_has_buffers(page)) {
			struct buffer_head *bh, *head;

			bh = head = page_buffers(page);
			do {
				if (b < start_blk)
					continue;
				if (buffer_delay(bh)) {
					if (length == 0)
						*blkoff = b;
					length++;
				} else if (length > 0) {
					goto out_locked;
				}
			} while (++b, bh = bh->b_this_page, bh != head);
		} else {
			if (length > 0)
				goto out_locked;

			b += nblocks_in_page;
		}
		unlock_page(page);

	} while (++i < pagevec_count(&pvec));

	index = page->index + 1;
	pagevec_release(&pvec);
	cond_resched();
	goto repeat;

out_locked:
	unlock_page(page);
out:
	pagevec_release(&pvec);
	return length;
}
Example No. 11
/**
 * kvmppc_handle_exit
 *
 * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
 */
int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                       unsigned int exit_nr)
{
	enum emulation_result er;
	int r = RESUME_HOST;

	/* update before a new last_exit_type is rewritten */
	kvmppc_update_timing_stats(vcpu);

	local_irq_enable();

	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;

	switch (exit_nr) {
	case BOOKE_INTERRUPT_MACHINE_CHECK:
		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
		kvmppc_dump_vcpu(vcpu);
		r = RESUME_HOST;
		break;

	case BOOKE_INTERRUPT_EXTERNAL:
		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
		if (need_resched())
			cond_resched();
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_DECREMENTER:
		/* Since we switched IVPR back to the host's value, the host
		 * handled this interrupt the moment we enabled interrupts.
		 * Now we just offer it a chance to reschedule the guest. */
		kvmppc_account_exit(vcpu, DEC_EXITS);
		if (need_resched())
			cond_resched();
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_PROGRAM:
		if (vcpu->arch.shared->msr & MSR_PR) {
			/* Program traps generated by user-level software must be handled
			 * by the guest kernel. */
			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
			r = RESUME_GUEST;
			kvmppc_account_exit(vcpu, USR_PR_INST);
			break;
		}

		er = kvmppc_emulate_instruction(run, vcpu);
		switch (er) {
		case EMULATE_DONE:
			/* don't overwrite subtypes, just account kvm_stats */
			kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
			/* Future optimization: only reload non-volatiles if
			 * they were actually modified by emulation. */
			r = RESUME_GUEST_NV;
			break;
		case EMULATE_DO_DCR:
			run->exit_reason = KVM_EXIT_DCR;
			r = RESUME_HOST;
			break;
		case EMULATE_FAIL:
			/* XXX Deliver Program interrupt to guest. */
			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
			       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
			/* For debugging, encode the failing instruction and
			 * report it to userspace. */
			run->hw.hardware_exit_reason = ~0ULL << 32;
			run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
			r = RESUME_HOST;
			break;
		default:
			BUG();
		}
		break;

	case BOOKE_INTERRUPT_FP_UNAVAIL:
		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
		kvmppc_account_exit(vcpu, FP_UNAVAIL);
		r = RESUME_GUEST;
		break;

#ifdef CONFIG_SPE
	case BOOKE_INTERRUPT_SPE_UNAVAIL: {
		if (vcpu->arch.shared->msr & MSR_SPE)
			kvmppc_vcpu_enable_spe(vcpu);
		else
			kvmppc_booke_queue_irqprio(vcpu,
						   BOOKE_IRQPRIO_SPE_UNAVAIL);
		r = RESUME_GUEST;
		break;
	}

	case BOOKE_INTERRUPT_SPE_FP_DATA:
		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_SPE_FP_ROUND:
		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
		r = RESUME_GUEST;
		break;
#else
	case BOOKE_INTERRUPT_SPE_UNAVAIL:
		/*
		 * Guest wants SPE, but host kernel doesn't support it.  Send
		 * an "unimplemented operation" program check to the guest.
		 */
		kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
		r = RESUME_GUEST;
		break;

	/*
	 * These really should never happen without CONFIG_SPE,
	 * as we should never enable the real MSR[SPE] in the guest.
	 */
	case BOOKE_INTERRUPT_SPE_FP_DATA:
	case BOOKE_INTERRUPT_SPE_FP_ROUND:
		printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
		       __func__, exit_nr, vcpu->arch.pc);
		run->hw.hardware_exit_reason = exit_nr;
		r = RESUME_HOST;
		break;
#endif

	case BOOKE_INTERRUPT_DATA_STORAGE:
		kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
		                               vcpu->arch.fault_esr);
		kvmppc_account_exit(vcpu, DSI_EXITS);
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_INST_STORAGE:
		kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr);
		kvmppc_account_exit(vcpu, ISI_EXITS);
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_SYSCALL:
		if (!(vcpu->arch.shared->msr & MSR_PR) &&
		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
			/* KVM PV hypercalls */
			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
			r = RESUME_GUEST;
		} else {
			/* Guest syscalls */
			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
		}
		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
		r = RESUME_GUEST;
		break;

	case BOOKE_INTERRUPT_DTLB_MISS: {
		unsigned long eaddr = vcpu->arch.fault_dear;
		int gtlb_index;
		gpa_t gpaddr;
		gfn_t gfn;

#ifdef CONFIG_KVM_E500
		if (!(vcpu->arch.shared->msr & MSR_PR) &&
		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
			kvmppc_map_magic(vcpu);
			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
			r = RESUME_GUEST;

			break;
		}
#endif

		/* Check the guest TLB. */
		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
		if (gtlb_index < 0) {
			/* The guest didn't have a mapping for it. */
			kvmppc_core_queue_dtlb_miss(vcpu,
			                            vcpu->arch.fault_dear,
			                            vcpu->arch.fault_esr);
			kvmppc_mmu_dtlb_miss(vcpu);
			kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
			r = RESUME_GUEST;
			break;
		}

		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
		gfn = gpaddr >> PAGE_SHIFT;

		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
			/* The guest TLB had a mapping, but the shadow TLB
			 * didn't, and it is RAM. This could be because:
			 * a) the entry is mapping the host kernel, or
			 * b) the guest used a large mapping which we're faking
			 * Either way, we need to satisfy the fault without
			 * invoking the guest. */
			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
			r = RESUME_GUEST;
		} else {
			/* Guest has mapped and accessed a page which is not
			 * actually RAM. */
			vcpu->arch.paddr_accessed = gpaddr;
			r = kvmppc_emulate_mmio(run, vcpu);
			kvmppc_account_exit(vcpu, MMIO_EXITS);
		}

		break;
	}

	case BOOKE_INTERRUPT_ITLB_MISS: {
		unsigned long eaddr = vcpu->arch.pc;
		gpa_t gpaddr;
		gfn_t gfn;
		int gtlb_index;

		r = RESUME_GUEST;

		/* Check the guest TLB. */
		gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
		if (gtlb_index < 0) {
			/* The guest didn't have a mapping for it. */
			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
			kvmppc_mmu_itlb_miss(vcpu);
			kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
			break;
		}

		kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);

		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
		gfn = gpaddr >> PAGE_SHIFT;

		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
			/* The guest TLB had a mapping, but the shadow TLB
			 * didn't. This could be because:
			 * a) the entry is mapping the host kernel, or
			 * b) the guest used a large mapping which we're faking
			 * Either way, we need to satisfy the fault without
			 * invoking the guest. */
			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
		} else {
			/* Guest mapped and leaped at non-RAM! */
			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
		}

		break;
	}

	case BOOKE_INTERRUPT_DEBUG: {
		u32 dbsr;

		vcpu->arch.pc = mfspr(SPRN_CSRR0);

		/* clear IAC events in DBSR register */
		dbsr = mfspr(SPRN_DBSR);
		dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
		mtspr(SPRN_DBSR, dbsr);

		run->exit_reason = KVM_EXIT_DEBUG;
		kvmppc_account_exit(vcpu, DEBUG_EXITS);
		r = RESUME_HOST;
		break;
	}

	default:
		printk(KERN_EMERG "exit_nr %d\n", exit_nr);
		BUG();
	}

	local_irq_disable();

	kvmppc_core_prepare_to_enter(vcpu);

	if (!(r & RESUME_HOST)) {
		/* To avoid clobbering exit_reason, only check for signals if
		 * we aren't already exiting to userspace for some other
		 * reason. */
		if (signal_pending(current)) {
			run->exit_reason = KVM_EXIT_INTR;
			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
		}
	}

	return r;
}
Example No. 12
static int __init mtd_speedtest_init(void)
{
	int err, i, blocks, j, k;
	long speed;
	uint64_t tmp;

	printk(KERN_INFO "\n");
	printk(KERN_INFO "=================================================\n");
	if (count)
		printk(PRINT_PREF "MTD device: %d    count: %d\n", dev, count);
	else
		printk(PRINT_PREF "MTD device: %d\n", dev);

	mtd = get_mtd_device(NULL, dev);
	if (IS_ERR(mtd)) {
		err = PTR_ERR(mtd);
		printk(PRINT_PREF "error: cannot get MTD device\n");
		return err;
	}

	if (mtd->writesize == 1) {
		printk(PRINT_PREF "not NAND flash, assume page size is 512 "
		       "bytes.\n");
		pgsize = 512;
	} else
		pgsize = mtd->writesize;

	tmp = mtd->size;
	do_div(tmp, mtd->erasesize);
	ebcnt = tmp;
	pgcnt = mtd->erasesize / pgsize;

	printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, "
	       "page size %u, count of eraseblocks %u, pages per "
	       "eraseblock %u, OOB size %u\n",
	       (unsigned long long)mtd->size, mtd->erasesize,
	       pgsize, ebcnt, pgcnt, mtd->oobsize);

	if (count > 0 && count < ebcnt)
		ebcnt = count;

	err = -ENOMEM;
	iobuf = kmalloc(mtd->erasesize, GFP_KERNEL);
	if (!iobuf) {
		printk(PRINT_PREF "error: cannot allocate memory\n");
		goto out;
	}

	simple_srand(1);
	set_random_data(iobuf, mtd->erasesize);

	err = scan_for_bad_eraseblocks();
	if (err)
		goto out;

	err = erase_whole_device();
	if (err)
		goto out;

	/* Write all eraseblocks, 1 eraseblock at a time */
	printk(PRINT_PREF "testing eraseblock write speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = write_eraseblock(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "eraseblock write speed is %ld KiB/s\n", speed);

	/* Read all eraseblocks, 1 eraseblock at a time */
	printk(PRINT_PREF "testing eraseblock read speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = read_eraseblock(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "eraseblock read speed is %ld KiB/s\n", speed);

	err = erase_whole_device();
	if (err)
		goto out;

	/* Write all eraseblocks, 1 page at a time */
	printk(PRINT_PREF "testing page write speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = write_eraseblock_by_page(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "page write speed is %ld KiB/s\n", speed);

	/* Read all eraseblocks, 1 page at a time */
	printk(PRINT_PREF "testing page read speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = read_eraseblock_by_page(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "page read speed is %ld KiB/s\n", speed);

	err = erase_whole_device();
	if (err)
		goto out;

	/* Write all eraseblocks, 2 pages at a time */
	printk(PRINT_PREF "testing 2 page write speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = write_eraseblock_by_2pages(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "2 page write speed is %ld KiB/s\n", speed);

	/* Read all eraseblocks, 2 pages at a time */
	printk(PRINT_PREF "testing 2 page read speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = read_eraseblock_by_2pages(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "2 page read speed is %ld KiB/s\n", speed);

	/* Erase all eraseblocks */
	printk(PRINT_PREF "Testing erase speed\n");
	start_timing();
	for (i = 0; i < ebcnt; ++i) {
		if (bbt[i])
			continue;
		err = erase_eraseblock(i);
		if (err)
			goto out;
		cond_resched();
	}
	stop_timing();
	speed = calc_speed();
	printk(PRINT_PREF "erase speed is %ld KiB/s\n", speed);

	/* Multi-block erase all eraseblocks */
	for (k = 1; k < 7; k++) {
		blocks = 1 << k;
		printk(PRINT_PREF "Testing %dx multi-block erase speed\n",
		       blocks);
		start_timing();
		for (i = 0; i < ebcnt; ) {
			for (j = 0; j < blocks && (i + j) < ebcnt; j++)
				if (bbt[i + j])
					break;
			if (j < 1) {
				i++;
				continue;
			}
			err = multiblock_erase(i, j);
			if (err)
				goto out;
			cond_resched();
			i += j;
		}
		stop_timing();
		speed = calc_speed();
		printk(PRINT_PREF "%dx multi-block erase speed is %ld KiB/s\n",
		       blocks, speed);
	}
	printk(PRINT_PREF "finished\n");
out:
	kfree(iobuf);
	kfree(bbt);
	put_mtd_device(mtd);
	if (err)
		printk(PRINT_PREF "error %d occurred\n", err);
	printk(KERN_INFO "=================================================\n");
	return err;
}
Example No. 13
int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs, uint32_t *len, int prio)
{
	int ret = -EAGAIN;
	int blocksneeded = c->resv_blocks_write;
	/* align it */
	minsize = PAD(minsize);

	D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize));
	down(&c->alloc_sem);

	D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n"));

	spin_lock(&c->erase_completion_lock);

	/* this needs a little more thought (true <tglx> :)) */
	while(ret == -EAGAIN) {
		while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) {
			int ret;
			uint32_t dirty, avail;

			/* calculate real dirty size
			 * dirty_size contains blocks on erase_pending_list
			 * those blocks are counted in c->nr_erasing_blocks.
			 * If one block is actually erased, it is no longer counted as dirty_space
			 * but it is counted in c->nr_erasing_blocks, so we add it and subtract it
			 * with c->nr_erasing_blocks * c->sector_size again.
			 * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks
			 * This helps us to force gc and eventually pick a clean block to spread the load.
			 * We add unchecked_size here, as we hopefully will find some space to use.
			 * This will affect the sum only once, as gc first finishes checking
			 * of nodes.
			 */
			dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size;
			if (dirty < c->nospc_dirty_size) {
				if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
					D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n"));
					break;
				}
				D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n",
					  dirty, c->unchecked_size, c->sector_size));

				spin_unlock(&c->erase_completion_lock);
				up(&c->alloc_sem);
				return -ENOSPC;
			}
			
			/* Calc possibly available space. Possibly available means that we
			 * don't know, if unchecked size contains obsoleted nodes, which could give us some
			 * more usable space. This will affect the sum only once, as gc first finishes checking
			 * of nodes.
			 * Return -ENOSPC if the maximum possibly available space is less than or equal to
			 * blocksneeded * sector_size.
			 * This blocks endless gc looping on a filesystem, which is nearly full, even if
			 * the check above passes.
			 */
			avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size;
			if ( (avail / c->sector_size) <= blocksneeded) {
				if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) {
					D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n"));
					break;
				}

				D1(printk(KERN_DEBUG "max. available size 0x%08x  < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n",
					  avail, blocksneeded * c->sector_size));
				spin_unlock(&c->erase_completion_lock);
				up(&c->alloc_sem);
				return -ENOSPC;
			}

			up(&c->alloc_sem);

			D1(printk(KERN_DEBUG "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n",
				  c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size,
				  c->free_size + c->dirty_size + c->wasted_size + c->used_size + c->erasing_size + c->bad_size, c->flash_size));
			spin_unlock(&c->erase_completion_lock);
			
			ret = jffs2_garbage_collect_pass(c);
			if (ret)
				return ret;

			cond_resched();

			if (signal_pending(current))
				return -EINTR;

			down(&c->alloc_sem);
			spin_lock(&c->erase_completion_lock);
		}

		ret = jffs2_do_reserve_space(c, minsize, ofs, len);
		if (ret) {
			D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret));
		}
	}
	spin_unlock(&c->erase_completion_lock);
	if (ret)
		up(&c->alloc_sem);
	return ret;
}
Example No. 14
/**
 * console_conditional_schedule - yield the CPU if required
 *
 * If the console code is currently allowed to sleep, and
 * if this CPU should yield the CPU to another task, do
 * so here.
 *
 * Must be called within acquire_console_sem().
 */
void __sched console_conditional_schedule(void)
{
	if (console_may_schedule)
		cond_resched();
}
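
console_conditional_schedule() is the yield point that console-side code is expected to call while it holds the console semaphore; it only reschedules when the console code currently allows sleeping, as the comment above notes. A hypothetical user is sketched below purely to show where the call sits in a flush loop; flush_one_record() is an invented placeholder, not a real kernel function.

#include <linux/console.h>

static int flush_one_record(void);	/* invented: emits one chunk, returns 0 when done */

static void flush_pending_records(void)
{
	acquire_console_sem();
	while (flush_one_record())
		console_conditional_schedule();	/* yield between chunks if needed */
	release_console_sem();
}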
Example No. 15
static int __init mtd_stresstest_init(void)
{
    int err;
    int i, op;
    uint64_t tmp;

    printk(KERN_INFO "\n");
    printk(KERN_INFO "=================================================\n");
    printk(PRINT_PREF "MTD device: %d\n", dev);

    mtd = get_mtd_device(NULL, dev);
    if (IS_ERR(mtd)) {
        err = PTR_ERR(mtd);
        printk(PRINT_PREF "error: cannot get MTD device\n");
        return err;
    }

    if (mtd->writesize == 1) {
        printk(PRINT_PREF "not NAND flash, assume page size is 512 "
               "bytes.\n");
        pgsize = 512;
    } else
        pgsize = mtd->writesize;

    tmp = mtd->size;
    do_div(tmp, mtd->erasesize);
    ebcnt = tmp;
    pgcnt = mtd->erasesize / pgsize;

    printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, "
           "page size %u, count of eraseblocks %u, pages per "
           "eraseblock %u, OOB size %u\n",
           (unsigned long long)mtd->size, mtd->erasesize,
           pgsize, ebcnt, pgcnt, mtd->oobsize);

    if (ebcnt < 2) {
        printk(PRINT_PREF "error: need at least 2 eraseblocks\n");
        err = -ENOSPC;
        goto out_put_mtd;
    }

    /* Read or write up to 2 eraseblocks at a time */
    bufsize = mtd->erasesize * 2;

    err = -ENOMEM;
    readbuf = vmalloc(bufsize);
    writebuf = vmalloc(bufsize);
    offsets = kmalloc(ebcnt * sizeof(int), GFP_KERNEL);
    if (!readbuf || !writebuf || !offsets) {
        printk(PRINT_PREF "error: cannot allocate memory\n");
        goto out;
    }
    for (i = 0; i < ebcnt; i++)
        offsets[i] = mtd->erasesize;
    simple_srand(current->pid);
    for (i = 0; i < bufsize; i++)
        writebuf[i] = simple_rand();

    err = scan_for_bad_eraseblocks();
    if (err)
        goto out;

    /* Do operations */
    printk(PRINT_PREF "doing operations\n");
    for (op = 0; op < count; op++) {
        if ((op & 1023) == 0)
            printk(PRINT_PREF "%d operations done\n", op);
        err = do_operation();
        if (err)
            goto out;
        cond_resched();
    }
    printk(PRINT_PREF "finished, %d operations done\n", op);

out:
    kfree(offsets);
    kfree(bbt);
    vfree(writebuf);
    vfree(readbuf);
out_put_mtd:
    put_mtd_device(mtd);
    if (err)
        printk(PRINT_PREF "error %d occurred\n", err);
    printk(KERN_INFO "=================================================\n");
    return err;
}
Example No. 16
/*
 * Commands to do_syslog:
 *
 * 	0 -- Close the log.  Currently a NOP.
 * 	1 -- Open the log. Currently a NOP.
 * 	2 -- Read from the log.
 * 	3 -- Read all messages remaining in the ring buffer.
 * 	4 -- Read and clear all messages remaining in the ring buffer
 * 	5 -- Clear ring buffer.
 * 	6 -- Disable printk's to console
 * 	7 -- Enable printk's to console
 *	8 -- Set level of messages printed to console
 *	9 -- Return number of unread characters in the log buffer
 *     10 -- Return size of the log buffer
 */
int do_syslog(int type, char __user *buf, int len)
{
	unsigned i, j, limit, count;
	int do_clear = 0;
	char c;
	int error = 0;

	error = security_syslog(type);
	if (error)
		return error;

	switch (type) {
	case 0:		/* Close log */
		break;
	case 1:		/* Open log */
		break;
	case 2:		/* Read from log */
		error = -EINVAL;
		if (!buf || len < 0)
			goto out;
		error = 0;
		if (!len)
			goto out;
		if (!access_ok(VERIFY_WRITE, buf, len)) {
			error = -EFAULT;
			goto out;
		}
		error = wait_event_interruptible(log_wait,
							(log_start - log_end));
		if (error)
			goto out;
		i = 0;
		spin_lock_irq(&logbuf_lock);
		while (!error && (log_start != log_end) && i < len) {
			c = LOG_BUF(log_start);
			log_start++;
			spin_unlock_irq(&logbuf_lock);
			error = __put_user(c,buf);
			buf++;
			i++;
			cond_resched();
			spin_lock_irq(&logbuf_lock);
		}
		spin_unlock_irq(&logbuf_lock);
		if (!error)
			error = i;
		break;
	case 4:		/* Read/clear last kernel messages */
		do_clear = 1;
		/* FALL THRU */
	case 3:		/* Read last kernel messages */
		error = -EINVAL;
		if (!buf || len < 0)
			goto out;
		error = 0;
		if (!len)
			goto out;
		if (!access_ok(VERIFY_WRITE, buf, len)) {
			error = -EFAULT;
			goto out;
		}
		count = len;
		if (count > log_buf_len)
			count = log_buf_len;
		spin_lock_irq(&logbuf_lock);
		if (count > logged_chars)
			count = logged_chars;
		if (do_clear)
			logged_chars = 0;
		limit = log_end;
		/*
		 * __put_user() could sleep, and while we sleep
		 * printk() could overwrite the messages
		 * we try to copy to user space. Therefore
		 * the messages are copied in reverse. <manfreds>
		 */
		for (i = 0; i < count && !error; i++) {
			j = limit-1-i;
			if (j + log_buf_len < log_end)
				break;
			c = LOG_BUF(j);
			spin_unlock_irq(&logbuf_lock);
			error = __put_user(c,&buf[count-1-i]);
			cond_resched();
			spin_lock_irq(&logbuf_lock);
		}
		spin_unlock_irq(&logbuf_lock);
		if (error)
			break;
		error = i;
		if (i != count) {
			int offset = count-error;
			/* buffer overflow during copy, correct user buffer. */
			for (i = 0; i < error; i++) {
				if (__get_user(c,&buf[i+offset]) ||
				    __put_user(c,&buf[i])) {
					error = -EFAULT;
					break;
				}
				cond_resched();
			}
		}
		break;
	case 5:		/* Clear ring buffer */
		logged_chars = 0;
		break;
	case 6:		/* Disable logging to console */
		console_loglevel = minimum_console_loglevel;
		break;
	case 7:		/* Enable logging to console */
		console_loglevel = default_console_loglevel;
		break;
	case 8:		/* Set level of messages printed to console */
		error = -EINVAL;
		if (len < 1 || len > 8)
			goto out;
		if (len < minimum_console_loglevel)
			len = minimum_console_loglevel;
		console_loglevel = len;
		error = 0;
		break;
	case 9:		/* Number of chars in the log buffer */
		error = log_end - log_start;
		break;
	case 10:	/* Size of the log buffer */
		error = log_buf_len;
		break;
	default:
		error = -EINVAL;
		break;
	}
out:
	return error;
}
Example No. 17
File: core.c  Project: 274914765/C
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
    switch (cpu->regs->trapnum) {
    case 13: /* We've intercepted a General Protection Fault. */
        /* Check if this was one of those annoying IN or OUT
         * instructions which we need to emulate.  If so, we just go
         * back into the Guest after we've done it. */
        if (cpu->regs->errcode == 0) {
            if (emulate_insn(cpu))
                return;
        }
        break;
    case 14: /* We've intercepted a Page Fault. */
        /* The Guest accessed a virtual address that wasn't mapped.
         * This happens a lot: we don't actually set up most of the page
         * tables for the Guest at all when we start: as it runs it asks
         * for more and more, and we set them up as required. In this
         * case, we don't even tell the Guest that the fault happened.
         *
         * The errcode tells whether this was a read or a write, and
         * whether kernel or userspace code. */
        if (demand_page(cpu, cpu->arch.last_pagefault,
                cpu->regs->errcode))
            return;

        /* OK, it's really not there (or not OK): the Guest needs to
         * know.  We write out the cr2 value so it knows where the
         * fault occurred.
         *
         * Note that if the Guest were really messed up, this could
         * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
         * lg->lguest_data could be NULL */
        if (cpu->lg->lguest_data &&
            put_user(cpu->arch.last_pagefault,
                 &cpu->lg->lguest_data->cr2))
            kill_guest(cpu, "Writing cr2");
        break;
    case 7: /* We've intercepted a Device Not Available fault. */
        /* If the Guest doesn't want to know, we already restored the
         * Floating Point Unit, so we just continue without telling
         * it. */
        if (!cpu->ts)
            return;
        break;
    case 32 ... 255:
        /* These values mean a real interrupt occurred, in which case
         * the Host handler has already been run.  We just do a
         * friendly check if another process should now be run, then
         * return to run the Guest again */
        cond_resched();
        return;
    case LGUEST_TRAP_ENTRY:
        /* Our 'struct hcall_args' maps directly over our regs: we set
         * up the pointer now to indicate a hypercall is pending. */
        cpu->hcall = (struct hcall_args *)cpu->regs;
        return;
    }

    /* We didn't handle the trap, so it needs to go to the Guest. */
    if (!deliver_trap(cpu, cpu->regs->trapnum))
        /* If the Guest doesn't have a handler (either it hasn't
         * registered any yet, or it's one of the faults we don't let
         * it handle), it dies with this cryptic error message. */
        kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
               cpu->regs->trapnum, cpu->regs->eip,
               cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
               : cpu->regs->errcode);
}
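The examples gathered here all rely on cond_resched() the way the "case 32 ... 255" branch above does: a friendly check whether another process should run before returning to the Guest. Conceptually (a simplified model only; the real implementation also handles preempt counts and debugging checks), it boils down to:

/* Simplified model of cond_resched(): voluntarily yield the CPU if the
 * scheduler has flagged the current task for rescheduling. */
static inline int cond_resched_sketch(void)
{
	if (need_resched()) {	/* TIF_NEED_RESCHED set for this task? */
		schedule();	/* give other runnable tasks a turn */
		return 1;
	}
	return 0;
}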
Example No. 18
/*
 * Given a Virtual Unit Chain, see if it can be deleted, and if so do it.
 */
static void INFTL_trydeletechain(struct INFTLrecord *inftl, unsigned thisVUC)
{
	struct mtd_info *mtd = inftl->mbd.mtd;
	unsigned char BlockUsed[MAX_SECTORS_PER_UNIT];
	unsigned char BlockDeleted[MAX_SECTORS_PER_UNIT];
	unsigned int thisEUN, status;
	int block, silly;
	struct inftl_bci bci;
	size_t retlen;

	DEBUG(MTD_DEBUG_LEVEL3, "INFTL: INFTL_trydeletechain(inftl=%p,"
		"thisVUC=%d)\n", inftl, thisVUC);

	memset(BlockUsed, 0, sizeof(BlockUsed));
	memset(BlockDeleted, 0, sizeof(BlockDeleted));

	thisEUN = inftl->VUtable[thisVUC];
	if (thisEUN == BLOCK_NIL) {
		printk(KERN_WARNING "INFTL: trying to delete non-existent "
		       "Virtual Unit Chain %d!\n", thisVUC);
		return;
	}

	/*
	 * Scan through the Erase Units to determine whether any data is in
	 * each of the 512-byte blocks within the Chain.
	 */
	silly = MAX_LOOPS;
	while (thisEUN < inftl->nb_blocks) {
		for (block = 0; block < inftl->EraseSize/SECTORSIZE; block++) {
			if (BlockUsed[block] || BlockDeleted[block])
				continue;

			if (inftl_read_oob(mtd, (thisEUN * inftl->EraseSize)
					   + (block * SECTORSIZE), 8, &retlen,
					  (char *)&bci) < 0)
				status = SECTOR_IGNORE;
			else
				status = bci.Status | bci.Status1;

			switch(status) {
			case SECTOR_FREE:
			case SECTOR_IGNORE:
				break;
			case SECTOR_USED:
				BlockUsed[block] = 1;
				continue;
			case SECTOR_DELETED:
				BlockDeleted[block] = 1;
				continue;
			default:
				printk(KERN_WARNING "INFTL: unknown status "
					"for block %d in EUN %d: 0x%x\n",
					block, thisEUN, status);
			}
		}

		if (!silly--) {
			printk(KERN_WARNING "INFTL: infinite loop in Virtual "
				"Unit Chain 0x%x\n", thisVUC);
			return;
		}

		thisEUN = inftl->PUtable[thisEUN];
	}

	for (block = 0; block < inftl->EraseSize/SECTORSIZE; block++)
		if (BlockUsed[block])
			return;

	/*
	 * For each block in the chain free it and make it available
	 * for future use. Erase from the oldest unit first.
	 */
	DEBUG(MTD_DEBUG_LEVEL1, "INFTL: deleting empty VUC %d\n", thisVUC);

	for (;;) {
		u16 *prevEUN = &inftl->VUtable[thisVUC];
		thisEUN = *prevEUN;

		/* If the chain is all gone already, we're done */
		if (thisEUN == BLOCK_NIL) {
			DEBUG(MTD_DEBUG_LEVEL2, "INFTL: Empty VUC %d for "
			      "deletion was already absent\n", thisVUC);
			return;
		}

		/* Find oldest unit in chain. */
		while (inftl->PUtable[thisEUN] != BLOCK_NIL) {
			BUG_ON(thisEUN >= inftl->nb_blocks);

			prevEUN = &inftl->PUtable[thisEUN];
			thisEUN = *prevEUN;
		}

		DEBUG(MTD_DEBUG_LEVEL3, "Deleting EUN %d from VUC %d\n",
		      thisEUN, thisVUC);

		if (INFTL_formatblock(inftl, thisEUN) < 0) {
			/*
			 * Could not erase : mark block as reserved.
			 */
			inftl->PUtable[thisEUN] = BLOCK_RESERVED;
		} else {
			/* Correctly erased : mark it as free */
			inftl->PUtable[thisEUN] = BLOCK_FREE;
			inftl->numfreeEUNs++;
		}

		/* Now sort out whatever was pointing to it... */
		*prevEUN = BLOCK_NIL;

		/* Ideally we'd actually be responsive to new
		   requests while we're doing this -- if there's
		   free space why should others be made to wait? */
		cond_resched();
	}

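	/* Not reached: the loop above can only exit through its return paths. */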
	inftl->VUtable[thisVUC] = BLOCK_NIL;
}
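The deletion loop above uses the classic "pointer to the previous link" idiom: prevEUN always refers to the table slot that points at thisEUN, so the oldest Erase Unit can be detached simply by writing BLOCK_NIL through prevEUN. A stand-alone sketch of the same idiom (hypothetical table layout, not INFTL code):

#include <stdint.h>

#define NIL 0xffffu

/* head is the slot naming the first unit of a chain; next[u] names the unit
 * that follows u, or NIL at the end.  Detach the last unit from the chain. */
void unlink_last(uint16_t *head, uint16_t *next)
{
	uint16_t *prev = head;
	uint16_t cur = *prev;

	if (cur == NIL)
		return;			/* chain already empty */

	while (next[cur] != NIL) {	/* walk to the last unit */
		prev = &next[cur];
		cur = *prev;
	}
	*prev = NIL;			/* whatever pointed at it now ends the chain */
}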
Example No. 19
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.  Though it's only a serious concern when an overflowed
	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* 
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it. 
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swcount > 1) {
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				retval = unuse_mm(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (*swap_map > 1 && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (swcount <= 1)
					;
				else if (mm == &init_mm) {
					set_start_mm = 1;
					shmem = shmem_unuse(entry, page);
				} else
					retval = unuse_mm(mm, entry, page);
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * How could swap count reach 0x7fff when the maximum
		 * pid is 0x7fff, and there's no way to repeat a swap
		 * page within an mm (except in shmem, where it's the
		 * shared object which takes the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
		 *
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		if (*swap_map == SWAP_MAP_MAX) {
			spin_lock(&swap_lock);
			*swap_map = 1;
			spin_unlock(&swap_lock);
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Note shmem_unuse already deleted a swappage from
		 * the swap cache, unless the move to filepage failed:
		 * in which case it left swappage in cache, lowered its
		 * swap count to pass quickly through the loops above,
		 * and now we must reincrement count to try again later.
		 */
		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}
		if (PageSwapCache(page)) {
			if (shmem)
				swap_duplicate(entry);
			else
				delete_from_swap_cache(page);
		}

		/*
		 * So that we could skip searching mms once the swap count
		 * went to 1, we did not mark any present ptes as dirty: we
		 * must mark the page dirty here so shrink_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}
Example No. 20
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held, it will release mmap_sem before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;
		/*
		 * Check that the vma is registered in uffd; this is
		 * required to enforce the VM_MAYWRITE check done at
		 * uffd registration time.
		 */
		if (!dst_vma->vm_userfaultfd_ctx.ctx)
			goto out_unlock;

		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	h = hstate_vma(dst_vma);

	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
								idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_sem.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
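This kernel path is reached from user space through the userfaultfd UFFDIO_COPY ioctl. The sketch below assumes a userfaultfd has already been created and registered with UFFDIO_REGISTER_MODE_MISSING over a hugetlbfs mapping, and shows how a fault-handling thread might resolve a missing huge page; the helper name and error handling are illustrative only.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Hypothetical helper: resolve a missing-page fault on a hugetlb area by
 * asking the kernel to copy one huge page from a staging buffer. */
static int resolve_hugetlb_fault(int uffd, unsigned long fault_addr,
				 void *staging, unsigned long hpage_size)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	/* dst and len must be huge-page aligned/sized, which is exactly what
	 * __mcopy_atomic_hugetlb() validates on the kernel side. */
	copy.dst = fault_addr & ~(hpage_size - 1);
	copy.src = (unsigned long)staging;
	copy.len = hpage_size;
	copy.mode = 0;		/* 0 = wake the faulting thread when done */

	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
		return -1;	/* copy.copy reports progress or a negative error */
	return 0;
}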
Example No. 21
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
{
	unsigned long offset, last_in_cluster;
	int latency_ration = LATENCY_LIMIT;

	/* 
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 */

	si->flags += SWP_SCANNING;
	if (unlikely(!si->cluster_nr)) {
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
			goto lowest;
		spin_unlock(&swap_lock);

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				si->cluster_next = offset-SWAPFILE_CLUSTER+1;
				goto cluster;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}
		spin_lock(&swap_lock);
		goto lowest;
	}

	si->cluster_nr--;
cluster:
	offset = si->cluster_next;
	if (offset > si->highest_bit)
lowest:		offset = si->lowest_bit;
checks:	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (!si->swap_map[offset]) {
		if (offset == si->lowest_bit)
			si->lowest_bit++;
		if (offset == si->highest_bit)
			si->highest_bit--;
		si->inuse_pages++;
		if (si->inuse_pages == si->pages) {
			si->lowest_bit = si->max;
			si->highest_bit = 0;
		}
		si->swap_map[offset] = 1;
		si->cluster_next = offset + 1;
		si->flags -= SWP_SCANNING;
		return offset;
	}

	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);
	goto lowest;

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}
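Both scan loops above drop swap_lock and then bound their own latency: every LATENCY_LIMIT probed entries they offer to reschedule before carrying on. Stripped of the swap-specific details, the pattern is (a kernel-style sketch; the constant value is illustrative):

#define SCAN_LATENCY_LIMIT	256	/* illustrative batch size */

/* Scan a usage map for a free slot, yielding the CPU between batches so a
 * long scan cannot hog the processor. */
static long scan_for_free(const unsigned char *map, long nr_slots)
{
	int latency_ration = SCAN_LATENCY_LIMIT;
	long i;

	for (i = 0; i < nr_slots; i++) {
		if (!map[i])
			return i;			/* free slot found */
		if (unlikely(--latency_ration < 0)) {
			cond_resched();			/* let others run */
			latency_ration = SCAN_LATENCY_LIMIT;
		}
	}
	return -1;					/* none free */
}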
Example No. 22
/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated
   with this ino, returning the former in order of version */
static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
				 struct rb_root *tnp, struct jffs2_full_dirent **fdp,
				 uint32_t *highest_version, uint32_t *latest_mctime,
				 uint32_t *mctime_ver)
{
	struct jffs2_raw_node_ref *ref, *valid_ref;
	struct rb_root ret_tn = RB_ROOT;
	struct jffs2_full_dirent *ret_fd = NULL;
	unsigned char *buf = NULL;
	union jffs2_node_union *node;
	size_t retlen;
	int len, err;

	*mctime_ver = 0;

	dbg_readinode("ino #%u\n", f->inocache->ino);

	if (jffs2_is_writebuffered(c)) {
		/*
		 * If we have the write buffer, we assume the minimal I/O unit
		 * is c->wbuf_pagesize. We implement some optimizations in this
		 * case and need a temporary buffer of size =
		 * 2*c->wbuf_pagesize bytes (see comments in read_dnode()).
		 * Basically, we want to read not only the node header, but the
		 * whole wbuf (NAND page in case of NAND) or 2, if the node
		 * header overlaps the border between the 2 wbufs.
		 */
		len = 2*c->wbuf_pagesize;
	} else {
		/*
		 * When there is no write buffer, the size of the temporary
		 * buffer is the size of the largest node header.
		 */
		len = sizeof(union jffs2_node_union);
	}

	/* FIXME: in case of NOR and available ->point() this
	 * needs to be fixed. */
	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock(&c->erase_completion_lock);
	valid_ref = jffs2_first_valid_node(f->inocache->nodes);
	if (!valid_ref && f->inocache->ino != 1)
		JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino);
	while (valid_ref) {
		unsigned char *bufstart;

		/* We can hold a pointer to a non-obsolete node without the spinlock,
		   but _obsolete_ nodes may disappear at any time, if the block
		   they're in gets erased. So if we mark 'ref' obsolete while we're
		   not holding the lock, it can go away immediately. For that reason,
		   we find the next valid node first, before processing 'ref'.
		*/
		ref = valid_ref;
		valid_ref = jffs2_first_valid_node(ref->next_in_ino);
		spin_unlock(&c->erase_completion_lock);

		cond_resched();

		/*
		 * At this point we don't know the type of the node we're going
		 * to read, so we do not know the size of its header. In order
		 * to minimize the amount of flash IO we assume the node has
		 * size = JFFS2_MIN_NODE_HEADER.
		 */
		if (jffs2_is_writebuffered(c)) {
			/*
			 * We treat 'buf' as 2 adjacent wbufs. We want to
			 * adjust bufstart so that it points to the
			 * beginning of the node within this wbuf.
			 */
			bufstart = buf + (ref_offset(ref) % c->wbuf_pagesize);
			/* We will read either one wbuf or 2 wbufs. */
			len = c->wbuf_pagesize - (bufstart - buf);
			if (JFFS2_MIN_NODE_HEADER + (int)(bufstart - buf) > c->wbuf_pagesize) {
				/* The header spans the border of the first wbuf */
				len += c->wbuf_pagesize;
			}
		} else {
			bufstart = buf;
			len = JFFS2_MIN_NODE_HEADER;
		}

		dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref));

		/* FIXME: point() */
		err = jffs2_flash_read(c, ref_offset(ref), len,
				       &retlen, bufstart);
		if (err) {
			JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err);
			goto free_out;
		}

		if (retlen < len) {
			JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len);
			err = -EIO;
			goto free_out;
		}

		node = (union jffs2_node_union *)bufstart;

		/* No need to mask in the valid bit; it shouldn't be invalid */
		if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
			JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n",
				     ref_offset(ref), je16_to_cpu(node->u.magic),
				     je16_to_cpu(node->u.nodetype),
				     je32_to_cpu(node->u.totlen),
				     je32_to_cpu(node->u.hdr_crc));
			jffs2_dbg_dump_node(c, ref_offset(ref));
			jffs2_mark_node_obsolete(c, ref);
			goto cont;
		}
		/* Due to poor choice of crc32 seed, an all-zero node will have a correct CRC */
		if (!je32_to_cpu(node->u.hdr_crc) && !je16_to_cpu(node->u.nodetype) &&
		    !je16_to_cpu(node->u.magic) && !je32_to_cpu(node->u.totlen)) {
			JFFS2_NOTICE("All zero node header at %#08x.\n", ref_offset(ref));
			jffs2_mark_node_obsolete(c, ref);
			goto cont;
		}

		switch (je16_to_cpu(node->u.nodetype)) {

		case JFFS2_NODETYPE_DIRENT:

			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) {
				err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf, bufstart);
				if (unlikely(err))
					goto free_out;
			}

			err = read_direntry(c, ref, &node->d, retlen, &ret_fd, latest_mctime, mctime_ver);
			if (err == 1) {
				jffs2_mark_node_obsolete(c, ref);
				break;
			} else if (unlikely(err))
				goto free_out;

			if (je32_to_cpu(node->d.version) > *highest_version)
				*highest_version = je32_to_cpu(node->d.version);

			break;

		case JFFS2_NODETYPE_INODE:

			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) {
				err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf, bufstart);
				if (unlikely(err))
					goto free_out;
			}

			err = read_dnode(c, ref, &node->i, &ret_tn, len, latest_mctime, mctime_ver);
			if (err == 1) {
				jffs2_mark_node_obsolete(c, ref);
				break;
			} else if (unlikely(err))
				goto free_out;

			if (je32_to_cpu(node->i.version) > *highest_version)
				*highest_version = je32_to_cpu(node->i.version);

			break;

		default:
			if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) {
				err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf, bufstart);
				if (unlikely(err))
					goto free_out;
			}

			err = read_unknown(c, ref, &node->u);
			if (err == 1) {
				jffs2_mark_node_obsolete(c, ref);
				break;
			} else if (unlikely(err))
				goto free_out;

		}
	cont:
		spin_lock(&c->erase_completion_lock);
	}

	spin_unlock(&c->erase_completion_lock);
	*tnp = ret_tn;
	*fdp = ret_fd;
	kfree(buf);

	dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n",
			f->inocache->ino, *highest_version, *latest_mctime, *mctime_ver);
	return 0;

 free_out:
	jffs2_free_tmp_dnode_info_list(&ret_tn);
	jffs2_free_full_dirent_list(ret_fd);
	kfree(buf);
	return err;
}
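The hdr_crc check in the function above runs crc32 with seed 0 over the first 8 bytes of the common node header (everything except the CRC field itself). Because the kernel's crc32() applies no pre- or post-inversion, an all-zero header also yields a CRC of 0, which is why the explicit all-zero test follows the CRC test. A small stand-alone illustration of that property (the helper reimplements the same non-inverting, reflected CRC-32 convention and is not the kernel routine):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Reflected CRC-32 (polynomial 0xedb88320) without pre/post inversion,
 * i.e. the same convention as the crc32() call used by JFFS2 above. */
static uint32_t crc32_noinv(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
	}
	return crc;
}

int main(void)
{
	uint8_t zero_hdr[8] = { 0 };

	/* Prints 00000000: an all-zero header would pass the hdr_crc
	 * comparison, matching the comment in the code above. */
	printf("%08x\n", crc32_noinv(0, zero_hdr, sizeof(zero_hdr)));
	return 0;
}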
/**
 * ubifs_budget_space - ensure there is enough space to complete an operation.
 * @c: UBIFS file-system description object
 * @req: budget request
 *
 * This function allocates budget for an operation. It uses a pessimistic
 * approximation of how much flash space the operation needs. The goal of this
 * function is to make sure UBIFS always has flash space to flush all dirty
 * pages, dirty inodes, and dirty znodes (liability). This function may force a
 * commit, garbage collection or write-back. Returns zero in case of success,
 * %-ENOSPC if there is no free space and other negative error codes in case of
 * failures.
 */
int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
{
    int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
    int err, idx_growth, data_growth, dd_growth, retried = 0;

    ubifs_assert(req->new_page <= 1);
    ubifs_assert(req->dirtied_page <= 1);
    ubifs_assert(req->new_dent <= 1);
    ubifs_assert(req->mod_dent <= 1);
    ubifs_assert(req->new_ino <= 1);
    ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
    ubifs_assert(req->dirtied_ino <= 4);
    ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
    ubifs_assert(!(req->new_ino_d & 7));
    ubifs_assert(!(req->dirtied_ino_d & 7));

    data_growth = calc_data_growth(c, req);
    dd_growth = calc_dd_growth(c, req);
    if (!data_growth && !dd_growth)
        return 0;
    idx_growth = calc_idx_growth(c, req);

again:
    spin_lock(&c->space_lock);
    ubifs_assert(c->budg_idx_growth >= 0);
    ubifs_assert(c->budg_data_growth >= 0);
    ubifs_assert(c->budg_dd_growth >= 0);

    if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
        dbg_budg("no space");
        spin_unlock(&c->space_lock);
        return -ENOSPC;
    }

    c->budg_idx_growth += idx_growth;
    c->budg_data_growth += data_growth;
    c->budg_dd_growth += dd_growth;

    err = do_budget_space(c);
    if (likely(!err)) {
        req->idx_growth = idx_growth;
        req->data_growth = data_growth;
        req->dd_growth = dd_growth;
        spin_unlock(&c->space_lock);
        return 0;
    }

    /* Restore the old values */
    c->budg_idx_growth -= idx_growth;
    c->budg_data_growth -= data_growth;
    c->budg_dd_growth -= dd_growth;
    spin_unlock(&c->space_lock);

    if (req->fast) {
        dbg_budg("no space for fast budgeting");
        return err;
    }

    err = make_free_space(c);
    cond_resched();
    if (err == -EAGAIN) {
        dbg_budg("try again");
        goto again;
    } else if (err == -ENOSPC) {
        if (!retried) {
            retried = 1;
            dbg_budg("-ENOSPC, but anyway try once again");
            goto again;
        }
        dbg_budg("FS is full, -ENOSPC");
        c->nospace = 1;
        if (can_use_rp(c) || c->rp_size == 0)
            c->nospace_rp = 1;
        smp_wmb();
    } else
        ubifs_err("cannot budget space, error %d", err);
    return err;
}
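For context, the budgeting call above is normally paired with a release once the operation has been carried out or aborted. The fragment below sketches how a caller inside fs/ubifs typically uses it; the particular request fields are illustrative (they depend on the operation being budgeted), and ubifs_release_budget() is the companion routine that returns unused budget.

/* Sketch of a typical budgeted operation, e.g. creating one inode plus one
 * directory entry while dirtying the parent directory inode. */
static int example_budgeted_op(struct ubifs_info *c)
{
	struct ubifs_budget_req req = {
		.new_ino	= 1,	/* one new inode node */
		.new_dent	= 1,	/* one new directory entry */
		.dirtied_ino	= 1,	/* parent directory becomes dirty */
	};
	int err;

	err = ubifs_budget_space(c, &req);
	if (err)
		return err;		/* typically -ENOSPC when the FS is full */

	/* ... perform the operation that consumes the budgeted space ... */

	ubifs_release_budget(c, &req);
	return 0;
}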