Ejemplo n.º 1
0
int __init down_read_trylock_init(void) 
{
	int ret;
	init_rwsem( &rwsem );   	//读写信号量初始化
	printk("<0>after init_rwsem, count: %ld\n",rwsem.count);

	if( EXEC_DOWN_WRITE )
		down_write( &rwsem );   //写者获取读写信号量

	ret = down_read_trylock( &rwsem ); 	//读者尝试获取读写信号量
	if(ret)
	{
		printk("<0>first down_read_trylock, count: %ld\n",rwsem.count);
		ret = down_read_trylock( &rwsem );   
		printk("<0>second down_read_trylock, count: %ld\n",rwsem.count);

		while( rwsem.count != 0 )
		{
			up_read( &rwsem );     	//读者释放读写信号量
		}
	}	
	else
		printk("<0>down_read_trylock failed!\n");

	return 0;
}
Ejemplo n.º 2
0
static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
			   struct list_head *pages, unsigned nr_pages)
{
	int ret, err = -EIO;
	struct inode *inode = mapping->host;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	loff_t start;
	struct page *last;

	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
	if (ret)
		return err;

	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
		ocfs2_inode_unlock(inode, 0);
		return err;
	}

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		goto out_unlock;

	last = list_entry(pages->prev, struct page, lru);
	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
	if (start >= i_size_read(inode))
		goto out_unlock;

	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);

out_unlock:
	up_read(&oi->ip_alloc_sem);
	ocfs2_inode_unlock(inode, 0);

	return err;
}
Ejemplo n.º 3
0
static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
		ret = AOP_TRUNCATED_PAGE;
		goto out_meta_unlock;
	}

	/*
	 * i_size might have just been updated as we grabed the meta lock.  We
	 * might now be discovering a truncate that hit on another node.
	 * block_read_full_page->get_block freaks out if it is asked to read
	 * beyond the end of a file, so we check here.  Callers
	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
	 * and notice that the page they just read isn't needed.
	 *
	 * XXX sys_readahead() seems to get that wrong?
	 */
	if (start >= i_size_read(inode)) {
		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	ret = ocfs2_data_lock_with_page(inode, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out_alloc;
	}

	ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

	ocfs2_data_unlock(inode, 0);
out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
out_meta_unlock:
	ocfs2_meta_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	mlog_exit(ret);
	return ret;
}
Ejemplo n.º 4
0
static int nilfs_test_bdev_super2(struct super_block *s, void *data)
{
	struct nilfs_super_data *sd = data;
	int ret;

	if (s->s_bdev != sd->bdev)
		return 0;

	if (!((s->s_flags | sd->flags) & MS_RDONLY))
		return 1; /* Reuse an old R/W-mode super_block */

	if (s->s_flags & sd->flags & MS_RDONLY) {
		if (down_read_trylock(&s->s_umount)) {
			ret = s->s_root &&
				(sd->cno == NILFS_SB(s)->s_snapshot_cno);
			up_read(&s->s_umount);
			/*
			 * This path is locked with sb_lock by sget().
			 * So, drop_super() causes deadlock.
			 */
			return ret;
		}
	}
	return 0;
}
Ejemplo n.º 5
0
/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_sync_work);
	int		error;

	/*
	 * We shouldn't write/force the log if we are in the mount/unmount
	 * process or on a read only filesystem. The workqueue still needs to be
	 * active in both cases, however, because it is used for inode reclaim
	 * during these times.  Use the s_umount semaphore to provide exclusion
	 * with unmount.
	 */
	if (down_read_trylock(&mp->m_super->s_umount)) {
		if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
			/* dgc: errors ignored here */
			if (mp->m_super->s_frozen == SB_UNFROZEN &&
			    xfs_log_need_covered(mp))
				error = xfs_fs_log_dummy(mp);
			else
				xfs_log_force(mp, 0);

			/* start pushing all the metadata that is currently
			 * dirty */
			xfs_ail_push_all(mp->m_ail);
		}
		up_read(&mp->m_super->s_umount);
	}

	/* queue us up again */
	xfs_syncd_queue_sync(mp);
}
Ejemplo n.º 6
0
static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_cache is unlikely to unmap its
		 * ptes while lock is dropped, so swapoff can make progress.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && unuse_vma(vma, entry, page))
			break;
	}
	up_read(&mm->mmap_sem);
	/*
	 * Currently unuse_mm cannot fail, but leave error handling
	 * at call sites for now, since we change it from time to time.
	 */
	return 0;
}
Ejemplo n.º 7
0
/**
 * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
 * @uf_info: unix file specific part of inode to obtain access to
 *
 * Non-blocking version of nonexclusive access obtaining.
 */
int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
{
	int result;

	result = down_read_trylock(&uf_info->latch);
	if (result)
		nea_grabbed(uf_info);
	return result;
}
Ejemplo n.º 8
0
void l4x_evict_tasks(struct task_struct *exclude)
{
	struct task_struct *p;
	int cnt = 0;

	rcu_read_lock();
	for_each_process(p) {
		l4_cap_idx_t t;
		struct mm_struct *mm;

		if (p == exclude)
			continue;

		task_lock(p);
		mm = p->mm;
		if (!mm) {
			task_unlock(p);
			continue;
		}

		t = ACCESS_ONCE(mm->context.task);

		if (l4_is_invalid_cap(t)) {
			task_unlock(p);
			continue;
		}

		if (down_read_trylock(&mm->mmap_sem)) {
			struct vm_area_struct *vma;
			for (vma = mm->mmap; vma; vma = vma->vm_next)
				if (vma->vm_flags & VM_LOCKED) {
					t = L4_INVALID_CAP;
					break;
				}
			up_read(&mm->mmap_sem);
			if (!vma)
				if (cmpxchg(&mm->context.task, t, L4_INVALID_CAP) != t)
					t = L4_INVALID_CAP;
		} else
			t = L4_INVALID_CAP;

		task_unlock(p);

		if (!l4_is_invalid_cap(t)) {
			l4lx_task_delete_task(t);
			l4lx_task_number_free(t);

			if (++cnt > 10)
				break;
		}
	}
	rcu_read_unlock();

	if (cnt == 0)
		pr_info_ratelimited("l4x-evict: Found no process to free.\n");
}
Ejemplo n.º 9
0
static int cfs_access_process_vm(struct task_struct *tsk,
				 struct mm_struct *mm,
				 unsigned long addr,
				 void *buf, int len, int write)
{
	/* Just copied from kernel for the kernels which doesn't
	 * have access_process_vm() exported */
	struct vm_area_struct *vma;
	struct page *page;
	void *old_buf = buf;

	/* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
	 * which is already holding mmap_sem for writes.  If some other
	 * thread gets the write lock in the meantime, this thread will
	 * block, but at least it won't deadlock on itself.  LU-1735 */
	if (down_read_trylock(&mm->mmap_sem) == 0)
		return -EDEADLK;

	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, rc, offset;
		void *maddr;

		rc = get_user_pages(tsk, mm, addr, 1,
				     write, 1, &page, &vma);
		if (rc <= 0)
			break;

		bytes = len;
		offset = addr & (PAGE_SIZE-1);
		if (bytes > PAGE_SIZE-offset)
			bytes = PAGE_SIZE-offset;

		maddr = kmap(page);
		if (write) {
			copy_to_user_page(vma, page, addr,
					  maddr + offset, buf, bytes);
			set_page_dirty_lock(page);
		} else {
			copy_from_user_page(vma, page, addr,
					    buf, maddr + offset, bytes);
		}
		kunmap(page);
		page_cache_release(page);
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}
Ejemplo n.º 10
0
static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
	char *buf)
{
	size_t count;

	/* down_read_trylock returns zero on failure */
	if (!down_read_trylock(&rw_sem))
		return -ERESTARTSYS;

	count = sprintf(buf, "%s\n", foo_kbuff);

	up_read(&rw_sem);
	return count;
}
Ejemplo n.º 11
0
static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
			     (page ? page->index : 0));

	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
		ret = AOP_TRUNCATED_PAGE;
		unlock_page(page);
		unlock = 0;
		down_read(&oi->ip_alloc_sem);
		up_read(&oi->ip_alloc_sem);
		goto out_inode_unlock;
	}

	if (start >= i_size_read(inode)) {
		zero_user(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		ret = ocfs2_readpage_inline(inode, page);
	else
		ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
out_inode_unlock:
	ocfs2_inode_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	return ret;
}
Ejemplo n.º 12
0
static void siw_qp_llp_data_ready(struct sock *sk)
#endif
{
	struct siw_qp		*qp;

	read_lock(&sk->sk_callback_lock);

	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) {
		dprint(DBG_ON, " No QP: %p\n", sk->sk_user_data);
		goto done;
	}
	qp = sk_to_qp(sk);

	if (down_read_trylock(&qp->state_lock)) {
		read_descriptor_t	rd_desc = {.arg.data = qp, .count = 1};

#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0)
		dprint(DBG_SK|DBG_RX, "(QP%d): "
			"state (before tcp_read_sock)=%d, bytes=%x\n",
			QP_ID(qp), qp->attrs.state, bytes);
#else
		dprint(DBG_SK|DBG_RX, "(QP%d): "
			"state (before tcp_read_sock)=%d\n",
			QP_ID(qp), qp->attrs.state);
#endif

		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
			/*
			 * Implements data receive operation during
			 * socket callback. TCP gracefully catches
			 * the case where there is nothing to receive
			 * (not calling siw_tcp_rx_data() then).
			 */
			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);

#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0)
		dprint(DBG_SK|DBG_RX, "(QP%d): "
			"state (after tcp_read_sock)=%d, bytes=%x\n",
			QP_ID(qp), qp->attrs.state, bytes);
#else
		dprint(DBG_SK|DBG_RX, "(QP%d): "
			"state (after tcp_read_sock)=%d\n",
			QP_ID(qp), qp->attrs.state);
#endif

		up_read(&qp->state_lock);
	} else {
Ejemplo n.º 13
0
/*
 * This is used only for read-ahead. Failures or difficult to handle
 * situations are safe to ignore.
 *
 * Right now, we don't bother with BH_Boundary - in-inode extent lists
 * are quite large (243 extents on 4k blocks), so most inodes don't
 * grow out to a tree. If need be, detecting boundary extents could
 * trivially be added in a future version of ocfs2_get_block().
 */
static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
			   struct list_head *pages, unsigned nr_pages)
{
	int ret, err = -EIO;
	struct inode *inode = mapping->host;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	loff_t start;
	struct page *last;

	/*
	 * Use the nonblocking flag for the dlm code to avoid page
	 * lock inversion, but don't bother with retrying.
	 */
	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
	if (ret)
		return err;

	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
		ocfs2_inode_unlock(inode, 0);
		return err;
	}

	/*
	 * Don't bother with inline-data. There isn't anything
	 * to read-ahead in that case anyway...
	 */
	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		goto out_unlock;

	/*
	 * Check whether a remote node truncated this file - we just
	 * drop out in that case as it's not worth handling here.
	 */
	last = list_entry(pages->prev, struct page, lru);
	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
	if (start >= i_size_read(inode))
		goto out_unlock;

	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);

out_unlock:
	up_read(&oi->ip_alloc_sem);
	ocfs2_inode_unlock(inode, 0);

	return err;
}
Ejemplo n.º 14
0
/*
 * Reaps the address space of the give task.
 *
 * Returns true on success and false if none or part of the address space
 * has been reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* failed to reap part of the address space. Try again later */
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	up_read(&mm->mmap_sem);

	return ret;
}
static ssize_t mem_free_percent(void)
{
	unsigned long mem_used_pages = 0;
	u64 val = 0;
	int i = 0;
	struct zram *zram = NULL;
	struct zram_meta *meta = NULL;
	unsigned long total_zram_pages = totalram_pages*total_mem_usage_percent/100;

	for (i = 0; i < zram_get_num_devices(); i++) {

		zram = &zram_devices[i];
		if(!zram || !zram->disk)
		{
			continue;
		}

		if( !down_read_trylock(&zram->init_lock) )
			return -1;

		if(zram->init_done)
		{
			meta = zram->meta;
			if (meta && meta->mem_pool)
			{
				val += zs_get_total_pages(meta->mem_pool);
			}
		}

		up_read(&zram->init_lock);
	}

	mem_used_pages = val;

	pr_debug("ZRAM:totalram_pages:%lu,total_zram_pages:%lu,mem_used_pages:%lu\r\n", totalram_pages, total_zram_pages,mem_used_pages);

	return (mem_used_pages >= total_zram_pages) ? 0 : ((total_zram_pages - mem_used_pages)*100/total_zram_pages);
}
Ejemplo n.º 16
0
static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;
	int ret = 0;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_inactive_list is unlikely to unmap
		 * its ptes while lock is dropped, so swapoff can make progress.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
			break;
	}
	up_read(&mm->mmap_sem);
	return (ret < 0)? ret: 0;
}
Ejemplo n.º 17
0
static int
process_get(struct request_state* req)
{
	struct ub_entry* e;
	struct sk_buff* skb;
	
	while (!down_read_trylock(&rwlock))
		continue;
	e = ub_cache_find(req->key, req->len_key);

	if (!e)
	{
		req->err = -EUBKEYNOTFOUND;
		up_read(&rwlock);
		return req->err;
	}

	/* If we found a suitable ub_entry* e,
	   it will contain the skb to be emitted on the wire.
	   Clone the copy from the hash table under the lock so that headers can be added
	   and responsibility for it can be handed over to the NIC during the send process.
	   This keeps the critical region under the read lock as short and contained as possible. */
	skb = skb_clone(e->skb, GFP_ATOMIC);
	if (unlikely(!skb))
	{
		req->err = -EUBKEYNOTFOUND;
		up_read(&rwlock);
		return req->err;
	}
	up_read(&rwlock);

	/* We should have found an entry, and this means we have a pointer to an skb
	   within the ub_entry struct which we can now use to send directly on the 
	   wire (after pushing some headers on the front). */
	req->skb_tx = skb;
	return 0;
}
static void cpufreq_interactive_timer(unsigned long data)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, data);
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	bool boosted;
	unsigned long mod_min_sample_time;
	int i, max_load;
	unsigned int max_freq;
	unsigned int boosted_freq;
	struct cpufreq_interactive_cpuinfo *picpu;

	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)
		goto exit;

	spin_lock_irqsave(&pcpu->load_lock, flags);
	now = update_load(data);
	delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
	cputime_speedadj = pcpu->cputime_speedadj;
	pcpu->last_evaluated_jiffy = get_jiffies_64();
	spin_unlock_irqrestore(&pcpu->load_lock, flags);

	if (WARN_ON_ONCE(!delta_time))
		goto rearm;

	spin_lock_irqsave(&pcpu->target_freq_lock, flags);
	do_div(cputime_speedadj, delta_time);
	loadadjfreq = (unsigned int)cputime_speedadj * 100;
	cpu_load = loadadjfreq / pcpu->target_freq;
	pcpu->prev_load = cpu_load;
	boosted = boost_val || now < boostpulse_endtime;
	boosted_freq = max(hispeed_freq, pcpu->policy->min);

	if (cpu_load >= go_hispeed_load || boosted) {
		if (pcpu->target_freq < boosted_freq) {
			new_freq = boosted_freq;
		} else {
			new_freq = choose_freq(pcpu, loadadjfreq);

			if (new_freq > freq_calc_thresh)
				new_freq = pcpu->policy->max * cpu_load / 100;

			if (new_freq < boosted_freq)
				new_freq = boosted_freq;
		}
	} else {
		new_freq = choose_freq(pcpu, loadadjfreq);

		if (new_freq > freq_calc_thresh)
			new_freq = pcpu->policy->max * cpu_load / 100;

		if (sync_freq && new_freq < sync_freq) {

			max_load = 0;
			max_freq = 0;

			for_each_online_cpu(i) {
				picpu = &per_cpu(cpuinfo, i);

				if (i == data || picpu->prev_load <
						up_threshold_any_cpu_load)
					continue;

				max_load = max(max_load, picpu->prev_load);
				max_freq = max(max_freq, picpu->target_freq);
			}

			if (max_freq > up_threshold_any_cpu_freq ||
				max_load >= up_threshold_any_cpu_load)
				new_freq = sync_freq;
		}
	}
Ejemplo n.º 19
0
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
				   struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	int fault, sig, code;
	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	tsk = current;
	mm  = tsk->mm;

	/* Enable interrupts if they were enabled in the parent context. */
	if (interrupts_enabled(regs))
		local_irq_enable();

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (in_atomic() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	if (esr & ESR_LNX_EXEC) {
		vm_flags = VM_EXEC;
	} else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) {
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	}

	/*
	 * As per x86, we may deadlock here. However, since the kernel only
	 * validly references user space from well defined areas of the code,
	 * we can bug out early if this is from code which shouldn't.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in which
		 * case, we'll have missed the might_sleep() from down_read().
		 */
		might_sleep();
#ifdef CONFIG_DEBUG_VM
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
#endif
	}

	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);

	/*
	 * If we need to retry but a fatal signal is pending, handle the
	 * signal first. We do not need to release the mmap_sem because it
	 * would already be released in __lock_page_or_retry in mm/filemap.c.
	 */
	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return 0;

	/*
	 * Major/minor page fault accounting is only done on the initial
	 * attempt. If we go through a retry, it is extremely likely that the
	 * page will be found in page cache at that point.
	 */

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
	if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
				      addr);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
				      addr);
		}
		if (fault & VM_FAULT_RETRY) {
			/*
			 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
			 * starvation.
			 */
			mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			mm_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	up_read(&mm->mmap_sem);

	/*
	 * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
	 */
	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
			      VM_FAULT_BADACCESS))))
		return 0;

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		sig = SIGBUS;
		code = BUS_ADRERR;
	} else {
		/*
		 * Something tried to access memory that isn't in our memory
		 * map.
		 */
		sig = SIGSEGV;
		code = fault == VM_FAULT_BADACCESS ?
			SEGV_ACCERR : SEGV_MAPERR;
	}

	__do_user_fault(tsk, addr, esr, sig, code, regs);
	return 0;

no_context:
	__do_kernel_fault(mm, addr, esr, regs);
	return 0;
}
Ejemplo n.º 20
0
static bool __oom_reap_task(struct task_struct *tsk)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma;
	struct mm_struct *mm = NULL;
	struct task_struct *p;
	struct zap_details details = {.check_swap_entries = true,
				      .ignore_dirty = true};
	bool ret = true;

	/*
	 * We have to make sure to not race with the victim exit path
	 * and cause premature new oom victim selection:
	 * __oom_reap_task		exit_mm
	 *   atomic_inc_not_zero
	 *				  mmput
	 *				    atomic_dec_and_test
	 *				  exit_oom_victim
	 *				[...]
	 *				out_of_memory
	 *				  select_bad_process
	 *				    # no TIF_MEMDIE task selects new victim
	 *  unmap_page_range # frees some memory
	 */
	mutex_lock(&oom_lock);

	/*
	 * Make sure we find the associated mm_struct even when the particular
	 * thread has already terminated and cleared its mm.
	 * We might have race with exit path so consider our work done if there
	 * is no mm.
	 */
	p = find_lock_task_mm(tsk);
	if (!p)
		goto unlock_oom;
	mm = p->mm;
	atomic_inc(&mm->mm_users);
	task_unlock(p);

	if (!down_read_trylock(&mm->mmap_sem)) {
		ret = false;
		goto unlock_oom;
	}

	tlb_gather_mmu(&tlb, mm, 0, -1);
	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		if (is_vm_hugetlb_page(vma))
			continue;

		/*
		 * mlocked VMAs require explicit munlocking before unmap.
		 * Let's keep it simple here and skip such VMAs.
		 */
		if (vma->vm_flags & VM_LOCKED)
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
					 &details);
	}
	tlb_finish_mmu(&tlb, 0, -1);
	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
	up_read(&mm->mmap_sem);

	/*
	 * This task can be safely ignored because we cannot do much more
	 * to release its memory.
	 */
	set_bit(MMF_OOM_REAPED, &mm->flags);
unlock_oom:
	mutex_unlock(&oom_lock);
	/*
	 * Drop our reference but make sure the mmput slow path is called from a
	 * different context because we shouldn't risk we get stuck there and
	 * put the oom_reaper out of the way.
	 */
	if (mm)
		mmput_async(mm);
	return ret;
}

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
		schedule_timeout_idle(HZ/10);

	if (attempts > MAX_OOM_REAP_RETRIES) {
		pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
				task_pid_nr(tsk), tsk->comm);
		debug_show_all_locks();
	}

	/*
	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
	 * reasonably reclaimable memory anymore or it is not a good candidate
	 * for the oom victim right now because it cannot release its memory
	 * itself nor by the oom reaper.
	 */
	tsk->oom_reaper_list = NULL;
	exit_oom_victim(tsk);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}
Ejemplo n.º 21
0
/* Read the environment variable of current process specified by @key. */
int cfs_get_environ(const char *key, char *value, int *val_len)
{
	struct mm_struct *mm;
	char *buffer;
	int buf_len = PAGE_CACHE_SIZE;
	int key_len = strlen(key);
	unsigned long addr;
	int rc;
	ENTRY;

	buffer = kmalloc(buf_len, GFP_USER);
	if (!buffer)
		RETURN(-ENOMEM);

	mm = get_task_mm(current);
	if (!mm) {
		kfree(buffer);
		RETURN(-EINVAL);
	}

	/* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
	 * which is already holding mmap_sem for writes.  If some other
	 * thread gets the write lock in the meantime, this thread will
	 * block, but at least it won't deadlock on itself.  LU-1735 */
	if (down_read_trylock(&mm->mmap_sem) == 0) {
		kfree(buffer);
		return -EDEADLK;
	}
	up_read(&mm->mmap_sem);

	addr = mm->env_start;
	while (addr < mm->env_end) {
		int this_len, retval, scan_len;
		char *env_start, *env_end;

		memset(buffer, 0, buf_len);

		this_len = min_t(int, mm->env_end - addr, buf_len);
		retval = cfs_access_process_vm(current, addr, buffer,
					       this_len, 0);
		if (retval != this_len)
			break;

		addr += retval;

		/* Parse the buffer to find out the specified key/value pair.
		 * The "key=value" entries are separated by '\0'. */
		env_start = buffer;
		scan_len = this_len;
		while (scan_len) {
			char *entry;
			int entry_len;

			env_end = memscan(env_start, '\0', scan_len);
			LASSERT(env_end >= env_start &&
				env_end <= env_start + scan_len);

			/* The last entry of this buffer cross the buffer
			 * boundary, reread it in next cycle. */
			if (unlikely(env_end - env_start == scan_len)) {
				/* This entry is too large to fit in buffer */
				if (unlikely(scan_len == this_len)) {
					CERROR("Too long env variable.\n");
					GOTO(out, rc = -EINVAL);
				}
				addr -= scan_len;
				break;
			}

			entry = env_start;
			entry_len = env_end - env_start;

			/* Key length + length of '=' */
			if (entry_len > key_len + 1 &&
			    !memcmp(entry, key, key_len)) {
				entry += key_len + 1;
				entry_len -= key_len + 1;
				/* The 'value' buffer passed in is too small.*/
				if (entry_len >= *val_len)
					GOTO(out, rc = -EOVERFLOW);

				memcpy(value, entry, entry_len);
				*val_len = entry_len;
				GOTO(out, rc = 0);
			}

			scan_len -= (env_end - env_start + 1);
			env_start = env_end + 1;
		}
	}
	GOTO(out, rc = -ENOENT);

out:
	mmput(mm);
	kfree((void *)buffer);
	return rc;
}
Ejemplo n.º 22
0
void decode_address(char *buf, unsigned long address)
{
	struct task_struct *p;
	struct mm_struct *mm;
	unsigned long flags, offset;
	unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic();
	struct rb_node *n;

#ifdef CONFIG_KALLSYMS
	unsigned long symsize;
	const char *symname;
	char *modname;
	char *delim = ":";
	char namebuf[128];
#endif

	buf += sprintf(buf, "<0x%08lx> ", address);

#ifdef CONFIG_KALLSYMS
	
	symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);

	if (symname) {
		
		if (!modname)
			modname = delim = "";
		sprintf(buf, "{ %s%s%s%s + 0x%lx }",
			delim, modname, delim, symname,
			(unsigned long)offset);
		return;
	}
#endif

	if (address >= FIXED_CODE_START && address < FIXED_CODE_END) {
		
		strcat(buf, "/* Maybe fixed code section */");
		return;

	} else if (address < CONFIG_BOOT_LOAD) {
		
		strcat(buf, "/* Maybe null pointer? */");
		return;

	} else if (address >= COREMMR_BASE) {
		strcat(buf, "/* core mmrs */");
		return;

	} else if (address >= SYSMMR_BASE) {
		strcat(buf, "/* system mmrs */");
		return;

	} else if (address >= L1_ROM_START && address < L1_ROM_START + L1_ROM_LENGTH) {
		strcat(buf, "/* on-chip L1 ROM */");
		return;

	} else if (address >= L1_SCRATCH_START && address < L1_SCRATCH_START + L1_SCRATCH_LENGTH) {
		strcat(buf, "/* on-chip scratchpad */");
		return;

	} else if (address >= physical_mem_end && address < ASYNC_BANK0_BASE) {
		strcat(buf, "/* unconnected memory */");
		return;

	} else if (address >= ASYNC_BANK3_BASE + ASYNC_BANK3_SIZE && address < BOOT_ROM_START) {
		strcat(buf, "/* reserved memory */");
		return;

	} else if (address >= L1_DATA_A_START && address < L1_DATA_A_START + L1_DATA_A_LENGTH) {
		strcat(buf, "/* on-chip Data Bank A */");
		return;

	} else if (address >= L1_DATA_B_START && address < L1_DATA_B_START + L1_DATA_B_LENGTH) {
		strcat(buf, "/* on-chip Data Bank B */");
		return;
	}

	if (oops_in_progress) {
		strcat(buf, "/* kernel dynamic memory (maybe user-space) */");
		return;
	}

	write_lock_irqsave(&tasklist_lock, flags);
	for_each_process(p) {
		mm = (in_atomic ? p->mm : get_task_mm(p));
		if (!mm)
			continue;

		if (!down_read_trylock(&mm->mmap_sem)) {
			if (!in_atomic)
				mmput(mm);
			continue;
		}

		for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
			struct vm_area_struct *vma;

			vma = rb_entry(n, struct vm_area_struct, vm_rb);

			if (address >= vma->vm_start && address < vma->vm_end) {
				char _tmpbuf[256];
				char *name = p->comm;
				struct file *file = vma->vm_file;

				if (file) {
					char *d_name = d_path(&file->f_path, _tmpbuf,
						      sizeof(_tmpbuf));
					if (!IS_ERR(d_name))
						name = d_name;
				}


				if ((unsigned long)current >= FIXED_CODE_START &&
				    !((unsigned long)current & 0x3)) {
					if (current->mm &&
					    (address > current->mm->start_code) &&
					    (address < current->mm->end_code))
						offset = address - current->mm->start_code;
					else
						offset = (address - vma->vm_start) +
							 (vma->vm_pgoff << PAGE_SHIFT);

					sprintf(buf, "[ %s + 0x%lx ]", name, offset);
				} else
					sprintf(buf, "[ %s vma:0x%lx-0x%lx]",
						name, vma->vm_start, vma->vm_end);

				up_read(&mm->mmap_sem);
				if (!in_atomic)
					mmput(mm);

				if (buf[0] == '\0')
					sprintf(buf, "[ %s ] dynamic memory", name);

				goto done;
			}
		}

		up_read(&mm->mmap_sem);
		if (!in_atomic)
			mmput(mm);
	}

	sprintf(buf, "/* kernel dynamic memory */");

done:
	write_unlock_irqrestore(&tasklist_lock, flags);
}
Ejemplo n.º 23
0
/*
 * Get user stack entries up to the pcstack_limit; return the number of entries
 * acquired.  If pcstack is NULL, return the number of entries potentially
 * acquirable.
 */
unsigned long dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack,
				 int pcstack_limit)
{
	struct task_struct	*p = current;
	struct mm_struct	*mm = p->mm;
	unsigned long		tos, bos, fpc;
	unsigned long		*sp;
	unsigned long		depth = 0;
	struct vm_area_struct	*stack_vma;
	struct page		*stack_page = NULL;
	struct pt_regs		*regs = current_pt_regs();

	if (pcstack) {
		if (unlikely(pcstack_limit < 2)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return 0;
		}
		*pcstack++ = (uint64_t)p->pid;
		*pcstack++ = (uint64_t)p->tgid;
		pcstack_limit -= 2;
	}

	if (!user_mode(regs))
		goto out;

	/*
	 * There is always at least one address to report: the instruction
	 * pointer itself (frame 0).
	 */
	depth++;

	fpc = instruction_pointer(regs);
	if (pcstack) {
		*pcstack++ = (uint64_t)fpc;
		pcstack_limit--;
	}

	/*
	 * We cannot ustack() if this task has no mm, if this task is a kernel
	 * thread, or when someone else has the mmap_sem or the page_table_lock
	 * (because find_user_vma() ultimately does a __get_user_pages() and
	 * thence a follow_page(), which can take that lock).
	 */
	if (mm == NULL || (p->flags & PF_KTHREAD) ||
	    spin_is_locked(&mm->page_table_lock))
		goto out;

	if (!down_read_trylock(&mm->mmap_sem))
		goto out;
	atomic_inc(&mm->mm_users);

	/*
	 * The following construct can be replaced with:
	 * 	tos = current_user_stack_pointer();
	 * once support for 4.0 is no longer necessary.
	 */
#ifdef CONFIG_X86_64
	tos = current_pt_regs()->sp;
#else
	tos = user_stack_pointer(current_pt_regs());
#endif
	stack_vma = find_user_vma(p, mm, NULL, (unsigned long) tos, 0);
	if (!stack_vma ||
	    stack_vma->vm_start > (unsigned long) tos)
		goto unlock_out;

#ifdef CONFIG_STACK_GROWSUP
#error This code does not yet work on STACK_GROWSUP platforms.
#endif
	bos = stack_vma->vm_end;
	if (stack_guard_page_end(stack_vma, bos))
                bos -= PAGE_SIZE;

	/*
	 * If we have a pcstack, loop as long as we are within the stack limit.
	 * Otherwise, loop until we run out of stack.
	 */
	for (sp = (unsigned long *)tos;
	     sp <= (unsigned long *)bos &&
		     ((pcstack && pcstack_limit > 0) ||
		      !pcstack);
	     sp++) {
		struct vm_area_struct	*code_vma;
		unsigned long		addr;

		/*
		 * Recheck for faultedness and pin at page boundaries.
		 */
		if (!stack_page || (((unsigned long)sp & PAGE_MASK) == 0)) {
			if (stack_page) {
				put_page(stack_page);
				stack_page = NULL;
			}

			if (!find_user_vma(p, mm, &stack_page,
					   (unsigned long) sp, 1))
				break;
		}

		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
		get_user(addr, sp);
		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

		if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) {
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_BADADDR);
			break;
		}

		if (addr == fpc)
			continue;

		code_vma = find_user_vma(p, mm, NULL, addr, 0);

		if (!code_vma || code_vma->vm_start > addr)
			continue;

		if ((addr >= tos && addr <= bos) ||
		    (code_vma->vm_flags & VM_GROWSDOWN)) {
			/* stack address - may need it for the fpstack. */
		} else if (code_vma->vm_flags & VM_EXEC) {
			if (pcstack) {
				*pcstack++ = addr;
				pcstack_limit--;
			}
			depth++;
		}
	}
	if (stack_page != NULL)
		put_page(stack_page);

unlock_out:
	atomic_dec(&mm->mm_users);
	up_read(&mm->mmap_sem);

out:
	if (pcstack)
		while (pcstack_limit--)
			*pcstack++ = 0;

	return depth;
}
Ejemplo n.º 24
0
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
				unsigned long address)
{
	struct vm_area_struct *vma = NULL;
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = SEGV_MAPERR;
	int fault;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	cause >>= 2;

	/* Restart the instruction */
	regs->ea -= 4;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END)) {
		if (user_mode(regs))
			goto bad_area_nosemaphore;
		else
			goto vmalloc_fault;
	}

	if (unlikely(address >= TASK_SIZE))
		goto bad_area_nosemaphore;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (faulthandler_disabled() || !mm)
		goto bad_area_nosemaphore;

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!user_mode(regs) && !search_exception_tables(regs->ea))
			goto bad_area_nosemaphore;
retry:
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	code = SEGV_ACCERR;

	switch (cause) {
	case EXC_SUPERV_INSN_ACCESS:
		goto bad_area;
	case EXC_SUPERV_DATA_ACCESS:
		goto bad_area;
	case EXC_X_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_EXEC))
			goto bad_area;
		break;
	case EXC_R_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_READ))
			goto bad_area;
		break;
	case EXC_W_PROTECTION_FAULT:
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags = FAULT_FLAG_WRITE;
		break;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, flags);

	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
		return;

	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGSEGV)
			goto bad_area;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR)
			current->maj_flt++;
		else
			current->min_flt++;
		if (fault & VM_FAULT_RETRY) {
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;

			/*
			 * No need to up_read(&mm->mmap_sem) as we would
			 * have already released it in __lock_page_or_retry
			 * in mm/filemap.c.
			 */

			goto retry;
		}
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs)) {
		if (unhandled_signal(current, SIGSEGV) && printk_ratelimit()) {
			pr_info("%s: unhandled page fault (%d) at 0x%08lx, "
				"cause %ld\n", current->comm, SIGSEGV, address, cause);
			show_regs(regs);
		}
		_exception(SIGSEGV, regs, code, address);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %08lx",
		address < PAGE_SIZE ? "NULL pointer dereference" :
		"paging request", address);
	pr_alert("ea = %08lx, ra = %08lx, cause = %ld\n", regs->ea, regs->ra,
		cause);
	panic("Oops");
	return;

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (!user_mode(regs))
		goto no_context;
	pagefault_out_of_memory();
	return;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!user_mode(regs))
		goto no_context;

	_exception(SIGBUS, regs, BUS_ADRERR, address);
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Do _not_ use "tsk" here. We might be inside
		 * an interrupt in the middle of a task switch..
		 */
		int offset = pgd_index(address);
		pgd_t *pgd, *pgd_k;
		pud_t *pud, *pud_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		pgd = pgd_current + offset;
		pgd_k = init_mm.pgd + offset;

		if (!pgd_present(*pgd_k))
			goto no_context;
		set_pgd(pgd, *pgd_k);

		pud = pud_offset(pgd, address);
		pud_k = pud_offset(pgd_k, address);
		if (!pud_present(*pud_k))
			goto no_context;
		pmd = pmd_offset(pud, address);
		pmd_k = pmd_offset(pud_k, address);
		if (!pmd_present(*pmd_k))
			goto no_context;
		set_pmd(pmd, *pmd_k);

		pte_k = pte_offset_kernel(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;

		flush_tlb_one(address);
		return;
	}
}
Ejemplo n.º 25
0
/*
 * This routine is responsible for faulting in user pages.
 * It passes the work off to one of the appropriate routines.
 * It returns true if the fault was successfully handled.
 */
static int handle_page_fault(struct pt_regs *regs,
			     int fault_num,
			     int is_page_fault,
			     unsigned long address,
			     int write)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long stack_offset;
	int fault;
	int si_code;
	int is_kernel_mode;
	pgd_t *pgd;

	/* on TILE, protection faults are always writes */
	if (!is_page_fault)
		write = 1;

	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);

	tsk = validate_current();

	/*
	 * Check to see if we might be overwriting the stack, and bail
	 * out if so.  The page fault code is a relatively likely
	 * place to get trapped in an infinite regress, and once we
	 * overwrite the whole stack, it becomes very hard to recover.
	 */
	stack_offset = stack_pointer & (THREAD_SIZE-1);
	if (stack_offset < THREAD_SIZE / 8) {
		pr_alert("Potential stack overrun: sp %#lx\n",
		       stack_pointer);
		show_regs(regs);
		pr_alert("Killing current process %d/%s\n",
		       tsk->pid, tsk->comm);
		do_group_exit(SIGKILL);
	}

	/*
	 * Early on, we need to check for migrating PTE entries;
	 * see homecache.c.  If we find a migrating PTE, we wait until
	 * the backing page claims to be done migrating, then we proceed.
	 * For kernel PTEs, we rewrite the PTE and return and retry.
	 * Otherwise, we treat the fault like a normal "no PTE" fault,
	 * rather than trying to patch up the existing PTE.
	 */
	pgd = get_current_pgd();
	if (handle_migrating_pte(pgd, fault_num, address, regs->pc,
				 is_kernel_mode, write))
		return 1;

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * and that the fault was not a protection fault.
	 */
	if (unlikely(address >= TASK_SIZE &&
		     !is_arch_mappable_range(address, 0))) {
		if (is_kernel_mode && is_page_fault &&
		    vmalloc_fault(pgd, address) >= 0)
			return 1;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		mm = NULL;  /* happy compiler */
		vma = NULL;
		goto bad_area_nosemaphore;
	}

	/*
	 * If we're trying to touch user-space addresses, we must
	 * be either at PL0, or else with interrupts enabled in the
	 * kernel, so either way we can re-enable interrupts here
	 * unless we are doing atomic access to user space with
	 * interrupts disabled.
	 */
	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
		local_irq_enable();

	mm = tsk->mm;

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm) {
		vma = NULL;  /* happy compiler */
		goto bad_area_nosemaphore;
	}

	if (!is_kernel_mode)
		flags |= FAULT_FLAG_USER;

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (is_kernel_mode &&
		    !search_exception_tables(regs->pc)) {
			vma = NULL;  /* happy compiler */
			goto bad_area_nosemaphore;
		}
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (regs->sp < PAGE_OFFSET) {
		/*
		 * accessing the stack below sp is always a bug.
		 */
		if (address < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;

/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	if (fault_num == INT_ITLB_MISS) {
		if (!(vma->vm_flags & VM_EXEC))
			goto bad_area;
	} else if (write) {
#ifdef TEST_VERIFY_AREA
		if (!is_page_fault && regs->cs == KERNEL_CS)
			pr_err("WP fault at "REGFMT"\n", regs->eip);
#endif
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags |= FAULT_FLAG_WRITE;
	} else {
		if (!is_page_fault || !(vma->vm_flags & VM_READ))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
	/*
	 * If this was an asynchronous fault,
	 * restart the appropriate engine.
	 */
	switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
	case INT_DMATLB_MISS:
	case INT_DMATLB_MISS_DWNCL:
	case INT_DMATLB_ACCESS:
	case INT_DMATLB_ACCESS_DWNCL:
		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
		break;
#endif
#if CHIP_HAS_SN_PROC()
	case INT_SNITLB_MISS:
	case INT_SNITLB_MISS_DWNCL:
		__insn_mtspr(SPR_SNCTL,
			     __insn_mfspr(SPR_SNCTL) &
			     ~SPR_SNCTL__FRZPROC_MASK);
		break;
#endif
	}
#endif

	up_read(&mm->mmap_sem);
	return 1;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (!is_kernel_mode) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		force_sig_info_fault("segfault", SIGSEGV, si_code, address,
				     fault_num, tsk, regs);
		return 0;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return 0;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

	/* FIXME: no lookup_address() yet */
#ifdef SUPPORT_LOOKUP_ADDRESS
	if (fault_num == INT_ITLB_MISS) {
		pte_t *pte = lookup_address(address);

		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
			pr_crit("kernel tried to execute"
			       " non-executable page - exploit attempt?"
			       " (uid: %d)\n", current->uid);
	}
#endif
	if (address < PAGE_SIZE)
		pr_alert("Unable to handle kernel NULL pointer dereference\n");
	else
		pr_alert("Unable to handle kernel paging request\n");
	pr_alert(" at virtual address "REGFMT", pc "REGFMT"\n",
		 address, regs->pc);

	show_regs(regs);

	if (unlikely(tsk->pid < 2)) {
		panic("Kernel page fault running %s!",
		      is_idle_task(tsk) ? "the idle task" : "init");
	}

	/*
	 * More FIXME: we should probably copy the i386 here and
	 * implement a generic die() routine.  Not today.
	 */
#ifdef SUPPORT_DIE
	die("Oops", regs);
#endif
	bust_spinlocks(1);

	do_group_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_kernel_mode)
		goto no_context;
	pagefault_out_of_memory();
	return 0;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (is_kernel_mode)
		goto no_context;

	force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address,
			     fault_num, tsk, regs);
	return 0;
}
Ejemplo n.º 26
0
static void cpufreq_interactive_timer(unsigned long data)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, data);
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	bool boosted;
	unsigned long mod_min_sample_time;
	int i, max_load;
	unsigned int max_freq;
	unsigned int boosted_freq;
	struct cpufreq_interactive_cpuinfo *picpu;

	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)
		goto exit;

	spin_lock_irqsave(&pcpu->load_lock, flags);
	pcpu->last_evaluated_jiffy = get_jiffies_64();
	now = update_load(data);
	if (use_sched_hint) {
		/*
		 * Unlock early to avoid deadlock.
		 *
		 * cpufreq_interactive_timer_resched_now() is called
		 * in thread migration notification which already holds
		 * rq lock. Then it locks load_lock to avoid racing with
		 * cpufreq_interactive_timer_resched/start().
		 * sched_get_busy() will also acquire rq lock. Thus we
		 * can't hold load_lock when calling sched_get_busy().
		 *
		 * load_lock used in this function protects time
		 * and load information. These stats are not used when
		 * scheduler hint is available. Thus unlocking load_lock
		 * early is perfectly OK.
		 */
		spin_unlock_irqrestore(&pcpu->load_lock, flags);
		cputime_speedadj = (u64)sched_get_busy(data) *
				pcpu->policy->cpuinfo.max_freq;
		do_div(cputime_speedadj, timer_rate);
	} else {
		delta_time = (unsigned int)
				(now - pcpu->cputime_speedadj_timestamp);
		cputime_speedadj = pcpu->cputime_speedadj;
		spin_unlock_irqrestore(&pcpu->load_lock, flags);
		if (WARN_ON_ONCE(!delta_time))
			goto rearm;
		do_div(cputime_speedadj, delta_time);
	}

	spin_lock_irqsave(&pcpu->target_freq_lock, flags);
	loadadjfreq = (unsigned int)cputime_speedadj * 100;
	cpu_load = loadadjfreq / pcpu->target_freq;
	pcpu->prev_load = cpu_load;
	boosted = boost_val || now < boostpulse_endtime;
	boosted_freq = max(hispeed_freq, pcpu->policy->min);

	if (cpu_load >= go_hispeed_load || boosted) {
		if (pcpu->target_freq < boosted_freq) {
			new_freq = boosted_freq;
		} else {
			new_freq = choose_freq(pcpu, loadadjfreq);

			if (new_freq > freq_calc_thresh)
				new_freq = pcpu->policy->max * cpu_load / 100;

			if (new_freq < boosted_freq)
				new_freq = boosted_freq;
		}
	} else {
		new_freq = choose_freq(pcpu, loadadjfreq);

		if (new_freq > freq_calc_thresh)
			new_freq = pcpu->policy->max * cpu_load / 100;

		if (sync_freq && new_freq < sync_freq) {

			max_load = 0;
			max_freq = 0;

			for_each_online_cpu(i) {
				picpu = &per_cpu(cpuinfo, i);

				if (i == data || picpu->prev_load <
						up_threshold_any_cpu_load)
					continue;

				max_load = max(max_load, picpu->prev_load);
				max_freq = max(max_freq, picpu->target_freq);
			}

			if (max_freq > up_threshold_any_cpu_freq ||
				max_load >= up_threshold_any_cpu_load)
				new_freq = sync_freq;
		}
	}
Ejemplo n.º 27
0
static void cpufreq_interactive_timer(unsigned long data)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, data);
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	bool boosted;
	unsigned long mod_min_sample_time;
	int i, max_load;
	unsigned int max_freq;
	struct cpufreq_interactive_cpuinfo *picpu;

	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)
		goto exit;

	spin_lock_irqsave(&pcpu->load_lock, flags);
	now = update_load(data);
	delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
	cputime_speedadj = pcpu->cputime_speedadj;
	spin_unlock_irqrestore(&pcpu->load_lock, flags);

	if (!delta_time)
		goto rearm;

	do_div(cputime_speedadj, delta_time);
	loadadjfreq = (unsigned int)cputime_speedadj * 100;
	cpu_load = loadadjfreq / pcpu->target_freq;

	pcpu->prev_load = cpu_load;
	boosted = now < (last_input_time + boostpulse_duration_val);

	if (cpu_load >= go_hispeed_load)
	{
		if (pcpu->target_freq < hispeed_freq) 
			new_freq = hispeed_freq;
		else
		{
			new_freq = choose_freq(pcpu, loadadjfreq);

			if (new_freq < hispeed_freq)
				new_freq = hispeed_freq;
		}
	}
	else 
	{
		new_freq = choose_freq(pcpu, loadadjfreq);

		if (sync_freq && new_freq < sync_freq) {

			max_load = 0;
			max_freq = 0;

			for_each_online_cpu(i) {
				picpu = &per_cpu(cpuinfo, i);

				if (i == data || picpu->prev_load <
						up_threshold_any_cpu_load)
					continue;

				max_load = max(max_load, picpu->prev_load);
				max_freq = max(max_freq, picpu->target_freq);
			}

			if (max_freq > up_threshold_any_cpu_freq &&
				max_load >= up_threshold_any_cpu_load)
				new_freq = sync_freq;
		}
	}
Ejemplo n.º 28
0
void show_pid_maps(struct task_struct *task)
{
	struct task_struct *t;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	struct file *file;
	unsigned long long pgoff = 0;
	unsigned long ino = 0;
	dev_t dev = 0;
	int tpid = 0;
	char path_buf[256];
 
	printk(KERN_ALERT "-----------------------------------------------------------\n");
        if((0 == strcmp(current->comm, "BIServer"))|| (0 == strcmp(current->comm, "MainServer")) || (0 == strcmp(current->comm, "PDSServer")) || (0 == strcmp(current->comm, "AppUpdate")))
	{
                printk(KERN_ALERT "* task->pid  (%d)\n", task->pid);
	}
	else
	{
		printk(KERN_ALERT "* dump maps on pid (%d)\n", task->pid);
	}
	printk(KERN_ALERT "-----------------------------------------------------------\n");
 
	if (!down_read_trylock(&task->mm->mmap_sem)) {
		printk(KERN_ALERT "down_read_trylock() failed... do not dump pid maps info\n");
		return;
	}
 
	vma = task->mm->mmap;
	while (vma) {
		file = vma->vm_file;
		if (file) {
			struct inode *inode = file->f_dentry->d_inode;
			dev = inode->i_sb->s_dev;
			ino = inode->i_ino;
			pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
		} else {
			dev = 0;
			ino = 0;
			pgoff = 0;
		}

		printk(KERN_ALERT "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %-10lu ",
				vma->vm_start,
				vma->vm_end,
				vma->vm_flags & VM_READ ? 'r' : '-',
				vma->vm_flags & VM_WRITE ? 'w' : '-',
				vma->vm_flags & VM_EXEC ? 'x' : '-',
				vma->vm_flags & VM_MAYSHARE ? 's' : 'p',
				pgoff,
				MAJOR(dev), MINOR(dev), ino);
 
		if (file) {
			char* p = d_path(&(file->f_path),path_buf, 256);
			if (!IS_ERR(p)) printk("%s", p);
		} else {
			const char *name = arch_vma_name(vma);
			mm = vma->vm_mm;
			tpid = 0;
 
			if (!name) {
				if (mm) {
					if (vma->vm_start <= mm->brk &&
					    vma->vm_end >= mm->start_brk) {
						name = "[heap]";
					} else if (vma->vm_start <= mm->start_stack &&
					           vma->vm_end >= mm->start_stack) {
						name = "[stack]";
					} else {
						t = task;
						do{
							if (vma->vm_start <= t->user_ssp &&
							    vma->vm_end >= t->user_ssp){
								tpid = t->pid;
								name = t->comm;
								break;
							}
						}while_each_thread(task, t);
					}
				} else {
					name = "[vdso]";
				}
			}
			if (name) {
				if (tpid)
					printk("[tstack: %s: %d]", name, tpid);
				else
					printk("%s", name);
			}
		}
		printk( "\n");
 
		vma = vma->vm_next;
	}
Ejemplo n.º 29
0
static ssize_t getvminfo_call(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
  int rc;
  char *fullname;
  char *command_name;
  char *sflag;
  char callbuf[MAX_CALL];
  char resp_line[MAX_LINE];
  struct vm_area_struct *it;
  int flag = 0;
  int i;
  int vma_count;
  int sem_ret = 0;
  long temp = 0;
  unsigned long addr1 = 0;
  unsigned long addr2 = 0;
  //struct vm_operations_struct my_vm_ops;
  /* the user's write() call should not include a count that exceeds
   * the size of the module's buffer for the call string.
   */
  if(count >= MAX_CALL)
     return -EINVAL;

  /* The preempt_disable() and preempt_enable() functions are used in the
   * kernel for preventing preemption.  They are used here to protect
   * global state.
   */
  
  preempt_disable();

  if (call_task != NULL) {
     preempt_enable(); 
     return -EAGAIN;
  }

  respbuf = kmalloc(MAX_RESP, GFP_ATOMIC);
  if (respbuf == NULL) {
     preempt_enable(); 
     return -ENOSPC;
  }
  strcpy(respbuf,""); /* initialize buffer with null string */

  /* current is global for the kernel and contains a pointer to the
   * running process
   */
  call_task = current;

  /* Use the kernel function to copy from user space to kernel space.
   */

  rc = copy_from_user(callbuf, buf, count);
  callbuf[MAX_CALL - 1] = '\0'; /* make sure it is a valid string */
  fullname = kmalloc(MAX_CALL, GFP_ATOMIC);
  strcpy(fullname, callbuf);
  command_name = strsep(&fullname, " ");
  
  if (strcmp(command_name, "getvminfo") != 0) {
      strcpy(respbuf, "Failed: invalid operation\n");
      printk(KERN_DEBUG "getvminfo: call %s will return %s\n", callbuf, respbuf);
      preempt_enable();
      return count;  /* write() calls return the number of bytes written */
  }

  sflag = strsep(&fullname, " ");
  if(!kstrtol(sflag, 10, &temp)){
    flag = (int) temp;
  }

 /*
  *Here we record the information we need to the respbuf and modify the
  *set of pointers in vm_area_struct
  */
  
  sem_ret = 0;
  while(sem_ret == 0){
    sem_ret = down_read_trylock(&call_task -> active_mm -> mmap_sem);
  }
  //So this is the root/head of the list of VMAs allocate!
  it = call_task -> active_mm -> mmap;
  vma_count = call_task -> active_mm -> map_count;
  sprintf(respbuf, "VMAs:\n");
  for(i = 0; i < vma_count; i++)
  {
    sprintf(resp_line, "%d. start: %lu end: %lu flag: %lu vm_ops: %lu\n", i+1, it -> vm_start, it -> vm_end, it -> vm_flags, (unsigned long)it -> vm_ops);
    if((unsigned long) it -> vm_ops != 0){
      //If addr1 is not given the right value
      if(addr1==0){
        addr1 = (unsigned long) it -> vm_ops;
        my_fault1 = it -> vm_ops -> fault;
        my_vm_ops1.open = it -> vm_ops -> open;
        my_vm_ops1.close = it -> vm_ops -> close;
        my_vm_ops1.page_mkwrite = it -> vm_ops -> page_mkwrite;
        my_vm_ops1.access = it -> vm_ops -> access;
        my_vm_ops1.fault = &log_fault1;
        
      //If addr2 is not given the right value
      //And the currnt vm_ops != addr1
      //We have the right value for addr2
      }else if(addr2==0 && (unsigned long) it -> vm_ops != addr1){
        addr2 = (unsigned long) it -> vm_ops;
        my_fault2 = it -> vm_ops -> fault;
        my_vm_ops2.open = it -> vm_ops -> open;
        my_vm_ops2.close = it -> vm_ops -> close;
        my_vm_ops2.page_mkwrite = it -> vm_ops -> page_mkwrite;
        my_vm_ops2.access = it -> vm_ops -> access;
        my_vm_ops2.fault = &log_fault2;
        
      }
    }
    strcat(respbuf, resp_line);
    it = it -> vm_next;
  }  
  if(flag != 0){
  it = call_task -> active_mm -> mmap;
  for(i = 0; i < vma_count; i++)
  {
    if((unsigned long) it -> vm_ops!=0){
      if((unsigned long) it -> vm_ops == addr1){
        it -> vm_ops = &my_vm_ops1;
      }else if((unsigned long) it -> vm_ops == addr2){
        it -> vm_ops = &my_vm_ops2;
      }
    }
    it = it -> vm_next;
  }
  }
  up_read(&call_task -> active_mm -> mmap_sem);
  /* Here the response has been generated and is ready for the user
   * program to access it by a read() call.
   */
  
  printk(KERN_DEBUG "getvminfo: call %s will return %s", callbuf, respbuf);
  preempt_enable();
  
  *ppos = 0;  /* reset the offset to zero */
  return count;  /* write() calls return the number of bytes written */
}
Ejemplo n.º 30
0
/*
 * siw_post_send()
 *
 * Post a list of S-WR's to a SQ.
 *
 * @ofa_qp:	OFA QP contained in siw QP
 * @wr:		Null terminated list of user WR's
 * @bad_wr:	Points to failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
		  struct ib_send_wr **bad_wr)
{
	struct siw_wqe	*wqe = NULL;
	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);

	unsigned long flags;
	int rv = 0;

	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
		QP_ID(qp), qp->attrs.state);

	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}

	if (qp->attrs.state != SIW_QP_STATE_RTS) {
		dprint(DBG_WR|DBG_ON, "(QP%d): state=%d\n",
			QP_ID(qp), qp->attrs.state);
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -ENOTCONN;
	}
	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#1)=%d\n",
		QP_ID(qp), atomic_read(&qp->sq_space));

	while (wr) {
		wqe = siw_wqe_alloc(qp, opcode_ofa2siw(wr->opcode));
		if (!wqe) {
			dprint(DBG_ON, " siw_wqe_alloc\n");
			rv = -ENOMEM;
			break;
		}
		wr_type(wqe) = opcode_ofa2siw(wr->opcode);
		wr_id(wqe) = wr->wr_id;

		wr_flags(wqe) = wr->send_flags;
		if (qp->attrs.flags & SIW_SIGNAL_ALL_WR)
			wr_flags(wqe) |= IB_SEND_SIGNALED;

		if (wr->num_sge > qp->attrs.sq_max_sges) {
			/*
			 * NOTE: we allow for zero length wr's here.
			 */
			dprint(DBG_WR, "(QP%d): Num SGE: %d\n",
				QP_ID(qp), wr->num_sge);
			rv = -EINVAL;
			break;
		}

		switch (wr->opcode) {

		case IB_WR_SEND:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.send.sge,
						  wr->num_sge);
				wqe->wr.send.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);

			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * OFED WR restricts RREAD sink to SGL containing
			 * 1 SGE only. we could relax to SGL with multiple
			 * elements referring the SAME ltag or even sending
			 * a private per-rreq tag referring to a checked
			 * local sgl with MULTIPLE ltag's. would be easy
			 * to do...
			 */
			if (wr->num_sge != 1) {
				rv = -EINVAL;
				break;
			}
			rv = siw_copy_sgl(wr->sg_list, wqe->wr.rread.sge, 1);
			/*
			 * NOTE: zero length RREAD is allowed!
			 */
			wqe->wr.rread.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.rread.rtag = wr->wr.rdma.rkey;
			wqe->wr.rread.num_sge = 1;
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_WRITE:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.send.sge,
						  wr->num_sge);
				wqe->wr.write.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);
			/*
			 * NOTE: zero length WRITE is allowed!
			 */
			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->wr.write.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.write.rtag = wr->wr.rdma.rkey;
			wqe->bytes = rv;
			break;

		default:
			dprint(DBG_WR|DBG_TX|DBG_ON,
				"(QP%d): Opcode %d not yet implemented\n",
				QP_ID(qp), wr->opcode);
			wqe->wr.sgl.num_sge = 0;
			rv = -ENOSYS;
			break;
		}
		dprint(DBG_WR|DBG_TX, "(QP%d): opcode %d, bytes %d, "
				"flags 0x%x\n",
				QP_ID(qp), wr_type(wqe), wqe->bytes,
				wr_flags(wqe));
		if (rv < 0)
			break;

		wqe->wr_status = SR_WR_QUEUED;

		lock_sq_rxsave(qp, flags);
		list_add_tail(&wqe->list, &qp->sq);
		unlock_sq_rxsave(qp, flags);

		wr = wr->next;
	}
	/*
	 * Send directly if SQ processing is not in progress.
	 * Eventual immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
	 * processing, if new work is already pending. But rv must be passed
	 * to caller.
	 */
	lock_sq_rxsave(qp, flags);
	if (tx_wqe(qp) == NULL) {
		struct siw_wqe	*next = siw_next_tx_wqe(qp);
		if (next != NULL) {
			if (wr_type(next) != SIW_WR_RDMA_READ_REQ ||
			    !ORD_SUSPEND_SQ(qp)) {
				tx_wqe(qp) = next;
				if (wr_type(next) != SIW_WR_RDMA_READ_REQ)
					list_del_init(&next->list);
				else
					siw_rreq_queue(next, qp);

				unlock_sq_rxsave(qp, flags);

				dprint(DBG_WR|DBG_TX,
					"(QP%d): Direct sending...\n",
					QP_ID(qp));

				if (qp->attrs.flags & SIW_KERNEL_VERBS)
					siw_sq_queue_work(qp);
				else if (siw_qp_sq_process(qp, 1) != 0 &&
				    !(qp->tx_ctx.tx_suspend))
					siw_qp_cm_drop(qp, 0);
			} else
				unlock_sq_rxsave(qp, flags);
		} else
			unlock_sq_rxsave(qp, flags);
	} else
		unlock_sq_rxsave(qp, flags);

	up_read(&qp->state_lock);

	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#2)=%d\n", QP_ID(qp),
		atomic_read(&qp->sq_space));
	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);

	if (wqe != NULL)
		siw_wqe_put(wqe);
	*bad_wr = wr;
	return rv;
}