/*
 * Poll handler for /dev/mcelog: report readable when MCE records are
 * pending, either in the legacy mcelog buffer or via the APEI path.
 */
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	unsigned int mask = 0;

	poll_wait(file, &mce_chrdev_wait, wait);

	/* Records queued in the mcelog buffer? */
	if (READ_ONCE(mcelog.next))
		mask = POLLIN | POLLRDNORM;
	/* Otherwise, unread records still available through APEI? */
	else if (!mce_apei_read_done && apei_check_mce())
		mask = POLLIN | POLLRDNORM;

	return mask;
}
/*
 * Look up an inner map in an array-of-maps. Returns the inner bpf_map
 * pointer, or NULL when the key is out of range or the slot is empty.
 */
static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **slot = array_map_lookup_elem(map, key);

	/* READ_ONCE pairs with concurrent replacement of the inner map. */
	return slot ? READ_ONCE(*slot) : NULL;
}
/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_cpu_base *cpu_base;
	unsigned int seq;

	do {
		/*
		 * The timer may be migrated to another CPU base concurrently;
		 * sample the base pointer once and re-validate it after the
		 * seqcount-protected section below.
		 */
		cpu_base = READ_ONCE(timer->base->cpu_base);
		seq = raw_read_seqcount_begin(&cpu_base->seq);

		/* Enqueued, or its callback is currently executing? */
		if (timer->state != HRTIMER_STATE_INACTIVE ||
		    cpu_base->running == timer)
			return true;

		/*
		 * Retry when the seqcount changed (a state transition raced
		 * with us) or the timer moved to a different CPU base.
		 */
	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
		 cpu_base != READ_ONCE(timer->base->cpu_base));

	return false;
}
/*
 * Update host ring buffer after iterating over packets.
 */
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

	/*
	 * Make sure all reads are done before we update the read index since
	 * the writer may start writing to the read area once the read index
	 * is updated.
	 */
	virt_rmb();
	start_read_index = rbi->ring_buffer->read_index;
	/* Publish the iterator's private position as the visible read index. */
	rbi->ring_buffer->read_index = rbi->priv_read_index;

	/* The host only requests wakeups when it advertises pending-send. */
	if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
		return;

	/*
	 * Issue a full memory barrier before making the signaling decision.
	 * Here is the reason for having this barrier:
	 * If the reading of the pend_sz (in this function)
	 * were to be reordered and read before we commit the new read
	 * index (in the calling function) we could
	 * have a problem. If the host were to set the pending_sz after we
	 * have sampled pending_sz and go to sleep before we commit the
	 * read index, we could miss sending the interrupt. Issue a full
	 * memory barrier to address this.
	 */
	virt_mb();

	pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
	if (!pending_sz)
		return;

	/*
	 * Ensure the read of write_index in hv_get_bytes_to_write()
	 * happens after the read of pending_send_sz.
	 */
	virt_rmb();
	curr_write_sz = hv_get_bytes_to_write(rbi);
	bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);

	/*
	 * If there was space before we began iteration,
	 * then host was not blocked.
	 */
	if (curr_write_sz - bytes_read > pending_sz)
		return;

	/* If pending write will not fit, don't give false hope. */
	if (curr_write_sz <= pending_sz)
		return;

	vmbus_setevent(channel);
}
/*
 * Workqueue handler to drive one grace period and invoke any callbacks
 * that become ready as a result. Single-CPU and !PREEMPT operation
 * means that we get away with murder on synchronization. ;-)
 */
void srcu_drive_gp(struct work_struct *wp)
{
	int idx;
	struct rcu_head *lh;
	struct rcu_head *rhp;
	struct srcu_struct *sp;

	sp = container_of(wp, struct srcu_struct, srcu_work);
	if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
		return; /* Already running or nothing to do. */

	/* Remove recently arrived callbacks and wait for readers. */
	WRITE_ONCE(sp->srcu_gp_running, true);
	/* IRQs off: call_srcu() from interrupt context appends to this list. */
	local_irq_disable();
	lh = sp->srcu_cb_head;
	sp->srcu_cb_head = NULL;
	sp->srcu_cb_tail = &sp->srcu_cb_head;
	local_irq_enable();
	/* Flip the index so new readers use the other counter rank. */
	idx = sp->srcu_idx;
	WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
	WRITE_ONCE(sp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
	/* Wait for pre-existing readers on the old index to drain. */
	swait_event_exclusive(sp->srcu_wq,
			      !READ_ONCE(sp->srcu_lock_nesting[idx]));
	WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */

	/* Invoke the callbacks we removed above. */
	while (lh) {
		rhp = lh;
		lh = lh->next;
		local_bh_disable();
		rhp->func(rhp);
		local_bh_enable();
	}

	/*
	 * Enable rescheduling, and if there are more callbacks,
	 * reschedule ourselves. This can race with a call_srcu()
	 * at interrupt level, but the ->srcu_gp_running checks will
	 * straighten that out.
	 */
	WRITE_ONCE(sp->srcu_gp_running, false);
	if (READ_ONCE(sp->srcu_cb_head))
		schedule_work(&sp->srcu_work);
}
/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but with private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
	u32 read_loc = rbi->priv_read_index;
	u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index);

	/* Writer wrapped around the circular buffer? */
	if (write_loc < read_loc)
		return (rbi->ring_datasize - read_loc) + write_loc;

	return write_loc - read_loc;
}
/*
 * Fill out an ACK packet.
 */
static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
				 struct rxrpc_ack_buffer *pkt,
				 rxrpc_seq_t *_hard_ack,
				 rxrpc_seq_t *_top,
				 u8 reason)
{
	rxrpc_serial_t serial;
	rxrpc_seq_t hard_ack, top, seq;
	int ix;
	u32 mtu, jmax;
	u8 *ackp = pkt->acks;

	/* Barrier against rxrpc_input_data(). */
	serial = call->ackr_serial;
	hard_ack = READ_ONCE(call->rx_hard_ack);
	/* Acquire pairs with the producer's store-release of rx_top. */
	top = smp_load_acquire(&call->rx_top);
	*_hard_ack = hard_ack;
	*_top = top;

	pkt->ack.bufferSpace	= htons(8);
	pkt->ack.maxSkew	= htons(call->ackr_skew);
	pkt->ack.firstPacket	= htonl(hard_ack + 1);
	pkt->ack.previousPacket	= htonl(call->ackr_prev_seq);
	pkt->ack.serial		= htonl(serial);
	pkt->ack.reason		= reason;
	pkt->ack.nAcks		= top - hard_ack;

	if (reason == RXRPC_ACK_PING)
		pkt->whdr.flags |= RXRPC_REQUEST_ACK;

	if (after(top, hard_ack)) {
		/* Emit one ACK/NACK byte per outstanding sequence number,
		 * depending on whether the slot holds a received packet.
		 */
		seq = hard_ack + 1;
		do {
			ix = seq & RXRPC_RXTX_BUFF_MASK;
			if (call->rxtx_buffer[ix])
				*ackp++ = RXRPC_ACK_TYPE_ACK;
			else
				*ackp++ = RXRPC_ACK_TYPE_NACK;
			seq++;
		} while (before_eq(seq, top));
	}

	mtu = call->conn->params.peer->if_mtu;
	mtu -= call->conn->params.peer->hdrsize;
	/* Restrict jumbo packets once too many bad ones have been seen. */
	jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max;
	pkt->ackinfo.rxMTU	= htonl(rxrpc_rx_mtu);
	pkt->ackinfo.maxMTU	= htonl(mtu);
	pkt->ackinfo.rwind	= htonl(call->rx_winsize);
	pkt->ackinfo.jumbo_max	= htonl(jmax);

	/* Three trailing padding bytes, included in the returned length. */
	*ackp++ = 0;
	*ackp++ = 0;
	*ackp++ = 0;
	return top - hard_ack + 3;
}
/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	/* Pin the stack so it cannot be freed while we walk it. */
	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The tasks stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		/* Validate each frame pointer before dereferencing it. */
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		/* First return address outside the scheduler is the answer. */
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
		/* Bound the walk: give up after 16 frames or if @p starts running. */
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}
/*
 * Updater thread for an SRCU litmus test: store to y, wait for an SRCU
 * grace period (omitted when FORCE_FAILURE_2 is defined, to demonstrate
 * the failure mode), then load x.
 */
void *thread_update(void *arg)
{
	WRITE_ONCE(y, 1);
#ifndef FORCE_FAILURE_2
	synchronize_srcu(&ss);
#endif
	might_sleep();
	/* Snapshot x after the grace period (if any) has elapsed. */
	__unbuffered_tpr_x = READ_ONCE(x);
	return NULL;
}
static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr,
				      unsigned long end, int node, bool early)
{
	unsigned long next;
	pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early);

	do {
		next = pud_addr_end(addr, end);
		kasan_pmd_populate(pudp, addr, next, node, early);
		/*
		 * Advance to the next pud entry; continue only while the
		 * range is unfinished and the next entry is still empty.
		 * Note the comma expression increments pudp before *pudp is
		 * read, and the && short-circuit avoids reading past the
		 * range when addr == end.
		 */
	} while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp)));
}
void *test_xchg_lock(void *arg) { int me = (long)arg; run_on(me); atomic_inc(&nthreadsrunning); while (READ_ONCE(goflag) == GOFLAG_INIT) poll(NULL, 0, 1); while (READ_ONCE(goflag) == GOFLAG_RUN) { xchg_lock(&testlock); if (owner != -1) lockerr++; lockacqs++; owner = me; poll(NULL, 0, 1); owner = -1; xchg_unlock(&testlock); } return NULL; }
/*
 * Copy the current shadow region into a new pgdir.
 */
void __init kasan_copy_shadow(pgd_t *pgdir)
{
	pgd_t *src = pgd_offset_k(KASAN_SHADOW_START);
	pgd_t *src_end = pgd_offset_k(KASAN_SHADOW_END);
	pgd_t *dst = pgd_offset_raw(pgdir, KASAN_SHADOW_START);

	do {
		/* READ_ONCE guards against torn reads of the pgd entry. */
		set_pgd(dst, READ_ONCE(*src));
		src++;
		dst++;
	} while (src != src_end);
}
/*
 * task->mm can be NULL if the task is the exited group leader. So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
 */
static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		/* t->mm may be cleared concurrently on exit; sample it once. */
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		/*
		 * Live threads of a process share one mm, so the first
		 * thread with a non-NULL mm decides the answer.
		 */
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}
static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel,
			       bool kick_q)
{
	struct hv_ring_buffer_info *rbi = &channel->outbound;

	/* Order our write-index update against the interrupt_mask read. */
	virt_mb();
	if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
		return;

	/* check interrupt_mask before read_index */
	virt_rmb();
	/*
	 * This is the only case we need to signal when the
	 * ring transitions from being empty to non-empty.
	 */
	if (old_write == READ_ONCE(rbi->ring_buffer->read_index))
		vmbus_setevent(channel);

	/* NOTE(review): kick_q is currently unused in this function. */
	return;
}
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	/* Handle pending work flags (signals, resched, ...) before exit. */
	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

	fpregs_assert_state_consistent();
	/* Restore user FPU state if it was deferred. */
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT. Make sure we clear it before
	 * returning to user mode. We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls. The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif

	user_enter_irqoff();

	/* MDS mitigation: clear CPU buffers on the way out to user mode. */
	mds_user_clear_cpu_buffers();
}
/* Shrink the KASAN global quarantine when it exceeds its size limit. */
void quarantine_reduce(void)
{
	size_t new_quarantine_size, percpu_quarantines;
	unsigned long flags;
	struct qlist_head to_free = QLIST_INIT;
	size_t size_to_free = 0;
	struct qlist_node *last;

	/* Lockless fast path: nothing to do while under the limit. */
	if (likely(READ_ONCE(global_quarantine.bytes) <=
		   READ_ONCE(quarantine_size)))
		return;

	spin_lock_irqsave(&quarantine_lock, flags);

	/*
	 * Update quarantine size in case of hotplug. Allocate a fraction of
	 * the installed memory to quarantine minus per-cpu queue limits.
	 */
	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
		QUARANTINE_FRACTION;
	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
	new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
		0 : new_quarantine_size - percpu_quarantines;
	WRITE_ONCE(quarantine_size, new_quarantine_size);

	/* Find how many head entries must go to fall below the low mark. */
	last = global_quarantine.head;
	while (last) {
		struct kmem_cache *cache = qlink_to_cache(last);

		size_to_free += cache->size;
		if (!last->next || size_to_free >
		    global_quarantine.bytes - QUARANTINE_LOW_SIZE)
			break;
		last = last->next;
	}
	qlist_move(&global_quarantine, last, &to_free, size_to_free);

	spin_unlock_irqrestore(&quarantine_lock, flags);

	/* Free outside the lock to keep the critical section short. */
	qlist_free_all(&to_free, NULL);
}
/*
 * Entry point from instrumented code.
 * This is called once per basic-block/edge.
 */
void notrace __sanitizer_cov_trace_pc(void)
{
	struct task_struct *t;
	enum kcov_mode mode;

	t = current;
	/*
	 * We are interested in code coverage as a function of a syscall inputs,
	 * so we ignore code executed in interrupts.
	 * The checks for whether we are in an interrupt are open-coded, because
	 * 1. We can't use in_interrupt() here, since it also returns true
	 *    when we are inside local_bh_disable() section.
	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
	 *    since that leads to slower generated code (three separate tests,
	 *    one for each of the flags).
	 */
	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET | NMI_MASK)))
		return;
	mode = READ_ONCE(t->kcov_mode);
	if (mode == KCOV_MODE_TRACE) {
		unsigned long *area;
		unsigned long pos;

		/*
		 * There is some code that runs in interrupts but for which
		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
		 * interrupts, there are paired barrier()/WRITE_ONCE() in
		 * kcov_ioctl_locked().
		 */
		barrier();
		area = t->kcov_area;
		/* The first word is number of subsequent PCs. */
		pos = READ_ONCE(area[0]) + 1;
		if (likely(pos < t->kcov_size)) {
			/* Record the caller's PC, then publish the new count. */
			area[pos] = _RET_IP_;
			WRITE_ONCE(area[0], pos);
		}
	}
}
/*
 * Return the pud entry for @addr, populating the pgd entry first when it
 * is still empty. Early boot uses the statically allocated shadow pud.
 */
static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr,
				      int node, bool early)
{
	if (pgd_none(READ_ONCE(*pgdp))) {
		phys_addr_t pud_phys;

		if (early)
			pud_phys = __pa_symbol(kasan_early_shadow_pud);
		else
			pud_phys = kasan_alloc_zeroed_page(node);
		__pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE);
	}

	if (early)
		return pud_offset_kimg(pgdp, addr);
	return pud_offset(pgdp, addr);
}
/*
 * apply_to_page_range() callback: rewrite one PTE by clearing then
 * setting the attribute masks carried in @data.
 */
static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
			     void *data)
{
	struct page_change_data *cdata = data;
	pte_t new_pte;

	/* Snapshot the PTE once, then clear and set the requested bits. */
	new_pte = READ_ONCE(*ptep);
	new_pte = clear_pte_bit(new_pte, cdata->clear_mask);
	new_pte = set_pte_bit(new_pte, cdata->set_mask);

	set_pte(ptep, new_pte);
	return 0;
}
/* Deliver SIGIO (or the fcntl-configured signal) to one task. */
static void send_sigio_to_task(struct task_struct *p,
			       struct fown_struct *fown,
			       int fd, int reason, int group)
{
	/*
	 * F_SETSIG can change ->signum lockless in parallel, make
	 * sure we read it once and use the same value throughout.
	 */
	int signum = READ_ONCE(fown->signum);

	if (!sigio_perm(p, fown, signum))
		return;

	switch (signum) {
		/* Declaration in switch scope, shared by the cases below. */
		siginfo_t si;
	default:
		/* Queue a rt signal with the appropriate fd as its value.  We
		   use SI_SIGIO as the source, not SI_KERNEL, since kernel
		   signals always get delivered even if we can't queue.  Failure
		   to queue in this case _should_ be reported; we fall back to
		   SIGIO in that case. --sct */
		clear_siginfo(&si);
		si.si_signo = signum;
		si.si_errno = 0;
		si.si_code = reason;
		/*
		 * Posix definies POLL_IN and friends to be signal
		 * specific si_codes for SIG_POLL. Linux extended
		 * these si_codes to other signals in a way that is
		 * ambiguous if other signals also have signal
		 * specific si_codes. In that case use SI_SIGIO instead
		 * to remove the ambiguity.
		 */
		if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
			si.si_code = SI_SIGIO;

		/* Make sure we are called with one of the POLL_*
		   reasons, otherwise we could leak kernel stack into
		   userspace.  */
		BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
		/*
		 * NOTE(review): with BUG_ON() active the branch below is
		 * unreachable; it only matters if BUG_ON() compiles away.
		 */
		if (reason - POLL_IN >= NSIGPOLL)
			si.si_band = ~0L;
		else
			si.si_band = mangle_poll(band_table[reason - POLL_IN]);
		si.si_fd = fd;
		if (!do_send_sig_info(signum, &si, p, group))
			break;
	/* fall-through: fall back on the old plain SIGIO signal */
	case 0:
		do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
	}
}
/*
 * Return the XDP umem bound to @ring's queue, or NULL when XDP is off,
 * no umems are registered, or no umem is bound to this queue.
 */
struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter,
				struct ixgbe_ring *ring)
{
	bool xdp_on = READ_ONCE(adapter->xdp_prog);
	int qid = ring->ring_idx;

	/*
	 * Bounds-check qid against num_xsk_umems BEFORE indexing
	 * xsk_umems[]: the previous ordering dereferenced
	 * adapter->xsk_umems[qid] first, an out-of-bounds read for
	 * rings beyond the umem array.
	 */
	if (!xdp_on || !adapter->xsk_umems || qid >= adapter->num_xsk_umems ||
	    !adapter->xsk_umems[qid])
		return NULL;

	return adapter->xsk_umems[qid];
}
/* sysfs show: print the current altsetting's interface description string. */
static ssize_t interface_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct usb_interface *intf = to_usb_interface(dev);
	char *desc;

	/* The string pointer may change with the altsetting; sample once. */
	desc = READ_ONCE(intf->cur_altsetting->string);

	return desc ? sprintf(buf, "%s\n", desc) : 0;
}
/*
 * Returns approximate total of the readers' ->unlock_count[] values for the
 * rank of per-CPU counters specified by idx.
 */
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
{
	unsigned long total = 0;
	int cpu;

	/* Sum the per-CPU counters; READ_ONCE tolerates concurrent updates. */
	for_each_possible_cpu(cpu) {
		struct srcu_array *counters = per_cpu_ptr(sp->per_cpu_ref, cpu);

		total += READ_ONCE(counters->unlock_count[idx]);
	}
	return total;
}
/*
 * Stop the net-core TX queue when this queue pair is close to full,
 * then re-check with fresh read counts and possibly restart it.
 */
static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
{
	/* We need to consider both queues that the net core sees as one */
	struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1);
	struct efx_nic *efx = txq1->efx;
	unsigned int fill_level;

	fill_level = max(txq1->insert_count - txq1->old_read_count,
			 txq2->insert_count - txq2->old_read_count);
	if (likely(fill_level < efx->txq_stop_thresh))
		return;

	/* We used the stale old_read_count above, which gives us a
	 * pessimistic estimate of the fill level (which may even
	 * validly be >= efx->txq_entries). Now try again using
	 * read_count (more likely to be a cache miss).
	 *
	 * If we read read_count and then conditionally stop the
	 * queue, it is possible for the completion path to race with
	 * us and complete all outstanding descriptors in the middle,
	 * after which there will be no more completions to wake it.
	 * Therefore we stop the queue first, then read read_count
	 * (with a memory barrier to ensure the ordering), then
	 * restart the queue if the fill level turns out to be low
	 * enough.
	 */
	netif_tx_stop_queue(txq1->core_txq);
	smp_mb();
	txq1->old_read_count = READ_ONCE(txq1->read_count);
	txq2->old_read_count = READ_ONCE(txq2->read_count);

	fill_level = max(txq1->insert_count - txq1->old_read_count,
			 txq2->insert_count - txq2->old_read_count);
	EFX_WARN_ON_ONCE_PARANOID(fill_level >= efx->txq_entries);
	if (likely(fill_level < efx->txq_stop_thresh)) {
		smp_mb();
		if (likely(!efx->loopback_selftest))
			netif_tx_start_queue(txq1->core_txq);
	}
}
/* Reclaim completed TX descriptors from a DXE channel ring. */
static void reap_tx_dxes(struct wcn36xx *wcn, struct wcn36xx_dxe_ch *ch)
{
	struct wcn36xx_dxe_ctl *ctl;
	struct ieee80211_tx_info *info;
	unsigned long flags;

	/*
	 * Make at least one loop of do-while because in case ring is
	 * completely full head and tail are pointing to the same element
	 * and while-do will not make any cycles.
	 */
	spin_lock_irqsave(&ch->lock, flags);
	ctl = ch->tail_blk_ctl;
	do {
		/* Descriptor still owned by the DXE engine: stop reaping. */
		if (READ_ONCE(ctl->desc->ctrl) & WCN36xx_DXE_CTRL_VLD)
			break;
		if (ctl->skb &&
		    READ_ONCE(ctl->desc->ctrl) & WCN36xx_DXE_CTRL_EOP) {
			dma_unmap_single(wcn->dev, ctl->desc->src_addr_l,
					 ctl->skb->len, DMA_TO_DEVICE);
			info = IEEE80211_SKB_CB(ctl->skb);
			if (!(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS)) {
				/* Keep frame until TX status comes */
				ieee80211_free_txskb(wcn->hw, ctl->skb);
			}

			/* Ring space was freed; let mac80211 transmit again. */
			if (wcn->queues_stopped) {
				wcn->queues_stopped = false;
				ieee80211_wake_queues(wcn->hw);
			}

			ctl->skb = NULL;
		}
		ctl = ctl->next;
	} while (ctl != ch->head_blk_ctl);

	ch->tail_blk_ctl = ctl;
	spin_unlock_irqrestore(&ch->lock, flags);
}
/*
 * Entry point from instrumented code.
 * This is called once per basic-block/edge.
 */
void notrace __sanitizer_cov_trace_pc(void)
{
	struct task_struct *t;
	enum kcov_mode mode;

	t = current;
	/*
	 * We are interested in code coverage as a function of a syscall inputs,
	 * so we ignore code executed in interrupts.
	 */
	if (!t || !in_task())
		return;
	mode = READ_ONCE(t->kcov_mode);
	if (mode == KCOV_MODE_TRACE) {
		unsigned long *area;
		unsigned long pos;
		unsigned long ip = _RET_IP_;

#ifdef CONFIG_RANDOMIZE_BASE
		/* Strip the KASLR slide so recorded PCs are stable. */
		ip -= kaslr_offset();
#endif

		/*
		 * There is some code that runs in interrupts but for which
		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
		 * interrupts, there are paired barrier()/WRITE_ONCE() in
		 * kcov_ioctl_locked().
		 */
		barrier();
		area = t->kcov_area;
		/* The first word is number of subsequent PCs. */
		pos = READ_ONCE(area[0]) + 1;
		if (likely(pos < t->kcov_size)) {
			/* Record the (de-slid) PC, then publish the count. */
			area[pos] = ip;
			WRITE_ONCE(area[0], pos);
		}
	}
}
static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; int action = READ_ONCE(gact->tcf_action); #ifdef CONFIG_GACT_PROB { u32 ptype = READ_ONCE(gact->tcfg_ptype); if (ptype) action = gact_rand[ptype](gact); } #endif bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); if (action == TC_ACT_SHOT) qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); tcf_lastuse_update(&gact->tcf_tm); return action; }
/**
 * amdgpu_fence_count_emitted - get the count of emitted fences
 *
 * @ring: ring the fence is associated with
 *
 * Get the number of fences emitted on the requested ring (all asics).
 * Returns the number of emitted fences on the ring. Used by the
 * dynpm code to ring track activity.
 */
unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
{
	uint64_t count;

	/* We are not protected by ring lock when reading the last sequence
	 * but it's ok to report slightly wrong fence count here.
	 */
	amdgpu_fence_process(ring);

	/* Difference of sync_seq and last_seq, biased to stay positive. */
	count = 0x100000000ull
		- atomic_read(&ring->fence_drv.last_seq)
		+ READ_ONCE(ring->fence_drv.sync_seq);

	return lower_32_bits(count);
}
/* Forward MIDI bytes received from the Fireface 800 to the rawmidi layer. */
static void ff800_handle_midi_msg(struct snd_ff *ff, __le32 *buf, size_t length)
{
	int i;

	/* length is in bytes; each MIDI byte arrives in its own 32-bit word. */
	for (i = 0; i < length / 4; i++) {
		u8 byte = le32_to_cpu(buf[i]) & 0xff;
		struct snd_rawmidi_substream *substream;

		/*
		 * Index 0: presumably the FF800 exposes a single MIDI port —
		 * TODO confirm. Re-read each iteration since the substream
		 * pointer may be cleared concurrently.
		 */
		substream = READ_ONCE(ff->tx_midi_substreams[0]);
		if (substream)
			snd_rawmidi_receive(substream, &byte, 1);
	}
}
/*
 * Load a PTE for lockless get_user_pages_fast. Without CONFIG_X2TLB the
 * PTE is a single word and READ_ONCE() suffices; with X2TLB the two
 * halves must be read with the retry protocol described below.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X2TLB
	return READ_ONCE(*ptep);
#else
	/*
	 * With get_user_pages_fast, we walk down the pagetables without
	 * taking any locks. For this we would like to load the pointers
	 * atomically, but that is not possible with 64-bit PTEs. What
	 * we do have is the guarantee that a pte will only either go
	 * from not present to present, or present to not present or both
	 * -- it will not switch to a completely different present page
	 * without a TLB flush in between; something that we are blocking
	 * by holding interrupts off.
	 *
	 * Setting ptes from not present to present goes:
	 * ptep->pte_high = h;
	 * smp_wmb();
	 * ptep->pte_low = l;
	 *
	 * And present to not present goes:
	 * ptep->pte_low = 0;
	 * smp_wmb();
	 * ptep->pte_high = 0;
	 *
	 * We must ensure here that the load of pte_low sees l iff pte_high
	 * sees h. We load pte_high *after* loading pte_low, which ensures we
	 * don't see an older value of pte_high. *Then* we recheck pte_low,
	 * which ensures that we haven't picked up a changed pte high. We might
	 * have got rubbish values from pte_low and pte_high, but we are
	 * guaranteed that pte_low will not have the present bit set *unless*
	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
	 * we're safe.
	 *
	 * gup_get_pte should not be used or copied outside gup.c without being
	 * very careful -- it does not atomically load the pte or anything that
	 * is likely to be useful for you.
	 */
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	/* pte_low changed under us: the snapshot is torn, start over. */
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
#endif
}