// NB: Current implementation copies task structs into and out of queue. This // is fine as long as tasks are small. If they were large we might want to // consider an implementation that uses an extra level of indirection to avoid // the copy operations. // void MS_queue::enqueue(void* t, const int tid) { qnode_t* qn = (qnode_t*)bp->alloc_block(tid); counted_ptr my_tail; qn->t = t; qn->next.p.ptr = 0; // leave sn where it is! while (true) { counted_ptr my_next; mvx(&tail.all, &my_tail.all); mvx(&((qnode_t*)my_tail.p.ptr)->next.all, &my_next.all); counted_ptr my_tail2; mvx(&tail.all, &my_tail2.all); if (my_tail.all == my_tail2.all) { // my_tail and my_next are mutually consistent if (my_next.p.ptr == 0) { // last node; try to link new node after this if (cp_CAS(&((qnode_t*)my_tail.p.ptr)->next, my_next.p.ptr, my_next.p.sn, qn)) { break; // enqueue worked } } else { // try to swing B->tail to next node (void)cp_CAS(&tail, my_tail.p.ptr, my_tail.p.sn, my_next.p.ptr); } } } // try to swing B->tail to newly inserted node (void)cp_CAS(&tail, my_tail.p.ptr, my_tail.p.sn, qn); }
// Return a block to the caller's thread-local free list. When the local list
// grows to 2*GROUP_SIZE blocks, the older GROUP_SIZE of them are pushed onto
// the shared global pool in one lock-free CAS, keeping local lists bounded.
//
// @param block  payload pointer previously handed out by alloc_block
// @param tid    caller's thread id, selecting its private head node
//
// hn->nth caches the (GROUP_SIZE+1)-th node of the local list so that, at the
// 2*GROUP_SIZE high-water mark, the list can be split without walking it:
// everything after hn->nth (the oldest GROUP_SIZE nodes) becomes the group
// that moves to the global pool.
inline void block_pool::free_block(void* block, int tid)
{
    block_head_node_t* hn = &head_nodes[tid];
    shared_block_t* b = make_shared_block_t(block);
    // push onto the private list — no synchronization needed
    b->next = hn->top;
    hn->top = b;
    hn->count++;
    if (hn->count == GROUP_SIZE+1) {
        // remember the split point for a future transfer to the global pool
        hn->nth = hn->top;
    }
    else if (hn->count == GROUP_SIZE * 2) {
        // got a lot of nodes; move some to global pool
        shared_block_t* ng = hn->nth->next;  // head of the group to donate
        while (true) {
            counted_ptr gp;
            unsigned long sn;
            // atomic 64-bit snapshot of the global pool head (ptr + serial)
            mvx(&global_pool->all, &gp.all);
            b = (shared_block_t*)gp.p.ptr;
            sn = gp.p.sn;
            ng->next_group = b;  // link donated group in front of current head
            if (cp_CAS(global_pool, b, sn, ng))
                break;
            // else somebody else got into timing window; try again
        }
        // In real-time code I might want to limit the number of iterations of
        // the above loop, and let my local pool grow bigger when there is very
        // heavy contention for the global pool. In practice I don't expect a
        // problem. Note in particular that the code as written is preemption
        // safe.
        //
        // Truncate the local list at the split point; the donated group is
        // now owned by the global pool.
        hn->nth->next = 0;
        hn->nth = 0;
        hn->count -= GROUP_SIZE;
    }
}
// Returns 0 if queue was empty. Since payloads are required to be // pointers, this is ok. // void* MS_queue::dequeue(const int tid) { counted_ptr my_head, my_tail; qnode_t* my_next; void* rtn; while (true) { mvx(&head.all, &my_head.all); mvx(&tail.all, &my_tail.all); my_next = (qnode_t*)((qnode_t*)my_head.p.ptr)->next.p.ptr; counted_ptr my_head2; mvx(&head.all, &my_head2.all); if (my_head.all == my_head2.all) { // head, tail, and next are mutually consistent if (my_head.p.ptr != my_tail.p.ptr) { // Read value out of node before CAS. Otherwise another dequeue // might free the next node. rtn = my_next->t; // try to swing head to next node if (cp_CAS(&head, my_head.p.ptr, my_head.p.sn, my_next)) { break; // dequeue worked } } else { // queue is empty, or tail is falling behind if (my_next == 0) // queue is empty return 0; // try to swing tail to next node (void)cp_CAS(&tail, my_tail.p.ptr, my_tail.p.sn, my_next); } } } bp->free_block((void*)my_head.p.ptr, tid); return rtn; }
// Hand out a block to the calling thread. Fast path: pop from the thread's
// private list. Slow path: grab a whole group of GROUP_SIZE blocks from the
// shared global pool with one CAS; if that too is empty, fall back to a
// fresh cache-aligned allocation.
//
// @param tid  caller's thread id, selecting its private head node
// @return pointer to the block's payload area
inline void* block_pool::alloc_block(int tid)
{
    block_head_node_t* hn = &head_nodes[tid];
    shared_block_t* b = hn->top;
    if (b) {
        // fast path: private list pop, no synchronization
        hn->top = b->next;
        hn->count--;
        // keep the cached split point (see free_block) consistent
        if (b == hn->nth)
            hn->nth = 0;
    }
    else {
        // local pool is empty
        while (true) {
            counted_ptr gp;
            // atomic 64-bit snapshot of the global pool head (ptr + serial)
            mvx(&global_pool->all, &gp.all);
            if ((b = (shared_block_t*)gp.p.ptr)) {
                unsigned long sn = gp.p.sn;
                if (cp_CAS(global_pool, b, sn, b->next_group)) {
                    // successfully grabbed group from global pool; keep the
                    // first block for ourselves, stash the rest locally
                    hn->top = b->next;
                    hn->count = GROUP_SIZE-1;
                    break;
                }
                // else somebody else got into timing window; try again
            }
            else {
                // global pool is empty: allocate a fresh cache-line-aligned
                // block directly
                b = (shared_block_t*)memalign(CACHELINESIZE, blocksize);
                assert(b != 0);
                break;
            }
        }
        // In real-time code I might want to limit the number of iterations of
        // the above loop, and go ahead and malloc a new node when there is
        // very heavy contention for the global pool. In practice I don't
        // expect a starvation problem. Note in particular that the code as
        // written is preemption safe.
    }
    return (void*)&b->payload;
}
/**
 * Re-compute the min value of children, return true if the value
 * is changed.
 *
 * Recomputes curr's min as the minimum of its own value (my_num) and its
 * children's mins, then publishes the result with a single 64-bit CAS that
 * also bumps the version counter. Retries until the CAS lands or an early
 * exit applies.
 *
 * @param curr  node whose cached min is being refreshed
 * @param n     the arriving value this revisit is on behalf of
 * @return whatever (x.fields.min >= n) evaluates to at the exit point —
 *         NOTE(review): despite the brief above, the code returns this
 *         predicate, not a "value changed" flag; confirm against callers
 */
bool revisit(qc32s_node_t* curr, int32_t n)
{
    while (true) {
        // the word is volatile... get a safe copy of it via 64-bit
        // atomic load
        word64_t x;
        mvx(&curr->word.all, &x.all);
        // if the node is tentative, return: someone else's in-flight arrive
        // owns this node right now
        if (x.fields.word.bits.steady == TENTATIVE)
            return (x.fields.min >= n);
        // compute mvc: min value of children
        //
        // NB: we don't need to do atomic 64-bit reads if we are only
        // working with an aligned 32-bit field within the packed
        // struct
        int32_t mvc = curr->my_num;  // min value of all children
        // children are laid out contiguously from first_child to last_child,
        // so pointer increment walks them all
        qc32s_node_t* begin = curr->first_child;
        qc32s_node_t* end = curr->last_child;
        if (begin != NULL) {
            // NOTE(review): this loop variable 'n' shadows the int32_t
            // parameter 'n' — intentional-looking but easy to misread
            for (qc32s_node_t* n = begin; n <= end; n++) {
                int32_t lmin = n->word.fields.min;
                if (mvc > lmin)
                    mvc = lmin;
            }
        }
        // a smaller-than-published, smaller-than-n min means our view is
        // stale in a way this call can't fix — bail out
        if (mvc < x.fields.min && mvc < n)
            return false;
        // steady iff the min did not decrease below the published value
        uint32_t aok = (mvc >= x.fields.min);
        word64_t temp;
        MAKE_WORD(temp, aok, mvc, x.fields.word.bits.ver + 1);
        if (bcas64(&curr->word.all, x.all, temp.all))
            return (x.fields.min >= n);
        // CAS failed: version moved under us; loop and recompute
    }
}
/**
 * This code propagates the arrival of value 'n' from a child of /this/
 * to /this/ node, and possibly recurses to push the arrival further
 * upward.
 *
 * Protocol (as implemented below): (1) quick exit — if this node is STEADY
 * with min <= n, just bump the version via CAS and return; (2) otherwise
 * lower this node's min to n and mark it TENTATIVE; (3) recurse to the
 * parent, then flip this node back to STEADY if our own value is the one
 * published here. All updates go through versioned 64-bit words; on
 * non-SPARC builds some CASes operate on an aligned 32-bit half of the word.
 *
 * @param n  the arriving value being pushed up the tree
 */
void arrive_internal(int32_t n)
{
    // The first step is to determine if our arrival with a value of 'n',
    // at a decendent of /this/, means that we must change the value of
    // this node. In the ideal case, we don't need to change the value,
    // because this node has a value that is STEADY and <= 'n'. If that
    // is the case, this loop will lead to us returning immediately.
    // Note, however, that we must modify /this/ by incrementing the
    // version number, to avoid a race.
    word64_t x;
    while (true) {
        // atomically read the 64-bit versioned value of /this/
        mvx(&word.all, &x.all);
        // If the node value > n, or if the value is not steady, then we
        // can't take the quick exit
        if ((x.fields.min > n) || (x.fields.word.bits.steady == TENTATIVE))
            break;
        // We can take the quick exit... use a 32-bit CAS to atomically
        // increment the counter field
        word64_t temp;
        MAKE_WORD(temp, x.fields.word.bits.steady, x.fields.min,
                  x.fields.word.bits.ver + 1);
#ifdef STM_CPU_SPARC
        if (bcas64(&word.all, x.all, temp.all))
            return;
#else
        if (bcas32((uint32_t *)&word.all, (uint32_t)x.all, (uint32_t)temp.all))
            return;
#endif
        // [mfs] if we used cas64 instead of bcas64, then it would
        //       automatically reload the temp value for us...
    }
    // if n < this.word.val, then we need to use a CAS to update this
    // node so that its value == n, and so that it is TENTATIVE
    while (true) {
        // if we are not modifying the value, then we can exit this loop
        if (n >= x.fields.min)
            break;
        word64_t temp;
        MAKE_WORD(temp, TENTATIVE, n, x.fields.word.bits.ver + 1);
        if (bcas64(&word.all, x.all, temp.all)) {
            // keep x in sync with what we just published, so the code
            // below sees our own TENTATIVE write
            x.all = temp.all;
            break;
        }
        // reread word
        mvx(&word.all, &x.all);
    }
    // In the common case, the node is TENTATIVE, indicating that some
    // arriver (maybe not me) has updated this node. We need to
    // propagate the value of this node upward, so that we are either (a)
    // propagating our own value up, or (b) propagating a concurrent
    // arriver up. If we don't do this, then a future query by this
    // thread will violate processor consistency, by appearing to happen
    // before this arrive().
    if (x.fields.word.bits.steady == TENTATIVE) {
        // first, we recurse upward to arrive at the parent
        //
        // [mfs] The current way of telling if we are working on the root
        //       is by checking if my_parent is NULL. That may not be
        //       best in the long run.
        if (my_parent) {
            // [mfs] Would rewriting to avoid recursion help?
            my_parent->arrive_internal(n);
        }
        // Once we have successfully propagated upward, we can clear the
        // tentative mark from this node, and then return, which will
        // allow us to clear the tentative mark of descendents.
        //
        // [mfs] It seems that we only clear the tentative mark from a
        //       node if its value is the one that we are putting into
        //       the Mindicator. Otherwise, the (presumably delayed)
        //       concurrent writer will need to clear that flag later.
        //       Is this going to create pathologies, where we must
        //       propagate actions up the tree without actually doing
        //       modications to values, only because there is a
        //       concurrent TENTATIVE action that is delayed?
        if (x.fields.min == n) {
            // [mfs] I really don't like it that we say 'n' here, instead
            //       of x.fields.min. I know they are equal, but every
            //       time I see it I think there is a bug.
            word64_t temp;
            MAKE_WORD(temp, STEADY, n, x.fields.word.bits.ver + 1);
            // [mfs] it is interesting to note that we do not need a
            //       'while' loop around this CAS. Since the x86 and
            //       SPARC guarantee progress for a CAS, we know that a
            //       failure must mean that the version number has
            //       changed, in which case we are competing with another
            //       concurrent operation, and that we can leave without
            //       modifying this node.
            //
            // [mfs] With that said, there are two optimizations to
            //       consider here. First, we might want to test before
            //       the CAS, so that we can avoid the operation if it is
            //       certain to fail.
            //
            //       Second, it would be GREAT if we could avoid doing 2
            //       CASes on the root node. At the entry to this
            //       function, we could special-case it for the root
            //       node, in order to only do one CAS.
#ifdef STM_CPU_SPARC
            bcas64(&word.all, x.all, temp.all);
#else
            bcas32((uint32_t*)&word.all, (uint32_t)x.all, (uint32_t)temp.all);
#endif
        }
    }
}