static int _clone_init(struct iostash_bio *io, struct bio *clone, int is4ssd, void *endiofn) { int ret = 0; clone->bi_private = io; clone->bi_end_io = endiofn; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) clone->bi_destructor = _clone_destructor; #endif if (is4ssd) { if (io->ssd->online) { clone->bi_bdev = io->ssd->bdev; BIO_SECTOR(clone) = (sector_t) (io->fragnum * SCE_SCTRPERFRAG) + (io->psn % SCE_SCTRPERFRAG) + IOSTASH_HEADERSCT; BIO_SIZE(clone) = BIO_SIZE(io->base_bio); } else { ret = -1; } } return ret; }
/* * bfq_bio_done(): .bio_done callback of the bfq policy * * Called after a bio is done, (by request_polling_biodone of dsched). * This function judges whet her a thread consumes up its time slice, and * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any * further action of that thread or the bfq scheduler will cause the * thread to be expired. (in bfq_queue() or in bfq_dequeue()) * * This function requires the bfq_tdio pointer of the thread that pushes * bp to be stored by dsched_set_bio_priv() earlier. Currently it is * stored when bfq_queue() is called. * * lock: none. This function CANNOT be blocked by any lock * * refcount: * the corresponding tdio's refcount should decrease by 1 after * this function call. The counterpart increasing is in bfq_queue(). * For each bio pushed down, we increase the refcount of the pushing * tdio. */ static void bfq_bio_done(struct bio *bp) { struct disk *dp = dsched_get_bio_dp(bp); struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp); struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp); struct timeval tv; int ticks_expired; KKASSERT(bfq_tdio); dsched_thread_io_ref(&bfq_tdio->head); atomic_add_int(&bfq_tdio->bio_completed, 1); /* the tdio has already expired */ if (bfq_tdio != bfq_diskctx->bfq_active_tdio) goto rtn; atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp)); /* current time */ getmicrotime(&tv); bfq_tdio->last_request_done_time = tv; timevalsub (&tv, &bfq_tdio->service_start_time); ticks_expired = tvtohz_high(&tv); /* the thread has run out its time slice */ if ((ticks_expired != 0x7fffffff) && (ticks_expired >= BFQ_SLICE_TIMEOUT)) { /* * we cannot block here, so just set a flag */ #if 0 bfq_tdio->maybe_timeout = 1; #endif if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) { bfq_update_avg_time_slice(bfq_diskctx, tv); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio); } } rtn: dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */ 
dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */ }
/*
 * _io_alloc(): allocate and initialise an iostash request descriptor.
 *
 * @hdd:     backing disk; its io_pool supplies the descriptor
 * @ssd:     cache device the request is associated with
 * @fragnum: fragment number recorded in the descriptor
 * @bio:     original bio being serviced
 * @psn:     physical sector number of the request
 *
 * The descriptor is taken from hdd->io_pool with GFP_NOIO.  On success
 * hdd->io_pending is incremented and the descriptor's own io_pending
 * counter starts at zero.
 *
 * Returns the descriptor, or NULL if the pool allocation failed (in which
 * case hdd->io_pending is left untouched).
 */
static struct iostash_bio *_io_alloc(struct hdd_info * hdd, struct ssd_info * ssd, uint32_t fragnum, struct bio *bio, sector_t psn)
{
	struct iostash_bio *req;

	req = mempool_alloc(hdd->io_pool, GFP_NOIO);
	if (!req)
		return NULL;

	atomic_inc(&hdd->io_pending);

	/* owners */
	req->hdd = hdd;
	req->ssd = ssd;

	/* request geometry */
	req->fragnum = fragnum;
	req->base_bio = bio;
	req->psn = psn;
	req->nr_sctr = to_sector(BIO_SIZE(bio));

	/* completion state */
	req->error = 0;
	req->ssd_werr = 0;	/* SSD write error */
	atomic_set(&req->io_pending, 0);

	return req;
}
void iostash_mkrequest(struct request_queue *q, struct bio *bio) #endif { struct hdd_info *hdd; struct ssd_info *ssd; struct iostash_bio *io; sce_fmap_t fmap; uint32_t nr_sctr; sector_t psn; make_request_fn *org_mapreq = NULL; #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE blk_qc_t ret = BLK_QC_T_NONE; #endif DBG("Got bio=%p bio->bi_rw(%lu) request at s=%lu l=%u.\n", bio, bio->bi_rw, BIO_SECTOR(bio), bio_sectors(bio)); rcu_read_lock(); hdd = hdd_search(bio); if (hdd) { atomic_inc(&hdd->nr_ref); org_mapreq = hdd->org_mapreq; } rcu_read_unlock(); if (unlikely(NULL == hdd)) { /* have to requeue the request, somebody was holding a * dangling reference */ ERR("Request holding a dangling make_request_fn pointer\n."); #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE bio->bi_error = -EAGAIN; return ret; #elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0) rmb(); /* read the change in make_request_fn */ return -EAGAIN; /* retry */ #else /* no retry possible in newer kernels since the return * of make_request_fn is no longer checked and retried * if not zero, we cannot unload the module */ BUG(); return; #endif } if (!hdd->online) { ERR("request re-routed due to hdd not being online.\n"); /* being unloaded, re-route */ goto out; } hdd->request_q = q; /* calculate physical sector number -- offset partition information */ psn = BIO_SECTOR(bio) + bio->bi_bdev->bd_part->start_sect; nr_sctr = to_sector(BIO_SIZE(bio)); do { if (bio_sectors(bio) == 0) break; /* partition boundary check */ if ((psn < hdd->part_start) || ((psn + nr_sctr) > hdd->part_end)) break; if (bio_data_dir(bio) == WRITE) { gctx.st_write++; #ifdef SCE_AWT /* make sure the request is only for one fragment */ if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) != (psn / SCE_SCTRPERFRAG)) { sce_invalidate(hdd->lun, psn, nr_sctr); break; } rcu_read_lock(); if (sce_get4write(hdd->lun, psn, nr_sctr, &fmap) == SCE_SUCCESS) { ssd = (struct ssd_info *)fmap.cdevctx; atomic_inc(&ssd->nr_ref); rcu_read_unlock(); if (!ssd->online) { 
sce_put4write(hdd->lun, psn, nr_sctr, 1); atomic_dec(&ssd->nr_ref); } else { io = _io_alloc(hdd, ssd, fmap.fragnum, bio, psn); if (NULL == io) { atomic_dec(&ssd->nr_ref); break; } #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE ret = _io_worker_run(&io->work); #else _io_queue(io); #endif /* lose the reference to hdd, not needed anymore */ atomic_dec(&hdd->nr_ref); #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE return ret; #elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0) return 0; #else return; #endif } } else rcu_read_unlock(); #else sce_invalidate(hdd->lun, psn, nr_sctr); #endif break; } else { /* Read handling */ gctx.st_read++; /* make sure the request is only for one fragment */ if (((psn + nr_sctr - 1) / SCE_SCTRPERFRAG) != (psn / SCE_SCTRPERFRAG)) break; /* cache hit/miss check */ rcu_read_lock(); if (sce_get4read(hdd->lun, psn, nr_sctr, &fmap) != SCE_SUCCESS) { rcu_read_unlock(); break; } BUG_ON(NULL == fmap.cdevctx); ssd = (struct ssd_info *) fmap.cdevctx; atomic_inc(&ssd->nr_ref); rcu_read_unlock(); /* make sure the request is within the SSD limits and the SSD is online */ if (!ssd->online || ssd->queue_max_hw_sectors < nr_sctr) { sce_put4read(hdd->lun, psn, nr_sctr); atomic_dec(&ssd->nr_ref); break; } /* cache hit */ io = _io_alloc(hdd, ssd, fmap.fragnum, bio, psn); if (NULL == io) { atomic_dec(&ssd->nr_ref); break; } #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE ret = _io_worker_run(&io->work); #else _io_queue(io); #endif /* lose the reference to hdd , not needed anymore */ atomic_dec(&hdd->nr_ref); } #if KERNEL_VERSION(4,4,0) <= LINUX_VERSION_CODE return ret; #elif LINUX_VERSION_CODE <= KERNEL_VERSION(3,1,0) return 0; #else return; #endif } while (0); out: /* lose the reference to hdd , not needed anymore */ atomic_dec(&hdd->nr_ref); return (org_mapreq) (q, bio); }
/*
 * bfq_dequeue(): dispatch bios to the disk driver.
 *
 * This function will push as many bios as the number of free slots
 * in the tag queue.
 *
 * In the process of dispatching, the following events may happen:
 *  - Current thread has timed out: Expire the current thread for
 *    BFQ_REASON_TIMEOUT, and select a new thread to serve in the
 *    wf2q tree.
 *
 *  - Current thread runs out of its budget: Expire the current thread
 *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
 *
 *  - Current thread has no further bios in its queue: if the AS feature
 *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
 *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
 *
 * Implementation note: The bios selected to be dispatched will first
 * be stored in the array bio_to_dispatch. After this function releases
 * all the locks it holds, it will call dsched_strategy_request_polling()
 * for each bio stored.
 *
 * With the help of bfq_disk_ctx->pending_dequeue,
 * there will be only one bfq_dequeue pending on the BFQ_LOCK.
 *
 * lock:
 *	BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
 *	THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions
 *	in bfq_queue; Protect the active_tdio->budget
 *
 * refcount:
 *	If the scheduler decides to suspend, the refcount of active_tdio
 *	increases by 1. The counterpart decreasing is in bfq_queue() and
 *	bfq_timeout()
 * blocking:
 *	May be blocking on the disk driver lock. It depends on drivers.
 *
 * Calling path:
 * The callers could be:
 *	bfq_queue(), bfq_timeout() and the registered polling function.
 *
 *	caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me
 *
 */
void
bfq_dequeue(struct dsched_disk_ctx *diskctx)
{
	int free_slots, bio_index = 0, i, remaining_budget = 0;/* remaining budget of current active process */
	/* 33 entries; free_slots is asserted to be at most 32 below */
	struct bio *bio, *bio_to_dispatch[33];
	struct bfq_thread_io *active_tdio = NULL;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);
	/* clear the pending_dequeue flag (1 -> 0) now that we hold BFQ_LOCK */
	atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);

	/*
	 * The whole scheduler is waiting for further bios
	 * from process currently being served
	 */
	if (bfq_diskctx->bfq_blockon != NULL)
		goto rtn;

	/* resume from the state saved by the previous invocation */
	remaining_budget = bfq_diskctx->bfq_remaining_budget;
	active_tdio = bfq_diskctx->bfq_active_tdio;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);

	free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
	KKASSERT(free_slots >= 0 && free_slots <= 32);

	if (active_tdio)
		DSCHED_THREAD_IO_LOCK(&active_tdio->head);

	while (free_slots) {
		/* Here active_tdio must be locked ! */
		if (active_tdio) {
			/*
			 * the bio_done function has marked the current
			 * tdio timeout
			 */
			if (active_tdio->maybe_timeout) {
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
				/* charge the budget consumed so far */
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);

				/* there still exist bios not dispatched,
				 * reinsert the tdio into aug-tree*/
				if (active_tdio->head.qlength > 0) {
					wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
					KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
				}

				active_tdio->maybe_timeout = 0;
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
				/* next iteration picks a new tdio */
				continue;
			}

			/* select next bio to dispatch */
			/* TODO: a wiser selection */
			KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
			bio = TAILQ_FIRST(&active_tdio->head.queue);
			dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);

			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);

			/*
			 * The bio is not read or write, just
			 * push it down.
			 * It consumes a free slot but no budget.
			 */
			if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) && (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
				dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
				KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
				TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
				active_tdio->head.qlength--;
				free_slots--;

#if 0
				dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
				/* deferred: dispatched after the locks are released */
				bio_to_dispatch[bio_index++] = bio;
				KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
				continue;
			}
			/*
			 * Run out of budget
			 * But this is not because the size of bio is larger
			 * than the complete budget.
			 * If the size of bio is larger than the complete
			 * budget, then use a complete budget to cover it.
			 */
			if (bio && (remaining_budget < BIO_SIZE(bio)) && (remaining_budget != active_tdio->budget)) {
				/* charge budget used */
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
				/* bio is non-NULL here, so the tdio still has queued work */
				wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
			} else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */

				/*
				 * Having enough budget,
				 * or having a complete budget and the size of bio
				 * is larger than that.
				 */
				if (bio) {
					/* dispatch */
					remaining_budget -= BIO_SIZE(bio);
					/*
					 * The size of the first bio is larger
					 * than the whole budget, we should
					 * charge the extra part
					 */
					if (remaining_budget < 0)
						wf2q_update_vd(active_tdio, -remaining_budget);
					/* compensate */
					/*
					 * NOTE(review): this second call is unconditional,
					 * so it also runs with a negative argument whenever
					 * budget remains -- confirm this is intended.
					 */
					wf2q_update_vd(active_tdio, -remaining_budget);
					/*
					 * remaining_budget may be < 0,
					 * but to prevent the budget of current tdio
					 * to subtract a negative number,
					 * the remaining_budget has to be >= 0
					 */
					remaining_budget = MAX(0, remaining_budget);
					dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
					KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
					TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
					free_slots--;
					active_tdio->head.qlength--;
					active_tdio->bio_dispatched++;
					wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
					dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n", active_tdio, BIO_SIZE(bio), remaining_budget);
#if 0
					dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
					/* deferred: dispatched after the locks are released */
					bio_to_dispatch[bio_index++] = bio;
					KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);

				} else { /* if (bio) */

					KKASSERT(active_tdio);
					/*
					 * The queue of the active tdio is empty.
					 * If AS feature is switched off,
					 * expire the tdio as well
					 */
					if ((remaining_budget <= 0) || !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) || !active_tdio->tdio_as_switch) {
						/* shrink the budget to what was actually used */
						active_tdio->budget -= remaining_budget;
						wf2q_update_vd(active_tdio, active_tdio->budget);
						bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						active_tdio = NULL;
					} else {

						/* no further bio, wait for a while */
						bfq_diskctx->bfq_blockon = active_tdio;
						/*
						 * Increase ref count to ensure that
						 * tdio will not be destroyed during waiting.
						 */
						dsched_thread_io_ref(&active_tdio->head);
						/*
						 * If the tdio is seeky but not thinking for
						 * too long, we wait for it a little shorter
						 */
						if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
						else
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);

						/* save the start time of blocking */
						getmicrotime(&active_tdio->as_start_time);

						dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
						/* the tdio stays active; only its lock is dropped */
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						goto save_and_rtn;
					}
				}
			}
		} else { /* if (active_tdio) */
			/* there is no active tdio */

			/* no pending bios at all */
			active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);

			if (!active_tdio) {
				KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
				goto save_and_rtn;
			}

			/*
			 * A new tdio is picked,
			 * initialize the service related statistic data
			 */
			DSCHED_THREAD_IO_LOCK(&active_tdio->head);
			active_tdio->service_received = 0;

			/*
			 * Reset the maybe_timeout flag, which
			 * may be set by a biodone after the service is done
			 */
			getmicrotime(&active_tdio->service_start_time);
			active_tdio->maybe_timeout = 0;

			/* a freshly selected tdio starts with its full budget */
			remaining_budget = active_tdio->budget;
			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
		}

	}/* while (free_slots) */

	/* reach here only when free_slots == 0 */
	if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
		DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);

save_and_rtn:
	/* save the remaining budget */
	bfq_diskctx->bfq_remaining_budget = remaining_budget;
	bfq_diskctx->bfq_active_tdio = active_tdio;
rtn:
	BFQ_UNLOCK(bfq_diskctx);
	/*dispatch the planned bios*/
	for (i = 0; i < bio_index; i++)
		dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);

}