This is a whirlwind tour through the execution of each of the stages of a zio created by zio_write. It should give a rudimentary feel for how the pipeline execution of a typical write I/O works. See ZFS notes part 3/4 for a discussion of zio_execute.
A zio_write consists of the following pipeline stages:
#define ZIO_WRITE_COMMON_STAGES \
        (ZIO_INTERLOCK_STAGES | \
        ZIO_VDEV_IO_STAGES | \
        ZIO_STAGE_ISSUE_ASYNC | \
        ZIO_STAGE_CHECKSUM_GENERATE)
Which translates to:
ZIO_STAGE_READY             = 1 << 16,  /* RWFCI */
ZIO_STAGE_DONE              = 1 << 21,  /* RWFCI */
ZIO_STAGE_VDEV_IO_START     = 1 << 17,  /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE      = 1 << 18,  /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS    = 1 << 19,  /* RWF-I */
ZIO_STAGE_ISSUE_ASYNC       = 1 << 3,   /* RWF-- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5,   /* -W--- */
Which calls the zio_pipeline functions below in the following order (zio_execute advances through the stages in increasing bit order, which is why the low-bit stages zio_issue_async and zio_checksum_generate run first; a sketch of that loop follows the list):
zio_issue_async,
zio_checksum_generate,
zio_ready,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
zio_done
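As a reminder from the zio_execute discussion, the ordering falls directly out of how zio_execute picks the next stage: it shifts the stage bit left until it hits a bit that is set in io_pipeline, then calls the matching entry in the zio_pipeline function table. Here is a minimal sketch of that loop (simplified; the assertions and the interrupt-context/config-lock special cases are elided):

/*
 * Sketch of the stage-advance loop in zio_execute(). A stage
 * function returning ZIO_PIPELINE_STOP means the zio was stalled,
 * handed to a taskq, or is otherwise no longer ours to drive.
 */
void
zio_execute(zio_t *zio)
{
    zio->io_executor = curthread;

    while (zio->io_stage < ZIO_STAGE_DONE) {
        enum zio_stage pipeline = zio->io_pipeline;
        enum zio_stage stage = zio->io_stage;
        int rv;

        /* Advance to the next stage bit that is set in the pipeline. */
        do {
            stage <<= 1;
        } while ((stage & pipeline) == 0);

        zio->io_stage = stage;

        /* zio_pipeline[] maps bit positions to the stage functions. */
        rv = zio_pipeline[highbit(stage) - 1](zio);

        if (rv == ZIO_PIPELINE_STOP)
            return;

        ASSERT(rv == ZIO_PIPELINE_CONTINUE);
    }
}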
zio_issue_async dispatches the zio to one of the spa's issue taskqs and, by returning ZIO_PIPELINE_STOP, tells zio_execute to return immediately. A taskq worker thread then executes the remainder of the zio pipeline.
static int
zio_issue_async(zio_t *zio)
{
    zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

    return (ZIO_PIPELINE_STOP);
}
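zio_taskq_dispatch itself hands the zio to one of the spa's per-I/O-type taskqs with zio_execute as the task function, so the worker picks up the pipeline exactly where this thread left off. Roughly (a sketch; the CONFIG_WRITER/PROBE special-casing is elided):

/*
 * Sketch of zio_taskq_dispatch(): queue zio_execute(zio) on one of
 * the spa's taskqs for this I/O type. cutinline requests TQ_FRONT
 * so a retry jumps the queue (see zio_vdev_io_assess below).
 */
static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
    spa_t *spa = zio->io_spa;
    zio_type_t t = zio->io_type;
    int flags = (cutinline ? TQ_FRONT : 0);

    spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
        flags, &zio->io_tqent);
}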
zio_checksum_generate does more or less what its name suggests. If the blkptr for the I/O is NULL (the zio_write_phys case, e.g. a label write), it takes the checksum type from the zio's own write properties. Otherwise the blkptr's checksum type is used, except for gang headers, which always use ZIO_CHECKSUM_GANG_HEADER.
static int
zio_checksum_generate(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    enum zio_checksum checksum;

    if (bp == NULL) {
        /*
         * This is zio_write_phys().
         * We're either generating a label checksum, or none at all.
         */
        checksum = zio->io_prop.zp_checksum;

        if (checksum == ZIO_CHECKSUM_OFF)
            return (ZIO_PIPELINE_CONTINUE);
        ...
    } else {
        if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
            ...
            checksum = ZIO_CHECKSUM_GANG_HEADER;
        } else {
            checksum = BP_GET_CHECKSUM(bp);
        }
    }

    zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

    return (ZIO_PIPELINE_CONTINUE);
}
zio_ready first calls zio_wait_for_children to check whether any children of type gang or ddt have yet to reach the ZIO_WAIT_READY state. If any have not, zio_wait_for_children rewinds the zio's io_stage (so this stage is re-executed when the zio resumes), records the counter being stalled on in io_stall, and returns B_TRUE. This leaves the zio stalled until all of those children reach zio_ready and the last one notifies it. If the zio has an io_ready function - which in most cases it does - it is called before proceeding. The io_ready functions perform simple accounting on the buffer depending on the write type; they are currently arc_write_ready, dbuf_write_override_ready, dbuf_write_nofill_ready, dmu_sync_late_arrival_ready, zio_write_gang_member_ready, and zio_ddt_child_write_ready. The main work in the function is performed by zio_notify_parent, which will execute a parent zio if it no longer has any children it needs to wait on (see below).
static int
zio_ready(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    zio_t *pio, *pio_next;

    if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
        zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
        return (ZIO_PIPELINE_STOP);

    if (zio->io_ready) {
        ...
        zio->io_ready(zio);
    }

    if (bp != NULL && bp != &zio->io_bp_copy)
        zio->io_bp_copy = *bp;

    if (zio->io_error)
        zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

    mutex_enter(&zio->io_lock);
    zio->io_state[ZIO_WAIT_READY] = 1;
    pio = zio_walk_parents(zio);
    mutex_exit(&zio->io_lock);

    /*
     * As we notify zio's parents, new parents could be added.
     * New parents go to the head of zio's io_parent_list, however,
     * so we will (correctly) not notify them. The remainder of zio's
     * io_parent_list, from 'pio_next' onward, cannot change because
     * all parents must wait for us to be done before they can be done.
     */
    for (; pio != NULL; pio = pio_next) {
        pio_next = zio_walk_parents(zio);
        zio_notify_parent(pio, zio, ZIO_WAIT_READY);
    }
    ...
    if (zio_injection_enabled &&
        zio->io_spa->spa_syncing_txg == zio->io_txg)
        zio_handle_ignored_writes(zio);

    return (ZIO_PIPELINE_CONTINUE);
}
zio_notify_parent decrements the parent's count of children being waited on and, if that count reaches zero and it is the counter the parent is stalled on, resumes the parent with zio_execute.
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
    uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
    ...
    mutex_enter(&pio->io_lock);
    ...
    (*countp)--;

    if (*countp == 0 && pio->io_stall == countp) {
        pio->io_stall = NULL;
        mutex_exit(&pio->io_lock);
        zio_execute(pio);
    } else {
        mutex_exit(&pio->io_lock);
    }
}
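The io_stall check in zio_notify_parent is the other half of zio_wait_for_children, which zio_ready and every later stage call but which is not shown above. Roughly (a sketch matching the stall behavior described earlier):

/*
 * Sketch of zio_wait_for_children(): if any children of the given
 * type have not reached the given wait state, rewind io_stage so
 * this stage re-runs on resume, record which counter we are
 * stalled on, and tell the caller to stop the pipeline.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
    uint64_t *countp = &zio->io_children[child][wait];
    boolean_t waiting = B_FALSE;

    mutex_enter(&zio->io_lock);
    ASSERT(zio->io_stall == NULL);
    if (*countp != 0) {
        zio->io_stage >>= 1;    /* re-execute this stage on resume */
        zio->io_stall = countp; /* zio_notify_parent() checks this */
        waiting = B_TRUE;
    }
    mutex_exit(&zio->io_lock);

    return (waiting);
}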
The next stage in the pipeline, zio_vdev_io_start, is responsible for queueing the zio for physical I/O. If the zio's vdev, io_vd, is a leaf device (an actual disk rather than a higher-layer logical vdev), we try to queue the zio through vdev_queue_io so that adjacent I/Os can be aggregated. Otherwise we call the vdev's vdev_op_io_start routine directly to pass the zio down through the vdev layers; if io_vd is NULL, the mirror ops are used, since they know how to fan a block pointer's multiple DVAs out to the right vdevs.
static int
zio_vdev_io_start(zio_t *zio)
{
    vdev_t *vd = zio->io_vd;
    uint64_t align;
    spa_t *spa = zio->io_spa;
    int ret;
    ...
    if (vd == NULL) {
        ...
        /*
         * The mirror_ops handle multiple DVAs in a single BP.
         */
        return (vdev_mirror_ops.vdev_op_io_start(zio));
    }
    ...
    /*
     * We keep track of time-sensitive I/Os so that the scan thread
     * can quickly react to certain workloads. In particular, we care
     * about non-scrubbing, top-level reads and writes with the following
     * characteristics:
     * - synchronous writes of user data to non-slog devices
     * - any reads of user data
     * When these conditions are met, adjust the timestamp of spa_last_io
     * which allows the scan thread to adjust its workload accordingly.
     */
    if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
        vd == vd->vdev_top && !vd->vdev_islog &&
        zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
        zio->io_txg != spa_syncing_txg(spa)) {
        uint64_t old = spa->spa_last_io;
        uint64_t new = ddi_get_lbolt64();
        if (old != new)
            (void) atomic_cas_64(&spa->spa_last_io, old, new);
    }

    align = 1ULL << vd->vdev_top->vdev_ashift;

    if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) ||
        (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) &&
        P2PHASE(zio->io_size, align) != 0) {
        /* Transform logical writes to be a full physical block size. */
        uint64_t asize = P2ROUNDUP(zio->io_size, align);
        char *abuf = NULL;
        if (zio->io_type == ZIO_TYPE_READ ||
            zio->io_type == ZIO_TYPE_WRITE)
            abuf = zio_buf_alloc(asize);
        ASSERT(vd == vd->vdev_top);
        if (zio->io_type == ZIO_TYPE_WRITE) {
            bcopy(zio->io_data, abuf, zio->io_size);
            bzero(abuf + zio->io_size, asize - zio->io_size);
        }
        ...
    }
    ...
    /*
     * If this is a repair I/O, and there's no self-healing involved --
     * that is, we're just resilvering what we expect to resilver --
     * then don't do the I/O unless zio's txg is actually in vd's DTL.
     * This prevents spurious resilvering with nested replication.
     * For example, given a mirror of mirrors, (A+B)+(C+D), if only
     * A is out of date, we'll read from C+D, then use the data to
     * resilver A+B -- but we don't actually want to resilver B, just A.
     * The top-level mirror has no way to know this, so instead we just
     * discard unnecessary repairs as we work our way down the vdev tree.
     * The same logic applies to any form of nested replication:
     * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
     */
    if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
        !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
        zio->io_txg != 0 && /* not a delegated i/o */
        !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
        ASSERT(zio->io_type == ZIO_TYPE_WRITE);
        zio_vdev_io_bypass(zio);
        return (ZIO_PIPELINE_CONTINUE);
    }

    if (vd->vdev_ops->vdev_op_leaf) {
        switch (zio->io_type) {
        case ZIO_TYPE_READ:
            if (vdev_cache_read(zio))
                return (ZIO_PIPELINE_CONTINUE);
            /* FALLTHROUGH */
        case ZIO_TYPE_WRITE:
        case ZIO_TYPE_FREE:
            if ((zio = vdev_queue_io(zio)) == NULL)
                return (ZIO_PIPELINE_STOP);

            if (!vdev_accessible(vd, zio)) {
                zio->io_error = SET_ERROR(ENXIO);
                zio_interrupt(zio);
                return (ZIO_PIPELINE_STOP);
            }
            break;
        }
        ...
    }

    ret = vd->vdev_ops->vdev_op_io_start(zio);
    ASSERT(ret == ZIO_PIPELINE_STOP);

    return (ret);
}
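The alignment transform above leans on two illumos bit-twiddling macros from sys/sysmacros.h. As a worked example (the specific numbers are just an illustration, not from the source): with a top-level vdev_ashift of 12, align is 4096, so a 4608-byte logical write has a non-zero phase and gets padded out to 8192 bytes.

#include <stdint.h>
#include <stdio.h>

/* From illumos sys/sysmacros.h; align must be a power of two. */
#define P2PHASE(x, align)   ((x) & ((align) - 1))
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))

int
main(void)
{
    uint64_t align = 1ULL << 12;    /* vdev_ashift = 12 -> 4 KB blocks */
    uint64_t io_size = 4608;        /* a misaligned logical write */

    /* A non-zero phase means the write must be padded to a full block. */
    printf("phase = %llu\n",
        (unsigned long long)P2PHASE(io_size, align));   /* prints 512 */
    printf("asize = %llu\n",
        (unsigned long long)P2ROUNDUP(io_size, align)); /* prints 8192 */
    return (0);
}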
Most of the work done in zio_vdev_io_done is the (elided) error handling. If this zio's vdev exists, is a leaf, and the io_type is READ, WRITE, or FREE, vdev_queue_io_done removes the zio from its vdev_queue and issues the next zios waiting in that queue. If this is a write, vdev_cache_write updates the vdev's cache to reflect what is now on disk. Before returning, it calls the vdev's I/O completion function.
static int
zio_vdev_io_done(zio_t *zio)
{
    vdev_t *vd = zio->io_vd;
    vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
    boolean_t unexpected_error = B_FALSE;

    if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);
    ...
    if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
        (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
        zio->io_type == ZIO_TYPE_FREE)) {
        ...
        vdev_queue_io_done(zio);

        if (zio->io_type == ZIO_TYPE_WRITE)
            vdev_cache_write(zio);
        ...
    }

    ops->vdev_op_io_done(zio);
    ...
    return (ZIO_PIPELINE_CONTINUE);
}
As earlier in the pipeline, this zio will stall if there are children it needs to wait on to complete, this time children of type ZIO_CHILD_VDEV. If the I/O failed, we consider reissuing it, this time without caching or aggregation. To reissue, we rewind the zio's io_stage to the stage just before ZIO_STAGE_VDEV_IO_START (so zio_execute will re-run that stage) and hand the zio off to a taskq thread.
If zio->io_physdone is set, it will be called here when this is a leaf (physical) vdev. The only zio_write caller that passes a physdone completion function is arc_write, which passes arc_write_physdone. If the zio's io_private field is set, it points to an arc_write_callback; the only caller of arc_write that populates the callback's physdone member is dbuf_write, which passes dbuf_write_physdone. From dbuf_write_physdone's preceding comment:
The SPA will call this callback several times for each zio - once for every physical child i/o (zio->io_phys_children times). This allows the DMU to monitor the progress of each logical i/o. For example, there may be 2 copies of an indirect block, or many fragments of a RAID-Z block. There may be a long delay before all copies/fragments are completed, so this callback allows us to retire dirty space gradually, as the physical i/os complete.
static int
zio_vdev_io_assess(zio_t *zio)
{
    vdev_t *vd = zio->io_vd;

    if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);
    ...
    /*
     * If the I/O failed, determine whether we should attempt to retry it.
     *
     * On retry, we cut in line in the issue queue, since we don't want
     * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
     */
    if (zio->io_error && vd == NULL &&
        !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
        ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
        ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
        zio->io_error = 0;
        zio->io_flags |= ZIO_FLAG_IO_RETRY |
            ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
        zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
        zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
            zio_requeue_io_start_cut_in_line);
        return (ZIO_PIPELINE_STOP);
    }

    /*
     * If we got an error on a leaf device, convert it to ENXIO
     * if the device is not accessible at all.
     */
    if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
        !vdev_accessible(vd, zio))
        zio->io_error = SET_ERROR(ENXIO);

    /*
     * If we can't write to an interior vdev (mirror or RAID-Z),
     * set vdev_cant_write so that we stop trying to allocate from it.
     */
    if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
        vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
        vd->vdev_cant_write = B_TRUE;
    }
    ...
    if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
        zio->io_physdone != NULL) {
        ...
        zio->io_physdone(zio->io_logical);
    }

    return (ZIO_PIPELINE_CONTINUE);
}
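For reference, arc_write_physdone itself is just a trampoline into the DMU's callback. A sketch of what it looks like in the ARC of this vintage (treat the field names as approximate):

/*
 * Sketch of arc_write_physdone(): forward the per-physical-child
 * notification to the callback (dbuf_write_physdone) if the
 * arc_write() caller registered one.
 */
static void
arc_write_physdone(zio_t *zio)
{
    arc_write_callback_t *cb = zio->io_private;

    if (cb->awcb_physdone != NULL)
        cb->awcb_physdone(zio, cb->awcb_private);
}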
If this zio has any outstanding children, it stalls until they have completed; once they have, it inherits any errors they encountered. If the I/O needs to be re-executed because of an error, we initiate that here. We're discussing this in the context of a write, but there is still the possibility of checksum errors on indirect blocks that had to be (re-)read from disk, and on any partial block writes for which the block must be read before being written. io_done is set in zio_create; it notifies the part of the stack that created the zio that the zio is no longer outstanding, and typically removes any dependencies and wakes up waiters on db_changed. The possible io_done functions for zios created by zio_write are arc_write_done, dbuf_write_override_done, dbuf_write_nofill_done, dmu_sync_late_arrival_done, zio_ddt_ditto_write_done, and zio_ddt_child_write_done. If there is an io_waiter on this zio, i.e. it was initiated from zio_wait, the waiter is awakened and is expected to destroy the zio. In the more common asynchronous case of being dispatched from zio_nowait, zio_done will zio_destroy the zio before returning.
static int
zio_done(zio_t *zio)
{
    spa_t *spa = zio->io_spa;
    zio_t *lio = zio->io_logical;
    blkptr_t *bp = zio->io_bp;
    vdev_t *vd = zio->io_vd;
    uint64_t psize = zio->io_size;
    zio_t *pio, *pio_next;

    /*
     * If our children haven't all completed,
     * wait for them and then repeat this pipeline stage.
     */
    if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
        zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
        zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
        zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
        return (ZIO_PIPELINE_STOP);
    ...
    /*
     * If there were child vdev/gang/ddt errors, they apply to us now.
     */
    zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
    zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
    zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
    ...
    zio_pop_transforms(zio);    /* note: may set zio->io_error */

    vdev_stat_update(zio, psize);

    if (zio->io_error) {
        /*
         * If this I/O is attached to a particular vdev,
         * generate an error message describing the I/O failure
         * at the block level. We ignore these errors if the
         * device is currently unavailable.
         */
        if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
            zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

        if ((zio->io_error == EIO || !(zio->io_flags &
            (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
            zio == lio) {
            /*
             * For logical I/O requests, tell the SPA to log the
             * error and generate a logical data ereport.
             */
            spa_log_error(spa, zio);
            zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 0, 0);
        }
    }

    if (zio->io_error && zio == lio) {
        /*
         * Determine whether zio should be reexecuted. This will
         * propagate all the way to the root via zio_notify_parent().
         */
        ...
        if (IO_IS_ALLOCATING(zio) &&
            !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
            if (zio->io_error != ENOSPC)
                zio->io_reexecute |= ZIO_REEXECUTE_NOW;
            else
                zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
        }
        ...
        /*
         * Here is a possibly good place to attempt to do
         * either combinatorial reconstruction or error correction
         * based on checksums. It also might be a good place
         * to send out preliminary ereports before we suspend
         * processing.
         */
    }

    /*
     * If there were logical child errors, they apply to us now.
     * We defer this until now to avoid conflating logical child
     * errors with errors that happened to the zio itself when
     * updating vdev stats and reporting FMA events above.
     */
    zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

    if ((zio->io_error || zio->io_reexecute) &&
        IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
        !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
        zio_dva_unallocate(zio, zio->io_gang_tree, bp);

    zio_gang_tree_free(&zio->io_gang_tree);

    /*
     * Godfather I/Os should never suspend.
     */
    if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
        (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
        zio->io_reexecute = 0;

    if (zio->io_reexecute) {
        /*
         * This is a logical I/O that wants to reexecute.
         *
         * Reexecute is top-down. When an i/o fails, if it's not
         * the root, it simply notifies its parent and sticks around.
         * The parent, seeing that it still has children in zio_done(),
         * does the same. This percolates all the way up to the root.
         * The root i/o will reexecute or suspend the entire tree.
         *
         * This approach ensures that zio_reexecute() honors
         * all the original i/o dependency relationships, e.g.
         * parents not executing until children are ready.
         */
        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

        zio->io_gang_leader = NULL;

        mutex_enter(&zio->io_lock);
        zio->io_state[ZIO_WAIT_DONE] = 1;
        mutex_exit(&zio->io_lock);

        /*
         * "The Godfather" I/O monitors its children but is
         * not a true parent to them. It will track them through
         * the pipeline but severs its ties whenever they get into
         * trouble (e.g. suspended). This allows "The Godfather"
         * I/O to return status without blocking.
         */
        for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
            zio_link_t *zl = zio->io_walk_link;
            pio_next = zio_walk_parents(zio);

            if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
                (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
                zio_remove_child(pio, zio, zl);
                zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
            }
        }

        if ((pio = zio_unique_parent(zio)) != NULL) {
            /*
             * We're not a root i/o, so there's nothing to do
             * but notify our parent. Don't propagate errors
             * upward since we haven't permanently failed yet.
             */
            ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
            zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
            zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
        } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
            /*
             * We'd fail again if we reexecuted now, so suspend
             * until conditions improve (e.g. device comes online).
             */
            zio_suspend(spa, zio);
        } else {
            /*
             * Reexecution is potentially a huge amount of work.
             * Hand it off to the otherwise-unused claim taskq.
             */
            ...
            spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
                ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
                0, &zio->io_tqent);
        }
        return (ZIO_PIPELINE_STOP);
    }
    ...
    /*
     * Report any checksum errors, since the I/O is complete.
     */
    while (zio->io_cksum_report != NULL) {
        zio_cksum_report_t *zcr = zio->io_cksum_report;
        zio->io_cksum_report = zcr->zcr_next;
        zcr->zcr_next = NULL;
        zcr->zcr_finish(zcr, NULL);
        zfs_ereport_free_checksum(zcr);
    }

    /*
     * It is the responsibility of the done callback to ensure that this
     * particular zio is no longer discoverable for adoption, and as
     * such, cannot acquire any new parents.
     */
    if (zio->io_done)
        zio->io_done(zio);

    mutex_enter(&zio->io_lock);
    zio->io_state[ZIO_WAIT_DONE] = 1;
    mutex_exit(&zio->io_lock);

    for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
        zio_link_t *zl = zio->io_walk_link;
        pio_next = zio_walk_parents(zio);
        zio_remove_child(pio, zio, zl);
        zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
    }

    if (zio->io_waiter != NULL) {
        mutex_enter(&zio->io_lock);
        zio->io_executor = NULL;
        cv_broadcast(&zio->io_cv);
        mutex_exit(&zio->io_lock);
    } else {
        zio_destroy(zio);
    }

    return (ZIO_PIPELINE_STOP);
}
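To close the loop on the io_waiter handoff at the end of zio_done: zio_wait runs the pipeline in the calling thread and then sleeps on io_cv until zio_done clears io_executor and broadcasts. A sketch (close to the illumos code of this vintage, with assertions elided):

/*
 * Sketch of zio_wait(): drive the pipeline synchronously, sleep
 * until zio_done() signals io_cv, then reap the error and destroy
 * the zio - in this case the waiter, not zio_done(), frees it.
 */
int
zio_wait(zio_t *zio)
{
    int error;

    zio->io_waiter = curthread;

    zio_execute(zio);

    mutex_enter(&zio->io_lock);
    while (zio->io_executor != NULL)
        cv_wait(&zio->io_cv, &zio->io_lock);
    mutex_exit(&zio->io_lock);

    error = zio->io_error;
    zio_destroy(zio);

    return (error);
}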