
How txg_sync_thread dispatches writes - ZFS I/O notes part 1/4

The point of these notes is to document how the txg_sync_thread ends up calling dbuf_sync_indirect and dbuf_sync_leaf for dirty buffers (dbufs). As far as I can tell, every indirect block depends on its children's I/Os being issued before its own, but indirect blocks at a given level are logically independent of each other. Leaf writes are logically independent of all other leaf writes. When looking at dbuf_sync_{leaf, indirect} you'll notice that they are processing a list of dirty records. This raises the obvious question: how did the dirty records get associated with the dnodes? That is explained in my next post, "Understanding how VFS writes translate in to dirty records."

Normal write zios are all dispatched asynchronously (see ZFS I/O notes part 4/4), stalling only to wait for their dependent child I/Os. Level 0 blocks should all be processed in parallel, so it isn't clear why streaming writes hit a ceiling without maxing out all CPUs or disk bandwidth.
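
Before walking the call chain it helps to see the dispatch pattern in isolation: a parent "root" zio is created with zio_root, each child write is issued against it asynchronously with zio_nowait, and zio_wait on the parent blocks until every child completes. The sketch below is schematic only (child_t and child_write_zio are hypothetical stand-ins for whatever creates a child's write zio, e.g. dbuf_write); zio_root, zio_nowait and zio_wait are the real interfaces.

static void
sync_children_then_wait(spa_t *spa, list_t *children)
{
    child_t *c;
    /* parent zio that all child writes will hang off */
    zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);

    for (c = list_head(children); c != NULL; c = list_next(children, c)) {
        /* create the child's write zio with pio as its parent ... */
        zio_t *czio = child_write_zio(pio, c);
        /* ... and dispatch it without blocking */
        zio_nowait(czio);
    }

    /* blocks until every child issued above has completed */
    VERIFY0(zio_wait(pio));
}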

Below is the function call chain that reaches from txg_sync_start all the way to the zio_wait / zio_nowait calls for all pool I/Os. Only the core "business" portion of each function is included, with ellipses (...) placed wherever code has been excluded for readability. The mechanism by which zio_wait / zio_nowait translate into actual I/O operations is explained in a later short post called "Understanding zio."


When a pool is created or imported, txg_sync_start is called, which creates the txg_sync_thread.

void
txg_sync_start(dsl_pool_t *dp)
{
    ...
    tx->tx_sync_thread = thread_create(NULL, 32 << 10, txg_sync_thread,
        dp, 0, &p0, TS_RUN, minclsyspri);
    ...
}

For as long as the pool is present, the thread walks each txg through its states, calling spa_sync when the txg reaches the syncing state; after spa_sync completes, it wakes all waiters sleeping on the tx state's tx_sync_done_cv.

static void
txg_sync_thread(void *arg)
{
    dsl_pool_t *dp = arg;
    spa_t *spa = dp->dp_spa;
    ...
    for (;;) {
        ...
        txg = tx->tx_quiesced_txg;
        ...
        spa_sync(spa, txg);
        ...
        cv_broadcast(&tx->tx_sync_done_cv);
        ...
    }
}
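
The waiters woken by that cv_broadcast are threads blocked in txg_wait_synced and friends. Roughly (a simplified sketch of the real function, which also nudges the quiesce and sync threads along), such a waiter looks like this:

void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
    tx_state_t *tx = &dp->dp_tx;

    mutex_enter(&tx->tx_sync_lock);
    ...
    /* sleep until the sync thread reports our txg as synced */
    while (tx->tx_synced_txg < txg) {
        ...
        cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
    }
    mutex_exit(&tx->tx_sync_lock);
}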


spa_sync calls dsl_pool_sync repeatedly until the pool's meta-objset has no more dirty data for this txg.

void
spa_sync(spa_t *spa, uint64_t txg)
{
    dsl_pool_t *dp = spa->spa_dsl_pool;
    objset_t *mos = spa->spa_meta_objset;
    ...
    do {
        ...
        dsl_pool_sync(dp, txg);
    } while (dmu_objset_is_dirty(mos, txg));
}

dsl_pool_sync iterates over the pool's dirty datasets, calling dsl_dataset_sync in two passes: once to push out the dirty blocks, and again to push out the changes produced by the user/group space accounting updates. Each pass hangs its writes off a root zio for the pool and waits on it synchronously with zio_wait.

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
    ...
    dsl_dataset_t *ds;
    objset_t *mos = dp->dp_meta_objset;
    ...
    tx = dmu_tx_create_assigned(dp, txg);

    /*
     * Write out all dirty blocks of dirty datasets.
     */
    zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
    while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
        /*
         * We must not sync any non-MOS datasets twice, because we may
         * have taken a snapshot of them.  However, we may sync
         * newly-created datasets on pass 2.
         */
        ASSERT(!list_link_active(&ds->ds_synced_link));
        list_insert_tail(&synced_datasets, ds);
        dsl_dataset_sync(ds, zio, tx);
    }
    VERIFY0(zio_wait(zio));
    ...

    /*
     * After the data blocks have been written (ensured by the zio_wait()
     * above), update the user/group space accounting.
     */
    for (ds = list_head(&synced_datasets); ds != NULL;
        ds = list_next(&synced_datasets, ds)) {
        dmu_objset_do_userquota_updates(ds->ds_objset, tx);
    }

    /*
     * Sync the datasets again to push out the changes due to
     * userspace updates.  This must be done before we process the
     * sync tasks, so that any snapshots will have the correct
     * user accounting information (and we won't get confused
     * about which blocks are part of the snapshot).
     */
    zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
    while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
        ASSERT(list_link_active(&ds->ds_synced_link));
        dmu_buf_rele(ds->ds_dbuf, ds);
        dsl_dataset_sync(ds, zio, tx);
    }
    VERIFY0(zio_wait(zio));
    ...
}

dsl_dataset_sync passes the dataset's objset on to dmu_objset_sync.

void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
    ...
    dmu_objset_sync(ds->ds_objset, zio, tx);
}

dmu_objset_sync calls dmu_objset_sync_dnodes on the object set's list of free dnodes and its list of dirty dnodes, and calls dnode_sync directly on the special metadata dnodes. It also creates, via arc_write, the zio for the objset's root block; that zio serves as the parent I/O for the dnode syncs and is itself dispatched with zio_nowait at the end of the function.

/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
    int txgoff;
    ...
    list_t *newlist = NULL;
    dbuf_dirty_record_t *dr;
    ...
    /*
     * Create the root block IO
     */
    ...
    zio = arc_write(pio, os->os_spa, tx->tx_txg,
        os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
        DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
        NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
        ZIO_FLAG_MUSTSUCCEED, &zb);

    /*
     * Sync special dnodes - the parent IO for the sync is the root block
     */
    dnode_sync(DMU_META_DNODE(os), tx);
    ...
    if (DMU_USERUSED_DNODE(os) &&
        DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
        DMU_USERUSED_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_USERUSED_DNODE(os), tx);
        DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
        dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
    }
    ...
    txgoff = tx->tx_txg & TXG_MASK;
    ...
    if (dmu_objset_userused_enabled(os)) {
        newlist = &os->os_synced_dnodes;
        /*
         * We must create the list here because it uses the
         * dn_dirty_link[] of this txg.
         */
        list_create(newlist, sizeof (dnode_t),
            offsetof(dnode_t, dn_dirty_link[txgoff]));
    }
    dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
    dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
    list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
    while ((dr = list_head(list)) != NULL) {
        ASSERT0(dr->dr_dbuf->db_level);
        list_remove(list, dr);
        if (dr->dr_zio)
            zio_nowait(dr->dr_zio);
    }
    /*
     * Free intent log blocks up to this tx.
     */
    zil_sync(os->os_zil, tx);
    os->os_phys->os_zil_header = os->os_zil_header;
    zio_nowait(zio);
}


dmu_objset_sync_dnodes calls dnode_sync on each of the dnodes in list, adding them to newlist if newlist is non-NULL.

static void
dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
{
    dnode_t *dn;

    while ((dn = list_head(list)) != NULL) {
        ...
        /*
         * Initialize dn_zio outside dnode_sync() because the
         * meta-dnode needs to set it outside dnode_sync().
         */
        dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
        list_remove(list, dn);

        if (newlist) {
            (void) dnode_add_ref(dn, newlist);
            list_insert_tail(newlist, dn);
        }

        dnode_sync(dn, tx);
    }
}

dnode_sync passes the dnode's list of dirty records for this txg through to dbuf_sync_list.

void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
    ...
    list_t *list = &dn->dn_dirty_records[txgoff];
    ...
    dbuf_sync_list(list, tx);
}

dbuf_sync_list iterates through the list, calling dbuf_sync_leaf for level-0 (data) blocks and dbuf_sync_indirect for indirect blocks.

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
    dbuf_dirty_record_t *dr;

    while ((dr = list_head(list)) != NULL) {
        ...
        list_remove(list, dr);
        if (dr->dr_dbuf->db_level > 0)
            dbuf_sync_indirect(dr, tx);
        else
            dbuf_sync_leaf(dr, tx);
    }
}

To support copy on write, indirect blocks have to be rewritten whenever the data blocks beneath them change, so dbuf_sync_indirect may first have to read the indirect block from disk. It then marks the indirect block as having pending dirty data, creates its write zio with dbuf_write, syncs all of the indirect block's children, and only then issues the I/O for the indirect block itself with zio_nowait.

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db = dr->dr_dbuf;
    ...
    /* Read the block if it hasn't been read yet. */
    if (db->db_buf == NULL) {
        mutex_exit(&db->db_mtx);
        (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
        mutex_enter(&db->db_mtx);
    }
    ...
    /* Provide the pending dirty record to child dbufs */
    db->db_data_pending = dr;

    mutex_exit(&db->db_mtx);

    /*
     * Doesn't actually execute a write - it just creates dr->dr_zio,
     * which is issued by the zio_nowait() below.
     */
    dbuf_write(dr, db->db_buf, tx);

    zio = dr->dr_zio;

    mutex_enter(&dr->dt.di.dr_mtx);
    dbuf_sync_list(&dr->dt.di.dr_children, tx);
    ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
    mutex_exit(&dr->dt.di.dr_mtx);
    zio_nowait(zio);
}
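
What makes the final zio_nowait(zio) safe is that each child's dr_zio was created, in that child's own dbuf_write call, with this indirect block's pending zio as its parent, so the indirect block's write cannot be issued to disk until all of its children have completed. A condensed sketch of that parent selection (the helper name dbuf_parent_zio is hypothetical; the logic is modelled on dbuf_write) looks roughly like this:

static zio_t *
dbuf_parent_zio(dnode_t *dn, dmu_buf_impl_t *parent)
{
    if (parent != dn->dn_dbuf) {
        /* the parent indirect block is mid-sync; hang off its zio */
        return (parent->db_data_pending->dr_zio);
    }
    /* no intervening indirect block - hang off the dnode's zio */
    return (dn->dn_zio);
}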

dbuf_sync_leaf creates the I/O for the dirty record and then dispatches it asynchronously (unless the dbuf belongs to the meta-dnode, in which case the dirty record is queued on the dnode's dirty list and dispatched later by dmu_objset_sync).

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
    arc_buf_t **datap = &dr->dt.dl.dr_data;
    dmu_buf_impl_t *db = dr->dr_dbuf;
    ...
    /*
     * Doesn't actually execute a write - it just creates dr->dr_zio,
     * which is issued by zio_nowait() (below, or from dmu_objset_sync()
     * for the meta-dnode's dbufs).
     */
    dbuf_write(dr, *datap, tx);

    ASSERT(!list_link_active(&dr->dr_dirty_node));
    if (dn->dn_object == DMU_META_DNODE_OBJECT) {
        list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
        DB_DNODE_EXIT(db);
    } else {
        /*
         * Although zio_nowait() does not "wait for an IO", it does
         * initiate the IO. If this is an empty write it seems plausible
         * that the IO could actually be completed before the nowait
         * returns. We need to DB_DNODE_EXIT() first in case
         * zio_nowait() invalidates the dbuf.
         */
        DB_DNODE_EXIT(db);
        zio_nowait(dr->dr_zio);
    }
}
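
Putting it all together, the dispatch path for one txg is: txg_sync_thread -> spa_sync -> dsl_pool_sync -> dsl_dataset_sync -> dmu_objset_sync -> dmu_objset_sync_dnodes -> dnode_sync -> dbuf_sync_list -> dbuf_sync_{indirect, leaf} -> dbuf_write plus zio_nowait. The only blocking point in the chain is dsl_pool_sync's zio_wait on the per-pass root zio; everything below it is dispatched asynchronously, ordered only by the parent/child zio dependencies.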

