The function zfs_write is the vnode operation for writes in ZFS. Since we're really only interested in ZFS, we'll skip how a write system call reaches this point and start at the filesystem's vnode operations table.
vnodeops_t *zfs_fvnodeops;
const fs_operation_def_t zfs_fvnodeops_template[] = {
VOPNAME_OPEN,   { .vop_open = zfs_open },
VOPNAME_CLOSE,  { .vop_close = zfs_close },
VOPNAME_READ,   { .vop_read = zfs_read },
VOPNAME_WRITE,  { .vop_write = zfs_write },
...
zfs_write is responsible for taking the data attached to a uio and handing it down to the DMU (Data Management Unit) layer in a form the latter can understand. The data can take one of two paths: dmu_assign_arcbuf or dmu_write.
zfs_vnops.c:800
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
...
arc_buf_t *abuf;
...
/*
* Write the file in reasonable size chunks. Each
* chunk is written in a separate transaction. This
* keeps the intent log records small and allows us
* to do more fine-grained space accounting.
*/
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
...
if (xuio && abuf == NULL) {
...
abuf = dmu_xuio_arcbuf(xuio, i_iov);
i_iov++;
} else if (abuf == NULL && n >= max_blksz &&
woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
/*
* This write covers a full block. "Borrow" a
* buffer from the dmu so that we can fill it
* before we enter a transaction. This avoids
* the possibility of holding up the transaction
* if the data copy hangs up on a pagefault (e.g.,
* from an NFS server mapping).
*/
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz);
...
if (error = uiocopy(abuf->b_data, max_blksz,
UIO_WRITE, uio, &cbytes)) {
dmu_return_arcbuf(abuf);
break;
}
...
}
...
if (tx_bytes < max_blksz && (!write_eof ||
aiov->iov_base != abuf->b_data)) {
...
dmu_write(zfsvfs->z_os, zp->z_id, woff,
aiov->iov_len, aiov->iov_base, tx);
...
} else {
dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
woff, abuf, tx);
}
...
}
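The second arm of that if/else is worth dwelling on: a write is only eligible for the borrowed-arc-buffer path when it appends at least one full, block-aligned record to a file that already uses full-size blocks. The following user-space sketch re-evaluates that test with made-up numbers; P2PHASE is redefined locally (for power-of-two alignments it is just x & (align - 1)), and full_block_eligible and the sample values are hypothetical, not part of ZFS.
/*
 * Illustrative, user-space only: re-evaluates the "borrow an arc buf"
 * eligibility test from zfs_write with made-up numbers.
 */
#include <stdio.h>
#include <stdint.h>

#define P2PHASE(x, align)	((x) & ((align) - 1))	/* offset within an aligned block */

static int
full_block_eligible(uint64_t woff, uint64_t n, uint64_t z_size,
    uint64_t z_blksz, uint64_t max_blksz)
{
	return (n >= max_blksz &&		/* at least one whole block left to write */
	    woff >= z_size &&			/* appending at or past the current EOF */
	    P2PHASE(woff, max_blksz) == 0 &&	/* offset is block aligned */
	    z_blksz == max_blksz);		/* file already uses full-size blocks */
}

int
main(void)
{
	uint64_t max_blksz = 128 * 1024;	/* assume a 128K recordsize */

	/* Appending 256K at a 128K-aligned EOF: full-block path. */
	printf("%d\n", full_block_eligible(131072, 262144, 131072, max_blksz, max_blksz));
	/* Rewriting 4K in the middle of the file: falls through to dmu_write. */
	printf("%d\n", full_block_eligible(8192, 4096, 1048576, max_blksz, max_blksz));
	return (0);
}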
dmu_write calls dmu_buf_hold_array to retrieve the dbufs covering the requested offset and size (perhaps to be described in a later note). It then calls dmu_buf_will_fill for full-block writes and dmu_buf_will_dirty for partial-block writes (writes requiring a read/modify/write), and bcopys the data into the dbuf. ZFS can use bcopy here, rather than a dedicated copyin function, because the caller has already held and pre-faulted the pages. dmu_buf_fill_done is an alias for dbuf_fill_done, which sanity-checks the dbuf and wakes up any waiters on db_changed (threads that were trying to access this dbuf while it was being modified).
dmu.c:818
void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs, i;
if (size == 0)
return;
VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp));
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
...
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
...
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
bcopy(buf, (char *)db->db_data + bufoff, tocpy);
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
offset += tocpy;
size -= tocpy;
buf = (char *)buf + tocpy;
}
...
}
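To make the loop above concrete, here is a small user-space walk-through of the same bufoff/tocpy arithmetic. It assumes a fixed 128K block size, and split_write and the sample offsets are hypothetical (the real code gets the per-dbuf offset and size from the dbufs returned by dmu_buf_hold_array); it prints whether each chunk would go down the dmu_buf_will_fill or the dmu_buf_will_dirty path.
/*
 * Illustrative, user-space only: mimics dmu_write's per-dbuf arithmetic
 * to show how a write is split across block boundaries and which chunks
 * count as full-block fills.
 */
#include <stdio.h>
#include <stdint.h>

#define	BLKSZ	(128 * 1024)		/* assume a fixed 128K block size */

static void
split_write(uint64_t offset, uint64_t size)
{
	while (size > 0) {
		uint64_t db_offset = offset - (offset % BLKSZ);	/* start of covering block */
		uint64_t bufoff = offset - db_offset;		/* offset within that block */
		uint64_t tocpy = BLKSZ - bufoff;		/* room left in the block */

		if (tocpy > size)
			tocpy = size;

		printf("block @%8llu: copy %6llu bytes at bufoff %6llu -> %s\n",
		    (unsigned long long)db_offset, (unsigned long long)tocpy,
		    (unsigned long long)bufoff,
		    tocpy == BLKSZ ? "dmu_buf_will_fill" : "dmu_buf_will_dirty");

		offset += tocpy;
		size -= tocpy;
	}
}

int
main(void)
{
	/* A 300K write starting 4K into a block: partial, full, partial. */
	split_write(4096, 300 * 1024);
	return (0);
}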
dmu_buf_will_dirty reads the block first, so that its existing contents can be modified, and then marks it dirty in this transaction.
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
...
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
dmu_buf_will_fill (used for full-block writes) calls dbuf_noread to transition the state of the dbuf. If the dbuf is not cached, dbuf_noread attaches a freshly allocated data buffer from arc_buf_alloc and sets the state to DB_FILL.
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
...
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
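The practical difference between the two is whether the old block contents matter: a partial-block write only touches some of the bytes, so the rest must come from a read of the existing block, while a full-block write replaces every byte and can start from an uninitialized buffer. A trivial user-space illustration (the 16-byte "block" and everything else here is made up):
/*
 * Illustrative only: why dmu_buf_will_dirty must read the old block
 * while dmu_buf_will_fill need not.
 */
#include <stdio.h>
#include <string.h>

#define	BLK	16

int
main(void)
{
	char on_disk[BLK] = "AAAAAAAAAAAAAAA";	/* existing 16-byte "block" (15 A's + NUL) */
	const char newdata[] = "bbbb";		/* a 4-byte write at offset 4 */
	char block[BLK];

	/* Partial write: start from the old contents (read/modify/write). */
	memcpy(block, on_disk, BLK);
	memcpy(block + 4, newdata, strlen(newdata));
	printf("partial: %.*s\n", BLK, block);

	/* Full-block write: every byte is overwritten, so no read is needed. */
	memset(block, 'c', BLK);
	printf("full:    %.*s\n", BLK, block);
	return (0);
}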
When the loaned buffer lines up exactly with the dbuf (same offset and block size), dmu_assign_arcbuf hands it straight to dbuf_assign_arcbuf; otherwise it falls back to copying the data with dmu_write. Both paths lead to dbuf_dirty.
/*
* When possible directly assign passed loaned
* arc buffer to a dbuf. If this is not possible
* copy the contents of passed arc buf via dmu_write().
*/
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
...
if (offset == db->db.db_offset && blksz == db->db.db_size &&
DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
dbuf_assign_arcbuf(db, buf, tx);
...
} else {
...
dmu_write(os, object, offset, blksz, buf->b_data, tx);
...
}
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
...
(void) dbuf_dirty(db, tx);
dmu_buf_fill_done(&db->db, tx);
}
dbuf_dirty creates a dirty record for the dbuf, links it onto the dnode's dirty-record list for this txg (or onto its parent's list of dirty children, for dbufs below the top indirection level), and then marks the dnode itself dirty in its object set with dnode_setdirty.
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn;
...
dbuf_dirty_record_t **drp, *dr;
...
int txgoff = tx->tx_txg & TXG_MASK;
...
dn = DB_DNODE(db);
...
/*
* If this buffer is already dirty, we're done.
*/
drp = &db->db_last_dirty;
...
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
...
if (db->db_blkid == DMU_BONUS_BLKID ||
db->db_blkid == DMU_SPILL_BLKID) {
...
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
...
}
if (db->db_level+1 < dn->dn_nlevels) {
...
if (db->db_last_dirty == dr ||
dn->dn_object == DMU_META_DNODE_OBJECT) {
...
list_insert_tail(&di->dt.di.dr_children, dr);
...
}
} else {
...
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
...
}
dnode_setdirty(dn, tx);
...
}
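Two idioms in this function deserve a comment. txgoff = tx->tx_txg & TXG_MASK indexes per-txg state: TXG_SIZE is 4 in these sources, so arrays like dn_dirty_records hold one list per in-flight transaction group and the slots are reused as txgs retire. The drp/dr loop is a pointer-to-pointer scan of the dbuf's dirty records, which hang off db_last_dirty newest-first, so it stops either at an existing record for this txg or at the spot where a new one should be linked in. A minimal user-space sketch of that scan (demo_dr_t and the sample txgs are made up):
/*
 * Illustrative, user-space only: the pointer-to-pointer walk used by
 * dbuf_dirty to find or insert a dirty record for the current txg.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct demo_dr {
	uint64_t	dr_txg;
	struct demo_dr	*dr_next;
} demo_dr_t;

int
main(void)
{
	/* Dirty records for txgs 103 and 101, newest first (like db_last_dirty). */
	demo_dr_t dr101 = { 101, NULL };
	demo_dr_t dr103 = { 103, &dr101 };
	demo_dr_t *db_last_dirty = &dr103;

	uint64_t txg = 102;			/* the txg we are dirtying in */
	demo_dr_t **drp = &db_last_dirty;
	demo_dr_t *dr;

	/* Skip records from txgs newer than ours, exactly as dbuf_dirty does. */
	while ((dr = *drp) != NULL && dr->dr_txg > txg)
		drp = &dr->dr_next;

	if (dr != NULL && dr->dr_txg == txg) {
		printf("already dirty in txg %llu\n", (unsigned long long)txg);
	} else {
		/* Not dirty in this txg yet: link a new record in at *drp. */
		demo_dr_t *ndr = calloc(1, sizeof (*ndr));
		if (ndr == NULL)
			return (1);
		ndr->dr_txg = txg;
		ndr->dr_next = dr;
		*drp = ndr;
		printf("inserted dirty record for txg %llu after txg %llu\n",
		    (unsigned long long)txg, (unsigned long long)dr103.dr_txg);
	}
	return (0);
}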
dnode_setdirty adds the dnode to the object set's list of freed dnodes if the dnode has been freed (i.e. its dn_free_txg is non-zero and not later than this txg); otherwise it adds it to the list of dirty dnodes. The dataset itself is then marked dirty.
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
...
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
} else {
list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
}
dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
Marking a dataset dirty entails adding the dataset to its pool's dp_dirty_datasets txg_list for this txg. See dsl_pool_sync for where the dataset is removed from that txg_list and dsl_dataset_sync is called on it.
void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_pool_t *dp;
...
dp = ds->ds_dir->dd_pool;
if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
/* up the hold count until we can be written out */
dmu_buf_add_ref(ds->ds_dbuf, ds);
}
}
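The conditional around dmu_buf_add_ref suggests the extra hold is taken only the first time the dataset is dirtied in a given txg, with the txg_list_add return value signalling whether this was the first add. To close the loop, here is a small user-space model of that handshake (everything here, demo_ds_t included, is a made-up stand-in; only the add-once / drain pattern mirrors what dsl_dataset_dirty and dsl_pool_sync do with txg_list_add, txg_list_remove, and dsl_dataset_sync).
/*
 * Illustrative, user-space only: a dataset goes on the per-txg dirty
 * list at most once, taking a hold the first time; syncing drains the
 * list and drops the holds.  All names below are made up.
 */
#include <stdio.h>
#include <stdbool.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct demo_ds {
	const char	*name;
	int		holds;
	bool		on_list[TXG_SIZE];	/* per-txg membership flags */
	struct demo_ds	*next[TXG_SIZE];
} demo_ds_t;

static demo_ds_t *dirty_list[TXG_SIZE];		/* stand-in for dp_dirty_datasets */

/* Returns true only if the dataset was not already on the list for txg. */
static bool
demo_txg_list_add(demo_ds_t *ds, unsigned txg)
{
	int t = txg & TXG_MASK;

	if (ds->on_list[t])
		return (false);
	ds->on_list[t] = true;
	ds->next[t] = dirty_list[t];
	dirty_list[t] = ds;
	return (true);
}

static void
demo_dataset_dirty(demo_ds_t *ds, unsigned txg)
{
	if (demo_txg_list_add(ds, txg))
		ds->holds++;		/* "up the hold count until we can be written out" */
}

static void
demo_pool_sync(unsigned txg)
{
	int t = txg & TXG_MASK;
	demo_ds_t *ds;

	while ((ds = dirty_list[t]) != NULL) {	/* drain, like txg_list_remove() */
		dirty_list[t] = ds->next[t];
		ds->on_list[t] = false;
		ds->holds--;			/* the sync side drops the hold again */
		printf("txg %u: synced %s (holds now %d)\n", txg, ds->name, ds->holds);
	}
}

int
main(void)
{
	demo_ds_t ds = { .name = "tank/home" };

	demo_dataset_dirty(&ds, 100);
	demo_dataset_dirty(&ds, 100);		/* second dirty in same txg: no extra hold */
	demo_pool_sync(100);
	return (0);
}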