# HG changeset patch # User Paul Dagnelie # Date 1605631623 21600 # Node ID 49e0cb1642f6ec3707bea06e30f7868ffaa6e004 # Parent 790618c19823f47043cb596429d2a155dea9cfc5 13317 Decrease contention on dn_struct_rwlock Portions contributed by: Jason King Reviewed by: Brad Lewis Reviewed by: Matt Ahrens matt@delphix.com Reviewed by: George Wilson george.wilson@delphix.com Reviewed by: Brian Behlendorf Approved by: Dan McDonald diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/dbuf.c --- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Nov 17 10:47:03 2020 -0600 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -176,6 +176,7 @@ bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -189,6 +190,7 @@ { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); + rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); @@ -789,10 +791,10 @@ db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer + * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -868,6 +870,44 @@ db->db_state = DB_UNCACHED; } +/* + * This function is used to lock the parent of the provided dbuf. This should be + * used when modifying or reading db_blkptr. + */ +db_lock_type_t +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +{ + enum db_lock_type ret = DLT_NONE; + if (db->db_parent != NULL) { + rw_enter(&db->db_parent->db_rwlock, rw); + ret = DLT_PARENT; + } else if (dmu_objset_ds(db->db_objset) != NULL) { + rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, + tag); + ret = DLT_OBJSET; + } + /* + * We only return a DLT_NONE lock when it's the top-most indirect block + * of the meta-dnode of the MOS. + */ + return (ret); +} + +/* + * We need to pass the lock type in because it's possible that the block will + * move from being the topmost indirect block in a dnode (and thus, have no + * parent) to not the top-most via an indirection increase. This would cause a + * panic if we didn't pass the lock type in. + */ +void +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +{ + if (type == DLT_PARENT) + rw_exit(&db->db_parent->db_rwlock); + else if (type == DLT_OBJSET) + rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); +} + static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { @@ -1042,8 +1082,13 @@ return (err); } +/* + * Drops db_mtx and the parent lock specified by dblt and tag before + * returning. 
+ */ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, void *tag) { dnode_t *dn; zbookmark_phys_t zb; @@ -1053,11 +1098,11 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); + ASSERT(db->db_parent == NULL || + RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { /* @@ -1085,6 +1130,7 @@ DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1125,6 +1171,7 @@ DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1141,12 +1188,14 @@ "object set %llu", dmu_objset_id(db->db_objset)); DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (SET_ERROR(EIO)); } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) { DB_DNODE_EXIT(db); + dmu_buf_unlock_parent(db, dblt, tag); mutex_exit(&db->db_mtx); return (err); } @@ -1166,11 +1215,18 @@ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; - - err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + /* + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on + * an l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would be a lock ordering violation. 
+ */ + blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); + (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - return (err); } @@ -1269,8 +1325,6 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && @@ -1307,29 +1361,32 @@ dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (!err && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + /* + * dbuf_read_impl has dropped db_mtx and our parent's rwlock + * for us + */ + if (!err && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); if (!err && need_wait) @@ -1344,10 +1401,10 @@ * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ @@ -1527,7 +1584,9 @@ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1549,15 +1608,6 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. - */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers @@ -1634,8 +1684,8 @@ dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; + boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -1837,15 +1887,21 @@ return (dr); } - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. 
- */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; + drop_struct_rwlock = B_TRUE; + } + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + if (db->db_blkptr != NULL) { + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + ddt_prefetch(os->os_spa, db->db_blkptr); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* @@ -1858,19 +1914,12 @@ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. - */ - ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, - drop_struct_lock, B_FALSE); + drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } @@ -1881,15 +1930,14 @@ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, + parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); + ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); @@ -1910,14 +1958,14 @@ } mutex_exit(&db->db_mtx); } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } @@ -2438,10 +2486,12 @@ *parentp = NULL; return (err); } + rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); + rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ @@ -2686,7 +2736,7 @@ if (blkid > dn->dn_maxblkid) return; - if (dnode_block_freed(dn, blkid)) + if (level == 0 && dnode_block_freed(dn, blkid)) return; /* @@ -2832,7 +2882,9 @@ DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } + rw_enter(&db->db_rwlock, RW_WRITER); bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + rw_exit(&db->db_rwlock); } /* @@ -2958,7 +3010,6 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); @@ -2967,12 +3018,7 @@ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - 
rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); return (0); } @@ -3688,9 +3734,9 @@ mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* ARGSUSED */ @@ -3731,9 +3777,9 @@ * anybody from reading the blocks we're about to * zero out. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } @@ -3923,7 +3969,7 @@ } static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); @@ -3937,14 +3983,16 @@ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* - * The struct_rwlock prevents dbuf_read_impl() from + * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (rw != NULL) + rw_enter(rw, RW_WRITER); *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); + if (rw != NULL) + rw_exit(rw); } } @@ -4017,7 +4065,7 @@ if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); + dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; @@ -4025,7 +4073,10 @@ DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : + &dn->dn_dbuf->db_rwlock); + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, + tx); } } } diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/dmu.c --- a/usr/src/uts/common/fs/zfs/dmu.c Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/dmu.c Tue Nov 17 10:47:03 2020 -0600 @@ -172,8 +172,8 @@ uint64_t blkid; dmu_buf_impl_t *db; + rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); @@ -197,8 +197,8 @@ err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); + rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -605,7 +605,7 @@ if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn)); + read && DNODE_IS_CACHEABLE(dn), B_TRUE); } rw_exit(&dn->dn_struct_rwlock); @@ -737,7 +737,6 @@ if (err != 0) return; - rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the @@ -745,6 +744,7 @@ * offset) is the first. Then the number we need to prefetch is the * last - first + 1. 
*/ + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; @@ -757,7 +757,6 @@ for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } - rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/dmu_zfetch.c --- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c Tue Nov 17 10:47:03 2020 -0600 @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include @@ -204,7 +204,8 @@ * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, + boolean_t have_lock) { zstream_t *zs; int64_t pf_start, ipf_start, ipf_istart, ipf_iend; @@ -233,6 +234,9 @@ if (blkid == 0) return; + if (!have_lock) + rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + rw_enter(&zf->zf_rwlock, RW_READER); /* @@ -257,6 +261,10 @@ /* Already prefetched this before. */ mutex_exit(&zs->zs_lock); rw_exit(&zf->zf_rwlock); + if (!have_lock) { + rw_exit(&zf->zf_dnode-> + dn_struct_rwlock); + } return; } break; @@ -274,6 +282,8 @@ if (rw_tryupgrade(&zf->zf_rwlock)) dmu_zfetch_stream_create(zf, end_of_access_blkid); rw_exit(&zf->zf_rwlock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); return; } @@ -353,5 +363,7 @@ dbuf_prefetch(zf->zf_dnode, 1, iblk, ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); } + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_hits); } diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/dnode.c --- a/usr/src/uts/common/fs/zfs/dnode.c Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/dnode.c Tue Nov 17 10:47:03 2020 -0600 @@ -1346,7 +1346,6 @@ } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); - db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); @@ -1783,10 +1782,11 @@ /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) + if (err == 0) { dbuf_new_size(db, size, tx); - else if (err != ENOENT) + } else if (err != ENOENT) { goto fail; + } dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -2021,7 +2021,6 @@ int trunc = FALSE; int epbs; - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -2038,7 +2037,7 @@ head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { @@ -2047,12 +2046,15 @@ */ blkid = 0; nblks = 1; - if (dn->dn_nlevels > 1) + if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_dirty_l1(dn, 0, tx); + rw_exit(&dn->dn_struct_rwlock); + } goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ - goto out; + return; } else { /* Freeing part of the block. 
*/ head = blksz - off; @@ -2062,19 +2064,26 @@ } /* zero out any partial block data at the start of the range */ if (head) { + int res; ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { caddr_t data; + boolean_t dirty; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, + FTAG); /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); + dirty = db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, dblt, FTAG); + if (dirty) { dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } @@ -2086,11 +2095,11 @@ /* If the range was less than one block, we're done */ if (len == 0) - goto out; + return; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; ASSERT(ISP2(blksz)); if (trunc) @@ -2101,16 +2110,23 @@ ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { + int res; if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + boolean_t dirty; /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); + db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER, + FTAG); + dirty = db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, type, FTAG); + if (dirty) { dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); @@ -2120,7 +2136,7 @@ /* If the range did not include a full block, we are done */ if (len == 0) - goto out; + return; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); @@ -2150,6 +2166,7 @@ * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); uint64_t first, last; first = blkid >> epbs; @@ -2194,6 +2211,7 @@ dnode_dirty_l1(dn, i, tx); } + rw_exit(&dn->dn_struct_rwlock); } done: @@ -2215,9 +2233,6 @@ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); -out: - - rw_exit(&dn->dn_struct_rwlock); } static boolean_t @@ -2329,6 +2344,8 @@ dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? 
-1 : 1; ASSERT(txg == 0 || !hole); @@ -2361,9 +2378,9 @@ return (error); } data = db->db.db_data; + rw_enter(&db->db_rwlock, RW_READER); } - if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { @@ -2423,8 +2440,10 @@ error = SET_ERROR(ESRCH); } - if (db) + if (db != NULL) { + rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); + } return (error); } diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/dnode_sync.c --- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Nov 17 10:47:03 2020 -0600 @@ -52,7 +52,6 @@ /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); @@ -62,8 +61,24 @@ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); + /* + * Lock ordering requires that we hold the children's db_mutexes (by + * calling dbuf_find()) before holding the parent's db_rwlock. The lock + * order is imposed by dbuf_read's steps of "grab the lock to protect + * db_parent, get db_parent, hold db_parent's db_rwlock". + */ + dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; + ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); + for (i = 0; i < nblkptr; i++) { + children[i] = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + } + /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); @@ -73,8 +88,7 @@ /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + dmu_buf_impl_t *child = children[i]; if (child == NULL) continue; @@ -107,6 +121,10 @@ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + rw_exit(&db->db_rwlock); + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); @@ -183,7 +201,7 @@ ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, + err = dbuf_hold_impl(dn, db->db_level - 1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) @@ -281,7 +299,9 @@ * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). 
*/ + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dmu_buf_unlock_parent(db, dblt, FTAG); dbuf_release_bp(db); bp = db->db.db_data; @@ -307,7 +327,9 @@ if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + rw_enter(&db->db_rwlock, RW_WRITER); + free_blocks(dn, bp, end - start + 1, tx); + rw_exit(&db->db_rwlock); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) @@ -324,10 +346,12 @@ } if (free_indirects) { + rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); @@ -379,7 +403,6 @@ VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/sys/dbuf.h --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Nov 17 10:47:03 2020 -0600 @@ -108,6 +108,12 @@ DR_OVERRIDDEN } override_states_t; +typedef enum db_lock_type { + DLT_NONE, + DLT_PARENT, + DLT_OBJSET +} db_lock_type_t; + typedef struct dbuf_dirty_record { /* link on our parents dirty list */ list_node_t dr_dirty_node; @@ -217,6 +223,22 @@ */ uint8_t db_level; + /* + * Protects db_buf's contents if they contain an indirect block or data + * block of the meta-dnode. We use this lock to protect the structure of + * the block tree. This means that when modifying this dbuf's data, we + * grab its rwlock. When modifying its parent's data (including the + * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering + * for this lock is: + * 1) dn_struct_rwlock + * 2) db_rwlock + * We don't currently grab multiple dbufs' db_rwlocks at once. + */ + krwlock_t db_rwlock; + + /* buffer holding our data */ + arc_buf_t *db_buf; + /* db_mtx protects the members below */ kmutex_t db_mtx; @@ -232,9 +254,6 @@ */ zfs_refcount_t db_holds; - /* buffer holding our data */ - arc_buf_t *db_buf; - kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; @@ -336,6 +355,8 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); +db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag); +void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag); boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); diff -r 790618c19823 -r 49e0cb1642f6 usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h --- a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h Fri Dec 04 11:41:39 2020 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h Tue Nov 17 10:47:03 2020 -0600 @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #ifndef _DMU_ZFETCH_H @@ -66,7 +66,8 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, + boolean_t); #ifdef __cplusplus
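
Editor's usage sketches (illustrative only; not part of the changeset):

The first sketch shows the read-side pattern this change introduces for db_blkptr: take the parent lock with dmu_buf_lock_parent() before dereferencing db_blkptr, copy out what is needed, then drop it with dmu_buf_unlock_parent(), passing back the returned db_lock_type_t so the correct lock (parent dbuf, objset, or none) is released. The helper name dbuf_blkptr_is_hole() is hypothetical; the calls, types, and tag usage are the ones the patch itself uses (compare the head/tail handling in dnode_free_range()). Assumes the usual ZFS kernel headers.

	#include <sys/zfs_context.h>
	#include <sys/dbuf.h>

	static boolean_t
	dbuf_blkptr_is_hole(dmu_buf_impl_t *db)
	{
		/* Protect db_blkptr from changing underneath us. */
		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
		boolean_t hole = (db->db_blkptr == NULL ||
		    BP_IS_HOLE(db->db_blkptr));
		/*
		 * Hand the lock type back; the "parent" may have been the
		 * objset's ds_bp_rwlock, or nothing at all (DLT_NONE) for the
		 * top-most indirect of the MOS meta-dnode.
		 */
		dmu_buf_unlock_parent(db, dblt, FTAG);
		return (hole);
	}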
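
The second sketch shows the writer-side rule for the new per-dbuf db_rwlock: a thread that modifies the contents of an indirect block or meta-dnode buffer takes that dbuf's own db_rwlock as writer, after any dn_struct_rwlock it may already hold, per the ordering documented in the dbuf.h comment. This mirrors the bzero()/bcopy() sites the patch wraps in dbuf_free_range(), dbuf_fix_old_data(), and dbuf_write_done(); the helper name is hypothetical.

	static void
	dbuf_zero_contents(dmu_buf_impl_t *db)
	{
		/* Writers of the buffer contents hold db_rwlock exclusively. */
		rw_enter(&db->db_rwlock, RW_WRITER);
		bzero(db->db.db_data, db->db.db_size);
		rw_exit(&db->db_rwlock);
		/* Readers (e.g. dbuf_findbp, dnode_next_offset) use RW_READER. */
	}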
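
Finally, a sketch of the widened dmu_zfetch() contract: the new fifth argument (have_lock) tells the prefetcher whether the caller already holds dn_struct_rwlock, so dmu_zfetch() only acquires that lock itself when the caller does not. The surrounding variables (dn, db, flags, prefetch) are assumed from the dbuf_read() context shown in the diff.

	if (prefetch) {
		/*
		 * have_lock is nonzero when the caller entered with
		 * DB_RF_HAVESTRUCT; dmu_zfetch() then skips taking
		 * dn_struct_rwlock as reader internally.
		 */
		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
		    flags & DB_RF_HAVESTRUCT);
	}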