Mercurial > illumos > git > illumos-joyent
diff usr/src/uts/common/fs/zfs/dbuf.c @ 25635:ce2b70e7aab0
[illumos-gate merge]
commit 0a554e9f2c0d440dc40a97fae2d18f1d428ca786
13404 man page spelling errors
commit 9f76c6ed5b6ee0cc0bf631daca15ac3dc5fc70c4
13400 zfs-tests: implicit conversion from 'enum dmu_objset_type' to 'enum lzc_dataset_type'
commit ef96fc31fc4f4306719704352d5c3e33573c039f
13399 zfs: error: implicit conversion from 'boolean_t' to 'ds_hold_flags_t'
commit 56870e8c76c2675bcef1fcee5d519585ce9c768e
13393 cheetah: case value '47616' not in enumerated type
commit 8247326397b1a16f37e70cf13f5b7a4f50d06712
13403 zfs: symbol 'g_zfs' is multiply-defined
commit 436b964b19ef06803ad9165542d80d9d731d6486
13402 zpool: symbol 'g_zfs' is multiply-defined
commit 99308ed0417a2b8ab73c5856a8a5345ce2a7aea7
13396 PoolsExecption typo in resource pools javadoc
commit 1575b751c16622553e958c1e5c45e59c86b15c6e
13392 px: case value '3' not in enumerated type
commit 9b0429a10eec9313ec782d8421272aff70adbfdc
13339 Add support for Hygon Dhyana Family 18h processor
commit d20422bd742384b77102bb3bd09e0dc4b7372e50
13351 loader: vbe_find_mode_xydm() is using wrong safety and iteration is buggy
commit 174b8e455f9a6974e69fa4e28792580acde0892d
13311 uptime(1) dazed and confused for a minute after boot
commit f816551bb187d104fbf2757703d7a5d2189a3a18
13401 eeprom: 'lv' may be used uninitialized in this function
commit 5e96da73c99d9d17ff5a58b793fff2ab6dcadf25
13391 fm: build errors with gcc 7 on SPARC
commit 58b55f701e285559e4799354996fd284238ed0d4
13398 libstand: xdrproc_t should return bool
commit c6a28d7650029501a356f7b75b2a10a5c4430cef
13394 fhc: case value '4294967295' not in enumerated type
commit 58d4b16fe601073f2408de78e3db7e9bfa9abfd2
13355 remove topo module warning gags
commit 1473b8d60e902819558a8b0e8a257eb0d754c3c3
13388 ZFS list bookmark not working on zvols
commit 4bba12ca5cd6f92aaf0d4c0d19d05528110bc095
13368 libbe_py should support temporary BE activation
commit a92282e44f968185a6bba094d1e5fece2da819cf
13376 fm: variable may be used uninitialized
commit 8b1df8bf71b7b62e7e4d46fe6b457d4d6447b2b8
13367 beadm activate -t should not promote new BE datasets
commit 9704bf7fb82e71b685e194a967937ff03843e73a
13317 Decrease contention on dn_struct_rwlock
commit 88a08813800ed7ba7c927986421cee437f7f2233
13363 ctfconvert could support more granular ignore for missing debug data
commit 3dd4cd56e7843e01a8ab147a0d102cd4f6d732c1
13342 ctfconvert could encode _Float128 for 32-bit objects
commit 73197b540cc5f0434c409b68ca9e1a514a6ce91b
13336 ctfconvert should be able to unconditionally attempt conversion
commit dd4422524768709a579a2a93a10c78a88a6b0ecb
13280 CTF: provide option to truncate and continue
Conflicts & other fixes (with help from Jason King <jbk@joyent.com>):
usr/src/lib/fm/topo/modules/common/ipmi/ipmi_enum.c
usr/src/lib/libctf/common/ctf_convert.c
usr/src/lib/libctf/common/ctf_lib.c
usr/src/lib/libctf/common/libctf.h
usr/src/lib/libproc/common/Psymtab.c
usr/src/man/man1/ld.so.1.1
usr/src/man/man4/process.4
author | Dan McDonald <danmcd@joyent.com> |
---|---|
date | Mon, 04 Jan 2021 14:49:49 -0500 |
parents | d7b4b7dec331 49e0cb1642f6 |
children |
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/dbuf.c Sun Jan 03 09:14:04 2021 -0500 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Mon Jan 04 14:49:49 2021 -0500 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -176,6 +176,7 @@ bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -189,6 +190,7 @@ { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); + rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); @@ -789,10 +791,10 @@ db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer + * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -868,6 +870,44 @@ db->db_state = DB_UNCACHED; } +/* + * This function is used to lock the parent of the provided dbuf. This should be + * used when modifying or reading db_blkptr. + */ +db_lock_type_t +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +{ + enum db_lock_type ret = DLT_NONE; + if (db->db_parent != NULL) { + rw_enter(&db->db_parent->db_rwlock, rw); + ret = DLT_PARENT; + } else if (dmu_objset_ds(db->db_objset) != NULL) { + rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, + tag); + ret = DLT_OBJSET; + } + /* + * We only return a DLT_NONE lock when it's the top-most indirect block + * of the meta-dnode of the MOS. + */ + return (ret); +} + +/* + * We need to pass the lock type in because it's possible that the block will + * move from being the topmost indirect block in a dnode (and thus, have no + * parent) to not the top-most via an indirection increase. This would cause a + * panic if we didn't pass the lock type in. + */ +void +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +{ + if (type == DLT_PARENT) + rw_exit(&db->db_parent->db_rwlock); + else if (type == DLT_OBJSET) + rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); +} + static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { @@ -1042,8 +1082,13 @@ return (err); } +/* + * Drops db_mtx and the parent lock specified by dblt and tag before + * returning. + */ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, void *tag) { dnode_t *dn; zbookmark_phys_t zb; @@ -1053,11 +1098,11 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); + ASSERT(db->db_parent == NULL || + RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { /* @@ -1094,6 +1139,7 @@ DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1134,6 +1180,7 @@ DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1150,12 +1197,14 @@ "object set %llu", dmu_objset_id(db->db_objset)); DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (SET_ERROR(EIO)); } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) { DB_DNODE_EXIT(db); + dmu_buf_unlock_parent(db, dblt, tag); mutex_exit(&db->db_mtx); return (err); } @@ -1175,11 +1224,18 @@ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; - - err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + /* + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on + * an l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would be a lock ordering violation. + */ + blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); + (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - return (err); } @@ -1278,8 +1334,6 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && @@ -1316,29 +1370,32 @@ dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (!err && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + /* + * dbuf_read_impl has dropped db_mtx and our parent's rwlock + * for us + */ + if (!err && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); if (!err && need_wait) @@ -1353,10 +1410,10 @@ * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ @@ -1536,7 +1593,9 @@ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1558,15 +1617,6 @@ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. - */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers @@ -1643,8 +1693,8 @@ dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; + boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -1846,15 +1896,21 @@ return (dr); } - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. - */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; + drop_struct_rwlock = B_TRUE; + } + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + if (db->db_blkptr != NULL) { + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + ddt_prefetch(os->os_spa, db->db_blkptr); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* @@ -1867,19 +1923,12 @@ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. - */ - ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, - drop_struct_lock, B_FALSE); + drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } @@ -1890,15 +1939,14 @@ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, + parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); + ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); @@ -1919,14 +1967,14 @@ } mutex_exit(&db->db_mtx); } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } @@ -2447,10 +2495,12 @@ *parentp = NULL; return (err); } + rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); + rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ @@ -2695,7 +2745,7 @@ if (blkid > dn->dn_maxblkid) return; - if (dnode_block_freed(dn, blkid)) + if (level == 0 && dnode_block_freed(dn, blkid)) return; /* @@ -2841,7 +2891,9 @@ DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } + rw_enter(&db->db_rwlock, RW_WRITER); bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + rw_exit(&db->db_rwlock); } /* @@ -2967,7 +3019,6 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); @@ -2976,12 +3027,7 @@ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); return (0); } @@ -3697,9 +3743,9 @@ mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* ARGSUSED */ @@ -3740,9 +3786,9 @@ * anybody from reading the blocks we're about to * zero out. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } @@ -3932,7 +3978,7 @@ } static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); @@ -3946,14 +3992,16 @@ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* - * The struct_rwlock prevents dbuf_read_impl() from + * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (rw != NULL) + rw_enter(rw, RW_WRITER); *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); + if (rw != NULL) + rw_exit(rw); } } @@ -4026,7 +4074,7 @@ if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); + dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; @@ -4034,7 +4082,10 @@ DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : + &dn->dn_dbuf->db_rwlock); + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, + tx); } } }