changeset 14022:624fa8f61951

3642 dsl_scan_active() should not issue I/O to determine if async destroying is active
3643 txg_delay should not hold the tc_lock
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Gordon Ross <gwr@nexenta.com>
author George Wilson <george.wilson@delphix.com>
date Tue, 23 Apr 2013 09:31:42 -0800
parents 9915df7eb304
children 3843f7c5f635
files usr/src/uts/common/fs/zfs/dsl_destroy.c usr/src/uts/common/fs/zfs/dsl_scan.c usr/src/uts/common/fs/zfs/sys/dsl_scan.h usr/src/uts/common/fs/zfs/sys/txg_impl.h usr/src/uts/common/fs/zfs/txg.c
diffstat 5 files changed, 75 insertions(+), 15 deletions(-)
--- a/usr/src/uts/common/fs/zfs/dsl_destroy.c	Wed Apr 17 20:25:52 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c	Tue Apr 23 09:31:42 2013 -0800
@@ -753,12 +753,16 @@
 		zil_destroy_sync(dmu_objset_zil(os), tx);
 
 		if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+			dsl_scan_t *scn = dp->dp_scan;
+
 			spa_feature_incr(dp->dp_spa, async_destroy, tx);
 			dp->dp_bptree_obj = bptree_alloc(mos, tx);
 			VERIFY0(zap_add(mos,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 			    &dp->dp_bptree_obj, tx));
+			ASSERT(!scn->scn_async_destroying);
+			scn->scn_async_destroying = B_TRUE;
 		}
 
 		used = ds->ds_dir->dd_phys->dd_used_bytes;
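This hunk is where the new in-core flag is first set: the same syncing
context that activates the async_destroy feature on disk also latches
that state in the scan structure. A condensed sketch of the pattern
(a paraphrase of the hunk above, with the bptree setup elided):

    if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
            spa_feature_incr(dp->dp_spa, async_destroy, tx);  /* on disk */
            ASSERT(!dp->dp_scan->scn_async_destroying);
            dp->dp_scan->scn_async_destroying = B_TRUE;       /* in core */
    }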
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	Wed Apr 17 20:25:52 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	Tue Apr 23 09:31:42 2013 -0800
@@ -95,6 +95,15 @@
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
+	/*
+	 * It's possible that we're resuming a scan after a reboot, so
+	 * make sure that the scn_async_destroying flag is initialized
+	 * appropriately.
+	 */
+	ASSERT(!scn->scn_async_destroying);
+	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
@@ -1344,13 +1353,10 @@
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 
-	if (scn->scn_phys.scn_state == DSS_SCANNING)
+	if (scn->scn_phys.scn_state == DSS_SCANNING ||
+	    scn->scn_async_destroying)
 		return (B_TRUE);
 
-	if (spa_feature_is_active(spa,
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-		return (B_TRUE);
-	}
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
@@ -1406,6 +1412,7 @@
 
 		if (err == 0 && spa_feature_is_active(spa,
 		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+			ASSERT(scn->scn_async_destroying);
 			scn->scn_is_bptree = B_TRUE;
 			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 			    NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1426,6 +1433,7 @@
 				VERIFY0(bptree_free(dp->dp_meta_objset,
 				    dp->dp_bptree_obj, tx));
 				dp->dp_bptree_obj = 0;
+				scn->scn_async_destroying = B_FALSE;
 			}
 		}
 		if (scn->scn_visited_this_txg) {
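Together with the dsl_destroy.c hunk, these changes give
scn_async_destroying a complete in-memory lifecycle, which is the point
of issue 3642: the old dsl_scan_active() answered "is async destroy
active?" via spa_feature_is_active(), a check that can require reading
the pool's feature metadata from disk. A condensed view of the flag's
writers and its one hot reader, assuming the hunks above are its only
writers:

    /* scan init: seed from on-disk state, e.g. when resuming after reboot */
    scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
        &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);

    /* destroy path (dsl_destroy.c hunk): set when the feature activates */
    scn->scn_async_destroying = B_TRUE;

    /* scan sync path: clear once the bptree has been fully drained */
    scn->scn_async_destroying = B_FALSE;

    /* dsl_scan_active(): now a pure in-memory test, no I/O */
    if (scn->scn_phys.scn_state == DSS_SCANNING || scn->scn_async_destroying)
            return (B_TRUE);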
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h	Wed Apr 17 20:25:52 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h	Tue Apr 23 09:31:42 2013 -0800
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SCAN_H
@@ -82,6 +82,7 @@
 
 	/* for freeing blocks */
 	boolean_t scn_is_bptree;
+	boolean_t scn_async_destroying;
 
 	/* for debugging / information */
 	uint64_t scn_visited_this_txg;
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Wed Apr 17 20:25:52 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h	Tue Apr 23 09:31:42 2013 -0800
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_TXG_IMPL_H
@@ -37,14 +37,55 @@
 extern "C" {
 #endif
 
+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group, the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is per-cpu by design. Updates to it are frequent and
+ * concurrent. A single shared structure would suffer heavy lock
+ * contention, so the mutexes are fanned out across cpus. With the
+ * fanned-out mutex design, consumers only need to lock the mutex
+ * associated with their thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically this is a short operation, but if throttling
+ * is occurring it may be held for longer periods of time.
+ */
 struct tx_cpu {
-	kmutex_t	tc_lock;
+	kmutex_t	tc_open_lock;	/* protects tx_open_txg */
+	kmutex_t	tc_lock;	/* protects the rest of this struct */
 	kcondvar_t	tc_cv[TXG_SIZE];
 	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
-	char		tc_pad[16];		/* pad to fill 3 cache lines */
+	char		tc_pad[8];		/* pad to fill 3 cache lines */
 };
 
+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transaction groups. A per-pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead, a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
 typedef struct tx_state {
 	tx_cpu_t	*tx_cpu;	/* protects access to tx_open_txg */
 	kmutex_t	tx_sync_lock;	/* protects the rest of this struct */
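The new comments describe a split of responsibilities rather than new
behavior; the txg.c hunks below are the mechanical change that
implements it. Distilled into one place (a paraphrase of those hunks,
not code from the patch), the discipline is:

    /* holder: pin the open txg, then touch tc_count under a short hold */
    mutex_enter(&tc->tc_open_lock);         /* holds off txg_quiesce() */
    txg = tx->tx_open_txg;
    mutex_enter(&tc->tc_lock);              /* brief: counter update only */
    tc->tc_count[txg & TXG_MASK]++;
    mutex_exit(&tc->tc_lock);
    /* tc_open_lock stays held until txg_rele_to_quiesce() */

    /* quiescer: close the open txg by sweeping every cpu's tc_open_lock */
    for (c = 0; c < max_ncpus; c++)
            mutex_enter(&tx->tx_cpu[c].tc_open_lock);
    tx->tx_open_txg++;
    for (c = 0; c < max_ncpus; c++)
            mutex_exit(&tx->tx_cpu[c].tc_open_lock);

Shrinking what tc_lock protects is also the substance of issue 3643:
per the changeset title, txg_delay() should be able to wait without
holding a lock that every transaction assignment must take.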
--- a/usr/src/uts/common/fs/zfs/txg.c	Wed Apr 17 20:25:52 2013 -0400
+++ b/usr/src/uts/common/fs/zfs/txg.c	Tue Apr 23 09:31:42 2013 -0800
@@ -126,6 +126,8 @@
 		int i;
 
 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
 			    NULL);
@@ -168,6 +170,7 @@
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
 
+		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
@@ -292,10 +295,12 @@
 	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 	uint64_t txg;
 
-	mutex_enter(&tc->tc_lock);
+	mutex_enter(&tc->tc_open_lock);
+	txg = tx->tx_open_txg;
 
-	txg = tx->tx_open_txg;
+	mutex_enter(&tc->tc_lock);
 	tc->tc_count[txg & TXG_MASK]++;
+	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -308,7 +313,8 @@
 {
 	tx_cpu_t *tc = th->th_cpu;
 
-	mutex_exit(&tc->tc_lock);
+	ASSERT(!MUTEX_HELD(&tc->tc_lock));
+	mutex_exit(&tc->tc_open_lock);
 }
 
 void
@@ -345,10 +351,10 @@
 	int c;
 
 	/*
-	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 * Grab all tc_open_locks so nobody else can get into this txg.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_enter(&tx->tx_cpu[c].tc_lock);
+		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
@@ -361,7 +367,7 @@
 	 * enter the next transaction group.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_exit(&tx->tx_cpu[c].tc_lock);
+		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
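From a consumer's perspective the hold protocol is unchanged; only the
lock backing the open phase differs, and the new ASSERT in
txg_rele_to_quiesce() checks that callers arrive without tc_lock held.
A hypothetical caller, using just the functions touched above (dp is
assumed to be the caller's dsl_pool_t):

    txg_handle_t th;
    uint64_t txg;

    txg = txg_hold_open(dp, &th);  /* enters tc_open_lock, bumps tc_count */
    /* ... associate dirty state with txg ... */
    txg_rele_to_quiesce(&th);      /* drops tc_open_lock; txg may now close */
    /* ... later, once the change is handed to the sync path ... */
    txg_rele_to_sync(&th);         /* drops the tc_count hold */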