changeset 23880:a013b6c81317

13208 Create aggr fails when underlying links have more than 128 Tx rings Reviewed by: Robert Mustacchi <rm@fingolfin.org> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Dan McDonald <danmcd@joyent.com>
author Paul Winder <paul@winder.uk.net>
date Mon, 12 Oct 2020 12:01:29 +0100
parents 4483942be336
children 7436e9af6a7e 1ac626b08c2e
files usr/src/uts/common/io/aggr/aggr_grp.c usr/src/uts/common/sys/aggr_impl.h
diffstat 2 files changed, 109 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/io/aggr/aggr_grp.c	Fri Nov 20 20:15:52 2020 +0000
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c	Mon Oct 12 12:01:29 2020 +0100
@@ -607,6 +607,8 @@
 	port->lp_grp = grp;
 	AGGR_GRP_REFHOLD(grp);
 	grp->lg_nports++;
+	if (grp->lg_nports > grp->lg_nports_high)
+		grp->lg_nports_high = grp->lg_nports;
 
 	aggr_lacp_init_port(port);
 	mac_perim_exit(mph);
@@ -675,7 +677,7 @@
 	 * No slot for this new RX ring.
 	 */
 	if (j == MAX_RINGS_PER_GROUP)
-		return (EIO);
+		return (ENOSPC);
 
 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 	ring->arr_hw_rh = hw_rh;
@@ -884,7 +886,7 @@
 	 * No slot for this new TX ring.
 	 */
 	if (i == MAX_RINGS_PER_GROUP)
-		return (EIO);
+		return (ENOSPC);
 	/*
 	 * The following 4 statements needs to be done before
 	 * calling mac_group_add_ring(). Otherwise it will
@@ -948,7 +950,8 @@
  * rings of the aggr and the hardware rings of the underlying port.
  */
 static int
-aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
+aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp,
+    uint_t limit)
 {
 	aggr_grp_t		*grp = port->lp_grp;
 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
@@ -956,6 +959,9 @@
 	int			hw_rh_cnt, i = 0, j;
 	int			err = 0;
 
+	if (limit == 0)
+		return (ENOSPC);
+
 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
 
@@ -973,12 +979,13 @@
 	if (hw_rh_cnt == 0)
 		port->lp_tx_ring_cnt = 1;
 	else
-		port->lp_tx_ring_cnt = hw_rh_cnt;
-
+		port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit);
+
+	port->lp_tx_ring_alloc = port->lp_tx_ring_cnt;
 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
-	    port->lp_tx_ring_cnt), KM_SLEEP);
+	    port->lp_tx_ring_alloc), KM_SLEEP);
 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
-	    port->lp_tx_ring_cnt), KM_SLEEP);
+	    port->lp_tx_ring_alloc), KM_SLEEP);
 
 	if (hw_rh_cnt == 0) {
 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
@@ -987,7 +994,7 @@
 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
 		}
 	} else {
-		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
+		for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) {
 			err = aggr_add_pseudo_tx_ring(port,
 			    tx_grp, hw_rh[i], &pseudo_rh);
 			if (err != 0)
@@ -1005,10 +1012,11 @@
 			}
 		}
 		kmem_free(port->lp_tx_rings,
-		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
+		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
 		kmem_free(port->lp_pseudo_tx_rings,
-		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
+		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
 		port->lp_tx_ring_cnt = 0;
+		port->lp_tx_ring_alloc = 0;
 	} else {
 		port->lp_tx_grp_added = B_TRUE;
 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
@@ -1042,9 +1050,9 @@
 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
 
 	kmem_free(port->lp_tx_rings,
-	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
+	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
 	kmem_free(port->lp_pseudo_tx_rings,
-	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
+	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
 
 	port->lp_tx_ring_cnt = 0;
 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
@@ -1111,6 +1119,48 @@
 }
 
 /*
+ * Trim each port in a group to ensure it uses no more than tx_ring_limit
+ * rings.
+ */
+static void
+aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit)
+{
+	aggr_port_t *port;
+	mac_perim_handle_t mph;
+	uint_t i, tx_ring_cnt;
+
+	ASSERT(tx_ring_limit > 0);
+	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+
+	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
+		mac_perim_enter_by_mh(port->lp_mh, &mph);
+
+		/*
+		 * Reduce the Tx ring count first to prevent rings being
+		 * used as they are removed.
+		 */
+		rw_enter(&grp->lg_tx_lock, RW_WRITER);
+		if (port->lp_tx_ring_cnt <= tx_ring_limit) {
+			rw_exit(&grp->lg_tx_lock);
+			mac_perim_exit(mph);
+			continue;
+		}
+
+		tx_ring_cnt = port->lp_tx_ring_cnt;
+		port->lp_tx_ring_cnt = tx_ring_limit;
+		rw_exit(&grp->lg_tx_lock);
+
+		for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) {
+			aggr_rem_pseudo_tx_ring(&grp->lg_tx_group,
+			    port->lp_pseudo_tx_rings[i]);
+
+		}
+
+		mac_perim_exit(mph);
+	}
+}
+
+/*
  * Add one or more ports to an existing link aggregation group.
  */
 int
@@ -1120,6 +1170,7 @@
 	int rc;
 	uint_t port_added = 0;
 	uint_t grp_added;
+	uint_t nports_high, tx_ring_limit;
 	aggr_grp_t *grp = NULL;
 	aggr_port_t *port;
 	boolean_t link_state_changed = B_FALSE;
@@ -1140,6 +1191,24 @@
 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
 	rw_exit(&aggr_grp_lock);
 
+	/*
+	 * Limit the number of Tx rings per port. When determining the
+	 * number of ports take into consideration the existing high
+	 * value, and what the new high value may be after this request.
+	 */
+	nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports);
+	tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high;
+
+	if (tx_ring_limit == 0) {
+		rc = ENOSPC;
+		goto bail;
+	}
+
+	/*
+	 * Balance the Tx rings so each port has a fair share of rings.
+	 */
+	aggr_grp_balance_tx(grp, tx_ring_limit);
+
 	/* Add the specified ports to the aggr. */
 	for (uint_t i = 0; i < nports; i++) {
 		grp_added = 0;
@@ -1164,7 +1233,8 @@
 		 * Create the pseudo ring for each HW ring of the underlying
 		 * port.
 		 */
-		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
+		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
+		    tx_ring_limit);
 		if (rc != 0)
 			goto bail;
 
@@ -1380,6 +1450,7 @@
 	mac_perim_handle_t mph, pmph;
 	datalink_id_t tempid;
 	boolean_t mac_registered = B_FALSE;
+	uint_t tx_ring_limit;
 	int err;
 	int i, j;
 	kt_did_t tid = 0;
@@ -1551,6 +1622,25 @@
 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
 
 	/*
+	 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP
+	 * rings, when all the Tx rings of all the ports are accumulated
+	 * it is conceivable this limit is exceeded. We try and prevent
+	 * this by limiting the number of rings an individual port will use.
+	 *
+	 * - When an aggr is first created, we will not let an
+	 *   individual port use more than MAX_RINGS_PER_GROUP/nports
+	 *   rings.
+	 * - As ports are added to an existing aggr, each of the
+	 *   ports will not use more than MAX_RINGS_PER_GROUP/nports_high.
+	 *   Where nports_high is the highest number of ports the aggr has
+	 *   held (including any ports being added). This may involve
+	 *   trimming rings from existing ports.
+	 */
+
+	/* Leave room for 4 ports */
+	tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports);
+
+	/*
 	 * Attach each port if necessary.
 	 */
 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
@@ -1559,7 +1649,8 @@
 		 * underlying port. Note that this is done after the
 		 * aggr registers its MAC.
 		 */
-		err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
+		err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
+		    tx_ring_limit);
 
 		if (err != 0) {
 			mac_perim_exit(mph);
--- a/usr/src/uts/common/sys/aggr_impl.h	Fri Nov 20 20:15:52 2020 +0000
+++ b/usr/src/uts/common/sys/aggr_impl.h	Mon Oct 12 12:01:29 2020 +0100
@@ -23,6 +23,7 @@
  * Use is subject to license terms.
  * Copyright 2012 OmniTI Computer Consulting, Inc  All rights reserved.
  * Copyright 2018 Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
  */
 
 #ifndef	_SYS_AGGR_IMPL_H
@@ -161,7 +162,8 @@
 	 */
 	mac_group_handle_t	lp_hwghs[MAX_GROUPS_PER_PORT];
 
-	int			lp_tx_ring_cnt;
+	uint_t			lp_tx_ring_alloc;
+	uint_t			lp_tx_ring_cnt;
 	/* handles of the underlying HW TX rings */
 	mac_ring_handle_t	*lp_tx_rings;
 	/*
@@ -195,6 +197,7 @@
 	uint16_t	lg_key;			/* key (group port number) */
 	uint32_t	lg_refs;		/* refcount */
 	uint16_t	lg_nports;		/* number of MAC ports */
+	uint16_t	lg_nports_high;		/* highest no. of MAC ports */
 	uint8_t		lg_addr[ETHERADDRL];	/* group MAC address */
 	uint16_t
 			lg_closing : 1,