Mercurial > illumos > git > illumos-omnios
changeset 23880:a013b6c81317
13208 Create aggr fails when underlying links have more than 128 Tx rings
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Dan McDonald <danmcd@joyent.com>
author | Paul Winder <paul@winder.uk.net> |
---|---|
date | Mon, 12 Oct 2020 12:01:29 +0100 |
parents | 4483942be336 |
children | 7436e9af6a7e 1ac626b08c2e |
files | usr/src/uts/common/io/aggr/aggr_grp.c usr/src/uts/common/sys/aggr_impl.h |
diffstat | 2 files changed, 109 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/common/io/aggr/aggr_grp.c Fri Nov 20 20:15:52 2020 +0000 +++ b/usr/src/uts/common/io/aggr/aggr_grp.c Mon Oct 12 12:01:29 2020 +0100 @@ -607,6 +607,8 @@ port->lp_grp = grp; AGGR_GRP_REFHOLD(grp); grp->lg_nports++; + if (grp->lg_nports > grp->lg_nports_high) + grp->lg_nports_high = grp->lg_nports; aggr_lacp_init_port(port); mac_perim_exit(mph); @@ -675,7 +677,7 @@ * No slot for this new RX ring. */ if (j == MAX_RINGS_PER_GROUP) - return (EIO); + return (ENOSPC); ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; @@ -884,7 +886,7 @@ * No slot for this new TX ring. */ if (i == MAX_RINGS_PER_GROUP) - return (EIO); + return (ENOSPC); /* * The following 4 statements needs to be done before * calling mac_group_add_ring(). Otherwise it will @@ -948,7 +950,8 @@ * rings of the aggr and the hardware rings of the underlying port. */ static int -aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) +aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp, + uint_t limit) { aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; @@ -956,6 +959,9 @@ int hw_rh_cnt, i = 0, j; int err = 0; + if (limit == 0) + return (ENOSPC); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -973,12 +979,13 @@ if (hw_rh_cnt == 0) port->lp_tx_ring_cnt = 1; else - port->lp_tx_ring_cnt = hw_rh_cnt; - + port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit); + + port->lp_tx_ring_alloc = port->lp_tx_ring_cnt; port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * - port->lp_tx_ring_cnt), KM_SLEEP); + port->lp_tx_ring_alloc), KM_SLEEP); port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * - port->lp_tx_ring_cnt), KM_SLEEP); + port->lp_tx_ring_alloc), KM_SLEEP); if (hw_rh_cnt == 0) { if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, @@ -987,7 +994,7 @@ port->lp_pseudo_tx_rings[0] = pseudo_rh; } } else { - for (i = 0; err == 0 && i < hw_rh_cnt; i++) { + for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) { err = aggr_add_pseudo_tx_ring(port, tx_grp, hw_rh[i], &pseudo_rh); if (err != 0) @@ -1005,10 +1012,11 @@ } } kmem_free(port->lp_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); kmem_free(port->lp_pseudo_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); port->lp_tx_ring_cnt = 0; + port->lp_tx_ring_alloc = 0; } else { port->lp_tx_grp_added = B_TRUE; port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, @@ -1042,9 +1050,9 @@ aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); kmem_free(port->lp_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); kmem_free(port->lp_pseudo_tx_rings, - (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); + (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); port->lp_tx_ring_cnt = 0; (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); @@ -1111,6 +1119,48 @@ } /* + * Trim each port in a group to ensure it uses no more than tx_ring_limit + * rings. + */ +static void +aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit) +{ + aggr_port_t *port; + mac_perim_handle_t mph; + uint_t i, tx_ring_cnt; + + ASSERT(tx_ring_limit > 0); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + + for (port = grp->lg_ports; port != NULL; port = port->lp_next) { + mac_perim_enter_by_mh(port->lp_mh, &mph); + + /* + * Reduce the Tx ring count first to prevent rings being + * used as they are removed. + */ + rw_enter(&grp->lg_tx_lock, RW_WRITER); + if (port->lp_tx_ring_cnt <= tx_ring_limit) { + rw_exit(&grp->lg_tx_lock); + mac_perim_exit(mph); + continue; + } + + tx_ring_cnt = port->lp_tx_ring_cnt; + port->lp_tx_ring_cnt = tx_ring_limit; + rw_exit(&grp->lg_tx_lock); + + for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) { + aggr_rem_pseudo_tx_ring(&grp->lg_tx_group, + port->lp_pseudo_tx_rings[i]); + + } + + mac_perim_exit(mph); + } +} + +/* * Add one or more ports to an existing link aggregation group. */ int @@ -1120,6 +1170,7 @@ int rc; uint_t port_added = 0; uint_t grp_added; + uint_t nports_high, tx_ring_limit; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; @@ -1140,6 +1191,24 @@ mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); + /* + * Limit the number of Tx rings per port. When determining the + * number of ports take into consideration the existing high + * value, and what the new high value may be after this request. + */ + nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports); + tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high; + + if (tx_ring_limit == 0) { + rc = ENOSPC; + goto bail; + } + + /* + * Balance the Tx rings so each port has a fair share of rings. + */ + aggr_grp_balance_tx(grp, tx_ring_limit); + /* Add the specified ports to the aggr. */ for (uint_t i = 0; i < nports; i++) { grp_added = 0; @@ -1164,7 +1233,8 @@ * Create the pseudo ring for each HW ring of the underlying * port. */ - rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); + rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, + tx_ring_limit); if (rc != 0) goto bail; @@ -1380,6 +1450,7 @@ mac_perim_handle_t mph, pmph; datalink_id_t tempid; boolean_t mac_registered = B_FALSE; + uint_t tx_ring_limit; int err; int i, j; kt_did_t tid = 0; @@ -1551,6 +1622,25 @@ aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* + * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP + * rings, when all the Tx rings of all the ports are accumulated + * it is conceivable this limit is exceeded. We try and prevent + * this by limiting the number of rings an individual port will use. + * + * - When an aggr is first created, we will not let an + * individual port use more than MAX_RINGS_PER_GROUP/nports + * rings. + * - As ports are added to an existing aggr, each of the + * ports will not use more than MAX_RINGS_PER_GROUP/nports_high. + * Where nports_high is the highest number of ports the aggr has + * held (including any ports being added). This may involve + * trimming rings from existing ports. + */ + + /* Leave room for 4 ports */ + tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports); + + /* * Attach each port if necessary. */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { @@ -1559,7 +1649,8 @@ * underlying port. Note that this is done after the * aggr registers its MAC. */ - err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); + err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, + tx_ring_limit); if (err != 0) { mac_perim_exit(mph);
--- a/usr/src/uts/common/sys/aggr_impl.h Fri Nov 20 20:15:52 2020 +0000 +++ b/usr/src/uts/common/sys/aggr_impl.h Mon Oct 12 12:01:29 2020 +0100 @@ -23,6 +23,7 @@ * Use is subject to license terms. * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ #ifndef _SYS_AGGR_IMPL_H @@ -161,7 +162,8 @@ */ mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT]; - int lp_tx_ring_cnt; + uint_t lp_tx_ring_alloc; + uint_t lp_tx_ring_cnt; /* handles of the underlying HW TX rings */ mac_ring_handle_t *lp_tx_rings; /* @@ -195,6 +197,7 @@ uint16_t lg_key; /* key (group port number) */ uint32_t lg_refs; /* refcount */ uint16_t lg_nports; /* number of MAC ports */ + uint16_t lg_nports_high; /* highest no. of MAC ports */ uint8_t lg_addr[ETHERADDRL]; /* group MAC address */ uint16_t lg_closing : 1,