changeset 9062:fdba8b9800fa onnv_111

6808905 recursive mutex_enter in page_get_physical on xpv with BSST fork test
author Vikram Hegde <Vikram.Hegde@Sun.COM>
date Mon, 16 Mar 2009 21:18:21 -0700
parents c3ecb8a7a205
children a12fa8c43965
files usr/src/uts/i86pc/io/intel_iommu.c usr/src/uts/i86pc/io/iommu_rscs.c usr/src/uts/i86pc/sys/intel_iommu.h usr/src/uts/i86pc/sys/iommu_rscs.h usr/src/uts/i86pc/sys/machsystm.h usr/src/uts/i86pc/vm/htable.c usr/src/uts/i86pc/vm/vm_machdep.c
diffstat 7 files changed, 173 insertions(+), 110 deletions(-)
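The synopsis describes the classic self-deadlock: the old page_get_physical() serialized every caller through a static pgp_mutex (removed in the vm_machdep.c hunk below), and on xpv the page-table-growing path could call back into page_get_physical() while that lock was already held; mutex_enter() on a mutex the thread already owns is fatal. A minimal userland analogue of the failure, with pthreads standing in for kernel mutexes (the call chain is illustrative, not the exact xpv path):

#include <pthread.h>
#include <stdio.h>

/* stand-in for the static pgp_mutex this changeset removes */
static pthread_mutex_t pgp_mutex = PTHREAD_MUTEX_INITIALIZER;

static void
grow_page_tables(void)
{
	/*
	 * Second acquisition by the owning thread: a default
	 * (non-recursive) mutex deadlocks here; mutex_enter() panics.
	 */
	(void) pthread_mutex_lock(&pgp_mutex);
	(void) pthread_mutex_unlock(&pgp_mutex);
}

static void
page_get_physical_old(void)
{
	(void) pthread_mutex_lock(&pgp_mutex);
	grow_page_tables();	/* re-enters with the lock held */
	(void) pthread_mutex_unlock(&pgp_mutex);
}

int
main(void)
{
	page_get_physical_old();	/* hangs here */
	(void) printf("not reached\n");
	return (0);
}

The fix drops the lock entirely: each caller now derives a unique page_create_va() offset from a seed, so no global serialization is needed.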
--- a/usr/src/uts/i86pc/io/intel_iommu.c	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/io/intel_iommu.c	Mon Mar 16 21:18:21 2009 -0700
@@ -274,18 +274,17 @@
 static paddr_t
 iommu_get_page(intel_iommu_state_t *iommu, int kmflag)
 {
-	paddr_t paddr;
+	iommu_pghdl_t *pghdl;
 	caddr_t vaddr;
 
-	paddr = iommu_page_alloc(kmflag);
-	vaddr = iommu_page_map(paddr);
+	pghdl = iommu_page_alloc(iommu, kmflag);
+	vaddr = pghdl->vaddr;
 	bzero(vaddr, IOMMU_PAGE_SIZE);
 	iommu->iu_dmar_ops->do_clflush(vaddr, IOMMU_PAGE_SIZE);
-	iommu_page_unmap(vaddr);
 
 	page_num++;
 
-	return (paddr);
+	return (pghdl->paddr);
 }
 
 /*
@@ -293,9 +292,9 @@
  *   free the iommu page allocated with iommu_get_page
  */
 static void
-iommu_free_page(paddr_t paddr)
+iommu_free_page(intel_iommu_state_t *iommu, paddr_t paddr)
 {
-	iommu_page_free(paddr);
+	iommu_page_free(iommu, paddr);
 	page_num--;
 }
 
@@ -339,7 +338,7 @@
 static void
 destroy_iommu_state(intel_iommu_state_t *iommu)
 {
-	iommu_free_page(iommu->iu_root_entry_paddr);
+	iommu_free_page(iommu, iommu->iu_root_entry_paddr);
 	iommu_rscs_fini(&(iommu->iu_domain_id_hdl));
 	mutex_destroy(&(iommu->iu_reg_lock));
 	mutex_destroy(&(iommu->iu_root_context_lock));
@@ -1827,14 +1826,15 @@
 	/*
 	 * create the first level page table
 	 */
-	domain->dm_page_table_paddr =
-	    iommu_get_page(domain->dm_iommu, KM_SLEEP);
+	domain->dm_page_table_paddr = iommu_get_page(domain->dm_iommu,
+	    KM_SLEEP);
 
 	/*
 	 * init the CPU available page tables
 	 */
 	domain->dm_pt_tree.vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP);
-	domain->dm_pt_tree.pp = iommu_page_map(domain->dm_page_table_paddr);
+	domain->dm_pt_tree.pp = iommu_get_vaddr(domain->dm_iommu,
+	    domain->dm_page_table_paddr);
 	domain->dm_identity = B_FALSE;
 
 	/*
@@ -1972,7 +1972,7 @@
 	/*
 	 * set root entry
 	 */
-	root = iommu_page_map(iommu->iu_root_entry_paddr);
+	root = iommu_get_vaddr(iommu, iommu->iu_root_entry_paddr);
 	rce = (iorce_t)root + bus;
 	mutex_enter(&(iommu->iu_root_context_lock));
 	if (!ROOT_ENTRY_GET_P(rce)) {
@@ -1980,10 +1980,10 @@
 		ROOT_ENTRY_SET_P(rce);
 		ROOT_ENTRY_SET_CTP(rce, paddr);
 		iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce));
-		context = iommu_page_map(paddr);
+		context = iommu_get_vaddr(iommu, paddr);
 	} else {
 		paddr = ROOT_ENTRY_GET_CTP(rce);
-		context = iommu_page_map(paddr);
+		context = iommu_get_vaddr(iommu, paddr);
 	}
 
 	/* set context entry */
@@ -2003,8 +2003,6 @@
 	}
 
 	mutex_exit(&(iommu->iu_root_context_lock));
-	iommu_page_unmap(root);
-	iommu_page_unmap(context);
 
 	/* cache mode set, flush context cache */
 	if (IOMMU_CAP_GET_CM(iommu->iu_capability)) {
@@ -2300,7 +2298,7 @@
 		domain->dm_iommu->iu_dmar_ops->do_clflush((caddr_t)pte,
 		    sizeof (*pte));
 		vpte->vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP);
-		vpte->pp = iommu_page_map(child);
+		vpte->pp = iommu_get_vaddr(domain->dm_iommu, child);
 	}
 
 	return (vpte);
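Net effect of the intel_iommu.c hunks: the map/unmap pairs around every access are gone; iommu_page_alloc() returns a handle whose kernel mapping (pghdl->vaddr) lives until the page is freed, and iommu_get_vaddr() recovers it from the paddr the hardware uses. A hypothetical caller under the new interface (sketch only; error handling elided, declarations as in iommu_rscs.h below):

static paddr_t
example_alloc_table_page(intel_iommu_state_t *iommu)
{
	iommu_pghdl_t *pghdl;

	pghdl = iommu_page_alloc(iommu, KM_SLEEP);
	bzero(pghdl->vaddr, IOMMU_PAGE_SIZE);	/* CPU view, mapped for life */
	/*
	 * Hand pghdl->paddr to the hardware; later accesses use
	 * iommu_get_vaddr(iommu, paddr), and iommu_page_free(iommu, paddr)
	 * tears the mapping and the page down together.
	 */
	return (pghdl->paddr);
}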
--- a/usr/src/uts/i86pc/io/iommu_rscs.c	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/io/iommu_rscs.c	Mon Mar 16 21:18:21 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -50,8 +50,28 @@
 #include <vm/hat_i86.h>
 #include <sys/machsystm.h>
 #include <sys/iommu_rscs.h>
+#include <sys/intel_iommu.h>
 
+ddi_dma_attr_t page_dma_attr = {
+	DMA_ATTR_V0,
+	0U,		/* dma_attr_addr_lo */
+	0xffffffffU,	/* dma_attr_addr_hi */
+	0xffffffffU,	/* dma_attr_count_max */
+	MMU_PAGESIZE,	/* dma_attr_align: page aligned */
+	0x1,		/* dma_attr_burstsizes */
+	0x1,		/* dma_attr_minxfer */
+	0xffffffffU,	/* dma_attr_maxxfer */
+	0xffffffffU,	/* dma_attr_seg */
+	1,		/* dma_attr_sgllen */
+	4,		/* dma_attr_granular */
+	0		/* dma_attr_flags */
+};
 
+ddi_device_acc_attr_t page_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,
+	DDI_STRICTORDER_ACC
+};
 
 typedef struct iommu_rscs_s {
 	/*
@@ -78,88 +98,119 @@
 	kmutex_t rs_mutex;
 } iommu_rscs_state_t;
 
+static uint_t
+iommu_pghdl_hash_func(paddr_t paddr)
+{
+	return ((paddr >> MMU_PAGESHIFT) % IOMMU_PGHDL_HASH_SIZE);
+}
 
 /*
  * iommu_page_alloc()
  *
  */
-paddr_t
-iommu_page_alloc(int kmflag)
+iommu_pghdl_t *
+iommu_page_alloc(intel_iommu_state_t *iommu, int kmflag)
 {
-	paddr_t paddr;
-	page_t *pp;
+	size_t actual_size = 0;
+	iommu_pghdl_t *pghdl;
+	caddr_t vaddr;
+	uint_t idx;
 
 	ASSERT(kmflag == KM_SLEEP || kmflag == KM_NOSLEEP);
 
-	pp = page_get_physical(kmflag);
-	if (pp == NULL) {
-		return (NULL);
+	pghdl = kmem_zalloc(sizeof (*pghdl), kmflag);
+	if (pghdl == NULL) {
+		return (0);
+	}
+
+	if (ddi_dma_alloc_handle(ddi_root_node(), &page_dma_attr,
+	    (kmflag == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
+	    NULL, &pghdl->dma_hdl) != DDI_SUCCESS) {
+		kmem_free(pghdl, sizeof (*pghdl));
+		return (0);
+	}
+
+	if (ddi_dma_mem_alloc(pghdl->dma_hdl, PAGESIZE, &page_acc_attr,
+	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
+	    (kmflag == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
+	    NULL, &vaddr, &actual_size, &pghdl->mem_hdl) != DDI_SUCCESS) {
+		ddi_dma_free_handle(&pghdl->dma_hdl);
+		kmem_free(pghdl, sizeof (*pghdl));
+		return (0);
 	}
 
-	paddr =  pa_to_ma((uint64_t)pp->p_pagenum << PAGESHIFT);
+	ASSERT(actual_size == PAGESIZE);
+
+	if (actual_size != PAGESIZE) {
+		ddi_dma_mem_free(&pghdl->mem_hdl);
+		ddi_dma_free_handle(&pghdl->dma_hdl);
+		kmem_free(pghdl, sizeof (*pghdl));
+		return (0);
+	}
 
-	return (paddr);
+	pghdl->paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
+
+	idx = iommu_pghdl_hash_func(pghdl->paddr);
+	pghdl->next = iommu->iu_pghdl_hash[idx];
+	if (pghdl->next)
+		pghdl->next->prev = pghdl;
+	iommu->iu_pghdl_hash[idx] = pghdl;
+
+	return (pghdl);
 }
 
-
 /*
  * iommu_page_free()
  */
 void
-iommu_page_free(paddr_t paddr)
+iommu_page_free(intel_iommu_state_t *iommu, paddr_t paddr)
 {
-	page_t *pp;
+	uint_t idx;
+	iommu_pghdl_t *pghdl;
 
-	pp = page_numtopp_nolock(ma_to_pa(paddr) >> PAGESHIFT);
-	page_free_physical(pp);
+	idx = iommu_pghdl_hash_func(paddr);
+	pghdl = iommu->iu_pghdl_hash[idx];
+	while (pghdl && pghdl->paddr != paddr)
+		pghdl = pghdl->next;
+	if (pghdl == NULL) {
+		cmn_err(CE_PANIC,
+		    "Freeing a free IOMMU page: paddr=0x%" PRIx64,
+		    paddr);
+		/*NOTREACHED*/
+	}
+	if (pghdl->prev == NULL)
+		iommu->iu_pghdl_hash[idx] = pghdl->next;
+	else
+		pghdl->prev->next = pghdl->next;
+	if (pghdl->next)
+		pghdl->next->prev = pghdl->prev;
+
+	ddi_dma_mem_free(&pghdl->mem_hdl);
+	ddi_dma_free_handle(&pghdl->dma_hdl);
+	kmem_free(pghdl, sizeof (*pghdl));
 }
 
-
 /*
- * iommu_page_map()
- *
+ * iommu_get_vaddr()
  */
 caddr_t
-iommu_page_map(paddr_t addr)
+iommu_get_vaddr(intel_iommu_state_t *iommu, paddr_t paddr)
 {
-	paddr_t paddr;
-	caddr_t kva;
-	page_t *pp;
-
-	paddr = ma_to_pa(addr);
+	uint_t idx;
+	iommu_pghdl_t *pghdl;
 
-	if (kpm_enable) {
-		kva = hat_kpm_pfn2va((pfn_t)btop(paddr));
-	} else {
-		kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
-		if (kva == NULL) {
-			return (NULL);
-		}
-		pp = page_numtopp_nolock(paddr >> PAGESHIFT);
-		hat_memload(kas.a_hat, kva, pp,
-		    PROT_READ | PROT_WRITE, HAT_LOAD_LOCK);
+	idx = iommu_pghdl_hash_func(paddr);
+	pghdl = iommu->iu_pghdl_hash[idx];
+	while (pghdl && pghdl->paddr != paddr)
+		pghdl = pghdl->next;
+	if (pghdl == NULL) {
+		return (0);
 	}
-
-	return (kva);
+	return (pghdl->vaddr);
 }
 
 
 /*
- * iommu_page_unmap()
- *
- */
-void
-iommu_page_unmap(caddr_t kva)
-{
-	if (!kpm_enable) {
-		hat_unload(kas.a_hat, kva, PAGESIZE, HAT_UNLOAD_UNLOCK);
-		vmem_free(heap_arena, kva, PAGESIZE);
-	}
-}
-
-
-
-/*
  * iommu_rscs_init()
  *    Initialize the resource structure. init() returns a handle to be
  *    used for the rest of the resource functions. This code is written assuming
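A note on the bucket index above: paddr is page-aligned, so its low MMU_PAGESHIFT bits are always zero, and reducing the raw address modulo a power-of-two table size would land every handle in bucket 0; hashing the page frame number instead spreads consecutive pages across buckets. A quick userland check of the two formulas:

#include <stdio.h>
#include <stdint.h>

#define	IOMMU_PGHDL_HASH_SIZE	256
#define	MMU_PAGESHIFT		12

int
main(void)
{
	uint64_t paddr;

	for (paddr = 0x1000; paddr <= 0x4000; paddr += 0x1000) {
		(void) printf("paddr 0x%llx: raw %% size = %llu, "
		    "pfn %% size = %llu\n",
		    (unsigned long long)paddr,
		    (unsigned long long)(paddr % IOMMU_PGHDL_HASH_SIZE),
		    (unsigned long long)
		    ((paddr >> MMU_PAGESHIFT) % IOMMU_PGHDL_HASH_SIZE));
	}
	return (0);
}

The raw form prints 0 for every page; the shifted form prints 1, 2, 3, 4.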
--- a/usr/src/uts/i86pc/sys/intel_iommu.h	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/sys/intel_iommu.h	Mon Mar 16 21:18:21 2009 -0700
@@ -413,6 +413,9 @@
 
 struct inv_queue_state;
 struct intr_remap_tbl_state;
+struct iommu_pghdl;
+
+#define	IOMMU_PGHDL_HASH_SIZE	(256)
 
 /*
  * struct intel_iommu_state
@@ -445,6 +448,7 @@
  * iu_pend_head		- pending iotlb list
  * iu_inv_queue		- invalidation queue state
  * iu_intr_remap_tbl	- interrupt remapping table state
+ * iu_pghdl_hash	- hash of pages allocated for IOMMU internal work.
  */
 typedef struct intel_iommu_state {
 	list_node_t		node;
@@ -470,6 +474,7 @@
 	iotlb_pend_head_t	iu_pend_head;
 	struct inv_queue_state	*iu_inv_queue;
 	struct intr_remap_tbl_state	*iu_intr_remap_tbl;
+	struct iommu_pghdl	*iu_pghdl_hash[IOMMU_PGHDL_HASH_SIZE];
 } intel_iommu_state_t;
 
 /*
--- a/usr/src/uts/i86pc/sys/iommu_rscs.h	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/sys/iommu_rscs.h	Mon Mar 16 21:18:21 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -50,11 +50,21 @@
  * iommu_page_unmap()
  *   unmap page out of kva
  */
-paddr_t iommu_page_alloc(int kmflag);
-void iommu_page_free(paddr_t paddr);
-caddr_t iommu_page_map(paddr_t paddr);
-void iommu_page_unmap(caddr_t kva);
 
+typedef struct iommu_pghdl {
+	ddi_dma_handle_t dma_hdl;
+	ddi_acc_handle_t mem_hdl;
+	paddr_t paddr;
+	caddr_t vaddr;
+	struct iommu_pghdl *prev;
+	struct iommu_pghdl *next;
+} iommu_pghdl_t;
+
+struct intel_iommu_state;
+
+iommu_pghdl_t *iommu_page_alloc(struct intel_iommu_state *iommu, int kmflag);
+void iommu_page_free(struct intel_iommu_state *iommu, paddr_t paddr);
+caddr_t iommu_get_vaddr(struct intel_iommu_state *iommu, paddr_t paddr);
 
 typedef struct iommu_rscs_s *iommu_rscs_t;
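iommu_page_free() and iommu_get_vaddr() walk a bucket with the same loop; a shared helper (hypothetical, not part of this changeset) would keep the two traversals from drifting apart:

/* hypothetical; iommu_pghdl_hash_func() as defined in iommu_rscs.c */
static iommu_pghdl_t *
iommu_pghdl_lookup(struct intel_iommu_state *iommu, paddr_t paddr)
{
	iommu_pghdl_t *pghdl;

	pghdl = iommu->iu_pghdl_hash[iommu_pghdl_hash_func(paddr)];
	while (pghdl != NULL && pghdl->paddr != paddr)
		pghdl = pghdl->next;
	return (pghdl);		/* NULL if paddr is not in the hash */
}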
 
--- a/usr/src/uts/i86pc/sys/machsystm.h	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/sys/machsystm.h	Mon Mar 16 21:18:21 2009 -0700
@@ -138,8 +138,7 @@
 struct memlist;
 extern void memlist_add(uint64_t, uint64_t, struct memlist *,
     struct memlist **);
-extern page_t *page_get_physical(int flags);
-extern void page_free_physical(page_t *);
+extern page_t *page_get_physical(uintptr_t seed);
 extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 
--- a/usr/src/uts/i86pc/vm/htable.c	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/vm/htable.c	Mon Mar 16 21:18:21 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -264,7 +264,7 @@
  * A wrapper around page_get_physical(), with some extra checks.
  */
 static pfn_t
-ptable_alloc(void)
+ptable_alloc(uintptr_t seed)
 {
 	pfn_t pfn;
 	page_t *pp;
@@ -298,7 +298,7 @@
 	}
 #endif /* DEBUG */
 
-	pp = page_get_physical(KM_NOSLEEP);
+	pp = page_get_physical(seed);
 	if (pp == NULL)
 		return (PFN_INVALID);
 	ASSERT(PAGE_SHARED(pp));
@@ -326,13 +326,28 @@
 	atomic_add_32(&active_ptables, -1);
 	if (pp == NULL)
 		panic("ptable_free(): no page for pfn!");
+	ASSERT(PAGE_SHARED(pp));
 	ASSERT(pfn == pp->p_pagenum);
 	ASSERT(!IN_XPV_PANIC());
+
+	/*
+	 * Get an exclusive lock, might have to wait for a kmem reader.
+	 */
+	if (!page_tryupgrade(pp)) {
+		page_unlock(pp);
+		/*
+		 * RFE: we could change this to not loop forever
+		 * For now looping works - it's just like sfmmu.
+		 */
+		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
+			continue;
+	}
 #ifdef __xpv
 	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
 		panic("failure making kpm r/w pfn=0x%lx", pfn);
 #endif
-	page_free_physical(pp);
+	page_free(pp, 1);
+	page_unresv(1);
 }
 
 /*
@@ -767,7 +782,7 @@
 		 */
 		if (ht != NULL && !is_bare) {
 			ht->ht_hat = hat;
-			ht->ht_pfn = ptable_alloc();
+			ht->ht_pfn = ptable_alloc((uintptr_t)ht);
 			if (ht->ht_pfn == PFN_INVALID) {
 				if (USE_HAT_RESERVES())
 					htable_put_reserve(ht);
@@ -830,7 +845,7 @@
 		for (;;) {
 			htable_t *stolen;
 
-			hat->hat_user_ptable = ptable_alloc();
+			hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
 			if (hat->hat_user_ptable != PFN_INVALID)
 				break;
 			stolen = htable_steal(1);
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Mon Mar 16 21:06:13 2009 -0700
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Mon Mar 16 21:18:21 2009 -0700
@@ -3724,25 +3724,34 @@
  * available - this would have a minimal impact on page coloring.
  */
 page_t *
-page_get_physical(int flags)
+page_get_physical(uintptr_t seed)
 {
 	page_t *pp;
-	u_offset_t offset = (u_offset_t)1 << 41;	/* in VA hole */
+	u_offset_t offset;
 	static struct seg tmpseg;
 	static uintptr_t ctr = 0;
-	static kmutex_t pgp_mutex;
 
 	/*
 	 * This code is gross, we really need a simpler page allocator.
 	 *
+	 * We need to assign an offset for the page to call page_create_va().
 	 * To avoid conflicts with other pages, we get creative with the offset.
 	 * For 32 bits, we need an offset > 4Gig
 	 * For 64 bits, need an offset somewhere in the VA hole.
 	 */
-	if (page_resv(1, flags & KM_NOSLEEP) == 0)
+	offset = seed;
+	if (offset > kernelbase)
+		offset -= kernelbase;
+	offset <<= MMU_PAGESHIFT;
+#if defined(__amd64)
+	offset += mmu.hole_start;	/* something in VA hole */
+#else
+	offset += 1ULL << 40;	/* something > 4 Gig */
+#endif
+
+	if (page_resv(1, KM_NOSLEEP) == 0)
 		return (NULL);
 
-	mutex_enter(&pgp_mutex);
 #ifdef	DEBUG
 	pp = page_exists(&kvp, offset);
 	if (pp != NULL)
@@ -3754,31 +3763,7 @@
 	if (pp != NULL) {
 		page_io_unlock(pp);
 		page_hashout(pp, NULL);
-		mutex_exit(&pgp_mutex);
 		page_downgrade(pp);
-	} else {
-		mutex_exit(&pgp_mutex);
 	}
 	return (pp);
 }
-
-void
-page_free_physical(page_t *pp)
-{
-	/*
-	 * Get an exclusive lock, might have to wait for a kmem reader.
-	 */
-	ASSERT(PAGE_SHARED(pp));
-	if (!page_tryupgrade(pp)) {
-		page_unlock(pp);
-		/*
-		 * RFE: we could change this to not loop forever
-		 * George Cameron had some idea on how to do that.
-		 * For now looping works - it's just like sfmmu.
-		 */
-		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
-			continue;
-	}
-	page_free(pp, 1);
-	page_unresv(1);
-}
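With pgp_mutex gone, correctness rests on seed uniqueness: htable.c passes (uintptr_t)ht and (uintptr_t)ht + 1, every htable_t address is distinct, and the arithmetic above turns distinct seeds into distinct page-aligned kvp offsets, so concurrent callers cannot collide in page_create_va(). A worked userland example of that arithmetic (KERNELBASE and HOLE_START are illustrative stand-ins for the kernel's real kernelbase and mmu.hole_start):

#include <stdio.h>
#include <stdint.h>

#define	MMU_PAGESHIFT	12
#define	KERNELBASE	0xfffffd8000000000ULL	/* illustrative */
#define	HOLE_START	0x0000800000000000ULL	/* illustrative */

static uint64_t
offset_for_seed(uint64_t seed)
{
	uint64_t offset = seed;

	if (offset > KERNELBASE)	/* kernel pointer used as seed */
		offset -= KERNELBASE;
	offset <<= MMU_PAGESHIFT;	/* consecutive seeds land a page apart */
	offset += HOLE_START;		/* place it inside the VA hole */
	return (offset);
}

int
main(void)
{
	uint64_t ht = 0xfffffd8012345678ULL;	/* fake htable_t address */

	/* ht and ht + 1 (the hat_user_ptable case) stay distinct */
	(void) printf("seed ht     -> offset 0x%llx\n",
	    (unsigned long long)offset_for_seed(ht));
	(void) printf("seed ht + 1 -> offset 0x%llx\n",
	    (unsigned long long)offset_for_seed(ht + 1));
	return (0);
}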