mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-18 22:14:16 +00:00

Merge branch 'next-sriov' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc into next

Merge Richard's work to support SR-IOV on PowerNV.  All generic PCI patches
acked by Bjorn.  Some minor conflicts with Daniel's pci_controller_ops work.

Conflicts:
	arch/powerpc/include/asm/machdep.h
	arch/powerpc/platforms/powernv/pci-ioda.c

commit ad30cb9946

13 changed files with 1448 additions and 89 deletions

Documentation/powerpc/pci_iov_resource_on_powernv.txt (new file, 301 lines)

@@ -0,0 +1,301 @@
Wei Yang <weiyang@linux.vnet.ibm.com>
Benjamin Herrenschmidt <benh@au1.ibm.com>
Bjorn Helgaas <bhelgaas@google.com>
26 Aug 2014

This document describes the requirement from hardware for PCI MMIO resource
sizing and assignment on PowerKVM and how generic PCI code handles this
requirement.  The first two sections describe the concepts of Partitionable
Endpoints and the implementation on P8 (IODA2).  The next two sections talk
about considerations for enabling SR-IOV on IODA2.

1. Introduction to Partitionable Endpoints

A Partitionable Endpoint (PE) is a way to group the various resources
associated with a device or a set of devices to provide isolation between
partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism
to freeze a device that is causing errors in order to limit the possibility
of propagation of bad data.

There is thus, in HW, a table of PE states that contains a pair of "frozen"
state bits (one for MMIO and one for DMA, they get set together but can be
cleared independently) for each PE.

When a PE is frozen, all stores in any direction are dropped and all loads
return all 1's.  MSIs are also blocked.  There's a bit more state that
captures things like the details of the error that caused the freeze etc.,
but that's not critical.

The interesting part is how the various PCIe transactions (MMIO, DMA, ...)
are matched to their corresponding PEs.

The following section provides a rough description of what we have on P8
(IODA2).  Keep in mind that this is all per PHB (PCI host bridge).  Each PHB
is a completely separate HW entity that replicates the entire logic, so it
has its own set of PEs, etc.

2. Implementation of Partitionable Endpoints on P8 (IODA2)

P8 supports up to 256 Partitionable Endpoints per PHB.

  * Inbound

    For DMA, MSIs and inbound PCIe error messages, we have a table (in
    memory but accessed in HW by the chip) that provides a direct
    correspondence between a PCIe RID (bus/dev/fn) and a PE number.
    We call this the RTT.

    - For DMA we then provide an entire address space for each PE that can
      contain two "windows", depending on the value of PCI address bit 59.
      Each window can be configured to be remapped via a "TCE table" (IOMMU
      translation table), which has various configurable characteristics
      not described here.

    - For MSIs, we have two windows in the address space (one at the top of
      the 32-bit space and one much higher) which, via a combination of the
      address and MSI value, will result in one of the 2048 interrupts per
      bridge being triggered.  There's a PE# in the interrupt controller
      descriptor table as well which is compared with the PE# obtained from
      the RTT to "authorize" the device to emit that specific interrupt.

    - Error messages just use the RTT.

  * Outbound.  That's where the tricky part is.

    Like other PCI host bridges, the Power8 IODA2 PHB supports "windows"
    from the CPU address space to the PCI address space.  There is one M32
    window and sixteen M64 windows.  They have different characteristics.
    First, what they have in common: they forward a configurable portion of
    the CPU address space to the PCIe bus and must be a naturally aligned
    power of two in size.  The rest is different:

    - The M32 window:

      * Is limited to 4GB in size.

      * Drops the top bits of the address (above the size) and replaces
	them with a configurable value.  This is typically used to generate
	32-bit PCIe accesses.  We configure that window at boot from FW and
	don't touch it from Linux; it's usually set to forward a 2GB
	portion of address space from the CPU to PCIe
	0x8000_0000..0xffff_ffff.  (Note: the top 64KB are actually
	reserved for MSIs, but this is not a problem at this point; we just
	need to ensure Linux doesn't assign anything there.  The M32 logic
	ignores that, however, and will forward in that space if we try.)

      * It is divided into 256 segments of equal size.  A table in the chip
	maps each segment to a PE#.  That allows portions of the MMIO space
	to be assigned to PEs on a segment granularity.  For a 2GB window,
	the segment granularity is 2GB/256 = 8MB, as sketched below.

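    To make that arithmetic concrete, here is a minimal C sketch of the
    M32 lookup.  The structure and names below are invented for
    illustration; they are not the actual PHB programming interface:

	#include <stdint.h>

	#define M32_SEGMENTS	256

	/* Hypothetical per-PHB state: one M32 window plus the chip's
	 * segment -> PE# lookup table. */
	struct m32_window {
		uint64_t base;		/* CPU address the window starts at */
		uint64_t size;		/* power-of-two size, e.g. 2GB */
		uint8_t  seg_to_pe[M32_SEGMENTS];
	};

	/* PE# owning a CPU address: divide the window into 256 equal
	 * segments (2GB/256 = 8MB) and look the segment up in the table. */
	static int m32_addr_to_pe(const struct m32_window *w, uint64_t addr)
	{
		uint64_t seg_size = w->size / M32_SEGMENTS;

		if (addr < w->base || addr >= w->base + w->size)
			return -1;	/* not covered by this window */
		return w->seg_to_pe[(addr - w->base) / seg_size];
	}
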
    Now, this is the "main" window we use in Linux today (excluding
    SR-IOV).  We basically use the trick of forcing the bridge MMIO windows
    onto a segment alignment/granularity so that the space behind a bridge
    can be assigned to a PE.

    Ideally we would like to be able to have individual functions in PEs,
    but that would mean using a completely different address allocation
    scheme where individual function BARs can be "grouped" to fit in one or
    more segments.

    - The M64 windows:

      * Must be at least 256MB in size.

      * Do not translate addresses (the address on PCIe is the same as the
	address on the PowerBus).  There is a way to also set the top 14
	bits which are not conveyed by PowerBus but we don't use this.

      * Can be configured to be segmented.  When not segmented, we can
	specify the PE# for the entire window.  When segmented, a window
	has 256 segments; however, there is no table for mapping a segment
	to a PE#.  The segment number *is* the PE# (see the sketch below).

      * Support overlaps.  If an address is covered by multiple windows,
	there's a defined ordering for which window applies.

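    The segmented M64 rule is the same arithmetic with the lookup table
    removed; a minimal sketch (names invented for illustration):

	#include <stdint.h>

	/* For a segmented M64 window there is no mapping table: the
	 * segment index itself is the PE#. */
	static int m64_addr_to_pe(uint64_t win_base, uint64_t win_size,
				  uint64_t addr)
	{
		uint64_t seg_size = win_size / 256;	/* always 256 segments */

		if (addr < win_base || addr >= win_base + win_size)
			return -1;
		return (addr - win_base) / seg_size;	/* segment == PE# */
	}
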
    We have code (fairly new compared to the M32 stuff) that exploits that
    for large BARs in 64-bit space:

    We configure an M64 window to cover the entire region of address space
    that has been assigned by FW for the PHB (about 64GB, ignoring the space
    for the M32, which comes out of a different "reserve").  We configure it
    as segmented.

    Then we do the same thing as with M32, using the bridge alignment
    trick, to match to those giant segments.

    Since we cannot remap, we have two additional constraints:

    - We do the PE# allocation *after* the 64-bit space has been assigned
      because the addresses we use directly determine the PE#.  We then
      update the M32 PE# for the devices that use both 32-bit and 64-bit
      spaces or assign the remaining PE#s to 32-bit-only devices.

    - We cannot "group" segments in HW, so if a device ends up using more
      than one segment, we end up with more than one PE#.  There is a HW
      mechanism to make the freeze state cascade to "companion" PEs, but
      that only works for PCIe error messages (typically used so that if
      you freeze a switch, it freezes all its children).  So we do it in
      SW.  We lose a bit of effectiveness of EEH in that case, but that's
      the best we found.  So when any of the PEs freezes, we freeze the
      other ones for that "domain".  We thus introduce the concept of a
      "master PE" which is the one used for DMA, MSIs, etc., and "secondary
      PEs" that are used for the remaining M64 segments.

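    A minimal sketch of that SW cascade, assuming a hypothetical PE
    structure with a master pointer and a fixed array of secondaries
    (this is not the actual pnv_ioda_pe layout):

	#define MAX_SECONDARIES	8

	struct pe {
		int pe_number;
		struct pe *master;	/* NULL if this PE is the master */
		struct pe *secondaries[MAX_SECONDARIES];	/* NULL-terminated */
	};

	static void freeze_pe_hw(int pe_number)
	{
		/* platform-specific: set the MMIO/DMA frozen bits */
	}

	/* When HW freezes any PE of a domain, freeze the master and all
	 * of its secondary PEs as well. */
	static void freeze_domain(struct pe *pe)
	{
		struct pe *master = pe->master ? pe->master : pe;
		int i;

		freeze_pe_hw(master->pe_number);
		for (i = 0; i < MAX_SECONDARIES && master->secondaries[i]; i++)
			freeze_pe_hw(master->secondaries[i]->pe_number);
	}
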
    We would like to investigate using additional M64 windows in "single
    PE" mode to overlay over specific BARs to work around some of that, for
    example for devices with very large BARs, e.g., GPUs.  It would make
    sense, but we haven't done it yet.

3. Considerations for SR-IOV on PowerKVM

  * SR-IOV Background

    The PCIe SR-IOV feature allows a single Physical Function (PF) to
    support several Virtual Functions (VFs).  Registers in the PF's SR-IOV
    Capability control the number of VFs and whether they are enabled.

    When VFs are enabled, they appear in Configuration Space like normal
    PCI devices, but the BARs in VF config space headers are unusual.  For
    a non-VF device, software uses BARs in the config space header to
    discover the BAR sizes and assign addresses for them.  For VF devices,
    software uses VF BAR registers in the *PF* SR-IOV Capability to
    discover sizes and assign addresses.  The BARs in the VF's config space
    header are read-only zeros.

    When a VF BAR in the PF SR-IOV Capability is programmed, it sets the
    base address for all the corresponding VF(n) BARs.  For example, if the
    PF SR-IOV Capability is programmed to enable eight VFs, and it has a
    1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region.
    This region is divided into eight contiguous 1MB regions, each of which
    is a BAR0 for one of the VFs.  Note that even though the VF BAR
    describes an 8MB region, the alignment requirement is for a single VF,
    i.e., 1MB in this example (see the sketch below).

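    A minimal sketch of that layout; the function and parameter names are
    invented for illustration:

	#include <stdint.h>

	/* One PF VF BAR value is the base of total_VFs contiguous,
	 * equally sized per-VF regions.  With 8 VFs and a 1MB VF BAR0,
	 * this walks an 8MB region in 1MB steps; alignment is only
	 * required for the single 1MB VF size. */
	static uint64_t vf_bar_addr(uint64_t vf_bar_base,	/* from the PF SR-IOV cap */
				    uint64_t vf_bar_size,	/* size for ONE VF */
				    unsigned int vf_index)	/* 0 .. num_vfs - 1 */
	{
		return vf_bar_base + (uint64_t)vf_index * vf_bar_size;
	}
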
  There are several strategies for isolating VFs in PEs:

  - M32 window: There's one M32 window, and it is split into 256
    equally-sized segments.  The finest granularity possible is a 256MB
    window with 1MB segments.  VF BARs that are 1MB or larger could be
    mapped to separate PEs in this window.  Each segment can be
    individually mapped to a PE via the lookup table, so this is quite
    flexible, but it works best when all the VF BARs are the same size.  If
    they are different sizes, the entire window has to be small enough that
    the segment size matches the smallest VF BAR, which means larger VF
    BARs span several segments.

  - Non-segmented M64 window: A non-segmented M64 window is mapped entirely
    to a single PE, so it could only isolate one VF.

  - Single segmented M64 windows: A segmented M64 window could be used just
    like the M32 window, but the segments can't be individually mapped to
    PEs (the segment number is the PE#), so there isn't as much
    flexibility.  A VF with multiple BARs would have to be in a "domain" of
    multiple PEs, which is not as well isolated as a single PE.

  - Multiple segmented M64 windows: As usual, each window is split into 256
    equally-sized segments, and the segment number is the PE#.  But if we
    use several M64 windows, they can be set to different base addresses
    and different segment sizes.  If we have VFs that each have a 1MB BAR
    and a 32MB BAR, we could use one M64 window to assign 1MB segments and
    another M64 window to assign 32MB segments.

  Finally, the plan is to use M64 windows for SR-IOV, which will be
  described in more detail in the next two sections.  For a given VF BAR,
  we need to effectively reserve all 256 segments (256 * VF BAR size) and
  position the VF BAR to start at the beginning of a free range of
  segments/PEs inside that M64 window.

  The goal is of course to be able to give a separate PE to each VF.

  The IODA2 platform has 16 M64 windows, which are used to map MMIO
  ranges to PE#s.  Each M64 window defines one MMIO range and this range is
  divided into 256 segments, with each segment corresponding to one PE.

  We decided to leverage these M64 windows to map VFs to individual PEs,
  since SR-IOV VF BARs are all the same size.

  But doing so introduces another problem: total_VFs is usually smaller
  than the number of M64 window segments, so if we map one VF BAR directly
  to one M64 window, some part of the M64 window will map to another
  device's MMIO range.

  IODA supports 256 PEs, so segmented windows contain 256 segments; if
  total_VFs is less than 256, we have the situation in Figure 1.0, where
  segments [total_VFs, 255] of the M64 window may map to some MMIO range on
  other devices:

     0      1                     total_VFs - 1
     +------+------+-     -+------+------+
     |      |      |  ...  |      |      |
     +------+------+-     -+------+------+

                           VF(n) BAR space

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

                           M64 window

		Figure 1.0 Direct map VF(n) BAR space

  Our current solution is to allocate 256 segments even if the VF(n) BAR
  space doesn't need that much, as shown in Figure 1.1:

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

                           VF(n) BAR space + extra

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

			   M64 window

		Figure 1.1 Map VF(n) BAR space + extra

  Allocating the extra space ensures that the entire M64 window will be
  assigned to this one SR-IOV device and none of the space will be
  available for other devices.  Note that this only expands the space
  reserved in software; there are still only total_VFs VFs, and they only
  respond to segments [0, total_VFs - 1].  There's nothing in hardware that
  responds to segments [total_VFs, 255].

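  A minimal sketch of the reservation arithmetic (illustrative names, not
  the actual implementation):

	#include <stdint.h>

	#define IODA_TOTAL_PE	256

	/* Segment size equals the per-VF BAR size, one segment per
	 * possible PE, so reserve the whole 256-segment window. */
	static uint64_t iov_bar_reserved_size(uint64_t vf_bar_size)
	{
		return (uint64_t)IODA_TOTAL_PE * vf_bar_size;
	}

  The patch below implements this idea in
  pnv_pci_ioda_fixup_iov_resources(), which multiplies each IOV BAR size
  by phb->ioda.total_pe.
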
4. Implications for the Generic PCI Code

The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be
aligned to the size of an individual VF BAR.

In IODA2, the MMIO address determines the PE#.  If the address is in an M32
window, we can set the PE# by updating the table that translates segments
to PE#s.  Similarly, if the address is in an unsegmented M64 window, we can
set the PE# for the window.  But if it's in a segmented M64 window, the
segment number is the PE#.

Therefore, the only way to control the PE# for a VF is to change the base
of the VF(n) BAR space in the VF BAR.  If the PCI core allocates the exact
amount of space required for the VF(n) BAR space, the VF BAR value is fixed
and cannot be changed.

On the other hand, if the PCI core allocates additional space, the VF BAR
value can be changed as long as the entire VF(n) BAR space remains inside
the space allocated by the core.

Ideally the segment size will be the same as an individual VF BAR size.
Then each VF will be in its own PE.  The VF BARs (and therefore the PE#s)
are contiguous.  If VF0 is in PE(x), then VF(n) is in PE(x+n).  If we
allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0,
as sketched below.

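A minimal sketch of choosing VF0's PE# by sliding the base (illustrative
only; the patch below does the real work in pnv_pci_vf_resource_shift()):

	#include <stdint.h>

	/* With segment size == VF BAR size, moving the VF(n) BAR space up
	 * by "offset" segments inside the reserved 256-segment region
	 * puts VF(n) into PE(offset + n). */
	static uint64_t shift_vf_bar_base(uint64_t reserved_base,
					  uint64_t vf_bar_size,
					  unsigned int offset)	/* chosen PE# of VF0 */
	{
		return reserved_base + (uint64_t)offset * vf_bar_size;
	}
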
If the segment size is smaller than the VF BAR size, it will take several
segments to cover a VF BAR, and a VF will be in several PEs.  This is
possible, but the isolation isn't as good, and it reduces the number of PE#
choices because instead of consuming only numVFs segments, the VF(n) BAR
space will consume (numVFs * n) segments.  That means there aren't as many
available segments for adjusting the base of the VF(n) BAR space.

arch/powerpc/include/asm/iommu.h

@@ -79,6 +79,9 @@ struct iommu_table {
 	struct iommu_group *it_group;
 #endif
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
+#ifdef CONFIG_PPC_POWERNV
+	void           *data;
+#endif
 };

 /* Pure 2^n version of get_order */

arch/powerpc/include/asm/machdep.h

@@ -236,6 +236,11 @@ struct machdep_calls {
 	/* Called after scan and before resource survey */
 	void (*pcibios_fixup_phb)(struct pci_controller *hose);

+#ifdef CONFIG_PCI_IOV
+	void (*pcibios_fixup_sriov)(struct pci_dev *pdev);
+	resource_size_t (*pcibios_iov_resource_alignment)(struct pci_dev *, int resno);
+#endif /* CONFIG_PCI_IOV */
+
 	/* Called to shutdown machine specific hardware not already controlled
 	 * by other drivers.
 	 */

arch/powerpc/include/asm/pci-bridge.h

@@ -175,6 +175,7 @@ struct iommu_table;

 struct pci_dn {
 	int     flags;
+#define PCI_DN_FLAG_IOV_VF	0x01

 	int	busno;			/* pci bus number */
 	int	devfn;			/* pci device and function number */
@@ -189,13 +190,21 @@ struct pci_dn {

 	int	pci_ext_config_space;	/* for pci devices */

-	struct	pci_dev *pcidev;	/* back-pointer to the pci device */
 #ifdef CONFIG_EEH
 	struct eeh_dev *edev;		/* eeh device */
 #endif
 #define IODA_INVALID_PE		(-1)
 #ifdef CONFIG_PPC_POWERNV
 	int	pe_number;
+#ifdef CONFIG_PCI_IOV
+	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
+	u16     num_vfs;		/* number of VFs enabled*/
+	int     offset;			/* PE# for the first VF PE */
+#define M64_PER_IOV 4
+	int     m64_per_iov;
+#define IODA_INVALID_M64        (-1)
+	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+#endif /* CONFIG_PCI_IOV */
 #endif
 	struct list_head child_list;
 	struct list_head list;
@@ -207,6 +216,8 @@ struct pci_dn {
 extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
 					   int devfn);
 extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
+extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
+extern void remove_dev_pci_data(struct pci_dev *pdev);
 extern void *update_dn_pci_info(struct device_node *dn, void *data);

 static inline int pci_device_from_OF_node(struct device_node *np,

arch/powerpc/kernel/pci-common.c

@@ -134,6 +134,16 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev)
 	pci_reset_secondary_bus(dev);
 }

+#ifdef CONFIG_PCI_IOV
+resource_size_t pcibios_iov_resource_alignment(struct pci_dev *pdev, int resno)
+{
+	if (ppc_md.pcibios_iov_resource_alignment)
+		return ppc_md.pcibios_iov_resource_alignment(pdev, resno);
+
+	return pci_iov_resource_size(pdev, resno);
+}
+#endif /* CONFIG_PCI_IOV */
+
 static resource_size_t pcibios_io_size(const struct pci_controller *hose)
 {
 #ifdef CONFIG_PPC64
@@ -792,6 +802,10 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
 		       pci_name(dev));
 		return;
 	}
+
+	if (dev->is_virtfn)
+		return;
+
 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
 		struct resource *res = dev->resource + i;
 		struct pci_bus_region reg;
@@ -995,6 +1009,12 @@ int pcibios_add_device(struct pci_dev *dev)
 	 */
 	if (dev->bus->is_added)
 		pcibios_setup_device(dev);
+
+#ifdef CONFIG_PCI_IOV
+	if (ppc_md.pcibios_fixup_sriov)
+		ppc_md.pcibios_fixup_sriov(dev);
+#endif /* CONFIG_PCI_IOV */
+
 	return 0;
 }


arch/powerpc/kernel/pci_dn.c

@@ -136,6 +136,135 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 	return NULL;
 }

+#ifdef CONFIG_PCI_IOV
+static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
+					   struct pci_dev *pdev,
+					   int busno, int devfn)
+{
+	struct pci_dn *pdn;
+
+	/* Except PHB, we always have the parent */
+	if (!parent)
+		return NULL;
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (!pdn) {
+		dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__);
+		return NULL;
+	}
+
+	pdn->phb = parent->phb;
+	pdn->parent = parent;
+	pdn->busno = busno;
+	pdn->devfn = devfn;
+#ifdef CONFIG_PPC_POWERNV
+	pdn->pe_number = IODA_INVALID_PE;
+#endif
+	INIT_LIST_HEAD(&pdn->child_list);
+	INIT_LIST_HEAD(&pdn->list);
+	list_add_tail(&pdn->list, &parent->child_list);
+
+	/*
+	 * If we already have PCI device instance, lets
+	 * bind them.
+	 */
+	if (pdev)
+		pdev->dev.archdata.pci_data = pdn;
+
+	return pdn;
+}
+#endif
+
+struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent, *pdn;
+	int i;
+
+	/* Only support IOV for now */
+	if (!pdev->is_physfn)
+		return pci_get_pdn(pdev);
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return NULL;
+
+	pdn->flags |= PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return NULL;
+
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		pdn = add_one_dev_pci_data(parent, NULL,
+					   pci_iov_virtfn_bus(pdev, i),
+					   pci_iov_virtfn_devfn(pdev, i));
+		if (!pdn) {
+			dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n",
+				 __func__, i);
+			return NULL;
+		}
+	}
+#endif /* CONFIG_PCI_IOV */
+
+	return pci_get_pdn(pdev);
+}
+
+void remove_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent;
+	struct pci_dn *pdn, *tmp;
+	int i;
+
+	/*
+	 * VF and VF PE are created/released dynamically, so we need to
+	 * bind/unbind them.  Otherwise the VF and VF PE would be mismatched
+	 * when re-enabling SR-IOV.
+	 */
+	if (pdev->is_virtfn) {
+		pdn = pci_get_pdn(pdev);
+#ifdef CONFIG_PPC_POWERNV
+		pdn->pe_number = IODA_INVALID_PE;
+#endif
+		return;
+	}
+
+	/* Only support IOV PF for now */
+	if (!pdev->is_physfn)
+		return;
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return;
+
+	pdn->flags &= ~PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return;
+
+	/*
+	 * We might introduce flag to pci_dn in future
+	 * so that we can release VF's firmware data in
+	 * a batch mode.
+	 */
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		list_for_each_entry_safe(pdn, tmp,
+			&parent->child_list, list) {
+			if (pdn->busno != pci_iov_virtfn_bus(pdev, i) ||
+			    pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
+				continue;
+
+			if (!list_empty(&pdn->list))
+				list_del(&pdn->list);
+
+			kfree(pdn);
+		}
+	}
+#endif /* CONFIG_PCI_IOV */
+}
+
 /*
  * Traverse_func that inits the PCI fields of the device node.
  * NOTE: this *must* be done before read/write config to the device.

arch/powerpc/platforms/powernv/pci-ioda.c

@@ -44,6 +44,9 @@
 #include "powernv.h"
 #include "pci.h"

+/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 	vaf.fmt = fmt;
 	vaf.va = &args;

-	if (pe->pdev)
+	if (pe->flags & PNV_IODA_PE_DEV)
 		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
-	else
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		sprintf(pfix, "%04x:%02x     ",
 			pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		sprintf(pfix, "%04x:%02x:%2x.%d",
+			pci_domain_nr(pe->parent_dev->bus),
+			(pe->rid & 0xff00) >> 8,
+			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/

 	printk("%spci %s: [PE# %.3d] %pV",
 	       level, pfix, pe->pe_number, &vaf);
@@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
 			      bool is_add)
 {
 	struct pnv_ioda_pe *slave;
-	struct pci_dev *pdev;
+	struct pci_dev *pdev = NULL;
 	int ret;

 	/*
@@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,

 	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
 		pdev = pe->pbus->self;
-	else
+	else if (pe->flags & PNV_IODA_PE_DEV)
 		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
 	while (pdev) {
 		struct pci_dn *pdn = pci_get_pdn(pdev);
 		struct pnv_ioda_pe *parent;
@@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
 	return 0;
 }

+#ifdef CONFIG_PCI_IOV
+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	int64_t rc;
+	long rid_end, rid;
+
+	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;         break;
+		case  2: bcomp = OpalPciBus7Bits;       break;
+		case  4: bcomp = OpalPciBus6Bits;       break;
+		case  8: bcomp = OpalPciBus5Bits;       break;
+		case 16: bcomp = OpalPciBus4Bits;       break;
+		case 32: bcomp = OpalPciBus3Bits;       break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear the reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = 0;
+
+	/* Release from all parents PELT-V */
+	while (parent) {
+		struct pci_dn *pdn = pci_get_pdn(parent);
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+			/* XXX What to do in case of error ? */
+		}
+		parent = parent->bus->self;
+	}
+
+	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Disassociate PE in PELT */
+	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+	if (rc)
+		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+	pe->pbus = NULL;
+	pe->pdev = NULL;
+	pe->parent_dev = NULL;
+
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
@@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 		case 16: bcomp = OpalPciBus4Bits;	break;
 		case 32: bcomp = OpalPciBus3Bits;	break;
 		default:
-			pr_err("%s: Number of subordinate busses %d"
-			       " unsupported\n",
-			       pci_name(pe->pbus->self), count);
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
 			/* Do an exact match only */
 			bcomp = OpalPciBusAll;
 		}
 		rid_end = pe->rid + (count << 8);
 	} else {
-		parent = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif /* CONFIG_PCI_IOV */
+			parent = pe->pdev->bus->self;
 		bcomp = OpalPciBusAll;
 		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
 		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -774,6 +873,78 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
 	return 10;
 }

+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+	struct pci_dn *pdn = pci_get_pdn(dev);
+	int i;
+	struct resource *res, res2;
+	resource_size_t size;
+	u16 num_vfs;
+
+	if (!dev->is_physfn)
+		return -EINVAL;
+
+	/*
+	 * "offset" is in VFs.  The M64 windows are sized so that when they
+	 * are segmented, each segment is the same size as the IOV BAR.
+	 * Each segment is in a separate PE, and the high order bits of the
+	 * address are the PE number.  Therefore, each VF's BAR is in a
+	 * separate PE, and changing the IOV BAR start address changes the
+	 * range of PEs the VFs are in.
+	 */
+	num_vfs = pdn->num_vfs;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the actual size for num_vfs VFs BAR.  This check is to
+		 * make sure that after shifting, the range will not overlap
+		 * with another device.
+		 */
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2.flags = res->flags;
+		res2.start = res->start + (size * offset);
+		res2.end = res2.start + (size * num_vfs) - 1;
+
+		if (res2.end > res->end) {
+			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+				i, &res2, res, num_vfs, offset);
+			return -EBUSY;
+		}
+	}
+
+	/*
+	 * After doing so, there would be a "hole" in the /proc/iomem when
+	 * offset is a positive value. It looks like the device return some
+	 * mmio back to the system, which actually no one could use it.
+	 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2 = *res;
+		res->start += size * offset;
+
+		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
+			 i, &res2, res, num_vfs, offset);
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+	}
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 #if 0
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
@@ -857,7 +1028,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 				pci_name(dev));
 			continue;
 		}
-		pdn->pcidev = dev;
 		pdn->pe_number = pe->pe_number;
 		pe->dma_weight += pnv_ioda_dma_weight(dev);
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
@@ -916,6 +1086,10 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 		return;
 	}

+	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+			GFP_KERNEL, hose->node);
+	pe->tce32_table->data = pe;
+
 	/* Associate it with all child devices */
 	pnv_ioda_setup_same_PE(bus, pe);

@@ -974,6 +1148,441 @@ static void pnv_pci_ioda_setup_PEs(void)
 	}
 }

+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    i, j;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		for (j = 0; j < M64_PER_IOV; j++) {
+			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+				continue;
+			opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
+			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
+			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+		}
+
+	return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	unsigned int           win;
+	struct resource       *res;
+	int                    i, j;
+	int64_t                rc;
+	int                    total_vfs;
+	resource_size_t        size, start;
+	int                    pe_num;
+	int                    vf_groups;
+	int                    vf_per_group;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+
+	/* Initialize the m64_wins to IODA_INVALID_M64 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		for (j = 0; j < M64_PER_IOV; j++)
+			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+
+	if (pdn->m64_per_iov == M64_PER_IOV) {
+		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
+		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
+			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+	} else {
+		vf_groups = 1;
+		vf_per_group = 1;
+	}
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		for (j = 0; j < vf_groups; j++) {
+			do {
+				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+						phb->ioda.m64_bar_idx + 1, 0);
+
+				if (win >= phb->ioda.m64_bar_idx + 1)
+					goto m64_failed;
+			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+			pdn->m64_wins[i][j] = win;
+
+			if (pdn->m64_per_iov == M64_PER_IOV) {
+				size = pci_iov_resource_size(pdev,
+							PCI_IOV_RESOURCES + i);
+				size = size * vf_per_group;
+				start = res->start + size * j;
+			} else {
+				size = resource_size(res);
+				start = res->start;
+			}
+
+			/* Map the M64 here */
+			if (pdn->m64_per_iov == M64_PER_IOV) {
+				pe_num = pdn->offset + j;
+				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+						pe_num, OPAL_M64_WINDOW_TYPE,
+						pdn->m64_wins[i][j], 0);
+			}
+
+			rc = opal_pci_set_phb_mem_window(phb->opal_id,
+						 OPAL_M64_WINDOW_TYPE,
+						 pdn->m64_wins[i][j],
+						 start,
+						 0, /* unused */
+						 size);
+
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+					win, rc);
+				goto m64_failed;
+			}
+
+			if (pdn->m64_per_iov == M64_PER_IOV)
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+			else
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
+					win, rc);
+				goto m64_failed;
+			}
+		}
+	}
+	return 0;
+
+m64_failed:
+	pnv_pci_vf_release_m64(pdev);
+	return -EBUSY;
+}
+
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct iommu_table    *tbl;
+	unsigned long         addr;
+	int64_t               rc;
+
+	bus = dev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	tbl = pe->tce32_table;
+	addr = tbl->it_base;
+
+	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+				   pe->pe_number << 1, 1, __pa(addr),
+				   0, 0x1000);
+
+	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+				        pe->pe_number,
+				        (pe->pe_number << 1) + 1,
+				        pe->tce_bypass_base,
+				        0);
+	if (rc)
+		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+
+	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	free_pages(addr, get_order(TCE32_TABLE_SIZE));
+	pe->tce32_table = NULL;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe, *pe_n;
+	struct pci_dn         *pdn;
+	u16                    vf_index;
+	int64_t                rc;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_physfn)
+		return;
+
+	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+		int   vf_group;
+		int   vf_per_group;
+		int   vf_index1;
+
+		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
+			for (vf_index = vf_group * vf_per_group;
+				vf_index < (vf_group + 1) * vf_per_group &&
+				vf_index < num_vfs;
+				vf_index++)
+				for (vf_index1 = vf_group * vf_per_group;
+					vf_index1 < (vf_group + 1) * vf_per_group &&
+					vf_index1 < num_vfs;
+					vf_index1++){
+
+					rc = opal_pci_set_peltv(phb->opal_id,
+						pdn->offset + vf_index,
+						pdn->offset + vf_index1,
+						OPAL_REMOVE_PE_FROM_DOMAIN);
+
+					if (rc)
+					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
+						__func__,
+						pdn->offset + vf_index1, rc);
+				}
+	}
+
+	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+		if (pe->parent_dev != pdev)
+			continue;
+
+		pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+		/* Remove from list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_del(&pe->list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_ioda_deconfigure_pe(phb, pe);
+
+		pnv_ioda_free_pe(phb, pe->pe_number);
+	}
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	struct pci_sriov      *iov;
+	u16 num_vfs;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+	iov = pdev->sriov;
+	num_vfs = pdn->num_vfs;
+
+	/* Release VF PEs */
+	pnv_ioda_release_vf_PE(pdev, num_vfs);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		if (pdn->m64_per_iov == 1)
+			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+
+		/* Release M64 windows */
+		pnv_pci_vf_release_m64(pdev);
+
+		/* Release PE numbers */
+		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+		pdn->offset = 0;
+	}
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    pe_num;
+	u16                    vf_index;
+	struct pci_dn         *pdn;
+	int64_t                rc;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_physfn)
+		return;
+
+	/* Reserve PE for each VF */
+	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		pe_num = pdn->offset + vf_index;
+
+		pe = &phb->ioda.pe_array[pe_num];
+		pe->pe_number = pe_num;
+		pe->phb = phb;
+		pe->flags = PNV_IODA_PE_VF;
+		pe->pbus = NULL;
+		pe->parent_dev = pdev;
+		pe->tce32_seg = -1;
+		pe->mve_number = -1;
+		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
+			   pci_iov_virtfn_devfn(pdev, vf_index);
+
+		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
+			hose->global_number, pdev->bus->number,
+			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+
+		if (pnv_ioda_configure_pe(phb, pe)) {
+			/* XXX What do we do here ? */
+			if (pe_num)
+				pnv_ioda_free_pe(phb, pe_num);
+			pe->pdev = NULL;
+			continue;
+		}
+
+		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+				GFP_KERNEL, hose->node);
+		pe->tce32_table->data = pe;
+
+		/* Put PE to the list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_add_tail(&pe->list, &phb->ioda.pe_list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+	}
+
+	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+		int   vf_group;
+		int   vf_per_group;
+		int   vf_index1;
+
+		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
+			for (vf_index = vf_group * vf_per_group;
+			     vf_index < (vf_group + 1) * vf_per_group &&
+			     vf_index < num_vfs;
+			     vf_index++) {
+				for (vf_index1 = vf_group * vf_per_group;
+				     vf_index1 < (vf_group + 1) * vf_per_group &&
+				     vf_index1 < num_vfs;
+				     vf_index1++) {
+
+					rc = opal_pci_set_peltv(phb->opal_id,
+						pdn->offset + vf_index,
+						pdn->offset + vf_index1,
+						OPAL_ADD_PE_TO_DOMAIN);
+
+					if (rc)
+					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
+						__func__,
+						pdn->offset + vf_index1, rc);
+				}
+			}
+		}
+	}
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    ret;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		/* Calculate available PE for required VFs */
+		mutex_lock(&phb->ioda.pe_alloc_mutex);
+		pdn->offset = bitmap_find_next_zero_area(
+			phb->ioda.pe_alloc, phb->ioda.total_pe,
+			0, num_vfs, 0);
+		if (pdn->offset >= phb->ioda.total_pe) {
+			mutex_unlock(&phb->ioda.pe_alloc_mutex);
+			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+			pdn->offset = 0;
+			return -EBUSY;
+		}
+		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+		pdn->num_vfs = num_vfs;
+		mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+		/* Assign M64 window accordingly */
+		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+		if (ret) {
+			dev_info(&pdev->dev, "Not enough M64 window resources\n");
+			goto m64_failed;
+		}
+
+		/*
+		 * When using one M64 BAR to map one IOV BAR, we need to shift
+		 * the IOV BAR according to the PE# allocated to the VFs.
+		 * Otherwise, the PE# for the VF will conflict with others.
+		 */
+		if (pdn->m64_per_iov == 1) {
+			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+			if (ret)
+				goto m64_failed;
+		}
+	}
+
+	/* Setup VF PEs */
+	pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+	return 0;
+
+m64_failed:
+	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+	pdn->offset = 0;
+
+	return ret;
+}
+
+int pcibios_sriov_disable(struct pci_dev *pdev)
+{
+	pnv_pci_sriov_disable(pdev);
+
+	/* Release PCI data */
+	remove_dev_pci_data(pdev);
+	return 0;
+}
+
+int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	/* Allocate PCI data */
+	add_dev_pci_data(pdev);
+
+	pnv_pci_sriov_enable(pdev, num_vfs);
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
 {
 	struct pci_dn *pdn = pci_get_pdn(pdev);
@@ -989,7 +1598,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)

 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
+	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
 }

 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1016,7 +1625,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
 	} else {
 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
-		set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+		set_iommu_table_base(&pdev->dev, pe->tce32_table);
 	}
 	*pdev->dev.dma_mask = dma_mask;
 	return 0;
@@ -1053,9 +1662,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		if (add_to_iommu_group)
 			set_iommu_table_base_and_group(&dev->dev,
-						       &pe->tce32_table);
+						       pe->tce32_table);
 		else
-			set_iommu_table_base(&dev->dev, &pe->tce32_table);
+			set_iommu_table_base(&dev->dev, pe->tce32_table);

 		if (dev->subordinate)
 			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
@@ -1145,8 +1754,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 				 __be64 *startp, __be64 *endp, bool rm)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
-					      tce32_table);
+	struct pnv_ioda_pe *pe = tbl->data;
 	struct pnv_phb *phb = pe->phb;

 	if (phb->type == PNV_PHB_IODA1)
@@ -1167,9 +1775,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	int64_t rc;
 	void *addr;

-	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
-
 	/* XXX FIXME: Handle 64-bit only DMA devices */
 	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
 	/* XXX FIXME: Allocate multi-level tables on PHB3 */
@@ -1212,7 +1817,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}

 	/* Setup linux iommu table */
-	tbl = &pe->tce32_table;
+	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);

@@ -1232,12 +1837,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_PAIR);
 	}
 	iommu_init_table(tbl, phb->hose->node);
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);

-	if (pe->pdev)
+	if (pe->flags & PNV_IODA_PE_DEV) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
-	else
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	} else if (pe->flags & PNV_IODA_PE_VF) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+	}

 	return;
 fail:
@@ -1250,8 +1862,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,

 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
-					      tce32_table);
+	struct pnv_ioda_pe *pe = tbl->data;
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;

|  | @ -1296,10 +1907,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, | |||
| 	pe->tce_bypass_base = 1ull << 59; | ||||
| 
 | ||||
| 	/* Install set_bypass callback for VFIO */ | ||||
| 	pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; | ||||
| 	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; | ||||
| 
 | ||||
| 	/* Enable bypass by default */ | ||||
| 	pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); | ||||
| 	pnv_pci_ioda2_set_bypass(pe->tce32_table, true); | ||||
| } | ||||
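Editorial note: the window_id computation above encodes a pair of DMA windows per PE. A minimal sketch of that encoding, assuming (per the code above) that the odd window of each pair is the 64-bit bypass window, with the even one presumably being the 32-bit TCE window:

	#include <stdio.h>

	/* assumed convention: two DMA windows per PE, bypass is the odd one */
	static int bypass_window_id(int pe_number)
	{
		return (pe_number << 1) + 1;
	}

	int main(void)
	{
		int pe;

		for (pe = 0; pe < 4; pe++)
			printf("PE#%d -> bypass window %d\n",
			       pe, bypass_window_id(pe));
		return 0;
	}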
| 
 | ||||
| static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, | ||||
|  | @ -1347,7 +1958,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, | |||
| 	} | ||||
| 
 | ||||
| 	/* Setup linux iommu table */ | ||||
| 	tbl = &pe->tce32_table; | ||||
| 	tbl = pe->tce32_table; | ||||
| 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, | ||||
| 			IOMMU_PAGE_SHIFT_4K); | ||||
| 
 | ||||
|  | @ -1365,12 +1976,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, | |||
| 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); | ||||
| 	} | ||||
| 	iommu_init_table(tbl, phb->hose->node); | ||||
| 	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); | ||||
| 
 | ||||
| 	if (pe->pdev) | ||||
| 	if (pe->flags & PNV_IODA_PE_DEV) { | ||||
| 		iommu_register_group(tbl, phb->hose->global_number, | ||||
| 				     pe->pe_number); | ||||
| 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl); | ||||
| 	else | ||||
| 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { | ||||
| 		iommu_register_group(tbl, phb->hose->global_number, | ||||
| 				     pe->pe_number); | ||||
| 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true); | ||||
| 	} else if (pe->flags & PNV_IODA_PE_VF) { | ||||
| 		iommu_register_group(tbl, phb->hose->global_number, | ||||
| 				     pe->pe_number); | ||||
| 	} | ||||
| 
 | ||||
| 	/* Also create a bypass window */ | ||||
| 	if (!pnv_iommu_bypass_disabled) | ||||
|  | @ -1731,6 +2349,73 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) | |||
| static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } | ||||
| #endif /* CONFIG_PCI_MSI */ | ||||
| 
 | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) | ||||
| { | ||||
| 	struct pci_controller *hose; | ||||
| 	struct pnv_phb *phb; | ||||
| 	struct resource *res; | ||||
| 	int i; | ||||
| 	resource_size_t size; | ||||
| 	struct pci_dn *pdn; | ||||
| 	int mul, total_vfs; | ||||
| 
 | ||||
| 	if (!pdev->is_physfn || pdev->is_added) | ||||
| 		return; | ||||
| 
 | ||||
| 	hose = pci_bus_to_host(pdev->bus); | ||||
| 	phb = hose->private_data; | ||||
| 
 | ||||
| 	pdn = pci_get_pdn(pdev); | ||||
| 	pdn->vfs_expanded = 0; | ||||
| 
 | ||||
| 	total_vfs = pci_sriov_get_totalvfs(pdev); | ||||
| 	pdn->m64_per_iov = 1; | ||||
| 	mul = phb->ioda.total_pe; | ||||
| 
 | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		res = &pdev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		if (!res->flags || res->parent) | ||||
| 			continue; | ||||
| 		if (!pnv_pci_is_mem_pref_64(res->flags)) { | ||||
| 			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", | ||||
| 				 i, res); | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); | ||||
| 
 | ||||
| 		/* bigger than 64M */ | ||||
| 		if (size > (1 << 26)) { | ||||
| 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", | ||||
| 				 i, res); | ||||
| 			pdn->m64_per_iov = M64_PER_IOV; | ||||
| 			mul = roundup_pow_of_two(total_vfs); | ||||
| 			break; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		res = &pdev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		if (!res->flags || res->parent) | ||||
| 			continue; | ||||
| 		if (!pnv_pci_is_mem_pref_64(res->flags)) { | ||||
| 			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", | ||||
| 				 i, res); | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); | ||||
| 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); | ||||
| 		res->end = res->start + size * mul - 1; | ||||
| 		dev_dbg(&pdev->dev, "                       %pR\n", res); | ||||
| 		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", | ||||
| 			 i, res, mul); | ||||
| 	} | ||||
| 	pdn->vfs_expanded = mul; | ||||
| } | ||||
| #endif /* CONFIG_PCI_IOV */ | ||||
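Editorial note: the fixup above is easiest to follow with concrete numbers. The sketch below reproduces just the multiplier choice and the expansion arithmetic; the 8MB sample BAR and 64 sample VFs are assumptions for illustration, while total_pe = 256 and the 64MB threshold come from the hunk itself:

	#include <stdio.h>
	#include <stdint.h>

	/* stand-in for the kernel's roundup_pow_of_two() */
	static uint64_t roundup_pow2(uint64_t x)
	{
		uint64_t p = 1;

		while (p < x)
			p <<= 1;
		return p;
	}

	int main(void)
	{
		uint64_t vf_bar = 8 << 20;	/* sample per-VF BAR size (assumed) */
		uint64_t total_pe = 256;	/* IODA2: PEs per PHB */
		uint64_t total_vfs = 64;	/* sample TotalVFs (assumed) */
		uint64_t mul = total_pe;

		/* BARs above 64MB are too big to expand total_pe times;
		 * fall back to roundup_pow_of_two(total_vfs) copies */
		if (vf_bar > (1ULL << 26))
			mul = roundup_pow2(total_vfs);

		printf("expanded IOV BAR: %llu MB (%llu x %llu MB)\n",
		       (unsigned long long)((vf_bar * mul) >> 20),
		       (unsigned long long)mul,
		       (unsigned long long)(vf_bar >> 20));
		return 0;
	}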
| 
 | ||||
| /*
 | ||||
|  * This function is supposed to be called per PE, from top | ||||
|  * to bottom. So the I/O or MMIO segment assigned to | ||||
|  | @ -1908,6 +2593,25 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, | |||
| 	return phb->ioda.io_segsize; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, | ||||
| 						      int resno) | ||||
| { | ||||
| 	struct pci_dn *pdn = pci_get_pdn(pdev); | ||||
| 	resource_size_t align, iov_align; | ||||
| 
 | ||||
| 	iov_align = resource_size(&pdev->resource[resno]); | ||||
| 	if (iov_align) | ||||
| 		return iov_align; | ||||
| 
 | ||||
| 	align = pci_iov_resource_size(pdev, resno); | ||||
| 	if (pdn->vfs_expanded) | ||||
| 		return pdn->vfs_expanded * align; | ||||
| 
 | ||||
| 	return align; | ||||
| } | ||||
| #endif /* CONFIG_PCI_IOV */ | ||||
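Editorial note: the hook above effectively reports the size of the whole expanded window as the alignment, so the allocator places the PF's IOV BAR on a boundary where every PE-sized slot lines up. A hedged sketch of the decision order (function and parameter names are illustrative, mirroring the fields used above):

	#include <stdint.h>

	uint64_t iov_alignment(uint64_t cur_size, uint64_t per_vf_size,
			       unsigned int vfs_expanded)
	{
		if (cur_size)		/* resource already sized by the fixup */
			return cur_size;
		if (vfs_expanded)	/* one slot per PE */
			return vfs_expanded * per_vf_size;
		return per_vf_size;	/* plain SR-IOV alignment */
	}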
| 
 | ||||
| /* Prevent enabling devices for which we couldn't properly
 | ||||
|  * assign a PE | ||||
|  */ | ||||
|  | @ -1993,6 +2697,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, | |||
| 	phb->hub_id = hub_id; | ||||
| 	phb->opal_id = phb_id; | ||||
| 	phb->type = ioda_type; | ||||
| 	mutex_init(&phb->ioda.pe_alloc_mutex); | ||||
| 
 | ||||
| 	/* Detect specific models for error handling */ | ||||
| 	if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) | ||||
|  | @ -2052,6 +2757,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, | |||
| 
 | ||||
| 	INIT_LIST_HEAD(&phb->ioda.pe_dma_list); | ||||
| 	INIT_LIST_HEAD(&phb->ioda.pe_list); | ||||
| 	mutex_init(&phb->ioda.pe_list_mutex); | ||||
| 
 | ||||
| 	/* Calculate how many 32-bit TCE segments we have */ | ||||
| 	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; | ||||
|  | @ -2106,6 +2812,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, | |||
| 	pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook; | ||||
| 	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; | ||||
| 	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; | ||||
| 
 | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; | ||||
| 	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; | ||||
| #endif | ||||
| 
 | ||||
| 	pci_add_flags(PCI_REASSIGN_ALL_RSRC); | ||||
| 
 | ||||
| 	/* Reset IODA tables to a clean state */ | ||||
|  |  | |||
|  | @ -666,6 +666,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) | |||
| { | ||||
| 	struct pci_controller *hose = pci_bus_to_host(pdev->bus); | ||||
| 	struct pnv_phb *phb = hose->private_data; | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| 	struct pnv_ioda_pe *pe; | ||||
| 	struct pci_dn *pdn; | ||||
| 
 | ||||
| 	/* Fix the VF pdn PE number */ | ||||
| 	if (pdev->is_virtfn) { | ||||
| 		pdn = pci_get_pdn(pdev); | ||||
| 		WARN_ON(pdn->pe_number != IODA_INVALID_PE); | ||||
| 		list_for_each_entry(pe, &phb->ioda.pe_list, list) { | ||||
| 			if (pe->rid == ((pdev->bus->number << 8) | | ||||
| 			    (pdev->devfn & 0xff))) { | ||||
| 				pdn->pe_number = pe->pe_number; | ||||
| 				pe->pdev = pdev; | ||||
| 				break; | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| #endif /* CONFIG_PCI_IOV */ | ||||
| 
 | ||||
| 	if (phb && phb->dma_dev_setup) | ||||
| 		phb->dma_dev_setup(phb, pdev); | ||||
|  |  | |||
|  | @ -23,6 +23,7 @@ enum pnv_phb_model { | |||
| #define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/ | ||||
| #define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/ | ||||
| #define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/ | ||||
| #define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/ | ||||
| 
 | ||||
| /* Data associated with a PE, including IOMMU tracking etc.. */ | ||||
| struct pnv_phb; | ||||
|  | @ -34,6 +35,9 @@ struct pnv_ioda_pe { | |||
| 	 * entire bus (& children). In the former case, pdev | ||||
| 	 * is populated, in the latter case, pbus is. | ||||
| 	 */ | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| 	struct pci_dev          *parent_dev; | ||||
| #endif | ||||
| 	struct pci_dev		*pdev; | ||||
| 	struct pci_bus		*pbus; | ||||
| 
 | ||||
|  | @ -53,7 +57,7 @@ struct pnv_ioda_pe { | |||
| 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ | ||||
| 	int			tce32_seg; | ||||
| 	int			tce32_segcount; | ||||
| 	struct iommu_table	tce32_table; | ||||
| 	struct iommu_table	*tce32_table; | ||||
| 	phys_addr_t		tce_inval_reg_phys; | ||||
| 
 | ||||
| 	/* 64-bit TCE bypass region */ | ||||
|  | @ -145,6 +149,8 @@ struct pnv_phb { | |||
| 
 | ||||
| 			/* PE allocation bitmap */ | ||||
| 			unsigned long		*pe_alloc; | ||||
| 			/* PE allocation mutex */ | ||||
| 			struct mutex		pe_alloc_mutex; | ||||
| 
 | ||||
| 			/* M32 & IO segment maps */ | ||||
| 			unsigned int		*m32_segmap; | ||||
|  | @ -159,6 +165,7 @@ struct pnv_phb { | |||
| 			 * on the sequence of creation | ||||
| 			 */ | ||||
| 			struct list_head	pe_list; | ||||
| 			struct mutex            pe_list_mutex; | ||||
| 
 | ||||
| 			/* Reverse map of PEs, will have to extend if
 | ||||
| 			 * we are to support more than 256 PEs, indexed | ||||
|  |  | |||
|  | @ -19,16 +19,59 @@ | |||
| 
 | ||||
| #define VIRTFN_ID_LEN	16 | ||||
| 
 | ||||
| static inline u8 virtfn_bus(struct pci_dev *dev, int id) | ||||
| int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id) | ||||
| { | ||||
| 	if (!dev->is_physfn) | ||||
| 		return -EINVAL; | ||||
| 	return dev->bus->number + ((dev->devfn + dev->sriov->offset + | ||||
| 				    dev->sriov->stride * id) >> 8); | ||||
| 				    dev->sriov->stride * vf_id) >> 8); | ||||
| } | ||||
| 
 | ||||
| static inline u8 virtfn_devfn(struct pci_dev *dev, int id) | ||||
| int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id) | ||||
| { | ||||
| 	if (!dev->is_physfn) | ||||
| 		return -EINVAL; | ||||
| 	return (dev->devfn + dev->sriov->offset + | ||||
| 		dev->sriov->stride * id) & 0xff; | ||||
| 		dev->sriov->stride * vf_id) & 0xff; | ||||
| } | ||||
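Editorial note: per the arithmetic above, a VF's routing ID is the PF's devfn plus First VF Offset plus VF Stride times the VF index, and any carry past 8 bits spills into the bus number. A worked example with assumed values (PF at bus 4, devfn 0, offset 1, stride 1) showing the spill:

	#include <stdio.h>

	int main(void)
	{
		int pf_bus = 4, pf_devfn = 0;	/* sample PF (assumed) */
		int offset = 1, stride = 1;	/* sample offset/stride (assumed) */
		int vf_id;

		for (vf_id = 253; vf_id <= 256; vf_id++) {
			int rid = pf_devfn + offset + stride * vf_id;

			printf("VF%d -> bus %d, devfn 0x%02x\n",
			       vf_id, pf_bus + (rid >> 8), rid & 0xff);
		}
		return 0;
	}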
| 
 | ||||
| /*
 | ||||
|  * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may | ||||
|  * change when NumVFs changes. | ||||
|  * | ||||
|  * Update iov->offset and iov->stride when NumVFs is written. | ||||
|  */ | ||||
| static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn) | ||||
| { | ||||
| 	struct pci_sriov *iov = dev->sriov; | ||||
| 
 | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); | ||||
| 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &iov->offset); | ||||
| 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &iov->stride); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * The PF consumes one bus number.  NumVFs, First VF Offset, and VF Stride | ||||
|  * determine how many additional bus numbers will be consumed by VFs. | ||||
|  * | ||||
|  * Iterate over all valid NumVFs and calculate the maximum number of bus | ||||
|  * numbers that could ever be required. | ||||
|  */ | ||||
| static inline u8 virtfn_max_buses(struct pci_dev *dev) | ||||
| { | ||||
| 	struct pci_sriov *iov = dev->sriov; | ||||
| 	int nr_virtfn; | ||||
| 	u8 max = 0; | ||||
| 	int busnr; | ||||
| 
 | ||||
| 	for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) { | ||||
| 		pci_iov_set_numvfs(dev, nr_virtfn); | ||||
| 		busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1); | ||||
| 		if (busnr > max) | ||||
| 			max = busnr; | ||||
| 	} | ||||
| 
 | ||||
| 	return max; | ||||
| } | ||||
| 
 | ||||
| static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) | ||||
|  | @ -57,6 +100,14 @@ static void virtfn_remove_bus(struct pci_bus *physbus, struct pci_bus *virtbus) | |||
| 		pci_remove_bus(virtbus); | ||||
| } | ||||
| 
 | ||||
| resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) | ||||
| { | ||||
| 	if (!dev->is_physfn) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return dev->sriov->barsz[resno - PCI_IOV_RESOURCES]; | ||||
| } | ||||
| 
 | ||||
| static int virtfn_add(struct pci_dev *dev, int id, int reset) | ||||
| { | ||||
| 	int i; | ||||
|  | @ -69,7 +120,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) | |||
| 	struct pci_bus *bus; | ||||
| 
 | ||||
| 	mutex_lock(&iov->dev->sriov->lock); | ||||
| 	bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id)); | ||||
| 	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id)); | ||||
| 	if (!bus) | ||||
| 		goto failed; | ||||
| 
 | ||||
|  | @ -77,7 +128,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) | |||
| 	if (!virtfn) | ||||
| 		goto failed0; | ||||
| 
 | ||||
| 	virtfn->devfn = virtfn_devfn(dev, id); | ||||
| 	virtfn->devfn = pci_iov_virtfn_devfn(dev, id); | ||||
| 	virtfn->vendor = dev->vendor; | ||||
| 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device); | ||||
| 	pci_setup_device(virtfn); | ||||
|  | @ -87,13 +138,12 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) | |||
| 	virtfn->multifunction = 0; | ||||
| 
 | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		res = dev->resource + PCI_IOV_RESOURCES + i; | ||||
| 		res = &dev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		if (!res->parent) | ||||
| 			continue; | ||||
| 		virtfn->resource[i].name = pci_name(virtfn); | ||||
| 		virtfn->resource[i].flags = res->flags; | ||||
| 		size = resource_size(res); | ||||
| 		do_div(size, iov->total_VFs); | ||||
| 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); | ||||
| 		virtfn->resource[i].start = res->start + size * id; | ||||
| 		virtfn->resource[i].end = virtfn->resource[i].start + size - 1; | ||||
| 		rc = request_resource(res, &virtfn->resource[i]); | ||||
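Editorial note: with barsz[] recording the per-VF size, carving VF id's BAR out of the PF's IOV resource is a plain stride, as the loop above shows. A small sketch with assumed numbers (the base address and slice size are illustrative only):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t pf_start = 0x100000000ULL;	/* sample IOV BAR base (assumed) */
		uint64_t vf_size = 8 << 20;		/* per-VF size, as from barsz[] */
		int id;

		for (id = 0; id < 4; id++)
			printf("VF%d BAR: [0x%llx-0x%llx]\n", id,
			       (unsigned long long)(pf_start + vf_size * id),
			       (unsigned long long)(pf_start + vf_size * (id + 1) - 1));
		return 0;
	}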
|  | @ -140,8 +190,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) | |||
| 	struct pci_sriov *iov = dev->sriov; | ||||
| 
 | ||||
| 	virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus), | ||||
| 					     virtfn_bus(dev, id), | ||||
| 					     virtfn_devfn(dev, id)); | ||||
| 					     pci_iov_virtfn_bus(dev, id), | ||||
| 					     pci_iov_virtfn_devfn(dev, id)); | ||||
| 	if (!virtfn) | ||||
| 		return; | ||||
| 
 | ||||
|  | @ -170,6 +220,11 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) | |||
| 	pci_dev_put(dev); | ||||
| } | ||||
| 
 | ||||
| int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) | ||||
| { | ||||
| 	return 0; | ||||
| } | ||||
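Editorial note: the __weak definition above is the standard way a core-PCI hook gets an architecture override: any strong definition of the same symbol in arch code wins at link time. A generic two-file sketch of the pattern (names and bodies are placeholders, not the PowerNV implementation):

	/* core.c -- generic weak default (hypothetical) */
	int __attribute__((weak)) demo_sriov_enable(int num_vfs)
	{
		return 0;	/* nothing arch-specific to do */
	}

	/* arch.c -- strong definition, overrides the weak one at link time */
	int demo_sriov_enable(int num_vfs)
	{
		/* allocate per-VF platform state here (placeholder) */
		return 0;
	}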
| 
 | ||||
| static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | ||||
| { | ||||
| 	int rc; | ||||
|  | @ -180,6 +235,8 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | |||
| 	struct pci_dev *pdev; | ||||
| 	struct pci_sriov *iov = dev->sriov; | ||||
| 	int bars = 0; | ||||
| 	int bus; | ||||
| 	int retval; | ||||
| 
 | ||||
| 	if (!nr_virtfn) | ||||
| 		return 0; | ||||
|  | @ -204,7 +261,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | |||
| 	nres = 0; | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		bars |= (1 << (i + PCI_IOV_RESOURCES)); | ||||
| 		res = dev->resource + PCI_IOV_RESOURCES + i; | ||||
| 		res = &dev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		if (res->parent) | ||||
| 			nres++; | ||||
| 	} | ||||
|  | @ -216,8 +273,10 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | |||
| 	iov->offset = offset; | ||||
| 	iov->stride = stride; | ||||
| 
 | ||||
| 	if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) { | ||||
| 		dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); | ||||
| 	bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1); | ||||
| 	if (bus > dev->bus->busn_res.end) { | ||||
| 		dev_err(&dev->dev, "can't enable %d VFs (bus %02x out of range of %pR)\n", | ||||
| 			nr_virtfn, bus, &dev->bus->busn_res); | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 
 | ||||
|  | @ -243,7 +302,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | |||
| 			return rc; | ||||
| 	} | ||||
| 
 | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); | ||||
| 	pci_iov_set_numvfs(dev, nr_virtfn); | ||||
| 	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE; | ||||
| 	pci_cfg_access_lock(dev); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); | ||||
|  | @ -254,6 +313,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) | |||
| 	if (nr_virtfn < initial) | ||||
| 		initial = nr_virtfn; | ||||
| 
 | ||||
| 	retval = pcibios_sriov_enable(dev, initial); | ||||
| 	if (retval) { | ||||
| 		dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n", | ||||
| 			retval); | ||||
| 		return retval; | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < initial; i++) { | ||||
| 		rc = virtfn_add(dev, i, 0); | ||||
| 		if (rc) | ||||
|  | @ -272,7 +337,7 @@ failed: | |||
| 	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); | ||||
| 	pci_cfg_access_lock(dev); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0); | ||||
| 	pci_iov_set_numvfs(dev, 0); | ||||
| 	ssleep(1); | ||||
| 	pci_cfg_access_unlock(dev); | ||||
| 
 | ||||
|  | @ -282,6 +347,11 @@ failed: | |||
| 	return rc; | ||||
| } | ||||
| 
 | ||||
| int __weak pcibios_sriov_disable(struct pci_dev *pdev) | ||||
| { | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void sriov_disable(struct pci_dev *dev) | ||||
| { | ||||
| 	int i; | ||||
|  | @ -293,6 +363,8 @@ static void sriov_disable(struct pci_dev *dev) | |||
| 	for (i = 0; i < iov->num_VFs; i++) | ||||
| 		virtfn_remove(dev, i, 0); | ||||
| 
 | ||||
| 	pcibios_sriov_disable(dev); | ||||
| 
 | ||||
| 	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); | ||||
| 	pci_cfg_access_lock(dev); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); | ||||
|  | @ -303,12 +375,12 @@ static void sriov_disable(struct pci_dev *dev) | |||
| 		sysfs_remove_link(&dev->dev.kobj, "dep_link"); | ||||
| 
 | ||||
| 	iov->num_VFs = 0; | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0); | ||||
| 	pci_iov_set_numvfs(dev, 0); | ||||
| } | ||||
| 
 | ||||
| static int sriov_init(struct pci_dev *dev, int pos) | ||||
| { | ||||
| 	int i; | ||||
| 	int i, bar64; | ||||
| 	int rc; | ||||
| 	int nres; | ||||
| 	u32 pgsz; | ||||
|  | @ -357,27 +429,29 @@ found: | |||
| 	pgsz &= ~(pgsz - 1); | ||||
| 	pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz); | ||||
| 
 | ||||
| 	iov = kzalloc(sizeof(*iov), GFP_KERNEL); | ||||
| 	if (!iov) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	nres = 0; | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		res = dev->resource + PCI_IOV_RESOURCES + i; | ||||
| 		i += __pci_read_base(dev, pci_bar_unknown, res, | ||||
| 				     pos + PCI_SRIOV_BAR + i * 4); | ||||
| 		res = &dev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		bar64 = __pci_read_base(dev, pci_bar_unknown, res, | ||||
| 					pos + PCI_SRIOV_BAR + i * 4); | ||||
| 		if (!res->flags) | ||||
| 			continue; | ||||
| 		if (resource_size(res) & (PAGE_SIZE - 1)) { | ||||
| 			rc = -EIO; | ||||
| 			goto failed; | ||||
| 		} | ||||
| 		iov->barsz[i] = resource_size(res); | ||||
| 		res->end = res->start + resource_size(res) * total - 1; | ||||
| 		dev_info(&dev->dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n", | ||||
| 			 i, res, i, total); | ||||
| 		i += bar64; | ||||
| 		nres++; | ||||
| 	} | ||||
| 
 | ||||
| 	iov = kzalloc(sizeof(*iov), GFP_KERNEL); | ||||
| 	if (!iov) { | ||||
| 		rc = -ENOMEM; | ||||
| 		goto failed; | ||||
| 	} | ||||
| 
 | ||||
| 	iov->pos = pos; | ||||
| 	iov->nres = nres; | ||||
| 	iov->ctrl = ctrl; | ||||
|  | @ -400,15 +474,17 @@ found: | |||
| 
 | ||||
| 	dev->sriov = iov; | ||||
| 	dev->is_physfn = 1; | ||||
| 	iov->max_VF_buses = virtfn_max_buses(dev); | ||||
| 
 | ||||
| 	return 0; | ||||
| 
 | ||||
| failed: | ||||
| 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { | ||||
| 		res = dev->resource + PCI_IOV_RESOURCES + i; | ||||
| 		res = &dev->resource[i + PCI_IOV_RESOURCES]; | ||||
| 		res->flags = 0; | ||||
| 	} | ||||
| 
 | ||||
| 	kfree(iov); | ||||
| 	return rc; | ||||
| } | ||||
| 
 | ||||
|  | @ -439,7 +515,7 @@ static void sriov_restore_state(struct pci_dev *dev) | |||
| 		pci_update_resource(dev, i); | ||||
| 
 | ||||
| 	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->num_VFs); | ||||
| 	pci_iov_set_numvfs(dev, iov->num_VFs); | ||||
| 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); | ||||
| 	if (iov->ctrl & PCI_SRIOV_CTRL_VFE) | ||||
| 		msleep(100); | ||||
|  | @ -493,6 +569,12 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno) | |||
| 		4 * (resno - PCI_IOV_RESOURCES); | ||||
| } | ||||
| 
 | ||||
| resource_size_t __weak pcibios_iov_resource_alignment(struct pci_dev *dev, | ||||
| 						      int resno) | ||||
| { | ||||
| 	return pci_iov_resource_size(dev, resno); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * pci_sriov_resource_alignment - get resource alignment for VF BAR | ||||
|  * @dev: the PCI device | ||||
|  | @ -505,14 +587,7 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno) | |||
|  */ | ||||
| resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno) | ||||
| { | ||||
| 	struct resource tmp; | ||||
| 	int reg = pci_iov_resource_bar(dev, resno); | ||||
| 
 | ||||
| 	if (!reg) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	 __pci_read_base(dev, pci_bar_unknown, &tmp, reg); | ||||
| 	return resource_alignment(&tmp); | ||||
| 	return pcibios_iov_resource_alignment(dev, resno); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  | @ -535,15 +610,13 @@ void pci_restore_iov_state(struct pci_dev *dev) | |||
| int pci_iov_bus_range(struct pci_bus *bus) | ||||
| { | ||||
| 	int max = 0; | ||||
| 	u8 busnr; | ||||
| 	struct pci_dev *dev; | ||||
| 
 | ||||
| 	list_for_each_entry(dev, &bus->devices, bus_list) { | ||||
| 		if (!dev->is_physfn) | ||||
| 			continue; | ||||
| 		busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1); | ||||
| 		if (busnr > max) | ||||
| 			max = busnr; | ||||
| 		if (dev->sriov->max_VF_buses > max) | ||||
| 			max = dev->sriov->max_VF_buses; | ||||
| 	} | ||||
| 
 | ||||
| 	return max ? max - bus->number : 0; | ||||
|  |  | |||
|  | @ -243,10 +243,12 @@ struct pci_sriov { | |||
| 	u16 stride;		/* following VF stride */ | ||||
| 	u32 pgsz;		/* page size for BAR alignment */ | ||||
| 	u8 link;		/* Function Dependency Link */ | ||||
| 	u8 max_VF_buses;	/* max buses consumed by VFs */ | ||||
| 	u16 driver_max_VFs;	/* max num VFs driver supports */ | ||||
| 	struct pci_dev *dev;	/* lowest numbered PF */ | ||||
| 	struct pci_dev *self;	/* this PF */ | ||||
| 	struct mutex lock;	/* lock for VF bus */ | ||||
| 	resource_size_t barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */ | ||||
| }; | ||||
| 
 | ||||
| #ifdef CONFIG_PCI_ATS | ||||
|  |  | |||
|  | @ -99,8 +99,8 @@ static void remove_from_list(struct list_head *head, | |||
| 	} | ||||
| } | ||||
| 
 | ||||
| static resource_size_t get_res_add_size(struct list_head *head, | ||||
| 					struct resource *res) | ||||
| static struct pci_dev_resource *res_to_dev_res(struct list_head *head, | ||||
| 					       struct resource *res) | ||||
| { | ||||
| 	struct pci_dev_resource *dev_res; | ||||
| 
 | ||||
|  | @ -109,17 +109,37 @@ static resource_size_t get_res_add_size(struct list_head *head, | |||
| 			int idx = res - &dev_res->dev->resource[0]; | ||||
| 
 | ||||
| 			dev_printk(KERN_DEBUG, &dev_res->dev->dev, | ||||
| 				 "res[%d]=%pR get_res_add_size add_size %llx\n", | ||||
| 				 "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n", | ||||
| 				 idx, dev_res->res, | ||||
| 				 (unsigned long long)dev_res->add_size); | ||||
| 				 (unsigned long long)dev_res->add_size, | ||||
| 				 (unsigned long long)dev_res->min_align); | ||||
| 
 | ||||
| 			return dev_res->add_size; | ||||
| 			return dev_res; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static resource_size_t get_res_add_size(struct list_head *head, | ||||
| 					struct resource *res) | ||||
| { | ||||
| 	struct pci_dev_resource *dev_res; | ||||
| 
 | ||||
| 	dev_res = res_to_dev_res(head, res); | ||||
| 	return dev_res ? dev_res->add_size : 0; | ||||
| } | ||||
| 
 | ||||
| static resource_size_t get_res_add_align(struct list_head *head, | ||||
| 					 struct resource *res) | ||||
| { | ||||
| 	struct pci_dev_resource *dev_res; | ||||
| 
 | ||||
| 	dev_res = res_to_dev_res(head, res); | ||||
| 	return dev_res ? dev_res->min_align : 0; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Sort resources by alignment */ | ||||
| static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) | ||||
| { | ||||
|  | @ -215,7 +235,7 @@ static void reassign_resources_sorted(struct list_head *realloc_head, | |||
| 	struct resource *res; | ||||
| 	struct pci_dev_resource *add_res, *tmp; | ||||
| 	struct pci_dev_resource *dev_res; | ||||
| 	resource_size_t add_size; | ||||
| 	resource_size_t add_size, align; | ||||
| 	int idx; | ||||
| 
 | ||||
| 	list_for_each_entry_safe(add_res, tmp, realloc_head, list) { | ||||
|  | @ -238,13 +258,13 @@ static void reassign_resources_sorted(struct list_head *realloc_head, | |||
| 
 | ||||
| 		idx = res - &add_res->dev->resource[0]; | ||||
| 		add_size = add_res->add_size; | ||||
| 		align = add_res->min_align; | ||||
| 		if (!resource_size(res)) { | ||||
| 			res->start = add_res->start; | ||||
| 			res->start = align; | ||||
| 			res->end = res->start + add_size - 1; | ||||
| 			if (pci_assign_resource(add_res->dev, idx)) | ||||
| 				reset_resource(res); | ||||
| 		} else { | ||||
| 			resource_size_t align = add_res->min_align; | ||||
| 			res->flags |= add_res->flags & | ||||
| 				 (IORESOURCE_STARTALIGN|IORESOURCE_SIZEALIGN); | ||||
| 			if (pci_reassign_resource(add_res->dev, idx, | ||||
|  | @ -368,8 +388,9 @@ static void __assign_resources_sorted(struct list_head *head, | |||
| 	LIST_HEAD(save_head); | ||||
| 	LIST_HEAD(local_fail_head); | ||||
| 	struct pci_dev_resource *save_res; | ||||
| 	struct pci_dev_resource *dev_res, *tmp_res; | ||||
| 	struct pci_dev_resource *dev_res, *tmp_res, *dev_res2; | ||||
| 	unsigned long fail_type; | ||||
| 	resource_size_t add_align, align; | ||||
| 
 | ||||
| 	/* Check if optional add_size is there */ | ||||
| 	if (!realloc_head || list_empty(realloc_head)) | ||||
|  | @ -384,10 +405,44 @@ static void __assign_resources_sorted(struct list_head *head, | |||
| 	} | ||||
| 
 | ||||
| 	/* Update res in head list with add_size in realloc_head list */ | ||||
| 	list_for_each_entry(dev_res, head, list) | ||||
| 	list_for_each_entry_safe(dev_res, tmp_res, head, list) { | ||||
| 		dev_res->res->end += get_res_add_size(realloc_head, | ||||
| 							dev_res->res); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * There are two kinds of additional resources in the list: | ||||
| 		 * 1. bridge resource -- IORESOURCE_STARTALIGN | ||||
| 		 * 2. SR-IOV resource -- IORESOURCE_SIZEALIGN | ||||
| 		 * Here we only fix up the additional alignment for bridges. | ||||
| 		 */ | ||||
| 		if (!(dev_res->res->flags & IORESOURCE_STARTALIGN)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		add_align = get_res_add_align(realloc_head, dev_res->res); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * The "head" list is sorted by the alignment to make sure | ||||
| 		 * resources with bigger alignment will be assigned first. | ||||
| 		 * After we change the alignment of a dev_res in "head" list, | ||||
| 		 * we need to reorder the list by alignment to make it | ||||
| 		 * consistent. | ||||
| 		 */ | ||||
| 		if (add_align > dev_res->res->start) { | ||||
| 			resource_size_t r_size = resource_size(dev_res->res); | ||||
| 
| 			dev_res->res->start = add_align; | ||||
| 			dev_res->res->end = add_align + r_size - 1; | ||||
| 
 | ||||
| 			list_for_each_entry(dev_res2, head, list) { | ||||
| 				align = pci_resource_alignment(dev_res2->dev, | ||||
| 							       dev_res2->res); | ||||
| 				if (add_align > align) | ||||
| 					list_move_tail(&dev_res->list, | ||||
| 						       &dev_res2->list); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	/* Try updated head list with add_size added */ | ||||
| 	assign_requested_resources_sorted(head, &local_fail_head); | ||||
| 
 | ||||
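Editorial note: one subtlety in the block above (corrected in the rewritten hunk): resource_size() is end - start + 1, so it has to be captured before start is moved, and the new end needs the usual -1. A sketch contrasting the two orderings, with assumed values:

	#include <stdio.h>
	#include <stdint.h>

	struct res { uint64_t start, end; };

	static uint64_t res_size(const struct res *r)
	{
		return r->end - r->start + 1;
	}

	int main(void)
	{
		struct res r = { .start = 0x1000, .end = 0x1fff };	/* 4KB */
		uint64_t add_align = 0x2000;
		uint64_t size = res_size(&r);	/* capture before moving start */

		r.start = add_align;
		r.end = add_align + size - 1;	/* still exactly 4KB */

		printf("[0x%llx-0x%llx] size %llu\n",
		       (unsigned long long)r.start,
		       (unsigned long long)r.end,
		       (unsigned long long)res_size(&r));
		return 0;
	}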
|  | @ -962,6 +1017,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, | |||
| 	struct resource *b_res = find_free_bus_resource(bus, | ||||
| 					mask | IORESOURCE_PREFETCH, type); | ||||
| 	resource_size_t children_add_size = 0; | ||||
| 	resource_size_t children_add_align = 0; | ||||
| 	resource_size_t add_align = 0; | ||||
| 
 | ||||
| 	if (!b_res) | ||||
| 		return -ENOSPC; | ||||
|  | @ -986,6 +1043,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, | |||
| 			/* put SRIOV requested res to the optional list */ | ||||
| 			if (realloc_head && i >= PCI_IOV_RESOURCES && | ||||
| 					i <= PCI_IOV_RESOURCE_END) { | ||||
| 				add_align = max(pci_resource_alignment(dev, r), add_align); | ||||
| 				r->end = r->start - 1; | ||||
| 				add_to_list(realloc_head, dev, r, r_size, 0/* don't care */); | ||||
| 				children_add_size += r_size; | ||||
|  | @ -1016,19 +1074,23 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, | |||
| 			if (order > max_order) | ||||
| 				max_order = order; | ||||
| 
 | ||||
| 			if (realloc_head) | ||||
| 			if (realloc_head) { | ||||
| 				children_add_size += get_res_add_size(realloc_head, r); | ||||
| 				children_add_align = get_res_add_align(realloc_head, r); | ||||
| 				add_align = max(add_align, children_add_align); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	min_align = calculate_mem_align(aligns, max_order); | ||||
| 	min_align = max(min_align, window_alignment(bus, b_res->flags)); | ||||
| 	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); | ||||
| 	add_align = max(min_align, add_align); | ||||
| 	if (children_add_size > add_size) | ||||
| 		add_size = children_add_size; | ||||
| 	size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : | ||||
| 		calculate_memsize(size, min_size, add_size, | ||||
| 				resource_size(b_res), min_align); | ||||
| 				resource_size(b_res), add_align); | ||||
| 	if (!size0 && !size1) { | ||||
| 		if (b_res->start || b_res->end) | ||||
| 			dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n", | ||||
|  | @ -1040,10 +1102,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, | |||
| 	b_res->end = size0 + min_align - 1; | ||||
| 	b_res->flags |= IORESOURCE_STARTALIGN; | ||||
| 	if (size1 > size0 && realloc_head) { | ||||
| 		add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align); | ||||
| 		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n", | ||||
| 		add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align); | ||||
| 		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n", | ||||
| 			   b_res, &bus->busn_res, | ||||
| 			   (unsigned long long)size1-size0); | ||||
| 			   (unsigned long long) (size1 - size0), | ||||
| 			   (unsigned long long) add_align); | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
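Editorial note: the practical effect of add_align in this function is that the optional bridge window carried on the realloc list is requested at the largest child IOV alignment rather than the bridge minimum, so an expanded VF BAR can actually land inside it. Quick arithmetic with assumed values (1MB bridge minimum versus a 2GB expanded IOV BAR):

	#include <stdio.h>
	#include <stdint.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t base = 0x80100000ULL;		/* sample allocation cursor (assumed) */
		uint64_t min_align = 1 << 20;		/* bridge minimum: 1MB */
		uint64_t add_align = 1ULL << 31;	/* expanded IOV BAR: 2GB */

		printf("window with min_align: 0x%llx\n",
		       (unsigned long long)ALIGN_UP(base, min_align));
		printf("window with add_align: 0x%llx\n",
		       (unsigned long long)ALIGN_UP(base, add_align));
		return 0;
	}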
|  |  | |||
|  | @ -1174,6 +1174,7 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus); | |||
| void pci_setup_bridge(struct pci_bus *bus); | ||||
| resource_size_t pcibios_window_alignment(struct pci_bus *bus, | ||||
| 					 unsigned long type); | ||||
| resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); | ||||
| 
 | ||||
| #define PCI_VGA_STATE_CHANGE_BRIDGE (1 << 0) | ||||
| #define PCI_VGA_STATE_CHANGE_DECODES (1 << 1) | ||||
|  | @ -1669,13 +1670,25 @@ int pci_ext_cfg_avail(void); | |||
| void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); | ||||
| 
 | ||||
| #ifdef CONFIG_PCI_IOV | ||||
| int pci_iov_virtfn_bus(struct pci_dev *dev, int id); | ||||
| int pci_iov_virtfn_devfn(struct pci_dev *dev, int id); | ||||
| 
 | ||||
| int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); | ||||
| void pci_disable_sriov(struct pci_dev *dev); | ||||
| int pci_num_vf(struct pci_dev *dev); | ||||
| int pci_vfs_assigned(struct pci_dev *dev); | ||||
| int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs); | ||||
| int pci_sriov_get_totalvfs(struct pci_dev *dev); | ||||
| resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno); | ||||
| #else | ||||
| static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id) | ||||
| { | ||||
| 	return -ENOSYS; | ||||
| } | ||||
| static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id) | ||||
| { | ||||
| 	return -ENOSYS; | ||||
| } | ||||
| static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) | ||||
| { return -ENODEV; } | ||||
| static inline void pci_disable_sriov(struct pci_dev *dev) { } | ||||
|  | @ -1686,6 +1699,8 @@ static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs) | |||
| { return 0; } | ||||
| static inline int pci_sriov_get_totalvfs(struct pci_dev *dev) | ||||
| { return 0; } | ||||
| static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) | ||||
| { return 0; } | ||||
| #endif | ||||
| 
 | ||||
| #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) | ||||
|  |  | |||