linux/drivers/net/ethernet/intel/ice/ice_adapter.c

117 lines
3.1 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
// SPDX-FileCopyrightText: Copyright Red Hat
#include <linux/bitfield.h>
#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/slab.h>
/*
 * ice: avoid the PTP hardware semaphore in gettimex64 path
 *
 * The PTP hardware semaphore (PFTSYN_SEM) is used to synchronize operations
 * that program the PTP timers. The operations involve issuing commands to the
 * sideband queue. The E810 does not have a hardware sideband queue, so the
 * admin queue is used. The admin queue is slow. I have observed delays in
 * hundreds of milliseconds waiting for ice_sq_done.
 *
 * When phc2sys reads the time from the ice PTP clock and PFTSYN_SEM is held by
 * a task performing one of the slow operations, ice_ptp_lock can easily time
 * out. phc2sys gets -EBUSY and the kernel prints:
 *   ice 0000:XX:YY.0: PTP failed to get time
 * These messages appear once every few seconds, causing log spam.
 *
 * The E810 datasheet recommends an algorithm for reading the upper 64 bits of
 * the GLTSYN_TIME register. It matches what's implemented in
 * ice_ptp_read_src_clk_reg. It is robust against wrap-around, but not
 * necessarily against the concurrent setting of the register (with
 * GLTSYN_CMD_{INIT,ADJ}_TIME commands). Perhaps that's why ice_ptp_gettimex64
 * also takes PFTSYN_SEM.
 *
 * The race with time setters can be prevented without relying on the PTP
 * hardware semaphore. Using the "ice_adapter" from the previous patch, we can
 * have a common spinlock for the PFs that share the clock hardware. It will
 * protect the reading and writing to the GLTSYN_TIME register. The writing is
 * performed indirectly, by the hardware, as a result of the driver writing
 * GLTSYN_CMD_SYNC in ice_ptp_exec_tmr_cmd. I wasn't sure if the ice_flush
 * there is enough to make sure GLTSYN_TIME has been updated, but it works well
 * in my testing.
 *
 * My test code can be seen here:
 * https://gitlab.com/mschmidt2/linux/-/commits/ice-ptp-host-side-lock-10
 * It consists of:
 *  - kernel threads reading the time in a busy loop and looking at the deltas
 *    between consecutive values, reporting new maxima.
 *  - a shell script that sets the time repeatedly;
 *  - a bpftrace probe to produce a histogram of the measured deltas.
 *
 * Without the spinlock ptp_gltsyn_time_lock, it is easy to see tearing. Deltas
 * in the [2G, 4G) range appear in the histograms. With the spinlock added,
 * there is no tearing and the biggest delta I saw was in the range [1M, 2M),
 * that is under 2 ms.
 *
 * Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
 * Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
 * Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
 * Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
 * Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
 *
 * 2024-03-26 00:20:38 +01:00
 */
#include <linux/spinlock.h>
#include <linux/xarray.h>
#include "ice_adapter.h"
/* All live ice_adapter structures, keyed by the value of ice_adapter_index(). */
static DEFINE_XARRAY(ice_adapters);
/*
 * Layout of the xarray index built by ice_adapter_index().
 * PCI bus number is 8 bits. Slot is 5 bits. Domain can have the rest.
 */
#define INDEX_FIELD_DOMAIN GENMASK(BITS_PER_LONG - 1, 13)
#define INDEX_FIELD_BUS GENMASK(12, 5)
#define INDEX_FIELD_SLOT GENMASK(4, 0)
/*
 * Compute the ice_adapters key for @pdev.
 *
 * The key packs the PCI domain, the bus number and the slot (device) number
 * into the INDEX_FIELD_* bit ranges. The PCI function number is not part of
 * the key. A domain number too large for its field triggers a WARN.
 */
static unsigned long ice_adapter_index(const struct pci_dev *pdev)
{
	unsigned int domain = pci_domain_nr(pdev->bus);
	unsigned long index;

	WARN_ON(domain > FIELD_MAX(INDEX_FIELD_DOMAIN));

	index = FIELD_PREP(INDEX_FIELD_DOMAIN, domain);
	index |= FIELD_PREP(INDEX_FIELD_BUS, pdev->bus->number);
	index |= FIELD_PREP(INDEX_FIELD_SLOT, PCI_SLOT(pdev->devfn));

	return index;
}
static struct ice_adapter *ice_adapter_new(void)
{
struct ice_adapter *adapter;
adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
if (!adapter)
return NULL;
ice: avoid the PTP hardware semaphore in gettimex64 path The PTP hardware semaphore (PFTSYN_SEM) is used to synchronize operations that program the PTP timers. The operations involve issuing commands to the sideband queue. The E810 does not have a hardware sideband queue, so the admin queue is used. The admin queue is slow. I have observed delays in hundreds of milliseconds waiting for ice_sq_done. When phc2sys reads the time from the ice PTP clock and PFTSYN_SEM is held by a task performing one of the slow operations, ice_ptp_lock can easily time out. phc2sys gets -EBUSY and the kernel prints: ice 0000:XX:YY.0: PTP failed to get time These messages appear once every few seconds, causing log spam. The E810 datasheet recommends an algorithm for reading the upper 64 bits of the GLTSYN_TIME register. It matches what's implemented in ice_ptp_read_src_clk_reg. It is robust against wrap-around, but not necessarily against the concurrent setting of the register (with GLTSYN_CMD_{INIT,ADJ}_TIME commands). Perhaps that's why ice_ptp_gettimex64 also takes PFTSYN_SEM. The race with time setters can be prevented without relying on the PTP hardware semaphore. Using the "ice_adapter" from the previous patch, we can have a common spinlock for the PFs that share the clock hardware. It will protect the reading and writing to the GLTSYN_TIME register. The writing is performed indirectly, by the hardware, as a result of the driver writing GLTSYN_CMD_SYNC in ice_ptp_exec_tmr_cmd. I wasn't sure if the ice_flush there is enough to make sure GLTSYN_TIME has been updated, but it works well in my testing. My test code can be seen here: https://gitlab.com/mschmidt2/linux/-/commits/ice-ptp-host-side-lock-10 It consists of: - kernel threads reading the time in a busy loop and looking at the deltas between consecutive values, reporting new maxima. - a shell script that sets the time repeatedly; - a bpftrace probe to produce a histogram of the measured deltas. 
Without the spinlock ptp_gltsyn_time_lock, it is easy to see tearing. Deltas in the [2G, 4G) range appear in the histograms. With the spinlock added, there is no tearing and the biggest delta I saw was in the range [1M, 2M), that is under 2 ms. Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com> Signed-off-by: Michal Schmidt <mschmidt@redhat.com> Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel) Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2024-03-26 00:20:38 +01:00
spin_lock_init(&adapter->ptp_gltsyn_time_lock);
refcount_set(&adapter->refcount, 1);
return adapter;
}
/* Release the memory of an ice_adapter structure with kfree(). */
static void ice_adapter_free(struct ice_adapter *adapter)
{
	kfree(adapter);
}
/*
 * Scope-based cleanup class for "struct ice_adapter *" variables declared with
 * __free(ice_adapter_free): a non-NULL pointer is passed to ice_adapter_free()
 * when the variable goes out of scope.
 */
DEFINE_FREE(ice_adapter_free, struct ice_adapter*, if (_T) ice_adapter_free(_T))
/**
* ice_adapter_get - Get a shared ice_adapter structure.
* @pdev: Pointer to the pci_dev whose driver is getting the ice_adapter.
*
* Gets a pointer to a shared ice_adapter structure. Physical functions (PFs)
* of the same multi-function PCI device share one ice_adapter structure.
* The ice_adapter is reference-counted. The PF driver must use ice_adapter_put
* to release its reference.
*
* Context: Process, may sleep.
* Return: Pointer to ice_adapter on success.
* ERR_PTR() on error. -ENOMEM is the only possible error.
*/
struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev)
{
struct ice_adapter *ret, __free(ice_adapter_free) *adapter = NULL;
unsigned long index = ice_adapter_index(pdev);
adapter = ice_adapter_new();
if (!adapter)
return ERR_PTR(-ENOMEM);
xa_lock(&ice_adapters);
ret = __xa_cmpxchg(&ice_adapters, index, NULL, adapter, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
goto unlock;
}
if (ret) {
refcount_inc(&ret->refcount);
goto unlock;
}
ret = no_free_ptr(adapter);
unlock:
xa_unlock(&ice_adapters);
return ret;
}
/**
 * ice_adapter_put - Drop a reference to the shared ice_adapter structure.
 * @pdev: Pointer to the pci_dev whose driver is releasing the ice_adapter.
 *
 * Undoes one earlier ice_adapter_get for the same device. When the last
 * reference is dropped, the structure is removed from the ice_adapters
 * xarray and freed. A WARN is emitted if no entry exists for the device.
 *
 * Context: Does not sleep. Takes the ice_adapters xarray lock with xa_lock().
 */
void ice_adapter_put(const struct pci_dev *pdev)
{
	unsigned long index = ice_adapter_index(pdev);
	struct ice_adapter *entry;

	xa_lock(&ice_adapters);

	entry = xa_load(&ice_adapters, index);
	if (!WARN_ON(!entry) && refcount_dec_and_test(&entry->refcount)) {
		/* Last reference gone: unpublish the entry, then free it. */
		WARN_ON(__xa_erase(&ice_adapters, index) != entry);
		ice_adapter_free(entry);
	}

	xa_unlock(&ice_adapters);
}