/* 2024-03-26 00:20:37 +01:00 — stray commit timestamp from a bad merge/extraction */
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
// SPDX-FileCopyrightText: Copyright Red Hat
|
|
|
|
|
|
|
|
#include <linux/bitfield.h>
|
|
|
|
#include <linux/cleanup.h>
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/slab.h>
|
/*
 * NOTE(review): the following is stray commit-message text spliced into the
 * source by a bad merge/extraction. Preserved here as a comment so the file
 * stays valid C; it should eventually be removed from the source entirely.
 *
 * ice: avoid the PTP hardware semaphore in gettimex64 path
 *
 * The PTP hardware semaphore (PFTSYN_SEM) is used to synchronize
 * operations that program the PTP timers. The operations involve issuing
 * commands to the sideband queue. The E810 does not have a hardware
 * sideband queue, so the admin queue is used. The admin queue is slow.
 * I have observed delays in hundreds of milliseconds waiting for
 * ice_sq_done.
 * When phc2sys reads the time from the ice PTP clock and PFTSYN_SEM is
 * held by a task performing one of the slow operations, ice_ptp_lock can
 * easily time out. phc2sys gets -EBUSY and the kernel prints:
 *   ice 0000:XX:YY.0: PTP failed to get time
 * These messages appear once every few seconds, causing log spam.
 * The E810 datasheet recommends an algorithm for reading the upper 64 bits
 * of the GLTSYN_TIME register. It matches what's implemented in
 * ice_ptp_read_src_clk_reg. It is robust against wrap-around, but not
 * necessarily against the concurrent setting of the register (with
 * GLTSYN_CMD_{INIT,ADJ}_TIME commands). Perhaps that's why
 * ice_ptp_gettimex64 also takes PFTSYN_SEM.
 * The race with time setters can be prevented without relying on the PTP
 * hardware semaphore. Using the "ice_adapter" from the previous patch,
 * we can have a common spinlock for the PFs that share the clock hardware.
 * It will protect the reading and writing to the GLTSYN_TIME register.
 * The writing is performed indirectly, by the hardware, as a result of
 * the driver writing GLTSYN_CMD_SYNC in ice_ptp_exec_tmr_cmd. I wasn't
 * sure if the ice_flush there is enough to make sure GLTSYN_TIME has been
 * updated, but it works well in my testing.
 * My test code can be seen here:
 *   https://gitlab.com/mschmidt2/linux/-/commits/ice-ptp-host-side-lock-10
 * It consists of:
 * - kernel threads reading the time in a busy loop and looking at the
 *   deltas between consecutive values, reporting new maxima;
 * - a shell script that sets the time repeatedly;
 * - a bpftrace probe to produce a histogram of the measured deltas.
 * Without the spinlock ptp_gltsyn_time_lock, it is easy to see tearing.
 * Deltas in the [2G, 4G) range appear in the histograms.
 * With the spinlock added, there is no tearing and the biggest delta I saw
 * was in the range [1M, 2M), that is under 2 ms.
 *
 * Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
 * Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
 * Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
 * Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
 * Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
 * 2024-03-26 00:20:38 +01:00
 */
|
|
|
#include <linux/spinlock.h>
|
/* 2024-03-26 00:20:37 +01:00 — stray commit timestamp from a bad merge/extraction */
|
|
|
#include <linux/xarray.h>
|
|
|
|
#include "ice_adapter.h"
|
|
|
|
|
|
|
|
/* Maps ice_adapter_index() values to shared struct ice_adapter instances. */
static DEFINE_XARRAY(ice_adapters);
|
|
|
|
|
|
|
|
/* PCI bus number is 8 bits. Slot is 5 bits. Domain can have the rest. */
#define INDEX_FIELD_DOMAIN	GENMASK(BITS_PER_LONG - 1, 13)
#define INDEX_FIELD_BUS		GENMASK(12, 5)
#define INDEX_FIELD_SLOT	GENMASK(4, 0)
|
|
|
|
|
|
|
|
static unsigned long ice_adapter_index(const struct pci_dev *pdev)
|
|
|
|
{
|
|
|
|
unsigned int domain = pci_domain_nr(pdev->bus);
|
|
|
|
|
|
|
|
WARN_ON(domain > FIELD_MAX(INDEX_FIELD_DOMAIN));
|
|
|
|
|
|
|
|
return FIELD_PREP(INDEX_FIELD_DOMAIN, domain) |
|
|
|
|
FIELD_PREP(INDEX_FIELD_BUS, pdev->bus->number) |
|
|
|
|
FIELD_PREP(INDEX_FIELD_SLOT, PCI_SLOT(pdev->devfn));
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ice_adapter *ice_adapter_new(void)
|
|
|
|
{
|
|
|
|
struct ice_adapter *adapter;
|
|
|
|
|
|
|
|
adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
|
|
|
|
if (!adapter)
|
|
|
|
return NULL;
|
|
|
|
|
ice: avoid the PTP hardware semaphore in gettimex64 path
The PTP hardware semaphore (PFTSYN_SEM) is used to synchronize
operations that program the PTP timers. The operations involve issuing
commands to the sideband queue. The E810 does not have a hardware
sideband queue, so the admin queue is used. The admin queue is slow.
I have observed delays in hundreds of milliseconds waiting for
ice_sq_done.
When phc2sys reads the time from the ice PTP clock and PFTSYN_SEM is
held by a task performing one of the slow operations, ice_ptp_lock can
easily time out. phc2sys gets -EBUSY and the kernel prints:
ice 0000:XX:YY.0: PTP failed to get time
These messages appear once every few seconds, causing log spam.
The E810 datasheet recommends an algorithm for reading the upper 64 bits
of the GLTSYN_TIME register. It matches what's implemented in
ice_ptp_read_src_clk_reg. It is robust against wrap-around, but not
necessarily against the concurrent setting of the register (with
GLTSYN_CMD_{INIT,ADJ}_TIME commands). Perhaps that's why
ice_ptp_gettimex64 also takes PFTSYN_SEM.
The race with time setters can be prevented without relying on the PTP
hardware semaphore. Using the "ice_adapter" from the previous patch,
we can have a common spinlock for the PFs that share the clock hardware.
It will protect the reading and writing to the GLTSYN_TIME register.
The writing is performed indirectly, by the hardware, as a result of
the driver writing GLTSYN_CMD_SYNC in ice_ptp_exec_tmr_cmd. I wasn't
sure if the ice_flush there is enough to make sure GLTSYN_TIME has been
updated, but it works well in my testing.
My test code can be seen here:
https://gitlab.com/mschmidt2/linux/-/commits/ice-ptp-host-side-lock-10
It consists of:
- kernel threads reading the time in a busy loop and looking at the
deltas between consecutive values, reporting new maxima.
- a shell script that sets the time repeatedly;
- a bpftrace probe to produce a histogram of the measured deltas.
Without the spinlock ptp_gltsyn_time_lock, it is easy to see tearing.
Deltas in the [2G, 4G) range appear in the histograms.
With the spinlock added, there is no tearing and the biggest delta I saw
was in the range [1M, 2M), that is under 2 ms.
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2024-03-26 00:20:38 +01:00
|
|
|
spin_lock_init(&adapter->ptp_gltsyn_time_lock);
|
2024-03-26 00:20:37 +01:00
|
|
|
refcount_set(&adapter->refcount, 1);
|
|
|
|
|
|
|
|
return adapter;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ice_adapter_free - Free an ice_adapter.
 * @adapter: adapter to free; may be NULL at the DEFINE_FREE call site,
 *           but direct callers pass the last live reference.
 */
static void ice_adapter_free(struct ice_adapter *adapter)
{
	kfree(adapter);
}
|
|
|
|
|
|
|
|
/* Enable __free(ice_adapter_free) scope-based cleanup for ice_adapter
 * pointers; the NULL check makes it a no-op after no_free_ptr() has
 * transferred ownership.
 */
DEFINE_FREE(ice_adapter_free, struct ice_adapter*, if (_T) ice_adapter_free(_T))
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ice_adapter_get - Get a shared ice_adapter structure.
|
|
|
|
* @pdev: Pointer to the pci_dev whose driver is getting the ice_adapter.
|
|
|
|
*
|
|
|
|
* Gets a pointer to a shared ice_adapter structure. Physical functions (PFs)
|
|
|
|
* of the same multi-function PCI device share one ice_adapter structure.
|
|
|
|
* The ice_adapter is reference-counted. The PF driver must use ice_adapter_put
|
|
|
|
* to release its reference.
|
|
|
|
*
|
|
|
|
* Context: Process, may sleep.
|
|
|
|
* Return: Pointer to ice_adapter on success.
|
|
|
|
* ERR_PTR() on error. -ENOMEM is the only possible error.
|
|
|
|
*/
|
|
|
|
struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev)
|
|
|
|
{
|
|
|
|
struct ice_adapter *ret, __free(ice_adapter_free) *adapter = NULL;
|
|
|
|
unsigned long index = ice_adapter_index(pdev);
|
|
|
|
|
|
|
|
adapter = ice_adapter_new();
|
|
|
|
if (!adapter)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
xa_lock(&ice_adapters);
|
|
|
|
ret = __xa_cmpxchg(&ice_adapters, index, NULL, adapter, GFP_KERNEL);
|
|
|
|
if (xa_is_err(ret)) {
|
|
|
|
ret = ERR_PTR(xa_err(ret));
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
if (ret) {
|
|
|
|
refcount_inc(&ret->refcount);
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
ret = no_free_ptr(adapter);
|
|
|
|
unlock:
|
|
|
|
xa_unlock(&ice_adapters);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ice_adapter_put - Release a reference to the shared ice_adapter structure.
|
|
|
|
* @pdev: Pointer to the pci_dev whose driver is releasing the ice_adapter.
|
|
|
|
*
|
|
|
|
* Releases the reference to ice_adapter previously obtained with
|
|
|
|
* ice_adapter_get.
|
|
|
|
*
|
|
|
|
* Context: Any.
|
|
|
|
*/
|
|
|
|
void ice_adapter_put(const struct pci_dev *pdev)
|
|
|
|
{
|
|
|
|
unsigned long index = ice_adapter_index(pdev);
|
|
|
|
struct ice_adapter *adapter;
|
|
|
|
|
|
|
|
xa_lock(&ice_adapters);
|
|
|
|
adapter = xa_load(&ice_adapters, index);
|
|
|
|
if (WARN_ON(!adapter))
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
if (!refcount_dec_and_test(&adapter->refcount))
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
WARN_ON(__xa_erase(&ice_adapters, index) != adapter);
|
|
|
|
ice_adapter_free(adapter);
|
|
|
|
unlock:
|
|
|
|
xa_unlock(&ice_adapters);
|
|
|
|
}
|