mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-18 22:14:16 +00:00
powerpc/eeh: Make EEH driver device hotplug safe
Multiple race conditions existed between the PCIe hotplug driver and the EEH driver, leading to a variety of kernel oopses of the same general nature: <pcie device unplug> <eeh driver trigger> <hotplug removal trigger> <pcie tree reconfiguration> <eeh recovery next step> <oops in EEH driver bus iteration loop> A second class of oops is also seen when the underlying bus disappears during device recovery. Refactor the EEH module to be PCI rescan and remove safe. Also clean up a few minor formatting / readability issues. Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com> Link: https://patch.msgid.link/1334208367.1359861.1752615503144.JavaMail.zimbra@raptorengineeringinc.com
This commit is contained in:
parent
e82b34eed0
commit
1010b4c012
2 changed files with 38 additions and 20 deletions
|
|
@ -257,13 +257,12 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
|
|||
struct pci_driver *driver;
|
||||
enum pci_ers_result new_result;
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
pdev = edev->pdev;
|
||||
if (pdev)
|
||||
get_device(&pdev->dev);
|
||||
pci_unlock_rescan_remove();
|
||||
if (!pdev) {
|
||||
eeh_edev_info(edev, "no device");
|
||||
*result = PCI_ERS_RESULT_DISCONNECT;
|
||||
return;
|
||||
}
|
||||
device_lock(&pdev->dev);
|
||||
|
|
@ -304,8 +303,9 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
|
|||
struct eeh_dev *edev, *tmp;
|
||||
|
||||
pr_info("EEH: Beginning: '%s'\n", name);
|
||||
eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
|
||||
eeh_pe_report_edev(edev, fn, result);
|
||||
eeh_for_each_pe(root, pe)
|
||||
eeh_pe_for_each_dev(pe, edev, tmp)
|
||||
eeh_pe_report_edev(edev, fn, result);
|
||||
if (result)
|
||||
pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
|
||||
name, pci_ers_result_name(*result));
|
||||
|
|
@ -383,6 +383,8 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
|
|||
if (!edev)
|
||||
return;
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
|
||||
/*
|
||||
* The content in the config space isn't saved because
|
||||
* the blocked config space on some adapters. We have
|
||||
|
|
@ -393,14 +395,19 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
|
|||
if (list_is_last(&edev->entry, &edev->pe->edevs))
|
||||
eeh_pe_restore_bars(edev->pe);
|
||||
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
}
|
||||
|
||||
pdev = eeh_dev_to_pci_dev(edev);
|
||||
if (!pdev)
|
||||
if (!pdev) {
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
}
|
||||
|
||||
pci_restore_state(pdev);
|
||||
|
||||
pci_unlock_rescan_remove();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -647,9 +654,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
|
|||
if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
|
||||
eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
|
||||
} else {
|
||||
pci_lock_rescan_remove();
|
||||
pci_hp_remove_devices(bus);
|
||||
pci_unlock_rescan_remove();
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -665,8 +670,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
|
|||
if (rc)
|
||||
return rc;
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
|
||||
/* Restore PE */
|
||||
eeh_ops->configure_bridge(pe);
|
||||
eeh_pe_restore_bars(pe);
|
||||
|
|
@ -674,7 +677,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
|
|||
/* Clear frozen state */
|
||||
rc = eeh_clear_pe_frozen_state(pe, false);
|
||||
if (rc) {
|
||||
pci_unlock_rescan_remove();
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
|
@ -709,7 +711,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
|
|||
pe->tstamp = tstamp;
|
||||
pe->freeze_count = cnt;
|
||||
|
||||
pci_unlock_rescan_remove();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -843,10 +844,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|||
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
|
||||
int devices = 0;
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
|
||||
bus = eeh_pe_bus_get(pe);
|
||||
if (!bus) {
|
||||
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
|
||||
__func__, pe->phb->global_number, pe->addr);
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1094,10 +1098,15 @@ recover_failed:
|
|||
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
|
||||
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
pci_hp_remove_devices(bus);
|
||||
pci_unlock_rescan_remove();
|
||||
bus = eeh_pe_bus_get(pe);
|
||||
if (bus)
|
||||
pci_hp_remove_devices(bus);
|
||||
else
|
||||
pr_err("%s: PCI bus for PHB#%x-PE#%x disappeared\n",
|
||||
__func__, pe->phb->global_number, pe->addr);
|
||||
|
||||
/* The passed PE should no longer be used */
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1114,6 +1123,8 @@ out:
|
|||
eeh_clear_slot_attention(edev->pdev);
|
||||
|
||||
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
|
||||
|
||||
pci_unlock_rescan_remove();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1132,6 +1143,7 @@ void eeh_handle_special_event(void)
|
|||
unsigned long flags;
|
||||
int rc;
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
|
||||
do {
|
||||
rc = eeh_ops->next_error(&pe);
|
||||
|
|
@ -1171,10 +1183,12 @@ void eeh_handle_special_event(void)
|
|||
|
||||
break;
|
||||
case EEH_NEXT_ERR_NONE:
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
default:
|
||||
pr_warn("%s: Invalid value %d from next_error()\n",
|
||||
__func__, rc);
|
||||
pci_unlock_rescan_remove();
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1186,7 +1200,9 @@ void eeh_handle_special_event(void)
|
|||
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
|
||||
rc == EEH_NEXT_ERR_FENCED_PHB) {
|
||||
eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
|
||||
pci_unlock_rescan_remove();
|
||||
eeh_handle_normal_event(pe);
|
||||
pci_lock_rescan_remove();
|
||||
} else {
|
||||
eeh_for_each_pe(pe, tmp_pe)
|
||||
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
|
||||
|
|
@ -1199,7 +1215,6 @@ void eeh_handle_special_event(void)
|
|||
eeh_report_failure, NULL);
|
||||
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
|
||||
|
||||
pci_lock_rescan_remove();
|
||||
list_for_each_entry(hose, &hose_list, list_node) {
|
||||
phb_pe = eeh_phb_pe_get(hose);
|
||||
if (!phb_pe ||
|
||||
|
|
@ -1218,7 +1233,6 @@ void eeh_handle_special_event(void)
|
|||
}
|
||||
pci_hp_remove_devices(bus);
|
||||
}
|
||||
pci_unlock_rescan_remove();
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -1228,4 +1242,6 @@ void eeh_handle_special_event(void)
|
|||
if (rc == EEH_NEXT_ERR_DEAD_IOC)
|
||||
break;
|
||||
} while (rc != EEH_NEXT_ERR_NONE);
|
||||
|
||||
pci_unlock_rescan_remove();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -671,10 +671,12 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
|
|||
eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val);
|
||||
|
||||
/* Check link */
|
||||
if (!edev->pdev->link_active_reporting) {
|
||||
eeh_edev_dbg(edev, "No link reporting capability\n");
|
||||
msleep(1000);
|
||||
return;
|
||||
if (edev->pdev) {
|
||||
if (!edev->pdev->link_active_reporting) {
|
||||
eeh_edev_dbg(edev, "No link reporting capability\n");
|
||||
msleep(1000);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Wait the link is up until timeout (5s) */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue