Skip to content

Commit

Permalink
Fix PCI Link down issue during EEMI PMC reset (#8208) (#8222)
Browse files Browse the repository at this point in the history
* reset_fix

Signed-off-by: karthdmg <karthdmg@amd.com>

* defined new function xclmgmt_reset_device

Signed-off-by: karthdmg <karthdmg@amd.com>

* fix errors

Signed-off-by: karthdmg <karthdmg@amd.com>

---------

Signed-off-by: karthdmg <karthdmg@amd.com>
(cherry picked from commit f3169ab)
  • Loading branch information
karthdmg-xilinx authored Jun 5, 2024
1 parent 0214e00 commit ab6b3d9
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 20 deletions.
18 changes: 15 additions & 3 deletions src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,17 @@ static int xclmgmt_reset(xdev_handle_t xdev_hdl)
{
struct xclmgmt_dev *lro = (struct xclmgmt_dev *)xdev_hdl;

return xclmgmt_hot_reset(lro, true);
return xclmgmt_reset_device(lro, true);
}

/*
 * Reset the card using whichever mechanism the platform advertises.
 *
 * Platforms for which XOCL_DSA_EEMI_API_SRST() is true are reset through
 * xclmgmt_eemi_pmc_reset(); every other platform goes through
 * xclmgmt_hot_reset().
 *
 * @lro:   management device to reset
 * @force: forwarded to xclmgmt_hot_reset(); not used on the EEMI path,
 *         which takes no such argument
 *
 * Returns whatever the selected reset routine returns.
 */
long xclmgmt_reset_device(struct xclmgmt_dev *lro, bool force)
{
	if (!XOCL_DSA_EEMI_API_SRST(lro))
		return xclmgmt_hot_reset(lro, force);

	return xclmgmt_eemi_pmc_reset(lro);
}

struct xocl_pci_funcs xclmgmt_pci_ops = {
Expand Down Expand Up @@ -1414,12 +1424,14 @@ static void xclmgmt_work_cb(struct work_struct *work)

switch (_work->op) {
case XOCL_WORK_RESET:
ret = (int) xclmgmt_hot_reset(lro, false);
ret = (int) xclmgmt_reset_device(lro, false);

if (!ret)
xocl_drvinst_set_offline(lro, false);
break;
case XOCL_WORK_FORCE_RESET:
ret = (int) xclmgmt_hot_reset(lro, true);
ret = (int) xclmgmt_reset_device(lro, true);

if (!ret)
xocl_drvinst_set_offline(lro, false);
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,10 @@ void store_pcie_link_info(struct xclmgmt_dev *lro);
/* utils.c */
int pci_fundamental_reset(struct xclmgmt_dev *lro);


long xclmgmt_reset_device(struct xclmgmt_dev *lro, bool force);
long xclmgmt_hot_reset(struct xclmgmt_dev *lro, bool force);
long xclmgmt_eemi_pmc_reset(struct xclmgmt_dev *lro);
int xocl_wait_master_off(struct xclmgmt_dev *lro);
int xocl_set_master_on(struct xclmgmt_dev *lro);
void xocl_pci_save_config_all(struct xclmgmt_dev *lro);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,8 @@ static ssize_t mgmt_reset_store(struct device *dev,
*/
switch(val) {
case 1:
ret = (int) xclmgmt_hot_reset(lro, true);
ret = (int) xclmgmt_reset_device(lro, true);

if (ret < 0)
return ret;
break;
Expand Down
160 changes: 144 additions & 16 deletions src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,145 @@ static int xclmgmt_get_buddy_cb(struct device *dev, void *data)
return 0;
}

/*
 * Put back the root port control registers that were modified before the
 * reset request: Device Control always, Slot Control only when a valid
 * value was captured beforehand.
 */
static void xclmgmt_restore_root_port_ctrl(struct pci_dev *root, u16 devctl,
	bool slot_ctrl_saved, u16 slot_ctrl_orig)
{
	/* Restore the device control register info */
	pcie_capability_write_word(root, PCI_EXP_DEVCTL, devctl);

	/* Restore the slot control register info */
	if (slot_ctrl_saved)
		pcie_capability_write_word(root, PCI_EXP_SLTCTL, slot_ctrl_orig);
}

/**
 * Perform an EEMI based PMC reset.
 * This method uses the EEMI PMC based system reset API to perform a system
 * reset.
 *
 * Sequence: stop the health thread, take the subdevices offline, save PCI
 * config space, mask correctable error reporting and the hotplug interrupt
 * on the upstream port, ask VMR (over XGQ) to perform the reset, then
 * re-enable the device, restore config space, bring the subdevices back
 * online and verify that the firewall is clear.
 *
 * @lro: management device to reset
 *
 * Returns 0 on success, -ENODEV when the upstream port of the card cannot
 * be identified, the error from xocl_vmr_eemi_pmc_srst() when the reset
 * request fails, or -EIO when the firewall cannot be cleared afterwards.
 */
long xclmgmt_eemi_pmc_reset(struct xclmgmt_dev *lro)
{
	long err = 0;
	const char *ep_name;
	struct pci_dev *pdev = lro->pci_dev;
	struct pci_dev *root;
	struct xocl_board_private *dev_info = &lro->core.priv;
	int retry;
	u16 devctl;
	u16 slot_ctrl_orig = 0, slot_ctrl;
	bool slot_ctrl_saved = false;

	if (!pdev->bus || !pdev->bus->self) {
		mgmt_err(lro, "Unable to identify device root port for card %d",
			lro->instance);
		return -ENODEV;
	}
	root = pdev->bus->self;

	ep_name = pdev->bus->name;
	mgmt_info(lro, "Trying to reset card %d in slot %s:%02x:%1x",
		lro->instance, ep_name,
		PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

	xocl_thread_stop(lro);

	xocl_subdev_destroy_by_level(lro, XOCL_SUBDEV_LEVEL_URP);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_HWMON_SDM);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_UARTLITE);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_FLASH);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_ICAP);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_MAILBOX);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_AF);
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_AXIGATE);

	xocl_pci_save_config_all(lro);

	/* Mask the Correctable Error Reporting Enable bit.
	 * This is required to mask the correctable errors reported by device
	 * during system reset
	 */
	pcie_capability_read_word(root, PCI_EXP_DEVCTL, &devctl);
	pcie_capability_write_word(root, PCI_EXP_DEVCTL,
		(devctl & ~PCI_EXP_DEVCTL_CERE));

	/* Disable the Hotplug Interrupt bit to avoid device slot disable
	 * during system reset. An all-ones read is treated as invalid and
	 * the register is then left untouched.
	 */
	pcie_capability_read_word(root, PCI_EXP_SLTCTL, &slot_ctrl);
	if (slot_ctrl != (u16) ~0) {
		slot_ctrl_orig = slot_ctrl;
		slot_ctrl_saved = true;
		slot_ctrl &= ~(PCI_EXP_SLTCTL_HPIE);
		pcie_capability_write_word(root, PCI_EXP_SLTCTL, slot_ctrl);
	}

	/* Send XGQ command to VMR to perform EEMI PMC based system reset */
	err = xocl_vmr_eemi_pmc_srst(lro);
	if (err) {
		mgmt_err(lro, "EMMI PMC SRST Failed. err: %ld", err);
		/*
		 * No reset was started: do not leave error reporting and
		 * the hotplug interrupt masked on the upstream port.
		 */
		xclmgmt_restore_root_port_ctrl(root, devctl,
			slot_ctrl_saved, slot_ctrl_orig);
		return err;
	}

	/* The XGQ subdev is taken offline only now because it was needed
	 * to deliver the SRST XGQ command above.
	 */
	(void) xocl_subdev_offline_by_id(lro, XOCL_SUBDEV_XGQ_VMR);

	pci_disable_device(pdev);

	xclmgmt_restore_root_port_ctrl(root, devctl,
		slot_ctrl_saved, slot_ctrl_orig);

	msleep(100);
	if (pci_enable_device(pdev))
		mgmt_err(lro, "failed to enable pci device");
	xocl_wait_pci_status(pdev, 0, 0, 0);

	xocl_pci_restore_config_all(lro);

	xclmgmt_config_pci(lro);

	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_AF);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_MAILBOX);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_ICAP);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_FLASH);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_UARTLITE);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_XGQ_VMR);
	(void) xocl_subdev_online_by_id(lro, XOCL_SUBDEV_HWMON_SDM);

	/* Workaround for some DSAs. Flush axilite busses */
	if (dev_info->flags & XOCL_DSAFLAG_AXILITE_FLUSH)
		platform_axilite_flush(lro);

	/*
	 * Check firewall status. Status should be 0 (cleared)
	 * Otherwise issue message that a warm reboot is required.
	 *
	 * The loop leaves early as soon as the firewall reads clean, so
	 * retry only reaches XCLMGMT_RESET_MAX_RETRY when every check
	 * failed; a clean reading on the last attempt counts as success.
	 */
	msleep(20);
	for (retry = 0; retry < XCLMGMT_RESET_MAX_RETRY; retry++) {
		if (!xocl_af_check(lro, NULL))
			break;
		xocl_af_clear(lro);
		msleep(20);
	}

	if (retry >= XCLMGMT_RESET_MAX_RETRY) {
		mgmt_err(lro, "Board is not able to recover by PCI Hot reset. "
			"Please warm reboot");
		return -EIO;
	}

	(void) xocl_hwmon_sdm_get_sensors_list(lro, true);

	/* Workaround for some DSAs. Flush axilite busses */
	if (dev_info->flags & XOCL_DSAFLAG_AXILITE_FLUSH)
		platform_axilite_flush(lro);

	lro->reset_requested = false;
	xocl_thread_start(lro);

	xocl_clear_pci_errors(lro);
	store_pcie_link_info(lro);

	/*
	 * Update the userspace fdt with the current values in the mgmt driver
	 *
	 * For vmr supported versal devices, we enabled A/B boot, thus we should
	 * reload fdt from the right boot image instead of using unchanged fdt
	 * for other ALVEO devices.
	 */
	mutex_lock(&lro->busy_mutex);
	(void) xclmgmt_reload_fdt_blob_vmr(lro);
	(void) xclmgmt_update_userpf_blob(lro);
	mutex_unlock(&lro->busy_mutex);

	xclmgmt_connect_notify(lro, true);

	return 0;
}

/**
* Perform a PCIe secondary bus reset. Note: Use this method over pcie fundamental reset.
* This method is known to work better.
Expand Down Expand Up @@ -297,22 +436,11 @@ long xclmgmt_hot_reset(struct xclmgmt_dev *lro, bool force)

xocl_thread_stop(lro);

if (XOCL_DSA_EEMI_API_SRST(lro)) {
xocl_subdev_destroy_by_level(lro, XOCL_SUBDEV_LEVEL_URP);
err = xocl_vmr_eemi_pmc_srst(lro);
if (err) {
mgmt_err(lro, "EMMI PMC SRST Failed. err: %ld", err);
goto failed;
}
return 0;
}
else {
err = xocl_enable_vmr_boot(lro);
if (err) {
mgmt_err(lro, "enable reset failed");
err = -ENODEV;
goto failed;
}
err = xocl_enable_vmr_boot(lro);
if (err) {
mgmt_err(lro, "enable reset failed");
err = -ENODEV;
goto failed;
}

if (!force && xrt_reset_syncup) {
Expand Down

0 comments on commit ab6b3d9

Please sign in to comment.