Merge "kernel: Fix drivers panic during shutdown"

This commit is contained in:
Zuul 2023-09-06 19:09:34 +00:00 committed by Gerrit Code Review
commit 16c3057ed5
12 changed files with 553 additions and 0 deletions

View File

@ -0,0 +1,105 @@
From ce2502ef4019e7d00ccafdfbaa2bd8f4bf85c82b Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Tue, 4 Apr 2023 23:40:48 -0700
Subject: [PATCH] Fix the invalid check in iavf_remove()
If the netdev pointer is NULL, then iavf_remove() returns early to
ensure that it does not proceed with an already-freed netdev instance.
However, the drvdata field of the iavf driver's pci_dev structure continues
to keep the former value of the netdev pointer, and this value can be
acquired from the pci_dev structure via pci_get_drvdata(). This causes a
kernel panic when a forced reboot/shutdown is in progress due to the
following sequence of events:
- The iavf_shutdown() callback is called by the kernel. This function
detaches the device, brings it down if it was running and frees
resources.
- Later, the associated PF driver's shutdown callback is called:
ice_shutdown(). That callback calls, among others, sriov_disable(),
which then indirectly calls iavf_remove() again.
- A kernel WARNING is reported because the work adminq_task->func is NULL
in cancel_work_sync(&adapter->adminq_task) during iavf_remove(). The
reason is that the resources had already been freed during the first
run of iavf_remove().
"WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047
__flush_work.isra.0+0x6b/0x80"
The patch for iavf resolves this issue by checking the pci_dev
structure's is_busmaster field at the beginning of iavf_remove(). If the
PCI device had already been disabled by an earlier call to
iavf_shutdown() or iavf_remove(), via a call to pci_disable_device(),
then the is_busmaster field would be set to zero. Based on this logic,
if the is_busmaster field is set to zero, then the iavf_remove function
returns early. This in turn avoids the aforementioned kernel panic
caused by multiple calls to iavf_remove().
Reproducer:
1. Create container with VF on PF driven by ice.
2. Ensure that the VF is bound to iavf driver
3. Reboot -f
[ 341.561449] iavf 0000:51:05.2: Removing device
[ 341.730407] iavf 0000:51:05.1: Removing device
[ 341.924457] iavf 0000:51:05.0: Removing device
[ 347.130324] pci 0000:51:05.0: Removing from iommu group 161
[ 347.130367] ------------[ cut here ]------------
[ 347.130372] WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047 \
__flush_work.isra.0+0x6b/0x80
[ 347.130373] Modules linked in: ...
[ 347.130688] ...
[ 347.130958] CPU: 63 PID: 93678 Comm: reboot Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[ 347.130990] Hardware name: ...
[ 347.130995] RIP: 0010:__flush_work.isra.0+0x6b/0x80
...
[ 347.131076] Call Trace:
[ 347.131083] __cancel_work_timer+0xff/0x190
[ 347.131089] ? kernfs_put.part.0+0xd9/0x1a0
[ 347.131150] ? kmem_cache_free+0x3bd/0x410
[ 347.131158] iavf_remove+0x5e/0xe0 [iavf]
[ 347.131163] ? pci_device_remove+0x38/0xa0
[ 347.131167] ? __device_release_driver+0x17b/0x250
[ 347.131169] ? device_release_driver+0x24/0x30
[ 347.131172] ? pci_stop_bus_device+0x6c/0x90
[ 347.131174] ? pci_stop_and_remove_bus_device+0xe/0x20
[ 347.131179] ? pci_iov_remove_virtfn+0xc0/0x130
[ 347.131185] ? sriov_disable+0x34/0xe0
[ 347.131210] ? ice_free_vfs+0x77/0x350 [ice]
[ 347.131215] ? flow_indr_dev_unregister+0x243/0x250
[ 347.131226] ? ice_remove+0x3e5/0x430 [ice]
[ 347.131237] ? ice_shutdown+0x16/0x50 [ice]
[ 347.131240] ? pci_device_shutdown+0x31/0x60
[ 347.131243] ? device_shutdown+0x156/0x1b0
[ 347.131248] ? __do_sys_reboot.cold+0x2f/0x5b
[ 347.131251] ? vfs_writev+0xc5/0x160
[ 347.131254] ? get_max_files+0x20/0x20
[ 347.131258] ? sched_clock+0x5/0x10
[ 347.131264] ? get_vtime_delta+0xf/0xc0
[ 347.131267] ? vtime_user_exit+0x1c/0x70
[ 347.131272] ? do_syscall_64+0x30/0x40
[ 347.131276] ? entry_SYSCALL_64_after_hwframe+0x61/0xc6
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/iavf_main.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/iavf_main.c b/src/iavf_main.c
index 69cc5b3..ef3404c 100644
--- a/src/iavf_main.c
+++ b/src/iavf_main.c
@@ -4483,6 +4483,10 @@ static void iavf_remove(struct pci_dev *pdev)
struct iavf_mac_filter *f, *ftmp;
struct iavf_hw *hw = &adapter->hw;
int err;
+
+ /* Don't proceed with remove if pci device is already disabled */
+ if(pdev->is_busmaster == 0)
+ return;
/* Indicate we are in remove and not to run/schedule any driver tasks */
set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section);
cancel_delayed_work_sync(&adapter->client_task);
--
2.40.0

View File

@ -1,2 +1,3 @@
Fix-build-issues.patch
iavf_main-Use-irq_update_affinity_hint.patch
Fix-the-invalid-check.patch

View File

@ -0,0 +1,104 @@
From af651b6e67d2b5b4d779b0f8cfab6fa9db811a1b Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Tue, 4 Apr 2023 23:40:48 -0700
Subject: [PATCH] Fix the invalid check in iavf_remove()
If the netdev pointer is NULL, then iavf_remove() returns early to
ensure that it does not proceed with an already-freed netdev instance.
However, the drvdata field of the iavf driver's pci_dev structure continues
to keep the former value of the netdev pointer, and this value can be
acquired from the pci_dev structure via pci_get_drvdata(). This causes a
kernel panic when a forced reboot/shutdown is in progress due to the
following sequence of events:
- The iavf_shutdown() callback is called by the kernel. This function
detaches the device, brings it down if it was running and frees
resources.
- Later, the associated PF driver's shutdown callback is called:
ice_shutdown(). That callback calls, among others, sriov_disable(),
which then indirectly calls iavf_remove() again.
- A kernel WARNING is reported because the work adminq_task->func is NULL
in cancel_work_sync(&adapter->adminq_task) during iavf_remove(). The
reason is that the resources had already been freed during the first
run of iavf_remove().
"WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047
__flush_work.isra.0+0x6b/0x80"
The patch for iavf resolves this issue by checking the pci_dev
structure's is_busmaster field at the beginning of iavf_remove(). If the
PCI device had already been disabled by an earlier call to
iavf_shutdown() or iavf_remove(), via a call to pci_disable_device(),
then the is_busmaster field would be set to zero. Based on this logic,
if the is_busmaster field is set to zero, then the iavf_remove function
returns early. This in turn avoids the aforementioned kernel panic
caused by multiple calls to iavf_remove().
Reproducer:
1. Create container with VF on PF driven by ice.
2. Ensure that the VF is bound to iavf driver
3. Reboot -f
[ 341.561449] iavf 0000:51:05.2: Removing device
[ 341.730407] iavf 0000:51:05.1: Removing device
[ 341.924457] iavf 0000:51:05.0: Removing device
[ 347.130324] pci 0000:51:05.0: Removing from iommu group 161
[ 347.130367] ------------[ cut here ]------------
[ 347.130372] WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047 \
__flush_work.isra.0+0x6b/0x80
[ 347.130373] Modules linked in: ...
[ 347.130688] ...
[ 347.130958] CPU: 63 PID: 93678 Comm: reboot Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[ 347.130990] Hardware name: ...
[ 347.130995] RIP: 0010:__flush_work.isra.0+0x6b/0x80
...
[ 347.131076] Call Trace:
[ 347.131083] __cancel_work_timer+0xff/0x190
[ 347.131089] ? kernfs_put.part.0+0xd9/0x1a0
[ 347.131150] ? kmem_cache_free+0x3bd/0x410
[ 347.131158] iavf_remove+0x5e/0xe0 [iavf]
[ 347.131163] ? pci_device_remove+0x38/0xa0
[ 347.131167] ? __device_release_driver+0x17b/0x250
[ 347.131169] ? device_release_driver+0x24/0x30
[ 347.131172] ? pci_stop_bus_device+0x6c/0x90
[ 347.131174] ? pci_stop_and_remove_bus_device+0xe/0x20
[ 347.131179] ? pci_iov_remove_virtfn+0xc0/0x130
[ 347.131185] ? sriov_disable+0x34/0xe0
[ 347.131210] ? ice_free_vfs+0x77/0x350 [ice]
[ 347.131215] ? flow_indr_dev_unregister+0x243/0x250
[ 347.131226] ? ice_remove+0x3e5/0x430 [ice]
[ 347.131237] ? ice_shutdown+0x16/0x50 [ice]
[ 347.131240] ? pci_device_shutdown+0x31/0x60
[ 347.131243] ? device_shutdown+0x156/0x1b0
[ 347.131248] ? __do_sys_reboot.cold+0x2f/0x5b
[ 347.131251] ? vfs_writev+0xc5/0x160
[ 347.131254] ? get_max_files+0x20/0x20
[ 347.131258] ? sched_clock+0x5/0x10
[ 347.131264] ? get_vtime_delta+0xf/0xc0
[ 347.131267] ? vtime_user_exit+0x1c/0x70
[ 347.131272] ? do_syscall_64+0x30/0x40
[ 347.131276] ? entry_SYSCALL_64_after_hwframe+0x61/0xc6
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/iavf_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/iavf_main.c b/src/iavf_main.c
index 14d2051..68cd002 100644
--- a/src/iavf_main.c
+++ b/src/iavf_main.c
@@ -5801,6 +5801,9 @@ static void iavf_remove(struct pci_dev *pdev)
/* Don't proceed with remove if netdev is already freed */
if (!netdev)
return;
+ /* Don't proceed with remove if pci device is already disabled */
+ if(pdev->is_busmaster == 0)
+ return;
/* Indicate we are in remove and not to run/schedule any driver tasks */
if (test_and_set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section))
--
2.40.0

View File

@ -1,2 +1,3 @@
iavf_main-Use-irq_update_affinity_hint.patch
0001-intel-iavf-pass-linux-common-header-to-check_aux_bus.patch
0001-Fix-the-invalid-check.patch

View File

@ -0,0 +1,104 @@
From 1c303402cf0fec469097fab8a4b898703c596831 Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Tue, 4 Apr 2023 23:40:48 -0700
Subject: [PATCH] Fix the invalid check in iavf_remove()
If the netdev pointer is NULL, then iavf_remove() returns early to
ensure that it does not proceed with an already-freed netdev instance.
However, the drvdata field of the iavf driver's pci_dev structure continues
to keep the former value of the netdev pointer, and this value can be
acquired from the pci_dev structure via pci_get_drvdata(). This causes a
kernel panic when a forced reboot/shutdown is in progress due to the
following sequence of events:
- The iavf_shutdown() callback is called by the kernel. This function
detaches the device, brings it down if it was running and frees
resources.
- Later, the associated PF driver's shutdown callback is called:
ice_shutdown(). That callback calls, among others, sriov_disable(),
which then indirectly calls iavf_remove() again.
- A kernel WARNING is reported because the work adminq_task->func is NULL
in cancel_work_sync(&adapter->adminq_task) during iavf_remove(). The
reason is that the resources had already been freed during the first
run of iavf_remove().
"WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047
__flush_work.isra.0+0x6b/0x80"
The patch for iavf resolves this issue by checking the pci_dev
structure's is_busmaster field at the beginning of iavf_remove(). If the
PCI device had already been disabled by an earlier call to
iavf_shutdown() or iavf_remove(), via a call to pci_disable_device(),
then the is_busmaster field would be set to zero. Based on this logic,
if the is_busmaster field is set to zero, then the iavf_remove function
returns early. This in turn avoids the aforementioned kernel panic
caused by multiple calls to iavf_remove().
Reproducer:
1. Create container with VF on PF driven by ice.
2. Ensure that the VF is bound to iavf driver
3. Reboot -f
[ 341.561449] iavf 0000:51:05.2: Removing device
[ 341.730407] iavf 0000:51:05.1: Removing device
[ 341.924457] iavf 0000:51:05.0: Removing device
[ 347.130324] pci 0000:51:05.0: Removing from iommu group 161
[ 347.130367] ------------[ cut here ]------------
[ 347.130372] WARNING: CPU: 63 PID: 93678 at kernel/workqueue.c:3047 \
__flush_work.isra.0+0x6b/0x80
[ 347.130373] Modules linked in: ...
[ 347.130688] ...
[ 347.130958] CPU: 63 PID: 93678 Comm: reboot Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[ 347.130990] Hardware name: ...
[ 347.130995] RIP: 0010:__flush_work.isra.0+0x6b/0x80
...
[ 347.131076] Call Trace:
[ 347.131083] __cancel_work_timer+0xff/0x190
[ 347.131089] ? kernfs_put.part.0+0xd9/0x1a0
[ 347.131150] ? kmem_cache_free+0x3bd/0x410
[ 347.131158] iavf_remove+0x5e/0xe0 [iavf]
[ 347.131163] ? pci_device_remove+0x38/0xa0
[ 347.131167] ? __device_release_driver+0x17b/0x250
[ 347.131169] ? device_release_driver+0x24/0x30
[ 347.131172] ? pci_stop_bus_device+0x6c/0x90
[ 347.131174] ? pci_stop_and_remove_bus_device+0xe/0x20
[ 347.131179] ? pci_iov_remove_virtfn+0xc0/0x130
[ 347.131185] ? sriov_disable+0x34/0xe0
[ 347.131210] ? ice_free_vfs+0x77/0x350 [ice]
[ 347.131215] ? flow_indr_dev_unregister+0x243/0x250
[ 347.131226] ? ice_remove+0x3e5/0x430 [ice]
[ 347.131237] ? ice_shutdown+0x16/0x50 [ice]
[ 347.131240] ? pci_device_shutdown+0x31/0x60
[ 347.131243] ? device_shutdown+0x156/0x1b0
[ 347.131248] ? __do_sys_reboot.cold+0x2f/0x5b
[ 347.131251] ? vfs_writev+0xc5/0x160
[ 347.131254] ? get_max_files+0x20/0x20
[ 347.131258] ? sched_clock+0x5/0x10
[ 347.131264] ? get_vtime_delta+0xf/0xc0
[ 347.131267] ? vtime_user_exit+0x1c/0x70
[ 347.131272] ? do_syscall_64+0x30/0x40
[ 347.131276] ? entry_SYSCALL_64_after_hwframe+0x61/0xc6
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/iavf_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/iavf_main.c b/src/iavf_main.c
index 779e752..54d697c 100644
--- a/src/iavf_main.c
+++ b/src/iavf_main.c
@@ -5727,6 +5727,9 @@ static void iavf_remove(struct pci_dev *pdev)
struct iavf_mac_filter *f, *ftmp;
struct iavf_hw *hw = &adapter->hw;
+ /* Don't proceed with remove if pci device is already disabled */
+ if(pdev->is_busmaster == 0)
+ return;
/* Indicate we are in remove and not to run/schedule any driver tasks */
set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section);
cancel_work_sync(&adapter->adminq_task);
--
2.40.0

View File

@ -1,2 +1,3 @@
iavf_main-Use-irq_update_affinity_hint.patch
0001-intel-iavf-pass-linux-common-header-to-check_aux_bus.patch
0001-Fix-the-invalid-check.patch

View File

@ -0,0 +1,78 @@
From 302d470b0559ecaf0c5392035f2054ae5e3ab53a Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 10 Apr 2023 01:22:57 -0700
Subject: [PATCH] Disable irq_msix_misc before ptp release
The PTP resources are released before the irq_msix_misc interrupt is
disabled in the ice_remove() function when the system reboots, but
the interrupt handler ice_misc_intr() is still using these
resources, which causes the panic to happen.
The patch fixes this by calling ice_free_irq_msix_misc() before
ice_ptp_release().
[48357.039676] pci 0000:51:06.6: Removing from iommu group 175
[48357.039750] pci 0000:51:06.7: Removing from iommu group 176
[48359.332713] ice 0000:51:00.0: removed Clock from enp81s0f0
[48359.492523] BUG: kernel NULL pointer dereference, address: 0000000000000004
[48359.499484] #PF: supervisor write access in kernel mode
[48359.504711] #PF: error_code(0x0002) - not-present page
[48359.509850] PGD 213439067 P4D 0
[48359.513084] Oops: 0002 [#1] PREEMPT SMP NOPTI
[48359.517442] CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[48359.528821] Hardware name: ...
[48359.536303] RIP: 0010:_raw_spin_lock_irqsave+0x19/0x40
[48359.541441] Code: ...
...
[48359.641473] <IRQ>
[48359.643494] kthread_queue_work+0x22/0x70
[48359.647527] ice_misc_intr+0x237/0x2c0 [ice]
[48359.651796] ? __handle_irq_event_percpu+0x3d/0x190
[48359.656680] ? handle_irq_event+0x58/0xb0
[48359.660700] ? handle_edge_irq+0x93/0x240
[48359.664721] ? asm_call_irq_on_stack+0xf/0x20
[48359.669076] </IRQ>
[48359.671186] ? common_interrupt+0xb3/0x130
[48359.675285] ? asm_common_interrupt+0x1e/0x40
[48359.679645] ? cpuidle_enter_state+0xca/0x350
[48359.684002] ? cpuidle_enter+0x29/0x40
[48359.687753] ? do_idle+0x1ec/0x2a0
[48359.691160] ? cpu_startup_entry+0x19/0x20
[48359.695260] ? start_kernel+0x54c/0x571
[48359.699099] ? secondary_startup_64_no_verify+0xc2/0xcb
Reproducer:
1. Create container with sts-silicom application (PTP-related application).
2. Ensure the sts-silicom pod is in Running status.
3. Reboot -f
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/ice_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/ice_main.c b/src/ice_main.c
index f43c2e2..8feba46 100644
--- a/src/ice_main.c
+++ b/src/ice_main.c
@@ -6083,6 +6083,7 @@ static void ice_remove(struct pci_dev *pdev)
#ifdef HAVE_NETDEV_UPPER_INFO
ice_deinit_lag(pf);
#endif /* HAVE_NETDEV_UPPER_INFO */
+ ice_free_irq_msix_misc(pf);
if (test_bit(ICE_FLAG_PTP_ENA, pf->flags))
ice_ptp_release(pf);
if (!ice_is_safe_mode(pf))
@@ -6097,7 +6098,6 @@ static void ice_remove(struct pci_dev *pdev)
devm_kfree(&pdev->dev, pf->peers);
}
ice_set_wake(pf);
- ice_free_irq_msix_misc(pf);
ice_for_each_vsi(pf, i) {
if (!pf->vsi[i])
continue;
--
2.40.0

View File

@ -1,3 +1,4 @@
0001-ice_xsk-Avoid-dependency-on-napi_busy_loop-with-PREE.patch
0002-ice_main-ice_lib-Use-irq_update_affinity_hint.patch
0003-ddp-change-ddp-file-name-for-legacy-driver.patch
0004-Disable-irq_msix_misc-before-ptp-release.patch

View File

@ -0,0 +1,78 @@
From f89a25cb7c8e25aaf59567c81e9af993872a0e6c Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 10 Apr 2023 01:22:57 -0700
Subject: [PATCH] Disable irq_msix_misc before ptp release
The PTP resources are released before the irq_msix_misc interrupt is
disabled in the ice_remove() function when the system reboots, but
the interrupt handler ice_misc_intr() is still using these
resources, which causes the panic to happen.
The patch fixes this by calling ice_free_irq_msix_misc() before
ice_ptp_release().
[48357.039676] pci 0000:51:06.6: Removing from iommu group 175
[48357.039750] pci 0000:51:06.7: Removing from iommu group 176
[48359.332713] ice 0000:51:00.0: removed Clock from enp81s0f0
[48359.492523] BUG: kernel NULL pointer dereference, address: 0000000000000004
[48359.499484] #PF: supervisor write access in kernel mode
[48359.504711] #PF: error_code(0x0002) - not-present page
[48359.509850] PGD 213439067 P4D 0
[48359.513084] Oops: 0002 [#1] PREEMPT SMP NOPTI
[48359.517442] CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[48359.528821] Hardware name: ...
[48359.536303] RIP: 0010:_raw_spin_lock_irqsave+0x19/0x40
[48359.541441] Code: ...
...
[48359.641473] <IRQ>
[48359.643494] kthread_queue_work+0x22/0x70
[48359.647527] ice_misc_intr+0x237/0x2c0 [ice]
[48359.651796] ? __handle_irq_event_percpu+0x3d/0x190
[48359.656680] ? handle_irq_event+0x58/0xb0
[48359.660700] ? handle_edge_irq+0x93/0x240
[48359.664721] ? asm_call_irq_on_stack+0xf/0x20
[48359.669076] </IRQ>
[48359.671186] ? common_interrupt+0xb3/0x130
[48359.675285] ? asm_common_interrupt+0x1e/0x40
[48359.679645] ? cpuidle_enter_state+0xca/0x350
[48359.684002] ? cpuidle_enter+0x29/0x40
[48359.687753] ? do_idle+0x1ec/0x2a0
[48359.691160] ? cpu_startup_entry+0x19/0x20
[48359.695260] ? start_kernel+0x54c/0x571
[48359.699099] ? secondary_startup_64_no_verify+0xc2/0xcb
Reproducer:
1. Create container with sts-silicom application (PTP-related application).
2. Ensure the sts-silicom pod is in Running status.
3. Reboot -f
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/ice_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/ice_main.c b/src/ice_main.c
index 9df9e85..b9ae8dc 100644
--- a/src/ice_main.c
+++ b/src/ice_main.c
@@ -6589,6 +6589,7 @@ static void ice_remove(struct pci_dev *pdev)
#ifdef HAVE_NETDEV_UPPER_INFO
ice_deinit_lag(pf);
#endif /* HAVE_NETDEV_UPPER_INFO */
+ ice_free_irq_msix_misc(pf);
if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags))
ice_ptp_release(pf);
if (ice_is_feature_supported(pf, ICE_F_GNSS))
@@ -6598,7 +6599,6 @@ static void ice_remove(struct pci_dev *pdev)
ice_setup_mc_magic_wake(pf);
ice_vsi_release_all(pf);
ice_set_wake(pf);
- ice_free_irq_msix_misc(pf);
ice_for_each_vsi(pf, i) {
if (!pf->vsi[i])
continue;
--
2.40.0

View File

@ -1,3 +1,4 @@
0001-ice_main-ice_lib-Use-irq_update_affinity_hint.patch
0002-intel-ice-pass-linux-common-header-to-check_aux_bus.patch
0003-rename-the-ddp-file-to-avoid-conflict.patch
0004-Disable-irq_msix_misc-before-ptp-release.patch

View File

@ -0,0 +1,78 @@
From f89a25cb7c8e25aaf59567c81e9af993872a0e6c Mon Sep 17 00:00:00 2001
From: Jiping Ma <jiping.ma2@windriver.com>
Date: Mon, 10 Apr 2023 01:22:57 -0700
Subject: [PATCH] Disable irq_msix_misc before ptp release
The PTP resources are released before the irq_msix_misc interrupt is
disabled in the ice_remove() function when the system reboots, but
the interrupt handler ice_misc_intr() is still using these
resources, which causes the panic to happen.
The patch fixes this by calling ice_free_irq_msix_misc() before
ice_ptp_release().
[48357.039676] pci 0000:51:06.6: Removing from iommu group 175
[48357.039750] pci 0000:51:06.7: Removing from iommu group 176
[48359.332713] ice 0000:51:00.0: removed Clock from enp81s0f0
[48359.492523] BUG: kernel NULL pointer dereference, address: 0000000000000004
[48359.499484] #PF: supervisor write access in kernel mode
[48359.504711] #PF: error_code(0x0002) - not-present page
[48359.509850] PGD 213439067 P4D 0
[48359.513084] Oops: 0002 [#1] PREEMPT SMP NOPTI
[48359.517442] CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded \
Tainted: G S O \
5.10.0-6-amd64 #1 Debian 5.10.162-1.stx.64
[48359.528821] Hardware name: ...
[48359.536303] RIP: 0010:_raw_spin_lock_irqsave+0x19/0x40
[48359.541441] Code: ...
...
[48359.641473] <IRQ>
[48359.643494] kthread_queue_work+0x22/0x70
[48359.647527] ice_misc_intr+0x237/0x2c0 [ice]
[48359.651796] ? __handle_irq_event_percpu+0x3d/0x190
[48359.656680] ? handle_irq_event+0x58/0xb0
[48359.660700] ? handle_edge_irq+0x93/0x240
[48359.664721] ? asm_call_irq_on_stack+0xf/0x20
[48359.669076] </IRQ>
[48359.671186] ? common_interrupt+0xb3/0x130
[48359.675285] ? asm_common_interrupt+0x1e/0x40
[48359.679645] ? cpuidle_enter_state+0xca/0x350
[48359.684002] ? cpuidle_enter+0x29/0x40
[48359.687753] ? do_idle+0x1ec/0x2a0
[48359.691160] ? cpu_startup_entry+0x19/0x20
[48359.695260] ? start_kernel+0x54c/0x571
[48359.699099] ? secondary_startup_64_no_verify+0xc2/0xcb
Reproducer:
1. Create container with sts-silicom application (PTP-related application).
2. Ensure the sts-silicom pod is in Running status.
3. Reboot -f
Signed-off-by: Jiping Ma <Jiping.ma2@windriver.com>
---
src/ice_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/ice_main.c b/src/ice_main.c
index 9df9e85..b9ae8dc 100644
--- a/src/ice_main.c
+++ b/src/ice_main.c
@@ -6589,6 +6589,7 @@ static void ice_remove(struct pci_dev *pdev)
#ifdef HAVE_NETDEV_UPPER_INFO
ice_deinit_lag(pf);
#endif /* HAVE_NETDEV_UPPER_INFO */
+ ice_free_irq_msix_misc(pf);
if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags))
ice_ptp_release(pf);
if (ice_is_feature_supported(pf, ICE_F_GNSS))
@@ -6598,7 +6599,6 @@ static void ice_remove(struct pci_dev *pdev)
ice_setup_mc_magic_wake(pf);
ice_vsi_release_all(pf);
ice_set_wake(pf);
- ice_free_irq_msix_misc(pf);
ice_for_each_vsi(pf, i) {
if (!pf->vsi[i])
continue;
--
2.40.0

View File

@ -1,3 +1,4 @@
0001-ice_main-ice_lib-Use-irq_update_affinity_hint.patch
0001-intel-ice-pass-linux-common-header-to-check_aux_bus.patch
0001-rename-the-ddp-file-to-avoid-conflict.patch
0001-Disable-irq_msix_misc-before-ptp-release.patch