Skip to content

Commit 86519e4

Browse files
committed
irqchip/gicv3: Workaround for NVIDIA erratum T241-FABRIC-4
jira LE-1907 Rebuild_History Non-Buildable kernel-5.14.0-284.30.1.el9_2 commit-author Shanker Donthineni <[email protected]> commit 35727af The T241 platform suffers from the T241-FABRIC-4 erratum which causes unexpected behavior in the GIC when multiple transactions are received simultaneously from different sources. This hardware issue impacts NVIDIA server platforms that use more than two T241 chips interconnected. Each chip has support for 320 {E}SPIs. This issue occurs when multiple packets from different GICs are incorrectly interleaved at the target chip. The erratum text below specifies exactly what can cause multiple transfer packets susceptible to interleaving and GIC state corruption. GIC state corruption can lead to a range of problems, including kernel panics, and unexpected behavior. From the erratum text: "In some cases, inter-socket AXI4 Stream packets with multiple transfers, may be interleaved by the fabric when presented to ARM Generic Interrupt Controller. GIC expects all transfers of a packet to be delivered without any interleaving. The following GICv3 commands may result in multiple transfer packets over inter-socket AXI4 Stream interface: - Register reads from GICD_I* and GICD_N* - Register writes to 64-bit GICD registers other than GICD_IROUTERn* - ITS command MOVALL Multiple commands in GICv4+ utilize multiple transfer packets, including VMOVP, VMOVI, VMAPP, and 64-bit register accesses." This issue impacts system configurations with more than 2 sockets, that require multi-transfer packets to be sent over inter-socket AXI4 Stream interface between GIC instances on different sockets. GICv4 cannot be supported. GICv3 SW model can only be supported with the workaround. Single and Dual socket configurations are not impacted by this issue and support GICv3 and GICv4." 
Link: https://developer.nvidia.com/docs/t241-fabric-4/nvidia-t241-fabric-4-errata.pdf Writing to the chip alias region of the GICD_In{E} registers except GICD_ICENABLERn has an equivalent effect as writing to the global distributor. The SPI interrupt deactivate path is not impacted by the erratum. To fix this problem, implement a workaround that ensures read accesses to the GICD_In{E} registers are directed to the chip that owns the SPI, and disable GICv4.x features. To simplify code changes, the gic_configure_irq() function uses the same alias region for both read and write operations to GICD_ICFGR. Co-developed-by: Vikram Sethi <[email protected]> Signed-off-by: Vikram Sethi <[email protected]> Signed-off-by: Shanker Donthineni <[email protected]> Acked-by: Sudeep Holla <[email protected]> (for SMCCC/SOC ID bits) Signed-off-by: Marc Zyngier <[email protected]> Link: https://lore.kernel.org/r/[email protected] (cherry picked from commit 35727af) Signed-off-by: Jonathan Maple <[email protected]>
1 parent 6d32e46 commit 86519e4

File tree

6 files changed

+156
-34
lines changed

6 files changed

+156
-34
lines changed

Documentation/arm64/silicon-errata.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ stable kernels.
164164
+----------------+-----------------+-----------------+-----------------------------+
165165
| NVIDIA | Carmel Core | N/A | NVIDIA_CARMEL_CNP_ERRATUM |
166166
+----------------+-----------------+-----------------+-----------------------------+
167+
| NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A |
168+
+----------------+-----------------+-----------------+-----------------------------+
167169
+----------------+-----------------+-----------------+-----------------------------+
168170
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
169171
+----------------+-----------------+-----------------+-----------------------------+

drivers/firmware/smccc/smccc.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,32 @@ static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
1717

1818
/* Result of smccc_probe_trng(), latched by arm_smccc_version_init(). */
bool __ro_after_init smccc_trng_available = false;
1919
/* Set when CONFIG_ARM64_SVE is enabled and the SMCCC version is at least 1.3. */
u64 __ro_after_init smccc_has_sve_hint = false;
20+
/*
 * Cached ARM_SMCCC_ARCH_SOC_ID(0) result; stays SMCCC_RET_NOT_SUPPORTED
 * unless arm_smccc_version_init() manages to query it.
 */
s32 __ro_after_init smccc_soc_id_version = SMCCC_RET_NOT_SUPPORTED;
21+
/*
 * Cached ARM_SMCCC_ARCH_SOC_ID(1) result; stays SMCCC_RET_NOT_SUPPORTED
 * unless arm_smccc_version_init() manages to query it.
 */
s32 __ro_after_init smccc_soc_id_revision = SMCCC_RET_NOT_SUPPORTED;
2022

2123
/*
 * Record the SMCCC version and conduit handed in by the caller and derive
 * the state that depends on them: TRNG availability, the SVE hint and the
 * cached ARCH_SOC_ID version/revision values.
 */
void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
{
	struct arm_smccc_res soc_res;

	smccc_version = version;
	smccc_conduit = conduit;

	smccc_trng_available = smccc_probe_trng();

	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
	    smccc_version >= ARM_SMCCC_VERSION_1_3)
		smccc_has_sve_hint = true;

	/* ARCH_SOC_ID is only queried on SMCCC v1.2+ with a valid conduit. */
	if (smccc_version < ARM_SMCCC_VERSION_1_2 ||
	    smccc_conduit == SMCCC_CONDUIT_NONE)
		return;

	/* Ask ARCH_FEATURES whether ARCH_SOC_ID is implemented. */
	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
			     ARM_SMCCC_ARCH_SOC_ID, &soc_res);
	if ((s32)soc_res.a0 < 0)
		return;

	/*
	 * Cache both results as-is; callers of the accessors are expected to
	 * check for negative (error) values themselves.
	 */
	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &soc_res);
	smccc_soc_id_version = (s32)soc_res.a0;

	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 1, &soc_res);
	smccc_soc_id_revision = (s32)soc_res.a0;
}
3147

3248
enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void)
@@ -44,6 +60,16 @@ u32 arm_smccc_get_version(void)
4460
}
4561
EXPORT_SYMBOL_GPL(arm_smccc_get_version);
4662

63+
/*
 * Return the cached SoC ID version held in smccc_soc_id_version. The value
 * is returned unchanged, so it may be negative (an SMCCC error code).
 */
s32 arm_smccc_get_soc_id_version(void)
64+
{
65+
return smccc_soc_id_version;
66+
}
67+
68+
/*
 * Return the cached SoC ID revision held in smccc_soc_id_revision. The value
 * is returned unchanged, so it may be negative (an SMCCC error code).
 */
s32 arm_smccc_get_soc_id_revision(void)
69+
{
70+
return smccc_soc_id_revision;
71+
}
72+
4773
static int __init smccc_devices_init(void)
4874
{
4975
struct platform_device *pdev;

drivers/firmware/smccc/soc_id.c

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -42,41 +42,23 @@ static int __init smccc_soc_init(void)
4242
if (arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2)
4343
return 0;
4444

45-
if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE) {
46-
pr_err("%s: invalid SMCCC conduit\n", __func__);
47-
return -EOPNOTSUPP;
48-
}
49-
50-
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
51-
ARM_SMCCC_ARCH_SOC_ID, &res);
52-
53-
if ((int)res.a0 == SMCCC_RET_NOT_SUPPORTED) {
45+
soc_id_version = arm_smccc_get_soc_id_version();
46+
if (soc_id_version == SMCCC_RET_NOT_SUPPORTED) {
5447
pr_info("ARCH_SOC_ID not implemented, skipping ....\n");
5548
return 0;
5649
}
5750

58-
if ((int)res.a0 < 0) {
59-
pr_info("ARCH_FEATURES(ARCH_SOC_ID) returned error: %lx\n",
60-
res.a0);
61-
return -EINVAL;
62-
}
63-
64-
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &res);
65-
if ((int)res.a0 < 0) {
51+
if (soc_id_version < 0) {
6652
pr_err("ARCH_SOC_ID(0) returned error: %lx\n", res.a0);
6753
return -EINVAL;
6854
}
6955

70-
soc_id_version = res.a0;
71-
72-
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 1, &res);
73-
if ((int)res.a0 < 0) {
56+
soc_id_rev = arm_smccc_get_soc_id_revision();
57+
if (soc_id_rev < 0) {
7458
pr_err("ARCH_SOC_ID(1) returned error: %lx\n", res.a0);
7559
return -EINVAL;
7660
}
7761

78-
soc_id_rev = res.a0;
79-
8062
soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL);
8163
if (!soc_dev_attr)
8264
return -ENOMEM;

drivers/irqchip/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ config ARM_GIC_V3
3535
select IRQ_DOMAIN_HIERARCHY
3636
select PARTITION_PERCPU
3737
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
38+
select HAVE_ARM_SMCCC_DISCOVERY
3839

3940
config ARM_GIC_V3_ITS
4041
bool

drivers/irqchip/irq-gic-v3.c

Lines changed: 104 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
#include <linux/irqchip/arm-gic-common.h>
2424
#include <linux/irqchip/arm-gic-v3.h>
2525
#include <linux/irqchip/irq-partition-percpu.h>
26+
#include <linux/bitfield.h>
27+
#include <linux/bits.h>
28+
#include <linux/arm-smccc.h>
2629

2730
#include <asm/cputype.h>
2831
#include <asm/exception.h>
@@ -46,6 +49,7 @@ struct redist_region {
4649

4750
struct gic_chip_data {
4851
struct fwnode_handle *fwnode;
52+
phys_addr_t dist_phys_base;
4953
void __iomem *dist_base;
5054
struct redist_region *redist_regions;
5155
struct rdists rdists;
@@ -58,6 +62,10 @@ struct gic_chip_data {
5862
struct partition_desc **ppi_descs;
5963
};
6064

65+
/* Number of per-chip GICD alias slots kept for the T241-FABRIC-4 workaround. */
#define T241_CHIPS_MAX 4
66+
/* Per-chip GICD alias mappings, indexed by chip number; set up by gic_enable_quirk_nvidia_t241(). */
static void __iomem *t241_dist_base_alias[T241_CHIPS_MAX] __read_mostly;
67+
/* Enabled by gic_enable_quirk_nvidia_t241() once the T241-FABRIC-4 workaround is in effect. */
static DEFINE_STATIC_KEY_FALSE(gic_nvidia_t241_erratum);
68+
6169
static struct gic_chip_data gic_data __read_mostly;
6270
static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
6371

@@ -187,6 +195,39 @@ static inline bool gic_irq_in_rdist(struct irq_data *d)
187195
}
188196
}
189197

198+
/*
 * Select the distributor base used to access the GICD_In{E} registers for
 * the interrupt described by @d.
 *
 * Normally this is the global distributor mapping. When the T241-FABRIC-4
 * workaround is active, read accesses to the GICD_In{E} registers have to
 * be directed to the chip that owns the SPI, so the per-chip alias region
 * is returned instead. The alias region can also be used for writes to the
 * GICD_In{E} registers, except GICD_ICENABLERn.
 *
 * Each chip supports 320 {E}SPIs. Mappings for all 4 chips:
 *    Chip0 = 32-351
 *    Chip1 = 352-671
 *    Chip2 = 672-991
 *    Chip3 = 4096-4415
 */
static inline void __iomem *gic_dist_base_alias(struct irq_data *d)
{
	irq_hw_number_t intid;
	u32 owner;

	if (!static_branch_unlikely(&gic_nvidia_t241_erratum))
		return gic_data.dist_base;

	intid = irqd_to_hwirq(d);

	switch (__get_intid_range(intid)) {
	case SPI_RANGE:
		/* SPIs start at INTID 32 and are split 320 per chip. */
		owner = (intid - 32) / 320;
		break;
	case ESPI_RANGE:
		/* All extended SPIs belong to the last chip. */
		owner = 3;
		break;
	default:
		unreachable();
	}

	return t241_dist_base_alias[owner];
}
230+
190231
static inline void __iomem *gic_dist_base(struct irq_data *d)
191232
{
192233
switch (get_intid_range(d)) {
@@ -345,7 +386,7 @@ static int gic_peek_irq(struct irq_data *d, u32 offset)
345386
if (gic_irq_in_rdist(d))
346387
base = gic_data_rdist_sgi_base();
347388
else
348-
base = gic_data.dist_base;
389+
base = gic_dist_base_alias(d);
349390

350391
return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask);
351392
}
@@ -596,7 +637,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
596637
if (gic_irq_in_rdist(d))
597638
base = gic_data_rdist_sgi_base();
598639
else
599-
base = gic_data.dist_base;
640+
base = gic_dist_base_alias(d);
600641

601642
offset = convert_offset_index(d, GICD_ICFGR, &index);
602643

@@ -1718,6 +1759,43 @@ static bool gic_enable_quirk_hip06_07(void *data)
17181759
return false;
17191760
}
17201761

1762+
/* Physical address bits [45:44] carry the chip number. */
#define T241_CHIPN_MASK GENMASK_ULL(45, 44)
1763+
/* Offset added to the distributor physical base to reach a chip's GICD alias region. */
#define T241_CHIP_GICDA_OFFSET 0x1580000
1764+
/* SMCCC SoC ID version reported by the NVIDIA T241 (JEP106 036b:0241). */
#define SMCCC_SOC_ID_T241 0x036b0241
1765+
1766+
static bool gic_enable_quirk_nvidia_t241(void *data)
1767+
{
1768+
s32 soc_id = arm_smccc_get_soc_id_version();
1769+
unsigned long chip_bmask = 0;
1770+
phys_addr_t phys;
1771+
u32 i;
1772+
1773+
/* Check JEP106 code for NVIDIA T241 chip (036b:0241) */
1774+
if ((soc_id < 0) || (soc_id != SMCCC_SOC_ID_T241))
1775+
return false;
1776+
1777+
/* Find the chips based on GICR regions PHYS addr */
1778+
for (i = 0; i < gic_data.nr_redist_regions; i++) {
1779+
chip_bmask |= BIT(FIELD_GET(T241_CHIPN_MASK,
1780+
(u64)gic_data.redist_regions[i].phys_base));
1781+
}
1782+
1783+
if (hweight32(chip_bmask) < 3)
1784+
return false;
1785+
1786+
/* Setup GICD alias regions */
1787+
for (i = 0; i < ARRAY_SIZE(t241_dist_base_alias); i++) {
1788+
if (chip_bmask & BIT(i)) {
1789+
phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET;
1790+
phys |= FIELD_PREP(T241_CHIPN_MASK, i);
1791+
t241_dist_base_alias[i] = ioremap(phys, SZ_64K);
1792+
WARN_ON_ONCE(!t241_dist_base_alias[i]);
1793+
}
1794+
}
1795+
static_branch_enable(&gic_nvidia_t241_erratum);
1796+
return true;
1797+
}
1798+
17211799
static const struct gic_quirk gic_quirks[] = {
17221800
{
17231801
.desc = "GICv3: Qualcomm MSM8996 broken firmware",
@@ -1749,6 +1827,12 @@ static const struct gic_quirk gic_quirks[] = {
17491827
.mask = 0xe8f00fff,
17501828
.init = gic_enable_quirk_cavium_38539,
17511829
},
1830+
{
1831+
.desc = "GICv3: NVIDIA erratum T241-FABRIC-4",
1832+
.iidr = 0x0402043b,
1833+
.mask = 0xffffffff,
1834+
.init = gic_enable_quirk_nvidia_t241,
1835+
},
17521836
{
17531837
}
17541838
};
@@ -1816,7 +1900,8 @@ static void gic_enable_nmi_support(void)
18161900
gic_chip.flags |= IRQCHIP_SUPPORTS_NMI;
18171901
}
18181902

1819-
static int __init gic_init_bases(void __iomem *dist_base,
1903+
static int __init gic_init_bases(phys_addr_t dist_phys_base,
1904+
void __iomem *dist_base,
18201905
struct redist_region *rdist_regs,
18211906
u32 nr_redist_regions,
18221907
u64 redist_stride,
@@ -1832,6 +1917,7 @@ static int __init gic_init_bases(void __iomem *dist_base,
18321917
pr_info("GIC: Using split EOI/Deactivate mode\n");
18331918

18341919
gic_data.fwnode = handle;
1920+
gic_data.dist_phys_base = dist_phys_base;
18351921
gic_data.dist_base = dist_base;
18361922
gic_data.redist_regions = rdist_regs;
18371923
gic_data.nr_redist_regions = nr_redist_regions;
@@ -1859,10 +1945,13 @@ static int __init gic_init_bases(void __iomem *dist_base,
18591945
gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
18601946
&gic_data);
18611947
gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
1862-
gic_data.rdists.has_rvpeid = true;
1863-
gic_data.rdists.has_vlpis = true;
1864-
gic_data.rdists.has_direct_lpi = true;
1865-
gic_data.rdists.has_vpend_valid_dirty = true;
1948+
if (!static_branch_unlikely(&gic_nvidia_t241_erratum)) {
1949+
/* Disable GICv4.x features for the erratum T241-FABRIC-4 */
1950+
gic_data.rdists.has_rvpeid = true;
1951+
gic_data.rdists.has_vlpis = true;
1952+
gic_data.rdists.has_direct_lpi = true;
1953+
gic_data.rdists.has_vpend_valid_dirty = true;
1954+
}
18661955

18671956
if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
18681957
err = -ENOMEM;
@@ -2065,6 +2154,7 @@ static void __iomem *gic_of_iomap(struct device_node *node, int idx,
20652154

20662155
static int __init gic_of_init(struct device_node *node, struct device_node *parent)
20672156
{
2157+
phys_addr_t dist_phys_base;
20682158
void __iomem *dist_base;
20692159
struct redist_region *rdist_regs;
20702160
struct resource res;
@@ -2078,6 +2168,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
20782168
return PTR_ERR(dist_base);
20792169
}
20802170

2171+
dist_phys_base = res.start;
2172+
20812173
err = gic_validate_dist_version(dist_base);
20822174
if (err) {
20832175
pr_err("%pOF: no distributor detected, giving up\n", node);
@@ -2109,8 +2201,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
21092201

21102202
gic_enable_of_quirks(node, gic_quirks, &gic_data);
21112203

2112-
err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
2113-
redist_stride, &node->fwnode);
2204+
err = gic_init_bases(dist_phys_base, dist_base, rdist_regs,
2205+
nr_redist_regions, redist_stride, &node->fwnode);
21142206
if (err)
21152207
goto out_unmap_rdist;
21162208

@@ -2426,8 +2518,9 @@ gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
24262518
goto out_redist_unmap;
24272519
}
24282520

2429-
err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
2430-
acpi_data.nr_redist_regions, 0, gsi_domain_handle);
2521+
err = gic_init_bases(dist->base_address, acpi_data.dist_base,
2522+
acpi_data.redist_regs, acpi_data.nr_redist_regions,
2523+
0, gsi_domain_handle);
24312524
if (err)
24322525
goto out_fwhandle_free;
24332526

include/linux/arm-smccc.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,24 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit);
226226

227227
extern u64 smccc_has_sve_hint;
228228

229+
/**
230+
* arm_smccc_get_soc_id_version() - Get the SoC ID version
231+
*
232+
* Return: the SoC ID version.
233+
*
234+
* When ARM_SMCCC_ARCH_SOC_ID is not present, returns SMCCC_RET_NOT_SUPPORTED.
235+
*/
236+
s32 arm_smccc_get_soc_id_version(void);
237+
238+
/**
239+
* arm_smccc_get_soc_id_revision() - Get the SoC ID revision
240+
*
241+
* Return: the SoC ID revision.
242+
*
243+
* When ARM_SMCCC_ARCH_SOC_ID is not present, returns SMCCC_RET_NOT_SUPPORTED.
244+
*/
245+
s32 arm_smccc_get_soc_id_revision(void);
246+
229247
/**
230248
* struct arm_smccc_res - Result from SMC/HVC call
231249
* @a0-a3 result values from registers 0 to 3

0 commit comments

Comments
 (0)