Skip to content

[NPU] Add NPU optimized sparsemax #1104

Open
lowdy1 wants to merge 1 commit into linkedin:main from
lowdy1:sparsemax
Open

[NPU] Add NPU optimized sparsemax #1104
lowdy1 wants to merge 1 commit into linkedin:main from
lowdy1:sparsemax

Conversation

@lowdy1
Copy link
Contributor

@lowdy1 lowdy1 commented Feb 14, 2026

Summary

This PR adds a usable Sparsemax implementation on NPU that passes all tests and runs all benchmarks without errors. It includes both tiled and non-tiled kernels for forward and backward. However, due to the relatively complex tiled forward computation, the forward kernel performance may still require further optimization.

Testing Done

Tested with `python -m pytest ./test/transformers/test_sparsemax.py -v`
on an Atlas 800I A2 (32GB).

  • Hardware Type: Atlas 800I A2 (32GB)
  • run make test to ensure correctness
  • run make checkstyle to ensure code style
  • run make test-convergence to ensure convergence

@lowdy1
Copy link
Contributor Author

lowdy1 commented Feb 14, 2026


 BENCHMARKING SPEED for SPARSEMAX

********** Benchmark Data **********
[
{
"kernel_name": "sparsemax",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
1.9094799757003784,
3.7213799953460693,
14.270119667053223,
29.154630661010742,
58.19636154174805,
117.08264923095703
],
"y_values_20": [
1.9082720279693604,
3.720412015914917,
14.269319534301758,
29.153060913085938,
58.192779541015625,
117.0779037475586
],
"y_values_80": [
1.9107600450515747,
3.722259998321533,
14.271140098571777,
29.15753936767578,
58.1993522644043,
117.08646392822266
],
"timestamp": "2026-02-14 08:38:08",
"kernel_operation_mode": "forward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
1.6931400299072266,
2.1005799770355225,
3.079580068588257,
6.020939826965332,
11.919639587402344,
24.208271026611328
],
"y_values_20": [
1.6891120672225952,
2.0926198959350586,
3.064379930496216,
6.003036022186279,
11.878459930419922,
24.162364959716797
],
"y_values_80": [
1.6984000205993652,
2.1066958904266357,
3.090059995651245,
6.049811840057373,
11.953760147094727,
24.263195037841797
],
"timestamp": "2026-02-14 08:38:11",
"kernel_operation_mode": "forward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
2.835059881210327,
5.512899875640869,
17.788179397583008,
31.29404067993164,
62.44466018676758,
125.57151794433594
],
"y_values_20": [
2.8338239192962646,
5.511876106262207,
17.787044525146484,
31.29190444946289,
62.441490173339844,
125.57073974609375
],
"y_values_80": [
2.836331844329834,
5.5136919021606445,
17.789840698242188,
31.297374725341797,
62.4507942199707,
125.57853698730469
],
"timestamp": "2026-02-14 08:38:26",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
2.756899833679199,
4.205060005187988,
7.279119968414307,
14.797189712524414,
40.63865661621094,
111.20559692382812
],
"y_values_20": [
2.7510080337524414,
4.19704008102417,
7.269708156585693,
14.772064208984375,
40.620086669921875,
111.14126586914062
],
"y_values_80": [
2.7629239559173584,
4.209980010986328,
7.291855812072754,
14.816191673278809,
40.699920654296875,
111.25631713867188
],
"timestamp": "2026-02-14 08:38:30",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
0.9273399710655212,
1.791159987449646,
3.5187599658966064,
2.1222000122070312,
4.232600212097168,
8.457340240478516
],
"y_values_20": [
0.9268199801445007,
1.79066002368927,
3.517331838607788,
2.1211118698120117,
4.231019973754883,
8.453971862792969
],
"y_values_80": [
0.9278799891471863,
1.7915799617767334,
3.523831844329834,
2.123255968093872,
4.23497200012207,
8.461055755615234
],
"timestamp": "2026-02-14 08:38:33",
"kernel_operation_mode": "backward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
1.1463099718093872,
2.1653800010681152,
4.292479991912842,
8.867199897766113,
28.778961181640625,
87.06780242919922
],
"y_values_20": [
1.1445800065994263,
2.1640119552612305,
4.289247989654541,
8.864232063293457,
28.77522087097168,
87.06510162353516
],
"y_values_80": [
1.1490639448165894,
2.169152021408081,
4.297499656677246,
8.871403694152832,
28.782100677490234,
87.06903076171875
],
"timestamp": "2026-02-14 08:38:37",
"kernel_operation_mode": "backward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
}
]


 BENCHMARKING MEMORY for SPARSEMAX

********** Benchmark Data **********
[
{
"kernel_name": "sparsemax",
"kernel_provider": "liger",
"metric_name": "memory",
"metric_unit": "MB",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
56.0048828125,
112.0048828125,
224.0048828125,
704.00537109375,
1408.00537109375,
2816.00537109375
],
"y_values_20": [
56.0048828125,
112.0048828125,
224.0048828125,
704.00537109375,
1408.00537109375,
2816.00537109375
],
"y_values_80": [
56.0048828125,
112.0048828125,
224.0048828125,
704.00537109375,
1408.00537109375,
2816.00537109375
],
"timestamp": "2026-02-14 08:38:37",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
},
{
"kernel_name": "sparsemax",
"kernel_provider": "torch",
"metric_name": "memory",
"metric_unit": "MB",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "feature size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
106.0224609375,
196.0224609375,
376.0224609375,
736.0224609375,
1456.0224609375,
2896.0224609375
],
"y_values_20": [
106.0224609375,
196.0224609375,
376.0224609375,
736.0224609375,
1456.0224609375,
2896.0224609375
],
"y_values_80": [
106.0224609375,
196.0224609375,
376.0224609375,
736.0224609375,
1456.0224609375,
2896.0224609375
],
"timestamp": "2026-02-14 08:38:37",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 512, \"dim\": -1, \"dtype\": \"torch.float32\"}",
"liger_version": "0.6.4"
}
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant