Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit eed8c14

Browse files
cota, Google-ML-Automation
authored and committed Mar 13, 2025 ·
[xla:cpu] flip xla_cpu_use_fusion_emitters to true
Note, however, that no fusion emitter is enabled yet. Flipping this flag does change the HLO pipeline, and is therefore worth submitting as its own CL to get test coverage and measure performance differences. The largest perf difference can be seen in Gather: <details> <summary>Microbenchmarks thunks vs. fusion emitters</summary> ``` │ thunks │ fusion_emitters │ │ cpu-sec/op │ cpu-sec/op vs base │ BM_ConcatenateTwoR3F32/parallel:0/batch:128/width:64/height:256/axis:0/process_time 602.3µ ± 0% 603.0µ ± 1% ~ (p=0.485 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:128/width:64/height:256/axis:1/process_time 624.8µ ± 2% 806.0µ ± 1% +29.00% (p=0.002 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:128/width:64/height:256/axis:2/process_time 998.4µ ± 0% 1127.5µ ± 1% +12.93% (p=0.002 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:256/width:128/height:64/axis:0/process_time 605.0µ ± 4% 609.5µ ± 2% ~ (p=0.132 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:256/width:128/height:64/axis:1/process_time 620.0µ ± 2% 854.8µ ± 28% ~ (p=0.132 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:256/width:128/height:64/axis:2/process_time 1.284m ± 3% 1.936m ± 7% +50.79% (p=0.002 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:64/width:256/height:128/axis:0/process_time 602.2µ ± 3% 602.8µ ± 1% ~ (p=0.699 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:64/width:256/height:128/axis:1/process_time 615.2µ ± 0% 828.1µ ± 8% +34.61% (p=0.002 n=6) BM_ConcatenateTwoR3F32/parallel:0/batch:64/width:256/height:128/axis:2/process_time 1.031m ± 1% 1.175m ± 6% +13.99% (p=0.002 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:128/width:64/height:256/axis:0/process_time 1.753m ± 13% 1.650m ± 8% ~ (p=0.180 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:128/width:64/height:256/axis:1/process_time 1.711m ± 6% 1.804m ± 11% ~ (p=0.310 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:128/width:64/height:256/axis:2/process_time 2.602m ± 8% 2.434m ± 7% -6.47% (p=0.041 n=6) 
BM_ConcatenateTwoR3F32/parallel:1/batch:256/width:128/height:64/axis:0/process_time 1.684m ± 12% 1.730m ± 14% ~ (p=0.485 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:256/width:128/height:64/axis:1/process_time 1.798m ± 10% 1.668m ± 5% -7.25% (p=0.026 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:256/width:128/height:64/axis:2/process_time 2.287m ± 12% 2.251m ± 7% ~ (p=0.180 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:64/width:256/height:128/axis:0/process_time 1.703m ± 7% 1.666m ± 6% ~ (p=0.485 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:64/width:256/height:128/axis:1/process_time 1.727m ± 5% 1.717m ± 6% ~ (p=0.589 n=6) BM_ConcatenateTwoR3F32/parallel:1/batch:64/width:256/height:128/axis:2/process_time 2.232m ± 11% 2.235m ± 16% ~ (p=0.818 n=6) BM_Conv1DStrided/1/129/process_time 54.79m ± 8% 54.83m ± 6% ~ (p=0.699 n=6) BM_Conv1DStrided/3/129/process_time 171.4m ± 9% 170.4m ± 6% ~ (p=1.000 n=6) BM_Conv1DTransposedStrided/129/1/process_time 47.47m ± 3% 53.37m ± 7% +12.43% (p=0.002 n=6) BM_Conv1DTransposedStrided/129/3/process_time 155.8m ± 5% 155.1m ± 9% ~ (p=0.937 n=6) BM_Conv1DTransposedStridedNonDefaultLayout/129/1/process_time 37.36m ± 9% 34.52m ± 2% ~ (p=0.180 n=6) BM_Conv1DTransposedStridedNonDefaultLayout/129/3/process_time 122.1m ± 5% 128.4m ± 7% ~ (p=0.132 n=6) BM_Conv2D<F32>/16/32/32/128/1/1/1024/process_time 123.6m ± 21% 121.2m ± 14% ~ (p=0.180 n=6) BM_Conv2D<F32>/16/32/32/128/3/3/1024/process_time 888.2m ± 10% 1032.4m ± 27% ~ (p=0.093 n=6) BM_Conv2D<F32>/32/256/256/4/1/1/16/process_time 202.9m ± 8% 190.3m ± 8% -6.24% (p=0.026 n=6) BM_Conv2D<F32>/32/256/256/4/3/3/16/process_time 427.8m ± 15% 376.0m ± 5% -12.10% (p=0.041 n=6) BM_Conv2D<F32>/32/32/32/128/1/1/1024/process_time 249.9m ± 4% 249.0m ± 3% ~ (p=0.937 n=6) BM_Conv2D<F32>/32/32/32/128/3/3/1024/process_time 1.948 ± 8% 1.890 ± 6% ~ (p=0.310 n=6) BM_Conv2D<F32>/32/32/32/96/1/1/96/process_time 21.98m ± 8% 22.39m ± 3% ~ (p=0.818 n=6) BM_Conv2D<F32>/32/32/32/96/3/3/96/process_time 141.6m ± 4% 144.9m ± 3% ~ 
(p=0.310 n=6) BM_Conv2D<F32>/32/64/64/32/1/1/64/process_time 29.17m ± 3% 27.90m ± 4% -4.35% (p=0.015 n=6) BM_Conv2D<F32>/32/64/64/32/3/3/64/process_time 161.0m ± 4% 159.4m ± 3% ~ (p=0.310 n=6) BM_Conv2D<F32>/32/64/64/4/1/1/16/process_time 1.202m ± 5% 1.207m ± 14% ~ (p=0.699 n=6) BM_Conv2D<F32>/32/64/64/4/3/3/16/process_time 26.16m ± 4% 26.60m ± 3% ~ (p=0.240 n=6) BM_Conv2D<F32>/8/128/128/4/1/1/8/process_time 457.0µ ± 4% 453.2µ ± 4% ~ (p=0.589 n=6) BM_Conv2D<F32>/8/128/128/4/3/3/8/process_time 14.59m ± 4% 14.81m ± 4% ~ (p=0.485 n=6) BM_Conv2D<F32>/8/32/32/128/1/1/1024/process_time 62.50m ± 10% 57.55m ± 8% ~ (p=0.065 n=6) BM_Conv2D<F32>/8/32/32/128/3/3/1024/process_time 449.5m ± 6% 451.0m ± 6% ~ (p=0.937 n=6) BM_Conv2D<F32>/8/5/5/1/1/1/32/process_time 3.269µ ± 4% 3.262µ ± 1% ~ (p=0.589 n=6) BM_Conv2D<F32>/8/5/5/1/3/3/32/process_time 13.38µ ± 9% 13.31µ ± 0% -0.52% (p=0.009 n=6) BM_Conv2D<F32>/8/5/5/4/1/1/32/process_time 1.821µ ± 1% 1.842µ ± 0% +1.10% (p=0.041 n=6) BM_Conv2D<F32>/8/5/5/4/3/3/32/process_time 17.93µ ± 4% 18.00µ ± 2% ~ (p=0.310 n=6) BM_Conv2DStrided/process_time 57.50m ± 5% 59.22m ± 6% ~ (p=0.394 n=6) BM_Conv2DTransposedStrided/process_time 47.48m ± 2% 48.09m ± 4% ~ (p=0.310 n=6) BM_GroupedConv2D/1/45/45/1024/5/5/1024/1024/process_time 439.2m ± 5% 425.9m ± 3% ~ (p=0.093 n=6) BM_GroupedConv2DStrided/128/128/128/process_time 65.37m ± 8% 60.74m ± 6% ~ (p=0.132 n=6) BM_GroupedConv2DStrided/128/128/16/process_time 57.54m ± 7% 60.12m ± 11% ~ (p=0.485 n=6) BM_GroupedConv2DTransposedStrided/128/128/128/process_time 4.373 ± 4% 4.342 ± 5% ~ (p=0.937 n=6) BM_GroupedConv2DTransposedStrided/128/128/16/process_time 4.997 ± 5% 4.696 ± 7% -6.02% (p=0.026 n=6) BM_CustomCall_16FloatBuffers/process_time 2.039µ ± 4% 1.916µ ± 4% -5.99% (p=0.004 n=6) BM_CustomCall_16IntAttributes/process_time 677.2n ± 2% 642.6n ± 1% -5.10% (p=0.002 n=6) BM_CustomCall_Minimal/process_time 590.2n ± 1% 557.3n ± 1% -5.57% (p=0.002 n=6) BM_DagExecution/1024/process_time 7.481m ± 1% 7.649m ± 9% 
+2.24% (p=0.009 n=6) BM_DagExecution/128/process_time 682.5µ ± 5% 689.5µ ± 2% ~ (p=0.065 n=6) BM_DagExecution/16384/process_time 171.4m ± 9% 197.4m ± 13% +15.17% (p=0.015 n=6) BM_DagExecution/256/process_time 1.926m ± 4% 1.939m ± 1% ~ (p=0.699 n=6) BM_DagExecution/512/process_time 3.449m ± 1% 3.467m ± 2% ~ (p=0.240 n=6) BM_DagExecution/8192/process_time 59.51m ± 15% 62.34m ± 15% ~ (p=0.310 n=6) BM_BatchedDot/11/1/128/process_time 37.62µ ± 0% 37.50µ ± 0% -0.33% (p=0.002 n=6) BM_BatchedDot/11/1/2/process_time 488.7n ± 11% 458.3n ± 3% -6.21% (p=0.002 n=6) BM_BatchedDot/11/1/256/process_time 766.9µ ± 3% 769.9µ ± 3% ~ (p=0.818 n=6) BM_BatchedDot/11/1/32/process_time 1.770µ ± 2% 1.716µ ± 0% -3.05% (p=0.002 n=6) BM_BatchedDot/11/1/512/process_time 9.177m ± 1% 9.440m ± 2% +2.86% (p=0.004 n=6) BM_BatchedDot/11/1/64/process_time 6.431µ ± 2% 6.376µ ± 0% -0.85% (p=0.002 n=6) BM_BatchedDot/11/2/128/process_time 74.44µ ± 0% 74.38µ ± 0% ~ (p=0.310 n=6) BM_BatchedDot/11/2/2/process_time 495.0n ± 1% 462.9n ± 4% -6.49% (p=0.002 n=6) BM_BatchedDot/11/2/256/process_time 1.623m ± 26% 1.575m ± 3% ~ (p=1.000 n=6) BM_BatchedDot/11/2/32/process_time 2.804µ ± 0% 2.750µ ± 0% -1.92% (p=0.002 n=6) BM_BatchedDot/11/2/512/process_time 26.11m ± 3% 22.17m ± 2% -15.08% (p=0.002 n=6) BM_BatchedDot/11/2/64/process_time 12.10µ ± 1% 12.00µ ± 1% -0.81% (p=0.015 n=6) BM_BatchedDot/11/4/128/process_time 148.8µ ± 0% 148.6µ ± 0% -0.13% (p=0.002 n=6) BM_BatchedDot/11/4/2/process_time 500.0n ± 1% 465.5n ± 2% -6.91% (p=0.002 n=6) BM_BatchedDot/11/4/256/process_time 3.716m ± 2% 3.192m ± 4% -14.11% (p=0.002 n=6) BM_BatchedDot/11/4/32/process_time 4.818µ ± 1% 4.788µ ± 4% ~ (p=0.394 n=6) BM_BatchedDot/11/4/512/process_time 47.46m ± 3% 46.98m ± 1% ~ (p=0.065 n=6) BM_BatchedDot/11/4/64/process_time 23.36µ ± 0% 23.26µ ± 0% -0.45% (p=0.002 n=6) BM_BatchedDot/11/8/128/process_time 298.0µ ± 0% 297.8µ ± 0% -0.05% (p=0.041 n=6) BM_BatchedDot/11/8/2/process_time 514.2n ± 2% 475.1n ± 1% -7.60% (p=0.002 n=6) 
BM_BatchedDot/11/8/256/process_time 7.223m ± 7% 6.327m ± 4% -12.40% (p=0.002 n=6) BM_BatchedDot/11/8/32/process_time 8.787µ ± 0% 8.739µ ± 0% -0.55% (p=0.002 n=6) BM_BatchedDot/11/8/512/process_time 101.90m ± 5% 98.02m ± 3% ~ (p=0.180 n=6) BM_BatchedDot/11/8/64/process_time 46.00µ ± 0% 45.93µ ± 0% ~ (p=0.065 n=6) BM_BatchedDot/16/1/128/process_time 40.95µ ± 0% 40.84µ ± 0% -0.27% (p=0.002 n=6) BM_BatchedDot/16/1/2/process_time 557.7n ± 5% 523.7n ± 6% -6.10% (p=0.004 n=6) BM_BatchedDot/16/1/256/process_time 1.113m ± 4% 1.078m ± 3% ~ (p=0.093 n=6) BM_BatchedDot/16/1/32/process_time 2.059µ ± 1% 2.001µ ± 1% -2.85% (p=0.002 n=6) BM_BatchedDot/16/1/512/process_time 11.57m ± 2% 11.10m ± 1% -4.02% (p=0.002 n=6) BM_BatchedDot/16/1/64/process_time 7.363µ ± 1% 7.293µ ± 1% -0.95% (p=0.002 n=6) BM_BatchedDot/16/2/128/process_time 81.31µ ± 0% 81.00µ ± 0% -0.38% (p=0.002 n=6) BM_BatchedDot/16/2/2/process_time 594.0n ± 4% 530.4n ± 2% -10.71% (p=0.002 n=6) BM_BatchedDot/16/2/256/process_time 2.270m ± 1% 2.212m ± 4% ~ (p=0.180 n=6) BM_BatchedDot/16/2/32/process_time 3.348µ ± 1% 3.297µ ± 1% -1.54% (p=0.002 n=6) BM_BatchedDot/16/2/512/process_time 25.68m ± 2% 25.40m ± 2% ~ (p=0.485 n=6) BM_BatchedDot/16/2/64/process_time 13.89µ ± 1% 13.73µ ± 0% -1.19% (p=0.002 n=6) BM_BatchedDot/16/4/128/process_time 163.8µ ± 0% 163.6µ ± 0% ~ (p=0.093 n=6) BM_BatchedDot/16/4/2/process_time 607.9n ± 1% 542.8n ± 1% -10.71% (p=0.002 n=6) BM_BatchedDot/16/4/256/process_time 4.782m ± 1% 4.668m ± 2% -2.39% (p=0.009 n=6) BM_BatchedDot/16/4/32/process_time 5.754µ ± 1% 5.675µ ± 2% -1.38% (p=0.041 n=6) BM_BatchedDot/16/4/512/process_time 56.58m ± 3% 55.00m ± 1% -2.78% (p=0.015 n=6) BM_BatchedDot/16/4/64/process_time 26.67µ ± 0% 26.59µ ± 0% ~ (p=0.093 n=6) BM_BatchedDot/16/8/128/process_time 749.6µ ± 16% 753.7µ ± 4% ~ (p=0.180 n=6) BM_BatchedDot/16/8/2/process_time 632.7n ± 4% 636.7n ± 1% ~ (p=0.699 n=6) BM_BatchedDot/16/8/256/process_time 9.479m ± 2% 9.313m ± 2% -1.75% (p=0.009 n=6) 
BM_BatchedDot/16/8/32/process_time 10.52µ ± 0% 10.53µ ± 0% ~ (p=0.699 n=6) BM_BatchedDot/16/8/512/process_time 114.5m ± 7% 113.0m ± 4% ~ (p=0.240 n=6) BM_BatchedDot/16/8/64/process_time 52.56µ ± 0% 52.50µ ± 0% -0.13% (p=0.015 n=6) BM_DynamicUpdateSliceF32/1024/process_time 25.78µ ± 1% 25.95µ ± 3% ~ (p=0.065 n=6) BM_DynamicUpdateSliceF32/128/process_time 3.151µ ± 3% 3.026µ ± 0% -3.97% (p=0.002 n=6) BM_DynamicUpdateSliceF32/16384/process_time 798.0µ ± 7% 764.4µ ± 4% -4.22% (p=0.015 n=6) BM_DynamicUpdateSliceF32/256/process_time 5.810µ ± 1% 5.680µ ± 3% ~ (p=0.065 n=6) BM_DynamicUpdateSliceF32/512/process_time 13.70µ ± 3% 13.65µ ± 3% ~ (p=0.240 n=6) BM_DynamicUpdateSliceF32/8192/process_time 435.5µ ± 2% 427.8µ ± 5% ~ (p=0.180 n=6) BM_AddBF16/1024/process_time 198.4µ ± 1% 183.6µ ± 2% -7.45% (p=0.002 n=6) BM_AddBF16/128/process_time 10.23µ ± 0% 10.17µ ± 0% -0.56% (p=0.002 n=6) BM_AddBF16/16384/process_time 2.231m ± 4% 2.164m ± 27% ~ (p=1.000 n=6) BM_AddBF16/256/process_time 37.87µ ± 3% 33.62µ ± 1% -11.23% (p=0.002 n=6) BM_AddBF16/32768/process_time 6.420m ± 15% 5.611m ± 16% -12.61% (p=0.015 n=6) BM_AddBF16/512/process_time 110.95µ ± 1% 95.40µ ± 1% -14.02% (p=0.002 n=6) BM_AddBF16/8192/process_time 1.050m ± 6% 1.022m ± 7% ~ (p=0.589 n=6) BM_AddF32/1024/process_time 343.5µ ± 5% 320.5µ ± 4% -6.70% (p=0.002 n=6) BM_AddF32/128/process_time 21.95µ ± 1% 21.79µ ± 2% ~ (p=0.240 n=6) BM_AddF32/16384/process_time 6.284m ± 15% 5.409m ± 11% -13.91% (p=0.002 n=6) BM_AddF32/256/process_time 67.42µ ± 1% 67.63µ ± 1% ~ (p=0.485 n=6) BM_AddF32/32768/process_time 15.24m ± 6% 14.96m ± 5% ~ (p=0.394 n=6) BM_AddF32/512/process_time 164.0µ ± 14% 146.4µ ± 1% ~ (p=0.394 n=6) BM_AddF32/8192/process_time 1.967m ± 19% 1.792m ± 24% -8.90% (p=0.041 n=6) BM_ConvertF32ToBF16/1024/process_time 154.1µ ± 5% 185.0µ ± 1% +20.04% (p=0.002 n=6) BM_ConvertF32ToBF16/128/process_time 5.918µ ± 1% 5.861µ ± 1% ~ (p=0.093 n=6) BM_ConvertF32ToBF16/16384/process_time 1.901m ± 10% 1.973m ± 7% ~ (p=0.589 n=6) 
BM_ConvertF32ToBF16/256/process_time 25.28µ ± 1% 30.70µ ± 1% +21.43% (p=0.002 n=6) BM_ConvertF32ToBF16/32768/process_time 5.758m ± 45% 6.767m ± 18% ~ (p=0.310 n=6) BM_ConvertF32ToBF16/512/process_time 73.36µ ± 1% 91.98µ ± 1% +25.38% (p=0.002 n=6) BM_ConvertF32ToBF16/8192/process_time 977.4µ ± 10% 1046.1µ ± 9% ~ (p=0.394 n=6) BM_BcastFusionF32/1024/process_time 279.5µ ± 3% 274.0µ ± 2% -1.96% (p=0.015 n=6) BM_BcastFusionF32/128/process_time 20.29µ ± 2% 19.91µ ± 2% -1.90% (p=0.015 n=6) BM_BcastFusionF32/16384/process_time 3.623m ± 7% 3.459m ± 8% ~ (p=0.093 n=6) BM_BcastFusionF32/256/process_time 52.64µ ± 3% 52.06µ ± 9% ~ (p=0.394 n=6) BM_BcastFusionF32/512/process_time 113.9µ ± 1% 117.3µ ± 1% +2.92% (p=0.002 n=6) BM_BcastFusionF32/8192/process_time 1.739m ± 9% 1.657m ± 6% ~ (p=0.240 n=6) BM_ChainOfAddF32/1024/process_time 72.65µ ± 1% 71.52µ ± 2% -1.56% (p=0.041 n=6) BM_ChainOfAddF32/128/process_time 12.72µ ± 8% 12.43µ ± 1% ~ (p=0.394 n=6) BM_ChainOfAddF32/256/process_time 20.55µ ± 1% 20.06µ ± 0% -2.39% (p=0.002 n=6) BM_ChainOfAddF32/512/process_time 37.05µ ± 2% 36.13µ ± 2% -2.50% (p=0.004 n=6) BM_ChainOfAddF32/64/process_time 8.835µ ± 1% 8.864µ ± 1% ~ (p=0.394 n=6) BM_DynamicUpdateSliceFusionF32/1024/process_time 26.06µ ± 2% 25.82µ ± 2% ~ (p=0.310 n=6) BM_DynamicUpdateSliceFusionF32/128/process_time 2.891µ ± 0% 2.952µ ± 2% +2.12% (p=0.002 n=6) BM_DynamicUpdateSliceFusionF32/16384/process_time 780.4µ ± 4% 774.4µ ± 7% ~ (p=0.394 n=6) BM_DynamicUpdateSliceFusionF32/256/process_time 5.538µ ± 1% 5.766µ ± 3% +4.11% (p=0.002 n=6) BM_DynamicUpdateSliceFusionF32/512/process_time 13.23µ ± 2% 13.42µ ± 2% +1.45% (p=0.041 n=6) BM_DynamicUpdateSliceFusionF32/8192/process_time 431.9µ ± 4% 431.0µ ± 5% ~ (p=0.818 n=6) BM_FusionF32/1024/process_time 322.1µ ± 1% 319.2µ ± 1% ~ (p=0.310 n=6) BM_FusionF32/128/process_time 22.09µ ± 2% 21.97µ ± 1% ~ (p=0.240 n=6) BM_FusionF32/16384/process_time 5.473m ± 13% 6.237m ± 10% +13.95% (p=0.015 n=6) BM_FusionF32/256/process_time 67.53µ ± 2% 68.25µ ± 
8% +1.07% (p=0.009 n=6) BM_FusionF32/512/process_time 146.9µ ± 7% 147.0µ ± 7% ~ (p=0.818 n=6) BM_FusionF32/8192/process_time 1.855m ± 10% 1.989m ± 16% +7.25% (p=0.041 n=6) BM_FusionF32_2/160/process_time 1.227µ ± 4% 1.283µ ± 1% +4.55% (p=0.002 n=6) BM_FusionF32_2/240/process_time 1.501µ ± 3% 1.560µ ± 1% +3.90% (p=0.002 n=6) BM_FusionF32_2/40/process_time 827.2n ± 2% 873.4n ± 1% +5.60% (p=0.002 n=6) BM_FusionF32_2/80/process_time 963.5n ± 2% 1009.0n ± 1% +4.73% (p=0.002 n=6) BM_GatherS32/10/128/1/process_time 564.5n ± 19% 541.5n ± 2% ~ (p=0.394 n=6) BM_GatherS32/10/128/2/process_time 466.5n ± 3% 549.0n ± 2% +17.69% (p=0.002 n=6) BM_GatherS32/10/128/32/process_time 639.6n ± 5% 718.4n ± 2% +12.31% (p=0.002 n=6) BM_GatherS32/10/256/1/process_time 469.1n ± 5% 552.1n ± 0% +17.70% (p=0.002 n=6) BM_GatherS32/10/256/2/process_time 477.7n ± 1% 562.1n ± 1% +17.66% (p=0.002 n=6) BM_GatherS32/10/256/64/process_time 1.219µ ± 1% 1.286µ ± 1% +5.50% (p=0.002 n=6) BM_GatherS32/10/3/1/process_time 558.5n ± 0% 454.7n ± 3% -18.58% (p=0.002 n=6) BM_GatherS32/10/3/2/process_time 565.3n ± 1% 454.6n ± 1% -19.59% (p=0.002 n=6) BM_GatherS32/10/3/4/process_time 566.3n ± 0% 462.2n ± 1% -18.38% (p=0.002 n=6) BM_GatherS32/10/32/1/process_time 561.1n ± 1% 458.6n ± 1% -18.26% (p=0.002 n=6) BM_GatherS32/10/32/2/process_time 561.6n ± 5% 457.5n ± 21% -18.53% (p=0.002 n=6) BM_GatherS32/10/32/8/process_time 573.4n ± 9% 560.8n ± 2% -2.19% (p=0.004 n=6) BM_GatherS32/10/512/1/process_time 489.2n ± 3% 576.0n ± 2% +17.74% (p=0.002 n=6) BM_GatherS32/10/512/128/process_time 18.982µ ± 2% 4.127µ ± 3% -78.26% (p=0.002 n=6) BM_GatherS32/10/512/2/process_time 506.1n ± 2% 587.1n ± 2% +15.99% (p=0.002 n=6) BM_GatherS32/10/64/1/process_time 563.9n ± 4% 534.6n ± 1% -5.20% (p=0.002 n=6) BM_GatherS32/10/64/16/process_time 603.1n ± 2% 582.6n ± 1% -3.41% (p=0.002 n=6) BM_GatherS32/10/64/2/process_time 567.9n ± 1% 548.6n ± 1% -3.40% (p=0.002 n=6) BM_GatherS32/100/128/1/process_time 460.2n ± 0% 550.9n ± 1% +19.71% (p=0.002 
n=6) BM_GatherS32/100/128/2/process_time 469.1n ± 3% 562.0n ± 1% +19.80% (p=0.002 n=6) BM_GatherS32/100/128/32/process_time 768.6n ± 1% 849.3n ± 1% +10.51% (p=0.002 n=6) BM_GatherS32/100/256/1/process_time 468.1n ± 1% 561.8n ± 1% +20.02% (p=0.002 n=6) BM_GatherS32/100/256/2/process_time 479.0n ± 1% 567.1n ± 2% +18.38% (p=0.002 n=6) BM_GatherS32/100/256/64/process_time 1.604µ ± 0% 1.674µ ± 0% +4.33% (p=0.002 n=6) BM_GatherS32/100/3/1/process_time 452.1n ± 2% 533.6n ± 1% +18.02% (p=0.002 n=6) BM_GatherS32/100/3/2/process_time 455.0n ± 1% 556.9n ± 3% +22.40% (p=0.002 n=6) BM_GatherS32/100/3/4/process_time 463.5n ± 4% 537.0n ± 1% +15.85% (p=0.002 n=6) BM_GatherS32/100/32/1/process_time 460.3n ± 1% 535.5n ± 3% +16.35% (p=0.002 n=6) BM_GatherS32/100/32/2/process_time 461.6n ± 1% 549.1n ± 1% +18.96% (p=0.002 n=6) BM_GatherS32/100/32/8/process_time 467.6n ± 0% 566.6n ± 1% +21.15% (p=0.002 n=6) BM_GatherS32/100/512/1/process_time 479.8n ± 4% 567.8n ± 2% +18.35% (p=0.002 n=6) BM_GatherS32/100/512/128/process_time 20.158µ ± 3% 5.464µ ± 3% -72.89% (p=0.002 n=6) BM_GatherS32/100/512/2/process_time 507.9n ± 3% 585.8n ± 1% +15.34% (p=0.002 n=6) BM_GatherS32/100/64/1/process_time 458.4n ± 4% 544.9n ± 3% +18.87% (p=0.002 n=6) BM_GatherS32/100/64/16/process_time 496.1n ± 2% 594.7n ± 3% +19.88% (p=0.002 n=6) BM_GatherS32/100/64/2/process_time 461.6n ± 1% 554.9n ± 1% +20.21% (p=0.002 n=6) BM_GatherS32/3/128/1/process_time 567.6n ± 1% 463.7n ± 5% -18.32% (p=0.002 n=6) BM_GatherS32/3/128/2/process_time 569.7n ± 2% 470.7n ± 1% -17.38% (p=0.002 n=6) BM_GatherS32/3/128/32/process_time 722.9n ± 0% 625.5n ± 2% -13.47% (p=0.002 n=6) BM_GatherS32/3/256/1/process_time 573.1n ± 2% 473.1n ± 2% -17.44% (p=0.002 n=6) BM_GatherS32/3/256/2/process_time 582.2n ± 1% 480.9n ± 2% -17.41% (p=0.002 n=6) BM_GatherS32/3/256/64/process_time 1.202µ ± 1% 1.107µ ± 1% -7.91% (p=0.002 n=6) BM_GatherS32/3/3/1/process_time 566.0n ± 1% 460.1n ± 16% -18.70% (p=0.002 n=6) BM_GatherS32/3/3/2/process_time 564.1n ± 0% 
456.7n ± 3% -19.05% (p=0.002 n=6) BM_GatherS32/3/3/4/process_time 567.6n ± 3% 461.9n ± 0% -18.63% (p=0.002 n=6) BM_GatherS32/3/32/1/process_time 563.9n ± 1% 459.4n ± 1% -18.53% (p=0.002 n=6) BM_GatherS32/3/32/2/process_time 562.9n ± 0% 459.0n ± 5% -18.46% (p=0.002 n=6) BM_GatherS32/3/32/8/process_time 572.5n ± 1% 469.5n ± 1% -18.01% (p=0.002 n=6) BM_GatherS32/3/512/1/process_time 585.8n ± 1% 482.0n ± 1% -17.73% (p=0.002 n=6) BM_GatherS32/3/512/128/process_time 18.549µ ± 3% 3.101µ ± 1% -83.28% (p=0.002 n=6) BM_GatherS32/3/512/2/process_time 612.2n ± 7% 505.5n ± 1% -17.42% (p=0.002 n=6) BM_GatherS32/3/64/1/process_time 562.2n ± 3% 457.4n ± 1% -18.63% (p=0.002 n=6) BM_GatherS32/3/64/16/process_time 604.1n ± 4% 494.6n ± 0% -18.13% (p=0.002 n=6) BM_GatherS32/3/64/2/process_time 567.1n ± 1% 463.8n ± 1% -18.22% (p=0.002 n=6) BM_Optimizer0/1024/process_time 5.125m ± 43% 5.046m ± 1% ~ (p=0.132 n=6) BM_Optimizer0/128/process_time 468.4µ ± 7% 449.4µ ± 12% -4.07% (p=0.026 n=6) BM_Optimizer0/16384/process_time 65.03m ± 5% 62.68m ± 7% ~ (p=0.485 n=6) BM_Optimizer0/256/process_time 1.219m ± 5% 1.208m ± 7% ~ (p=0.485 n=6) BM_Optimizer0/512/process_time 3.541m ± 32% 2.392m ± 3% -32.45% (p=0.015 n=6) BM_Optimizer0/8192/process_time 33.71m ± 2% 32.97m ± 5% ~ (p=0.132 n=6) BM_PadF32/1024/process_time 28.24m ± 9% 28.63m ± 6% ~ (p=0.485 n=6) BM_PadF32/128/process_time 339.8µ ± 1% 341.0µ ± 0% ~ (p=0.310 n=6) BM_PadF32/256/process_time 1.192m ± 4% 1.195m ± 3% ~ (p=0.589 n=6) BM_PadF32/4096/process_time 422.4m ± 1% 423.9m ± 0% ~ (p=0.310 n=6) BM_PadF32/512/process_time 4.339m ± 6% 4.170m ± 7% ~ (p=0.065 n=6) BM_ReduceAddBF16/1024/process_time 3.435m ± 2% 3.401m ± 9% ~ (p=0.180 n=6) BM_ReduceAddBF16/128/process_time 239.5µ ± 0% 239.0µ ± 0% -0.20% (p=0.002 n=6) BM_ReduceAddBF16/16384/process_time 51.15m ± 4% 50.94m ± 2% ~ (p=0.818 n=6) BM_ReduceAddBF16/256/process_time 914.1µ ± 11% 777.5µ ± 8% -14.94% (p=0.004 n=6) BM_ReduceAddBF16/512/process_time 1.704m ± 6% 1.680m ± 1% ~ (p=0.093 n=6) 
BM_ReduceAddBF16/8192/process_time 26.21m ± 3% 25.97m ± 7% ~ (p=0.699 n=6) BM_ReduceAddF32/1024/process_time 488.1µ ± 1% 487.1µ ± 0% ~ (p=0.093 n=6) BM_ReduceAddF32/128/process_time 23.36µ ± 0% 23.42µ ± 0% +0.26% (p=0.026 n=6) BM_ReduceAddF32/16384/process_time 6.756m ± 1% 6.672m ± 0% -1.23% (p=0.002 n=6) BM_ReduceAddF32/256/process_time 62.44µ ± 1% 62.76µ ± 5% ~ (p=0.132 n=6) BM_ReduceAddF32/512/process_time 246.5µ ± 3% 246.9µ ± 0% ~ (p=0.310 n=6) BM_ReduceAddF32/8192/process_time 3.479m ± 0% 3.462m ± 0% -0.47% (p=0.004 n=6) BM_ScatterS32_R1/262144/262144/process_time 592.3µ ± 1% 590.2µ ± 1% ~ (p=0.240 n=6) BM_ScatterS32_R2/512/512/process_time 77.23µ ± 2% 77.87µ ± 2% ~ (p=0.394 n=6) BM_ScatterS32_R3/64/64/process_time 54.41µ ± 0% 54.79µ ± 2% ~ (p=0.180 n=6) BM_SimpleScatterReduceF32_R3/d0:1/d1:64/d2:8/num_slices:1/process_time 727.3n ± 1% 720.6n ± 2% -0.92% (p=0.026 n=6) BM_SimpleScatterReduceF32_R3/d0:50/d1:64/d2:8/num_slices:10/process_time 108.0µ ± 1% 107.3µ ± 3% ~ (p=0.394 n=6) BM_SimpleScatterReduceF32_R3/d0:500/d1:64/d2:8/num_slices:100/process_time 10.38m ± 1% 10.33m ± 1% ~ (p=0.132 n=6) BM_SelectAndScatterF32/128/process_time 35.65µ ± 1% 35.72µ ± 4% ~ (p=0.699 n=6) BM_SelectAndScatterF32/256/process_time 118.7µ ± 1% 117.9µ ± 3% ~ (p=0.310 n=6) BM_SelectAndScatterF32/512/process_time 1.657m ± 5% 1.640m ± 6% ~ (p=0.240 n=6) BM_TanhF16/1024/process_time 684.4n ± 1% 716.9n ± 3% +4.75% (p=0.002 n=6) BM_TanhF16/128/process_time 435.9n ± 2% 466.8n ± 7% +7.07% (p=0.002 n=6) BM_TanhF16/256/process_time 474.4n ± 2% 504.8n ± 1% +6.41% (p=0.002 n=6) BM_TanhF16/4096/process_time 1.521µ ± 0% 1.545µ ± 1% +1.56% (p=0.002 n=6) BM_TanhF16/512/process_time 547.4n ± 1% 578.6n ± 1% +5.70% (p=0.002 n=6) BM_TanhF32/1024/process_time 688.6n ± 0% 716.3n ± 1% +4.02% (p=0.002 n=6) BM_TanhF32/128/process_time 446.9n ± 3% 466.9n ± 2% +4.48% (p=0.002 n=6) BM_TanhF32/256/process_time 476.8n ± 1% 507.2n ± 1% +6.36% (p=0.002 n=6) BM_TanhF32/4096/process_time 1.580µ ± 0% 1.616µ ± 1% 
+2.31% (p=0.002 n=6) BM_TanhF32/512/process_time 556.7n ± 1% 584.9n ± 1% +5.06% (p=0.002 n=6) BM_TanhF64/1024/process_time 12.60µ ± 2% 12.61µ ± 0% ~ (p=0.310 n=6) BM_TanhF64/128/process_time 1.944µ ± 0% 1.973µ ± 0% +1.49% (p=0.002 n=6) BM_TanhF64/256/process_time 3.463µ ± 1% 3.493µ ± 1% +0.86% (p=0.002 n=6) BM_TanhF64/4096/process_time 49.11µ ± 0% 49.10µ ± 0% ~ (p=0.394 n=6) BM_TanhF64/512/process_time 6.510µ ± 0% 6.527µ ± 0% +0.25% (p=0.002 n=6) geomean 94.44µ 92.38µ -2.19% ``` </details> PiperOrigin-RevId: 736201097
1 parent 45e1297 commit eed8c14

17 files changed

+2005
-48
lines changed
 

‎xla/backends/cpu/codegen/BUILD

+5
Original file line number | Diff line number | Diff line change
@@ -59,14 +59,19 @@ cc_library(
5959
hdrs = ["ir_compiler.h"],
6060
deps = [
6161
":polynomial_approximations",
62+
"//xla:debug_options_flags",
6263
"//xla:util",
6364
"//xla/service:hlo_module_config",
65+
"//xla/service/cpu:cpu_options",
6466
"//xla/service/llvm_ir:llvm_util",
6567
"@com_google_absl//absl/base:core_headers",
68+
"@com_google_absl//absl/base:nullability",
6669
"@com_google_absl//absl/log",
6770
"@com_google_absl//absl/log:check",
6871
"@com_google_absl//absl/status:statusor",
72+
"@com_google_absl//absl/strings",
6973
"@com_google_absl//absl/strings:str_format",
74+
"@com_google_absl//absl/strings:string_view",
7075
"@com_google_absl//absl/synchronization",
7176
"@llvm-project//llvm:Analysis",
7277
"@llvm-project//llvm:Core",
+135
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,135 @@
1+
load("//xla:xla.bzl", "xla_cc_test")
2+
load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
3+
4+
package(
5+
# copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
6+
default_visibility = ["//xla/backends/cpu:xla_backend_cpu_internal_access"],
7+
licenses = ["notice"],
8+
)
9+
10+
package_group(
11+
name = "friends",
12+
includes = [
13+
"//xla:friends",
14+
],
15+
)
16+
17+
cc_library(
18+
name = "cpu_fusion_emitter_config",
19+
hdrs = ["cpu_fusion_emitter_config.h"],
20+
)
21+
22+
cc_library(
23+
name = "cpu_fusion_emitters",
24+
srcs = [
25+
"cpu_fusion_emitter.cc",
26+
"cpu_scatter_emitter.cc",
27+
],
28+
hdrs = [
29+
"cpu_fusion_emitter.h",
30+
"cpu_scatter_emitter.h",
31+
],
32+
deps = [
33+
"//xla:shape_util",
34+
"//xla:status_macros",
35+
"//xla:util",
36+
"//xla:xla_data_proto_cc",
37+
"//xla/backends/cpu/codegen:kernel_api_ir_builder",
38+
"//xla/backends/cpu/codegen/emitters/ir:xla_cpu",
39+
"//xla/backends/cpu/codegen/emitters/transforms:passes",
40+
"//xla/codegen/emitters:computation_partitioner",
41+
"//xla/codegen/emitters:elemental_hlo_to_mlir",
42+
"//xla/codegen/emitters:type_util",
43+
"//xla/codegen/emitters/ir:xla",
44+
"//xla/codegen/emitters/transforms:passes",
45+
"//xla/hlo/analysis:indexing_analysis",
46+
"//xla/hlo/ir:hlo",
47+
"//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
48+
"//xla/mlir_hlo",
49+
"//xla/mlir_hlo:mhlo_passes",
50+
"//xla/service:buffer_assignment",
51+
"//xla/service:dump",
52+
"//xla/service:scatter_simplifier",
53+
"//xla/service/cpu:backend_config_proto_cc",
54+
"//xla/service/llvm_ir:llvm_util",
55+
"//xla/tsl/framework/mlir:status_scoped_diagnostic_handler",
56+
"//xla/tsl/platform:errors",
57+
"//xla/tsl/platform:statusor",
58+
"@com_google_absl//absl/algorithm:container",
59+
"@com_google_absl//absl/container:flat_hash_map",
60+
"@com_google_absl//absl/container:flat_hash_set",
61+
"@com_google_absl//absl/log",
62+
"@com_google_absl//absl/log:check",
63+
"@com_google_absl//absl/status",
64+
"@com_google_absl//absl/status:statusor",
65+
"@com_google_absl//absl/strings",
66+
"@com_google_absl//absl/types:span",
67+
"@llvm-project//llvm:Linker",
68+
"@llvm-project//llvm:Support",
69+
"@llvm-project//llvm:ir_headers",
70+
"@llvm-project//mlir:AffineDialect",
71+
"@llvm-project//mlir:AffineToStandard",
72+
"@llvm-project//mlir:ArithDialect",
73+
"@llvm-project//mlir:BufferizationInterfaces",
74+
"@llvm-project//mlir:BuiltinToLLVMIRTranslation",
75+
"@llvm-project//mlir:ComplexToStandard",
76+
"@llvm-project//mlir:ControlFlowDialect",
77+
"@llvm-project//mlir:DLTIDialect",
78+
"@llvm-project//mlir:DataLayoutInterfaces",
79+
"@llvm-project//mlir:FuncDialect",
80+
"@llvm-project//mlir:FuncExtensions",
81+
"@llvm-project//mlir:IR",
82+
"@llvm-project//mlir:LLVMDialect",
83+
"@llvm-project//mlir:LLVMIRTransforms",
84+
"@llvm-project//mlir:LLVMToLLVMIRTranslation",
85+
"@llvm-project//mlir:MathDialect",
86+
"@llvm-project//mlir:MemRefTransforms",
87+
"@llvm-project//mlir:NVVMToLLVMIRTranslation",
88+
"@llvm-project//mlir:Pass",
89+
"@llvm-project//mlir:ROCDLToLLVMIRTranslation",
90+
"@llvm-project//mlir:ReconcileUnrealizedCasts",
91+
"@llvm-project//mlir:SCFDialect",
92+
"@llvm-project//mlir:SCFToControlFlow",
93+
"@llvm-project//mlir:Support",
94+
"@llvm-project//mlir:TensorDialect",
95+
"@llvm-project//mlir:ToLLVMIRTranslation",
96+
"@llvm-project//mlir:Transforms",
97+
"@llvm-project//mlir:VectorDialect",
98+
],
99+
)
100+
101+
xla_cc_test(
102+
name = "cpu_fusion_emitter_test",
103+
srcs = ["cpu_fusion_emitter_test.cc"],
104+
deps = [
105+
":cpu_fusion_emitters",
106+
"//xla/hlo/analysis:hlo_ordering",
107+
"//xla/hlo/ir:hlo",
108+
"//xla/hlo/testlib:filecheck",
109+
"//xla/mlir_hlo",
110+
"//xla/service:buffer_assignment",
111+
"//xla/service:logical_buffer",
112+
"//xla/tests:hlo_test_base",
113+
"//xla/tests:xla_internal_test_main",
114+
"//xla/tsl/platform:statusor",
115+
"@com_google_absl//absl/status:statusor",
116+
"@com_google_absl//absl/strings:string_view",
117+
"@com_google_googletest//:gtest",
118+
"@llvm-project//llvm:Support",
119+
"@llvm-project//llvm:ir_headers",
120+
"@llvm-project//mlir:AffineDialect",
121+
"@llvm-project//mlir:ArithDialect",
122+
"@llvm-project//mlir:BuiltinToLLVMIRTranslation",
123+
"@llvm-project//mlir:ComplexDialect",
124+
"@llvm-project//mlir:FuncDialect",
125+
"@llvm-project//mlir:FuncExtensions",
126+
"@llvm-project//mlir:IR",
127+
"@llvm-project//mlir:LLVMToLLVMIRTranslation",
128+
"@llvm-project//mlir:MathDialect",
129+
"@llvm-project//mlir:NVVMToLLVMIRTranslation",
130+
"@llvm-project//mlir:Pass",
131+
"@llvm-project//mlir:ROCDLToLLVMIRTranslation",
132+
"@llvm-project//mlir:SCFDialect",
133+
"@llvm-project//mlir:TensorDialect",
134+
],
135+
)

0 commit comments

Comments
 (0)
Please sign in to comment.