framework,version,device,op_name,kernel_source,allreduce_dtype,num_gpus,message_size,latency
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,128,0.00301503986120224
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,256,0.0030211201310157776
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,512,0.0030303999781608583
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,1024,0.0032985600829124454
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,2048,0.003343679904937744
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,4096,0.003366400003433227
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,8192,0.0033174398541450504
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,16384,0.0033318400382995605
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,32768,0.0034569600224494935
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,65536,0.003635840117931366
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,131072,0.004099200069904328
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,262144,0.004824959933757781
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,524288,0.006517760157585144
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,1048576,0.016499520540237428
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,2097152,0.02395519971847534
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,4194304,0.038940799236297605
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,8388608,0.07837471961975098
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,16777216,0.1388748836517334
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,33554432,0.2481135940551758
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,67108864,0.46744865417480475
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,134217728,0.8914819335937499
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,268435456,1.7055130004882812
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,2,536870912,3.2240280151367187
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,128,0.0031174400448799136
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,256,0.0030780801177024844
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,512,0.003213120102882385
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,1024,0.0035590401291847227
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,2048,0.0035788801312446592
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,4096,0.0035955199599266054
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,8192,0.00356799989938736
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,16384,0.003745599985122681
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,32768,0.0039452800154685976
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,65536,0.00460671991109848
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,131072,0.005916799902915955
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,262144,0.008271039724349975
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,524288,0.013203840255737304
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,1048576,0.029001278877258303
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,2097152,0.04278111934661865
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,4194304,0.06309120178222656
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,8388608,0.0993331241607666
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,16777216,0.1737276840209961
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,33554432,0.3242652893066406
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,67108864,0.6139295959472657
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,134217728,1.184546890258789
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,268435456,2.320589141845703
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,4,536870912,4.524656066894531
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,128,0.0033849599957466127
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,256,0.0034377598762512208
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,512,0.0035452800989151005
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,1024,0.003928639888763428
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,2048,0.004588159918785095
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,4096,0.0040217599272727965
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,8192,0.004052479863166809
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,16384,0.004750080108642578
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,32768,0.005028480291366577
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,65536,0.006714879870414733
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,131072,0.010327999591827394
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,262144,0.016840640306472778
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,524288,0.03139967918395996
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,1048576,0.04001471996307373
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,2097152,0.05214752197265625
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,4194304,0.07800159931182862
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,8388608,0.1229856014251709
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,16777216,0.19817184448242187
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,33554432,0.32217792510986326
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,67108864,0.5677916717529297
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,134217728,1.05569091796875
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,268435456,2.0369427490234377
TRTLLM,1.0.0rc3,NVIDIA H200,all_reduce,CUDA_Graph,float16,8,536870912,4.004513854980469