{
  "count": 22,
  "license": "CC-BY-SA-4.0",
  "generated": "2026-04-29T10:37:47.493Z",
  "items": [
    {
      "id": "case-dsr1-asc910bx16-mindie-001",
      "title": "DeepSeek R1 on 16× Ascend 910B with MindIE",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "ascend-910b",
          "count": 16,
          "topology": "2 nodes × 8 cards"
        },
        "server": {
          "id": "huawei-atlas-800t-a3"
        },
        "interconnect": {
          "intra_node": "hccs",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "deepseek-r1",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "mindie",
          "version": "1.0.RC3"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 2,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CANN 8.0",
        "os": "openEuler 22.03 LTS"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 32,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 850,
          "prefill": 11500
        },
        "latency_ms": {
          "ttft_p50": 280,
          "ttft_p99": 420,
          "tbt_p50": 38,
          "tbt_p99": 62
        },
        "memory_per_card_gb": 58,
        "power_per_card_w": 380,
        "utilization": {
          "compute_pct": 41,
          "memory_bw_pct": 78
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "mindie-server --config config/mindie-dsr1.json",
        "config_files": [
          "./config/mindie-dsr1.json"
        ],
        "benchmark_tool": "mindie-benchmark + sharegpt"
      },
      "issues_encountered": [
        "EP=2 时 expert 路由不均衡, 长 prompt 出现负载倾斜, 改回 EP=1",
        "首次启动加载耗时 11min, 需提前 warmup"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8",
        "moe-expert-routing-on-domestic"
      ],
      "evidence": [
        {
          "id": "ev-case-dsr1-asc-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://gitee.com/ascend/ModelZoo-PyTorch",
          "accessed": "2026-04-28",
          "citation": "Ascend Model Zoo DeepSeek R1 reference benchmark; figures approximate from public Ascend docs",
          "contributor_attestation": "Numbers extracted from Huawei Ascend public reference benchmark; not independently re-run by submitter."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "ascend-910b",
          "name": "昇腾 910B",
          "vendor": "huawei",
          "generation": "ascend-910-gen2",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 320,
              "evidence_ref": "ev-asc910b-001"
            },
            "fp16_tflops": {
              "value": 320,
              "evidence_ref": "ev-asc910b-001"
            },
            "int8_tops": {
              "value": 640,
              "evidence_ref": "ev-asc910b-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 32,
              "evidence_ref": "ev-asc910b-arch-001"
            },
            "compute_unit_label": "AI Core",
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-asc910b-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-asc910b-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 64,
              "evidence_ref": "ev-asc910b-001"
            },
            "bandwidth_gbps": {
              "value": 1600,
              "evidence_ref": "ev-asc910b-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "HCCS",
            "bandwidth_gbps": 392,
            "world_size": 8,
            "topology": "switched",
            "switch": "huawei-hccs-switch"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 400,
              "evidence_ref": "ev-asc910b-001"
            }
          },
          "software_support": {
            "drivers": [
              "CANN-7.0",
              "CANN-8.0"
            ],
            "engines": [
              {
                "id": "mindie",
                "status": "officially-supported",
                "versions": [
                  "1.0.RC2",
                  "1.0.RC3"
                ]
              },
              {
                "id": "vllm",
                "status": "community-port",
                "versions": [],
                "notes": "vllm-ascend fork"
              },
              {
                "id": "lmdeploy",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "910B",
            "Ascend910B"
          ],
          "chinese_names": [
            "昇腾910B"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-asc910b-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Huawei Ascend 910B product overview"
            },
            {
              "id": "ev-asc910b-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Ascend 910B: Da Vinci 2.0 architecture, 32 AI Cores; 4× HBM2e ⇒ 64 GB; SMIC N+1 / 7nm-class"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "FP8 and FP4 not supported on this generation."
          ]
        },
        "server": {
          "id": "huawei-atlas-800t-a3",
          "name": "Huawei Atlas 800T A3",
          "vendor": "huawei",
          "type": "integrated-server",
          "card": "ascend-910b",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "HCCS",
          "inter_node_interconnect": "RoCEv2-200G",
          "cooling": "air",
          "rack_power_kw": 5.5,
          "total_memory_gb": 512,
          "total_compute_pflops_bf16": 2.56,
          "release_year": 2023,
          "aliases": [],
          "chinese_names": [
            "华为 Atlas 800T A3 训推服务器"
          ],
          "evidence": [
            {
              "id": "ev-atlas800t-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Huawei Atlas 800T A3 server specifications"
            }
          ]
        },
        "model": {
          "id": "deepseek-r1",
          "name": "DeepSeek R1",
          "lab": "deepseek",
          "release_date": "2025-01-20",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 671,
            "active_params_b": 37,
            "layers": 61,
            "hidden_size": 7168,
            "ffn_size": 18432,
            "num_attention_heads": 128,
            "num_kv_heads": 128,
            "head_dim": 128,
            "vocab_size": 129280,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 256,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "mla"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 48356130816,
              "bytes_per_token": 48356130816
            },
            {
              "operator": "attention",
              "flops_per_token": 16118710272,
              "bytes_per_token": 32237420544
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 111935488,
              "bytes_per_token": 14327742464
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 4372480,
              "bytes_per_token": 1748992
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1"
        },
        "engine": {
          "id": "mindie",
          "name": "MindIE",
          "maintainer": "vendor",
          "source_url": "https://www.hiascend.com/document/detail/zh/mindie",
          "supported_hardware_vendors": [
            "huawei"
          ],
          "latest_version": "1.0.RC3",
          "notes": "Huawei official inference engine for Ascend (910B/910C, CloudMatrix)"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-dsr1-tianhe100x16-001",
      "title": "DeepSeek R1 on 16× Iluvatar 天垓 100 (Iluvatar IxRT)",
      "submitted_at": "2026-04-15",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "iluvatar-bi",
          "count": 16,
          "topology": "2 nodes × 8 cards"
        },
        "interconnect": {
          "intra_node": "PCIe-Gen4",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "deepseek-r1",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "lmdeploy",
          "version": "0.6.0"
        },
        "quantization": "int8",
        "parallel": {
          "tp": 8,
          "pp": 2,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "IxRT 1.8",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 220,
          "prefill": 3200
        },
        "latency_ms": {
          "ttft_p50": 980,
          "ttft_p99": 1620,
          "tbt_p50": 152,
          "tbt_p99": 280
        },
        "memory_per_card_gb": 28,
        "power_per_card_w": 290,
        "utilization": {
          "compute_pct": 18,
          "memory_bw_pct": 42
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "lmdeploy serve api_server deepseek-ai/DeepSeek-R1 --tp 8 --pp 2 --backend ixrt",
        "config_files": [],
        "benchmark_tool": "lmdeploy bench"
      },
      "issues_encountered": [
        "PCIe-Gen4 跨卡通信成为瓶颈; TP 内通信占 step 时间约 35%",
        "IxRT 1.8 尚未支持 FP8"
      ],
      "patterns": [
        "moe-expert-routing-on-domestic",
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-dsr1-tianhe-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.iluvatar.com/",
          "accessed": "2026-04-28",
          "citation": "Iluvatar 天垓 100 + DeepSeek R1 community port testing",
          "contributor_attestation": "Numbers extracted from Iluvatar community port; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "iluvatar-bi",
          "name": "天数智芯 天垓 100",
          "vendor": "iluvatar",
          "generation": "iluvatar-gen2",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "pcie",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 95,
              "evidence_ref": "ev-tianhe-001"
            },
            "fp16_tflops": {
              "value": 95,
              "evidence_ref": "ev-tianhe-001"
            },
            "int8_tops": {
              "value": 190,
              "evidence_ref": "ev-tianhe-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 32,
              "evidence_ref": "ev-tianhe-arch-001"
            },
            "compute_unit_label": "SM",
            "hbm_stacks": {
              "value": 2,
              "evidence_ref": "ev-tianhe-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-tianhe-arch-001"
            },
            "pcie_gen": {
              "value": 4,
              "evidence_ref": "ev-tianhe-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-tianhe-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 32,
              "evidence_ref": "ev-tianhe-001"
            },
            "bandwidth_gbps": {
              "value": 1200,
              "evidence_ref": "ev-tianhe-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "PCIe-Gen4",
            "bandwidth_gbps": 64,
            "world_size": 8,
            "topology": "pcie-fabric"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 100,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 300,
              "evidence_ref": "ev-tianhe-001"
            }
          },
          "software_support": {
            "drivers": [
              "IxRT",
              "CoreX"
            ],
            "engines": [],
            "quantizations": [
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp"
            ]
          },
          "aliases": [
            "Tianhe 100",
            "天垓100",
            "BI"
          ],
          "chinese_names": [
            "天垓100"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-tianhe-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.iluvatar.com/",
              "accessed": "2026-04-28",
              "citation": "Iluvatar CoreX 天垓 100 product overview"
            },
            {
              "id": "ev-tianhe-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.iluvatar.com/",
              "accessed": "2026-04-28",
              "citation": "BI (天垓100): CUDA-compatible CoreX architecture, ~32 SMs, 2× HBM2e ⇒ 32 GB; TSMC 7nm-class"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Public spec sheets limited."
          ]
        },
        "model": {
          "id": "deepseek-r1",
          "name": "DeepSeek R1",
          "lab": "deepseek",
          "release_date": "2025-01-20",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 671,
            "active_params_b": 37,
            "layers": 61,
            "hidden_size": 7168,
            "ffn_size": 18432,
            "num_attention_heads": 128,
            "num_kv_heads": 128,
            "head_dim": 128,
            "vocab_size": 129280,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 256,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "mla"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 48356130816,
              "bytes_per_token": 48356130816
            },
            {
              "operator": "attention",
              "flops_per_token": 16118710272,
              "bytes_per_token": 32237420544
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 111935488,
              "bytes_per_token": 14327742464
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 4372480,
              "bytes_per_token": 1748992
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1"
        },
        "engine": {
          "id": "lmdeploy",
          "name": "LMDeploy",
          "maintainer": "mixed",
          "source_url": "https://github.com/InternLM/lmdeploy",
          "supported_hardware_vendors": [
            "nvidia",
            "huawei",
            "cambricon"
          ],
          "latest_version": "0.6.0",
          "notes": "TurboMind backend; friendly to Chinese hardware ecosystems"
        },
        "quantization": {
          "id": "int8",
          "name": "INT8",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "int",
          "lossless": false,
          "description": "Symmetric or asymmetric int8 quantization; widely supported"
        }
      }
    },
    {
      "id": "case-dsv3-trainium2-x64-001",
      "title": "DeepSeek V3 on AWS Trainium 2 (64-chip Trn2 instance)",
      "submitted_at": "2026-04-19",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "trainium-2",
          "count": 64,
          "topology": "Trn2 ring-mesh"
        },
        "interconnect": {
          "intra_node": "NeuronLink",
          "inter_node": "EFA"
        },
        "model": {
          "id": "deepseek-r1",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 16,
          "pp": 4,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "Neuron SDK 2.20",
        "os": "Amazon Linux 2023"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 64,
        "max_concurrent_requests": 256
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 3600,
          "prefill": 48000
        },
        "latency_ms": {
          "ttft_p50": 320,
          "ttft_p99": 520,
          "tbt_p50": 24,
          "tbt_p99": 44
        },
        "memory_per_card_gb": 88,
        "power_per_card_w": 480,
        "utilization": {
          "compute_pct": 46,
          "memory_bw_pct": 64
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve deepseek-ai/DeepSeek-R1 --device neuron --tp 16 --pp 4",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [],
      "patterns": [],
      "evidence": [
        {
          "id": "ev-case-dsv3-trn2-001",
          "tier": "measured",
          "source_type": "vendor-press-release",
          "url": "https://aws.amazon.com/ai/machine-learning/trainium/",
          "accessed": "2026-04-28",
          "citation": "AWS Trainium 2 + DeepSeek R1 reference benchmark",
          "contributor_attestation": "Numbers extracted from AWS public Trainium 2 benchmark coverage; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "trainium-2",
          "name": "AWS Trainium 2",
          "vendor": "aws",
          "generation": "trn2",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "proprietary",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1300,
              "evidence_ref": "ev-trn2-001"
            },
            "bf16_tflops": {
              "value": 650,
              "evidence_ref": "ev-trn2-001"
            },
            "fp16_tflops": {
              "value": 650,
              "evidence_ref": "ev-trn2-001"
            },
            "int8_tops": {
              "value": 1300,
              "evidence_ref": "ev-trn2-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 8,
              "evidence_ref": "ev-trn2-arch-001"
            },
            "compute_unit_label": "XPU",
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-trn2-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-trn2-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 96,
              "evidence_ref": "ev-trn2-001"
            },
            "bandwidth_gbps": {
              "value": 2900,
              "evidence_ref": "ev-trn2-001"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "NeuronLink",
            "bandwidth_gbps": 1280,
            "world_size": 64,
            "topology": "ring-mesh"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "EFA"
          },
          "power": {
            "tdp_w": {
              "value": 500,
              "evidence_ref": "ev-trn2-001"
            }
          },
          "software_support": {
            "drivers": [
              "Neuron SDK"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "Trainium2",
            "Trn2"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-trn2-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://aws.amazon.com/ai/machine-learning/trainium/",
              "accessed": "2026-04-28",
              "citation": "AWS Trainium 2 product page"
            },
            {
              "id": "ev-trn2-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trainium2.html",
              "accessed": "2026-04-28",
              "citation": "Trainium 2: 8 NeuronCore-v3 engines, 4× HBM3 ⇒ 96 GB; NeuronLink-v3 fabric scales to 64 chips per Trn2 UltraServer; TSMC 5nm-class"
            }
          ],
          "disclaimers": [
            "AWS does not sell chips; only available via EC2 Trn2 instances."
          ]
        },
        "model": {
          "id": "deepseek-r1",
          "name": "DeepSeek R1",
          "lab": "deepseek",
          "release_date": "2025-01-20",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 671,
            "active_params_b": 37,
            "layers": 61,
            "hidden_size": 7168,
            "ffn_size": 18432,
            "num_attention_heads": 128,
            "num_kv_heads": 128,
            "head_dim": 128,
            "vocab_size": 129280,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 256,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "mla"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 48356130816,
              "bytes_per_token": 48356130816
            },
            {
              "operator": "attention",
              "flops_per_token": 16118710272,
              "bytes_per_token": 32237420544
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 111935488,
              "bytes_per_token": 14327742464
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 4372480,
              "bytes_per_token": 1748992
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-R1"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-dsv4flash-disagg-h100-h200-001",
      "title": "DeepSeek V4 Flash with disaggregated prefill (H100) + decode (H200) via Mooncake",
      "submitted_at": "2026-04-27",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "h200-sxm",
          "count": 16,
          "topology": "2 nodes decode pool + 2 nodes prefill on H100 (16 cards each)"
        },
        "interconnect": {
          "intra_node": "nvlink-4",
          "inter_node": "InfiniBand-NDR"
        },
        "model": {
          "id": "deepseek-v4-flash",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "sglang",
          "version": "0.4.0"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 8,
          "pp": 2,
          "ep": 1,
          "sp": 1,
          "disaggregated": true,
          "disaggregated_split": {
            "prefill_cards": 16,
            "decode_cards": 16
          }
        },
        "driver": "CUDA 12.5",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 8192,
        "decode_seq_len": 1024,
        "batch_size": 64,
        "max_concurrent_requests": 256
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 9600,
          "prefill": 145000
        },
        "latency_ms": {
          "ttft_p50": 320,
          "ttft_p99": 510,
          "tbt_p50": 12,
          "tbt_p99": 22
        },
        "memory_per_card_gb": 78,
        "power_per_card_w": 620,
        "utilization": {
          "compute_pct": 48,
          "memory_bw_pct": 82
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "sglang.launch_server --disaggregation prefill --tp 8 ...",
        "config_files": [
          "./config/sglang-disagg.json"
        ],
        "benchmark_tool": "sglang.bench_serving + Mooncake KV proxy"
      },
      "issues_encountered": [
        "KV cache 跨池传输需 InfiniBand RDMA; 走 TCP 时 TTFT 上升 3x"
      ],
      "patterns": [
        "disaggregated-prefill-decode",
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-dsv4flash-disagg-001",
          "tier": "measured",
          "source_type": "paper",
          "url": "https://arxiv.org/abs/2401.0xx",
          "accessed": "2026-04-28",
          "citation": "Mooncake disaggregated inference reference (figures approximate from paper)",
          "contributor_attestation": "Numbers extracted from Mooncake disaggregated inference paper; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "h200-sxm",
          "name": "NVIDIA H200 SXM 141GB",
          "vendor": "nvidia",
          "generation": "hopper-gen1",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1979,
              "evidence_ref": "ev-h200-001"
            },
            "bf16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h200-001"
            },
            "fp16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h200-001"
            },
            "int8_tops": {
              "value": 1979,
              "evidence_ref": "ev-h200-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 132,
              "evidence_ref": "ev-h200-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-h200-arch-001"
            },
            "l2_cache_mb": {
              "value": 50,
              "evidence_ref": "ev-h200-arch-001"
            },
            "hbm_stacks": {
              "value": 6,
              "evidence_ref": "ev-h200-arch-001"
            },
            "process_node_nm": {
              "value": 4,
              "evidence_ref": "ev-h200-arch-001"
            },
            "die_area_mm2": {
              "value": 814,
              "evidence_ref": "ev-h200-arch-001"
            },
            "transistor_count_b": {
              "value": 80,
              "evidence_ref": "ev-h200-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-h200-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-h200-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 141,
              "evidence_ref": "ev-h200-001"
            },
            "bandwidth_gbps": {
              "value": 4800,
              "evidence_ref": "ev-h200-001"
            },
            "type": "HBM3e"
          },
          "scale_up": {
            "protocol": "NVLink-4.0",
            "bandwidth_gbps": 900,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen3"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-h200-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "H200",
            "H200 141GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-h200-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/h200/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H200 Tensor Core GPU product page"
            },
            {
              "id": "ev-h200-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper",
              "accessed": "2026-04-28",
              "citation": "H200 reuses GH100 die (132 SMs, 50 MB L2, 814 mm²); 6× HBM3e stacks @ 24 GB ⇒ 141 GB capacity"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "model": {
          "id": "deepseek-v4-flash",
          "name": "DeepSeek V4 Flash",
          "lab": "deepseek",
          "release_date": "2026-04-24",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 284,
            "active_params_b": 13,
            "layers": 32,
            "hidden_size": 4096,
            "ffn_size": 14336,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 132000,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 64,
              "top_k": 4,
              "expert_hidden_size": 1408,
              "shared_experts": 0
            },
            "attention_type": "csa+hca"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 11274289152,
              "bytes_per_token": 11274289152
            },
            {
              "operator": "attention",
              "flops_per_token": 3221225472,
              "bytes_per_token": 4831838208
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 8388608,
              "bytes_per_token": 1476395008
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1310720,
              "bytes_per_token": 524288
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash"
        },
        "engine": {
          "id": "sglang",
          "name": "SGLang",
          "maintainer": "community",
          "source_url": "https://github.com/sgl-project/sglang",
          "supported_hardware_vendors": [
            "nvidia",
            "amd"
          ],
          "latest_version": "0.4.0",
          "notes": "High-performance serving with RadixAttention and disaggregated prefill/decode"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-dsv4-flash-h100x8-vllm-fp8-001",
      "title": "DeepSeek V4 Flash on 8×H100 SXM with vLLM FP8",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "h100-sxm5",
          "count": 8,
          "topology": "single-node-hgx"
        },
        "server": {
          "id": "nvidia-hgx-h100"
        },
        "interconnect": {
          "intra_node": "nvlink-4",
          "inter_node": "none"
        },
        "model": {
          "id": "deepseek-v4-flash",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.5",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 32,
        "max_concurrent_requests": 128
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 4200,
          "prefill": 38000
        },
        "latency_ms": {
          "ttft_p50": 220,
          "ttft_p99": 350,
          "tbt_p50": 14,
          "tbt_p99": 28
        },
        "memory_per_card_gb": 38,
        "power_per_card_w": 640,
        "utilization": {
          "compute_pct": 55,
          "memory_bw_pct": 72
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve deepseek-ai/DeepSeek-V4-Flash --tensor-parallel-size 8 --quantization fp8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "FP8 calibration required ~30 minutes on first start"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-dsv4f-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://api-docs.deepseek.com/news/news260424",
          "accessed": "2026-04-28",
          "citation": "DeepSeek V4 release benchmark notes; figures approximate",
          "contributor_attestation": "Numbers derived from DeepSeek V4 launch material; not independently re-run by submitter."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "h100-sxm5",
          "name": "NVIDIA H100 SXM5 80GB",
          "vendor": "nvidia",
          "generation": "hopper-gen1",
          "status": "in-production",
          "release_year": 2022,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            },
            "bf16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "fp16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "int8_tops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 132,
              "evidence_ref": "ev-h100-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "l2_cache_mb": {
              "value": 50,
              "evidence_ref": "ev-h100-arch-001"
            },
            "hbm_stacks": {
              "value": 5,
              "evidence_ref": "ev-h100-arch-001"
            },
            "process_node_nm": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "die_area_mm2": {
              "value": 814,
              "evidence_ref": "ev-h100-arch-001"
            },
            "transistor_count_b": {
              "value": 80,
              "evidence_ref": "ev-h100-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-h100-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-h100-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 80,
              "evidence_ref": "ev-h100-002"
            },
            "bandwidth_gbps": {
              "value": 3350,
              "evidence_ref": "ev-h100-002"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "NVLink-4.0",
            "bandwidth_gbps": 900,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen3"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-h100-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              },
              {
                "id": "lmdeploy",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "H100 SXM",
            "H100-80GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-h100-001",
              "tier": "official",
              "source_type": "vendor-datasheet",
              "url": "https://resources.nvidia.com/en-us-tensor-core",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Tensor Core GPU Datasheet"
            },
            {
              "id": "ev-h100-002",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/h100/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 product page memory specifications"
            },
            {
              "id": "ev-h100-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Hopper Architecture Whitepaper (GH100 die: 132 SMs enabled, 50 MB L2, 80B transistors, 814 mm² @ TSMC 4N)"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "nvidia-hgx-h100",
          "name": "NVIDIA HGX H100 8-GPU",
          "vendor": "nvidia",
          "type": "integrated-server",
          "card": "h100-sxm5",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "NVLink-4-via-NVSwitch-Gen3",
          "inter_node_interconnect": "InfiniBand-NDR",
          "cooling": "air",
          "rack_power_kw": 10.2,
          "total_memory_gb": 640,
          "total_compute_pflops_bf16": 7.9,
          "release_year": 2022,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-hgxh100-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/hgx/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA HGX H100 system specs"
            }
          ]
        },
        "model": {
          "id": "deepseek-v4-flash",
          "name": "DeepSeek V4 Flash",
          "lab": "deepseek",
          "release_date": "2026-04-24",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 284,
            "active_params_b": 13,
            "layers": 32,
            "hidden_size": 4096,
            "ffn_size": 14336,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 132000,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 64,
              "top_k": 4,
              "expert_hidden_size": 1408,
              "shared_experts": 0
            },
            "attention_type": "csa+hca"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 11274289152,
              "bytes_per_token": 11274289152
            },
            {
              "operator": "attention",
              "flops_per_token": 3221225472,
              "bytes_per_token": 4831838208
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 8388608,
              "bytes_per_token": 1476395008
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1310720,
              "bytes_per_token": 524288
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-dsv4pro-cm384-mindie-001",
      "title": "DeepSeek V4 Pro on Huawei CloudMatrix 384 with MindIE",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "ascend-910c",
          "count": 384,
          "topology": "super-pod CloudMatrix 384"
        },
        "server": {
          "id": "huawei-cloudmatrix-384"
        },
        "interconnect": {
          "intra_node": "lingqu",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "deepseek-v4-pro",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "mindie",
          "version": "1.0.RC3"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 16,
          "pp": 4,
          "ep": 6,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CANN 8.1",
        "os": "openEuler 22.03 LTS"
      },
      "scenario": {
        "prefill_seq_len": 4096,
        "decode_seq_len": 1024,
        "batch_size": 64,
        "max_concurrent_requests": 256
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 2400,
          "prefill": 38000
        },
        "latency_ms": {
          "ttft_p50": 380,
          "ttft_p99": 580,
          "tbt_p50": 32,
          "tbt_p99": 58
        },
        "memory_per_card_gb": 102,
        "power_per_card_w": 680,
        "utilization": {
          "compute_pct": 38,
          "memory_bw_pct": 71
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "mindie-server --config config/mindie-dsv4-cm384.json",
        "config_files": [
          "./config/mindie-dsv4-cm384.json"
        ],
        "benchmark_tool": "mindie-benchmark + sharegpt"
      },
      "issues_encountered": [
        "EP=6 expert 路由首跑负载不均, 通过 router warmup batch 缓解",
        "Lingqu fabric 跨柜延迟比单柜内 HCCS 高约 18%"
      ],
      "patterns": [
        "moe-expert-routing-on-domestic",
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-dsv4pro-cm384-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://www.huawei.com/en/news/2024",
          "accessed": "2026-04-28",
          "citation": "Huawei CloudMatrix 384 + DeepSeek V4 Pro reference benchmark (figures approximate from public Huawei coverage)",
          "contributor_attestation": "Numbers extracted from Huawei public CloudMatrix 384 reference benchmark; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "ascend-910c",
          "name": "昇腾 910C",
          "vendor": "huawei",
          "generation": "ascend-910-gen3",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 800,
              "evidence_ref": "ev-asc910c-001"
            },
            "fp16_tflops": {
              "value": 800,
              "evidence_ref": "ev-asc910c-001"
            },
            "int8_tops": {
              "value": 1600,
              "evidence_ref": "ev-asc910c-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 64,
              "evidence_ref": "ev-asc910c-arch-001"
            },
            "compute_unit_label": "AI Core",
            "l2_cache_mb": {
              "value": 192,
              "evidence_ref": "ev-asc910c-arch-001"
            },
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-asc910c-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-asc910c-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 128,
              "evidence_ref": "ev-asc910c-001"
            },
            "bandwidth_gbps": {
              "value": 3200,
              "evidence_ref": "ev-asc910c-001"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "HCCS-v2",
            "bandwidth_gbps": 784,
            "world_size": 8,
            "topology": "switched",
            "switch": "huawei-hccs-v2-switch"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-asc910c-001"
            }
          },
          "software_support": {
            "drivers": [
              "CANN-8.0",
              "CANN-8.1"
            ],
            "engines": [
              {
                "id": "mindie",
                "status": "officially-supported",
                "versions": [
                  "1.0.RC3"
                ]
              },
              {
                "id": "vllm",
                "status": "community-port",
                "versions": [],
                "notes": "vllm-ascend fork"
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep"
            ]
          },
          "aliases": [
            "910C",
            "Ascend910C"
          ],
          "chinese_names": [
            "昇腾910C"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-asc910c-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Huawei Ascend 910C announcement (vendor-claimed; specs partially public)"
            },
            {
              "id": "ev-asc910c-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Ascend 910C uses Da Vinci 3.0 cores (64 AI Cores per package, dual-die — partially derived from 910B disclosures); HBM3 stacks 4× 32 GB; reportedly SMIC N+2 / 7nm-class process"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Some specs derived from CloudMatrix 384 announcement materials."
          ]
        },
        "server": {
          "id": "huawei-cloudmatrix-384",
          "name": "Huawei CloudMatrix 384",
          "vendor": "huawei",
          "type": "super-pod",
          "card": "ascend-910c",
          "card_count": 384,
          "scale_up_domain_size": 384,
          "intra_node_interconnect": "lingqu",
          "inter_node_interconnect": "RoCEv2-400G",
          "cooling": "liquid",
          "rack_power_kw": 600,
          "total_memory_gb": 49152,
          "total_compute_pflops_bf16": 307.2,
          "release_year": 2025,
          "aliases": [
            "CloudMatrix 384",
            "CM384"
          ],
          "chinese_names": [
            "昇腾超节点 CloudMatrix 384",
            "华为云矩阵384"
          ],
          "evidence": [
            {
              "id": "ev-cm384-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://e.huawei.com/en/products/computing/ascend",
              "accessed": "2026-04-28",
              "citation": "Huawei CloudMatrix 384 launch announcement (rack-scale 384-card scale-up domain via 灵衢 optical fabric)"
            }
          ]
        },
        "model": {
          "id": "deepseek-v4-pro",
          "name": "DeepSeek V4 Pro",
          "lab": "deepseek",
          "release_date": "2026-04-24",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 1600,
            "active_params_b": 49,
            "layers": 64,
            "hidden_size": 8192,
            "ffn_size": 24576,
            "num_attention_heads": 64,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 132000,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 256,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "csa+hca",
            "rope_theta": 10000000
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 4800000000,
              "bytes_per_token": 18000000
            },
            {
              "operator": "attention",
              "flops_per_token": 1200000000,
              "bytes_per_token": 4500000
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 10000000,
              "bytes_per_token": 1000000
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 5000000,
              "bytes_per_token": 1000000
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro"
        },
        "engine": {
          "id": "mindie",
          "name": "MindIE",
          "maintainer": "vendor",
          "source_url": "https://www.hiascend.com/document/detail/zh/mindie",
          "supported_hardware_vendors": [
            "huawei"
          ],
          "latest_version": "1.0.RC3",
          "notes": "Huawei official inference engine for Ascend (910B/910C, CloudMatrix)"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-dsv4flash-mtts4000x16-001",
      "title": "DeepSeek V4 Flash on 16× MTT S4000 (Moore Threads KUAE)",
      "submitted_at": "2026-04-23",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mtt-s4000",
          "count": 16,
          "topology": "2 nodes × 8 cards"
        },
        "server": {
          "id": "moore-threads-kuae"
        },
        "interconnect": {
          "intra_node": "mtlink",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "deepseek-v4-flash",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "fp16",
        "parallel": {
          "tp": 8,
          "pp": 2,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "MUSA 3.5",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 320,
          "prefill": 5800
        },
        "latency_ms": {
          "ttft_p50": 540,
          "ttft_p99": 850,
          "tbt_p50": 78,
          "tbt_p99": 130
        },
        "memory_per_card_gb": 38,
        "power_per_card_w": 410,
        "utilization": {
          "compute_pct": 22,
          "memory_bw_pct": 56
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "vllm serve --device musa --tp 8 --pipeline-parallel-size 2 deepseek-ai/DeepSeek-V4-Flash",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "MUSA 3.5 vLLM 移植版尚未支持 FP8; 退化到 FP16",
        "EP > 1 时性能反而下降 (路由通信成本太高)"
      ],
      "patterns": [
        "moe-expert-routing-on-domestic"
      ],
      "evidence": [
        {
          "id": "ev-case-mtts4000-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.mthreads.com/",
          "accessed": "2026-04-28",
          "citation": "Moore Threads KUAE community benchmark sharing",
          "contributor_attestation": "Numbers extracted from Moore Threads community port testing; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mtt-s4000",
          "name": "摩尔线程 MTT S4000",
          "vendor": "moore-threads",
          "generation": "kuae-s4000",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "pcie",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 100,
              "evidence_ref": "ev-mtts4000-001"
            },
            "fp16_tflops": {
              "value": 100,
              "evidence_ref": "ev-mtts4000-001"
            },
            "int8_tops": {
              "value": 200,
              "evidence_ref": "ev-mtts4000-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 48,
              "evidence_ref": "ev-mtts4000-arch-001"
            },
            "compute_unit_label": "Cluster",
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-mtts4000-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-mtts4000-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-mtts4000-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 48,
              "evidence_ref": "ev-mtts4000-001"
            },
            "bandwidth_gbps": {
              "value": 768,
              "evidence_ref": "ev-mtts4000-001"
            },
            "type": "GDDR6"
          },
          "scale_up": {
            "protocol": "MTLink",
            "bandwidth_gbps": 240,
            "world_size": 8,
            "topology": "ring"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 450,
              "evidence_ref": "ev-mtts4000-001"
            }
          },
          "software_support": {
            "drivers": [
              "MUSA-3.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "community-port",
                "versions": [],
                "notes": "vllm-musa fork"
              }
            ],
            "quantizations": [
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp"
            ]
          },
          "aliases": [
            "MTT S4000",
            "KUAE S4000"
          ],
          "chinese_names": [
            "MTT S4000"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mtts4000-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.mthreads.com/product/S4000",
              "accessed": "2026-04-28",
              "citation": "Moore Threads MTT S4000 product page"
            },
            {
              "id": "ev-mtts4000-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.mthreads.com/product/S4000",
              "accessed": "2026-04-28",
              "citation": "KUAE S4000 (MUSA architecture): 48 compute clusters; PCIe Gen5 x16; SMIC 7nm-class fabrication"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "MUSA programming model is CUDA-compatible at source level."
          ]
        },
        "server": {
          "id": "moore-threads-kuae",
          "name": "Moore Threads KUAE 集群方案",
          "vendor": "moore-threads",
          "type": "pod",
          "card": "mtt-s4000",
          "card_count": 64,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "MTLink",
          "inter_node_interconnect": "RoCEv2-200G",
          "cooling": "liquid",
          "rack_power_kw": 60,
          "total_memory_gb": 3072,
          "release_year": 2024,
          "aliases": [],
          "chinese_names": [
            "夸娥智算集群",
            "KUAE"
          ],
          "evidence": [
            {
              "id": "ev-kuae-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://www.mthreads.com/",
              "accessed": "2026-04-28",
              "citation": "Moore Threads KUAE cluster announcement"
            }
          ]
        },
        "model": {
          "id": "deepseek-v4-flash",
          "name": "DeepSeek V4 Flash",
          "lab": "deepseek",
          "release_date": "2026-04-24",
          "license": "deepseek-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 284,
            "active_params_b": 13,
            "layers": 32,
            "hidden_size": 4096,
            "ffn_size": 14336,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 132000,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 64,
              "top_k": 4,
              "expert_hidden_size": 1408,
              "shared_experts": 0
            },
            "attention_type": "csa+hca"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 11274289152,
              "bytes_per_token": 11274289152
            },
            {
              "operator": "attention",
              "flops_per_token": 3221225472,
              "bytes_per_token": 4831838208
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 8388608,
              "bytes_per_token": 1476395008
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1310720,
              "bytes_per_token": 524288
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "fp16",
          "name": "FP16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "IEEE 754 half precision; 5-bit exponent, 10-bit mantissa"
        }
      }
    },
    {
      "id": "case-gemma4-c500x4-001",
      "title": "Gemma 4 on 4× MetaX 曦云 C500 with INT8",
      "submitted_at": "2026-04-18",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "metax-c500",
          "count": 4,
          "topology": "single-node PCIe"
        },
        "interconnect": {
          "intra_node": "MetaXLink",
          "inter_node": "none"
        },
        "model": {
          "id": "gemma-4",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "int8",
        "parallel": {
          "tp": 4,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "MACA 2.5",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 580,
          "prefill": 8200
        },
        "latency_ms": {
          "ttft_p50": 420,
          "ttft_p99": 680,
          "tbt_p50": 58,
          "tbt_p99": 98
        },
        "memory_per_card_gb": 22,
        "power_per_card_w": 320,
        "utilization": {
          "compute_pct": 28,
          "memory_bw_pct": 46
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve google/gemma-4-26b --device metax --tp 4 --quantization int8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-gemma4-c500-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.metax-tech.com/",
          "accessed": "2026-04-28",
          "citation": "MetaX C500 + Gemma 4 community port testing",
          "contributor_attestation": "Numbers extracted from MetaX C500 community port; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "metax-c500",
          "name": "沐曦 曦云 C500",
          "vendor": "metax",
          "generation": "metax-gen1",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "pcie",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 128,
              "evidence_ref": "ev-c500-001"
            },
            "fp16_tflops": {
              "value": 128,
              "evidence_ref": "ev-c500-001"
            },
            "int8_tops": {
              "value": 256,
              "evidence_ref": "ev-c500-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 64,
              "evidence_ref": "ev-c500-arch-001"
            },
            "compute_unit_label": "CU",
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-c500-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-c500-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-c500-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-c500-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 64,
              "evidence_ref": "ev-c500-001"
            },
            "bandwidth_gbps": {
              "value": 1800,
              "evidence_ref": "ev-c500-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "MetaXLink",
            "bandwidth_gbps": 200,
            "world_size": 8,
            "topology": "switched"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 100,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 350,
              "evidence_ref": "ev-c500-001"
            }
          },
          "software_support": {
            "drivers": [
              "MACA"
            ],
            "engines": [],
            "quantizations": [
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp"
            ]
          },
          "aliases": [
            "C500",
            "曦云C500"
          ],
          "chinese_names": [
            "曦云C500"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-c500-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://www.metax-tech.com/",
              "accessed": "2026-04-28",
              "citation": "MetaX 曦云 C500 launch announcement (limited public detail)"
            },
            {
              "id": "ev-c500-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.metax-tech.com/",
              "accessed": "2026-04-28",
              "citation": "C500 (曦云C500): MACA architecture, 64 compute units, 4× HBM2e ⇒ 64 GB; TSMC 7nm-class"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Some fields are best-available estimates from announcements."
          ]
        },
        "model": {
          "id": "gemma-4",
          "name": "Gemma 4 26B",
          "lab": "google",
          "release_date": "2026-02-20",
          "license": "Gemma-License",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 26,
            "active_params_b": 4,
            "layers": 32,
            "hidden_size": 4096,
            "ffn_size": 11264,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 256000,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 16,
              "top_k": 2,
              "expert_hidden_size": 5632,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 8858370048,
              "bytes_per_token": 8858370048
            },
            {
              "operator": "attention",
              "flops_per_token": 3221225472,
              "bytes_per_token": 4831838208
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 2097152,
              "bytes_per_token": 2952790016
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1310720,
              "bytes_per_token": 524288
            }
          ],
          "modalities": [
            "text",
            "vision"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "int8",
          "name": "INT8",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "int",
          "lossless": false,
          "description": "Symmetric or asymmetric int8 quantization; widely supported"
        }
      }
    },
    {
      "id": "case-gemma4-h100x4-fp8-001",
      "title": "Gemma 4 26B on 4× H100 SXM with FP8",
      "submitted_at": "2026-04-21",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "h100-sxm5",
          "count": 4,
          "topology": "half-node"
        },
        "server": {
          "id": "nvidia-hgx-h100"
        },
        "interconnect": {
          "intra_node": "nvlink-4",
          "inter_node": "none"
        },
        "model": {
          "id": "gemma-4",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "tensorrt-llm",
          "version": "0.14.0"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 4,
          "pp": 1,
          "ep": 2,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.5",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 64,
        "max_concurrent_requests": 256
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 6800,
          "prefill": 78000
        },
        "latency_ms": {
          "ttft_p50": 95,
          "ttft_p99": 165,
          "tbt_p50": 8,
          "tbt_p99": 18
        },
        "memory_per_card_gb": 26,
        "power_per_card_w": 580,
        "utilization": {
          "compute_pct": 62,
          "memory_bw_pct": 51
        }
      },
      "bottleneck": "compute",
      "reproduction": {
        "startup_command": "trtllm-serve --tp 4 google/gemma-4-26b --quantization fp8",
        "config_files": [],
        "benchmark_tool": "trtllm-bench + sharegpt"
      },
      "issues_encountered": [],
      "patterns": [],
      "evidence": [
        {
          "id": "ev-case-gemma4-h100-001",
          "tier": "measured",
          "source_type": "vendor-press-release",
          "url": "https://github.com/NVIDIA/TensorRT-LLM",
          "accessed": "2026-04-28",
          "citation": "NVIDIA TensorRT-LLM Gemma 4 reference benchmark",
          "contributor_attestation": "Numbers extracted from NVIDIA public TensorRT-LLM Gemma 4 benchmark; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "h100-sxm5",
          "name": "NVIDIA H100 SXM5 80GB",
          "vendor": "nvidia",
          "generation": "hopper-gen1",
          "status": "in-production",
          "release_year": 2022,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            },
            "bf16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "fp16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "int8_tops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 132,
              "evidence_ref": "ev-h100-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "l2_cache_mb": {
              "value": 50,
              "evidence_ref": "ev-h100-arch-001"
            },
            "hbm_stacks": {
              "value": 5,
              "evidence_ref": "ev-h100-arch-001"
            },
            "process_node_nm": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "die_area_mm2": {
              "value": 814,
              "evidence_ref": "ev-h100-arch-001"
            },
            "transistor_count_b": {
              "value": 80,
              "evidence_ref": "ev-h100-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-h100-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-h100-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 80,
              "evidence_ref": "ev-h100-002"
            },
            "bandwidth_gbps": {
              "value": 3350,
              "evidence_ref": "ev-h100-002"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "NVLink-4.0",
            "bandwidth_gbps": 900,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen3"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-h100-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              },
              {
                "id": "lmdeploy",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "H100 SXM",
            "H100-80GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-h100-001",
              "tier": "official",
              "source_type": "vendor-datasheet",
              "url": "https://resources.nvidia.com/en-us-tensor-core",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Tensor Core GPU Datasheet"
            },
            {
              "id": "ev-h100-002",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/h100/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 product page memory specifications"
            },
            {
              "id": "ev-h100-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Hopper Architecture Whitepaper (GH100 die: 132 SMs enabled, 50 MB L2, 80B transistors, 814 mm² @ TSMC 4N)"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "nvidia-hgx-h100",
          "name": "NVIDIA HGX H100 8-GPU",
          "vendor": "nvidia",
          "type": "integrated-server",
          "card": "h100-sxm5",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "NVLink-4-via-NVSwitch-Gen3",
          "inter_node_interconnect": "InfiniBand-NDR",
          "cooling": "air",
          "rack_power_kw": 10.2,
          "total_memory_gb": 640,
          "total_compute_pflops_bf16": 7.9,
          "release_year": 2022,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-hgxh100-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/hgx/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA HGX H100 system specs"
            }
          ]
        },
        "model": {
          "id": "gemma-4",
          "name": "Gemma 4 26B",
          "lab": "google",
          "release_date": "2026-02-20",
          "license": "Gemma-License",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 26,
            "active_params_b": 4,
            "layers": 32,
            "hidden_size": 4096,
            "ffn_size": 11264,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 256000,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 16,
              "top_k": 2,
              "expert_hidden_size": 5632,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 8858370048,
              "bytes_per_token": 8858370048
            },
            {
              "operator": "attention",
              "flops_per_token": 3221225472,
              "bytes_per_token": 4831838208
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 2097152,
              "bytes_per_token": 2952790016
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1310720,
              "bytes_per_token": 524288
            }
          ],
          "modalities": [
            "text",
            "vision"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "tensorrt-llm",
          "name": "TensorRT-LLM (Dynamo)",
          "maintainer": "vendor",
          "source_url": "https://github.com/NVIDIA/TensorRT-LLM",
          "supported_hardware_vendors": [
            "nvidia"
          ],
          "latest_version": "0.14.0",
          "notes": "NVIDIA-only; deepest kernel optimizations and FP8/FP4 support"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-glm51-br104x8-001",
      "title": "GLM-5.1 on 8× Biren BR104 (export-control variant)",
      "submitted_at": "2026-04-20",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "br104",
          "count": 8,
          "topology": "single-node PCIe"
        },
        "interconnect": {
          "intra_node": "BLink",
          "inter_node": "none"
        },
        "model": {
          "id": "glm-5.1",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.5.5"
        },
        "quantization": "int8",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "BIRENSUPA 1.5",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 8,
        "max_concurrent_requests": 32
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 240,
          "prefill": 3800
        },
        "latency_ms": {
          "ttft_p50": 720,
          "ttft_p99": 1180,
          "tbt_p50": 124,
          "tbt_p99": 220
        },
        "memory_per_card_gb": 28,
        "power_per_card_w": 280,
        "utilization": {
          "compute_pct": 18,
          "memory_bw_pct": 52
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "vllm serve THUDM/GLM-5.1 --device biren --tp 8 --quantization int8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "BR104 export-control compliant variant 比 BR100 算力低约 50%",
        "部分自定义 kernel (FlashAttn 替代) 未优化, decode 性能受限"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-glm51-br104-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.birentech.com/",
          "accessed": "2026-04-28",
          "citation": "Biren BR104 + GLM-5.1 community testing",
          "contributor_attestation": "Numbers extracted from Biren BR104 community port; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "br104",
          "name": "壁仞 BR104",
          "vendor": "biren",
          "generation": "biren-gen1-derate",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "pcie",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 128,
              "evidence_ref": "ev-br104-001"
            },
            "fp16_tflops": {
              "value": 128,
              "evidence_ref": "ev-br104-001"
            },
            "int8_tops": {
              "value": 512,
              "evidence_ref": "ev-br104-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 32,
              "evidence_ref": "ev-br104-arch-001"
            },
            "compute_unit_label": "SM",
            "hbm_stacks": {
              "value": 2,
              "evidence_ref": "ev-br104-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-br104-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 32,
              "evidence_ref": "ev-br104-001"
            },
            "bandwidth_gbps": {
              "value": 1150,
              "evidence_ref": "ev-br104-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "BLink",
            "bandwidth_gbps": 256,
            "world_size": 8,
            "topology": "switched"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 100,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 300,
              "evidence_ref": "ev-br104-001"
            }
          },
          "software_support": {
            "drivers": [
              "BIRENSUPA"
            ],
            "engines": [],
            "quantizations": [
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp"
            ]
          },
          "aliases": [
            "BR104"
          ],
          "chinese_names": [
            "壁仞BR104"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-br104-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://www.birentech.com/",
              "accessed": "2026-04-28",
              "citation": "Biren BR104 announcement (export-control compliant variant of BR100)"
            },
            {
              "id": "ev-br104-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.birentech.com/",
              "accessed": "2026-04-28",
              "citation": "BR104: 32 SPCs derated from BR100, 2× HBM2e stacks ⇒ 32 GB; TSMC 7nm-class"
            }
          ],
          "disclaimers": [
            "BR104 is export-control-compliant variant; reduced from BR100.",
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "model": {
          "id": "glm-5.1",
          "name": "GLM-5.1",
          "lab": "zhipu",
          "release_date": "2026-04-07",
          "license": "MIT",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 754,
            "active_params_b": 32,
            "layers": 64,
            "hidden_size": 6144,
            "ffn_size": 16384,
            "num_attention_heads": 48,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 151552,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 192,
              "top_k": 8,
              "expert_hidden_size": 1536,
              "shared_experts": 0
            },
            "attention_type": "mha"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 38654705664,
              "bytes_per_token": 38654705664
            },
            {
              "operator": "attention",
              "flops_per_token": 12884901888,
              "bytes_per_token": 20401094656
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 75497472,
              "bytes_per_token": 9663676416
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 3932160,
              "bytes_per_token": 1572864
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/THUDM/GLM-5.1"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "int8",
          "name": "INT8",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "int",
          "lossless": false,
          "description": "Symmetric or asymmetric int8 quantization; widely supported"
        }
      }
    },
    {
      "id": "case-glm51-h200x8-vllm-001",
      "title": "GLM-5.1 on 8× H200 SXM with vLLM BF16",
      "submitted_at": "2026-04-26",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "h200-sxm",
          "count": 8,
          "topology": "single-node-hgx"
        },
        "server": {
          "id": "nvidia-hgx-h200"
        },
        "interconnect": {
          "intra_node": "nvlink-4",
          "inter_node": "none"
        },
        "model": {
          "id": "glm-5.1",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 4,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.5",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 32,
        "max_concurrent_requests": 128
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 2400,
          "prefill": 28000
        },
        "latency_ms": {
          "ttft_p50": 280,
          "ttft_p99": 460,
          "tbt_p50": 22,
          "tbt_p99": 42
        },
        "memory_per_card_gb": 118,
        "power_per_card_w": 660,
        "utilization": {
          "compute_pct": 49,
          "memory_bw_pct": 73
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve THUDM/GLM-5.1 --tp 8 --enable-expert-parallel",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-glm51-h200-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://github.com/vllm-project/vllm/discussions",
          "accessed": "2026-04-28",
          "citation": "vLLM community benchmark thread for GLM-5.1 on H200",
          "contributor_attestation": "Numbers extracted from vLLM community discussion thread; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "h200-sxm",
          "name": "NVIDIA H200 SXM 141GB",
          "vendor": "nvidia",
          "generation": "hopper-gen1",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1979,
              "evidence_ref": "ev-h200-001"
            },
            "bf16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h200-001"
            },
            "fp16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h200-001"
            },
            "int8_tops": {
              "value": 1979,
              "evidence_ref": "ev-h200-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 132,
              "evidence_ref": "ev-h200-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-h200-arch-001"
            },
            "l2_cache_mb": {
              "value": 50,
              "evidence_ref": "ev-h200-arch-001"
            },
            "hbm_stacks": {
              "value": 6,
              "evidence_ref": "ev-h200-arch-001"
            },
            "process_node_nm": {
              "value": 4,
              "evidence_ref": "ev-h200-arch-001"
            },
            "die_area_mm2": {
              "value": 814,
              "evidence_ref": "ev-h200-arch-001"
            },
            "transistor_count_b": {
              "value": 80,
              "evidence_ref": "ev-h200-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-h200-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-h200-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 141,
              "evidence_ref": "ev-h200-001"
            },
            "bandwidth_gbps": {
              "value": 4800,
              "evidence_ref": "ev-h200-001"
            },
            "type": "HBM3e"
          },
          "scale_up": {
            "protocol": "NVLink-4.0",
            "bandwidth_gbps": 900,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen3"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-h200-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "H200",
            "H200 141GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-h200-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/h200/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H200 Tensor Core GPU product page"
            },
            {
              "id": "ev-h200-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper",
              "accessed": "2026-04-28",
              "citation": "H200 reuses GH100 die (132 SMs, 50 MB L2, 814 mm²); 6× HBM3e stacks @ 24 GB ⇒ 141 GB capacity"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "nvidia-hgx-h200",
          "name": "NVIDIA HGX H200 8-GPU",
          "vendor": "nvidia",
          "type": "integrated-server",
          "card": "h200-sxm",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "NVLink-4-via-NVSwitch-Gen3",
          "inter_node_interconnect": "InfiniBand-NDR",
          "cooling": "air",
          "rack_power_kw": 10.2,
          "total_memory_gb": 1128,
          "total_compute_pflops_bf16": 7.9,
          "release_year": 2024,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-hgxh200-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/hgx/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA HGX H200 system specs"
            }
          ]
        },
        "model": {
          "id": "glm-5.1",
          "name": "GLM-5.1",
          "lab": "zhipu",
          "release_date": "2026-04-07",
          "license": "MIT",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 754,
            "active_params_b": 32,
            "layers": 64,
            "hidden_size": 6144,
            "ffn_size": 16384,
            "num_attention_heads": 48,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 151552,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 192,
              "top_k": 8,
              "expert_hidden_size": 1536,
              "shared_experts": 0
            },
            "attention_type": "mha"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 38654705664,
              "bytes_per_token": 38654705664
            },
            {
              "operator": "attention",
              "flops_per_token": 12884901888,
              "bytes_per_token": 20401094656
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 75497472,
              "bytes_per_token": 9663676416
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 3932160,
              "bytes_per_token": 1572864
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/THUDM/GLM-5.1"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-gptoss-gaudi3x8-001",
      "title": "GPT-OSS on 8× Intel Gaudi 3 with vLLM",
      "submitted_at": "2026-04-20",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "gaudi-3",
          "count": 8,
          "topology": "single-node OAM"
        },
        "interconnect": {
          "intra_node": "roce-v2",
          "inter_node": "none"
        },
        "model": {
          "id": "gpt-oss",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 4,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "Habana SynapseAI 1.18",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 32,
        "max_concurrent_requests": 128
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 2900,
          "prefill": 35000
        },
        "latency_ms": {
          "ttft_p50": 140,
          "ttft_p99": 240,
          "tbt_p50": 18,
          "tbt_p99": 32
        },
        "memory_per_card_gb": 92,
        "power_per_card_w": 780,
        "utilization": {
          "compute_pct": 44,
          "memory_bw_pct": 68
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve openai/gpt-oss --tp 8 --device hpu --quantization fp8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "Gaudi 3 vLLM 移植版需要专门的 HPU graph compile, 首次预热 ~6min"
      ],
      "patterns": [],
      "evidence": [
        {
          "id": "ev-case-gptoss-gaudi-001",
          "tier": "measured",
          "source_type": "vendor-press-release",
          "url": "https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi3.html",
          "accessed": "2026-04-28",
          "citation": "Intel Gaudi 3 + GPT-OSS reference benchmark",
          "contributor_attestation": "Numbers extracted from Intel Gaudi 3 public benchmark coverage; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "gaudi-3",
          "name": "Intel Gaudi 3",
          "vendor": "intel",
          "generation": "gaudi-gen3",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1835,
              "evidence_ref": "ev-gaudi3-001"
            },
            "bf16_tflops": {
              "value": 1835,
              "evidence_ref": "ev-gaudi3-001"
            },
            "fp16_tflops": {
              "value": 1835,
              "evidence_ref": "ev-gaudi3-001"
            },
            "int8_tops": {
              "value": 1835,
              "evidence_ref": "ev-gaudi3-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 64,
              "evidence_ref": "ev-gaudi3-arch-001"
            },
            "compute_unit_label": "Cluster",
            "hbm_stacks": {
              "value": 8,
              "evidence_ref": "ev-gaudi3-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-gaudi3-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-gaudi3-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-gaudi3-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 128,
              "evidence_ref": "ev-gaudi3-001"
            },
            "bandwidth_gbps": {
              "value": 3700,
              "evidence_ref": "ev-gaudi3-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "RoCE-v2-200GbE",
            "bandwidth_gbps": 1200,
            "world_size": 8,
            "topology": "all-to-all"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 900,
              "evidence_ref": "ev-gaudi3-001"
            }
          },
          "software_support": {
            "drivers": [
              "Habana SynapseAI"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "Gaudi3",
            "HL-325L"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-gaudi3-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi3.html",
              "accessed": "2026-04-28",
              "citation": "Intel Gaudi 3 product page"
            },
            {
              "id": "ev-gaudi3-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://habana.ai/products/gaudi3/",
              "accessed": "2026-04-28",
              "citation": "Gaudi 3: dual-die package, 64 TPCs total + 8 MMEs, 8× HBM2e ⇒ 128 GB; 24× 200 GbE on-chip RoCE; TSMC 5nm"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "model": {
          "id": "gpt-oss",
          "name": "GPT-OSS",
          "lab": "openai",
          "release_date": "2025-08-12",
          "license": "Apache-2.0",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 120,
            "active_params_b": 5,
            "layers": 36,
            "hidden_size": 2880,
            "ffn_size": 11520,
            "num_attention_heads": 32,
            "num_kv_heads": 8,
            "head_dim": 90,
            "vocab_size": 200000,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 128,
              "top_k": 4,
              "expert_hidden_size": 2880,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 7166361600,
              "bytes_per_token": 7166361600
            },
            {
              "operator": "attention",
              "flops_per_token": 2043740160,
              "bytes_per_token": 2813460480
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 13271040,
              "bytes_per_token": 2388787200
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 1036800,
              "bytes_per_token": 414720
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-kimik26-mlu590x16-001",
      "title": "Kimi K2.6 on 16× Cambricon MLU590 (with vLLM port)",
      "submitted_at": "2026-04-22",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mlu590",
          "count": 16,
          "topology": "2 nodes × 8 cards"
        },
        "server": {
          "id": "cambricon-x8-server"
        },
        "interconnect": {
          "intra_node": "mlu-link-v2",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "kimi-k2.6",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 2,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "Neuware 3.5",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 480,
          "prefill": 7200
        },
        "latency_ms": {
          "ttft_p50": 460,
          "ttft_p99": 720,
          "tbt_p50": 64,
          "tbt_p99": 110
        },
        "memory_per_card_gb": 56,
        "power_per_card_w": 320,
        "utilization": {
          "compute_pct": 28,
          "memory_bw_pct": 64
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "vllm serve moonshotai/Kimi-K2.6 --tp 8 --pipeline-parallel-size 2 --device mlu",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "vLLM-MLU 移植版尚未支持 Kimi K2.6 原生视觉路径; 仅文本",
        "MoE 路由器首次 load 耗时 18min"
      ],
      "patterns": [
        "moe-expert-routing-on-domestic"
      ],
      "evidence": [
        {
          "id": "ev-case-kimi-mlu590-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.cambricon.com/",
          "accessed": "2026-04-28",
          "citation": "Cambricon MLU590 + Kimi K2.6 community benchmark reference",
          "contributor_attestation": "Numbers extracted from public community port test; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mlu590",
          "name": "寒武纪 思元 590",
          "vendor": "cambricon",
          "generation": "mlu590",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 256,
              "evidence_ref": "ev-mlu590-001"
            },
            "fp16_tflops": {
              "value": 256,
              "evidence_ref": "ev-mlu590-001"
            },
            "int8_tops": {
              "value": 512,
              "evidence_ref": "ev-mlu590-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 80,
              "evidence_ref": "ev-mlu590-arch-001"
            },
            "compute_unit_label": "IPU",
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-mlu590-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-mlu590-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 64,
              "evidence_ref": "ev-mlu590-001"
            },
            "bandwidth_gbps": {
              "value": 1228,
              "evidence_ref": "ev-mlu590-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "MLU-Link-v2",
            "bandwidth_gbps": 400,
            "world_size": 8,
            "topology": "switched"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 350,
              "evidence_ref": "ev-mlu590-001"
            }
          },
          "software_support": {
            "drivers": [
              "Neuware-3.x"
            ],
            "engines": [
              {
                "id": "lmdeploy",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "MLU590",
            "思元590"
          ],
          "chinese_names": [
            "思元590"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mlu590-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "Cambricon MLU590 product overview (limited public detail)"
            },
            {
              "id": "ev-mlu590-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "MLU590 (思元590) MLUarch03 architecture: 80 IPUs (Intelligence Processing Units), 4× HBM2e stacks ⇒ 64 GB; SMIC N+1 / 7nm-class"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Some specs partial; community contributions welcome to fill gaps."
          ]
        },
        "server": {
          "id": "cambricon-x8-server",
          "name": "Cambricon 思元 X8 Server",
          "vendor": "cambricon",
          "type": "integrated-server",
          "card": "mlu590",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "MLU-Link-v2",
          "inter_node_interconnect": "RoCEv2-200G",
          "cooling": "air",
          "rack_power_kw": 4,
          "total_memory_gb": 512,
          "release_year": 2023,
          "aliases": [],
          "chinese_names": [
            "思元X8训推服务器"
          ],
          "evidence": [
            {
              "id": "ev-camb-x8-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "Cambricon 思元 X8 server reference"
            }
          ]
        },
        "model": {
          "id": "kimi-k2.6",
          "name": "Kimi K2.6",
          "lab": "moonshot",
          "release_date": "2026-04-15",
          "license": "moonshot-license",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 1000,
            "active_params_b": 32,
            "layers": 60,
            "hidden_size": 7168,
            "ffn_size": 18432,
            "num_attention_heads": 64,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 160000,
            "max_context_length": 262144,
            "moe": {
              "num_experts": 384,
              "top_k": 8,
              "expert_hidden_size": 1536,
              "shared_experts": 0
            },
            "attention_type": "mla"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 47563407360,
              "bytes_per_token": 47563407360
            },
            {
              "operator": "attention",
              "flops_per_token": 15854469120,
              "bytes_per_token": 25543311360
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 165150720,
              "bytes_per_token": 10569646080
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 4300800,
              "bytes_per_token": 1720320
            }
          ],
          "modalities": [
            "text",
            "vision"
          ],
          "weight_format": "bf16",
          "hf_url": "https://huggingface.co/moonshotai/Kimi-K2.6"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-llama33-a100x8-vllm-001",
      "title": "Llama 3.3 70B on 8× A100 SXM4 80GB with vLLM",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "a100-sxm4",
          "count": 8,
          "topology": "1 node × 8 cards"
        },
        "server": {
          "id": "nvidia-dgx-a100"
        },
        "interconnect": {
          "intra_node": "nvlink-3",
          "inter_node": "ib-hdr"
        },
        "model": {
          "id": "llama-3.3-70b",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.4",
        "os": "Ubuntu 22.04 LTS"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 512,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 1480,
          "prefill": 18200
        },
        "latency_ms": {
          "ttft_p50": 220,
          "ttft_p99": 360,
          "tbt_p50": 32,
          "tbt_p99": 55
        },
        "memory_per_card_gb": 62,
        "power_per_card_w": 360,
        "utilization": {
          "compute_pct": 32,
          "memory_bw_pct": 81
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 8 --max-model-len 32768",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving + sharegpt"
      },
      "issues_encountered": [
        "A100 lacks native FP8; quantizing to W4A16 gives further 1.6× decode but quality regression on coding prompts",
        "KV cache @ batch 16 + seq 32k consumed ~62 GB/card; 80 GB headroom limits long-context"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8",
        "paged-attention-vllm"
      ],
      "evidence": [
        {
          "id": "ev-case-llama33-a100-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://github.com/vllm-project/vllm",
          "accessed": "2026-04-28",
          "citation": "vLLM v0.6 + Llama 3.3 70B BF16 reference run on 8× A100; numbers consistent with public vLLM benchmarks (decode ~ 180 tok/s/card)",
          "contributor_attestation": "Synthesized from public vLLM benchmark threads + reproductions; ±10% variance expected."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "a100-sxm4",
          "name": "NVIDIA A100 SXM4 80GB",
          "vendor": "nvidia",
          "generation": "ampere",
          "status": "in-production",
          "release_year": 2020,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 312,
              "evidence_ref": "ev-a100-001"
            },
            "fp16_tflops": {
              "value": 312,
              "evidence_ref": "ev-a100-001"
            },
            "int8_tops": {
              "value": 624,
              "evidence_ref": "ev-a100-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 108,
              "evidence_ref": "ev-a100-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-a100-arch-001"
            },
            "l2_cache_mb": {
              "value": 40,
              "evidence_ref": "ev-a100-arch-001"
            },
            "hbm_stacks": {
              "value": 5,
              "evidence_ref": "ev-a100-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-a100-arch-001"
            },
            "die_area_mm2": {
              "value": 826,
              "evidence_ref": "ev-a100-arch-001"
            },
            "transistor_count_b": {
              "value": 54,
              "evidence_ref": "ev-a100-arch-001"
            },
            "pcie_gen": {
              "value": 4,
              "evidence_ref": "ev-a100-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-a100-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 80,
              "evidence_ref": "ev-a100-001"
            },
            "bandwidth_gbps": {
              "value": 2039,
              "evidence_ref": "ev-a100-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "NVLink-3.0",
            "bandwidth_gbps": 600,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen2"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "InfiniBand-HDR",
            "nic": "ConnectX-6"
          },
          "power": {
            "tdp_w": {
              "value": 400,
              "evidence_ref": "ev-a100-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-11.x",
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              },
              {
                "id": "lmdeploy",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8",
              "int4-awq",
              "int4-gptq",
              "w4a16"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "A100 SXM",
            "A100-80GB",
            "A100"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-a100-001",
              "tier": "official",
              "source_type": "vendor-datasheet",
              "url": "https://www.nvidia.com/en-us/data-center/a100/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA A100 Tensor Core GPU Datasheet (80GB SXM variant)"
            },
            {
              "id": "ev-a100-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf",
              "accessed": "2026-04-28",
              "citation": "NVIDIA Ampere Architecture Whitepaper (GA100 die: 108 SMs enabled, 40 MB L2, 54B transistors, 826 mm² @ TSMC 7nm)"
            }
          ],
          "disclaimers": [
            "A100 has no FP8 native (Hopper+); FP8 only via emulated paths.",
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "nvidia-dgx-a100",
          "name": "NVIDIA DGX A100 8-GPU",
          "vendor": "nvidia",
          "type": "integrated-server",
          "card": "a100-sxm4",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "NVLink-3-via-NVSwitch-Gen2",
          "inter_node_interconnect": "InfiniBand-HDR",
          "cooling": "air",
          "rack_power_kw": 6.5,
          "total_memory_gb": 640,
          "total_compute_pflops_bf16": 2.5,
          "release_year": 2020,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-dgxa100-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/dgx-a100/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA DGX A100 system datasheet"
            }
          ]
        },
        "model": {
          "id": "llama-3.3-70b",
          "name": "Llama 3.3 70B Instruct",
          "lab": "meta",
          "release_date": "2024-12-06",
          "license": "Llama-3.3-Community",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "dense",
            "total_params_b": 70,
            "active_params_b": 70,
            "layers": 80,
            "hidden_size": 8192,
            "ffn_size": 28672,
            "num_attention_heads": 64,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 128256,
            "max_context_length": 131072,
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 132000000000,
              "bytes_per_token": 132000000000
            },
            {
              "operator": "attention",
              "flops_per_token": 7500000000,
              "bytes_per_token": 7500000000
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 8400000,
              "bytes_per_token": 1310720
            },
            {
              "operator": "rope",
              "flops_per_token": 524288,
              "bytes_per_token": 65536
            },
            {
              "operator": "silu",
              "flops_per_token": 156905472,
              "bytes_per_token": 78643200
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-llama4mvk-trillium-256-001",
      "title": "Llama 4 Maverick on TPU Trillium (v6e) 256-chip pod",
      "submitted_at": "2026-04-25",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "trillium",
          "count": 256,
          "topology": "pod 2D-torus"
        },
        "interconnect": {
          "intra_node": "ICI",
          "inter_node": "DCN"
        },
        "model": {
          "id": "llama-4-maverick",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 4,
          "ep": 8,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "PyTorch/XLA 2.5",
        "os": "GKE Container OS"
      },
      "scenario": {
        "prefill_seq_len": 4096,
        "decode_seq_len": 1024,
        "batch_size": 64,
        "max_concurrent_requests": 256
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 5800,
          "prefill": 72000
        },
        "latency_ms": {
          "ttft_p50": 180,
          "ttft_p99": 320,
          "tbt_p50": 14,
          "tbt_p99": 28
        },
        "memory_per_card_gb": 26,
        "power_per_card_w": 240,
        "utilization": {
          "compute_pct": 62,
          "memory_bw_pct": 58
        }
      },
      "bottleneck": "compute",
      "reproduction": {
        "startup_command": "jax distributed init; vllm serve meta-llama/Llama-4-Maverick --backend xla",
        "config_files": [],
        "benchmark_tool": "mlperf-inference + sharegpt"
      },
      "issues_encountered": [
        "2D-torus EP=8 跨象限 all2all 比单象限内高约 25%"
      ],
      "patterns": [
        "disaggregated-prefill-decode"
      ],
      "evidence": [
        {
          "id": "ev-case-llama4mvk-tr-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus",
          "accessed": "2026-04-28",
          "citation": "Google Cloud Trillium TPU v6e benchmark coverage",
          "contributor_attestation": "Numbers extracted from Google Cloud public Trillium benchmark; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "trillium",
          "name": "Google TPU Trillium (v6e)",
          "vendor": "google",
          "generation": "tpu-v6",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "proprietary",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 918,
              "evidence_ref": "ev-trillium-001"
            },
            "bf16_tflops": {
              "value": 918,
              "evidence_ref": "ev-trillium-001"
            },
            "fp16_tflops": {
              "value": 918,
              "evidence_ref": "ev-trillium-001"
            },
            "int8_tops": {
              "value": 1836,
              "evidence_ref": "ev-trillium-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 1,
              "evidence_ref": "ev-trillium-arch-001"
            },
            "compute_unit_label": "XPU",
            "hbm_stacks": {
              "value": 2,
              "evidence_ref": "ev-trillium-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-trillium-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 32,
              "evidence_ref": "ev-trillium-001"
            },
            "bandwidth_gbps": {
              "value": 1640,
              "evidence_ref": "ev-trillium-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "ICI",
            "bandwidth_gbps": 3200,
            "world_size": 256,
            "topology": "2d-torus"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 100,
            "protocol": "DCN"
          },
          "power": {
            "tdp_w": {
              "value": 250,
              "evidence_ref": "ev-trillium-001"
            }
          },
          "software_support": {
            "drivers": [
              "JAX",
              "PyTorch/XLA"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp8-e4m3",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp"
            ]
          },
          "aliases": [
            "Trillium",
            "TPU v6e"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-trillium-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus",
              "accessed": "2026-04-28",
              "citation": "Google introduces Trillium (TPU v6e) blog post"
            },
            {
              "id": "ev-trillium-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus",
              "accessed": "2026-04-28",
              "citation": "Trillium (TPU v6e): single TensorCore chip, 2× HBM2e ⇒ 32 GB; 2D-torus ICI fabric (256 chips/pod); estimated TSMC 5nm-class"
            }
          ],
          "disclaimers": [
            "TPU Trillium only available via Google Cloud."
          ]
        },
        "model": {
          "id": "llama-4-maverick",
          "name": "Llama 4 Maverick",
          "lab": "meta",
          "release_date": "2025-04-05",
          "license": "Llama-4-Community",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 400,
            "active_params_b": 17,
            "layers": 48,
            "hidden_size": 5120,
            "ffn_size": 16384,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 200000,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 128,
              "top_k": 1,
              "expert_hidden_size": 8192,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 24159191040,
              "bytes_per_token": 24159191040
            },
            {
              "operator": "attention",
              "flops_per_token": 7046430720,
              "bytes_per_token": 10871635968
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 31457280,
              "bytes_per_token": 4026531840
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 2457600,
              "bytes_per_token": 983040
            }
          ],
          "modalities": [
            "text",
            "vision",
            "video"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-llama4-scout-h100x8-vllm-001",
      "title": "Llama 4 Scout on 8×H100 SXM with vLLM (public benchmark)",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "h100-sxm5",
          "count": 8,
          "topology": "single-node-hgx"
        },
        "server": {
          "id": "nvidia-hgx-h100"
        },
        "interconnect": {
          "intra_node": "nvlink-4",
          "inter_node": "none"
        },
        "model": {
          "id": "llama-4-scout",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.4",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 1850,
          "prefill": 26000
        },
        "latency_ms": {
          "ttft_p50": 145,
          "ttft_p99": 240,
          "tbt_p50": 18,
          "tbt_p99": 32
        },
        "memory_per_card_gb": 28,
        "power_per_card_w": 580,
        "utilization": {
          "compute_pct": 48,
          "memory_bw_pct": 62
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve meta-llama/Llama-4-Scout --tensor-parallel-size 8 --max-model-len 16384",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py + sharegpt"
      },
      "issues_encountered": [],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-l4scout-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://blog.vllm.ai/",
          "accessed": "2026-04-28",
          "citation": "vLLM official Llama 4 Scout benchmark notes (figures approximate from blog)",
          "contributor_attestation": "Numbers extracted from public vLLM benchmark blog; not independently re-run by submitter."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "h100-sxm5",
          "name": "NVIDIA H100 SXM5 80GB",
          "vendor": "nvidia",
          "generation": "hopper-gen1",
          "status": "in-production",
          "release_year": 2022,
          "form_factor": "sxm",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            },
            "bf16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "fp16_tflops": {
              "value": 989,
              "evidence_ref": "ev-h100-001"
            },
            "int8_tops": {
              "value": 1979,
              "evidence_ref": "ev-h100-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 132,
              "evidence_ref": "ev-h100-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "l2_cache_mb": {
              "value": 50,
              "evidence_ref": "ev-h100-arch-001"
            },
            "hbm_stacks": {
              "value": 5,
              "evidence_ref": "ev-h100-arch-001"
            },
            "process_node_nm": {
              "value": 4,
              "evidence_ref": "ev-h100-arch-001"
            },
            "die_area_mm2": {
              "value": 814,
              "evidence_ref": "ev-h100-arch-001"
            },
            "transistor_count_b": {
              "value": 80,
              "evidence_ref": "ev-h100-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-h100-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-h100-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 80,
              "evidence_ref": "ev-h100-002"
            },
            "bandwidth_gbps": {
              "value": 3350,
              "evidence_ref": "ev-h100-002"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "NVLink-4.0",
            "bandwidth_gbps": 900,
            "world_size": 8,
            "topology": "switched",
            "switch": "nvswitch-gen3"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 700,
              "evidence_ref": "ev-h100-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              },
              {
                "id": "lmdeploy",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp",
              "disaggregated"
            ]
          },
          "aliases": [
            "H100 SXM",
            "H100-80GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-h100-001",
              "tier": "official",
              "source_type": "vendor-datasheet",
              "url": "https://resources.nvidia.com/en-us-tensor-core",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Tensor Core GPU Datasheet"
            },
            {
              "id": "ev-h100-002",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/h100/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 product page memory specifications"
            },
            {
              "id": "ev-h100-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper",
              "accessed": "2026-04-28",
              "citation": "NVIDIA H100 Hopper Architecture Whitepaper (GH100 die: 132 SMs enabled, 50 MB L2, 80B transistors, 814 mm² @ TSMC 4N)"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "nvidia-hgx-h100",
          "name": "NVIDIA HGX H100 8-GPU",
          "vendor": "nvidia",
          "type": "integrated-server",
          "card": "h100-sxm5",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "NVLink-4-via-NVSwitch-Gen3",
          "inter_node_interconnect": "InfiniBand-NDR",
          "cooling": "air",
          "rack_power_kw": 10.2,
          "total_memory_gb": 640,
          "total_compute_pflops_bf16": 7.9,
          "release_year": 2022,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-hgxh100-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.nvidia.com/en-us/data-center/hgx/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA HGX H100 system specs"
            }
          ]
        },
        "model": {
          "id": "llama-4-scout",
          "name": "Llama 4 Scout",
          "lab": "meta",
          "release_date": "2025-04-05",
          "license": "Llama-4-Community",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 109,
            "active_params_b": 17,
            "layers": 48,
            "hidden_size": 5120,
            "ffn_size": 16384,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 200000,
            "max_context_length": 10485760,
            "moe": {
              "num_experts": 16,
              "top_k": 1,
              "expert_hidden_size": 8192,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 24159191040,
              "bytes_per_token": 24159191040
            },
            {
              "operator": "attention",
              "flops_per_token": 7046430720,
              "bytes_per_token": 10871635968
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 3932160,
              "bytes_per_token": 4026531840
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 2457600,
              "bytes_per_token": 983040
            }
          ],
          "modalities": [
            "text",
            "vision",
            "video"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-llama4scout-dcuk100x8-001",
      "title": "Llama 4 Scout on 8× Hygon DCU K100 with vLLM",
      "submitted_at": "2026-04-25",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "dcu-k100",
          "count": 8,
          "topology": "single-node OAM"
        },
        "interconnect": {
          "intra_node": "Hygon-Link",
          "inter_node": "none"
        },
        "model": {
          "id": "llama-4-scout",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "DTK 24.04",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 850,
          "prefill": 12500
        },
        "latency_ms": {
          "ttft_p50": 320,
          "ttft_p99": 540,
          "tbt_p50": 42,
          "tbt_p99": 76
        },
        "memory_per_card_gb": 36,
        "power_per_card_w": 580,
        "utilization": {
          "compute_pct": 32,
          "memory_bw_pct": 64
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "vllm serve meta-llama/Llama-4-Scout --device hygon --tp 8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [
        "DTK 24.04 vLLM-rocm fork compatibility — needed manual patch for 4096-block KV"
      ],
      "patterns": [],
      "evidence": [
        {
          "id": "ev-case-l4s-dcuk100-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://www.hygon.cn/",
          "accessed": "2026-04-28",
          "citation": "Hygon DCU K100 + vLLM community port benchmark sharing",
          "contributor_attestation": "Numbers extracted from Hygon community port testing; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "dcu-k100",
          "name": "海光 DCU K100",
          "vendor": "hygon",
          "generation": "hygon-dcu-gen3",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 192,
              "evidence_ref": "ev-dcuk100-001"
            },
            "fp16_tflops": {
              "value": 192,
              "evidence_ref": "ev-dcuk100-001"
            },
            "int8_tops": {
              "value": 384,
              "evidence_ref": "ev-dcuk100-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 96,
              "evidence_ref": "ev-dcuk100-arch-001"
            },
            "compute_unit_label": "CU",
            "hbm_stacks": {
              "value": 6,
              "evidence_ref": "ev-dcuk100-arch-001"
            },
            "process_node_nm": {
              "value": 6,
              "evidence_ref": "ev-dcuk100-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-dcuk100-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-dcuk100-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 96,
              "evidence_ref": "ev-dcuk100-001"
            },
            "bandwidth_gbps": {
              "value": 2400,
              "evidence_ref": "ev-dcuk100-001"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "Hygon-Link",
            "bandwidth_gbps": 400,
            "world_size": 8,
            "topology": "fully-connected"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 600,
              "evidence_ref": "ev-dcuk100-001"
            }
          },
          "software_support": {
            "drivers": [
              "DTK-24.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "K100"
          ],
          "chinese_names": [
            "深算二号",
            "海光K100"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-dcuk100-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://www.hygon.cn/",
              "accessed": "2026-04-28",
              "citation": "Hygon K100 announcement (vendor-claimed; details still emerging)"
            },
            {
              "id": "ev-dcuk100-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.hygon.cn/",
              "accessed": "2026-04-28",
              "citation": "K100 (深算二号): GCN-derived gen3, 96 CUs, 6× HBM3 ⇒ 96 GB; SMIC 6nm-class, PCIe Gen5 x16"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Some specs derived from supply-chain reports."
          ]
        },
        "model": {
          "id": "llama-4-scout",
          "name": "Llama 4 Scout",
          "lab": "meta",
          "release_date": "2025-04-05",
          "license": "Llama-4-Community",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 109,
            "active_params_b": 17,
            "layers": 48,
            "hidden_size": 5120,
            "ffn_size": 16384,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 200000,
            "max_context_length": 10485760,
            "moe": {
              "num_experts": 16,
              "top_k": 1,
              "expert_hidden_size": 8192,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 24159191040,
              "bytes_per_token": 24159191040
            },
            {
              "operator": "attention",
              "flops_per_token": 7046430720,
              "bytes_per_token": 10871635968
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 3932160,
              "bytes_per_token": 4026531840
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 2457600,
              "bytes_per_token": 983040
            }
          ],
          "modalities": [
            "text",
            "vision",
            "video"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-llama4scout-mi300x8-001",
      "title": "Llama 4 Scout on 8× MI300X with vLLM BF16",
      "submitted_at": "2026-04-22",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mi300x",
          "count": 8,
          "topology": "single-node platform"
        },
        "interconnect": {
          "intra_node": "infinity-fabric",
          "inter_node": "none"
        },
        "model": {
          "id": "llama-4-scout",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "bf16",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "ROCm 6.2",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 1024,
        "decode_seq_len": 256,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 2200,
          "prefill": 32000
        },
        "latency_ms": {
          "ttft_p50": 158,
          "ttft_p99": 280,
          "tbt_p50": 16,
          "tbt_p99": 30
        },
        "memory_per_card_gb": 32,
        "power_per_card_w": 720,
        "utilization": {
          "compute_pct": 52,
          "memory_bw_pct": 65
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve meta-llama/Llama-4-Scout --tp 8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py + sharegpt"
      },
      "issues_encountered": [],
      "patterns": [],
      "evidence": [
        {
          "id": "ev-case-l4s-mi300x-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://blog.vllm.ai/",
          "accessed": "2026-04-28",
          "citation": "vLLM blog Llama 4 Scout MI300X benchmark",
          "contributor_attestation": "Numbers extracted from vLLM official benchmark blog; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mi300x",
          "name": "AMD Instinct MI300X",
          "vendor": "amd",
          "generation": "cdna3",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 2614,
              "evidence_ref": "ev-mi300x-001"
            },
            "bf16_tflops": {
              "value": 1307,
              "evidence_ref": "ev-mi300x-001"
            },
            "fp16_tflops": {
              "value": 1307,
              "evidence_ref": "ev-mi300x-001"
            },
            "int8_tops": {
              "value": 2614,
              "evidence_ref": "ev-mi300x-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 304,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "compute_unit_label": "CU",
            "l2_cache_mb": {
              "value": 256,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "hbm_stacks": {
              "value": 8,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "die_area_mm2": {
              "value": 1017,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "transistor_count_b": {
              "value": 153,
              "evidence_ref": "ev-mi300x-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-mi300x-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-mi300x-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 192,
              "evidence_ref": "ev-mi300x-001"
            },
            "bandwidth_gbps": {
              "value": 5300,
              "evidence_ref": "ev-mi300x-001"
            },
            "type": "HBM3"
          },
          "scale_up": {
            "protocol": "Infinity-Fabric",
            "bandwidth_gbps": 896,
            "world_size": 8,
            "topology": "fully-connected"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 750,
              "evidence_ref": "ev-mi300x-001"
            }
          },
          "software_support": {
            "drivers": [
              "ROCm-6.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "int8",
              "int4-awq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep"
            ]
          },
          "aliases": [
            "MI300X",
            "MI300X 192GB"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mi300x-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html",
              "accessed": "2026-04-28",
              "citation": "AMD MI300X product page"
            },
            {
              "id": "ev-mi300x-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html",
              "accessed": "2026-04-28",
              "citation": "AMD CDNA 3 architecture: 304 CUs across 8 XCD chiplets, 256 MB Infinity Cache (L2), 8× HBM3 stacks @ 24 GB ⇒ 192 GB, 153B transistors @ TSMC 5nm + 6nm chiplets"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "model": {
          "id": "llama-4-scout",
          "name": "Llama 4 Scout",
          "lab": "meta",
          "release_date": "2025-04-05",
          "license": "Llama-4-Community",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 109,
            "active_params_b": 17,
            "layers": 48,
            "hidden_size": 5120,
            "ffn_size": 16384,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 200000,
            "max_context_length": 10485760,
            "moe": {
              "num_experts": 16,
              "top_k": 1,
              "expert_hidden_size": 8192,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 24159191040,
              "bytes_per_token": 24159191040
            },
            {
              "operator": "attention",
              "flops_per_token": 7046430720,
              "bytes_per_token": 10871635968
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 3932160,
              "bytes_per_token": 4026531840
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 2457600,
              "bytes_per_token": 983040
            }
          ],
          "modalities": [
            "text",
            "vision",
            "video"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "bf16",
          "name": "BF16",
          "bits_per_weight": 16,
          "bits_per_activation": 16,
          "family": "fp",
          "lossless": false,
          "description": "Brain float 16; 8-bit exponent, 7-bit mantissa; default training precision since 2020"
        }
      }
    },
    {
      "id": "case-qwencoder-l40sx4-vllm-001",
      "title": "Qwen2.5-Coder 32B on 4× L40S with vLLM (FP8)",
      "submitted_at": "2026-04-28",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "l40s",
          "count": 4,
          "topology": "1 node × 4 cards (PCIe)"
        },
        "interconnect": {
          "intra_node": "pcie-gen4",
          "inter_node": "ib-ndr"
        },
        "model": {
          "id": "qwen2.5-coder-32b",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 4,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "CUDA 12.5",
        "os": "Ubuntu 22.04 LTS"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 8,
        "max_concurrent_requests": 32
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 580,
          "prefill": 5400
        },
        "latency_ms": {
          "ttft_p50": 480,
          "ttft_p99": 740,
          "tbt_p50": 55,
          "tbt_p99": 95
        },
        "memory_per_card_gb": 36,
        "power_per_card_w": 320,
        "utilization": {
          "compute_pct": 21,
          "memory_bw_pct": 92
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-Coder-32B-Instruct --tensor-parallel-size 4 --quantization fp8",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving + custom code-prompts dataset"
      },
      "issues_encountered": [
        "L40S GDDR6 BW is the binding constraint — moving from BF16 to FP8 halved the bytes but only +35% throughput (still BW-bound)",
        "PCIe-Gen4 TP=4 collective is bottleneck for batched decode > 16; below 8 it is fine",
        "No NVLink in this card class — TP scaling beyond 4 cards not viable"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8",
        "fp8-quantize-on-hopper-plus"
      ],
      "evidence": [
        {
          "id": "ev-case-qwencoder-l40s-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://github.com/vllm-project/vllm",
          "accessed": "2026-04-28",
          "citation": "L40S 4-card vLLM reference for 32B coder model in FP8; numbers approximated from L40S inference benchmarks where memory bandwidth dominates",
          "contributor_attestation": "Synthesized from public L40S benchmarks; vLLM FP8 still maturing on Ada Lovelace."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "l40s",
          "name": "NVIDIA L40S",
          "vendor": "nvidia",
          "generation": "ada-lovelace",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "pcie",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 1466,
              "evidence_ref": "ev-l40s-001"
            },
            "bf16_tflops": {
              "value": 366,
              "evidence_ref": "ev-l40s-001"
            },
            "fp16_tflops": {
              "value": 366,
              "evidence_ref": "ev-l40s-001"
            },
            "int8_tops": {
              "value": 1466,
              "evidence_ref": "ev-l40s-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 142,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "compute_unit_label": "SM",
            "tensor_cores_per_cu": {
              "value": 4,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "l2_cache_mb": {
              "value": 96,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "die_area_mm2": {
              "value": 609,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "transistor_count_b": {
              "value": 76,
              "evidence_ref": "ev-l40s-arch-001"
            },
            "pcie_gen": {
              "value": 4,
              "evidence_ref": "ev-l40s-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-l40s-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 48,
              "evidence_ref": "ev-l40s-001"
            },
            "bandwidth_gbps": {
              "value": 864,
              "evidence_ref": "ev-l40s-001"
            },
            "type": "GDDR6"
          },
          "scale_up": {
            "protocol": "PCIe-Gen4",
            "bandwidth_gbps": 64,
            "world_size": 8,
            "topology": "pcie-fabric"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "InfiniBand-NDR",
            "nic": "ConnectX-7"
          },
          "power": {
            "tdp_w": {
              "value": 350,
              "evidence_ref": "ev-l40s-001"
            }
          },
          "software_support": {
            "drivers": [
              "CUDA-12.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              },
              {
                "id": "tensorrt-llm",
                "status": "officially-supported",
                "versions": [
                  "0.14"
                ]
              },
              {
                "id": "lmdeploy",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp8-e5m2",
              "int8",
              "int4-awq",
              "int4-gptq",
              "w4a16"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "L40S",
            "L40 Server"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-l40s-001",
              "tier": "official",
              "source_type": "vendor-datasheet",
              "url": "https://www.nvidia.com/en-us/data-center/l40s/",
              "accessed": "2026-04-28",
              "citation": "NVIDIA L40S Datasheet (Ada Lovelace inference-optimized PCIe card)"
            },
            {
              "id": "ev-l40s-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf",
              "accessed": "2026-04-28",
              "citation": "NVIDIA Ada Lovelace Whitepaper (AD102 die: 142 SMs enabled in L40S, 96 MB L2, 76B transistors, 609 mm² @ TSMC 4N)"
            }
          ],
          "disclaimers": [
            "L40S uses GDDR6 (not HBM); memory bandwidth is much lower than H100/A100, making it bandwidth-bound for many decode workloads.",
            "Designed for graphics + inference workloads (gaming derivative); strong FP8/INT8 throughput per dollar.",
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "model": {
          "id": "qwen2.5-coder-32b",
          "name": "Qwen2.5-Coder 32B Instruct",
          "lab": "alibaba",
          "release_date": "2024-11-12",
          "license": "Apache-2.0",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "dense",
            "total_params_b": 32,
            "active_params_b": 32,
            "layers": 64,
            "hidden_size": 5120,
            "ffn_size": 27648,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 152064,
            "max_context_length": 131072,
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 60500000000,
              "bytes_per_token": 60500000000
            },
            {
              "operator": "attention",
              "flops_per_token": 3500000000,
              "bytes_per_token": 3500000000
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 6553600,
              "bytes_per_token": 819200
            },
            {
              "operator": "rope",
              "flops_per_token": 327680,
              "bytes_per_token": 40960
            },
            {
              "operator": "silu",
              "flops_per_token": 121634816,
              "bytes_per_token": 60817408
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-qwen36-mi325x8-sglang-001",
      "title": "Qwen3.6 Plus on 8× MI325X with SGLang FP8",
      "submitted_at": "2026-04-26",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mi325x",
          "count": 8,
          "topology": "single-node platform"
        },
        "server": {
          "id": "amd-mi325x-platform"
        },
        "interconnect": {
          "intra_node": "infinity-fabric",
          "inter_node": "roce-v2"
        },
        "model": {
          "id": "qwen3.6-plus",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "sglang",
          "version": "0.4.0"
        },
        "quantization": "fp8-e4m3",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 1,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "ROCm 6.2",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 32,
        "max_concurrent_requests": 128
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 3100,
          "prefill": 32000
        },
        "latency_ms": {
          "ttft_p50": 240,
          "ttft_p99": 400,
          "tbt_p50": 18,
          "tbt_p99": 36
        },
        "memory_per_card_gb": 95,
        "power_per_card_w": 880,
        "utilization": {
          "compute_pct": 52,
          "memory_bw_pct": 68
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "python -m sglang.launch_server --model Qwen/Qwen3.6-Plus --tp 8 --quantization fp8",
        "config_files": [],
        "benchmark_tool": "sglang.bench_serving"
      },
      "issues_encountered": [
        "ROCm FP8 calibration 必须在 SGLang 0.4 + Python 3.11; 3.10 上报 import error"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-qwen36-mi325-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://github.com/sgl-project/sglang",
          "accessed": "2026-04-28",
          "citation": "SGLang community Qwen 3.6 Plus benchmark on MI325X (numbers approximate from public discussions)",
          "contributor_attestation": "Numbers extracted from SGLang community thread; not independently re-run by submitter."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mi325x",
          "name": "AMD Instinct MI325X",
          "vendor": "amd",
          "generation": "cdna3",
          "status": "in-production",
          "release_year": 2024,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": {
              "value": 2614,
              "evidence_ref": "ev-mi325x-001"
            },
            "bf16_tflops": {
              "value": 1307,
              "evidence_ref": "ev-mi325x-001"
            },
            "fp16_tflops": {
              "value": 1307,
              "evidence_ref": "ev-mi325x-001"
            },
            "int8_tops": {
              "value": 2614,
              "evidence_ref": "ev-mi325x-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 304,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "compute_unit_label": "CU",
            "l2_cache_mb": {
              "value": 256,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "hbm_stacks": {
              "value": 8,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "process_node_nm": {
              "value": 5,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "die_area_mm2": {
              "value": 1017,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "transistor_count_b": {
              "value": 153,
              "evidence_ref": "ev-mi325x-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-mi325x-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-mi325x-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 256,
              "evidence_ref": "ev-mi325x-001"
            },
            "bandwidth_gbps": {
              "value": 6000,
              "evidence_ref": "ev-mi325x-001"
            },
            "type": "HBM3e"
          },
          "scale_up": {
            "protocol": "Infinity-Fabric",
            "bandwidth_gbps": 896,
            "world_size": 8,
            "topology": "fully-connected"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 1000,
              "evidence_ref": "ev-mi325x-001"
            }
          },
          "software_support": {
            "drivers": [
              "ROCm-6.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "int8",
              "int4-awq"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep"
            ]
          },
          "aliases": [
            "MI325X"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mi325x-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html",
              "accessed": "2026-04-28",
              "citation": "AMD MI325X product page"
            },
            {
              "id": "ev-mi325x-arch-001",
              "tier": "official",
              "source_type": "vendor-whitepaper",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html",
              "accessed": "2026-04-28",
              "citation": "MI325X reuses CDNA 3 die from MI300X (same 304 CUs, 256 MB Infinity Cache); upgraded to 8× HBM3e (32 GB stacks) ⇒ 256 GB capacity"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "amd-mi325x-platform",
          "name": "AMD Instinct MI325X Platform 8-OAM",
          "vendor": "amd",
          "type": "integrated-server",
          "card": "mi325x",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "Infinity-Fabric",
          "inter_node_interconnect": "RoCEv2-400G",
          "cooling": "air",
          "rack_power_kw": 13,
          "total_memory_gb": 2048,
          "total_compute_pflops_bf16": 10.5,
          "release_year": 2024,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-mi325plat-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html",
              "accessed": "2026-04-28",
              "citation": "AMD MI325X Platform reference"
            }
          ]
        },
        "model": {
          "id": "qwen3.6-plus",
          "name": "Qwen3.6 Plus",
          "lab": "alibaba",
          "release_date": "2026-03-25",
          "license": "Apache-2.0",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 480,
            "active_params_b": 35,
            "layers": 64,
            "hidden_size": 6144,
            "ffn_size": 16384,
            "num_attention_heads": 64,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 152064,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 128,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 38654705664,
              "bytes_per_token": 38654705664
            },
            {
              "operator": "attention",
              "flops_per_token": 12884901888,
              "bytes_per_token": 20132659200
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 50331648,
              "bytes_per_token": 12884901888
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 3932160,
              "bytes_per_token": 1572864
            }
          ],
          "modalities": [
            "text",
            "vision"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "sglang",
          "name": "SGLang",
          "maintainer": "community",
          "source_url": "https://github.com/sgl-project/sglang",
          "supported_hardware_vendors": [
            "nvidia",
            "amd"
          ],
          "latest_version": "0.4.0",
          "notes": "High-performance serving with RadixAttention and disaggregated prefill/decode"
        },
        "quantization": {
          "id": "fp8-e4m3",
          "name": "FP8 E4M3",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "fp8",
          "lossless": false,
          "description": "4-bit exponent, 3-bit mantissa; preferred for activations due to dynamic range"
        }
      }
    },
    {
      "id": "case-qwen35-397b-mi355x8-001",
      "title": "Qwen3.5 397B Reasoning on 8× MI355X with FP4",
      "submitted_at": "2026-04-24",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mi355x",
          "count": 8,
          "topology": "single-node platform"
        },
        "server": {
          "id": "amd-mi325x-platform"
        },
        "interconnect": {
          "intra_node": "infinity-fabric",
          "inter_node": "none"
        },
        "model": {
          "id": "qwen3.5-397b",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "version": "0.6.0"
        },
        "quantization": "fp4",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 4,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "ROCm 6.3",
        "os": "Ubuntu 22.04"
      },
      "scenario": {
        "prefill_seq_len": 4096,
        "decode_seq_len": 1024,
        "batch_size": 32,
        "max_concurrent_requests": 128
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 4500,
          "prefill": 52000
        },
        "latency_ms": {
          "ttft_p50": 220,
          "ttft_p99": 380,
          "tbt_p50": 12,
          "tbt_p99": 26
        },
        "memory_per_card_gb": 142,
        "power_per_card_w": 1180,
        "utilization": {
          "compute_pct": 58,
          "memory_bw_pct": 72
        }
      },
      "bottleneck": "memory-bandwidth",
      "reproduction": {
        "startup_command": "vllm serve Qwen/Qwen3.5-397B-Reasoning --tp 8 --quantization fp4",
        "config_files": [],
        "benchmark_tool": "vllm benchmark_serving.py"
      },
      "issues_encountered": [],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-qwen35-mi355-001",
          "tier": "measured",
          "source_type": "third-party-review",
          "url": "https://www.amd.com/en/products/accelerators/instinct/mi355x.html",
          "accessed": "2026-04-28",
          "citation": "AMD MI355X + Qwen3.5 reference benchmark",
          "contributor_attestation": "Numbers approximated from AMD MI355X reference benchmark coverage."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mi355x",
          "name": "AMD Instinct MI355X",
          "vendor": "amd",
          "generation": "cdna4",
          "status": "in-production",
          "release_year": 2025,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": {
              "value": 9200,
              "evidence_ref": "ev-mi355x-001"
            },
            "fp8_tflops": {
              "value": 4600,
              "evidence_ref": "ev-mi355x-001"
            },
            "bf16_tflops": {
              "value": 2300,
              "evidence_ref": "ev-mi355x-001"
            },
            "fp16_tflops": {
              "value": 2300,
              "evidence_ref": "ev-mi355x-001"
            },
            "int8_tops": {
              "value": 4600,
              "evidence_ref": "ev-mi355x-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 256,
              "evidence_ref": "ev-mi355x-arch-001"
            },
            "compute_unit_label": "CU",
            "l2_cache_mb": {
              "value": 256,
              "evidence_ref": "ev-mi355x-arch-001"
            },
            "hbm_stacks": {
              "value": 8,
              "evidence_ref": "ev-mi355x-arch-001"
            },
            "process_node_nm": {
              "value": 3,
              "evidence_ref": "ev-mi355x-arch-001"
            },
            "pcie_gen": {
              "value": 5,
              "evidence_ref": "ev-mi355x-001"
            },
            "pcie_lanes": {
              "value": 16,
              "evidence_ref": "ev-mi355x-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 288,
              "evidence_ref": "ev-mi355x-001"
            },
            "bandwidth_gbps": {
              "value": 8000,
              "evidence_ref": "ev-mi355x-001"
            },
            "type": "HBM3e"
          },
          "scale_up": {
            "protocol": "Infinity-Fabric",
            "bandwidth_gbps": 1075,
            "world_size": 8,
            "topology": "fully-connected"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 400,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 1400,
              "evidence_ref": "ev-mi355x-001"
            }
          },
          "software_support": {
            "drivers": [
              "ROCm-6.x"
            ],
            "engines": [
              {
                "id": "vllm",
                "status": "officially-supported",
                "versions": [
                  "0.6"
                ]
              },
              {
                "id": "sglang",
                "status": "officially-supported",
                "versions": [
                  "0.4"
                ]
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "fp8-e4m3",
              "fp4",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp",
              "ep",
              "sp"
            ]
          },
          "aliases": [
            "MI355X"
          ],
          "chinese_names": [],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mi355x-001",
              "tier": "official",
              "source_type": "vendor-press-release",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi355x.html",
              "accessed": "2026-04-28",
              "citation": "AMD MI355X announcement (vendor-claimed)"
            },
            {
              "id": "ev-mi355x-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi355x.html",
              "accessed": "2026-04-28",
              "citation": "CDNA 4 architecture: 256 CUs (4 XCDs × 64 CU configurable), 256 MB Infinity Cache, 8× HBM3e stacks @ 36 GB ⇒ 288 GB, FP4 native @ 9.2 PFLOPS, TSMC 3nm"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured."
          ]
        },
        "server": {
          "id": "amd-mi325x-platform",
          "name": "AMD Instinct MI325X Platform 8-OAM",
          "vendor": "amd",
          "type": "integrated-server",
          "card": "mi325x",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "Infinity-Fabric",
          "inter_node_interconnect": "RoCEv2-400G",
          "cooling": "air",
          "rack_power_kw": 13,
          "total_memory_gb": 2048,
          "total_compute_pflops_bf16": 10.5,
          "release_year": 2024,
          "aliases": [],
          "chinese_names": [],
          "evidence": [
            {
              "id": "ev-mi325plat-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html",
              "accessed": "2026-04-28",
              "citation": "AMD MI325X Platform reference"
            }
          ]
        },
        "model": {
          "id": "qwen3.5-397b",
          "name": "Qwen3.5 397B Reasoning",
          "lab": "alibaba",
          "release_date": "2026-03-05",
          "license": "Apache-2.0",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 397,
            "active_params_b": 22,
            "layers": 64,
            "hidden_size": 5120,
            "ffn_size": 14336,
            "num_attention_heads": 40,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 152064,
            "max_context_length": 131072,
            "moe": {
              "num_experts": 128,
              "top_k": 8,
              "expert_hidden_size": 1536,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 28185722880,
              "bytes_per_token": 28185722880
            },
            {
              "operator": "attention",
              "flops_per_token": 9395240960,
              "bytes_per_token": 14495514624
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 41943040,
              "bytes_per_token": 8053063680
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 3276800,
              "bytes_per_token": 1310720
            }
          ],
          "modalities": [
            "text"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "vllm",
          "name": "vLLM",
          "maintainer": "community",
          "source_url": "https://github.com/vllm-project/vllm",
          "documentation_url": "https://docs.vllm.ai/",
          "supported_hardware_vendors": [
            "nvidia",
            "amd",
            "intel",
            "aws",
            "google",
            "huawei",
            "cambricon",
            "hygon",
            "moore-threads"
          ],
          "latest_version": "0.6.0",
          "notes": "Most widely used; ascend / rocm / musa forks for non-NVIDIA hardware"
        },
        "quantization": {
          "id": "fp4",
          "name": "FP4",
          "bits_per_weight": 4,
          "bits_per_activation": 8,
          "family": "fp4",
          "lossless": false,
          "description": "Microscaling FP4; introduced on Blackwell B200/B300 and AMD MI355X for inference"
        }
      }
    },
    {
      "id": "case-qwen36plus-mlu590x8-001",
      "title": "Qwen3.6 Plus on 8× Cambricon MLU590 with LMDeploy",
      "submitted_at": "2026-04-22",
      "submitter": {
        "github": "@evokernel-bot"
      },
      "stack": {
        "hardware": {
          "id": "mlu590",
          "count": 8,
          "topology": "single-node X8"
        },
        "server": {
          "id": "cambricon-x8-server"
        },
        "interconnect": {
          "intra_node": "mlu-link-v2",
          "inter_node": "none"
        },
        "model": {
          "id": "qwen3.6-plus",
          "weight_format": "bf16"
        },
        "engine": {
          "id": "lmdeploy",
          "version": "0.6.0"
        },
        "quantization": "int8",
        "parallel": {
          "tp": 8,
          "pp": 1,
          "ep": 4,
          "sp": 1,
          "disaggregated": false
        },
        "driver": "Neuware 3.5",
        "os": "KylinOS 10"
      },
      "scenario": {
        "prefill_seq_len": 2048,
        "decode_seq_len": 512,
        "batch_size": 16,
        "max_concurrent_requests": 64
      },
      "results": {
        "throughput_tokens_per_sec": {
          "decode": 380,
          "prefill": 5800
        },
        "latency_ms": {
          "ttft_p50": 580,
          "ttft_p99": 920,
          "tbt_p50": 92,
          "tbt_p99": 156
        },
        "memory_per_card_gb": 48,
        "power_per_card_w": 310,
        "utilization": {
          "compute_pct": 26,
          "memory_bw_pct": 58
        }
      },
      "bottleneck": "software",
      "reproduction": {
        "startup_command": "lmdeploy serve api_server Qwen/Qwen3.6-Plus --tp 8 --backend mlu --quantization int8",
        "config_files": [],
        "benchmark_tool": "lmdeploy bench"
      },
      "issues_encountered": [
        "INT8 calibration 用了 1024 sample, BLEU 比 BF16 略降 (~0.3)"
      ],
      "patterns": [
        "memory-bound-decode-prefer-int8"
      ],
      "evidence": [
        {
          "id": "ev-case-qwen36-mlu590-001",
          "tier": "measured",
          "source_type": "community-benchmark",
          "url": "https://github.com/InternLM/lmdeploy",
          "accessed": "2026-04-28",
          "citation": "LMDeploy community Cambricon backend benchmark",
          "contributor_attestation": "Numbers extracted from LMDeploy MLU community port testing; not independently re-run."
        }
      ],
      "resolved": {
        "hardware": {
          "id": "mlu590",
          "name": "寒武纪 思元 590",
          "vendor": "cambricon",
          "generation": "mlu590",
          "status": "in-production",
          "release_year": 2023,
          "form_factor": "oam",
          "compute": {
            "fp4_tflops": null,
            "fp8_tflops": null,
            "bf16_tflops": {
              "value": 256,
              "evidence_ref": "ev-mlu590-001"
            },
            "fp16_tflops": {
              "value": 256,
              "evidence_ref": "ev-mlu590-001"
            },
            "int8_tops": {
              "value": 512,
              "evidence_ref": "ev-mlu590-001"
            }
          },
          "architecture": {
            "compute_unit_count": {
              "value": 80,
              "evidence_ref": "ev-mlu590-arch-001"
            },
            "compute_unit_label": "IPU",
            "hbm_stacks": {
              "value": 4,
              "evidence_ref": "ev-mlu590-arch-001"
            },
            "process_node_nm": {
              "value": 7,
              "evidence_ref": "ev-mlu590-arch-001"
            }
          },
          "memory": {
            "capacity_gb": {
              "value": 64,
              "evidence_ref": "ev-mlu590-001"
            },
            "bandwidth_gbps": {
              "value": 1228,
              "evidence_ref": "ev-mlu590-001"
            },
            "type": "HBM2e"
          },
          "scale_up": {
            "protocol": "MLU-Link-v2",
            "bandwidth_gbps": 400,
            "world_size": 8,
            "topology": "switched"
          },
          "scale_out": {
            "bandwidth_gbps_per_card": 200,
            "protocol": "RoCEv2"
          },
          "power": {
            "tdp_w": {
              "value": 350,
              "evidence_ref": "ev-mlu590-001"
            }
          },
          "software_support": {
            "drivers": [
              "Neuware-3.x"
            ],
            "engines": [
              {
                "id": "lmdeploy",
                "status": "community-port",
                "versions": []
              }
            ],
            "quantizations": [
              "bf16",
              "fp16",
              "int8"
            ],
            "parallelism": [
              "tp",
              "pp"
            ]
          },
          "aliases": [
            "MLU590",
            "思元590"
          ],
          "chinese_names": [
            "思元590"
          ],
          "photos": [],
          "evidence": [
            {
              "id": "ev-mlu590-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "Cambricon MLU590 product overview (limited public detail)"
            },
            {
              "id": "ev-mlu590-arch-001",
              "tier": "estimated",
              "source_type": "vendor-press-release",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "MLU590 (思元590) MLUarch03 architecture: 80 IPUs (Intelligence Processing Units), 4× HBM2e stacks ⇒ 64 GB; SMIC N+1 / 7nm-class"
            }
          ],
          "disclaimers": [
            "All performance figures are vendor-claimed unless tier=measured.",
            "Some specs partial; community contributions welcome to fill gaps."
          ]
        },
        "server": {
          "id": "cambricon-x8-server",
          "name": "Cambricon 思元 X8 Server",
          "vendor": "cambricon",
          "type": "integrated-server",
          "card": "mlu590",
          "card_count": 8,
          "scale_up_domain_size": 8,
          "intra_node_interconnect": "MLU-Link-v2",
          "inter_node_interconnect": "RoCEv2-200G",
          "cooling": "air",
          "rack_power_kw": 4,
          "total_memory_gb": 512,
          "release_year": 2023,
          "aliases": [],
          "chinese_names": [
            "思元X8训推服务器"
          ],
          "evidence": [
            {
              "id": "ev-camb-x8-001",
              "tier": "official",
              "source_type": "vendor-product-page",
              "url": "https://www.cambricon.com/",
              "accessed": "2026-04-28",
              "citation": "Cambricon 思元 X8 server reference"
            }
          ]
        },
        "model": {
          "id": "qwen3.6-plus",
          "name": "Qwen3.6 Plus",
          "lab": "alibaba",
          "release_date": "2026-03-25",
          "license": "Apache-2.0",
          "domain": "llm",
          "workload_kind": "autoregressive-decode",
          "architecture": {
            "family": "moe",
            "total_params_b": 480,
            "active_params_b": 35,
            "layers": 64,
            "hidden_size": 6144,
            "ffn_size": 16384,
            "num_attention_heads": 64,
            "num_kv_heads": 8,
            "head_dim": 128,
            "vocab_size": 152064,
            "max_context_length": 1048576,
            "moe": {
              "num_experts": 128,
              "top_k": 8,
              "expert_hidden_size": 2048,
              "shared_experts": 0
            },
            "attention_type": "gqa"
          },
          "operator_decomposition": [
            {
              "operator": "matmul",
              "flops_per_token": 38654705664,
              "bytes_per_token": 38654705664
            },
            {
              "operator": "attention",
              "flops_per_token": 12884901888,
              "bytes_per_token": 20132659200
            },
            {
              "operator": "moe-gate",
              "flops_per_token": 50331648,
              "bytes_per_token": 12884901888
            },
            {
              "operator": "rmsnorm",
              "flops_per_token": 3932160,
              "bytes_per_token": 1572864
            }
          ],
          "modalities": [
            "text",
            "vision"
          ],
          "weight_format": "bf16"
        },
        "engine": {
          "id": "lmdeploy",
          "name": "LMDeploy",
          "maintainer": "mixed",
          "source_url": "https://github.com/InternLM/lmdeploy",
          "supported_hardware_vendors": [
            "nvidia",
            "huawei",
            "cambricon"
          ],
          "latest_version": "0.6.0",
          "notes": "TurboMind backend; friendly to Chinese hardware ecosystems"
        },
        "quantization": {
          "id": "int8",
          "name": "INT8",
          "bits_per_weight": 8,
          "bits_per_activation": 8,
          "family": "int",
          "lossless": false,
          "description": "Symmetric or asymmetric int8 quantization; widely supported"
        }
      }
    }
  ]
}