    ModelProfile("llama3.2:1b", "llama", 1.24, "Q4_K_M", 0.9, 2.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "llama3.2:1b", "16 layers, 8 KV heads, head_dim 64. GGUF Q4_K_M file size.", weight_gb=0.81, kv_per_1k_gb=0.031, layers=16),
    ModelProfile("llama3.2:3b", "llama", 3.21, "fp16", 6.9, 8.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "llama3.2:3b", "28 layers, 8 KV heads, head_dim 128. GGUF fp16 file size.", weight_gb=6.43, kv_per_1k_gb=0.107, layers=28),
    ModelProfile("llama3.1:8b", "llama", 8.03, "fp16", 16.5, 18.5, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "llama3.1:8b", "32 layers, 8 KV heads, head_dim 128. Estimated from GGUF f32 file size (32.1 GB) / 2.", weight_gb=16.05, kv_per_1k_gb=0.122, layers=32),
    ModelProfile("llama3.2-11b:vision", "llama", 10.6, "Q4_K_M", 6.4, 8.4, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision', 'agentic', 'tools'], ['chat', 'vision', 'agentic', 'tools'], "llama3.2-11b:vision", "Multimodal (vision adapter on Llama 3.1 8B). 40 layers (32 self-attn + 8 cross-attn), 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=5.96, kv_per_1k_gb=0.122, layers=40),
    ModelProfile("llama3.3:70b", "llama", 70.6, "Q4_K_M", 43.7, 45.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "llama3.3:70b", "80 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=42.52, kv_per_1k_gb=0.305, layers=80),
    ModelProfile("llama-4:scout", "llama", 109, "Q4_K_M", 66.1, 68.1, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision', 'agentic', 'tools'], ['chat', 'vision', 'agentic', 'tools'], "llama-4:scout", "MoE: 109B total, 17B active (16 experts). 48 layers, 8 KV heads, head_dim 128, hybrid attention (12 global + 36 local, chunk=8192). GGUF Q4_K_M file size.", weight_gb=65.4, kv_per_1k_gb=0.183, layers=48),
    ModelProfile("llama-4:maverick", "llama", 400, "Q4_K_M", 243.7, 245.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision', 'agentic', 'tools'], ['chat', 'vision', 'agentic', 'tools'], "llama-4:maverick", "MoE: 400B total, 17B active (128 experts). 48 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=243, kv_per_1k_gb=0.183, layers=48),
    ModelProfile("qwen2.5:3b", "qwen", 3.09, "Q4_K_M", 2.1, 4.1, 32, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "qwen2.5:3b", "36 layers, 2 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=1.93, kv_per_1k_gb=0.034, layers=36),
    ModelProfile("qwen2.5:7b", "qwen", 7.62, "Q4_K_M", 4.9, 6.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "qwen2.5:7b", "28 layers, 4 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=4.68, kv_per_1k_gb=0.053, layers=28),
    ModelProfile("qwen2.5:14b", "qwen", 14.7, "Q4_K_M", 9.7, 11.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "qwen2.5:14b", "48 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=8.99, kv_per_1k_gb=0.183, layers=48),
    ModelProfile("qwen2.5:32b", "qwen", 32.5, "Q4_K_M", 20.8, 22.8, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "qwen2.5:32b", "64 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=19.85, kv_per_1k_gb=0.244, layers=64),
    ModelProfile("qwen2.5:72b", "qwen", 72.7, "Q4_K_M", 48.6, 50.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "qwen2.5:72b", "80 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=47.42, kv_per_1k_gb=0.305, layers=80),
    ModelProfile("qwen2.5-vl:3b", "qwen", 3.09, "Q4_K_M", 2.1, 4.1, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision'], ['chat', 'vision'], "qwen2.5-vl:3b", "Vision-language model. Same text backbone as Qwen 2.5 3B. 36 layers, 2 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=1.93, kv_per_1k_gb=0.034, layers=36),
    ModelProfile("qwen2.5-vl:7b", "qwen", 7.62, "Q4_K_M", 4.9, 6.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision', 'agentic', 'tools'], ['chat', 'vision', 'agentic', 'tools'], "qwen2.5-vl:7b", "Vision-language model. Same text backbone as Qwen 2.5 7B. 28 layers, 4 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=4.68, kv_per_1k_gb=0.053, layers=28),
    ModelProfile("qwen2.5-vl:72b", "qwen", 72.7, "Q4_K_M", 48.6, 50.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision', 'agentic', 'tools'], ['chat', 'vision', 'agentic', 'tools'], "qwen2.5-vl:72b", "Vision-language model. Same text backbone as Qwen 2.5 72B. 80 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=47.4, kv_per_1k_gb=0.305, layers=80),
    ModelProfile("qwen3:8b", "qwen", 8.2, "Q4_K_M", 5.6, 7.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "qwen3:8b", "Reasoning model (thinking/non-thinking modes). 36 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=5.03, kv_per_1k_gb=0.137, layers=36),
    ModelProfile("qwen3:32b", "qwen", 32.8, "Q4_K_M", 20.8, 22.8, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "qwen3:32b", "Reasoning model (thinking/non-thinking modes). 64 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=19.8, kv_per_1k_gb=0.244, layers=64),
    ModelProfile("qwen3-coder-30b:a3b", "qwen", 30.5, "Q4_K_M", 19.0, 21.0, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "qwen3-coder-30b:a3b", "MoE: 30.5B total, 3.3B active (128 experts, 8 active). Coding-specialized, non-thinking mode only. 48 layers, 4 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=18.6, kv_per_1k_gb=0.092, layers=48),
    ModelProfile("qwen3-next-80b:a3b", "qwen", 80, "Q4_K_M", 48.5, 50.5, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "qwen3-next-80b:a3b", "MoE: 80B total, 3B active (512 experts). Hybrid architecture (Gated DeltaNet + Gated Attention). Only 12 of 48 layers use attention (2 KV heads, head_dim 256). KV estimate covers attention layers only; DeltaNet layers use fixed-size recurrent state. Instruct-only (no thinking mode). GGUF Q4_K_M file size.", weight_gb=48.4, kv_per_1k_gb=0.023, layers=48),
    ModelProfile("deepseek-r1:7b", "qwen", 7.62, "Q4_K_M", 4.9, 6.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'chat', 'code'], ['math', 'coding', 'reasoning', 'chat', 'code'], "deepseek-r1:7b", "Reasoning model (distilled from R1). Based on Qwen 2.5 Math 7B. 28 layers, 4 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=4.68, kv_per_1k_gb=0.053, layers=28),
    ModelProfile("deepseek-r1:14b", "qwen", 14.7, "Q4_K_M", 9.7, 11.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'chat', 'code'], ['math', 'coding', 'reasoning', 'chat', 'code'], "deepseek-r1:14b", "Reasoning model (distilled from R1). Based on Qwen 2.5 14B. 48 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=8.99, kv_per_1k_gb=0.183, layers=48),
    ModelProfile("deepseek-r1:32b", "qwen", 32.5, "Q4_K_M", 20.8, 22.8, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'chat', 'code'], ['math', 'coding', 'reasoning', 'chat', 'code'], "deepseek-r1:32b", "Reasoning model (distilled from R1). Based on Qwen 2.5 32B. 64 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=19.85, kv_per_1k_gb=0.244, layers=64),
    ModelProfile("deepseek-r1:70b", "llama", 70.6, "Q4_K_M", 43.7, 45.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'chat', 'code'], ['math', 'coding', 'reasoning', 'chat', 'code'], "deepseek-r1:70b", "Reasoning model (distilled from R1). Based on Llama 3.3 70B. 80 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=42.52, kv_per_1k_gb=0.305, layers=80),
    # NOTE(review): the second argument is "unknown" here, yet the description
    # says this model is based on Qwen 2.5 32B. The deepseek-r1:32b entry, which
    # is also described as based on Qwen 2.5 32B, passes "qwen" in that slot.
    # Confirm whether "unknown" is intentional or should match.
    ModelProfile("qwq:32b", "unknown", 32.5, "Q4_K_M", 20.9, 22.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "qwq:32b", "Reasoning model. Based on Qwen 2.5 32B. 64 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=19.9, kv_per_1k_gb=0.244, layers=64),
    ModelProfile("phi-4-mini:3.8b", "phi", 3.84, "Q4_K_M", 3.0, 5.0, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "phi-4-mini:3.8b", "32 layers, 8 KV heads, head_dim 128. 200K vocabulary. GGUF Q4_K_M file size.", weight_gb=2.49, kv_per_1k_gb=0.122, layers=32),
    ModelProfile("phi-4:14b", "phi", 14.7, "Q4_K_M", 9.8, 11.8, 16, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "phi-4:14b", "40 layers, 10 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=9.06, kv_per_1k_gb=0.191, layers=40),
    # NOTE(review): kv_per_1k_gb=0.191 is consistent with the stated
    # 40 layers x 8 KV heads x head_dim 160 (K+V, 2 bytes each, per 1000 tokens,
    # in GiB). However, Mistral-Nemo is believed to set head_dim=128 explicitly
    # in its config rather than hidden_size / num_heads = 160 -- TODO confirm
    # against the published config.json. If 128 is correct, the KV figure would
    # be ~0.153 (the value used by the other 40-layer / 8-KV-head / head_dim-128
    # entries such as mistral-small-3.1:24b), and any size figures derived from
    # it would need recomputing.
    ModelProfile("mistral-nemo:12b", "mistral", 12.2, "Q4_K_M", 8.4, 10.4, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "mistral-nemo:12b", "Co-developed with NVIDIA. 40 layers, 8 KV heads, head_dim 160. GGUF Q4_K_M file size.", weight_gb=7.6, kv_per_1k_gb=0.191, layers=40),
    ModelProfile("mistral-small-3.1:24b", "mistral", 23.6, "Q4_K_M", 14.9, 16.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['vision', 'coding', 'agentic', 'chat', 'code', 'tools'], ['vision', 'coding', 'agentic', 'chat', 'code', 'tools'], "mistral-small-3.1:24b", "40 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=14.33, kv_per_1k_gb=0.153, layers=40),
    ModelProfile("gemma-2:9b", "gemma", 9.24, "Q4_K_M", 7.0, 9.0, 8, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "gemma-2:9b", "42 layers, 8 KV heads, head_dim 256. GGUF Q4_K_M file size.", weight_gb=5.76, kv_per_1k_gb=0.321, layers=42),
    ModelProfile("gemma-2:27b", "gemma", 27.2, "Q4_K_M", 18.1, 20.1, 8, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "gemma-2:27b", "46 layers, 16 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=16.65, kv_per_1k_gb=0.351, layers=46),
    ModelProfile("gemma-3:4b", "gemma", 3.88, "Q4_K_M", 3.1, 5.1, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'vision'], ['chat', 'vision'], "gemma-3:4b", "34 layers (5:1 local:global, sw=1024), head_dim 256. KV estimate from Gemma 3 tech report. GGUF Q4_K_M file size.", weight_gb=2.49, kv_per_1k_gb=0.147, layers=34),
    ModelProfile("gemma-3:12b", "gemma", 11.8, "Q4_K_M", 9.2, 11.2, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding', 'vision'], ['code', 'chat', 'coding', 'vision'], "gemma-3:12b", "48 layers (5:1 local:global, sw=1024), head_dim 256. KV estimate from Gemma 3 tech report. GGUF Q4_K_M file size.", weight_gb=7.3, kv_per_1k_gb=0.466, layers=48),
    ModelProfile("gemma-3:27b", "gemma", 27.0, "Q4_K_M", 18.9, 20.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding', 'vision'], ['code', 'chat', 'coding', 'vision'], "gemma-3:27b", "62 layers (5:1 local:global, sw=1024), head_dim 256. KV estimate from Gemma 3 tech report. GGUF Q4_K_M file size.", weight_gb=16.55, kv_per_1k_gb=0.584, layers=62),
    ModelProfile("glm-4.7:flash", "unknown", 31, "Q4_K_M", 18.5, 20.5, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "glm-4.7:flash", "MoE: 30B total, 3B active (64 experts, 4 active). 47 layers, uses MLA (kv_lora_rank=512). KV estimate is for compressed MLA cache. GGUF Q4_K_M file size.", weight_gb=18.3, kv_per_1k_gb=0.05, layers=47),
    ModelProfile("nemotron-nano:12b", "unknown", 12.6, "Q4_K_M", 7.5, 9.5, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "nemotron-nano:12b", "Hybrid Mamba-Transformer (6 attention + 56 Mamba-2 layers). 8 KV heads, head_dim 128 on attention layers only. KV estimate covers attention layers; Mamba layers use fixed-size state (~140 MB). GGUF Q4_K_M file size.", weight_gb=7.37, kv_per_1k_gb=0.023, layers=62),
    ModelProfile("qwen2.5-coder:7b", "qwen", 7.62, "Q4_K_M", 4.9, 6.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "qwen2.5-coder:7b", "Code model. Based on Qwen 2.5 7B. 28 layers, 4 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=4.68, kv_per_1k_gb=0.053, layers=28),
    ModelProfile("qwen2.5-coder:14b", "qwen", 14.7, "Q4_K_M", 9.7, 11.7, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "qwen2.5-coder:14b", "Code model. Based on Qwen 2.5 14B. 48 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=8.99, kv_per_1k_gb=0.183, layers=48),
    ModelProfile("qwen2.5-coder:32b", "qwen", 32.5, "Q4_K_M", 20.8, 22.8, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "qwen2.5-coder:32b", "Code model. Based on Qwen 2.5 32B. 64 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=19.85, kv_per_1k_gb=0.244, layers=64),
    # NOTE(review): the description says "Code model. Mistral architecture.",
    # but the second argument is "unknown" (other entries pass "mistral") and
    # both tag lists contain only 'chat'. starcoder2:15b, also described as a
    # code model, carries 'code' and 'coding'. Confirm whether this is intended.
    ModelProfile("codestral:22b", "unknown", 22.2, "Q4_K_M", 14.2, 16.2, 32, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "codestral:22b", "Code model. Mistral architecture. 56 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=13.3, kv_per_1k_gb=0.214, layers=56),
    ModelProfile("starcoder2:15b", "unknown", 15.6, "Q4_K_M", 10.2, 12.2, 16, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "starcoder2:15b", "Code model. BigCode. 40 layers, 4 KV heads, head_dim 128. Trained on 600+ languages. GGUF Q4_K_M file size.", weight_gb=9.86, kv_per_1k_gb=0.076, layers=40),
    ModelProfile("phi-3-mini:3.8b", "phi", 3.82, "Q4_K_M", 3.9, 5.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "phi-3-mini:3.8b", "32 layers, 32 KV heads (full MHA), head_dim 96. GGUF Q4_K_M file size.", weight_gb=2.39, kv_per_1k_gb=0.366, layers=32),
    ModelProfile("deepseek-coder:6.7b", "deepseek", 6.74, "Q4_K_M", 6.0, 8.0, 16, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['code', 'chat', 'coding'], ['code', 'chat', 'coding'], "deepseek-coder:6.7b", "Code model. Llama architecture. 32 layers, 32 KV heads (full MHA), head_dim 128. 16K context. GGUF Q4_K_M file size.", weight_gb=4.08, kv_per_1k_gb=0.488, layers=32),
    # NOTE(review): the next four entries pass "unknown" as the second argument
    # although their descriptions name a known base: Qwen 2.5 3B Instruct for
    # smallthinker, and Mistral / Mistral Small for ministral, devstral-small
    # and magistral-small. Entries such as deepseek-r1:7b pass the base model's
    # label ("qwen") in that slot. Confirm whether "unknown" is deliberate.
    ModelProfile("smallthinker:3b", "unknown", 3.09, "Q4_K_M", 2.1, 4.1, 32, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat', 'math', 'reasoning'], ['chat', 'math', 'reasoning'], "smallthinker:3b", "Reasoning model. Fine-tuned from Qwen 2.5 3B Instruct. 36 layers, 2 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=1.93, kv_per_1k_gb=0.034, layers=36),
    ModelProfile("ministral:8b", "unknown", 8.02, "Q4_K_M", 5.6, 7.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "ministral:8b", "Mistral edge model. 36 layers, 8 KV heads, head_dim 128. Interleaved sliding-window attention. GGUF Q4_K_M file size.", weight_gb=5.07, kv_per_1k_gb=0.137, layers=36),
    ModelProfile("devstral-small:24b", "unknown", 23.6, "Q4_K_M", 14.9, 16.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['coding', 'agentic', 'chat', 'code', 'tools'], ['coding', 'agentic', 'chat', 'code', 'tools'], "devstral-small:24b", "Code model. Fine-tuned from Mistral Small 3.1 (vision encoder removed). 40 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=14.33, kv_per_1k_gb=0.153, layers=40),
    ModelProfile("magistral-small:24b", "unknown", 23.6, "Q4_K_M", 14.9, 16.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], "magistral-small:24b", "Reasoning model (Magistral 1.2). Based on Mistral Small 3.2. 40 layers, 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=14.33, kv_per_1k_gb=0.153, layers=40),
    ModelProfile("gpt-oss:20b", "unknown", 21, "Q4_K_M", 11.9, 13.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "gpt-oss:20b", "MoE: 21B total, 3.6B active (32 experts, 4 per token). OpenAI open-weight model. 24 layers, 8 KV heads, head_dim 64. FFN weights stored as MXFP4. GGUF Q4_K_M file size.", weight_gb=11.67, kv_per_1k_gb=0.046, layers=24),
    ModelProfile("minimax:m2.1", "unknown", 229, "Q4_K_M", 138.3, 140.3, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'agentic', 'chat', 'code', 'tools'], ['math', 'coding', 'reasoning', 'agentic', 'chat', 'code', 'tools'], "minimax:m2.1", "MoE: 229B total, ~10B active (128 experts, 2 per token). Hybrid lightning/softmax attention. KV estimate covers softmax attention layers only. GGUF Q4_K_M file size.", weight_gb=138, kv_per_1k_gb=0.077, layers=80),
    ModelProfile("step-3.5:flash", "unknown", 197, "Q4_K_M", 119.7, 121.7, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'coding', 'reasoning', 'agentic', 'chat', 'code', 'tools'], ['math', 'coding', 'reasoning', 'agentic', 'chat', 'code', 'tools'], "step-3.5:flash", "MoE: 197B total, 11B active (288 experts, Top-8). 45 layers, 3:1 SWA ratio. KV estimate based on estimated 8 KV heads, head_dim 128. GGUF Q4_K_M file size.", weight_gb=119, kv_per_1k_gb=0.172, layers=45),
    ModelProfile("glm:4.6", "unknown", 357, "Q4_K_M", 216.3, 218.3, 200, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "glm:4.6", "MoE: 357B total. Expanded context to 200K from GLM-4.5. Uses MLA. KV estimate is for compressed MLA cache. GGUF Q4_K_M file size.", weight_gb=216, kv_per_1k_gb=0.067, layers=62),
    ModelProfile("qwen3.5:0.8b", "qwen", 0.8, "Q4_K_M", 0.6, 2.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5:0.8b", "GGUF Q4_K_M file size from Qwen3.5-0.8B-Q4_K_M.gguf.", weight_gb=0.5, kv_per_1k_gb=0.02, layers=None),
    ModelProfile("qwen3.5:2b", "qwen", 2.0, "Q4_K_M", 1.3, 3.3, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5:2b", "GGUF Q4_K_M file size from Qwen3.5-2B-Q4_K_M.gguf.", weight_gb=1.19, kv_per_1k_gb=0.04, layers=None),
    ModelProfile("qwen3.5:4b", "qwen", 4.0, "Q4_K_M", 2.9, 4.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5:4b", "GGUF Q4_K_M file size from Qwen3.5-4B-Q4_K_M.gguf.", weight_gb=2.55, kv_per_1k_gb=0.08, layers=None),
    ModelProfile("qwen3.5:9b", "qwen", 9.0, "Q4_K_M", 5.9, 7.9, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5:9b", "GGUF Q4_K_M file size from Qwen3.5-9B-Q4_K_M.gguf.", weight_gb=5.29, kv_per_1k_gb=0.16, layers=None),
    ModelProfile("qwen3.5:27b", "qwen", 27.0, "Q4_K_M", 17.5, 19.5, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5:27b", "GGUF Q4_K_M file size from Qwen3.5-27B-Q4_K_M.gguf.", weight_gb=15.59, kv_per_1k_gb=0.48, layers=None),
    ModelProfile("qwen3.5-35b:a3b", "qwen", 35.0, "Q4_K_M", 23.0, 25.0, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5-35b:a3b", "GGUF Q4_K_M file size from Qwen3.5-35B-A3B-Q4_K_M.gguf.", weight_gb=20.5, kv_per_1k_gb=0.62, layers=None),
    # NOTE(review): the next four entries cite ONE shard of a multi-part GGUF
    # ("-0000X-of-0000Y.gguf") as their size source. Their weight_gb values
    # (~23 GB each) and the two size figures that precede the context value
    # therefore look like the size of a single shard, not of the whole model:
    # e.g. 23.27 GB for an entry labelled 122B parameters, versus 20.5 GB for
    # the 35B qwen3.5-35b:a3b entry. The parameter counts given for
    # minimax:m2.5 (56.85) and glm:5 (56.41) are presumably derived from the
    # same partial size -- verify. TODO: recompute from the sum of all shards.
    ModelProfile("qwen3.5-122b:a10b", "qwen", 122.0, "Q4_K_M", 26.6, 28.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5-122b:a10b", "GGUF Q4_K_M file size from Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf.", weight_gb=23.27, kv_per_1k_gb=0.84, layers=None),
    ModelProfile("qwen3.5-397b:a17b", "qwen", 397.0, "Q4_K_M", 29.2, 31.2, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['chat'], ['chat'], "qwen3.5-397b:a17b", "GGUF Q4_K_M file size from Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00004-of-00006.gguf.", weight_gb=22.95, kv_per_1k_gb=1.56, layers=None),
    ModelProfile("minimax:m2.5", "unknown", 56.85, "Q4_K_M", 26.6, 28.6, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "minimax:m2.5", "GGUF Q4_K_M file size from Q4_K_M/MiniMax-M2.5-Q4_K_M-00003-of-00004.gguf.", weight_gb=23.1, kv_per_1k_gb=0.88, layers=None),
    ModelProfile("glm:5", "unknown", 56.41, "Q4_K_M", 26.3, 28.3, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['tools', 'chat', 'agentic'], ['tools', 'chat', 'agentic'], "glm:5", "GGUF Q4_K_M file size from Q4_K_M/GLM-5-Q4_K_M-00002-of-00011.gguf.", weight_gb=22.9, kv_per_1k_gb=0.86, layers=None),
    ModelProfile("gemma-4:e2b", "gemma", 2.3, "Q4_K_M", 3.4, 5.4, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], "gemma-4:e2b", "35 layers, 1 KV head, head_dim 256 (MQA). Multimodal MoE edge model with 5:1 local:global attention (512-token sliding window). GGUF Q4_K_M file size.", weight_gb=3.22, kv_per_1k_gb=0.033, layers=35),
    ModelProfile("gemma-4:e4b", "gemma", 4.5, "Q4_K_M", 5.4, 7.4, 128, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], "gemma-4:e4b", "42 layers, 2 KV heads, head_dim 256. Multimodal MoE edge model with mixed sliding-window (512-token) and full attention. GGUF Q4_K_M file size.", weight_gb=5.03, kv_per_1k_gb=0.08, layers=42),
    ModelProfile("gemma-4-26b:a4b", "gemma", 26.0, "Q4_K_M", 15.9, 17.9, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], "gemma-4-26b:a4b", "30 layers (5:1 local:global), 8 KV heads (local, sw=1024, head_dim 256), 2 KV heads (global, head_dim 512). Sparse MoE (128 experts, 8 active). KV per 1K from global attention only; local layers add ~0.20 GB fixed overhead. GGUF Q4_K_M file size.", weight_gb=15.87, kv_per_1k_gb=0.019, layers=30),
    ModelProfile("gemma-4:31b", "gemma", 31.0, "Q4_K_M", 18.6, 20.6, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'vision', 'reasoning', 'agentic', 'chat', 'tools'], "gemma-4:31b", "60 layers (5:1 local:global), 16 KV heads (local, head_dim 256, sw=1024), 4 KV heads (global, head_dim 512). KV per 1K from global attention only; local layers add ~0.78 GB fixed overhead. GGUF Q4_K_M file size.", weight_gb=18.25, kv_per_1k_gb=0.076, layers=60),
    ModelProfile("nemotron-3-nano:4b", "unknown", 3.97, "Q4_K_M", 2.8, 4.8, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "nemotron-3-nano:4b", "Hybrid Mamba-Transformer (4 attention + 38 Mamba-2 layers). 8 KV heads, head_dim 128 on attention layers only. KV estimate covers attention layers; Mamba layers use fixed-size state. GGUF Q4_K_M file size.", weight_gb=2.77, kv_per_1k_gb=0.015, layers=42),
    ModelProfile("nemotron-cascade-2-30b:a3b", "unknown", 30.0, "Q4_K_M", 23.1, 25.1, 256, {'rtx3090': 50, 'rtx4090': 80, 'm2max': 30, 'cpu': 2.0}, 2.0, ['math', 'reasoning', 'agentic', 'chat', 'tools'], ['math', 'reasoning', 'agentic', 'chat', 'tools'], "nemotron-cascade-2-30b:a3b", "Hybrid Mamba-Transformer MoE (6 attention + 23 Mamba-2 + 23 MoE layers, 128 experts, 6 active). 2 KV heads, head_dim 128 on attention layers only. KV estimate covers attention layers. GGUF Q4_K_M file size.", weight_gb=23.03, kv_per_1k_gb=0.006, layers=52),
