
Commit 0c18b7c

fused moe layer, edit grok
1 parent 580fd6b commit 0c18b7c

30 files changed (+4061, -256 lines)

analyze_server_log.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
"""Analyze server logs to extract token count information.

Usage:
    # Analyze from stdin
    cat server.log | python analyze_server_log.py

    # Or analyze from file
    python analyze_server_log.py server.log
"""

import re
import sys
from collections import defaultdict


def parse_prefill_line(line):
    """Parse prefill batch log line."""
    # Example: Prefill batch. #new-seq: 8, #new-token: 65536, #cached-token: 0, token usage: 0.21, #running-req: 8, #queue-req: 8,
    match = re.search(
        r"#new-seq:\s*(\d+).*#new-token:\s*(\d+).*#cached-token:\s*(\d+).*token usage:\s*([\d.]+).*#running-req:\s*(\d+).*#queue-req:\s*(\d+)",
        line,
    )
    if match:
        return {
            "type": "prefill",
            "new_seq": int(match.group(1)),
            "new_token": int(match.group(2)),
            "cached_token": int(match.group(3)),
            "token_usage": float(match.group(4)),
            "running_req": int(match.group(5)),
            "queue_req": int(match.group(6)),
        }
    return None


def parse_decode_line(line):
    """Parse decode batch log line."""
    # Example: Decode batch. #running-req: 8, #token: 286848, token usage: 0.96, gen throughput (token/s): 232.27, #queue-req: 0,
    match = re.search(
        r"#running-req:\s*(\d+).*#token:\s*(\d+).*token usage:\s*([\d.]+).*gen throughput.*:\s*([\d.]+).*#queue-req:\s*(\d+)",
        line,
    )
    if match:
        return {
            "type": "decode",
            "running_req": int(match.group(1)),
            "token": int(match.group(2)),
            "token_usage": float(match.group(3)),
            "gen_throughput": float(match.group(4)),
            "queue_req": int(match.group(5)),
        }
    return None


def main():
    if len(sys.argv) > 1:
        # Read from file
        with open(sys.argv[1], "r") as f:
            lines = f.readlines()
    else:
        # Read from stdin
        lines = sys.stdin.readlines()

    print("=" * 80)
    print("SERVER LOG ANALYSIS")
    print("=" * 80)

    prefill_logs = []
    decode_logs = []

    for line in lines:
        if "Prefill batch" in line:
            data = parse_prefill_line(line)
            if data:
                prefill_logs.append(data)
        elif "Decode batch" in line:
            data = parse_decode_line(line)
            if data:
                decode_logs.append(data)

    # Analyze prefill logs
    if prefill_logs:
        print(f"\nPREFILL BATCHES: {len(prefill_logs)} found")
        print("-" * 80)
        print(
            f"{'#':>4} {'NewSeq':>8} {'NewTok':>10} {'CacheTok':>10} {'Usage':>8} {'RunReq':>8} {'QueueReq':>10}"
        )
        print("-" * 80)

        total_new_tokens = 0
        for i, log in enumerate(prefill_logs[:20]):  # Show first 20
            print(
                f"{i+1:4d} {log['new_seq']:8d} {log['new_token']:10d} {log['cached_token']:10d} "
                f"{log['token_usage']:8.2f} {log['running_req']:8d} {log['queue_req']:10d}"
            )
            total_new_tokens += log["new_token"]

        if len(prefill_logs) > 20:
            print(f"... and {len(prefill_logs) - 20} more")

        print(f"\nPrefill Statistics:")
        print(f" Total new tokens across all prefills: {total_new_tokens:,}")
        print(f" Average tokens per prefill: {total_new_tokens / len(prefill_logs):,.0f}")
        if prefill_logs:
            print(f" First prefill new tokens: {prefill_logs[0]['new_token']:,}")

    # Analyze decode logs
    if decode_logs:
        print(f"\n{'=' * 80}")
        print(f"DECODE BATCHES: {len(decode_logs)} found")
        print("-" * 80)
        print(
            f"{'#':>4} {'RunReq':>8} {'#Token':>12} {'Usage':>8} {'Throughput':>12} {'QueueReq':>10}"
        )
        print("-" * 80)

        for i, log in enumerate(decode_logs[:20]):  # Show first 20
            print(
                f"{i+1:4d} {log['running_req']:8d} {log['token']:12,d} {log['token_usage']:8.2f} "
                f"{log['gen_throughput']:12.2f} {log['queue_req']:10d}"
            )

        if len(decode_logs) > 20:
            print(f"... and {len(decode_logs) - 20} more")

        print(f"\nDecode Statistics:")
        tokens_by_running_req = defaultdict(list)
        for log in decode_logs:
            tokens_by_running_req[log["running_req"]].append(log["token"])

        for running_req in sorted(tokens_by_running_req.keys()):
            tokens = tokens_by_running_req[running_req]
            avg_token = sum(tokens) / len(tokens)
            min_token = min(tokens)
            max_token = max(tokens)
            print(
                f" Running {running_req:2d} requests: avg={avg_token:10.0f}, min={min_token:10,d}, max={max_token:10,d} tokens ({len(tokens)} samples)"
            )
            if running_req > 0:
                avg_per_req = avg_token / running_req
                print(f" → Avg per request: {avg_per_req:,.0f} tokens")

    # Key findings
    print(f"\n{'=' * 80}")
    print("KEY FINDINGS")
    print("=" * 80)

    if prefill_logs and decode_logs:
        first_prefill_tokens = prefill_logs[0]["new_token"]
        first_decode_tokens = decode_logs[0]["token"] if decode_logs else 0

        print(f"\n1. First prefill batch:")
        print(f" New tokens: {first_prefill_tokens:,}")
        print(f" New sequences: {prefill_logs[0]['new_seq']}")
        print(f" Average per sequence: {first_prefill_tokens / prefill_logs[0]['new_seq']:,.0f}")

        print(f"\n2. Decode batch token count:")
        print(f" Token count: {first_decode_tokens:,}")
        print(f" Running requests: {decode_logs[0]['running_req']}")
        if decode_logs[0]["running_req"] > 0:
            print(
                f" Average per request: {first_decode_tokens / decode_logs[0]['running_req']:,.0f}"
            )

        print(
            f"\n3. Ratio (decode/prefill): {first_decode_tokens / first_prefill_tokens if first_prefill_tokens > 0 else 0:.2f}x"
        )

    print("\n" + "=" * 80)


if __name__ == "__main__":
    main()

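A quick way to sanity-check the prefill regex is to feed it the sample line quoted in the parser's own comment (a minimal sketch, assuming analyze_server_log.py is importable from the working directory):

from analyze_server_log import parse_prefill_line

# Sample line copied from the comment inside parse_prefill_line()
sample = (
    "Prefill batch. #new-seq: 8, #new-token: 65536, #cached-token: 0, "
    "token usage: 0.21, #running-req: 8, #queue-req: 8,"
)

# Expected result: new_seq=8, new_token=65536, cached_token=0,
# token_usage=0.21, running_req=8, queue_req=8
print(parse_prefill_line(sample))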
check_actual_tokens.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
"""Check actual token count in benchmark requests."""

import sys

sys.path.insert(0, "/Users/ramezes/job/sgl-project/sgl-jax/python")

import numpy as np

from sgl_jax.srt.hf_transformers_utils import get_tokenizer

# Initialize tokenizer
tokenizer_path = "/models/xai-grok-2/tokenizer.tok.json"
tokenizer = get_tokenizer(tokenizer_path)

print("=" * 60)
print("Analyzing Actual Token Counts")
print("=" * 60)

# Benchmark parameters
num_prompts = 16
random_input_len = 8192
random_output_len = 1024
random_range_ratio = 1.0

# Generate random input lengths (same logic as bench_serving.py)
input_lens = np.random.randint(
    max(int(random_input_len * random_range_ratio), 1),
    random_input_len + 1,
    size=num_prompts,
)
output_lens = np.random.randint(
    int(random_output_len * random_range_ratio),
    random_output_len + 1,
    size=num_prompts,
)

print(f"\nGenerated input lengths (first 8):")
for i in range(min(8, len(input_lens))):
    print(
        f" Request {i}: input={input_lens[i]}, output={output_lens[i]}, total={input_lens[i] + output_lens[i]}"
    )

total_input_tokens = sum(input_lens)
total_output_tokens = sum(output_lens)
total_tokens = total_input_tokens + total_output_tokens

print(f"\nTotal across all {num_prompts} requests:")
print(f" Total input tokens: {total_input_tokens:,}")
print(f" Total output tokens: {total_output_tokens:,}")
print(f" Grand total: {total_tokens:,}")

# With page_size=128 alignment
page_size = 128
print(f"\n{'=' * 60}")
print(f"With PAGE_SIZE={page_size} alignment:")
print(f"{'=' * 60}")

aligned_tokens_per_req = []
for i in range(min(8, len(input_lens))):
    req_total = input_lens[i] + output_lens[i]
    pages_needed = (req_total + page_size - 1) // page_size
    aligned_tokens = pages_needed * page_size
    aligned_tokens_per_req.append(aligned_tokens)
    print(f" Request {i}:")
    print(f"  Actual tokens: {req_total}")
    print(f"  Pages needed: {pages_needed}")
    print(f"  Aligned tokens: {aligned_tokens} (+{aligned_tokens - req_total} overhead)")

# For 8 concurrent requests
print(f"\n{'=' * 60}")
print(f"Concurrent execution (8 requests):")
print(f"{'=' * 60}")
concurrent_aligned = sum(aligned_tokens_per_req)
print(f" Total aligned tokens for 8 concurrent requests: {concurrent_aligned:,}")

# Server reported value
server_reported = 286848
print(f"\nServer reported: {server_reported:,} tokens")
print(f"Ratio: {server_reported / concurrent_aligned:.2f}x")
print(f"Difference: {server_reported - concurrent_aligned:,} tokens")

# Check if this might be context_len related
avg_per_req = server_reported / 8
print(f"\nAverage tokens per request (server): {avg_per_req:,.0f}")
print(f"This suggests each request might be using ~{avg_per_req:,.0f} tokens")

# Possible page-aligned context_len
possible_context_lens = [32768, 36864, 40960, 49152]
print(f"\nChecking common page-aligned context lengths:")
for ctx_len in possible_context_lens:
    pages = ctx_len // page_size
    actual = pages * page_size
    total_8_req = actual * 8
    print(
        f" context_len={ctx_len:,} -> {pages} pages -> {actual:,} tokens/req -> {total_8_req:,} total"
    )
    if abs(total_8_req - server_reported) < 10000:
        print(f" ^^^ MATCH! This might be it!")

print("\n" + "=" * 60)
print("Recommendation:")
print("=" * 60)
print("1. Check if requests are pre-allocating a fixed context length")
print("2. Verify the actual input token count in server logs")
print("3. Look for prefill logs showing '#new-token' count")

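With --random-range-ratio 1.0 every request is exactly 8192 input + 1024 output = 9216 tokens, which is already a multiple of the 128-token page size, so alignment adds no overhead; the page math the script performs can be checked by hand (a short sketch restating its arithmetic, not new behavior):

page_size = 128
req_total = 8192 + 1024                           # 9216 tokens per request at range ratio 1.0
pages = (req_total + page_size - 1) // page_size  # 72 pages, no rounding overhead
aligned = pages * page_size                       # 9216 aligned tokens
print(aligned * 8)                                # 73728 for 8 concurrent requests

That leaves roughly 213,000 tokens between the 73,728 accounted for here and the 286,848 the server reports, which is the gap the script's context-length checks are probing.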
debug_token_count.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
"""Debug script to understand token counting in sgl-jax server."""

# Based on the benchmark parameters:
# --num-prompts 16
# --random-input 8192
# --random-output 1024
# --max-concurrency 8

# Expected token count calculation:
num_prompts = 16
input_len = 8192
output_len = 1024
max_concurrency = 8

print("=" * 60)
print("Expected Token Count Analysis")
print("=" * 60)

# Calculation 1: If counting only currently running requests
running_reqs = 8  # From log: #running-req: 8
tokens_per_req_input = input_len
tokens_per_req_total = input_len + output_len  # Assuming all generated

expected_tokens_input_only = running_reqs * tokens_per_req_input
expected_tokens_full = running_reqs * tokens_per_req_total

print(f"\nScenario 1: Only counting input tokens")
print(f" Running requests: {running_reqs}")
print(f" Input tokens per request: {tokens_per_req_input}")
print(f" Expected total: {expected_tokens_input_only:,} tokens")

print(f"\nScenario 2: Counting input + output tokens (if all generated)")
print(f" Running requests: {running_reqs}")
print(f" Total tokens per request: {tokens_per_req_total}")
print(f" Expected total: {expected_tokens_full:,} tokens")

# Actual server report
actual_tokens = 286848

print(f"\nActual server report: {actual_tokens:,} tokens")
print(f"\nRatio analysis:")
print(f" Actual / Expected (input only): {actual_tokens / expected_tokens_input_only:.2f}x")
print(f" Actual / Expected (input + output): {actual_tokens / expected_tokens_full:.2f}x")

# Average tokens per request based on actual count
avg_tokens_per_req = actual_tokens / running_reqs
print(f"\nAverage tokens per running request: {avg_tokens_per_req:,.0f}")

# Check if this matches context_len or some other value
print(f"\nPossible explanations:")
print(f"1. If each request pre-allocated max_context_len space:")
print(f"   - Implied context_len: ~{avg_tokens_per_req:,.0f} tokens per request")

print(f"\n2. If using paged allocation with large page_size:")
print(f"   - Each request might be allocated in page-sized chunks")

print(f"\n3. If there's a multiplier in the token counting:")
print(f"   - Check if token count includes multiple layers or other factors")

# Grok-2 model info (from HuggingFace)
grok_num_layers = 64
grok_num_kv_heads = 8
grok_head_dim = 128

print(f"\nGrok-2 Model Configuration:")
print(f" num_hidden_layers: {grok_num_layers}")
print(f" num_key_value_heads: {grok_num_kv_heads}")
print(f" head_dim: {grok_head_dim}")

print("\n" + "=" * 60)
print("Recommendation:")
print("=" * 60)
print("Check the following in your server logs/code:")
print("1. What is the max_context_len for each request?")
print("2. What is the page_size setting?")
print("3. Are requests pre-allocating their maximum length?")
print("4. Check scheduler.py:_get_token_info() for the actual calculation")

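To put the reported #token value in perspective, it can be translated into KV-cache memory using the Grok-2 shape the script prints (a rough sketch; the bf16 KV cache, i.e. 2 bytes per element, and the 2x factor for keys plus values are assumptions rather than values read from the server config):

# Approximate KV-cache footprint implied by the reported token count
layers, kv_heads, head_dim = 64, 8, 128
dtype_bytes = 2                                                    # assuming bf16 KV cache
bytes_per_token = 2 * layers * kv_heads * head_dim * dtype_bytes   # K and V across all layers
tokens = 286848
print(f"{bytes_per_token / 1024:.0f} KiB per token, "
      f"{tokens * bytes_per_token / 1024**3:.1f} GiB total")

Under these assumptions the pool works out to about 256 KiB per token and roughly 70 GiB overall. Layers and heads scale the memory behind each token slot, not the slot count itself, so if the logged #token already included a per-layer multiplier the implied cache would be implausibly large, which suggests the log is counting allocated token slots directly.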