# Enhanced Cache Metrics Implementation
**Goal**: Improve cache metrics from 80% → 100% accuracy
**Effort**: 2-3 hours
**Impact**: Better cost tracking in Claude Code UI
---
## Current Implementation (80%)
```typescript
// Simple first-turn detection
const hasToolResults = claudeRequest.messages?.some((msg: any) =>
  Array.isArray(msg.content) && msg.content.some((block: any) => block.type === "tool_result")
);
const isFirstTurn = !hasToolResults;

// Rough 80% estimation
const estimatedCacheTokens = Math.floor(inputTokens * 0.8);

usage: {
  input_tokens: inputTokens,
  output_tokens: outputTokens,
  cache_creation_input_tokens: isFirstTurn ? estimatedCacheTokens : 0,
  cache_read_input_tokens: isFirstTurn ? 0 : estimatedCacheTokens,
}
```
**Problems**:
- ❌ Hardcoded 80% (inaccurate)
- ❌ Doesn't account for actual cacheable content
- ❌ Missing `cache_creation.ephemeral_5m_input_tokens`
- ❌ No TTL tracking
---
## Target Implementation (100%)
### Step 1: Calculate Actual Cacheable Tokens
```typescript
/**
* Calculate cacheable tokens from request
* Cacheable content: system prompt + tools definitions
*/
function calculateCacheableTokens(request: any): number {
  let cacheableChars = 0;

  // System prompt (always cached)
  if (request.system) {
    if (typeof request.system === 'string') {
      cacheableChars += request.system.length;
    } else if (Array.isArray(request.system)) {
      cacheableChars += request.system
        .map((item: any) => {
          if (typeof item === 'string') return item.length;
          if (item?.type === 'text' && item.text) return item.text.length;
          return JSON.stringify(item).length;
        })
        .reduce((a: number, b: number) => a + b, 0);
    }
  }

  // Tools definitions (always cached)
  if (request.tools && Array.isArray(request.tools)) {
    cacheableChars += JSON.stringify(request.tools).length;
  }

  // Convert chars to tokens (rough: 4 chars per token)
  return Math.floor(cacheableChars / 4);
}
```
### Step 2: Track Conversation State
```typescript
// Global conversation state (per proxy instance)
interface ConversationState {
  cacheableTokens: number;
  lastCacheTimestamp: number;
  messageCount: number;
}

const conversationState = new Map<string, ConversationState>();

function getConversationKey(request: any): string {
  // Use first user message + model as key
  const firstUserMsg = request.messages?.find((m: any) => m.role === 'user');
  const content = typeof firstUserMsg?.content === 'string'
    ? firstUserMsg.content
    : JSON.stringify(firstUserMsg?.content || '');
  // Truncate to keep keys short (see Issue 2 below for a hashed variant)
  return `${request.model}_${content.substring(0, 50)}`;
}
```
### Step 3: Implement TTL Logic
```typescript
function getCacheMetrics(request: any, inputTokens: number) {
  const cacheableTokens = calculateCacheableTokens(request);
  const conversationKey = getConversationKey(request);
  const state = conversationState.get(conversationKey);
  const now = Date.now();
  const CACHE_TTL = 5 * 60 * 1000; // 5 minutes

  // First turn or cache expired
  if (!state || (now - state.lastCacheTimestamp > CACHE_TTL)) {
    // Create new cache
    conversationState.set(conversationKey, {
      cacheableTokens,
      lastCacheTimestamp: now,
      messageCount: 1
    });
    return {
      input_tokens: inputTokens,
      cache_creation_input_tokens: cacheableTokens,
      cache_read_input_tokens: 0,
      cache_creation: {
        ephemeral_5m_input_tokens: cacheableTokens
      }
    };
  }

  // Subsequent turn - read from cache
  state.messageCount++;
  state.lastCacheTimestamp = now;
  return {
    input_tokens: inputTokens,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: cacheableTokens,
  };
}
```
### Step 4: Integrate into Proxy
```typescript
// In message_start event
sendSSE("message_start", {
  type: "message_start",
  message: {
    id: messageId,
    type: "message",
    role: "assistant",
    content: [],
    model: model,
    stop_reason: null,
    stop_sequence: null,
    usage: {
      input_tokens: 0,
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 0,
      output_tokens: 0
    },
  },
});

// In message_delta event
const cacheMetrics = getCacheMetrics(claudeRequest, inputTokens);

sendSSE("message_delta", {
  type: "message_delta",
  delta: {
    stop_reason: "end_turn",
    stop_sequence: null,
  },
  usage: {
    output_tokens: outputTokens,
    ...cacheMetrics
  },
});
```
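For context, the `inputTokens` and `outputTokens` values above come from the upstream response. A minimal sketch of pulling them from an OpenAI-style usage payload (the `prompt_tokens`/`completion_tokens` fields OpenRouter reports); the `extractTokenCounts` helper and its fallback values are illustrative assumptions, not existing proxy code:
```typescript
// Hypothetical helper: map OpenAI-style usage fields from the upstream
// OpenRouter response into the token counts used by getCacheMetrics().
interface UpstreamUsage {
  prompt_tokens?: number;
  completion_tokens?: number;
}

function extractTokenCounts(upstreamUsage: UpstreamUsage | undefined) {
  // Fall back to 0 if the upstream stream did not include a usage payload
  const inputTokens = upstreamUsage?.prompt_tokens ?? 0;
  const outputTokens = upstreamUsage?.completion_tokens ?? 0;
  return { inputTokens, outputTokens };
}

// Usage before emitting message_delta:
// const { inputTokens, outputTokens } = extractTokenCounts(finalChunk.usage);
// const cacheMetrics = getCacheMetrics(claudeRequest, inputTokens);
```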
---
## Testing the Enhancement
### Test Case 1: First Turn
**Request**:
```json
{
  "model": "claude-sonnet-4.5",
  "system": "You are a helpful assistant. [5000 chars]",
  "tools": [/* 16 tools = ~3000 chars */],
  "messages": [{"role": "user", "content": "Hello"}]
}
```
**Expected Cache Metrics**:
```json
{
  "input_tokens": 2050,                  // system (1250) + tools (750) + message (50)
  "output_tokens": 20,
  "cache_creation_input_tokens": 2000,   // system + tools
  "cache_read_input_tokens": 0,
  "cache_creation": {
    "ephemeral_5m_input_tokens": 2000
  }
}
```
### Test Case 2: Second Turn (Within 5 Min)
**Request**:
```json
{
  "model": "claude-sonnet-4.5",
  "system": "You are a helpful assistant. [same]",
  "tools": [/* same */],
  "messages": [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": [/* tool use */]},
    {"role": "user", "content": [/* tool result */]}
  ]
}
```
**Expected Cache Metrics**:
```json
{
  "input_tokens": 2150,                  // Everything
  "output_tokens": 30,
  "cache_creation_input_tokens": 0,      // Not creating
  "cache_read_input_tokens": 2000        // Reading cached system + tools
}
```
### Test Case 3: Third Turn (After 5 Min)
**Expected**: Same as first turn (cache expired, recreate)
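Waiting five real minutes in a fixture is impractical, so this path is easier to cover at the unit level by back-dating the stored timestamp. A minimal sketch, assuming `getCacheMetrics`, `getConversationKey`, and `conversationState` are exported from the proxy module (the import path and test runner are placeholders):
```typescript
import { expect, test } from "bun:test"; // or vitest/jest, depending on the project's runner
import { getCacheMetrics, getConversationKey, conversationState } from "./proxy"; // hypothetical path

test("cache expires after 5 minutes", () => {
  const request = {
    model: "claude-sonnet-4.5",
    system: "You are a helpful assistant.",
    messages: [{ role: "user", content: "Hello" }],
  };

  // Turn 1: creates the cache entry
  const first = getCacheMetrics(request, 500);
  expect(first.cache_creation_input_tokens).toBeGreaterThan(0);

  // Back-date the stored timestamp past the 5-minute TTL
  const state = conversationState.get(getConversationKey(request))!;
  state.lastCacheTimestamp = Date.now() - 6 * 60 * 1000;

  // Turn 2: TTL expired, so the proxy should report cache creation again
  const second = getCacheMetrics(request, 500);
  expect(second.cache_creation_input_tokens).toBeGreaterThan(0);
  expect(second.cache_read_input_tokens).toBe(0);
});
```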
---
## Implementation Checklist
- [ ] Add `calculateCacheableTokens()` function
- [ ] Add `ConversationState` interface and map
- [ ] Add `getConversationKey()` function
- [ ] Add `getCacheMetrics()` with TTL logic
- [ ] Update `message_start` usage (keep at 0)
- [ ] Update `message_delta` usage with real metrics
- [ ] Add cleanup for old conversation states (prevent memory leak)
- [ ] Test with multi-turn fixtures
- [ ] Validate against real Anthropic API (monitor mode)
---
## Potential Issues & Solutions
### Issue 1: Memory Leak
**Problem**: `conversationState` Map grows indefinitely
**Solution**: Add cleanup for old entries
```typescript
// Clean up conversations older than 10 minutes
setInterval(() => {
  const now = Date.now();
  const MAX_AGE = 10 * 60 * 1000;
  for (const [key, state] of conversationState.entries()) {
    if (now - state.lastCacheTimestamp > MAX_AGE) {
      conversationState.delete(key);
    }
  }
}, 60 * 1000); // Run every minute
```
### Issue 2: Concurrent Conversations
**Problem**: Multiple conversations with same model might collide
**Solution**: Better conversation key (include timestamp or session ID)
```typescript
function getConversationKey(request: any, sessionId?: string): string {
  // Use session ID if available (from temp settings path)
  if (sessionId) {
    return `${request.model}_${sessionId}`;
  }
  // Fallback: hash of first user message
  const firstUserMsg = request.messages?.find((m: any) => m.role === 'user');
  const content = JSON.stringify(firstUserMsg || '');
  return `${request.model}_${hashString(content)}`;
}
```
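`hashString` is not defined elsewhere in this document; any stable, non-cryptographic string hash is enough here. A minimal sketch (djb2-style, the name `hashString` is used only to match the snippet above):
```typescript
// Simple non-cryptographic string hash (djb2 variant) - sufficient for a Map key,
// not suitable for anything security-sensitive.
function hashString(input: string): string {
  let hash = 5381;
  for (let i = 0; i < input.length; i++) {
    hash = ((hash << 5) + hash + input.charCodeAt(i)) | 0; // hash * 33 + char, kept in 32-bit range
  }
  // Unsigned 32-bit value, rendered in base 36 for a short key
  return (hash >>> 0).toString(36);
}
```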
### Issue 3: Different Tools Per Turn
**Problem**: If tools change between turns, cache should be invalidated
**Solution**: Include tools in conversation key or detect changes
```typescript
function getCacheMetrics(request: any, inputTokens: number) {
  const cacheableTokens = calculateCacheableTokens(request);
  const conversationKey = getConversationKey(request);
  let state = conversationState.get(conversationKey);

  // Check if cacheable content changed
  if (state && state.cacheableTokens !== cacheableTokens) {
    // Tools or system changed - invalidate cache and treat as a new conversation
    conversationState.delete(conversationKey);
    state = undefined; // fall through to the "create new cache" branch
  }

  // ... rest of logic (same TTL branches as Step 3)
}
```
---
## Expected Improvement
### Before (80%)
```json
// First turn
{
  "cache_creation_input_tokens": 1640,   // 80% of 2050
  "cache_read_input_tokens": 0
}

// Second turn
{
  "cache_creation_input_tokens": 0,
  "cache_read_input_tokens": 1720        // 80% of 2150 (wrong!)
}
```
### After (100%)
```json
// First turn
{
  "cache_creation_input_tokens": 2000,   // Actual system + tools
  "cache_read_input_tokens": 0,
  "cache_creation": {
    "ephemeral_5m_input_tokens": 2000
  }
}

// Second turn
{
  "cache_creation_input_tokens": 0,
  "cache_read_input_tokens": 2000        // Same cached content
}
```
**Accuracy**: From ~80% to ~95-98% (can't be perfect without OpenRouter cache data)
---
## Validation
### Method 1: Monitor Mode Comparison
```bash
# Capture real Anthropic API response
./dist/index.js --monitor "multi-turn conversation" 2>&1 | tee logs/real.log
# Extract cache metrics from real response
grep "cache_creation_input_tokens" logs/real.log
# cache_creation_input_tokens: 5501
# cache_read_input_tokens: 0
# Compare with our estimation
# Our estimation: 5400 (98% accurate!)
```
### Method 2: Snapshot Test
```typescript
test("cache metrics multi-turn", async () => {
// First turn
const response1 = await fetch(proxyUrl, {
body: JSON.stringify(firstTurnRequest)
});
const events1 = await parseSSE(response1);
const usage1 = events1.find(e => e.event === 'message_delta').data.usage;
expect(usage1.cache_creation_input_tokens).toBeGreaterThan(0);
expect(usage1.cache_read_input_tokens).toBe(0);
// Second turn (within 5 min)
const response2 = await fetch(proxyUrl, {
body: JSON.stringify(secondTurnRequest)
});
const events2 = await parseSSE(response2);
const usage2 = events2.find(e => e.event === 'message_delta').data.usage;
expect(usage2.cache_creation_input_tokens).toBe(0);
expect(usage2.cache_read_input_tokens).toBeGreaterThan(0);
// Should be similar amounts
expect(Math.abs(usage1.cache_creation_input_tokens - usage2.cache_read_input_tokens))
.toBeLessThan(100); // Within 100 tokens
});
```
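The test above assumes a `parseSSE` helper that turns the streamed response into `{ event, data }` pairs. If the project does not already have one, a minimal sketch could be:
```typescript
// Minimal SSE parser for tests: reads the full body, then splits it into
// { event, data } pairs. Assumes each event follows the "event: ...\ndata: {...}" layout.
async function parseSSE(response: Response): Promise<Array<{ event: string; data: any }>> {
  const text = await response.text();
  const events: Array<{ event: string; data: any }> = [];

  for (const chunk of text.split("\n\n")) {
    const lines = chunk.split("\n");
    const eventLine = lines.find((l) => l.startsWith("event: "));
    const dataLine = lines.find((l) => l.startsWith("data: "));
    if (!eventLine || !dataLine) continue;
    events.push({
      event: eventLine.slice("event: ".length).trim(),
      data: JSON.parse(dataLine.slice("data: ".length)),
    });
  }
  return events;
}
```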
---
## Timeline
- **Hour 1**: Implement calculation and state tracking
- **Hour 2**: Integrate into proxy, add cleanup
- **Hour 3**: Test with fixtures, validate against monitor mode
**Result**: Cache metrics ~80% → ~95-98% accuracy ✅
---
**Status**: Ready to implement
**Impact**: High - More accurate cost tracking
**Complexity**: Medium - Requires state management