claudish/tests/comprehensive-model-test.ts

298 lines
11 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { afterEach, describe, expect, test } from "bun:test";
import { MODEL_INFO } from "../src/config.js";
import { createProxyServer } from "../src/proxy-server.js";
import type {
AnthropicRequest,
AnthropicResponse,
OpenRouterModel,
ProxyServer,
} from "../src/types.js";
import { OPENROUTER_MODELS } from "../src/types.js";
// Load .env file
import { join } from "node:path";

/*
 * Copies KEY=VALUE pairs from the `.env` file one directory above this test
 * file into `process.env`, then requires OPENROUTER_API_KEY to be set.
 *
 * Parsing rules:
 *  - blank lines and lines whose first non-blank character is `#` are skipped;
 *  - the key is everything before the first `=`, the value everything after it
 *    (so values may themselves contain `=`); both are trimmed;
 *  - one pair of matching surrounding quotes (`"` or `'`) is removed from the value;
 *  - lines without `=` or with an empty key are ignored.
 * Values from the file overwrite variables that are already set.
 */
const envPath = join(import.meta.dir, "..", ".env");
const envSource = Bun.file(envPath);
// A missing .env must not abort the suite: the key may already be provided by
// the surrounding environment. Reading a non-existent file would reject here.
const envFile = (await envSource.exists()) ? await envSource.text() : "";
for (const rawLine of envFile.split("\n")) {
  // Trimming first also drops a trailing "\r" from CRLF files and lets
  // indented comments be recognised as comments.
  const line = rawLine.trim();
  if (line === "" || line.startsWith("#")) continue;

  const separator = line.indexOf("=");
  if (separator === -1) continue;

  const key = line.slice(0, separator).trim();
  // Guard against lines such as "=value", which would otherwise define a
  // variable with an empty name.
  if (key === "") continue;

  let value = line.slice(separator + 1).trim();
  const quote = value[0];
  if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
    value = value.slice(1, -1);
  }
  process.env[key] = value;
}

const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY;
if (!OPENROUTER_API_KEY) {
  throw new Error("OPENROUTER_API_KEY not found in .env file");
}
/** Proxies started via startTestProxy; the afterEach hook shuts them down and empties this list. */
const activeProxies: ProxyServer[] = [];

/** The Anthropic model id; it is excluded from TEST_MODELS and tested on its own. */
const ANTHROPIC_MODEL: OpenRouterModel = "anthropic/claude-sonnet-4.5";

// Every entry of OPENROUTER_MODELS except "custom" and "anthropic/claude-sonnet-4.5"
const TEST_MODELS = OPENROUTER_MODELS.filter(
  (candidate) => candidate !== "custom" && candidate !== "anthropic/claude-sonnet-4.5"
);
/**
 * Creates a proxy server for `model` on `port` and records it in
 * `activeProxies` so the afterEach hook can shut it down.
 *
 * @param model - model id handed to createProxyServer
 * @param port - port handed to createProxyServer
 * @returns the proxy produced by createProxyServer
 */
async function startTestProxy(
  model: OpenRouterModel,
  port: number
): Promise<ProxyServer> {
  // The module-level guard has already thrown if the key is missing.
  const apiKey = OPENROUTER_API_KEY as string;
  const server = await createProxyServer(port, apiKey, model);
  activeProxies.push(server);
  return server;
}
/**
 * Sends a non-streaming Anthropic-format messages request to the proxy.
 *
 * @param proxyUrl - base URL of the proxy; `/v1/messages` is appended
 * @param messages - conversation turns forwarded unchanged in the request body
 * @returns the parsed JSON body of a successful response
 * @throws Error carrying the HTTP status and response text when the response is not ok
 */
async function makeAnthropicRequest(
  proxyUrl: string,
  messages: Array<{ role: "user" | "assistant"; content: string }>
): Promise<AnthropicResponse> {
  const payload: AnthropicRequest = {
    model: "claude-sonnet-4.5",
    messages,
    max_tokens: 300,
    temperature: 0.3,
    stream: false,
  };
  const endpoint = `${proxyUrl}/v1/messages`;
  const headers = {
    "Content-Type": "application/json",
    "anthropic-version": "2023-06-01",
  };

  const reply = await fetch(endpoint, {
    method: "POST",
    headers,
    body: JSON.stringify(payload),
  });

  if (reply.ok) {
    return (await reply.json()) as AnthropicResponse;
  }

  // Surface the proxy's own error text alongside the status code.
  const detail = await reply.text();
  throw new Error(`Proxy request failed: ${reply.status} ${detail}`);
}
// After every test: shut down each proxy registered in `activeProxies` and
// leave the registry empty.
afterEach(async () => {
  // Take the whole list out of the registry first, so it is empty for the next
  // test even when a shutdown fails.
  const proxies = activeProxies.splice(0, activeProxies.length);

  // Attempt every shutdown. The async wrapper turns a synchronous throw into a
  // rejection, so one bad proxy cannot prevent the others from being stopped.
  const outcomes = await Promise.allSettled(
    proxies.map(async (proxy) => {
      await proxy.shutdown();
    })
  );

  // Still report a failed shutdown, but only after all proxies were handled.
  const failure = outcomes.find(
    (outcome): outcome is PromiseRejectedResult => outcome.status === "rejected"
  );
  if (failure) {
    throw failure.reason;
  }
});
describe("Comprehensive Model Identity Tests", () => {
// For every model in TEST_MODELS: start a proxy for it, ask the model to
// identify itself through the Anthropic-format endpoint, and assert that the
// answer does not mention "anthropic" or "claude" (or does, for an Anthropic model).
describe("Prove NOT Anthropic", () => {
  const identityPrompt = `Identify yourself: state your model name and creator. For example: 'I am GPT-4 by OpenAI' or 'I am Claude by Anthropic' or 'I am Grok by xAI'.`;

  for (const [index, model] of TEST_MODELS.entries()) {
    test(`${model} should NOT identify as Anthropic`, async () => {
      // One port per list position, so no two tests in this group share a port.
      const port = 4000 + index;
      const proxy = await startTestProxy(model, port);
      const info = MODEL_INFO[model];
      console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
      console.log(`🧪 Testing: ${info.name} (${model})`);
      console.log(`📍 Expected Provider: ${info.provider}`);
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);

      const response = await makeAnthropicRequest(proxy.url, [
        {
          role: "user",
          content: identityPrompt,
        },
      ]);

      // Optional chaining on the first block: an empty `content` array should
      // fail the assertions below, not crash with a TypeError here.
      const firstBlockText = response.content[0]?.text;
      const responseText = firstBlockText?.toLowerCase() || "";
      console.log(`💬 Response: "${firstBlockText}"`);
      console.log(`📊 Tokens: ${response.usage.input_tokens} in, ${response.usage.output_tokens} out`);

      // Verify it's an Anthropic-format response (proxy working)
      expect(response.type).toBe("message");
      expect(response.role).toBe("assistant");

      // Verify we got a response
      expect(responseText).toBeTruthy();
      expect(responseText.length).toBeGreaterThan(3);

      // Decide the expectation from the provider prefix of the model id rather
      // than from one hard-coded id, so any "anthropic/..." entry that reaches
      // this loop is expected to identify as Anthropic.
      const isAnthropicModel = model.startsWith("anthropic/");
      const mentionsAnthropic =
        responseText.includes("anthropic") || responseText.includes("claude");

      if (!isAnthropicModel) {
        // CRITICAL TEST: a non-Anthropic model should NOT mention Anthropic
        if (mentionsAnthropic) {
          console.log(`❌ FAILED: Response mentions Anthropic/Claude!`);
          console.log(` This suggests we might be getting Anthropic's model`);
        } else {
          console.log(`✅ PASSED: Does NOT mention Anthropic or Claude`);
        }
        expect(mentionsAnthropic).toBe(false);
      } else {
        // For an actual Anthropic model, it SHOULD mention Anthropic
        console.log(
          `✅ PASSED: Anthropic model correctly identifies as Anthropic: ${mentionsAnthropic}`
        );
        expect(mentionsAnthropic).toBe(true);
      }
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
    }, 30000);
  }
});
// Queries the first four models one after another and checks that their
// self-identification answers are not all the same text.
describe("Provider Verification", () => {
  test("All models should return different providers", async () => {
    const identityPrompt =
      "Identify yourself: state your model name and creator. For example: 'I am GPT-4 by OpenAI' or 'I am Claude by Anthropic' or 'I am Grok by xAI'.";

    // Only a subset is exercised to keep the test fast (first 4 models)
    const modelsToTest = TEST_MODELS.slice(0, 4);
    const results: Record<string, { provider: string; response: string }> = {};

    for (const model of modelsToTest) {
      const proxy = await startTestProxy(model, 5000 + modelsToTest.indexOf(model));
      const info = MODEL_INFO[model];
      const reply = await makeAnthropicRequest(proxy.url, [
        { role: "user", content: identityPrompt },
      ]);
      results[model] = {
        provider: info.provider,
        response: reply.content[0].text || "",
      };
      // Stop this proxy right away and drop it from the registry; it is the
      // most recently pushed entry.
      await proxy.shutdown();
      activeProxies.pop();
    }

    console.log("\n📊 PROVIDER COMPARISON:");
    console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
    for (const entry of Object.values(results)) {
      console.log(`${entry.provider.padEnd(10)} → "${entry.response}"`);
    }
    console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");

    // One result must have been recorded per tested model
    const recordedCount = Object.keys(results).length;
    expect(recordedCount).toBe(modelsToTest.length);

    // At least 3 distinct (case-insensitive) response texts are required
    const uniqueResponses = new Set<string>();
    for (const entry of Object.values(results)) {
      uniqueResponses.add(entry.response.toLowerCase());
    }
    console.log(`✅ Unique responses: ${uniqueResponses.size}/${modelsToTest.length}`);
    expect(uniqueResponses.size).toBeGreaterThanOrEqual(3);
  }, 90000);
});
// Per-provider checks: each listed model must not mention Anthropic/Claude.
// Matching one of the provider keywords is only logged, never asserted.
describe("Detailed Provider Attribution", () => {
  const identityPrompt =
    "Identify yourself: state your model name and creator. For example: 'I am GPT-4 by OpenAI' or 'I am Claude by Anthropic' or 'I am Grok by xAI'.";

  const providerTests: Array<{
    model: OpenRouterModel;
    expectedProvider: string;
    keywords: string[];
  }> = [
    { model: "x-ai/grok-code-fast-1", expectedProvider: "xAI", keywords: ["xai", "grok", "elon", "x.ai"] },
    { model: "openai/gpt-5-codex", expectedProvider: "OpenAI", keywords: ["openai", "gpt", "chatgpt"] },
    { model: "minimax/minimax-m2", expectedProvider: "MiniMax", keywords: ["minimax"] },
    {
      model: "qwen/qwen3-vl-235b-a22b-instruct",
      expectedProvider: "Alibaba",
      keywords: ["alibaba", "qwen", "tongyi"],
    },
  ];

  providerTests.forEach((testCase, position) => {
    const { model, expectedProvider, keywords } = testCase;

    test(`${model} should identify as ${expectedProvider}`, async () => {
      const proxy = await startTestProxy(model, 6000 + position);
      console.log(`\n🔍 Testing ${expectedProvider} attribution...`);

      const reply = await makeAnthropicRequest(proxy.url, [
        { role: "user", content: identityPrompt },
      ]);
      const responseText = reply.content[0].text?.toLowerCase() || "";
      console.log(` Response: "${reply.content[0].text}"`);

      // Does the answer contain any of the expected keywords?
      const mentionsProvider = keywords.some((word) =>
        responseText.includes(word.toLowerCase())
      );
      if (!mentionsProvider) {
        console.log(
          ` ⚠️ Doesn't mention expected keywords: ${keywords.join(", ")}`
        );
        console.log(` Note: This might still be correct, just phrased differently`);
      } else {
        console.log(` ✅ Correctly identifies as ${expectedProvider}`);
      }

      // Hard requirement: no mention of Anthropic or Claude
      const mentionsAnthropic =
        responseText.includes("anthropic") || responseText.includes("claude");
      const verdict = mentionsAnthropic ? "❌" : "✅";
      console.log(` ${verdict} Anthropic mentioned: ${mentionsAnthropic}`);
      expect(mentionsAnthropic).toBe(false);

      // Soft check only: a missing keyword produces a warning, not a failure
      if (!mentionsProvider) {
        console.log(
          ` Warning: Response doesn't contain expected keywords, but still valid if not Anthropic`
        );
      }
    }, 30000);
  });
});
// Positive control: the Anthropic model itself, routed through the proxy, is
// expected to mention "anthropic" or "claude" when asked who it is.
describe("Anthropic Model Baseline", () => {
  test("anthropic/claude-sonnet-4.5 SHOULD identify as Anthropic", async () => {
    const baselineProxy = await startTestProxy(ANTHROPIC_MODEL, 7000);
    console.log("\n🔬 BASELINE TEST: Testing actual Anthropic model...");

    const reply = await makeAnthropicRequest(baselineProxy.url, [
      { role: "user", content: "Identify yourself: state your model name and creator." },
    ]);
    const originalText = reply.content[0].text;
    const responseText = originalText?.toLowerCase() || "";
    console.log(` Response: "${reply.content[0].text}"`);

    const mentionsAnthropic = ["anthropic", "claude"].some((needle) =>
      responseText.includes(needle)
    );
    const verdict = mentionsAnthropic ? "✅" : "❌";
    console.log(` ${verdict} Mentions Anthropic: ${mentionsAnthropic}`);

    // The Anthropic model is required to name Anthropic/Claude
    expect(mentionsAnthropic).toBe(true);

    console.log(" ✅ BASELINE CONFIRMED: Anthropic model identifies as Anthropic");
    console.log(" This proves other models NOT mentioning Anthropic are different!\n");
  }, 30000);
});
});