import * as weave from 'weave';
import { EvaluationLogger } from 'weave';
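
// Initialize Weave with a project before logging any ops or evaluations.
// (Skip this if weave.init was already called earlier in your script;
// 'wandb-inference-demo' is a placeholder project name.)
await weave.init('wandb-inference-demo');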
// Create a simple dataset
interface DatasetExample {
  question: string;
  expected?: string;
  expected_one_of?: string[];
}

const dataset: DatasetExample[] = [
  { question: 'What is 2 + 2?', expected: '4' },
  { question: 'What is the capital of France?', expected: 'Paris' },
  { question: 'Name a primary color', expected_one_of: ['red', 'blue', 'yellow'] },
];
// Define a scorer as a Weave op so each score call is traced
const accuracyScorer = weave.op(function accuracyScorer(args: {
  expected: string;
  output: string;
  expected_one_of?: string[];
}): { correct: boolean; score: number } {
  const outputClean = args.output.trim().toLowerCase();
  let isCorrect: boolean;
  if (args.expected_one_of) {
    // Accept the answer if it contains any of the allowed options
    isCorrect = args.expected_one_of.some(option =>
      outputClean.includes(option.toLowerCase())
    );
  } else {
    // Case-insensitive substring match against the single expected answer
    isCorrect = outputClean.includes(args.expected.toLowerCase());
  }
  return { correct: isCorrect, score: isCorrect ? 1.0 : 0.0 };
});
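
// Illustrative sanity check of the scorer (example values only, not part of the dataset):
// matching is a case-insensitive substring check, so a verbose answer still passes.
const sanityCheck = await accuracyScorer({
  expected: 'Paris',
  output: 'The capital of France is Paris.',
});
console.log(sanityCheck); // { correct: true, score: 1 }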
// Evaluate a model using Weave's EvaluationLogger
async function evaluateModel(
  model: (question: string) => Promise<string>,
  modelName: string,
  dataset: DatasetExample[]
): Promise<void> {
  // Initialize the EvaluationLogger BEFORE calling the model to capture token usage.
  // This is especially important for W&B Inference to track costs.
  // Convert the model name to a valid format (replace non-alphanumeric chars with underscores).
  const safeModelName = modelName.replace(/[^a-zA-Z0-9]/g, '_');
  const evalLogger = new EvaluationLogger({
    name: 'inference_evaluation',
    model: { name: safeModelName },
    dataset: 'qa_dataset'
  });
  for (const example of dataset) {
    // Get model prediction
    const output = await model(example.question);

    // Log the prediction
    const predLogger = evalLogger.logPrediction(
      { question: example.question },
      output
    );

    // Score the output
    const score = await accuracyScorer({
      expected: example.expected || '',
      output: output,
      expected_one_of: example.expected_one_of
    });

    // Log the score
    predLogger.logScore('accuracy', score.score);

    // Finish logging for this prediction
    predLogger.finish();
  }

  // Log summary - Weave automatically aggregates the accuracy scores
  await evalLogger.logSummary();
  console.log(`Evaluation complete for ${modelName} (logged as: ${safeModelName}). View results in the Weave UI.`);
}
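
// The evaluation loop above only assumes `model` is an async (question) => string function.
// If llamaModel and deepseekModel are not already defined earlier in your script, a minimal
// sketch backed by W&B Inference's OpenAI-compatible API might look like the following.
// The endpoint URL, the WANDB_API_KEY env var, and the makeModel helper are illustrative
// assumptions; check the W&B Inference docs for the exact endpoint and auth/project setup.
import OpenAI from 'openai'; // hoisted; normally placed at the top of the file

const inferenceClient = new OpenAI({
  baseURL: 'https://api.inference.wandb.ai/v1', // assumed W&B Inference endpoint
  apiKey: process.env.WANDB_API_KEY,            // assumed credential source
});

function makeModel(modelId: string): (question: string) => Promise<string> {
  return async (question: string) => {
    const completion = await inferenceClient.chat.completions.create({
      model: modelId,
      messages: [{ role: 'user', content: question }],
    });
    return completion.choices[0]?.message?.content ?? '';
  };
}

const llamaModel = makeModel('meta-llama/Llama-3.1-8B-Instruct');
const deepseekModel = makeModel('deepseek-ai/DeepSeek-V3-0324');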
// Compare multiple models - a key feature of Weave's evaluation framework
const modelsToCompare = [
  { model: llamaModel, name: 'meta-llama/Llama-3.1-8B-Instruct' },
  { model: deepseekModel, name: 'deepseek-ai/DeepSeek-V3-0324' },
];

for (const { model, name } of modelsToCompare) {
  await evaluateModel(model, name, dataset);
}

// In the Weave UI, navigate to the Evals tab to compare results across models