Extensibility Guide

Extending AgentEval with custom metrics, plugins, and integrations


Overview

AgentEval is designed for extensibility. You can:

  • Create custom metrics for domain-specific evaluation
  • Wrap external evaluation frameworks
  • Build plugins for custom test workflows
  • Integrate with Microsoft's official evaluators

Creating Custom Metrics

Basic Metric

Implement the IMetric interface for general-purpose metrics:

using AgentEval.Core;

public class ResponseLengthMetric : IMetric
{
    private readonly int _minLength;
    private readonly int _maxLength;
    
    public string Name => "ResponseLength";
    public string Description => "Validates response length is within expected range.";
    
    public ResponseLengthMetric(int minLength = 50, int maxLength = 500)
    {
        _minLength = minLength;
        _maxLength = maxLength;
    }
    
    public Task<MetricResult> EvaluateAsync(
        EvaluationContext context, 
        CancellationToken cancellationToken = default)
    {
        var length = context.Output?.Length ?? 0;
        
        if (length < _minLength)
        {
            return Task.FromResult(MetricResult.Fail(
                Name, 
                $"Response too short: {length} < {_minLength}",
                score: (double)length / _minLength * 50));
        }
        
        if (length > _maxLength)
        {
            return Task.FromResult(MetricResult.Fail(
                Name, 
                $"Response too long: {length} > {_maxLength}",
                score: Math.Max(0, 100 - (length - _maxLength) / 10)));
        }
        
        // Within range - full score
        var score = 100.0;
        return Task.FromResult(MetricResult.Pass(
            Name, 
            score, 
            $"Response length {length} is within range [{_minLength}, {_maxLength}]",
            new Dictionary<string, object>
            {
                ["length"] = length,
                ["minLength"] = _minLength,
                ["maxLength"] = _maxLength
            }));
    }
}
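
A quick usage sketch (agentResponse is a placeholder for your agent's output; EvaluationContext and MetricResult are used as shown throughout this guide):

// agentResponse stands in for the text your agent produced
var metric = new ResponseLengthMetric(minLength: 50, maxLength: 500);

var context = new EvaluationContext { Output = agentResponse };
var result = await metric.EvaluateAsync(context);

Console.WriteLine($"{result.MetricName}: {result.Score:F1} - {result.Explanation}");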

RAG Metric

Implement IRAGMetric for metrics that need context or ground truth:

public class KeywordCoverageMetric : IRAGMetric
{
    private readonly IReadOnlyList<string> _requiredKeywords;
    
    public string Name => "KeywordCoverage";
    public string Description => "Checks whether the response contains the required keywords.";
    
    // RAG-specific requirements
    public bool RequiresContext => true;
    public bool RequiresGroundTruth => false;
    
    public KeywordCoverageMetric(IEnumerable<string> requiredKeywords)
    {
        _requiredKeywords = requiredKeywords.ToList();
    }
    
    public Task<MetricResult> EvaluateAsync(
        EvaluationContext context, 
        CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrEmpty(context.Context))
        {
            return Task.FromResult(MetricResult.Fail(Name, "Context is required for keyword coverage."));
        }
        
        var outputLower = context.Output?.ToLowerInvariant() ?? "";
        var foundKeywords = _requiredKeywords
            .Where(kw => outputLower.Contains(kw.ToLowerInvariant()))
            .ToList();
        
        var coverage = _requiredKeywords.Count > 0 
            ? (double)foundKeywords.Count / _requiredKeywords.Count * 100 
            : 100;
        
        var missingKeywords = _requiredKeywords.Except(foundKeywords).ToList();
        
        if (missingKeywords.Any())
        {
            return Task.FromResult(MetricResult.Fail(
                Name,
                $"Missing keywords: {string.Join(", ", missingKeywords)}",
                coverage,
                new Dictionary<string, object>
                {
                    ["foundKeywords"] = foundKeywords,
                    ["missingKeywords"] = missingKeywords,
                    ["coverage"] = coverage / 100
                }));
        }
        
        return Task.FromResult(MetricResult.Pass(
            Name, 
            coverage,
            $"All {_requiredKeywords.Count} required keywords found."));
    }
}
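
Usage is the same as for a basic metric, except the retrieved passage goes into Context (a sketch with placeholder text):

var metric = new KeywordCoverageMetric(new[] { "price", "shipping" });

var context = new EvaluationContext
{
    Input = "What does delivery cost?",
    Output = "Standard shipping is free; the price of express delivery is $10.",
    Context = "Our store offers free standard shipping. Express delivery price: $10."
};

var result = await metric.EvaluateAsync(context);  // Both keywords found → Pass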

Agentic Metric

Implement IAgenticMetric for metrics that evaluate tool usage:

public class ToolLatencyMetric : IAgenticMetric
{
    private readonly TimeSpan _maxLatency;
    
    public string Name => "ToolLatency";
    public string Description => "Validates all tool calls complete within time limit.";
    public bool RequiresToolUsage => true;
    
    public ToolLatencyMetric(TimeSpan maxLatency)
    {
        _maxLatency = maxLatency;
    }
    
    public Task<MetricResult> EvaluateAsync(
        EvaluationContext context, 
        CancellationToken cancellationToken = default)
    {
        if (context.ToolUsage == null || context.ToolUsage.Count == 0)
        {
            return Task.FromResult(MetricResult.Pass(Name, 100, "No tool calls to evaluate."));
        }
        
        var slowTools = context.ToolUsage.Calls
            .Where(c => c.Duration > _maxLatency)
            .ToList();
        
        if (slowTools.Any())
        {
            var slowestTool = slowTools.OrderByDescending(t => t.Duration).First();
            return Task.FromResult(MetricResult.Fail(
                Name,
                $"Tool '{slowestTool.Name}' took {slowestTool.Duration.TotalMilliseconds:F0}ms (max: {_maxLatency.TotalMilliseconds}ms)",
                score: Math.Max(0, 100 - slowTools.Count * 20),
                details: new Dictionary<string, object>
                {
                    ["slowTools"] = slowTools.Select(t => t.Name).ToList(),
                    ["maxLatencyMs"] = _maxLatency.TotalMilliseconds
                }));
        }
        
        return Task.FromResult(MetricResult.Pass(
            Name,
            100,
            $"All {context.ToolUsage.Count} tool calls completed within {_maxLatency.TotalMilliseconds}ms."));
    }
}
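
A usage sketch. How ToolUsage is populated depends on your harness; the object initializers below are assumptions made purely for illustration:

// NOTE: ToolUsage/ToolCall construction here is assumed; in practice
// the harness records tool calls during the agent run.
var context = new EvaluationContext
{
    Output = "The cheapest flight is $250.",
    ToolUsage = new ToolUsage
    {
        Calls =
        {
            new ToolCall { Name = "flight_search", Duration = TimeSpan.FromMilliseconds(850) }
        }
    }
};

var metric = new ToolLatencyMetric(TimeSpan.FromSeconds(5));
var result = await metric.EvaluateAsync(context);  // 850ms < 5s → Pass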

Using the Metric Registry

The MetricRegistry provides centralized metric management:

using AgentEval.Core;

// Create registry
var registry = new MetricRegistry();

// Register metrics by name
registry.Register("length", new ResponseLengthMetric(50, 500));
registry.Register("keywords", new KeywordCoverageMetric(new[] { "price", "shipping" }));

// Or let the metric use its own name
registry.Register(new ToolLatencyMetric(TimeSpan.FromSeconds(5)));

// Get a specific metric
var lengthMetric = registry.Get("length");
var toolLatency = registry.GetRequired("ToolLatency"); // Throws if not found

// Run all metrics
var results = new List<MetricResult>();
foreach (var metric in registry.GetAll())
{
    var result = await metric.EvaluateAsync(context);
    results.Add(result);
}

// Filter by type
var ragMetrics = registry.GetAll().OfType<IRAGMetric>();
var agenticMetrics = registry.GetAll().OfType<IAgenticMetric>();
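
Results collected this way can be aggregated however you like; a small sketch using LINQ over the MetricResult shape shown elsewhere in this guide:

// Summarize the collected MetricResults
var averageScore = results.Average(r => r.Score);
var allPassed = results.All(r => r.Passed);
Console.WriteLine($"Average score: {averageScore:F1}, all passed: {allPassed}");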

Pre-populating the Registry

// Initialize with multiple metrics
var registry = new MetricRegistry(new IMetric[]
{
    new FaithfulnessMetric(chatClient),
    new RelevanceMetric(chatClient),
    new ToolSelectionMetric(expectedTools),
    new ResponseLengthMetric(100, 1000)
});

Wrapping Microsoft Evaluators

AgentEval includes MicrosoftEvaluatorAdapter, which lets you run the official Microsoft.Extensions.AI.Evaluation evaluators as AgentEval metrics:

using AgentEval.Adapters;
using Microsoft.Extensions.AI.Evaluation.Quality;

// Create Microsoft evaluator
var msCoherence = new CoherenceEvaluator();

// Wrap for AgentEval
var coherenceMetric = new MicrosoftEvaluatorAdapter(msCoherence, chatClient);

// Use like any AgentEval metric
var result = await coherenceMetric.EvaluateAsync(context);

// Score is automatically normalized from 1-5 to 0-100
Console.WriteLine($"Coherence: {result.Score}"); // e.g., 85.0

Available Microsoft Evaluators

Microsoft Evaluator          AgentEval Equivalent             Score Range
FluencyEvaluator             Wrap with adapter                1-5 → 0-100
CoherenceEvaluator           Wrap with adapter                1-5 → 0-100
RelevanceEvaluator           RelevanceMetric (native)         0-100
GroundednessEvaluator        FaithfulnessMetric (native)      0-100
EquivalenceEvaluator         AnswerSimilarityMetric (native)  0-100
TaskAdherenceEvaluator       TaskCompletionMetric (native)    0-100
ToolCallAccuracyEvaluator    ToolSelectionMetric (native)     0-100

When to Use the Adapter

  • Use the adapter when you want Microsoft's exact prompts and scoring
  • Use native metrics when you want AgentEval's tool tracking and detailed breakdowns (the two styles mix freely, as shown below)
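
Because the adapter behaves like any other metric, adapted and native metrics can share a registry:

// Mix adapter-wrapped and native metrics side by side
registry.Register("fluency", new MicrosoftEvaluatorAdapter(new FluencyEvaluator(), chatClient));
registry.Register("relevance", new RelevanceMetric(chatClient));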

Score Normalization

AgentEval uses 0-100 scores. The ScoreNormalizer class converts between scales:

using AgentEval.Core;

// Convert Microsoft's 1-5 scale to AgentEval's 0-100
double score = ScoreNormalizer.From1To5(4.2);  // Returns 80.0

// Formula: (score - 1) * 25
// 1 → 0, 2 → 25, 3 → 50, 4 → 75, 5 → 100

// Convert cosine similarity (0-1) to 0-100
double score2 = ScoreNormalizer.FromSimilarity(0.85);  // Returns 85.0

// Get human-readable interpretation
string interpretation = ScoreNormalizer.Interpret(85);
// Returns: "Good" (for 80-89 range)

// Interpretation ranges:
// 90-100: "Excellent"
// 80-89:  "Good"
// 70-79:  "Acceptable"
// 60-69:  "Marginal"
// 50-59:  "Poor"
// 0-49:   "Fail"
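
Interpret pairs naturally with reporting; a sketch reusing the results list from the registry example above:

foreach (var result in results)
{
    // e.g. "KeywordCoverage: 85 (Good)"
    Console.WriteLine($"{result.MetricName}: {result.Score:F0} ({ScoreNormalizer.Interpret(result.Score)})");
}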

Custom Score Ranges

// Convert from any range to 0-100
public static double NormalizeScore(double value, double min, double max)
{
    if (max <= min) throw new ArgumentException("max must be greater than min");
    return (value - min) / (max - min) * 100;
}

// Example: Convert 0-10 scale
var score = NormalizeScore(7.5, 0, 10);  // Returns 75.0

Creating Plugins

Plugins extend the test harness with custom behavior:

public interface IAgentEvalPlugin
{
    string Name { get; }
    
    /// <summary>Called before each test runs.</summary>
    Task OnTestStartAsync(TestCase testCase, CancellationToken ct);
    
    /// <summary>Called after each test completes.</summary>
    Task OnTestCompleteAsync(TestResult result, CancellationToken ct);
    
    /// <summary>Called after the entire test suite completes.</summary>
    Task OnSuiteCompleteAsync(TestSummary summary, CancellationToken ct);
}

Example: Logging Plugin

public class ConsoleLoggingPlugin : IAgentEvalPlugin
{
    public string Name => "ConsoleLogging";
    
    public Task OnTestStartAsync(TestCase testCase, CancellationToken ct)
    {
        Console.WriteLine($"▶ Starting: {testCase.Name}");
        return Task.CompletedTask;
    }
    
    public Task OnTestCompleteAsync(TestResult result, CancellationToken ct)
    {
        var icon = result.Passed ? "✅" : "❌";
        Console.WriteLine($"{icon} {result.Name}: {result.Score:F1}/100");
        return Task.CompletedTask;
    }
    
    public Task OnSuiteCompleteAsync(TestSummary summary, CancellationToken ct)
    {
        Console.WriteLine($"\n📊 Summary: {summary.PassedCount}/{summary.TotalCount} passed");
        return Task.CompletedTask;
    }
}

Example: Metrics Export Plugin

using Prometheus;  // from the prometheus-net package (assumed here for illustration)

public class PrometheusMetricsPlugin : IAgentEvalPlugin
{
    // Metric names below are illustrative; choose names that fit your conventions
    private readonly Counter _testsPassed = Metrics.CreateCounter(
        "agenteval_tests_passed_total", "Tests that passed.");
    private readonly Counter _testsFailed = Metrics.CreateCounter(
        "agenteval_tests_failed_total", "Tests that failed.");
    private readonly Histogram _testDuration = Metrics.CreateHistogram(
        "agenteval_test_duration_seconds", "Per-test duration in seconds.");
    
    public string Name => "PrometheusMetrics";
    
    public Task OnTestStartAsync(TestCase testCase, CancellationToken ct) => Task.CompletedTask;
    
    public Task OnTestCompleteAsync(TestResult result, CancellationToken ct)
    {
        if (result.Passed)
            _testsPassed.Inc();
        else
            _testsFailed.Inc();
        
        _testDuration.Observe(result.Duration.TotalSeconds);
        return Task.CompletedTask;
    }
    
    public Task OnSuiteCompleteAsync(TestSummary summary, CancellationToken ct) => Task.CompletedTask;
}
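
Plugins can also carry state across hooks. A minimal sketch that times the whole suite, using only the interface defined above:

using System.Diagnostics;

public class SuiteTimingPlugin : IAgentEvalPlugin
{
    private readonly Stopwatch _stopwatch = new();
    
    public string Name => "SuiteTiming";
    
    public Task OnTestStartAsync(TestCase testCase, CancellationToken ct)
    {
        // Start the clock when the first test begins
        if (!_stopwatch.IsRunning) _stopwatch.Start();
        return Task.CompletedTask;
    }
    
    public Task OnTestCompleteAsync(TestResult result, CancellationToken ct) => Task.CompletedTask;
    
    public Task OnSuiteCompleteAsync(TestSummary summary, CancellationToken ct)
    {
        _stopwatch.Stop();
        Console.WriteLine($"⏱ Suite completed in {_stopwatch.Elapsed.TotalSeconds:F1}s");
        return Task.CompletedTask;
    }
}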

Retry Policies

Use RetryPolicy for resilient metric evaluation:

using AgentEval.Core;

// Create retry policy
var retryPolicy = new RetryPolicy(
    maxRetries: 3,
    initialDelayMs: 1000,
    backoffMultiplier: 2.0);  // 1s, 2s, 4s

// Apply to a metric
var result = await retryPolicy.ExecuteAsync(
    async ct => await metric.EvaluateAsync(context, ct),
    cancellationToken);

// Or use extension method
var result = await metric.WithRetry(retryPolicy).EvaluateAsync(context);

When to Use Retry

  • LLM API calls (rate limits, transient failures; see the sketch below)
  • Embedding generation
  • External service calls in custom metrics
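
For instance, an LLM-backed metric (like the ToneMetric in the next section) can combine a retry policy with an overall time budget; a sketch, where llmMetric stands for any metric that calls an LLM:

// Give the whole evaluation, retries included, a 30-second budget
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));

var result = await retryPolicy.ExecuteAsync(
    ct => llmMetric.EvaluateAsync(context, ct),
    cts.Token);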

LLM-Based Custom Metrics

Build metrics that use the LLM-as-judge pattern:

using Microsoft.Extensions.AI;

public class ToneMetric : IMetric
{
    private readonly IChatClient _chatClient;
    private readonly string _expectedTone;
    
    public string Name => "Tone";
    public string Description => $"Evaluates if response matches expected {_expectedTone} tone.";
    
    public ToneMetric(IChatClient chatClient, string expectedTone = "professional")
    {
        _chatClient = chatClient;
        _expectedTone = expectedTone;
    }
    
    public async Task<MetricResult> EvaluateAsync(
        EvaluationContext context, 
        CancellationToken ct = default)
    {
        var prompt = $@"Evaluate if this response has a {_expectedTone} tone.

Response to evaluate:
{context.Output}

Respond with JSON:
{{
    ""score"": <1-5 where 5 is perfect {_expectedTone} tone>,
    ""reasoning"": ""<brief explanation>""
}}";

        var response = await _chatClient.GetResponseAsync(
            new[] { new ChatMessage(ChatRole.User, prompt) },
            cancellationToken: ct);
        
        var parsed = LlmJsonParser.ParseMetricResponse<ToneResponse>(response.Text);
        var score = ScoreNormalizer.From1To5(parsed?.Score ?? 3);
        
        return new MetricResult
        {
            MetricName = Name,
            Score = score,
            Passed = score >= 70,
            Explanation = parsed?.Reasoning ?? "Unable to parse response"
        };
    }
    
    private record ToneResponse(int Score, string Reasoning);
}

Testing Custom Metrics

Use FakeChatClient for unit testing:

using AgentEval.Testing;
using Xunit;

public class ToneMetricTests
{
    [Fact]
    public async Task Professional_Tone_Scores_High()
    {
        // Arrange
        var fakeChatClient = new FakeChatClient()
            .WithResponse(@"{""score"": 5, ""reasoning"": ""Very professional""}");
        
        var metric = new ToneMetric(fakeChatClient, "professional");
        
        var context = new EvaluationContext
        {
            Input = "How do I reset my password?",
            Output = "I'd be happy to help you reset your password. Please follow these steps..."
        };
        
        // Act
        var result = await metric.EvaluateAsync(context);
        
        // Assert
        Assert.True(result.Passed);
        Assert.Equal(100, result.Score);
        Assert.Contains("professional", fakeChatClient.LastPrompt);
    }
    
    [Fact]
    public async Task Unprofessional_Tone_Scores_Low()
    {
        var fakeChatClient = new FakeChatClient()
            .WithResponse(@"{""score"": 1, ""reasoning"": ""Too casual""}");
        
        var metric = new ToneMetric(fakeChatClient, "professional");
        
        var context = new EvaluationContext
        {
            Input = "How do I reset my password?",
            Output = "lol just click the button dude"
        };
        
        var result = await metric.EvaluateAsync(context);
        
        Assert.False(result.Passed);
        Assert.Equal(0, result.Score);
    }
}

Best Practices

1. Use Appropriate Interface

// ✅ Good: RAG metric declares its requirements
public class MyRagMetric : IRAGMetric
{
    public bool RequiresContext => true;
    public bool RequiresGroundTruth => true;
    
    // ... Name, Description, EvaluateAsync elided ...
}

// ❌ Bad: Generic metric that secretly needs context
public class MyMetric : IMetric
{
    public Task<MetricResult> EvaluateAsync(EvaluationContext ctx, CancellationToken ct)
    {
        // Throws NullReferenceException when ctx.Context is null!
        var words = ctx.Context.Split(' ');
        // ...
    }
}

2. Handle Missing Data Gracefully

public Task<MetricResult> EvaluateAsync(EvaluationContext context, CancellationToken ct)
{
    // ✅ Good: Check and fail gracefully
    if (string.IsNullOrEmpty(context.GroundTruth))
    {
        return Task.FromResult(MetricResult.Fail(Name, "Ground truth is required."));
    }
    
    // Continue with evaluation...
}

3. Include Detailed Metadata

return MetricResult.Pass(Name, score, explanation, new Dictionary<string, object>
{
    // ✅ Good: Rich details for debugging
    ["rawSimilarity"] = similarity,
    ["threshold"] = _threshold,
    ["comparisonType"] = "cosine",
    ["text1Length"] = text1.Length,
    ["text2Length"] = text2.Length
});

4. Use Constants for Magic Numbers

// ✅ Good: Use EvaluationDefaults
public MyMetric(double threshold = EvaluationDefaults.PassingScoreThreshold)

// ❌ Bad: Magic numbers
public MyMetric(double threshold = 70)

See Also