Agent 评估、安全与对齐
评估是确保 Agent 效果的基础。
安全和对齐是 Agent 能否被信任和部署的关键。
Agent 评测体系
评估 Agent 性能是开发过程中的关键环节。
好的评测体系能够帮助我们了解 Agent 的能力边界。
也为持续优化提供方向和依据。
评测维度
任务完成度:Agent 是否正确完成了给定任务。
效率指标:完成任务所需的步骤数、token 消耗、执行时间。
质量指标:答案的准确性、响应的一致性、对话的自然度。
鲁棒性:对异常输入、噪声数据的处理能力。
常用 Benchmark
| Benchmark | 用途 | 评估内容 |
|---|---|---|
| GAIA | 通用 AI 助手评测 | 复杂任务处理、多步骤推理 |
| MMLU | 多任务语言理解 | 57 个学科的知识问答 |
| HumanEval | 代码生成评测 | Python 代码编写正确性 |
| HotpotQA | 多跳问答评测 | 需要多个文档推理的问题 |
| AgentBench | Agent 能力评测 | 真实环境中的 Agent 表现 |
代码实现:评测框架
Agent 评测框架
class AgentEvaluator:
    """Evaluation harness for an Agent.

    Runs a list of test cases against the wrapped agent, scores each
    output with the configured metrics, and aggregates the per-case
    results into an evaluation report.
    """

    def __init__(self, agent, metrics):
        # Agent under evaluation; must expose a `run(input) -> output` method.
        self.agent = agent
        # Metric objects; each must expose `evaluate(output, expected) -> bool`.
        self.metrics = metrics

    def evaluate(self, test_cases):
        """
        Run the full evaluation.

        :param test_cases: list of test cases to execute
        :return: aggregated evaluation report
        """
        results = [self.run_single_test(tc) for tc in test_cases]
        return self.generate_report(results)

    def run_single_test(self, test_case):
        """Run one test case, capturing output, success, errors and timing."""
        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump if the wall clock is adjusted mid-run.
        start_time = time.perf_counter()
        try:
            output = self.agent.run(test_case.input)
            success = self.evaluate_output(output, test_case.expected)
            error = None
        except Exception as e:
            # An agent crash is recorded as a failed case rather than
            # aborting the whole evaluation run.
            output = None
            success = False
            error = str(e)
        end_time = time.perf_counter()
        return TestResult(
            test_case=test_case,
            output=output,
            success=success,
            error=error,
            duration=end_time - start_time,
            token_count=self.count_tokens(output),
        )

    def evaluate_output(self, output, expected):
        """Return True only if every configured metric accepts the output."""
        return all(metric.evaluate(output, expected) for metric in self.metrics)

    def count_tokens(self, output):
        """Rough token count of an output: whitespace split, 0 for None.

        NOTE(review): the original code called this method without ever
        defining it (AttributeError at runtime). This whitespace
        approximation keeps the report functional — swap in a real
        tokenizer if exact token accounting is required.
        """
        if output is None:
            return 0
        return len(str(output).split())

    def generate_report(self, results):
        """Aggregate per-test results into an EvaluationReport."""
        total = len(results)
        passed = sum(1 for r in results if r.success)
        # Guard the averages so an empty test-case list yields an empty
        # report instead of raising ZeroDivisionError.
        avg_duration = sum(r.duration for r in results) / total if total else 0.0
        avg_tokens = sum(r.token_count for r in results) / total if total else 0.0
        # Pass/total tallies grouped by test-case category.
        by_category = {}
        for r in results:
            stats = by_category.setdefault(
                r.test_case.category, {"total": 0, "passed": 0}
            )
            stats["total"] += 1
            if r.success:
                stats["passed"] += 1
        return EvaluationReport(
            total=total,
            passed=passed,
            pass_rate=passed / total if total else 0.0,
            avg_duration=avg_duration,
            avg_tokens=avg_tokens,
            by_category=by_category,
            results=results,
        )
class TestCase:
    """A single evaluation test case: input, expected result, and metadata."""

    def __init__(self, input, expected, category="general", metadata=None):
        # Input handed to the agent.
        self.input = input
        # Expected output (or evaluation criterion) for this input.
        self.expected = expected
        # Category used for grouped statistics in the report.
        self.category = category
        # Extra metadata. The original line read `self.metadata` — a bare
        # no-op expression — so the attribute was never assigned. Default to
        # a fresh dict per instance (never a shared mutable default).
        self.metadata = metadata if metadata is not None else {}
"""
Agent 评测框架
评估 Agent 在各种任务上的表现
"""
def __init__(self, agent, metrics):
    """Store the agent under test and the metric objects used to score it."""
    # References only; no validation or copying happens here.
    self.agent, self.metrics = agent, metrics
def evaluate(self, test_cases):
    """
    Run every test case and aggregate the outcomes.

    :param test_cases: list of test cases to execute
    :return: aggregated evaluation report
    """
    # One result per test case, in input order.
    outcomes = [self.run_single_test(case) for case in test_cases]
    return self.generate_report(outcomes)
def run_single_test(self, test_case):
    """Execute one test case, capturing output, success, error and timing."""
    started = time.time()
    output, success, error = None, False, None
    try:
        output = self.agent.run(test_case.input)
        success = self.evaluate_output(output, test_case.expected)
    except Exception as exc:
        # A crash is recorded as a failed case rather than aborting the run.
        output, success, error = None, False, str(exc)
    finished = time.time()
    return TestResult(
        test_case=test_case,
        output=output,
        success=success,
        error=error,
        duration=finished - started,
        token_count=self.count_tokens(output),
    )
def evaluate_output(self, output, expected):
    """Accept the output only when every configured metric passes it."""
    # all() short-circuits on the first failing metric, matching the
    # early-return behaviour of a manual loop.
    return all(metric.evaluate(output, expected) for metric in self.metrics)
def generate_report(self, results):
    """Summarise individual test results into an evaluation report."""
    total = len(results)
    passed = sum(1 for item in results if item.success)
    # Run-wide averages. NOTE(review): raises ZeroDivisionError when
    # `results` is empty — preserved as-is from the original behaviour.
    avg_duration = sum(item.duration for item in results) / total
    avg_tokens = sum(item.token_count for item in results) / total
    # Pass/total tallies grouped by test-case category.
    by_category = {}
    for item in results:
        bucket = by_category.setdefault(
            item.test_case.category, {"total": 0, "passed": 0}
        )
        bucket["total"] += 1
        if item.success:
            bucket["passed"] += 1
    return EvaluationReport(
        total=total,
        passed=passed,
        pass_rate=passed / total,
        avg_duration=avg_duration,
        avg_tokens=avg_tokens,
        by_category=by_category,
        results=results,
    )
class TestCase:
    """A single evaluation test case: input, expected result, and metadata."""

    def __init__(self, input, expected, category="general", metadata=None):
        # Input handed to the agent.
        self.input = input
        # Expected output (or evaluation criterion) for this input.
        self.expected = expected
        # Category used for grouped statistics in the report.
        self.category = category
        # Extra metadata. The original line read `self.metadata` — a bare
        # no-op expression — so the attribute was never assigned. Default to
        # a fresh dict per instance (never a shared mutable default).
        self.metadata = metadata if metadata is not None else {}