"How do we test AI applications?" is one of the questions AI engineers get asked most often in 2026. Traditional software testing methods only get you halfway here; the other half needs entirely new thinking. This article gives you a complete testing framework for AI applications.

## 1. Why Testing AI Applications Is Different

The assumption behind traditional software testing: same input → same output.

The reality of AI applications: same input → probabilistic output, and output quality is hard to judge as a binary pass/fail.

This creates several unique challenges:

1. **Non-determinism**: identical runs can produce different results (even with temperature set to 0, most LLM APIs are not fully deterministic)
2. **Fuzzy quality**: there is no clear boundary around what counts as a "good" answer
3. **Expensive evaluation**: human evaluation is accurate but costly; automated evaluation is fast but can be wrong
4. **Hard-to-catch regressions**: a model upgrade can make previously good outputs worse
5. **Many edge cases**: adversarial input, very long text, cross-lingual input, and so on

## 2. The Four Layers of AI Testing

```
Layer 4: E2E system tests     — end-to-end validation of complete user scenarios
            ↑
Layer 3: Integration tests    — joint evaluation of RAG retrieval quality + generation quality
            ↑
Layer 2: Component tests      — prompt templates, retrievers, and post-processing tested in isolation
            ↑
Layer 1: Unit tests           — deterministic tests for utility functions, parsing logic, filtering rules
```

## 3. Layer 1: Unit Tests for Deterministic Components

This part is exactly the same as traditional software testing:

```python
import pytest
from unittest.mock import MagicMock, patch


# Test the JSON parsing helper
def test_json_fixer_handles_trailing_comma():
    from app.utils import JSONOutputFixer
    bad_json = '{"key": "value",}'
    result = JSONOutputFixer.extract_json(bad_json)
    assert result == {"key": "value"}


def test_json_fixer_returns_none_for_invalid():
    from app.utils import JSONOutputFixer
    result = JSONOutputFixer.extract_json("this is not JSON")
    assert result is None


# Test the document chunking logic
def test_text_splitter_respects_chunk_size():
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    text = "A" * 250
    chunks = splitter.split_text(text)
    for chunk in chunks:
        assert len(chunk) <= 120  # allow a small overshoot


# Test retrieval metadata filtering
def test_metadata_filter_construction():
    from app.retriever import build_filter
    filter_obj = build_filter(category="ai", min_date="2026-01-01")
    assert filter_obj["category"] == "ai"
    assert "date" in filter_obj


# Test tool parameter validation
def test_tool_rejects_invalid_params():
    from app.tools import SearchTool
    tool = SearchTool()
    with pytest.raises(ValueError, match="query must not be empty"):
        tool.run(query="", max_results=5)
```

## 4. Layer 2: Prompt Template Tests

```python
from dataclasses import dataclass
from typing import Callable

import pytest


@dataclass
class PromptTestCase:
    name: str
    inputs: dict
    expected_behavior: str  # plain-language description of the expected behavior
    assert_fn: Callable     # validation function


class PromptTester:
    """Test harness for prompt templates."""

    def __init__(self, prompt_template, llm_client, use_cache=True):
        self.template = prompt_template
        self.llm = llm_client
        self.cache = {} if use_cache else None

    def run_test_case(self, test_case: PromptTestCase) -> dict:
        """Run a single test case."""
        # Render the prompt
        prompt = self.template.render(**test_case.inputs)

        # Get the LLM response (with caching)
        cache_key = hash(prompt)
        if self.cache is not None and cache_key in self.cache:
            response = self.cache[cache_key]
        else:
            response = self.llm.complete(prompt)
            if self.cache is not None:
                self.cache[cache_key] = response

        # Run the assertion; any exception (failed assertion, JSON parse error, ...)
        # counts as a failure
        try:
            test_case.assert_fn(response)
            return {"name": test_case.name, "status": "pass", "response": response}
        except Exception as e:
            return {"name": test_case.name, "status": "fail", "error": str(e), "response": response}

    def run_all(self, test_cases: list[PromptTestCase]) -> dict:
        """Run all test cases."""
        results = [self.run_test_case(tc) for tc in test_cases]
        passed = sum(1 for r in results if r["status"] == "pass")
        return {
            "total": len(results),
            "passed": passed,
            "failed": len(results) - passed,
            "pass_rate": passed / len(results) if results else 0,
            "details": results,
        }


def check(condition: bool, message: str) -> None:
    """Raise AssertionError with a message when the condition fails."""
    assert condition, message


# Real test cases
code_review_test_cases = [
    PromptTestCase(
        name="SQL injection detection",
        inputs={
            "language": "python",
            "code": 'query = f"SELECT * FROM users WHERE id = {user_id}"',
        },
        expected_behavior="should flag the SQL injection risk",
        assert_fn=lambda r: check(
            "sql" in r.lower() or "injection" in r.lower() or "注入" in r,
            "SQL injection issue was not detected",
        ),
    ),
    PromptTestCase(
        name="no false positive on safe code",
        inputs={
            "language": "python",
            "code": 'query = "SELECT * FROM users WHERE id = %s"\ncursor.execute(query, (user_id,))',
        },
        expected_behavior="a parameterized query should not be flagged as SQL injection",
        assert_fn=lambda r: check(
            "critical" not in r.lower() or "sql" not in r.lower(),
            "false positive on safe code",
        ),
    ),
    PromptTestCase(
        name="JSON-formatted output",
        inputs={
            "language": "python",
            "code": "x = 1\ny = x + 1",
        },
        expected_behavior="the output must be valid JSON",
        assert_fn=lambda r: __import__("json").loads(r),
    ),
]
```
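To show how the harness above might be wired up end to end, here is a minimal sketch. It assumes `PromptTester` and `code_review_test_cases` from the block above are in scope, a trivial template object exposing `render()`, and an OpenAI-backed client wrapped to expose `complete()`; names like `CodeReviewTemplate` and `OpenAIClient` are illustrative, not from the original article.

```python
# Illustrative wiring for PromptTester (assumed names, not part of the article's code).
import openai


class CodeReviewTemplate:
    """Hypothetical template object exposing the render() interface PromptTester expects."""

    TEMPLATE = (
        "You are a code reviewer. Review the following {language} code and "
        "report any security issues as JSON:\n\n{code}"
    )

    def render(self, **inputs) -> str:
        return self.TEMPLATE.format(**inputs)


class OpenAIClient:
    """Thin wrapper giving the complete() interface PromptTester expects."""

    def __init__(self, model="gpt-4o-mini"):
        self.client = openai.OpenAI()
        self.model = model

    def complete(self, prompt: str) -> str:
        resp = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return resp.choices[0].message.content


if __name__ == "__main__":
    tester = PromptTester(CodeReviewTemplate(), OpenAIClient())
    report = tester.run_all(code_review_test_cases)
    print(f"pass rate: {report['pass_rate']:.0%} ({report['passed']}/{report['total']})")
```

The caching inside `PromptTester` means repeated runs of the same rendered prompt in one session cost a single API call, which keeps a growing test-case list affordable.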
## 5. Layer 3: Evaluating RAG Retrieval Quality

```python
import json
from typing import NamedTuple

import numpy as np


class RAGEvaluationMetrics(NamedTuple):
    """RAG evaluation metrics."""

    context_recall: float      # context recall: is the gold answer covered by the retrieved results?
    context_precision: float   # context precision: what fraction of the retrieved results is relevant?
    faithfulness: float        # faithfulness: is the generated answer grounded in the retrieved content?
    answer_relevancy: float    # answer relevancy: does the answer actually address the question?


class RAGEvaluator:
    """Quality evaluator for a RAG system."""

    def __init__(self, llm_judge, embedder):
        self.llm = llm_judge
        self.embedder = embedder

    def evaluate_context_recall(self, ground_truth: str, retrieved_contexts: list[str]) -> float:
        """Recall: is the information in the gold answer present in the retrieved results?"""
        if not retrieved_contexts:
            return 0.0
        # Merge all retrieved content
        combined_context = "\n\n".join(retrieved_contexts)
        prompt = f"""Assess whether the retrieved content below contains the information needed to answer the question.

Gold answer: {ground_truth}

Retrieved content: {combined_context}

Scoring rubric:
- 1.0: the retrieved content fully covers everything needed for the gold answer
- 0.7: the retrieved content covers most of the information
- 0.4: the retrieved content covers only a small amount of relevant information
- 0.0: the retrieved content contains nothing relevant

Output only a number between 0 and 1."""
        response = self.llm.complete(prompt)
        try:
            return float(response.strip())
        except ValueError:
            return 0.5

    def evaluate_faithfulness(self, answer: str, contexts: list[str]) -> float:
        """Faithfulness: is the answer based only on the provided context?"""
        context_text = "\n\n".join(contexts)
        prompt = f"""Judge whether the answer below is based entirely on the provided context, without introducing outside knowledge or fabricating information.

Context: {context_text}

Answer: {answer}

Scoring:
- 1.0: fully grounded in the context, nothing made up
- 0.7: mostly grounded, with minor inference
- 0.4: partially grounded, with clear injection of outside knowledge
- 0.0: unrelated to the context or entirely fabricated

Output only a number between 0 and 1."""
        response = self.llm.complete(prompt)
        try:
            return float(response.strip())
        except ValueError:
            return 0.5

    def evaluate_answer_relevancy(self, question: str, answer: str) -> float:
        """Answer relevancy: does the answer address the question?"""
        # Use embeddings to measure similarity between question and answer
        q_emb = self.embedder.encode(question)
        a_emb = self.embedder.encode(answer)
        # Cosine similarity
        similarity = np.dot(q_emb, a_emb) / (np.linalg.norm(q_emb) * np.linalg.norm(a_emb))
        return float(similarity)

    def run_evaluation_suite(self, test_cases: list[dict]) -> dict:
        """Run the full evaluation suite."""
        all_metrics = []
        for case in test_cases:
            question = case["question"]
            ground_truth = case["ground_truth"]

            # Run the RAG pipeline
            retrieved = self._retrieve(question)
            answer = self._generate(question, retrieved)

            # Compute metrics
            metrics = RAGEvaluationMetrics(
                context_recall=self.evaluate_context_recall(ground_truth, retrieved),
                context_precision=self._evaluate_precision(question, retrieved),
                faithfulness=self.evaluate_faithfulness(answer, retrieved),
                answer_relevancy=self.evaluate_answer_relevancy(question, answer),
            )
            all_metrics.append(metrics)

        # Aggregate
        return {
            "context_recall": np.mean([m.context_recall for m in all_metrics]),
            "context_precision": np.mean([m.context_precision for m in all_metrics]),
            "faithfulness": np.mean([m.faithfulness for m in all_metrics]),
            "answer_relevancy": np.mean([m.answer_relevancy for m in all_metrics]),
            "ragas_score": np.mean([
                (m.context_recall + m.context_precision + m.faithfulness + m.answer_relevancy) / 4
                for m in all_metrics
            ]),
        }
```
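The suite above calls three hooks that are not shown (`_retrieve`, `_generate`, `_evaluate_precision`); they have to be supplied by whoever owns the pipeline. Below is a minimal, illustrative sketch of one way to plug them in, assuming a `sentence-transformers` embedder and a hypothetical `rag_pipeline` object exposing `retrieve()` and `generate()`; the precision proxy and all placeholder names are assumptions, not from the article.

```python
# Illustrative only: supplying the pipeline hooks RAGEvaluator expects.
import json

from sentence_transformers import SentenceTransformer


class MyRAGEvaluator(RAGEvaluator):
    def __init__(self, llm_judge, embedder, rag_pipeline):
        super().__init__(llm_judge, embedder)
        self.pipeline = rag_pipeline  # assumed to expose retrieve() and generate()

    def _retrieve(self, question: str) -> list[str]:
        # Assumes the pipeline returns the top-k chunks as plain strings
        return self.pipeline.retrieve(question, k=5)

    def _generate(self, question: str, contexts: list[str]) -> str:
        return self.pipeline.generate(question, contexts)

    def _evaluate_precision(self, question: str, contexts: list[str]) -> float:
        # Crude precision proxy: fraction of retrieved chunks judged relevant to the question
        if not contexts:
            return 0.0
        relevant = sum(1 for c in contexts if self.evaluate_answer_relevancy(question, c) > 0.5)
        return relevant / len(contexts)


# Test cases live in a small JSON file, e.g. tests/rag_test_cases.json:
# [{"id": "q1", "question": "...", "ground_truth": "..."}, ...]
with open("tests/rag_test_cases.json", encoding="utf-8") as f:
    cases = json.load(f)

evaluator = MyRAGEvaluator(
    llm_judge=my_judge_client,                              # placeholder: anything exposing complete()
    embedder=SentenceTransformer("all-MiniLM-L6-v2"),
    rag_pipeline=my_rag_pipeline,                           # placeholder: your own pipeline object
)
print(evaluator.run_evaluation_suite(cases))
```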
## 6. LLM-as-Judge

```python
class LLMJudge:
    """Use a strong model to judge the output of a weaker model."""

    COMPARISON_PROMPT = """You are an impartial reviewer of AI application quality.
Compare the two AI answers below and decide which one is better.

Question: {question}

Answer A: {answer_a}

Answer B: {answer_b}

Evaluation dimensions:
1. Accuracy: is the answer correct?
2. Completeness: does it address every aspect of the question?
3. Clarity: is it easy to understand?
4. Conciseness: does it avoid redundancy?

Output JSON:
{{
  "winner": "A" or "B" or "tie",
  "confidence": a number between 0 and 1,
  "reasoning": "a short explanation"
}}"""

    SINGLE_SCORE_PROMPT = """You are a professional AI quality reviewer.
Evaluate the quality of the AI answer below for the given question.

Question: {question}

Answer: {answer}

Reference answer (if any): {reference}

Score the following dimensions from 1 to 5:
- Accuracy: {accuracy_desc}
- Relevance: {relevance_desc}
- Helpfulness: {helpfulness_desc}

Output JSON:
{{
  "accuracy": 1-5,
  "relevance": 1-5,
  "helpfulness": 1-5,
  "overall": 1-5,
  "feedback": "specific suggestions for improvement"
}}"""

    def __init__(self, judge_model="gpt-4o"):
        self.client = __import__("openai").OpenAI()
        self.judge_model = judge_model

    def compare(self, question: str, answer_a: str, answer_b: str) -> dict:
        """Compare two answers and return which one is better."""
        prompt = self.COMPARISON_PROMPT.format(
            question=question,
            answer_a=answer_a,
            answer_b=answer_b,
        )
        response = self.client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format={"type": "json_object"},
        )
        return json.loads(response.choices[0].message.content)

    def score(self, question: str, answer: str, reference: str = "") -> dict:
        """Score a single answer."""
        prompt = self.SINGLE_SCORE_PROMPT.format(
            question=question,
            answer=answer,
            reference=reference or "none provided",
            accuracy_desc="is the answer factually correct",
            relevance_desc="does the answer address the question",
            helpfulness_desc="would the answer genuinely help the user",
        )
        response = self.client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format={"type": "json_object"},
        )
        return json.loads(response.choices[0].message.content)
```

## 7. A Regression Test Framework

```python
class AIRegressionTestSuite:
    """Regression testing for AI applications: detect whether a model upgrade or
    prompt change degrades quality."""

    def __init__(self, test_set_path: str, judge: LLMJudge):
        self.test_set = self._load_test_set(test_set_path)
        self.judge = judge
        self.baseline_results = {}

    def capture_baseline(self, rag_pipeline) -> str:
        """Capture baseline results for the current version."""
        results = {}
        for case in self.test_set:
            response = rag_pipeline.query(case["question"])
            results[case["id"]] = {
                "question": case["question"],
                "response": response,
                "score": self.judge.score(
                    case["question"],
                    response,
                    case.get("reference_answer", ""),
                ),
            }
        baseline_path = f"./baselines/{self._get_timestamp()}.json"
        with open(baseline_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        self.baseline_results = results
        return baseline_path

    def run_regression(self, new_pipeline, baseline_path: str = None) -> dict:
        """Run the regression test: compare the new version against the old one."""
        if baseline_path:
            with open(baseline_path, "r", encoding="utf-8") as f:
                self.baseline_results = json.load(f)

        regression_results = []
        for case in self.test_set:
            case_id = case["id"]
            baseline = self.baseline_results.get(case_id)
            if not baseline:
                continue

            # Get the new version's response
            new_response = new_pipeline.query(case["question"])

            # Compare old vs. new
            comparison = self.judge.compare(
                question=case["question"],
                answer_a=baseline["response"],   # old version
                answer_b=new_response,           # new version
            )
            regression_results.append({
                "case_id": case_id,
                "question": case["question"],
                "old_response": baseline["response"][:200],
                "new_response": new_response[:200],
                "comparison": comparison,
                "regression": comparison["winner"] == "A",  # old version wins = regression
            })

        # Summarize
        regressions = [r for r in regression_results if r["regression"]]
        improvements = [r for r in regression_results if r["comparison"]["winner"] == "B"]
        regression_rate = len(regressions) / len(regression_results) if regression_results else 0
        return {
            "total_cases": len(regression_results),
            "regressions": len(regressions),
            "improvements": len(improvements),
            "neutral": len(regression_results) - len(regressions) - len(improvements),
            "regression_rate": regression_rate,
            "details": regression_results,
            "recommendation": "pass" if regression_rate < 0.1 else "needs human review",
        }
```
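A rough picture of how this fits around a model or prompt change, as a minimal sketch: it assumes a test-set JSON with `id`/`question`/`reference_answer` fields, pipeline objects exposing `query()`, and that the class's unshown helpers (`_load_test_set`, `_get_timestamp`) exist; `current_pipeline` and `new_pipeline` are placeholders.

```python
# Illustrative workflow (assumed names; pipelines must expose query()).
judge = LLMJudge(judge_model="gpt-4o")
suite = AIRegressionTestSuite("tests/regression_set.json", judge)

# 1. Before touching anything, freeze a baseline on the current production pipeline.
baseline_path = suite.capture_baseline(current_pipeline)

# 2. After upgrading the model or editing the prompt, compare against that baseline.
report = suite.run_regression(new_pipeline, baseline_path=baseline_path)

print(f"regressions: {report['regressions']}/{report['total_cases']} "
      f"({report['regression_rate']:.0%}) -> {report['recommendation']}")

# 3. Gate the rollout: fail CI (or block the deploy) when too many cases regressed.
assert report["regression_rate"] < 0.1, "quality regression detected, needs human review"
```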
## 8. CI/CD Integration

```yaml
# .github/workflows/ai-tests.yml
name: AI Application Tests

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: {python-version: "3.11"}
      - run: pip install pytest pytest-asyncio
      - run: pytest tests/unit/ -v

  prompt-tests:
    needs: unit-tests
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - run: pip install -r requirements.txt
      - run: pytest tests/prompt/ -v --maxfail=5  # LLM output is non-deterministic; cut the run short after 5 failures

  rag-evaluation:
    needs: prompt-tests
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
      - run: |
          python scripts/evaluate_rag.py \
            --test-set tests/rag_test_cases.json \
            --threshold-recall 0.8 \
            --threshold-faithfulness 0.85
      - name: Post evaluation results as a PR comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs')
            const results = JSON.parse(fs.readFileSync('rag_results.json'))
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `## RAG evaluation results\n\`\`\`json\n${JSON.stringify(results, null, 2)}\n\`\`\``
            })
```

## 9. Summary: Core Principles of AI Testing

1. **Test the deterministic parts thoroughly**: utility functions, parsing logic, and the like must have high-coverage unit tests
2. **Accept the probabilistic parts**: for LLM output, test "properties" rather than exact values, e.g. contains the key terms, is well-formed, contains nothing harmful (see the short sketch at the end of this article)
3. **A golden test set is an asset**: maintain a high-quality, human-labeled test set as your evaluation baseline
4. **LLM-as-Judge**: using a strong model to judge a weaker one is faster than human review and more accurate than rule matching
5. **Regression tests are the safety net**: every model upgrade or prompt change must go through a regression run
6. **Integrate into CI**: automated testing is the foundation of sustainable AI engineering; don't wait until production to find problems

Testing AI applications is a young field, and best practices are still evolving fast. But the core principle doesn't change: the earlier you find a problem, the cheaper it is to fix.
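To make principle 2 concrete, here is a minimal sketch of property-style assertions on an LLM answer. The `summarize()` function and `app.summarizer` module are hypothetical, and the specific checks are illustrative, not from the article.

```python
# Property-style checks: assert things that must hold for any acceptable answer,
# never an exact string match. summarize() is a hypothetical function under test.
import json

from app.summarizer import summarize  # hypothetical module


def test_summary_properties():
    answer = summarize("LangChain splits documents into chunks before embedding them.")

    # Property 1: mentions the key terms, regardless of exact wording
    assert "chunk" in answer.lower() or "split" in answer.lower()

    # Property 2: respects a length contract instead of matching a fixed string
    assert len(answer) < 300

    # Property 3: contains no refusal boilerplate or other unwanted content
    assert "as an ai language model" not in answer.lower()


def test_structured_summary_is_valid_json():
    answer = summarize("LangChain splits documents into chunks.", output_format="json")
    parsed = json.loads(answer)                      # well-formed, whatever the content
    assert {"summary", "keywords"} <= parsed.keys()  # required fields are present
```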