Spring with AI (2): 评估答案——UnitTest引入
1 搭建Mock Test先不谈评估大模型的答案先搭建基于SpringWireMock的单元测试。添加依赖dependency groupIdorg.wiremock.integrations/groupId artifactIdwiremock-spring-boot/artifactId version3.10.0/version /dependencyLLM返回的内容是不可精确预料的而且会消耗Token所以可以使用WireMock模拟LLM。在test/resources下添加两个测试配置文件来定义预设的答案test-openapi-response-usa.json{ id: chatcmpl-yDUbJwsur69ZLTSGiBpCUvL7QAAQ, object: chat.completion, created: 1771113600, model: qwen3.5-plus, choices: [ { index: 0, message: { role: assistant, content: 华盛顿, refusal: null, annotations: [] }, finish_reason: stop } ], usage: { prompt_tokens: 11, completion_tokens: 13, total_tokens: 24, prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, completion_tokens_details: { reasoning_tokens: 0, audio_tokens: 0, accepted_prediction_tokens: 0, rejected_prediction_tokens: 0 } }, service_tier: default, system_fingerprint: null }test-openapi-response-uk.json{ id: chatcmpl-yDUbJwsur69ZLTSGiBpCUvL7QAAR, object: chat.completion, created: 1771113601, model: qwen3.5-plus, choices: [ { index: 0, message: { role: assistant, content: 伦敦, refusal: null, annotations: [] }, finish_reason: stop } ], usage: { prompt_tokens: 11, completion_tokens: 13, total_tokens: 24, prompt_tokens_details: { cached_tokens: 0, audio_tokens: 0 }, completion_tokens_details: { reasoning_tokens: 0, audio_tokens: 0, accepted_prediction_tokens: 0, rejected_prediction_tokens: 0 } }, service_tier: default, system_fingerprint: null }单元测试代码package com.junteam.ai.demo.service; import java.io.IOException; import java.nio.charset.Charset; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.ai.chat.client.ChatClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.core.io.Resource; import org.wiremock.spring.ConfigureWireMock; import org.wiremock.spring.EnableWireMock; import com.fasterxml.jackson.databind.ObjectMapper; import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder; import com.github.tomakehurst.wiremock.client.WireMock; import com.junteam.ai.demo.model.ChatQuestion; /** * * author gujun */ EnableWireMock(ConfigureWireMock(baseUrlProperties openai.base.url)) SpringBootTest(properties spring.ai.openai.base-url${openai.base.url}) public class ChatServiceMockTest { Value(classpath:/test-openapi-response-usa.json) Resource responseResourceUSA; Value(classpath:/test-openapi-response-uk.json) Resource responseResourceUK; Autowired ChatClient.Builder chatClientBuilder; BeforeEach public void setup() throws IOException{ } public OpenAIChatServiceImplWireMockTest() { } /** * Test of ask method, of class OpenAIChatServiceImpl. * throws IOException */ SuppressWarnings(null) Test public void testAsk() throws IOException { var cannedResponse responseResourceUSA.getContentAsString(Charset.defaultCharset()); var mapper new ObjectMapper(); var responseNode mapper.readTree(cannedResponse); WireMock.stubFor(WireMock.post(/v1/chat/completions) .willReturn(ResponseDefinitionBuilder.okForJson(responseNode))); var instance new OpenAIChatServiceImpl(chatClientBuilder); var chatAnswer instance.ask(new ChatQuestion(美国的首都是哪里)); Assertions.assertThat(chatAnswer).isNotNull(); Assertions.assertThat(chatAnswer.answer()).isEqualTo(华盛顿); cannedResponse responseResourceUK.getContentAsString(Charset.defaultCharset()); responseNode mapper.readTree(cannedResponse); WireMock.stubFor(WireMock.post(/v1/chat/completions) .willReturn(ResponseDefinitionBuilder.okForJson(responseNode))); chatAnswer instance.ask(new ChatQuestion(英国的首都是哪里)); Assertions.assertThat(chatAnswer).isNotNull(); Assertions.assertThat(chatAnswer.answer()).isEqualTo(伦敦); } }2 评估答案2.1 相关性评估(Relevancy)package com.junteam.ai.demo.service; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.evaluation.RelevancyEvaluator; import org.springframework.ai.evaluation.EvaluationRequest; import org.springframework.ai.evaluation.EvaluationResponse; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import com.junteam.ai.demo.model.ChatAnswer; import com.junteam.ai.demo.model.ChatQuestion; /** * * author gujun */ SpringBootTest public class ChatServiceTest { Autowired private ChatService chatService; Autowired private ChatClient.Builder chatClientBuilder; private RelevancyEvaluator relevancyEvaluator; BeforeEach public void setup() { this.relevancyEvaluator new RelevancyEvaluator(chatClientBuilder); } Test public void evaluateRelevancy() { String userText Why the sky is blue?; ChatQuestion chatQuestion new ChatQuestion(userText); System.out.println( Chat Debug Start ); ChatAnswer chatAnswer chatService.ask(chatQuestion); System.out.println( Chat Debug Info ); System.out.println(Question: userText); System.out.println(Answer: chatAnswer.answer()); EvaluationRequest evaluationRequest new EvaluationRequest(userText, chatAnswer.answer()); System.out.println( Evaluator Debug Start ); EvaluationResponse evaluationResponse relevancyEvaluator.evaluate(evaluationRequest); // 添加调试打印 System.out.println( Evaluator Debug Info ); System.out.println(Score: evaluationResponse.getScore()); // 获取相关度 System.out.println(Feedback: evaluationResponse.getFeedback()); // 获取相关度说明 System.out.println(Raw Response: evaluationResponse.toString()); System.out.println(); Assertions.assertThat(evaluationResponse.isPass()) .withFailMessage( The answer %s is not considered relevant to the question %s. , chatAnswer.answer(), userText) .isTrue(); } }这个测试用例没有通过。日志如下 The answer The short answer is **Rayleigh scattering**. Here is a step-by-step breakdown of why this happens: **1. Sunlight looks white, but it isnt** Sunlight appears white to us, but it is actually made up of all the colors of the rainbow (red, orange, yellow, green, blue, indigo, and violet). You can see this when sunlight passes through a prism or water droplets to create a rainbow. **2. Light travels in waves** Each color of light travels in a wave of a different size (wavelength). * **Red light** has longer, lazier waves. * **Blue and violet light** have shorter, choppier waves. **3. The atmosphere is full of obstacles** Earths atmosphere is filled with gas molecules, primarily nitrogen and oxygen. These molecules are smaller than the wavelength of visible light. **4. Scattering occurs** When sunlight passes through the atmosphere, the longer waves (reds and yellows) pass through the gas molecules relatively easily. However, the shorter waves (blues and violets) hit the gas molecules and **scatter** in every direction. This is known as *Rayleigh scattering*. **5. What we see** When you look up at the sky, your eyes catch this scattered blue light coming from all directions. Because blue is scattered more strongly than any other color, the sky looks blue to us. *** **Two common follow-up questions:** * **Why isnt the sky violet?** Violet light actually scatters even more than blue light. However, the sky isnt violet for two reasons: 1. The sun emits much less violet light than blue light. 2. Human eyes are much more sensitive to blue light than violet light. * **Why are sunsets red?** When the sun is setting, it is lower on the horizon. The light has to travel through much more atmosphere to reach your eyes than it does at noon. By the time the light arrives, most of the blue light has been scattered away completely, leaving only the longer wavelengths (reds and oranges) to pass through to your eyes. is not considered relevant to the question Why is the sky blue?. at com.junteam.ai.demo.service.ChatServiceTest.evaluateRelevancy(ChatServiceTest.java:55) at java.base/java.lang.reflect.Method.invoke(Method.java:565) at java.base/java.util.ArrayList.forEach(ArrayList.java:1604) at java.base/java.util.ArrayList.forEach(ArrayList.java:1604) Results: Failures: ChatServiceTest.evaluateRelevancy:55 The answer The short answer is **Rayleigh scattering**. Here is a step-by-step breakdown of why this happens: **1. Sunlight looks white, but it isnt** Sunlight appears white to us, but it is actually made up of all the colors of the rainbow (red, orange, yellow, green, blue, indigo, and violet). You can see this when sunlight passes through a prism or water droplets to create a rainbow. **2. Light travels in waves** Each color of light travels in a wave of a different size (wavelength). * **Red light** has longer, lazier waves. * **Blue and violet light** have shorter, choppier waves. **3. The atmosphere is full of obstacles** Earths atmosphere is filled with gas molecules, primarily nitrogen and oxygen. These molecules are smaller than the wavelength of visible light. **4. Scattering occurs** When sunlight passes through the atmosphere, the longer waves (reds and yellows) pass through the gas molecules relatively easily. However, the shorter waves (blues and violets) hit the gas molecules and **scatter** in every direction. This is known as *Rayleigh scattering*. **5. What we see** When you look up at the sky, your eyes catch this scattered blue light coming from all directions. Because blue is scattered more strongly than any other color, the sky looks blue to us. *** **Two common follow-up questions:** * **Why isnt the sky violet?** Violet light actually scatters even more than blue light. However, the sky isnt violet for two reasons: 1. The sun emits much less violet light than blue light. 2. Human eyes are much more sensitive to blue light than violet light. * **Why are sunsets red?** When the sun is setting, it is lower on the horizon. The light has to travel through much more atmosphere to reach your eyes than it does at noon. By the time the light arrives, most of the blue light has been scattered away completely, leaving only the longer wavelengths (reds and oranges) to pass through to your eyes. is not considered relevant to the question Why is the sky blue?. Tests run: 1, Failures: 1, Errors: 0, Skipped: 0回答内容在人类看来是完美相关的但你的自动化评估器Evaluator认为它不相关。可能的原因评估器使用的 Prompt 可能要求答案必须包含特定的句式例如必须以 The sky is blue because... 开头而AI的答案是以 The short answer is... 开头。需要注意的是评估是很消耗资源的、速度会相当慢。再重新修改问题美国的首都哪里也没有通过。美国的首都是哪里通过测试。2.2 正确性评估(Factual Accuracy)需要注意的是正确性评估、千问默认的API没有完整实现需要自己写一个评估方法。package com.junteam.ai.demo.service; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.evaluation.FactCheckingEvaluator; import org.springframework.ai.chat.evaluation.RelevancyEvaluator; import org.springframework.ai.evaluation.EvaluationRequest; import org.springframework.ai.evaluation.EvaluationResponse; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.junteam.ai.demo.model.ChatAnswer; import com.junteam.ai.demo.model.ChatQuestion; /** * * author gujun */ SpringBootTest public class ChatServiceTest { Autowired private ChatService chatService; Autowired private ChatClient.Builder chatClientBuilder; private RelevancyEvaluator relevancyEvaluator; private FactCheckingEvaluator factCheckingEvaluator; BeforeEach public void setup() { this.relevancyEvaluator new RelevancyEvaluator(chatClientBuilder); this.factCheckingEvaluator FactCheckingEvaluator.builder(chatClientBuilder).build(); } Test public void evaluateRelevancy() { String userText 美国的首都是哪里; ChatQuestion chatQuestion new ChatQuestion(userText); System.out.println( Chat Debug Start ); ChatAnswer chatAnswer chatService.ask(chatQuestion); System.out.println( Chat Debug Info ); System.out.println(Question: userText); System.out.println(Answer: chatAnswer.answer()); EvaluationRequest evaluationRequest new EvaluationRequest(userText, chatAnswer.answer()); // 相关性评估 var response relevancyEvaluator.evaluate(evaluationRequest); System.out.println( Evaluator Debug Info ); System.out.println(Score: response.getScore()); // 获取相关度 System.out.println(Feedback: response.getFeedback()); // 获取相关度说明 System.out.println(Raw Response: response.toString()); System.out.println(); Assertions.assertThat(response.isPass()) .withFailMessage( The answer %s is not considered relevant to the question %s. , chatAnswer.answer(), userText) .isTrue(); } SuppressWarnings({null, CallToPrintStackTrace}) private EvaluationResponse factCheckingEvaluateWithQwen(EvaluationRequest evaluationRequest) { var client chatClientBuilder.build(); // 构造显式的中文 Prompt强制要求 JSON 输出 String prompt String.format( 你是一个事实核查助手。 问题%s 回答%s 请判断上述回答是否符合客观事实。 请仅返回一个 JSON 对象不要包含任何其他文字。格式如下 {pass: true/false, score: 1.0或0.0, feedback: 简短的理由} , evaluationRequest.getUserText(), evaluationRequest.getResponseContent()); var mapper new ObjectMapper(); try { String content client.prompt(prompt).call().content(); System.out.println(Custom Evaluator Raw Response: content); // 简单解析 JSON (实际项目中建议用 Jackson ObjectMapper) var responseNode mapper.readTree(content); return new EvaluationResponse( responseNode.get(pass).asBoolean(), (float) responseNode.get(score).asDouble(), responseNode.get(feedback).asText(), null ); } catch (JsonProcessingException e) { e.printStackTrace(); return new EvaluationResponse(false, 0.0f, e.getMessage(), null); } } Test public void evaluateFactualAccuracy() { String userText 美国首都是哪里; ChatQuestion chatQuestion new ChatQuestion(userText); System.out.println( Chat Debug Start ); ChatAnswer chatAnswer chatService.ask(chatQuestion); System.out.println( Chat Debug Info ); System.out.println(Question: userText); System.out.println(Answer: chatAnswer.answer()); var answer0 华盛顿特区; EvaluationRequest evaluationRequest0 new EvaluationRequest(userText, answer0); System.out.println( Evaluator Debug Start ); // 实时正确性 var response0 factCheckingEvaluator.evaluate(evaluationRequest0); // 添加调试打印 System.out.println( Evaluator0 Debug Info ); System.out.println(Score: response0.getScore()); // 获取相关度 System.out.println(Feedback: response0.getFeedback()); // 获取相关度说明 System.out.println(Raw Response: response0.toString()); EvaluationRequest evaluationRequest new EvaluationRequest(userText, chatAnswer.answer()); // 事实准确性评估 var response factCheckingEvaluator.evaluate(evaluationRequest); System.out.println( Evaluator Debug Info ); System.out.println(Score: response.getScore()); // 获取相关度 System.out.println(Feedback: response.getFeedback()); // 获取相关度说明 System.out.println(Raw Response: response.toString()); System.out.println(); var response1 factCheckingEvaluateWithQwen(evaluationRequest); System.out.println( Evaluator1 Debug Info ); System.out.println(Score: response1.getScore()); // 获取相关度 System.out.println(Feedback: response1.getFeedback()); // 获取相关度说明 System.out.println(Raw Response: response1.toString()); System.out.println(); Assertions.assertThat(response1.isPass()) .withFailMessage( The answer %s is not considered factually accurate to the question %s. , chatAnswer.answer(), userText) .isTrue(); // Assertions.assertThat(response0.isPass()) // .withFailMessage( // // The answer %s // is not considered correct to the question // %s. // // , answer0, userText) // .isTrue(); // Assertions.assertThat(response.isPass()) // .withFailMessage( // // The answer %s // is not considered correct to the question // %s. // // , chatAnswer.answer(), userText) // .isTrue(); } }3 生成时纠正可以通过评估相关性然后进行重新生成本文不再展开赘述。可以参考代码Override Retryable(retryFor AnswerNotRelevantException.class) public Answer askQuestion(Question question) { var answerText chatClient.prompt() .user(question.question()) .call() .content(); evaluateRelevancy(question, answerText); return new Answer(answerText); } private void evaluateRelevancy(Question question, String answerText) { var evaluationRequest new EvaluationRequest(question.question(), answerText); var evaluationResponse evaluator.evaluate(evaluationRequest); if (!evaluationResponse.isPass()) { throw new AnswerNotRelevantException(question.question(), answerText); } } public class AnswerNotRelevantException extends RuntimeException { public AnswerNotRelevantException(String question, String answer) { super(The answer answer is not relevant to the question question .); } }