Skip to content

LLM Error & Failure Reporting #692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 6, 2025
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@
import uk.ac.cam.cl.dtg.isaac.dos.content.Question;
import uk.ac.cam.cl.dtg.util.AbstractConfigLoader;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.Optional;

import static uk.ac.cam.cl.dtg.segue.api.Constants.*;

Expand Down Expand Up @@ -69,6 +70,10 @@ private void validateInputs(final Question question, final Choice answer) {
if (!(question instanceof IsaacLLMFreeTextQuestion)) {
throw new IllegalArgumentException(question.getId() + " is not a LLM free-text question");
}
if (((IsaacLLMFreeTextQuestion) question).getMaxMarks() == null) {
log.error("Question has missing maximum marks field: " + question.getId());
throw new IllegalArgumentException("This question cannot be answered correctly");
}

// Validate answer
Objects.requireNonNull(answer);
Expand All @@ -80,14 +85,16 @@ private void validateInputs(final Question question, final Choice answer) {
try { maxAnswerLength = Integer.parseInt(configLoader.getProperty(LLM_MARKER_MAX_ANSWER_LENGTH)); }
catch (NumberFormatException ignored) { /* Use default value */ }
if (answer.getValue().length() > maxAnswerLength) {
log.error("Answer exceeds maximum length for LLM free-text question marking: " + answer.getValue().length());
log.error(String.format("Answer was %d characters long and exceeded maximum %d length for LLM free-text question marking.", answer.getValue().length(), maxAnswerLength));
throw new IllegalArgumentException("Answer is too long for LLM free-text question marking");
}
}

private String generatePromptSystemMessage(final IsaacLLMFreeTextQuestion question) {
String llmMarkerSubject = Optional.ofNullable(configLoader.getProperty(LLM_MARKER_SUBJECT)).orElse("");

String contextAndInstructions = "You are a principal examiner marking A Level and GCSE "
+ String.format("%s questions.\n", this.configLoader.getProperty(LLM_MARKER_SUBJECT))
+ String.format("%s questions.\n", llmMarkerSubject)
+ "You do not like vagueness in student answers.\n"
+ "You follow a standard procedure when marking a question attempt and write your responses as JSON:\n"
+ "```\n"
Expand Down Expand Up @@ -166,14 +173,14 @@ private ChatRequestMessage extractUserAttemptAtQuestion(final Choice answer) {
* @param questionPrompt the prompt to send to the OpenAI API.
* @return the completions from the OpenAI API or null if there was an error.
*/
private @Nullable ChatCompletions retrieveCompletionsFromOpenAI(final List<ChatRequestMessage> questionPrompt) {
private ChatCompletions retrieveCompletionsFromOpenAI(final List<ChatRequestMessage> questionPrompt) throws IOException {
try {
return openAIClient.getChatCompletions(
configLoader.getProperty(LLM_MARKER_DEFAULT_MODEL_NAME),
new ChatCompletionsOptions(questionPrompt).setTemperature(0.0));
} catch (Exception e) {
log.error("Failed to retrieve completions from OpenAI API", e);
return null;
throw new IOException(e.getMessage());
}
}

Expand All @@ -188,11 +195,7 @@ private ChatRequestMessage extractUserAttemptAtQuestion(final Choice answer) {
* @return a map of the marks awarded for each field in the mark scheme.
*/
private Map<String, Integer> extractValidatedMarks(
final IsaacLLMFreeTextQuestion question, @Nullable final ChatCompletions chatCompletions) {
if (chatCompletions == null) {
// Error logged previously when we have more context.
return zeroMarkResult;
}
final IsaacLLMFreeTextQuestion question, final ChatCompletions chatCompletions) {
if (chatCompletions.getChoices().size() != 1) {
log.error("Expected exactly one choice from LLM completion provider, received: "
+ chatCompletions.getChoices().stream().map(c -> c.getMessage().getContent())
Expand Down Expand Up @@ -305,15 +308,20 @@ private LLMFreeTextQuestionValidationResponse generateQuestionValidationResponse
* @return a response to the user's attempt at the question.
*/
@Override
public final QuestionValidationResponse validateQuestionResponse(final Question question, final Choice answer) {
validateInputs(question, answer);

IsaacLLMFreeTextQuestion freeTextLLMQuestion = (IsaacLLMFreeTextQuestion) question;
List<ChatRequestMessage> questionPrompt = generateQuestionPrompt(freeTextLLMQuestion);
questionPrompt.add(extractUserAttemptAtQuestion(answer));
ChatCompletions chatCompletions = retrieveCompletionsFromOpenAI(questionPrompt);
Map<String, Integer> awardedMarks = extractValidatedMarks(freeTextLLMQuestion, chatCompletions);
int markTotal = evaluateMarkTotal(freeTextLLMQuestion, awardedMarks);
return generateQuestionValidationResponse(freeTextLLMQuestion, answer, awardedMarks, markTotal);
public final QuestionValidationResponse validateQuestionResponse(final Question question, final Choice answer) throws ValidatorUnavailableException {
try {
validateInputs(question, answer);
IsaacLLMFreeTextQuestion freeTextLLMQuestion = (IsaacLLMFreeTextQuestion) question;
List<ChatRequestMessage> questionPrompt = generateQuestionPrompt(freeTextLLMQuestion);
questionPrompt.add(extractUserAttemptAtQuestion(answer));
ChatCompletions chatCompletions = retrieveCompletionsFromOpenAI(questionPrompt);
Map<String, Integer> awardedMarks = extractValidatedMarks(freeTextLLMQuestion, chatCompletions);
int markTotal = evaluateMarkTotal(freeTextLLMQuestion, awardedMarks);
return generateQuestionValidationResponse(freeTextLLMQuestion, answer, awardedMarks, markTotal);
} catch (IOException e) {
log.error("Failed to check answer with OpenAI Client. Not trying again.");
throw new ValidatorUnavailableException("We are having problems marking LLM marked questions."
+ " Please try again later!");
}
}
}
4 changes: 2 additions & 2 deletions src/main/java/uk/ac/cam/cl/dtg/segue/api/QuestionFacade.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
public class QuestionFacade extends AbstractSegueFacade {
private static final Logger log = LoggerFactory.getLogger(QuestionFacade.class);

private static final class NoUserConsentGrantedException extends Exception {
public static final class NoUserConsentGrantedException extends Exception {
NoUserConsentGrantedException(final String message) {
super(message);
}
Expand All @@ -117,7 +117,7 @@ private static final class NoUserConsentGrantedException extends Exception {
* @throws SegueResourceMisuseException - if the user has exceeded the number of attempts they can make over a period of time.
* @throws SegueDatabaseException - if there is an unexpected problem with the database.
*/
private RegisteredUserDTO assertUserCanAnswerLLMQuestions(final AbstractSegueUserDTO user) throws
public RegisteredUserDTO assertUserCanAnswerLLMQuestions(final AbstractSegueUserDTO user) throws
SegueDatabaseException, NoUserLoggedInException, NoUserConsentGrantedException,
ValidatorUnavailableException, SegueResourceMisuseException
{
Expand Down
139 changes: 120 additions & 19 deletions src/test/java/uk/ac/cam/cl/dtg/isaac/api/QuestionFacadeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@
*/
package uk.ac.cam.cl.dtg.isaac.api;

import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.powermock.core.classloader.annotations.PowerMockIgnore;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;
import uk.ac.cam.cl.dtg.isaac.dos.AbstractUserPreferenceManager;
import uk.ac.cam.cl.dtg.isaac.dos.IUserStreaksManager;
import uk.ac.cam.cl.dtg.isaac.dos.UserPreference;
import uk.ac.cam.cl.dtg.isaac.dto.users.RegisteredUserDTO;
import uk.ac.cam.cl.dtg.isaac.quiz.ValidatorUnavailableException;
import uk.ac.cam.cl.dtg.segue.api.Constants;
import uk.ac.cam.cl.dtg.segue.api.QuestionFacade;
import uk.ac.cam.cl.dtg.segue.api.managers.QuestionManager;
import uk.ac.cam.cl.dtg.segue.api.managers.SegueResourceMisuseException;
import uk.ac.cam.cl.dtg.segue.api.managers.UserAssociationManager;
import uk.ac.cam.cl.dtg.segue.api.monitors.IMisuseMonitor;
import uk.ac.cam.cl.dtg.segue.dao.ILogManager;
Expand All @@ -40,43 +43,40 @@

import static org.easymock.EasyMock.anyObject;
import static org.easymock.EasyMock.expect;
import static org.hamcrest.CoreMatchers.instanceOf;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.powermock.api.easymock.PowerMock.createMock;
import static org.powermock.api.easymock.PowerMock.createNiceMock;
import static org.powermock.api.easymock.PowerMock.replayAll;
import static uk.ac.cam.cl.dtg.segue.api.Constants.*;

@RunWith(PowerMockRunner.class)
@PrepareForTest(GitContentManager.class)
@PowerMockIgnore("javax.management.*")
public class QuestionFacadeTest extends AbstractFacadeTest {

private AbstractConfigLoader properties;
private AbstractUserPreferenceManager userPreferenceManager;
private IMisuseMonitor misuseMonitor;
private QuestionFacade questionFacade;

private Request requestForCaching;

private QuestionManager questionManager;

@Before
public void setUp() throws ContentManagerException {
requestForCaching = createMock(Request.class);
private void setUpQuestionFacade() throws ContentManagerException {
Request requestForCaching = createMock(Request.class);
expect(requestForCaching.evaluatePreconditions((EntityTag) anyObject())).andStubReturn(null);

AbstractConfigLoader properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(Constants.SEGUE_APP_ENVIRONMENT)).andStubReturn(Constants.EnvironmentType.DEV.name());

ILogManager logManager = createNiceMock(ILogManager.class); // We don't care about logging.
GitContentManager contentManager = createMock(GitContentManager.class);
ILogManager logManager = createNiceMock(ILogManager.class); // We don't care about logging.
ContentMapper contentMapper = createMock(ContentMapper.class);

String contentIndex = "4b825dc642cb6eb9a060e54bf8d69288fbee4904";
IMisuseMonitor misuseMonitor = createMock(IMisuseMonitor.class);
QuestionManager questionManager = createMock(QuestionManager.class);
IUserStreaksManager userStreaksManager = createMock(IUserStreaksManager.class);
UserAssociationManager userAssociationManager = createMock(UserAssociationManager.class);
AbstractUserPreferenceManager userPreferencesManager = createMock(AbstractUserPreferenceManager.class);
questionManager = createMock(QuestionManager.class);

questionFacade = new QuestionFacade(properties, contentMapper, contentManager, userManager, userPreferencesManager,
questionFacade = new QuestionFacade(properties, contentMapper, contentManager, userManager, userPreferenceManager,
questionManager, logManager, misuseMonitor, userStreaksManager, userAssociationManager);

String contentIndex = "4b825dc642cb6eb9a060e54bf8d69288fbee4904";
expect(contentManager.getCurrentContentSHA()).andStubReturn(contentIndex);
expect(contentManager.getContentDOById(questionDO.getId())).andStubReturn(questionDO);
expect(contentManager.getContentDOById(studentQuizDO.getId())).andStubReturn(studentQuizDO);
Expand All @@ -86,7 +86,13 @@ public void setUp() throws ContentManagerException {
}

@Test
public void answerQuestionNotAvailableForQuizQuestions() {
public void answerQuestionNotAvailableForQuizQuestions() throws ContentManagerException {
// Arrange
properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(Constants.SEGUE_APP_ENVIRONMENT)).andStubReturn(Constants.EnvironmentType.DEV.name());
setUpQuestionFacade();

// Act & Assert
String jsonAnswer = "jsonAnswer";
forEndpoint((questionId) -> () -> questionFacade.answerQuestion(httpServletRequest, questionId, jsonAnswer),
with(question.getId(),
Expand All @@ -96,4 +102,99 @@ public void answerQuestionNotAvailableForQuizQuestions() {
)
);
}

/*
Test that a user with invalid config settings is unable to answer LLMFreeTextQuestions
*/
@Test
public final void assertUserCanAnswerLLMQuestions_InvalidSettings_ShouldThrowException() throws Exception {
// Arrange
properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(LLM_MARKER_FEATURE)).andReturn(("off"));

setUpQuestionFacade();

// Act & Assert
ValidatorUnavailableException exception = assertThrows(
ValidatorUnavailableException.class,
() -> questionFacade.assertUserCanAnswerLLMQuestions(adminUser)
);

assertEquals("LLM marked questions are currently unavailable. Please try again later!", exception.getMessage());
}

/*
Test that a non-consenting user is unable to answer LLMFreeTextQuestions
*/
@Test
public final void assertUserCanAnswerLLMQuestions_NonConsentingUser_ShouldThrowException() throws Exception {
// Arrange
properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(LLM_MARKER_FEATURE)).andReturn(("on"));

userPreferenceManager = createMock(AbstractUserPreferenceManager.class);
UserPreference userPreference = new UserPreference(adminUser.getId(), "CONSENT", "OPENAI", false);
expect(userPreferenceManager.getUserPreference("CONSENT", LLM_PROVIDER_NAME, adminUser.getId())).andReturn(userPreference);

setUpQuestionFacade();

// Act & Assert
QuestionFacade.NoUserConsentGrantedException exception = assertThrows(
QuestionFacade.NoUserConsentGrantedException.class,
() -> questionFacade.assertUserCanAnswerLLMQuestions(adminUser)
);

assertEquals(String.format("You must consent to sending your attempts to %s.", LLM_PROVIDER_NAME), exception.getMessage());
}

/*
Test that a user with no available question attempts is unable to answer LLMFreeTextQuestions
*/
@Test
public final void assertUserCanAnswerLLMQuestions_NoUses_ShouldThrowException() throws Exception {
// Arrange
properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(LLM_MARKER_FEATURE)).andReturn(("on"));

userPreferenceManager = createMock(AbstractUserPreferenceManager.class);
UserPreference userPreference = new UserPreference(adminUser.getId(), "CONSENT", "OPENAI", true);
expect(userPreferenceManager.getUserPreference("CONSENT", LLM_PROVIDER_NAME, adminUser.getId())).andReturn(userPreference);

misuseMonitor = createMock(IMisuseMonitor.class);
expect(misuseMonitor.getRemainingUses(adminUser.getId().toString(), "LLMFreeTextQuestionAttemptMisuseHandler")).andReturn(0);

setUpQuestionFacade();

// Act & Assert
SegueResourceMisuseException exception = assertThrows(
SegueResourceMisuseException.class,
() -> questionFacade.assertUserCanAnswerLLMQuestions(adminUser)
);

assertEquals("You have exceeded the number of attempts you can make on LLM marked free-text questions. Please try again later.", exception.getMessage());
}

/*
Test that a consenting user with valid config settings is able to answer LLMFreeTextQuestions
*/
@Test
public final void assertUserCanAnswerLLMQuestions_ConsentingUser_ShouldReturnOkayResponse() throws Exception {
// Arrange
properties = createMock(AbstractConfigLoader.class);
expect(properties.getProperty(LLM_MARKER_FEATURE)).andReturn(("on"));
misuseMonitor = createMock(IMisuseMonitor.class);
expect(misuseMonitor.getRemainingUses(adminUser.getId().toString(), "LLMFreeTextQuestionAttemptMisuseHandler")).andReturn(30);
userPreferenceManager = createMock(AbstractUserPreferenceManager.class);
UserPreference userPreference = new UserPreference(adminUser.getId(), "CONSENT", "OPENAI", true);
expect(userPreferenceManager.getUserPreference("CONSENT", LLM_PROVIDER_NAME, adminUser.getId())).andReturn(userPreference);

setUpQuestionFacade();

// Act
RegisteredUserDTO outUser = questionFacade.assertUserCanAnswerLLMQuestions(adminUser);

// Assert
assertThat(outUser, instanceOf(RegisteredUserDTO.class));
assertEquals(adminUser.getId(), outUser.getId());
}
}
Loading
Loading