generated from lambda-feedback/Evaluation-Function-Boilerplate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthetic_conversation_generation.py
144 lines (122 loc) · 6.53 KB
/
synthetic_conversation_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
## Synthetic Dataset Generator ##
-> GOAL: Generate a synthetic dataset of conversations between a tutor and a student [both LLMs].
For each question/scenario example in the example_inputs folder, a pipeline of two agents will be invoked.
The agents will play the role of a tutor and a student conversing about the question/scenario.
Each conversation is `num_turns` messages long (set in the `__main__` block below), with the student and the tutor taking turns to send a message; the student goes first.
The tutor can be one of the following types:
- Informational Agent
- Socratic Agent
- Google's LearnLM-Tutor Agent
The tutor agent can be selected by changing the "tutor_agent_types" list in the `__main__` block of this script.
The student can have multiple skill levels and conversational styles. Those are defined by the prompts used by the LLM.
Any of the models accessible through the API calls defined in 'llm_factory.py' can be used for either the tutor or the student LLM.
"""
import csv
import json
try:
from ..student_agent.student_agent import invoke_student_agent
from ..informational_agent.informational_agent import invoke_informational_agent
from ..socratic_agent.socratic_agent import invoke_socratic_agent
from ..google_learnLM_agent.google_learnLM_agent import invoke_google_learnlm_agent
from .parse_json_to_prompt import parse_json_to_prompt
except ImportError:
from src.agents.student_agent.student_agent import invoke_student_agent
from src.agents.informational_agent.informational_agent import invoke_informational_agent
from src.agents.socratic_agent.socratic_agent import invoke_socratic_agent
from src.agents.google_learnLM_agent.google_learnLM_agent import invoke_google_learnlm_agent
from src.agents.utils.parse_json_to_prompt import parse_json_to_prompt
import os
def generate_synthetic_conversations(raw_text: str, num_turns: int, student_agent_type: str, tutor_agent_type: str):
    """
    Generate one synthetic conversation between a student LLM and a tutor LLM.

    The student and the tutor alternate messages, with the student sending the
    first one. Each agent is prompted with the other side's latest message.

    Args:
        raw_text: JSON text with a top-level "params" object. "params" must
            contain "conversation_id" and "question_response_details"; the
            latter may contain "questionSubmissionSummary",
            "questionInformation" and "questionAccessInformation" (missing
            entries default to [], {} and {} respectively).
        num_turns: Total number of messages to generate (student and tutor
            messages combined).
        student_agent_type: Student persona, forwarded to the student agent.
        tutor_agent_type: One of "informational", "socratic" or
            "google_learnlm".

    Returns:
        A dict with the keys "conversation_id", "student_agent_type",
        "tutor_agent_type" and "conversation". "conversation" is the list of
        {"role", "content"} messages, where the student is "user" and the
        tutor is "assistant".

    Raises:
        ValueError: If tutor_agent_type is not one of the supported values.
        KeyError: If a required key is missing from the parsed JSON.
    """
    # Resolve the tutor first so an invalid type fails before any parsing or
    # LLM call is made.
    if tutor_agent_type == "informational":
        invoke_tutor_agent = invoke_informational_agent
    elif tutor_agent_type == "socratic":
        invoke_tutor_agent = invoke_socratic_agent
    elif tutor_agent_type == "google_learnlm":
        invoke_tutor_agent = invoke_google_learnlm_agent
    else:
        raise ValueError(f"Unknown Tutor Agent Type: {tutor_agent_type!r}")

    params = json.loads(raw_text)["params"]
    conversation_id = params["conversation_id"]
    question_response_details = params["question_response_details"]
    question_response_details_prompt = parse_json_to_prompt(
        question_response_details.get("questionSubmissionSummary", []),
        question_response_details.get("questionInformation", {}),
        question_response_details.get("questionAccessInformation", {}),
    )

    # State reported by the tutor and carried over to the following turns.
    summary = ""
    conversational_style = ""

    # Generate Conversation
    conversation_history = []
    for turn in range(num_turns):
        print(f"Turn {turn + 1} of {num_turns}")

        if conversation_history:
            # Each agent is prompted with the other side's latest message.
            message = conversation_history[-1]["content"]
        else:
            # Opening prompt used to make the student start the conversation.
            message = "Ask me a question regarding your thoughts on the learning materials that you are currently woking on."

        if turn % 2 == 0:
            # Student starts; `message` is passed separately, so it is
            # dropped from the history given to the student.
            student_response = invoke_student_agent(
                message,
                conversation_history[:-1],
                summary,
                student_agent_type,
                question_response_details_prompt,
                conversation_id,
            )
            conversation_history.append({
                "role": "user",
                "content": student_response["output"],
            })
        else:
            # NOTE(review): unlike the student call, the tutor receives the
            # full history, which still contains `message` as its last entry
            # - confirm this asymmetry is what the tutor agents expect.
            tutor_response = invoke_tutor_agent(
                message,
                conversation_history,
                summary,
                conversational_style,
                question_response_details_prompt,
                conversation_id,
            )
            conversation_history.append({
                "role": "assistant",
                "content": tutor_response["output"],
            })
            if "summary" in tutor_response:
                summary = tutor_response["summary"]
            if "conversationalStyle" in tutor_response:
                conversational_style = tutor_response["conversationalStyle"]

    # Package the conversation; the f-string also accepts a non-string
    # conversation_id.
    return {
        "conversation_id": f"{conversation_id}_{student_agent_type}_{tutor_agent_type}_synthetic",
        "student_agent_type": student_agent_type,
        "tutor_agent_type": tutor_agent_type,
        "conversation": conversation_history,
    }
if __name__ == "__main__":
    # Script entry point: generate one conversation per (tutor, student,
    # question) combination, save each as JSON and collect all conversations
    # of a tutor type in one CSV file.

    # Total number of messages per conversation (student and tutor combined).
    num_turns = 6
    # Can be "informational", "socratic", "google_learnlm"
    tutor_agent_types = ["google_learnlm"]
    # Can be "base", "curious", "contradicting", "reliant", "confused", "unrelated"
    student_agent_types = ["base", "curious", "contradicting", "reliant", "confused", "unrelated"]

    # Both paths are relative to the current working directory.
    example_inputs_folder = "src/agents/utils/example_inputs/"
    output_folder = "src/agents/utils/synthetic_conversations/"
    # Create the output folder up front so the first write cannot fail on a
    # fresh checkout.
    os.makedirs(output_folder, exist_ok=True)

    # Read all question files whose name ends in "1.json"; sorted so that runs
    # are reproducible (os.listdir returns entries in arbitrary order).
    questions = [
        os.path.join(example_inputs_folder, filename)
        for filename in sorted(os.listdir(example_inputs_folder))
        if filename.endswith("1.json")
    ]

    for tutor_agent_type in tutor_agent_types:
        # Open CSV file for writing (one file per tutor type).
        csv_filename = os.path.join(output_folder, f"all_conversations_{tutor_agent_type}.csv")
        # Explicit UTF-8 so non-ASCII model output does not depend on the
        # platform's default encoding.
        with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.writer(csvfile)
            # Write the header
            csv_writer.writerow(["tutor", "student", "conversation", "conversation_id"])

            for student_agent_type in student_agent_types:
                for question in questions:
                    print(f"Generating synthetic conversation for {question} with tutor: {tutor_agent_type} and student: {student_agent_type}")
                    with open(question, "r", encoding="utf-8") as file:
                        raw_text = file.read()

                    conversation = generate_synthetic_conversations(raw_text, num_turns, student_agent_type, tutor_agent_type)

                    # <question name>_<student>_<tutor>_conversation.json
                    question_name = os.path.splitext(os.path.basename(question))[0]
                    conversation_output_filename = os.path.join(
                        output_folder,
                        f"{question_name}_{student_agent_type}_{tutor_agent_type}_conversation.json",
                    )
                    with open(conversation_output_filename, "w", encoding="utf-8") as file:
                        json.dump(conversation, file, indent=2)

                    # Write to CSV. csv.writer stringifies the message list
                    # with str(), so this column holds its Python repr.
                    conversation_id = conversation["conversation_id"]
                    csv_writer.writerow([tutor_agent_type, student_agent_type, conversation["conversation"], conversation_id])