Socratic Agent update + synthetic pipeline upgrade (#10)

neagualexa · web-flow · commit 949fcfcb0856 · 2025-01-09T15:01:33.000Z
* student-tutor synthetic conversation basic (with student persona and basic fluency)

* save conversation in csv

* review and update socratic agent
diff --git a/.gitignore b/.gitignore
@@ -135,4 +135,5 @@ dmypy.json
 evaluation_function/db_analytics
 
 # Synthetic data conversations
-src/agents/utils/synthetic_conversations/*.json
+src/agents/utils/synthetic_conversations/*.json
+src/agents/utils/synthetic_conversations/*.csv
diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,6 @@ langchainhub
 langdetect
 langgraph
 langsmith
-lf_toolkit[ipc] @ git+https://github.com/lambda-feedback/toolkit-python.git@main
+# lf_toolkit[ipc] @ git+https://github.com/lambda-feedback/toolkit-python.git@main
 
 pytest
diff --git a/src/agents/socratic_agent/socratic_agent.py b/src/agents/socratic_agent/socratic_agent.py
@@ -64,7 +64,7 @@ def call_model(self, state: State, config: RunnableConfig) -> str:
         # Adding external student progress and question context details from data queries
         question_response_details = config["configurable"].get("question_response_details", "")
         if question_response_details:
-            system_message += f"## Known Question Materials: {question_response_details} \n\n"
+            system_message += f"## Known Learning Materials: {question_response_details} \n\n"
 
         # Adding summary and conversational style to the system message
         summary = state.get("summary", "")
diff --git a/src/agents/socratic_agent/socratic_prompts.py b/src/agents/socratic_agent/socratic_prompts.py
@@ -1,6 +1,35 @@
 # PROMPTS generated with the help of ChatGPT GPT-4o Nov 2024
 
-socratic_role_prompt = "You are an excellent tutor, guiding me through the topic with clear and concise explanations. Treat our conversation as a Socratic dialogue, helping me explore the subject step by step by asking questions that deepen my understanding, without providing direct answers. Ensure your responses are accurate and tailored to my level of understanding and conversational preferences. If I struggle or seem frustrated, reflect on my progress and the time spent on the topic, offering the expected guidance. If I ask about an irrelevant topic, politely redirect me by saying 'I'm not familiar with that topic, but I can help you with [topic].' Do not end your responses with a concluding statement.\n\n"
+# socratic_role_prompt = "You are an excellent tutor, guiding me through the topic with clear and concise explanations. Treat our conversation as a Socratic dialogue, helping me explore the subject step by step by asking questions that deepen my understanding, without providing direct answers. Ensure your responses are accurate and tailored to my level of understanding and conversational preferences. If I struggle or seem frustrated, reflect on my progress and the time spent on the topic, offering the expected guidance. If I ask about an irrelevant topic, politely redirect me by saying 'I'm not familiar with that topic, but I can help you with [topic].' Do not end your responses with a concluding statement.\n\n"
+
+socratic_role_prompt = """You are a highly skilled and patient AI tutor designed to assist me, the student, in discovering answers and mastering concepts. Your teaching style emphasizes student-centric learning, encouraging deep thinking, active engagement, and confidence building.
+
+## Teaching Methods:
+Socratic Questioning: Use open-ended questions to stimulate critical thinking and guide students to uncover answers themselves.
+Step-by-Step Learning: Break complex problems into smaller, manageable parts, solving one step at a time.
+Error Analysis: Treat mistakes as learning opportunities by helping students reflect on why they occurred and how to address them.
+Active Participation: Encourage students to take an active role in solving problems, providing guidance without overtaking their learning process.
+Tailored Feedback: Adapt your explanations, questions, and support to the student's level, needs, and progress.
+
+## Key Qualities:
+Patience: Allow students ample time to think, process, and respond without rushing them.
+Clarity: Simplify complex ideas into clear, actionable steps.
+Encouragement: Celebrate student efforts and achievements to keep motivation high.
+Adaptability: Customize teaching approaches based on the student's learning preferences and evolving needs.
+Curiosity-Building: Inspire students to ask thoughtful questions, fostering a love for learning.
+Consistency: Reinforce concepts regularly to build lasting understanding.
+Conversation Flow:
+Frequently conclude interactions with a question to keep the dialogue active and gauge the student's comprehension and comfort with the material.
+Continuously adapt to the student's problem-solving style, preferred level of guidance, and feedback.
+
+Example Conversation Style:
+
+If the student asks, "How do I solve this equation?" respond with:
+"Let's start by identifying what you know. What operation do you think comes first?"
+Follow up with guided hints or clarifications based on their response.
+
+## Flexibility:
+Adjust your approach dynamically, whether the student seeks detailed guidance, prefers a hands-off approach, or demonstrates unique problem-solving strategies. If the student struggles or seems frustrated, reflect on their progress and the time spent on the topic, offering the expected guidance. If the student asks about an irrelevant topic, politely redirect them back to the topic. Do not end your responses with a concluding statement."""
 
 pref_guidelines = """**Guidelines:**
 - Use concise, objective language.
diff --git a/src/agents/utils/example_inputs/example_input_1.json b/src/agents/utils/example_inputs/example_input_1.json
@@ -1,19 +1,14 @@
 {
-  "message": "mock",
+  "message": "i dont remember anything",
   "params": {
     "include_test_data": true,
     "conversation_history": [
-      { "type": "user", "content": "hi" },
+      { "type": "user", "content": "what should I do?" },
       {
         "type": "assistant",
-        "content": "How can I assist you today? Are you working on the dot product question?"
+        "content": "It seems like you're currently working on Part (a) of the dot product question. Since you haven't submitted an answer yet, let's take a moment to break it down together. \n\nWhat do you remember about how to calculate the dot product of two vectors? Can you describe the steps you would take?"
       },
-      { "type": "user", "content": "say hi" },
-      {
-        "type": "assistant",
-        "content": "Hi! How can I help you with your question or any topic you're working on?"
-      },
-      { "type": "user", "content": "mock" }
+      { "type": "user", "content": "i dont remember anything" }
     ],
     "summary": "",
     "conversational_style": "",
@@ -158,7 +153,7 @@
       "questionAccessInformation": {
         "estimatedMinimumTime": "1 minute",
         "estimaredMaximumTime": "4 minutes",
-        "timeTaken": "23792 minutes",
+        "timeTaken": "20 minutes",
         "accessStatus": "too much time spent on this question.",
         "markedDone": "",
         "currentPart": {
@@ -168,6 +163,6 @@
       }
     },
     "conversation_id": "7a65b6ed-85d1-4621-8efb-4fc8e9c5a8de",
-    "agent_type": "informational"
+    "agent_type": "socratic"
   }
 }
diff --git a/src/agents/utils/example_inputs/example_input_2.json b/src/agents/utils/example_inputs/example_input_2.json
@@ -213,7 +213,7 @@
       "questionAccessInformation": {
         "estimatedMinimumTime": "3 minutes",
         "estimaredMaximumTime": "7 minutes",
-        "timeTaken": "35610 minutes",
+        "timeTaken": "42 minutes",
         "accessStatus": "too much time spent on this question.",
         "markedDone": "",
         "currentPart": {
diff --git a/src/agents/utils/example_inputs/example_input_4.json b/src/agents/utils/example_inputs/example_input_4.json
@@ -445,7 +445,7 @@
       "questionAccessInformation": {
         "estimatedMinimumTime": "1 minute",
         "estimaredMaximumTime": "3 minutes",
-        "timeTaken": "13199 minutes",
+        "timeTaken": "13 minutes",
         "accessStatus": "too much time spent on this question.",
         "markedDone": "Part (a) was marked done",
         "currentPart": {
diff --git a/src/agents/utils/example_inputs/example_input_5.json b/src/agents/utils/example_inputs/example_input_5.json
@@ -223,7 +223,7 @@
       "questionAccessInformation": {
         "estimatedMinimumTime": "3 minutes",
         "estimaredMaximumTime": "7 minutes",
-        "timeTaken": "43881 minutes",
+        "timeTaken": "40 minutes",
         "accessStatus": "too much time spent on this question.",
         "markedDone": "Part (a) was marked done",
         "currentPart": {
diff --git a/src/agents/utils/parse_json_to_prompt.py b/src/agents/utils/parse_json_to_prompt.py
@@ -189,7 +189,7 @@ def format_part_details(part: PartDetails, currentPartId: str, summary: List[Stu
     Guidance to Solve the Question: {questionInformation.questionGuidance or 'None'};
     Description of Question: {questionInformation.questionContent};
     Expected Time to Complete the Question: {f'{questionInformation.durationLowerBound} - {questionInformation.durationUpperBound} min;' if questionInformation.durationLowerBound and questionInformation.durationUpperBound else 'No specified duration.'}
-    Time Spent on the Question This Session: {questionAccessInformation.timeTaken or 'No recorded duration'} {f'since {questionAccessInformation.markedDone}' if questionAccessInformation.markedDone else ''} {f'which is {questionAccessInformation.accessStatus}' if questionAccessInformation.accessStatus else ''};
+    Time Spent on the Question This Session: {questionAccessInformation.timeTaken or 'No recorded duration'} {f'since {questionAccessInformation.markedDone}' if questionAccessInformation.markedDone else {f'which is {questionAccessInformation.accessStatus}' if questionAccessInformation.accessStatus else ''}}; 
     """
 
     partsDetails = "\n".join(
diff --git a/src/agents/utils/synthetic_conversation_generation.py b/src/agents/utils/synthetic_conversation_generation.py
@@ -18,6 +18,7 @@
 Any of the models accessible through the API calls defined in the 'llm_factory.py' can be used for either the tutor and the agent LLM.
 """
 
+import csv
 import json
 try:
   from ..student_agent.student_agent import invoke_student_agent
@@ -104,7 +105,7 @@ def generate_synthetic_conversations(raw_text: str, num_turns: int, student_agen
 if __name__ == "__main__":
   num_turns = 6
   # Can be "informational", "socratic", "google_learnlm"
-  tutor_agent_types   = ["socratic"]                           
+  tutor_agent_types   = ["google_learnlm"]                           
   # Can be "base", "curious", "contradicting", "reliant", "confused", "unrelated"
   student_agent_types = ["base", "curious", "contradicting", "reliant", "confused", "unrelated"]  
 
@@ -116,15 +117,25 @@ def generate_synthetic_conversations(raw_text: str, num_turns: int, student_agen
     if filename.endswith("1.json"):
       questions.append(os.path.join(example_inputs_folder, filename))
 
-  for student_agent_type in student_agent_types:
     for tutor_agent_type in tutor_agent_types:
-      for question in questions:
-        with open(question, "r") as file:
-          raw_text = file.read()
-
-          conversation = generate_synthetic_conversations(raw_text, num_turns, student_agent_type, tutor_agent_type)
-
-          conversation_output_filename = output_folder + question.split('/')[-1].replace(".json", "_"+student_agent_type+"_"+tutor_agent_type+"_conversation.json")
-          with open(conversation_output_filename, "w") as file:
-            json.dump(conversation, file, indent=2)
-
+      # Open CSV file for writing
+      csv_filename = os.path.join(output_folder, "all_conversations_"+tutor_agent_type+".csv")
+      with open(csv_filename, "w", newline='') as csvfile:
+        csv_writer = csv.writer(csvfile)
+        # Write the header
+        csv_writer.writerow(["tutor", "student", "conversation", "conversation_id"])
+      
+        for student_agent_type in student_agent_types:
+          for question in questions:
+            with open(question, "r") as file:
+              raw_text = file.read()
+
+              conversation = generate_synthetic_conversations(raw_text, num_turns, student_agent_type, tutor_agent_type)
+
+              conversation_output_filename = output_folder + question.split('/')[-1].replace(".json", "_"+student_agent_type+"_"+tutor_agent_type+"_conversation.json")
+              with open(conversation_output_filename, "w") as file:
+                json.dump(conversation, file, indent=2)
+
+              # Write to CSV
+              conversation_id = conversation["conversation_id"]
+              csv_writer.writerow([tutor_agent_type, student_agent_type, conversation["conversation"], conversation_id])