Skip to content

Testimony summarization -- Just look at hearing_overview_pipeline.py #1838

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
143 changes: 143 additions & 0 deletions llm/hearing_overview_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import json
import numpy as np
import os
import pandas as pd
# import tiktoken
# import streamlit as st
import urllib.request
# import chromadb
import re
import requests

# from chromadb.config import Settings
from dataclasses import dataclass, field
# from langchain_text_splitters import TokenTextSplitter

from operator import itemgetter
from pathlib import Path
from requests.exceptions import RequestException
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Tuple, List

from prompts import *
from tag_categories import *
import firebase_admin
from firebase_admin import firestore
import openai
# import google.auth
# Guardrails appended to every summarization prompt. Each instruction sits on
# its own line of the final string; lines are separated by " \n".
PROMPT_INSTRUCTIONS = " \n".join((
    "Follow these instructions when creating the summary:",
    "-Try to provide a balanced summary giving space to multiple sides of an issue that were shared in the hearing",
    "-Try to avoid repeating uncritically the facts that were shared in testimony, as they may not be accurate",
    "-Do not repeat any offensive, slanderous, or personally derogatory statements",
    "-Note that the transcripts may contain transcription errors, such as mis-identified homophones, and that names referenced in the hearing may not have been transcribed accurately.",
))

# Point Google Cloud client libraries at the development project.
# This runs at import time and overwrites any value already in the environment.
os.environ["GOOGLE_CLOUD_PROJECT"] = "digital-testimony-dev"

@dataclass()
class HearingDetails():
    """
    A class to store all the details pertaining to a hearing.

    `hearing_id` is always stored as a string: a non-string value such as an
    int is converted on construction, so the field matches its annotation.
    """
    # Identifier of the hearing; receive_hearing() interpolates it into the
    # malegislature.gov Hearings API URL.
    hearing_id: str = ''
    # Text of the hearing that gets summarized.
    hearing_text: str = ''
    # Summary text associated with the hearing.
    summary: str = ''

    def __post_init__(self):
        # Callers may pass the numeric id (e.g. 4539) as an int; normalize it
        # so the field is consistently a string. None is left untouched.
        if self.hearing_id is not None and not isinstance(self.hearing_id, str):
            self.hearing_id = str(self.hearing_id)


def _extract_bill_numbers(payload) -> List[str]:
    """Collect the bill numbers listed across all agendas of a hearing payload."""
    if not isinstance(payload, dict):
        return []
    numbers = []
    # `or []` guards against keys that are present but null in the JSON.
    for agenda in payload.get("HearingAgendas") or []:
        for doc in agenda.get("DocumentsInAgenda") or []:
            number = doc.get("BillNumber")
            if number:
                numbers.append(number)
    return numbers


def receive_hearing(hearing: HearingDetails, general_court: str = "194"):
    """
    Look up the bills on a hearing's agenda and fetch their stored summaries.

    The hearing is fetched from the malegislature.gov Hearings API to discover
    which bill numbers are on its agenda; each bill's `summary` field is then
    read from Firestore under `generalCourts/<general_court>/bills/<number>`.

    Args:
        hearing: The hearing whose `hearing_id` is looked up and whose
            `hearing_text` is passed through unchanged.
        general_court: Firestore document id of the general court whose bills
            are searched. Defaults to "194".

    Returns:
        A tuple `(hearing_text, bill_summaries)` where `bill_summaries` maps
        bill number to summary text. Bills missing from Firestore are omitted.
        If the hearing cannot be fetched or Firestore cannot be reached, the
        mapping is empty.
    """
    hearing_text = hearing.hearing_text
    hearing_id = hearing.hearing_id
    print(f'Receiving hearing with ID: {hearing_id}')
    link = f'https://malegislature.gov/api/Hearings/{hearing_id}'
    # Convert the hearing to a JSON object, allows us to access the bill ids
    # that are mentioned during the hearing
    try:
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request. It is kept to preserve existing behavior; confirm
        # whether it is still required and remove it if not.
        response = requests.get(link, verify=False, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (RequestException, ValueError) as e:
        # Same best-effort fallback as a Firestore connection failure below.
        print(f'Error fetching hearing {hearing_id}: {e}')
        return hearing_text, {}

    bill_numbers = _extract_bill_numbers(payload)
    print(bill_numbers)
    try:
        db = connect_to_firestore()
    except Exception as e:
        print(f'Error connecting to Firestore: {e}')
        # st.error("Could not connect to Firestore. Please check your configuration.")
        return hearing_text, {}

    bills = db.collection("generalCourts").document(general_court).collection("bills")
    bill_summaries = {}
    for number in bill_numbers:
        # Fetch the bill document from Firestore
        bill_doc = bills.document(number).get()
        if bill_doc.exists:
            bill_data = bill_doc.to_dict()
            bill_summaries[number] = bill_data.get("summary", "")
        # else:
        #     st.warning(f"Bill {number} not found in Firestore.")
    return hearing_text, bill_summaries

def connect_to_firestore():
    """
    Return a Firestore client, initializing the default Firebase app if needed.

    Safe to call more than once: `firebase_admin.initialize_app()` raises
    ValueError when the default app already exists, so an existing app is
    reused instead of being re-initialized.
    """
    try:
        firebase_admin.get_app()
    except ValueError:
        # No default app has been initialized yet in this process.
        firebase_admin.initialize_app()
    return firestore.client()

def make_openai_request(prompt: str, model: str = "gpt-4o-mini", timeout: float = 120.0) -> str:
    """
    Make a request to OpenAI's API to get a response for the given prompt.

    Args:
        prompt: Text sent as a single user message.
        model: Name of the chat model to use.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content of the first choice's message in the API response.

    Raises:
        RuntimeError: If the OPENAI_API_KEY environment variable is not set.
        requests.HTTPError: If the API responds with an error status code.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Fail with a clear message instead of a TypeError when building the
        # Authorization header from None.
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
    # Kept for compatibility with code that reads the key from the openai module.
    openai.api_key = api_key

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface API errors (bad key, rate limit, ...) as HTTP errors rather than
    # as a KeyError on the missing 'choices' field.
    response.raise_for_status()
    message = response.json()
    return message['choices'][0]['message']['content']

if __name__ == "__main__":
    # Example usage: summarize one hearing loaded from a local JSON file.
    # The context manager closes the file handle once the JSON is parsed.
    with open('./jsons/hearing-4539.json', encoding='utf-8') as data:
        testimony = json.load(data)

    hearing_text = testimony['text']
    hearing = HearingDetails(
        # Passed as a string to match the declared type of the field.
        hearing_id="4539",
        hearing_text=hearing_text,
        summary="This is a sample summary."
    )
    text, summaries = receive_hearing(hearing)
    # NOTE(review): PROMPT_INSTRUCTIONS already begins with its own "Follow
    # these instructions ..." sentence, so the lead-in below is duplicated in
    # the final prompt. Wording left unchanged here; confirm before editing.
    # The prompt lines are kept flush-left so no indentation leaks into the text.
    PROMPT_BILL_SUMMARIES = f'''
Provide a summary of this hearing to a regular person with no special knowledge or expertise of this area.
This is a hearing discussing several pending bills.
Provide a short summary of the sentiments that were expressed about the bills that were discussed during the hearing.
Focus on which bills were discussed the most, and what the most common points were.
Pull, if applicable, a compelling quote that is representative of a commonly made argument.
If there was consensus on any specific point agreed upon by stakeholders who otherwise disagreed, please note that.
You do not need to provide a summary of the bills themselves.
Try and keep the overview of the hearing to 300 words or less.
Follow these instructions when creating the prompt: {PROMPT_INSTRUCTIONS}
The text of the hearing is as follows:
```
{text}
```
The summaries of each bill mentioned during the hearing are as follows:
```
{summaries}
```
'''
    response = make_openai_request(PROMPT_BILL_SUMMARIES)
    print(response)
Loading