Commit d50ca83

Create LLMEval.py
1 parent 69f89b7 commit d50ca83

fmeval/LLMEval.py

Lines changed: 212 additions & 0 deletions
from datasets import load_dataset
from tabulate import tabulate

# DataConfig and MIME_TYPE_JSONLINES are required by modelEvaluator() below
from fmeval.constants import MIME_TYPE_JSONLINES
from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms.qa_accuracy import QAAccuracy, QAAccuracyConfig

import random
import boto3
import botocore
from botocore.client import Config
import json
import pandas as pd

import plotly.express as px

def huggingFaceDatasetDownloader(download_file_path):
    # Load the dataset from Hugging Face
    dataset = load_dataset("databricks/databricks-dolly-15k")
    df = dataset["train"].to_pandas()

    # Display the first record
    #print(tabulate(df.head(1), headers='keys', tablefmt='psql'))

    record_count = len(df)
    print("Record count:", record_count)

    # Group by the category column and display the count for each category
    category_counts = df.groupby("category").size().reset_index(name='count')
    print("Category Counts:")
    print(tabulate(category_counts, headers='keys', tablefmt='psql'))

    # Save the DataFrame as a CSV file
    df.to_csv(download_file_path, index=False)
    return df
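
# For reference, each databricks-dolly-15k record supplies the four columns the
# functions below rely on (instruction, context, response, category); a
# hypothetical, abridged example row:
#   {"instruction": "What is the capital of France?", "context": "",
#    "response": "Paris", "category": "open_qa"}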

def invokeMetaLlama3Model(df, random_sample_count):
    random_records = random.sample(range(len(df)), random_sample_count)
    df_sample = df.iloc[random_records].copy()
    df_sample['prompt'] = ""
    df_sample['metaLlama3Response'] = ""

    bedrock_runtime = boto3.client('bedrock-runtime', config=Config(read_timeout=500))

    count = 0
    for index, row in df_sample.iterrows():
        count += 1
        print(f"Processing the request for Llama3 Model: {count}")
        instruction = row['instruction']
        context = row['context']
        category = row['category']

        prompt = f"""
Category: {category}
Instruction: {instruction}
Context: {context}

Please provide a precise response to the instruction and context based on the category.
"""
        try:
            body = json.dumps({"prompt": prompt, "max_gen_len": 200, "temperature": 0.5, "top_p": 0.9})
            modelId = "meta.llama3-70b-instruct-v1:0"
            accept = "application/json"
            contentType = "application/json"

            response = bedrock_runtime.invoke_model(
                body=body, modelId=modelId, accept=accept, contentType=contentType
            )
            response_body = json.loads(response.get("body").read()).get("generation")

            #print(response_body)
            df_sample.at[index, 'prompt'] = prompt
            df_sample.at[index, 'metaLlama3Response'] = response_body

        except botocore.exceptions.ClientError as error:
            if error.response['Error']['Code'] == 'AccessDeniedException':
                print(f"\x1b[41m{error.response['Error']['Message']} \
\nTo troubleshoot this issue please refer to the following resources.\
\nhttps://docs.aws.amazon.com/IAM/latest/UserGuide/troubleshoot_access-denied.html\
\nhttps://docs.aws.amazon.com/bedrock/latest/userguide/security-iam.html\x1b[0m\n")
            else:
                raise error
    return df_sample
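
# Note: on Bedrock, the Llama 3 response body is a JSON document whose generated
# text sits under the "generation" key, which is what the
# json.loads(...).get("generation") call above extracts; other fields (token
# counts, stop reason) are discarded here.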

def invokeAnthropicModel(df_sample, response_file_path):
    df_sample['anthropicResponse'] = ""

    bedrock_runtime = boto3.client('bedrock-runtime', config=Config(read_timeout=500))

    count = 0
    for index, row in df_sample.iterrows():
        count += 1
        print(f"Processing the request for Anthropic Model: {count}")
        instruction = row['instruction']
        context = row['context']
        category = row['category']

        prompt = f"""
Category: {category}
Instruction: {instruction}
Context: {context}

Please provide a precise response to the instruction and context based on the category.
"""
        try:
            messages = [{"role": 'user', "content": [{'type': 'text', 'text': prompt}]}]
            body = json.dumps({"anthropic_version": "bedrock-2023-05-31", "max_tokens": 200, "messages": messages, "temperature": 0.5, "top_p": 0.9})
            modelId = "anthropic.claude-3-sonnet-20240229-v1:0"
            accept = "application/json"
            contentType = "application/json"

            response = bedrock_runtime.invoke_model(
                body=body, modelId=modelId, accept=accept, contentType=contentType
            )
            response_body = json.loads(response.get('body').read())
            response_text = response_body.get('content')[0]['text']

            #print(response_text)
            df_sample.at[index, 'anthropicResponse'] = response_text

        except botocore.exceptions.ClientError as error:
            if error.response['Error']['Code'] == 'AccessDeniedException':
                print(f"\x1b[41m{error.response['Error']['Message']}\
\nTo troubleshoot this issue please refer to the following resources.\
\nhttps://docs.aws.amazon.com/IAM/latest/UserGuide/troubleshoot_access-denied.html\
\nhttps://docs.aws.amazon.com/bedrock/latest/userguide/security-iam.html\x1b[0m\n")
            else:
                raise error
    df_sample.to_json(response_file_path, orient='records', lines=True)
    return df_sample
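
# After this function runs, response_file_path holds one JSON object per line:
# the original dolly columns plus the two model-output columns, e.g. (abridged,
# hypothetical values):
#   {"instruction": "...", "context": "...", "response": "...", "category": "...",
#    "prompt": "...", "metaLlama3Response": "...", "anthropicResponse": "..."}
# modelEvaluator() below reads this file back as a JSON Lines dataset.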

def modelEvaluator(model_name, model_output_attribute, model_output_file, evaluator_response_file):
    config = DataConfig(
        dataset_name=model_name,
        dataset_uri=model_output_file,
        dataset_mime_type=MIME_TYPE_JSONLINES,
        model_input_location="instruction",
        target_output_location="response",
        model_output_location=model_output_attribute
    )

    # Configure and run the QAAccuracy evaluation
    qa_eval = QAAccuracy(QAAccuracyConfig(target_output_delimiter="<OR>"))
    results = qa_eval.evaluate(dataset_config=config, save=True)
    #print(json.dumps(results, default=vars, indent=4))
    with open(evaluator_response_file, 'w') as f:
        json.dump(results, f, default=lambda c: c.__dict__)
    print(f'Results saved to {evaluator_response_file}')
    return results
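
# qa_eval.evaluate() returns a list of fmeval EvalOutput objects rather than
# plain dicts, hence the default=lambda c: c.__dict__ serializer above;
# load_results() below reads the "dataset_name" and "dataset_scores" fields
# back out of the saved JSON.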

def load_results(model_names, evaluator_response_folder):
    accuracy_results = []
    for model_name in model_names:
        file = f'{evaluator_response_folder}/{model_name}.json'
        with open(file, 'r') as f:
            res = json.load(f)
            for accuracy_eval in res:
                for accuracy_scores in accuracy_eval["dataset_scores"]:
                    accuracy_results.append(
                        {'model': model_name, 'evaluation': 'accuracy', 'dataset': accuracy_eval["dataset_name"],
                         'metric': accuracy_scores["name"], 'value': accuracy_scores["value"]})

    accuracy_results_df = pd.DataFrame(accuracy_results)
    return accuracy_results_df
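
# The returned frame is long-form, one row per (model, metric) pair with
# columns model / evaluation / dataset / metric / value, which is the layout
# px.line_polar() in visualize_radar() expects.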

def visualize_radar(results_df, plotfilePath, dataset):
    if dataset == 'all':
        # Average each metric across datasets before plotting
        mean_across_datasets = results_df.drop('evaluation', axis=1).groupby(['model', 'metric']).describe()['value']['mean']
        results_df = pd.DataFrame(mean_across_datasets).reset_index().rename({'mean': 'value'}, axis=1)
    else:
        results_df = results_df[results_df['dataset'] == dataset]

    fig = px.line_polar(results_df, r='value', theta='metric', color='model', line_close=True)
    xlim = 1
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, xlim],
            )),
        margin=dict(l=150, r=0, t=100, b=80)
    )

    title = 'Average Performance over databricks/databricks-dolly-15k' if dataset == 'all' else dataset
    fig.update_layout(
        title=dict(text=title, font=dict(size=20), yref='container')
    )
    fig.show()
    fig.write_image(plotfilePath)

def main():
    user_dir = "/home/sagemaker-user/"
    models = ["Meta_Llama3_70b_Instruct", "Anthropic_Claude_3_Sonnet"]
    random_sample_count = 3000

    df = huggingFaceDatasetDownloader(user_dir + "databricks-dolly-15k.csv")
    df_sample = invokeMetaLlama3Model(df, random_sample_count)
    df_sample = invokeAnthropicModel(df_sample, user_dir + "response.json")
    modelEvaluator(models[0], "metaLlama3Response", user_dir + "response.json", user_dir + f"{models[0]}.json")
    modelEvaluator(models[1], "anthropicResponse", user_dir + "response.json", user_dir + f"{models[1]}.json")

    results_df = load_results(models, user_dir)
    visualize_radar(results_df, user_dir + "modelEvaluationPlot.pdf", dataset='all')


if __name__ == '__main__':
    main()
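
# Assumed prerequisites for running this script end to end (not pinned by this
# commit): AWS credentials with Bedrock access to both model IDs used above,
# plus the datasets, tabulate, fmeval, boto3, pandas and plotly packages;
# plotly's fig.write_image() additionally needs the kaleido static-image backend.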
