Skip to content

Commit

Permalink
Integration between LLM and Gmail
Browse files Browse the repository at this point in the history
  • Loading branch information
siddhantkushwaha committed Dec 25, 2024
1 parent 4545095 commit 832092f
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 48 deletions.
92 changes: 59 additions & 33 deletions cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import params
import util
import ollama_lib
from db.database import get_db
from db.models import Rule
from falcon import FalconClient, process_gmail_dic
Expand Down Expand Up @@ -64,24 +65,19 @@ def consolidate(falcon_client, main_query):
time.sleep(0.5)


def get_label_ids(mail):
return {i for i in mail.get('labelIds', [])}
def get_label_names(mail_processed, label_id_to_name_mapping):
    """Translate the label ids stored on a processed mail into label names.

    :param mail_processed: processed mail dict; its 'LabelIds' entry is iterated.
    :param label_id_to_name_mapping: mapping from label id to label name.
    :return: set with the name of every label id found on the mail.
    :raises KeyError: if a label id is missing from the mapping.
    """
    names = set()
    for label_id in mail_processed['LabelIds']:
        names.add(label_id_to_name_mapping[label_id])
    return names


def get_label_names(mail, label_id_to_name_mapping):
return {label_id_to_name_mapping[i] for i in get_label_ids(mail)}


def should_delete_email(mail, blacklist_rules, whitelist_rules, label_id_to_name_mapping):
def should_delete_email(mail_processed, blacklist_rules, whitelist_rules, label_id_to_name_mapping):
curr_time = int(time.time())

mail_processed = process_gmail_dic(mail)
sender = lower_strip_clean(mail_processed['Sender'])
subject = mail_processed['Subject']
text = mail_processed['Text']
snippet = mail_processed['Snippet']
timediff = curr_time - int(mail_processed['DateTime'].timestamp())
labels = get_label_names(mail, label_id_to_name_mapping)
labels = get_label_names(mail_processed, label_id_to_name_mapping)
tags = set()

if mail_processed['Unsubscribe'] is not None:
Expand All @@ -100,16 +96,15 @@ def should_delete_email(mail, blacklist_rules, whitelist_rules, label_id_to_name
return False


def process_labelling(mail, label_rules, add_labels, remove_labels, label_id_to_name_mapping):
def process_labelling(mail_processed, label_rules, add_labels, remove_labels, label_id_to_name_mapping):
curr_time = int(time.time())

mail_processed = process_gmail_dic(mail)
sender = mail_processed['Sender']
subject = mail_processed['Subject']
text = mail_processed['Text']
snippet = mail_processed['Snippet']
timediff = curr_time - int(mail_processed['DateTime'].timestamp())
labels = get_label_names(mail, label_id_to_name_mapping)
labels = get_label_names(mail_processed, label_id_to_name_mapping)
tags = set()

if mail_processed['Unsubscribe'] is not None:
Expand Down Expand Up @@ -144,7 +139,23 @@ def process_labelling(mail, label_rules, add_labels, remove_labels, label_id_to_
util.log('Skipping processing other labelling rules.')
break

def apply_ai_labels(mail_processed, ai_labels, add_labels, remove_labels, label_id_to_name_mapping):
    """Queue label changes so the mail ends up with the labels the LLM just chose.

    Labels produced here are named 'AI/<MODEL>/<LABEL>' in upper case. Labels in
    that namespace that are already on the mail but were not returned this time
    are queued for removal; newly returned ones are queued for addition.

    :param mail_processed: processed mail dict, passed to get_label_names and
        to ollama_lib.process_email.
    :param ai_labels: candidate labels handed to ollama_lib.process_email.
    :param add_labels: list that label names to add are appended to (mutated).
    :param remove_labels: list that label names to remove are appended to (mutated).
    :param label_id_to_name_mapping: mapping from label id to label name.
    :return: None; results are reported through add_labels / remove_labels.
    """
    email_labels = get_label_names(mail_processed, label_id_to_name_mapping)

    out_labels, model_name = ollama_lib.process_email(mail_processed, ai_labels)

    # The trailing '/' anchors the match to this model's namespace. Without it,
    # a prefix like 'AI/PHI3' would also match labels of a different model whose
    # name merely starts the same way (e.g. 'AI/PHI3.5/...') and remove them.
    model_prefix = f'AI/{model_name}/'.upper()

    new_ai_labels = [f'{model_prefix}{label}'.upper() for label in out_labels]
    wanted = set(new_ai_labels)

    # Keep the order returned by the model; skip labels the mail already has.
    for label in new_ai_labels:
        if label not in email_labels:
            add_labels.append(label)

    # Anything left in this model's namespace that was not chosen is stale.
    for label in email_labels:
        if label.startswith(model_prefix) and label not in wanted:
            remove_labels.append(label)


def cleanup(email, main_query, num_days, key):
util.log(f'Cleanup triggered for {email} - {main_query}.')

Expand All @@ -157,6 +168,8 @@ def get_query(rule_type):

whitelist_rules = {i.query for i in db.session.query(Rule).filter(get_query('whitelist')).all()}

ai_labels = ollama_lib.get_ai_labels()

# For safety, I have kept this hard-coded
whitelist_rules.add("'starred' in labels")

Expand All @@ -181,24 +194,26 @@ def get_query(rule_type):

mails = falcon_client.gmail.list_mails(query=get_query, max_pages=10000)

created_label_names = {label['name']: label['id'] for label in falcon_client.gmail.list_labels()['labels']}
created_label_ids = {label['id']: label['name'] for label in falcon_client.gmail.list_labels()['labels']}
labels_info = falcon_client.gmail.list_labels()["labels"]

created_label_names = {label["name"]: label["id"] for label in labels_info}
created_label_ids = {label["id"]: label["name"] for label in labels_info}

for mail in mails:
mail_id = mail['id']
mail_id = mail["id"]

mail_full = get_mail(falcon_client, mail_id)
mail_processed = process_gmail_dic(mail_full)

# --------------- code to dump
# pm = process_gmail_dic(mail_full)
# pm['DateTime'] = int(pm['DateTime'].timestamp())
# pm['Email'] = email
# util.save_mail_to_cache(pm)
# mail_processed['DateTime'] = int(mail_processed['DateTime'].timestamp())
# mail_processed['Email'] = email
# util.save_mail_to_cache(mail_processed)
# continue
# -----------------

move_to_trash = should_delete_email(
mail_full,
mail_processed,
blacklist_rules,
whitelist_rules,
created_label_ids
Expand All @@ -209,24 +224,33 @@ def get_query(rule_type):
remove_label_names = []

process_labelling(
mail_full,
mail_processed,
label_rules,
add_label_names,
remove_label_names,
created_label_ids
)

existing_label_ids = get_label_ids(mail_full)
if use_llm:
apply_ai_labels(mail_processed, ai_labels, add_label_names, remove_label_names, created_label_ids)

existing_label_ids = mail_processed['LabelIds']

add_label_ids = []
for label_name in add_label_names:
label_id = created_label_names.get(label_name, None)
if label_id is None:
util.log(f'Label [{label_name}] not found, creating it.')
label_id = falcon_client.gmail.create_label(label_name)['id']

created_label_names[label_name] = label_id
created_label_ids[label_id] = label_name
prev_node = ''
for label_node in label_name.split('/'):
if len(prev_node) > 0:
label_node = f'{prev_node}/{label_node}'

label_id = created_label_names.get(label_node, None)
if label_id is None:
util.log(f'Label [{label_node}] not found, creating it.')
label_id = falcon_client.gmail.create_label(label_node)['id']
created_label_names[label_node] = label_id
created_label_ids[label_id] = label_node

prev_node = label_node

add_label_ids.append(label_id)

Expand All @@ -241,7 +265,7 @@ def get_query(rule_type):
mail_full['labelIds'].append(label_name)

move_to_trash = should_delete_email(
mail_full,
mail_processed,
blacklist_rules,
whitelist_rules,
created_label_ids
Expand All @@ -255,17 +279,19 @@ def get_query(rule_type):
consolidate(falcon_client, main_query)


if __name__ == '__main__':
if __name__ == "__main__":
try:
num_days = int(sys.argv[1]) if len(sys.argv) > 1 else -1
if num_days == -1:
num_days = 2

key = sys.argv[2] if len(sys.argv) > 2 else None
if key is None:
if key is None or key == "#":
key = getpass.getpass("Please provide secret key: ")

util.log(f'Running cleanup on emails in last [{num_days}] days.')
use_llm = len(sys.argv) > 3 and sys.argv[3] == "1"

util.log(f"Running cleanup on emails in last [{num_days}] days.")

for em in list(params.emails):
cleanup(email=em, main_query=params.emails[em], num_days=num_days, key=key)
Expand Down
3 changes: 2 additions & 1 deletion falcon.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def process_gmail_dic(mail):
'AttachmentIds': attachment_ids,
'DateTime': date_time,
'Htmls': html_parts,
'Snippet': mail['snippet']
'Snippet': mail['snippet'],
'LabelIds': {i for i in mail.get('labelIds', [])}
}

return processed_data
Expand Down
22 changes: 9 additions & 13 deletions ollama_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
MODEL_NAME = "phi3"


def get_predefined_labels():
def get_ai_labels():
with open(os.path.join(params.data_dir, "labels.txt"), "r") as fp:
content = fp.read()
labels = [i.lower().strip() for i in content.split()]
Expand Down Expand Up @@ -39,11 +39,11 @@ def generate_prompt(labels, sender, subject, snippet, email_content):
return prompt


def process_email(mail, predefined_labels):
def process_email(mail, ai_labels):
mail_id = mail["Id"]

prompt = generate_prompt(
labels=predefined_labels,
labels=ai_labels,
sender=mail["Sender"],
subject=mail["Subject"],
snippet=mail["Snippet"],
Expand All @@ -54,13 +54,10 @@ def process_email(mail, predefined_labels):

attempt = 1
max_attempts = 3
wait_time = 5
wait_time = 60
out_labels = []

while attempt <= max_attempts and len(out_labels) == 0:
if attempt > 1:
time.sleep(wait_time)

fp = open(os.path.join(params.dump_dir, f"{mail_id}.llm.txt"), "w")
fp.write(prompt)
fp.write(line_str)
Expand All @@ -69,7 +66,7 @@ def process_email(mail, predefined_labels):
fp.write(text_response)
fp.write(line_str)

for predef_label in predefined_labels:
for predef_label in ai_labels:
if text_response.find(f'"{predef_label}"') > -1:
out_labels.append(predef_label)

Expand All @@ -79,13 +76,14 @@ def process_email(mail, predefined_labels):
fp.write("- ".join(out_labels))
fp.close()

time.sleep(wait_time)
attempt += 1

return out_labels
return out_labels, MODEL_NAME


def process_dump():
predefined_labels = get_predefined_labels()
ai_labels = get_ai_labels()
rpt = os.path.join(params.root_dir, "dump")
for item in os.listdir(rpt):
if not item.endswith(".json"):
Expand All @@ -94,9 +92,7 @@ def process_dump():
with open(os.path.join(rpt, item), "r") as fp:
mail = json.load(fp)

process_email(mail, predefined_labels)

time.sleep(10)
process_email(mail, ai_labels)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ google-auth==2.16.0
google-auth-httplib2==0.1.0
google-auth-oauthlib==0.8.0
google-pasta==0.2.0
google-py-apis==0.1.7
google-py-apis==0.1.9
googleapis-common-protos==1.58.0
greenlet==3.1.1
httplib2==0.21.0
Expand Down

0 comments on commit 832092f

Please sign in to comment.