Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/typeagent/emails/email_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT License.

from email import message_from_string
from email.header import decode_header, make_header
from email.message import Message
from email.utils import parsedate_to_datetime
from pathlib import Path
Expand All @@ -11,6 +12,14 @@
from .email_message import EmailMessage, EmailMessageMeta


def decode_encoded_words(value: str) -> str:
    """Decode text that may contain RFC 2047 encoded words.

    Args:
        value: Raw header text (for example a Subject or From value). It may
            be plain text, contain one or more ``=?charset?enc?data?=``
            encoded words, or be empty.

    Returns:
        The decoded text. An empty or falsy ``value`` yields ``""``. If the
        encoded words cannot be decoded (broken base64, an unknown charset
        label, or bytes that are invalid for the declared charset), the
        original ``value`` is returned unchanged instead of raising.
    """
    # Local import keeps this block self-contained; MessageError is the base
    # class of both HeaderParseError and CharsetError.
    from email.errors import MessageError

    if not value:
        return ""

    try:
        return str(make_header(decode_header(value)))
    except (MessageError, LookupError, UnicodeError):
        # make_header() decodes strictly, so real-world malformed headers
        # would otherwise abort the whole import. Showing the raw header is
        # preferable to failing on a single bad message.
        return value


def import_emails_from_dir(
dir_path: str, max_chunk_length: int = 4096
) -> Iterable[EmailMessage]:
Expand Down Expand Up @@ -78,7 +87,7 @@ def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage:
body = get_last_response_in_thread(body)

if email_meta.subject is not None:
body = email_meta.subject + "\n\n" + body
body = decode_encoded_words(email_meta.subject) + "\n\n" + body

body_chunks = _text_to_chunks(body, max_chunk_length)
email: EmailMessage = EmailMessage(
Expand Down
23 changes: 3 additions & 20 deletions tools/ingest_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@

import argparse
import asyncio
from email.header import decode_header
from pathlib import Path
import sys
import time

from typeagent.aitools import utils
from typeagent.emails.email_import import import_email_from_file
from typeagent.emails.email_import import decode_encoded_words, import_email_from_file
from typeagent.emails.email_memory import EmailMemory
from typeagent.emails.email_message import EmailMessage
from typeagent.knowpro.convsettings import ConversationSettings
Expand Down Expand Up @@ -91,21 +90,6 @@ def collect_email_files(paths: list[str], verbose: bool) -> list[Path]:
return email_files


def decode_encoded_word(s: str) -> str:
    """Decode an RFC 2047 encoded string.

    Args:
        s: Text that may contain ``=?charset?enc?data?=`` encoded words.

    Returns:
        The decoded text. Input without any encoded-word marker is returned
        unchanged. Bytes that are invalid for their declared charset are
        replaced rather than raising, and an unknown charset label falls
        back to UTF-8. If the encoded words cannot be parsed at all, ``s``
        is returned as-is.
    """
    from email.errors import HeaderParseError

    # Fast path for the common case. Test for the generic "=?" marker rather
    # than a lowercase "=?utf-8?" literal: charset labels are
    # case-insensitive and may name any charset (e.g. "=?UTF-8?B?" or
    # "=?iso-8859-1?q?").
    if "=?" not in s:
        return s

    try:
        decoded_parts = decode_header(s)
    except HeaderParseError:
        # Broken base64 payload; show the raw text rather than failing.
        return s

    pieces = []
    for part, encoding in decoded_parts:
        if isinstance(part, bytes):
            try:
                part = part.decode(encoding or "utf-8", errors="replace")
            except LookupError:
                # Unknown charset label: errors="replace" does not cover
                # this, so decode leniently as UTF-8 instead.
                part = part.decode("utf-8", errors="replace")
        pieces.append(part)
    return "".join(pieces)


async def ingest_emails(
paths: list[str],
database: str,
Expand Down Expand Up @@ -194,17 +178,16 @@ async def ingest_emails(
print()

if verbose:
print(f" From: {email.metadata.sender}")
print(f" From: {decode_encoded_words(email.metadata.sender)}")
if email.metadata.subject:
print(
f" Subject: {decode_encoded_word(email.metadata.subject).replace('\n', '\\n')}"
f" Subject: {decode_encoded_words(email.metadata.subject).replace('\n', '\\n')}"
)
print(f" Date: {email.timestamp}")
print(f" Body chunks: {len(email.text_chunks)}")
for chunk in email.text_chunks:
# Show first N chars of each decoded chunk
N = 150
chunk = decode_encoded_word(chunk)
preview = repr(chunk[: N + 1])[1:-1]
if len(preview) > N:
preview = preview[: N - 3] + "..."
Expand Down