#!/usr/bin/env python3
"""
Test script to verify export functionality with diarized transcription.
"""

import sys
import os
import logging
import time
from pathlib import Path

# Put the sibling "app" directory at the front of the module search path
sys.path.insert(0, str(Path(__file__).parent.joinpath("app")))

# Configure root logging at INFO with a timestamped format, then grab this
# module's own logger
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def _log_export_preview(export_format, export_data):
    """Log a short preview of the payload produced for one export format.

    Args:
        export_format: Format name that was requested ("txt", "csv" or
            "json"); any other value logs nothing.
        export_data: Value returned by ``export_transcription``. It is decoded
            as UTF-8 here, so it is expected to be bytes-like.

    Raises:
        KeyError: If a JSON payload lacks the "transcription", "id" or
            "num_speakers" keys, or a sampled segment lacks "speaker"/"text".
    """
    import json

    if export_format in ("txt", "csv"):
        content = export_data.decode("utf-8")
        logger.info(
            f"{export_format.upper()} export (first 500 chars):\n{content[:500]}..."
        )
        return

    if export_format != "json":
        return

    data = json.loads(export_data.decode("utf-8"))
    logger.info("JSON export structure:")
    transcription_data = data["transcription"]
    logger.info(f"  Transcription ID: {transcription_data['id']}")
    logger.info(f"  Number of speakers: {transcription_data['num_speakers']}")

    if "segments" not in transcription_data:
        return

    segments = transcription_data["segments"]
    logger.info(f"  Number of segments: {len(segments)}")
    if segments:
        logger.info("  Sample segments:")
        # Only the first three segments are shown to keep the log short.
        for seg in segments[:3]:
            logger.info(f"    {seg['speaker']}: '{seg['text']}'")


def test_export_functionality():
    """Test the export service with diarized transcription.

    Builds a ``Transcription`` row with five mock two-speaker segments,
    commits it through a session taken from ``get_database()``, and exports it
    as "txt", "csv" and "json", logging a preview of each result. The committed
    row is not removed afterwards.

    Every format is attempted even if an earlier one fails, so a single run
    reports all broken formats.

    Raises:
        AssertionError: If exporting (or previewing) any format raised.
    """
    from app.services.export_service import ExportService
    from app.services.diarization_service import DiarizationService
    from app.core.database import get_database
    from app.models.transcription import Transcription

    # Initialize services
    export_service = ExportService()
    # NOTE(review): this instance is never used below; kept because the
    # constructor's side effects are not visible here - confirm and remove.
    diarization_service = DiarizationService()

    # Get database session (get_database() is consumed as an iterator)
    db = get_database()
    session = next(db)

    try:
        logger.info("=== Testing export functionality with diarized transcription ===")

        # Create a mock transcription with segments and speaker assignments.
        # NOTE(review): file_path is a developer-machine path; this function
        # never opens it, but verify the export service does not either.
        transcription = Transcription(
            session_id="test_export_" + str(int(time.time())),
            original_filename="stealth.mp3",
            file_path="/Users/cmorgan/Devel/Personal/SecureTranscribe/stealth.mp3",
            file_size=82184,
            file_duration=10.27,
            file_format="mp3",
            status="completed",
            whisper_model="base",
            pyannote_model="pyannote/speaker-diarization-3.1",
            speakers_assigned=True,
            num_speakers=2,
        )

        # Add mock transcription segments with speaker assignments
        mock_segments = [
            {
                "start_time": 0.0,
                "end_time": 0.8,
                "text": "Who's there?",
                "confidence": 0.95,
                "speaker": "John Doe",
            },
            {
                "start_time": 0.9,
                "end_time": 1.7,
                "text": "It's me.",
                "confidence": 0.93,
                "speaker": "Jane Smith",
            },
            {
                "start_time": 1.8,
                "end_time": 2.5,
                "text": "What do you want?",
                "confidence": 0.91,
                "speaker": "John Doe",
            },
            {
                "start_time": 2.6,
                "end_time": 3.4,
                "text": "I need help.",
                "confidence": 0.94,
                "speaker": "Jane Smith",
            },
            {
                "start_time": 3.5,
                "end_time": 4.2,
                "text": "Help with what?",
                "confidence": 0.92,
                "speaker": "John Doe",
            },
        ]

        transcription.segments = mock_segments

        session.add(transcription)
        session.commit()
        session.refresh(transcription)

        logger.info(
            f"Created transcription with {len(transcription.segments)} segments"
        )
        logger.info("Speaker assignments:")
        for i, segment in enumerate(transcription.segments):
            logger.info(f"  Segment {i}: {segment['speaker']} - '{segment['text']}'")

        # Test export to different formats
        formats = ["txt", "csv", "json"]
        failed_formats = []

        for export_format in formats:
            logger.info(f"\n--- Testing {export_format.upper()} export ---")
            try:
                export_data = export_service.export_transcription(
                    transcription, export_format, session=session
                )
                _log_export_preview(export_format, export_data)
            except Exception as e:
                # Keep going so the remaining formats are still exercised, but
                # remember the failure so the test does not pass silently.
                logger.exception(f"Export to {export_format} failed: {e}")
                failed_formats.append(export_format)

        if failed_formats:
            raise AssertionError(
                f"Export failed for format(s): {', '.join(failed_formats)}"
            )

    finally:
        session.close()


# Allow running this file directly as a script, outside of a test runner.
if __name__ == "__main__":
    test_export_functionality()