-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_processor.py
More file actions
141 lines (115 loc) · 3.96 KB
/
text_processor.py
File metadata and controls
141 lines (115 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import json
import re
from typing import List, Dict, Any
def convert_text_to_jsonl(text: str) -> List[Dict[str, Any]]:
    """
    Turn raw model output into a list of JSON records.

    The stripped text is handed to a fixed sequence of parsing strategies,
    covering these shapes of output:
    - Raw JSON array
    - JSON wrapped in markdown code blocks
    - JSONL format (one JSON object per line)
    - Mixed text with JSON content

    The first strategy that returns a non-empty result wins. A strategy that
    raises, or returns something empty, is treated as "did not match" and the
    next one is tried.

    Args:
        text: The model output to convert.

    Returns:
        The records produced by the first successful strategy, or an empty
        list when the input is empty/whitespace-only or no strategy produced
        a result.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return []

    # Order matters: strategies run top to bottom and the first hit is returned.
    parsers = [
        ("markdown_json", _parse_markdown_json),
        ("json_array", _parse_json_array),
        ("jsonl_lines", _parse_jsonl_lines),
        ("mixed_content", _parse_mixed_content),
        ("single_json_objects", _parse_single_json_objects),
        ("text", _parse_text_to_json)
    ]
    for _label, parse in parsers:
        try:
            records = parse(stripped)
        except Exception:
            # A raising strategy simply means "this format does not apply".
            continue
        if records:
            return records

    print(f"Failed to parse text: {text}")
    return []
def _parse_markdown_json(text: str) -> List[Dict[str, Any]]:
    """Extract JSON from the first usable markdown code fence in ``text``.

    Fences of the form ```json ... ``` or ``` ... ``` are examined in order
    (the ``json`` tag is matched case-insensitively). Fences whose content is
    not valid JSON, or decodes to something other than a list or dict, are
    skipped.

    Args:
        text: Text that may contain fenced code blocks.

    Returns:
        The decoded list if the fence holds a JSON array, or a one-element
        list wrapping the decoded dict if it holds a JSON object.

    Raises:
        ValueError: If no fence yields a JSON list or dict.
    """
    fence_re = re.compile(r'```(?:json)?\s*\n?(.*?)\n?```', re.DOTALL | re.IGNORECASE)
    for fence in fence_re.finditer(text):
        payload = fence.group(1).strip()
        try:
            decoded = json.loads(payload)
        except json.JSONDecodeError:
            continue
        if isinstance(decoded, list):
            return decoded
        if isinstance(decoded, dict):
            return [decoded]
    raise ValueError()
def _parse_json_array(text: str) -> List[Dict[str, Any]]:
    """Parse text that is, in its entirety, a single JSON document.

    Args:
        text: Candidate JSON text.

    Returns:
        The decoded list when the document is a JSON array, or a one-element
        list wrapping the decoded dict when the document is a JSON object.

    Raises:
        ValueError: If the text is not valid JSON (the underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``), or if it
            decodes to something other than a list or dict.
    """
    # Keep the try body to the one call that can fail to decode.
    try:
        data = json.loads(text)
    except json.JSONDecodeError as err:
        # Chain explicitly so the decode position/reason is not lost.
        raise ValueError("text is not a valid JSON document") from err

    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return [data]
    raise ValueError(
        f"expected a JSON array or object, got {type(data).__name__}"
    )
def _parse_jsonl_lines(text: str) -> List[Dict[str, Any]]:
    """Parse JSONL-style text (one JSON value per line) into JSON objects.

    Each non-blank line is decoded on its own. Lines that are not valid JSON
    are skipped so that stray prose between records does not discard the
    records themselves. Only JSON objects are kept: a line holding an object
    contributes that object, a line holding an array contributes the objects
    among its elements, and scalar lines (numbers, strings, booleans, null)
    are ignored so the result always matches the declared return type.

    Args:
        text: Text with one JSON value per line.

    Returns:
        The JSON objects found, in line order.

    Raises:
        ValueError: If no line contributes a JSON object.
    """
    records: List[Dict[str, Any]] = []
    # Split on '\n' only; strip() below removes any trailing '\r'.
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        try:
            value = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort: a non-JSON line is not an error for the whole text.
            continue
        if isinstance(value, dict):
            records.append(value)
        elif isinstance(value, list):
            records.extend(item for item in value if isinstance(item, dict))
        # Scalars are valid JSON but not records; drop them.

    if not records:
        raise ValueError("no JSON objects found on any line")
    return records
def _parse_mixed_content(text: str) -> List[Dict[str, Any]]:
    """Extract the JSON objects embedded in free-form text.

    The text is scanned for ``{`` characters and a real JSON decode is
    attempted at each one. Using the decoder rather than a brace-matching
    regex means objects of any nesting depth are returned whole, and braces
    that appear inside string values do not confuse the extraction. When a
    decode fails at some ``{``, scanning resumes at the next ``{``, so a valid
    inner object can still be recovered from a malformed outer one.

    Args:
        text: Arbitrary text that may contain JSON objects.

    Returns:
        The decoded objects in the order they appear. Objects nested inside
        a successfully decoded object are not reported separately.

    Raises:
        ValueError: If no JSON object can be decoded anywhere in the text.
    """
    decoder = json.JSONDecoder()
    found: List[Dict[str, Any]] = []

    start = text.find('{')
    while start != -1:
        try:
            value, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Nothing decodable at this brace; try the next candidate.
            start = text.find('{', start + 1)
            continue
        if isinstance(value, dict):
            found.append(value)
        # Resume after the decoded document so its children are not re-read.
        start = text.find('{', end)

    if not found:
        raise ValueError("no JSON object found in text")
    return found
def _parse_single_json_objects(text: str) -> List[Dict[str, Any]]:
    """Parse JSON documents separated by blank lines or rule-style dividers.

    Each candidate separator is tried in turn. For a given separator the text
    is split, every non-empty piece is decoded independently, and pieces that
    are not valid JSON are skipped. Decoded dicts are collected as-is and
    decoded lists have their elements appended. The first separator that
    yields anything determines the result.

    Args:
        text: Text containing delimiter-separated JSON documents.

    Returns:
        The collected items for the first separator that produced any.

    Raises:
        ValueError: If no separator produces a result.
    """
    for separator in ('\n\n', '\n---\n', '\n***\n', '\n---', '\n***'):
        collected = []
        for piece in text.split(separator):
            chunk = piece.strip()
            if not chunk:
                continue
            try:
                value = json.loads(chunk)
            except json.JSONDecodeError:
                continue
            if isinstance(value, dict):
                collected.append(value)
            elif isinstance(value, list):
                collected.extend(value)
        if collected:
            return collected
    raise ValueError()
def _parse_text_to_json(text: str) -> List[Dict[str, Any]]:
    """Wrap ``text`` unchanged in a single record tagged with mode "text".

    Args:
        text: The text to wrap.

    Returns:
        A one-element list holding a dict with keys ``"mode"`` (always
        ``"text"``) and ``"text"`` (the input).
    """
    record = {"mode": "text", "text": text}
    return [record]