-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathImportProcessor.py
143 lines (127 loc) · 6.32 KB
/
ImportProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
import csv
import time
import FoxmlWorker as FW
import ImportServerUtilities as IS
import ImportUtilities as IU
"""
This class is used to prepare data for ingest into the UPEI ingestion platform.
"""
class ImportProcessor:
    """Prepares exported Fedora/Islandora data as Workbench CSV worksheets
    for ingest into the UPEI ingestion platform.

    Relies on two project helpers supplied at construction time:
    ``ImportUtilities`` (metadata extraction / lookups) and
    ``ImportServerUtilities`` (server-side operations).
    """

    def __init__(self, namespace):
        """Initialize stores, lookup tables, and helper utilities.

        :param namespace: Fedora PID namespace (e.g. ``'ivoices'``) that
            scopes every lookup performed by this processor.
        """
        # Fedora on-disk stores (paths as deployed on the source server).
        self.objectStore = '/usr/local/fedora/data/objectStore'
        self.datastreamStore = '/usr/local/fedora/data/datastreamStore'
        # Datastream IDs to migrate, keyed by Islandora content model.
        self.stream_map = {
            'islandora:sp_pdf': ['OBJ', 'PDF'],
            'islandora:sp_large_image_cmodel': ['OBJ'],
            'islandora:sp_basic_image': ['OBJ'],
            'ir:citationCModel': ['FULL_TEXT'],
            'ir:thesisCModel': ['OBJ', 'PDF', 'FULL_TEXT'],
            'islandora:sp_videoCModel': ['OBJ', 'PDF'],
            'islandora:newspaperIssueCModel': ['OBJ', 'PDF'],
            'islandora:sp-audioCModel': ['OBJ'],
        }
        self.iu = IU.ImportUtilities(namespace)
        self.ms = IS.ImportServerUtilities(namespace)
        self.namespace = namespace
        self.export_dir = '/opt/islandora/upei_migration/export'
        # MIME type -> file extension used when naming exported datastreams.
        self.mimemap = {"image/jpeg": ".jpg",
                        "image/jp2": ".jp2",
                        "image/png": ".png",
                        "image/tiff": ".tif",
                        "text/xml": ".xml",
                        "text/plain": ".txt",
                        "application/pdf": ".pdf",
                        "application/xml": ".xml",
                        "audio/x-wav": ".wav",
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
                        "application/octet-stream": ".bib",
                        "audio/mpeg": ".mp3",
                        "video/mp4": ".mp4",
                        "video/x-m4v": ".m4v",
                        "audio/vnd.wave": '.wav'
                        }
        # Column order for the main Workbench worksheets.
        self.fieldnames = ['id', 'title', 'parent_id', 'field_member_of', 'field_edtf_date_issued', 'field_abstract',
                           'field_genre', 'field_subject', 'field_geographic_subject', 'field_physical_description',
                           'field_extent', 'field_resource_type', 'field_linked_agent', 'field_pid',
                           'field_related_item', 'field_edtf_date_other', 'field_edtf_copyright_date', 'field_issuance',
                           'field_location', 'field_publisher', 'field_edition', 'field_access_condition',
                           'field_model', 'field_edtf_date_created', 'file', 'field_subtitle', 'field_identifier',
                           'field_alternative_title']
        self.start = time.time()

    def prepare_collection_worksheet(self, output_file):
        """Write a Workbench CSV describing the collection hierarchy.

        Rows are emitted parent-before-child (a row is written only after
        its ``field_member_of`` target has been written) so Workbench can
        create parents first.

        :param output_file: path of the CSV file to create.
        """
        collection_pids = self.iu.get_collection_pids(self.namespace)
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
            writer.writeheader()
            rows = []
            for entry in collection_pids:
                pid = entry.get('field_pid')
                # Skip collections that belong to another namespace.
                if self.namespace not in entry.get('field_member_of'):
                    continue
                mods = self.iu.extract_from_mods(pid)
                row = {'id': pid}
                all_fields = entry | mods
                for key, value in all_fields.items():
                    # Multi-valued fields are pipe-delimited for Workbench.
                    if isinstance(value, list):
                        value = '|'.join(value)
                    row[key] = value
                rows.append(row)
            # Topological write. The original version removed items from
            # `rows` while iterating it (skipping elements) and looped
            # forever when a row's parent never appeared; this rewrite
            # makes a clean pass per round and bails out on orphans.
            processed = {'islandora:root'}
            while rows:
                remaining = []
                progressed = False
                for row in rows:
                    if row.get('field_member_of') in processed:
                        writer.writerow(row)
                        processed.add(row.get('id'))
                        progressed = True
                    else:
                        remaining.append(row)
                if not progressed:
                    # Orphans whose parents never appear: report and stop
                    # instead of spinning forever.
                    print(f"Warning: {len(remaining)} collection rows have no known parent; skipped.")
                    break
                rows = remaining

    def prepare_collection_member_worksheet(self, collections, output_file):
        """Write an ingest worksheet for the members of the given collections.

        :param collections: collection identifiers passed through to
            ``ImportUtilities.get_collection_member_details``.
        :param output_file: path of the CSV file to create.
        """
        details = self.iu.get_collection_member_details(self.namespace, collections)
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
            writer.writeheader()
            for detail in details:
                mods = self.iu.extract_from_mods(detail['field_pid'])
                # `detail` values win over MODS-derived values on key clash.
                row = mods | detail
                writer.writerow(row)

    def prepare_intial_ingest_worksheet(self, output_file):
        """Write the initial (id/title/pid/model) Workbench worksheet.

        Skips entries with no PID or no resolvable DC title, printing a
        warning for the latter. (Method name keeps its historical spelling
        for caller compatibility.)

        :param output_file: path of the CSV file to create.
        """
        details = self.iu.get_worksheet_details()
        if not details:  # Check if details is None or empty
            print("No worksheet details found.")
            return
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['id', 'title', 'field_pid', 'field_model'])
            writer.writeheader()
            row_id = 1  # sequential worksheet id; renamed to avoid shadowing builtin `id`
            for detail in details:
                if not detail or not detail.get('field_pid'):  # safer checks for detail and field_pid
                    continue
                # Fetch DC values and ensure the record is usable.
                dc = self.iu.get_dc_values(detail['field_pid'])
                if not dc or 'title' not in dc:
                    print(f"Warning: Missing DC values for PID {detail['field_pid']}")
                    continue
                writer.writerow({
                    'id': row_id,
                    'title': dc['title'],
                    'field_pid': detail['field_pid'],
                    'field_model': detail.get('field_model', 'Unknown')  # fallback for safety
                })
                row_id += 1

    def prepare_relationship_worksheet(self, output_file):
        """Write a node_id -> field_member_of worksheet for re-parenting.

        Fixes a bug in the original, which called ``writeheader()`` a
        second time after the data rows, appending a spurious header line
        to the CSV.

        :param output_file: path of the CSV file to create.
        """
        relationships = self.iu.get_relationships()
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['node_id', 'field_member_of'])
            writer.writeheader()
            for relationship in relationships:
                writer.writerow({
                    'node_id': relationship['node_id'],
                    'field_member_of': relationship['member_of'],
                })
if __name__ == '__main__':
    # Guarded entry point: the original ran on import, instantiating the
    # processor (which touches the migration database) and writing a CSV
    # as a side effect of merely importing this module.
    MP = ImportProcessor('ivoices')
    MP.prepare_relationship_worksheet('ivoices_relationships.csv')