Skip to content

Commit cccd7e8

Browse files
committed
Adds bitstream access rights comparison; fixes assumptions about input.
1 parent e30f0bc commit cccd7e8

File tree

5 files changed

+130
-23
lines changed

5 files changed

+130
-23
lines changed

src/compare_csv.py

Lines changed: 106 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,15 @@
1616
import argparse
1717
import csv
1818
import logging
19-
import json
2019
import os
2120
import pathlib
2221
import re
2322
import sys
2423

2524
import pandas
2625

26+
from utils import utilities as utils
27+
2728

2829
def parse_args():
2930
"""
@@ -63,6 +64,26 @@ def string_compare(str1, str2):
6364
return str1 == str2
6465

6566

67+
#
68+
def access_rights_compare(str1, str2):
    """
    Check a Jupiter visibility URI against a DSpace access rights label.

    str1 is looked up in a fixed URI -> label table and the mapped label is
    compared with str2. A str1 that is not a string, or that is not a key of
    the table, never matches.
    """
    logging.debug("%s ---- %s", str1, str2)

    visibility_to_label = {
        "http://terms.library.ualberta.ca/public": "open.access",
        "http://terms.library.ualberta.ca/embargo": "metadata.only",
        "http://terms.library.ualberta.ca/authenticated": "restricted",
    }

    # Reject non-strings before the membership test so unhashable values
    # never reach the dict lookup.
    if not isinstance(str1, str):
        return False
    if str1 not in visibility_to_label:
        return False
    return str2 == visibility_to_label[str1]
85+
86+
6687
# Scholaris removes trailing linebreaks
6788
def string_compare_ignore_whitespace(str1, str2):
6889
"""
@@ -94,16 +115,48 @@ def member_of_list_compare(list1, list2):
94115
return list1 == list2
95116

96117

118+
#
119+
def value_in_string_list_compare(str1, list2):
    """
    Compare a simple string to the contents of a list represented as a string.

    list2 is converted with utils.convert_string_list_representation_to_list.
    Returns True when both str1 and the converted value are empty/falsy, or
    when str1 is a member of the converted list. Returns False when list2
    could not be converted and str1 is not empty.
    """
    logging.debug("%s ---- %s", str1, list2)
    values = utils.convert_string_list_representation_to_list(list2)
    logging.debug("%s ---- %s", str1, values)

    # Nothing on either side counts as a match.
    if not str1 and not values:
        return True

    # The converter returns None when the string cannot be parsed; a
    # membership test against None would raise TypeError, so report a
    # failed comparison instead.
    if values is None:
        return False

    return str1 in values
128+
129+
130+
#
131+
def string_lists_compare(list1, list2):
    """
    Compare the contents of two lists that are each represented as a string.

    Both arguments are converted with
    utils.convert_string_list_representation_to_list and the converted
    values are compared with ==.
    """
    logging.debug("%s ---- %s", list1, list2)
    parsed_first, parsed_second = (
        utils.convert_string_list_representation_to_list(raw)
        for raw in (list1, list2)
    )
    logging.debug("%s ---- %s", parsed_first, parsed_second)
    return parsed_first == parsed_second
140+
141+
97142
#
98143
def collection_parent_compare(list1, list2):
    """
    Compare Jupiter collection paths with a list of collection IDs.

    list1 is a JSON string holding a list of slash-separated paths; the
    second segment of every non-empty path is taken as its collection ID.
    A non-string list1 is treated as an empty list. list2 is a string
    representation of a list and is converted with
    utils.convert_string_list_representation_to_list.

    Returns True when the extracted IDs equal the converted list2, and
    False when list1 cannot be converted.
    """
    logging.debug("%s ---- %s", list1, list2)

    # list 1 is nan if item not in Jupiter
    if not isinstance(list1, str):
        list1 = "[]"

    paths = utils.convert_string_to_json(list1)
    if paths is None:
        # convert_string_to_json can return None (presumably on a parse
        # failure); iterating it would raise TypeError, so treat it as a
        # mismatch.
        return False

    list1_collection_ids = [path.split("/")[1] for path in paths if path]
    return list1_collection_ids == utils.convert_string_list_representation_to_list(
        list2
    )
106-
return list1_collection_ids == json.loads(list2)
107160

108161

109162
#
@@ -128,17 +181,19 @@ def language_compare(list1, list2):
128181
}
129182
logging.debug("member_of_list_compare: %s ---- %s", list1, list2)
130183
conversion_result = list(
131-
easy_language_mapping[language] for language in json.loads(list1) if language
184+
easy_language_mapping[language]
185+
for language in utils.convert_string_list_representation_to_list(list1)
186+
if language
132187
)
133-
return conversion_result == json.loads(list2)
188+
return conversion_result == utils.convert_string_list_representation_to_list(list2)
134189

135190

136191
#
137192
def special_type_compare(row, key, value):
138193
"""
139194
Special type comparison
140195
"""
141-
logging.debug("special_type_compare: [%s] %s ---- %s", key, value, row)
196+
logging.debug("special_type_compare: [%s] %s", key, value)
142197

143198
# Adapted from the original migration
144199
# https://gist.github.com/lagoan/839cf8ce997fa17b529d84776b91cdac
@@ -162,12 +217,26 @@ def special_type_compare(row, key, value):
162217
"http://terms.library.ualberta.ca/learningObject": "http://purl.org/coar/resource_type/c_e059",
163218
}
164219

165-
list1 = [row[value["columns"]["jupiter"][0]]] + json.loads(
166-
row[value["columns"]["jupiter"][1]]
220+
# nan float if jupiter item not found
221+
list1 = (
222+
[row[value["columns"]["jupiter"][0]]]
223+
if isinstance(row[value["columns"]["jupiter"][0]], str)
224+
else []
167225
)
168-
# str1 = " ".join(easy_item_type_mapping[type] for type in list1 if type)
226+
if (
227+
isinstance(row[value["columns"]["jupiter"][1]], str)
228+
and row[value["columns"]["jupiter"][1]]
229+
):
230+
list1 = list1 + utils.convert_string_list_representation_to_list(
231+
row[value["columns"]["jupiter"][1]]
232+
)
233+
234+
logging.debug("special_type_compare: %s", list1)
235+
169236
list1 = list(easy_item_type_mapping[type] for type in list1 if type)
170-
list2 = json.loads(row[value["columns"]["dspace"]])
237+
list2 = utils.convert_string_list_representation_to_list(
238+
row[value["columns"]["dspace"]]
239+
)
171240

172241
logging.debug("special_type_compare: %s ---- %s", list1, list2)
173242

@@ -341,7 +410,7 @@ def special_type_compare(row, key, value):
341410
"description": {
342411
"columns": {
343412
"jupiter": "description",
344-
"dspace": "metadata.dc.description.0.value",
413+
"dspace": "metadata.dc.description",
345414
},
346415
"comparison_function": string_compare_ignore_whitespace,
347416
},
@@ -353,22 +422,22 @@ def special_type_compare(row, key, value):
353422
"comparison_function": collection_parent_compare,
354423
},
355424
"dc.title": {
356-
"columns": {"jupiter": "title", "dspace": "metadata.dc.title.0.value"},
425+
"columns": {"jupiter": "title", "dspace": "metadata.dc.title"},
357426
"comparison_function": string_compare,
358427
},
359428
"dc.contributor.author": {
360429
"columns": {
361430
"jupiter": "creators" "",
362431
"dspace": "metadata.dc.contributor.author",
363432
},
364-
"comparison_function": member_of_list_compare,
433+
"comparison_function": string_lists_compare,
365434
},
366435
"dc.contributor.other": {
367436
"columns": {
368437
"jupiter": "contributors" "",
369438
"dspace": "metadata.dc.contributor.other",
370439
},
371-
"comparison_function": member_of_list_compare,
440+
"comparison_function": string_lists_compare,
372441
},
373442
"dc.type": {
374443
"columns": {
@@ -383,19 +452,19 @@ def special_type_compare(row, key, value):
383452
},
384453
"dc.subject": {
385454
"columns": {"jupiter": "subject", "dspace": "metadata.dc.subject"},
386-
"comparison_function": member_of_list_compare,
455+
"comparison_function": string_lists_compare,
387456
},
388457
"dc.date.issued": {
389458
"columns": {"jupiter": "created", "dspace": "metadata.dc.date.issued"},
390-
"comparison_function": string_compare,
459+
"comparison_function": value_in_string_list_compare,
391460
},
392461
"dc.rights": {
393462
"columns": {"jupiter": "rights", "dspace": "metadata.dc.rights"},
394-
"comparison_function": member_of_list_compare,
463+
"comparison_function": value_in_string_list_compare,
395464
},
396465
"dc.rights.license": {
397466
"columns": {"jupiter": "license", "dspace": "metadata.dc.rights.license"},
398-
"comparison_function": member_of_list_compare,
467+
"comparison_function": value_in_string_list_compare,
399468
},
400469
# "dissertant": {
401470
# "columns": {"jupiter": "", "dspace": "metadata.dissertant"},
@@ -409,6 +478,10 @@ def special_type_compare(row, key, value):
409478
# "columns": {"jupiter": "", "dspace": "metadata.graduation_date"},
410479
# "comparison_function": string_compare,
411480
# },
481+
"access_rights": {
482+
"columns": {"jupiter": "visibility", "dspace": "access_rights"},
483+
"comparison_function": access_rights_compare,
484+
},
412485
},
413486
}
414487

@@ -424,12 +497,22 @@ def process_row(row, columns_to_compare):
424497
dspace_column = f"{value['columns']['dspace']}"
425498
comparison_function = value["comparison_function"]
426499

500+
logging.debug(
501+
"comparison [%s]: jupiter_column [%s] --- dspace_column [%s]",
502+
comparison_function.__name__,
503+
jupiter_column,
504+
dspace_column,
505+
)
506+
427507
if key == "dc.type":
428508
comparison_output[key] = comparison_function(row, key, value)
429509
elif comparison_function(row[jupiter_column], row[dspace_column]):
430510
comparison_output[key] = "PASS"
431511
else:
432512
comparison_output[key] = "FAIL"
513+
514+
logging.debug("key: [%s] status:[%s]", key, comparison_output[key])
515+
433516
return comparison_output
434517

435518

@@ -478,6 +561,9 @@ def process_input(
478561

479562
# Iterate over the rows in the aligned dataframe and compare the columns
480563
for index, row in aligned_df.iterrows():
564+
565+
logging.debug("ID [%s]", index)
566+
481567
comparison_output = {
482568
"index (empty if no ERA obj)": index,
483569
"label": row[comparison_config["label_column"]],
@@ -509,6 +595,7 @@ def process_input(
509595
comparison_output.update(
510596
process_row(row, comparison_config["comparison_types"])
511597
)
598+
logging.debug("output: [%s]", comparison_output)
512599
writer.writerow(comparison_output)
513600

514601

src/tests/assets/dspace_item.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
uuid,name,metadata.dc.description.0.value,metadata.dc.title.0.value,metadata.ual.jupiterId,metadata.ual.jupiterCollection,lastModified,metadata.dc.contributor.author,metadata.dc.contributor.other,metadata.dc.type,metadata.dc.language.iso,metadata.dc.subject,metadata.ual.itemType,metadata.ual.publicationStatus,metadata.dc.date.issued,metadata.dc.rights,metadata.dc.rights.license
2-
1234,Test,Test description,Test,123456789.1,"[""collection_2""]",9999-01-02T03:04:05Z,"[""cre 1"",""cre 2""]","[""con 1"",""con 2""]","[""http://purl.org/coar/resource_type/c_2f33"",""http://purl.org/coar/version/c_970fb48d4fbd8a85""]","[""en""]","[""sub 1""]",http://purl.org/ontology/bibo/Book,"[""http://purl.org/ontology/bibo/status#published""]",9999-01-02,http://creativecommons.org/licenses/by-nc-nd/3.0/,asdf
1+
uuid,name,metadata.dc.description,metadata.dc.title,metadata.ual.jupiterId,metadata.ual.jupiterCollection,lastModified,metadata.dc.contributor.author,metadata.dc.contributor.other,metadata.dc.type,metadata.dc.language.iso,metadata.dc.subject,metadata.ual.itemType,metadata.ual.publicationStatus,metadata.dc.date.issued,metadata.dc.rights,metadata.dc.rights.license,access_rights
2+
1234,Test,Test description,Test,123456789.1,"[""collection_2""]",9999-01-02T03:04:05Z,"[""cre 1"",""cre 2""]","[""con 1"",""con 2""]","[""http://purl.org/coar/resource_type/c_2f33"",""http://purl.org/coar/version/c_970fb48d4fbd8a85""]","[""en""]","[""sub 1""]",http://purl.org/ontology/bibo/Book,"[""http://purl.org/ontology/bibo/status#published""]",['9999-01-02'],['http://creativecommons.org/licenses/by-nc-nd/3.0/'],['asdf'],open.access

src/tests/assets/jupiter_item.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
title,description,id,updated_at,member_of_paths,creators,contributors,languages,subject,created,rights,license,item_type,publication_status
2-
Test,Test description,123456789.1,9999-01-02T03:04:05Z,"[""a/collection_2""]","[""cre 1"",""cre 2""]","[""con 1"",""con 2""]","[""http://id.loc.gov/vocabulary/iso639-2/eng""]","[""sub 1""]",9999-01-02,http://creativecommons.org/licenses/by-nc-nd/3.0/,asdf,http://purl.org/ontology/bibo/Book,"[""http://purl.org/ontology/bibo/status#published""]"
1+
title,description,id,updated_at,member_of_paths,creators,contributors,languages,subject,created,rights,license,item_type,publication_status,visibility
2+
Test,Test description,123456789.1,9999-01-02T03:04:05Z,"[""a/collection_2""]","[""cre 1"",""cre 2""]","[""con 1"",""con 2""]","[""http://id.loc.gov/vocabulary/iso639-2/eng""]","[""sub 1""]",9999-01-02,http://creativecommons.org/licenses/by-nc-nd/3.0/,asdf,http://purl.org/ontology/bibo/Book,"[""http://purl.org/ontology/bibo/status#published""]",http://terms.library.ualberta.ca/public

src/tests/test_integration.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,4 @@ def test_input_process_item_valid(tmp_path):
256256
assert output_df["dc.rights"][0] == "PASS"
257257
assert output_df["dc.rights.license"][0] == "PASS"
258258
assert output_df["dc.type"][0] == "PASS"
259+
assert output_df["access_rights"][0] == "PASS"

src/utils/utilities.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Script utility functions
33
"""
44

5+
import ast
56
import csv
67
import logging
78
import json
@@ -411,6 +412,24 @@ def convert_string_to_json(string):
411412
return None
412413

413414

415+
def convert_string_list_representation_to_list(string):
    """
    Convert a string representation of a list (e.g., "['a','b']") to a list

    Returns the value parsed by ast.literal_eval for a non-empty string, an
    empty list when the input is an empty string or not a string at all, and
    None (after logging an error) when the string is not a valid Python
    literal.
    """
    if not isinstance(string, str) or string == "":
        return []
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError) as e:
        # literal_eval raises SyntaxError (not only ValueError) for text that
        # does not parse at all, e.g. "9999-01-02" or a truncated "[abc".
        logging.error("Error parsing list string: [%s] error %s", string, e)
        return None
431+
432+
414433
def get_provenance_ual_jupiter_community_id(dspace_client, collection):
415434
"""
416435
Get the DC provenance UAL Jupiter ID from the collection

0 commit comments

Comments
 (0)