Skip to content

Commit 89a9151

Browse files
authored
Merge pull request #7 from jefferya/two_jupiter_to_one_dspace
Item and Thesis audit: creators versus dissertant into dc.contributor.author
2 parents ebf5f31 + 8b135a1 commit 89a9151

File tree

7 files changed

+173
-36
lines changed

7 files changed

+173
-36
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ Process thoughts:
351351
* In ERA, changing an object to "read only" creates a new version change event: the read_only_event column indicates whether a change event only updates the read-only status, which allows one to filter/order these events if they are too numerous
352352
* Sort by item_id & date ascending: this allows grouping a sequence of updates over time; if the same field changed multiple times then use the most recent.
353353
* Event "destroy" means the object has been deleted and there will be no Scholaris mapping
354+
* Event "destroy" will create a long change record -- "top align" text in cells to see the text
354355

355356
### Status
356357

src/compare_csv.py

Lines changed: 167 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,128 @@ def language_compare(list1, list2):
234234
return conversion_result == utils.convert_string_list_representation_to_list(list2)
235235

236236

237+
#
238+
def item_or_thesis_jupiter_strings_to_single_dspace(row, key, value):
    """
    Compare two plain-string Jupiter columns against one DSpace column.

    The two Jupiter columns named in value["columns"]["jupiter"] are read
    as-is from the row; the DSpace column named in value["columns"]["dspace"]
    is parsed with utils.convert_string_list_representation_to_list.

    Returns "PASS" when all three values are empty, or when either Jupiter
    value is a member of the parsed DSpace value; otherwise "FAIL".
    The key argument is only used for debug logging.
    """
    logging.debug(": [%s] %s", key, value)

    jupiter_names = value["columns"]["jupiter"]
    first_value = row[jupiter_names[0]]
    second_value = row[jupiter_names[1]]
    dspace_name = value["columns"]["dspace"]
    dspace_values = utils.convert_string_list_representation_to_list(row[dspace_name])

    logging.debug(
        ": %s[%s] %s[%s] %s[%s]",
        jupiter_names[0],
        first_value,
        jupiter_names[1],
        second_value,
        dspace_name,
        dspace_values,
    )

    # Nothing on either side of the migration: trivially consistent.
    if not (first_value or second_value or dspace_values):
        return "PASS"

    # Either source field showing up in the DSpace value counts as a match.
    if first_value in dspace_values or second_value in dspace_values:
        return "PASS"

    return "FAIL"
267+
268+
269+
#
270+
def item_or_thesis_jupiter_list_and_string_to_single_dspace(row, key, value):
    """
    Compare a list-valued and a string-valued Jupiter column against one DSpace column.

    Jupiter items and theses keep the same concept in different fields that
    migrate into a single Scholaris/DSpace field. The first column named in
    value["columns"]["jupiter"] and the DSpace column are parsed with
    utils.convert_string_list_representation_to_list; the second Jupiter
    column is used as a plain string.

    Args:
        row: mapping of column name to cell value for one record.
        key: name of the comparison; used for debug logging only.
        value: comparison config with value["columns"]["jupiter"] (two column
            names: list column, then string column) and
            value["columns"]["dspace"] (one column name).

    Returns:
        "PASS" when all three values are empty, or when the DSpace value is
        non-empty and either equals the Jupiter list or contains the Jupiter
        string; otherwise "FAIL".
    """
    logging.debug(": [%s] %s", key, value)

    columns = value["columns"]
    jupiter_list_col = columns["jupiter"][0]
    jupiter_str_col = columns["jupiter"][1]
    dspace_col = columns["dspace"]

    # Raw cell values, before any list parsing.
    logging.debug(
        ": %s[%s] %s[%s] %s[%s]",
        jupiter_list_col,
        row[jupiter_list_col],
        jupiter_str_col,
        row[jupiter_str_col],
        dspace_col,
        row[dspace_col],
    )

    list_1 = utils.convert_string_list_representation_to_list(row[jupiter_list_col])
    str_1 = row[jupiter_str_col]
    list_dspace = utils.convert_string_list_representation_to_list(row[dspace_col])

    # Parsed values actually used by the comparison.
    logging.debug(
        ": %s[%s] %s[%s] %s[%s]",
        jupiter_list_col,
        list_1,
        jupiter_str_col,
        str_1,
        dspace_col,
        list_dspace,
    )

    if not list_1 and not str_1 and not list_dspace:
        return "PASS"

    # Require a non-empty DSpace value before matching. Otherwise an empty
    # DSpace field would "equal" the Jupiter list that is simply unused for
    # this record type, hiding a value that was lost from the string column.
    if list_dspace and (list_1 == list_dspace or str_1 in list_dspace):
        return "PASS"

    return "FAIL"
311+
312+
313+
#
314+
def item_or_thesis_jupiter_lists_to_single_dspace(row, key, value):
    """
    Compare two list-valued Jupiter columns against one DSpace column.

    Jupiter items and theses keep the same concept in different fields that
    migrate into a single Scholaris/DSpace field. Both columns named in
    value["columns"]["jupiter"] and the DSpace column are parsed with
    utils.convert_string_list_representation_to_list.

    Args:
        row: mapping of column name to cell value for one record.
        key: name of the comparison; used for debug logging only.
        value: comparison config with value["columns"]["jupiter"] (two column
            names) and value["columns"]["dspace"] (one column name).

    Returns:
        "PASS" when all three values are empty, or when the DSpace value is
        non-empty and equals one of the two Jupiter lists; otherwise "FAIL".
    """
    logging.debug(": [%s] %s", key, value)

    columns = value["columns"]
    jupiter_col_1 = columns["jupiter"][0]
    jupiter_col_2 = columns["jupiter"][1]
    dspace_col = columns["dspace"]

    # Raw cell values, before any list parsing.
    logging.debug(
        ": %s[%s] %s[%s] %s[%s]",
        jupiter_col_1,
        row[jupiter_col_1],
        jupiter_col_2,
        row[jupiter_col_2],
        dspace_col,
        row[dspace_col],
    )

    list_1 = utils.convert_string_list_representation_to_list(row[jupiter_col_1])
    list_2 = utils.convert_string_list_representation_to_list(row[jupiter_col_2])
    list_dspace = utils.convert_string_list_representation_to_list(row[dspace_col])

    # Parsed values actually used by the comparison.
    logging.debug(
        ": %s[%s] %s[%s] %s[%s]",
        jupiter_col_1,
        list_1,
        jupiter_col_2,
        list_2,
        dspace_col,
        list_dspace,
    )

    if not list_1 and not list_2 and not list_dspace:
        return "PASS"

    # Require a non-empty DSpace value before matching. Otherwise an empty
    # DSpace field would "equal" whichever Jupiter list is unused for this
    # record type, even though the other Jupiter list holds data.
    if list_dspace and list_dspace in (list_1, list_2):
        return "PASS"

    return "FAIL"
357+
358+
237359
#
238360
def special_type_compare(row, key, value):
239361
"""
@@ -286,7 +408,12 @@ def special_type_compare(row, key, value):
286408

287409
logging.debug("special_type_compare: %s ---- %s", list1, list2)
288410

289-
return "PASS" if list1 == list2 else "FAIL"
411+
ret = "FAIL"
412+
if list1 == list2:
413+
ret = "PASS"
414+
elif not list1 and list2 == ["http://purl.org/coar/resource_type/c_46ec"]:
415+
ret = "STATIC VALUE ADDED (thesis?)"
416+
return ret
290417

291418

292419
# Define the columns to compare and how to compare them
@@ -311,18 +438,18 @@ def special_type_compare(row, key, value):
311438
"jupiter": "description",
312439
"dspace": "metadata.dc.description",
313440
},
314-
"comparison_function": string_compare_ignore_whitespace,
441+
"comparison_function": string_in_list_compare_ignore_whitespace,
315442
},
316443
"abstract": {
317444
"columns": {
318445
"jupiter": "description",
319446
"dspace": "metadata.dc.description.abstract",
320447
},
321-
"comparison_function": string_compare_ignore_whitespace,
448+
"comparison_function": string_in_list_compare_ignore_whitespace,
322449
},
323450
"dc.title": {
324451
"columns": {"jupiter": "title", "dspace": "metadata.dc.title"},
325-
"comparison_function": string_compare,
452+
"comparison_function": value_in_string_list_compare,
326453
},
327454
},
328455
}
@@ -352,18 +479,18 @@ def special_type_compare(row, key, value):
352479
"jupiter": "description",
353480
"dspace": "metadata.dc.description",
354481
},
355-
"comparison_function": string_compare_ignore_whitespace,
482+
"comparison_function": string_in_list_compare_ignore_whitespace,
356483
},
357484
"abstract": {
358485
"columns": {
359486
"jupiter": "description",
360487
"dspace": "metadata.dc.description.abstract",
361488
},
362-
"comparison_function": string_compare_ignore_whitespace,
489+
"comparison_function": string_in_list_compare_ignore_whitespace,
363490
},
364491
"dc.title": {
365492
"columns": {"jupiter": "title", "dspace": "metadata.dc.title"},
366-
"comparison_function": string_compare,
493+
"comparison_function": value_in_string_list_compare,
367494
},
368495
"collection_parent_expect_to_fail_due_to_lack_of_community_provenance": {
369496
"columns": {
@@ -471,19 +598,19 @@ def special_type_compare(row, key, value):
471598
"columns": {"jupiter": "title", "dspace": "metadata.dc.title"},
472599
"comparison_function": value_in_string_list_compare,
473600
},
474-
"dc.contributor": {
601+
"dc.contributor.other": {
475602
"columns": {
476-
"jupiter": "contributors",
603+
"jupiter": ["contributors", "committee_members"],
477604
"dspace": "metadata.dc.contributor.other",
478605
},
479-
"comparison_function": string_lists_compare,
606+
"comparison_function": item_or_thesis_jupiter_lists_to_single_dspace,
480607
},
481-
"dc.creator": {
608+
"dc.contributor.author": {
482609
"columns": {
483-
"jupiter": "creators",
610+
"jupiter": ["creators", "dissertant"],
484611
"dspace": "metadata.dc.contributor.author",
485612
},
486-
"comparison_function": string_lists_compare,
613+
"comparison_function": item_or_thesis_jupiter_list_and_string_to_single_dspace,
487614
},
488615
"dc.type": {
489616
"columns": {
@@ -501,8 +628,11 @@ def special_type_compare(row, key, value):
501628
"comparison_function": string_lists_compare,
502629
},
503630
"dc.date.issued": {
504-
"columns": {"jupiter": "created", "dspace": "metadata.dc.date.issued"},
505-
"comparison_function": value_in_string_list_compare,
631+
"columns": {
632+
"jupiter": ["created", "graduation_date"],
633+
"dspace": "metadata.dc.date.issued",
634+
},
635+
"comparison_function": item_or_thesis_jupiter_strings_to_single_dspace,
506636
},
507637
"dc.rights": {
508638
"columns": {"jupiter": "rights", "dspace": "metadata.dc.rights"},
@@ -527,27 +657,27 @@ def special_type_compare(row, key, value):
527657
# "columns": {"jupiter": "", "dspace": "metadata.thesis.degree.discipline"},
528658
# "comparison_function": value_in_string_list_compare,
529659
# },
530-
"if_thesis_dissertant": {
531-
"columns": {
532-
"jupiter": "dissertant",
533-
"dspace": "metadata.dc.contributor.author",
534-
},
535-
"comparison_function": value_in_string_list_compare,
536-
},
660+
# "if_thesis_dissertant": {
661+
# "columns": {
662+
# "jupiter": "dissertant",
663+
# "dspace": "metadata.dc.contributor.author",
664+
# },
665+
# "comparison_function": value_in_string_list_compare,
666+
# },
537667
"if_thesis_supervisor": {
538668
"columns": {
539669
"jupiter": "supervisors",
540670
"dspace": "metadata.dc.contributor.advisor",
541671
},
542672
"comparison_function": string_lists_compare,
543673
},
544-
"if_thesis_committee_members": {
545-
"columns": {
546-
"jupiter": "committee_members",
547-
"dspace": "metadata.dc.contributor.other",
548-
},
549-
"comparison_function": string_lists_compare,
550-
},
674+
# "if_thesis_committee_members": {
675+
# "columns": {
676+
# "jupiter": "committee_members",
677+
# "dspace": "metadata.dc.contributor.other",
678+
# },
679+
# "comparison_function": string_lists_compare,
680+
# },
551681
"if_thesis_degree.grantor": {
552682
"columns": {
553683
"jupiter": "institution",
@@ -575,7 +705,7 @@ def special_type_compare(row, key, value):
575705
},
576706
"if_thesis_ual.department": {
577707
"columns": {"jupiter": "departments", "dspace": "metadata.ual.department"},
578-
"comparison_function": value_in_string_list_compare,
708+
"comparison_function": string_lists_compare,
579709
},
580710
},
581711
}
@@ -599,7 +729,13 @@ def process_row(row, columns_to_compare):
599729
dspace_column,
600730
)
601731

602-
if key == "dc.type":
732+
if comparison_function.__name__ in [
733+
"special_type_compare",
734+
"item_or_thesis_jupiter_strings_to_single_dspace",
735+
"item_or_thesis_jupiter_list_and_string_to_single_dspace",
736+
"item_or_thesis_jupiter_lists_to_single_dspace",
737+
]:
738+
# special comparison function
603739
comparison_output[key] = comparison_function(row, key, value)
604740
elif comparison_function(row[jupiter_column], row[dspace_column]):
605741
comparison_output[key] = "PASS"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
uuid,name,metadata.dc.description,metadata.dc.description.abstract,metadata.dc.title,provenance.ual.jupiterId.collection,provenance.ual.jupiterId.community,lastModified
2-
1234,Test,Test description,Test description,Test,123456789.1,community_2,9999-01-02T03:04:05Z
2+
1234,Test,"['Test description']","['Test description']","['Test']",123456789.1,community_2,9999-01-02T03:04:05Z
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
uuid,name,metadata.dc.description,metadata.dc.description.abstract,metadata.dc.title,provenance.ual.jupiterId.collection,provenance.ual.jupiterId.community,languages.iso,metadata.dc.identifier.0.value,lastModified
2-
1234,Test,Test description 2,Test description 2,Test 2,123456789.1,community_2,9999-01-02T00:00:00Z,a,b
2+
1234,Test,"['Test description 2']","['Test description 2']","['Test 2']",123456789.1,community_2,9999-01-02T00:00:00Z,a,b

src/tests/assets/dspace_community.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
uuid,name,metadata.dc.description,metadata.dc.description.abstract,metadata.dc.title,lastModified
2-
1234,Test,Test description,Test description,Test,9999-01-02T03:04:05Z
2+
1234,Test,"['Test description']","['Test description']","['Test']",9999-01-02T03:04:05Z
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
uuid,name,metadata.dc.description,metadata.dc.description.abstract,metadata.dc.title,lastModified
2-
i1234,Test,Test description 2,Test description 2,Test 2,9999-01-02T03:04:05Z
2+
i1234,Test,"['Test description 2']","['Test description 2']","['Test 2']",9999-01-02T03:04:05Z

src/tests/test_integration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ def test_input_process_item_valid(tmp_path):
335335
assert output_df["description"][0] == "PASS"
336336
assert output_df["collection_parent"][0] == "PASS"
337337
assert output_df["dc.title"][0] == "PASS"
338-
assert output_df["dc.contributor"][0] == "PASS"
338+
assert output_df["dc.contributor.author"][0] == "PASS"
339339
assert output_df["dc.language"][0] == "PASS"
340340
assert output_df["dc.subject"][0] == "PASS"
341341
assert output_df["dc.date.issued"][0] == "PASS"

0 commit comments

Comments
 (0)