Skip to content

Commit e30f0bc

Browse files
committed
Adds script to output a subset of Jupiter content based on a list of Jupiter IDs extracted from DSpace items.
1 parent 531ae24 commit e30f0bc

File tree

2 files changed

+159
-1
lines changed

2 files changed

+159
-1
lines changed

jupiter_output_scripts/jupiter_collection_metadata_to_CSV.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Ruby irb script to output the metadata of a collection in CSV format
1+
# Ruby irb script to output a set of Jupiter metadata in CSV format
22
# Usage: irb -r ./jupiter_collection_metadata_to_CSV.rb
33

44
class JupiterBasicMetadataToCSV
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# Ruby irb script to output the metadata of a subset of Jupiter content in CSV format
2+
# Usage: irb -r ./jupiter_collection_metadata_to_CSV.rb
3+
4+
# Jupiter Item IDs to include in the "subset" exports defined in this script.
#
# Two IDs are deliberately left out because they were not found in prd:
#   3424cccc-3bfb-4ba1-992e-37c5adfd52bf
#   c3bcf748-5676-4b58-bdbe-168590b0bbbc
ITEM_SUBSET = %w[
  0e790a8e-a263-4a99-9a77-418d91b700c0
  204c4969-4b26-4737-9482-39981fca2ded
  285a73a4-fa9d-41f8-9df3-c92956a338cd
  297d92b8-f255-49ff-9835-b52c7c1a0b6a
  2c5219bd-01e4-4b1b-8c9f-2c3bddf12c0e
  2e7f1207-79dd-4620-9a01-127f66c81def
  2f3407d5-193a-4a9e-b2cb-633ffd7c8a3f
  369dad5d-f221-4973-92aa-e9ce673cd58f
  3d6b31c2-686c-4c62-a598-eda1cf159125
  5a0aad85-bffa-4686-bae3-d589a64361dc
  69c8b80b-2251-4ed3-9fd4-5f433c30e521
  8d2ccf90-1cec-474b-9d83-b6d7bc02704c
  90fb3051-2a0f-4217-89a4-fc0ce2bfd672
  930915e9-aad9-46e6-9b98-72c379789869
  a20df6be-4fc3-4a19-aeea-6609474bec3f
  b464a3dc-510d-4625-a009-75dddaac1c91
  be7d39d9-9561-4485-a6f2-0f69f8084f83
  c8d55e58-b23f-421c-a75a-7c26c67a1c48
  c97c1a89-c2aa-49ae-9c26-d7560d8dfdc0
  c9f40f8c-56d1-4acd-80d6-dca64bd3e068
  cc9de8a2-ec54-46e2-b1a3-f3e34e7c762c
  d0973c15-8cff-431c-94e5-c21738e09594
  d18f9dd3-afda-4108-b32c-fff2ccbf93cf
  d3a154a1-cdb5-4a27-8074-c598448ea5df
  d8ab19e4-1f58-404d-a7f1-e7ac071d0a74
  db0339cf-5faf-4f28-9a25-5360bad5e8d9
  dd5e240c-533e-482c-9141-fcbab6362d50
  e22bb1bc-cdc5-4cc7-b907-8cc723599ec2
  f3678283-1807-4016-90e4-07f5dde4efd3
].freeze
37+
38+
#ITEM_SUBSET = [
39+
#"047630f2-beba-4895-b7da-78c1a0219a92"
40+
#].freeze
41+
42+
require 'csv'

# Base exporter: writes one CSV row per record of a model class.
#
# Subclasses are expected to set, in their initializer:
#   @instance    - the model class to export (must respond to `new` and,
#                  unless #enumerable is overridden, `find_each`)
#   @output_file - path of the CSV file to create
# and may override #enumerable to change which records are exported.
class JupiterBasicMetadataToCSV
  def initialize
    @output_file = ""
    @instance = nil
  end

  # Yields every record to export, one at a time, by delegating to the
  # model's `find_each`.
  def enumerable
    @instance.find_each do |record|
      yield record
    end
  end

  # Writes the CSV file: a header row followed by one row per record
  # yielded by #enumerable.
  #
  # Raises RuntimeError ("Instance not set") when no model class has been
  # assigned to @instance.
  def run
    raise "Instance not set" unless @instance

    # Column names come from the decorated attributes of a blank record, so
    # the header is available even before any row is read.
    # NOTE(review): `decorate` is presumably a Draper-style decorator - confirm.
    headers = @instance.new.decorate.attributes.keys
    CSV.open(@output_file, 'wb', write_headers: true, headers: headers) do |csv|
      enumerable do |record|
        csv << record.decorate.attributes.values
      end
    end
  end
end
62+
63+
# Jupiter Item metadata
64+
# Exports every Item record to a timestamped "jupiter_item_*.csv" file.
class JupiterItemMetadataToCSV < JupiterBasicMetadataToCSV
  # output_directory - prefix prepended verbatim to the file name, so it
  #                    must already end with a path separator.
  def initialize(output_directory)
    super()
    @instance = Item
    @date_time = Time.now.strftime("%Y-%m-%d_%H-%M-%S")
    file_name = "jupiter_item_#{@date_time}.csv"
    @output_file = output_directory + file_name
  end
end
72+
73+
# Subset of Jupiter Item metadata
74+
# Exports only the Item records whose IDs are listed in ITEM_SUBSET to a
# timestamped "jupiter_item_subset_*.csv" file.
class JupiterItemMetadataToCSVSubset < JupiterBasicMetadataToCSV
  # output_directory - prefix prepended verbatim to the file name, so it
  #                    must already end with a path separator.
  def initialize(output_directory)
    super()
    @instance = Item
    @date_time = Time.now.strftime("%Y-%m-%d_%H-%M-%S")
    file_name = "jupiter_item_subset_#{@date_time}.csv"
    @output_file = output_directory + file_name
  end

  # Looks up each ID of ITEM_SUBSET with `find` and yields the record, in
  # list order.
  def enumerable
    ITEM_SUBSET.each { |item_id| yield @instance.find(item_id) }
  end
end
87+
88+
89+
# Jupiter Active Storage Blob and Item metadata
90+
require 'csv'

# Exports one CSV row per file attached to each Item, combining item fields
# with the attributes of the file's blob. Output goes to a timestamped
# "jupiter_activestorage_*.csv" file.
class JupiterActiveStorageBlobMetadataToCSV
  # output_directory - prefix prepended verbatim to the file name, so it
  #                    must already end with a path separator.
  def initialize(output_directory)
    @date_time = Time.now.strftime("%Y-%m-%d_%H-%M-%S")
    @output_file = output_directory + "jupiter_activestorage_#{@date_time}.csv"
    @instance = Item
  end

  # Yields every item to export, one at a time, by delegating to the
  # model's `find_each`.
  def enumerable
    @instance.find_each do |item|
      yield item
    end
  end

  # Writes the CSV file: a header row, then one row per entry of each
  # item's `ordered_files`.
  def run
    # "provenance.ual.jupiterId.item" & "bitstream.sequenceId" labels need to align
    # with the DSpace CSV for use with pandas dataframe multi-index join
    # in the comparison script
    headers = ["item.id",
               "item.title",
               "provenance.ual.jupiterId.item",
               "bitstream.sequenceId",
               "key",
               "filename",
               "content_type",
               "metadata",
               "byte_size",
               "checksum",
               "created_at"]
    CSV.open(@output_file, 'wb', write_headers: true, headers: headers) do |csv|
      enumerable do |item|
        item.ordered_files.each_with_index do |file, index|
          blob = file.blob
          csv << [item.id,
                  item.title,
                  item.id,   # repeated under the DSpace-aligned join label
                  index + 1, # 1-based position within ordered_files
                  blob.key,
                  blob.filename,
                  blob.content_type,
                  blob.metadata,
                  blob.byte_size,
                  blob.checksum,
                  blob.created_at]
        end
      end
    end
  end
end
137+
138+
# Subset of Jupiter Active Storage Blob and Item metadata
139+
# Same per-file export as the parent class, restricted to the Items whose
# IDs are listed in ITEM_SUBSET; writes a timestamped
# "jupiter_activestorage_subset_*.csv" file.
class JupiterActiveStorageBlobMetadataToCSVSubset < JupiterActiveStorageBlobMetadataToCSV
  # output_directory - prefix prepended verbatim to the file name, so it
  #                    must already end with a path separator.
  # The parent initializer is intentionally not invoked; all three instance
  # variables are assigned here.
  def initialize(output_directory)
    @instance = Item
    @date_time = Time.now.strftime("%Y-%m-%d_%H-%M-%S")
    file_name = "jupiter_activestorage_subset_#{@date_time}.csv"
    @output_file = output_directory + file_name
  end

  # Looks up each ID of ITEM_SUBSET with `find` and yields the record, in
  # list order.
  def enumerable
    ITEM_SUBSET.each { |item_id| yield @instance.find(item_id) }
  end
end
151+
152+
#JupiterCommunityMetadataToCSV.new("/era_tmp/delete_me_by_2025-04-15/").run
153+
#JupiterCollectionMetadataToCSV.new("/era_tmp/delete_me_by_2025-04-15/").run
154+
#JupiterItemMetadataToCSV.new("/era_tmp/delete_me_by_2025-04-15/").run
155+
#JupiterActiveStorageBlobMetadataToCSV.new("/era_tmp/delete_me_by_2025-04-15/").run
156+
157+
# Run both subset exports, item metadata first, writing into /tmp/.
[JupiterItemMetadataToCSVSubset, JupiterActiveStorageBlobMetadataToCSVSubset].each do |exporter_class|
  exporter_class.new("/tmp/").run
end

0 commit comments

Comments
 (0)