Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 69 additions & 68 deletions app/models/bulkrax/mets_xml_entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,97 +2,97 @@

require 'nokogiri'
module Bulkrax
# Generic XML Entry
class MetsXmlEntry < Entry
serialize :raw_metadata, JSON

# No-op: ignores +data+ and returns nil.
def self.fields_from_data(data)
  nil
end
class MetsXmlEntry < XmlEntry
# MetsXmlEntry method overrides
#

# modified from XmlEntry: don't remove namespaces
# @param [String] path
# @return [Nokogiri::XML::Document]
def self.read_data(path)
  # This doesn't cope with BOM sequences:
  # Nokogiri::XML(open(path), nil, 'UTF-8').remove_namespaces!
  #
  # File.open in block form closes the handle as soon as parsing is done, and,
  # unlike Kernel#open, never interprets a path starting with "|" as a command.
  File.open(path) { |file| Nokogiri::XML(file) }
end

# replacement for bulkrax 0.1.0 method
# Returns the first `:from` entry of the mapping found by source_identifier_config.
# @return [Object, nil] nil when there is no flagged mapping or it has no `:from` list
def self.source_identifier_field
  # source_identifier_config is a [field_name, mapping] pair or nil; destructuring
  # nil leaves both locals nil, so the safe navigation below covers every case.
  _field_name, mapping = source_identifier_config
  mapping&.dig(:from)&.first
end

# Returns the key of the mapping found by source_identifier_config.
# @return [Object, nil] nil when no mapping is flagged as the source identifier
def self.source_identifier_export
  source_identifier_config&.first
end

# Finds the first 'Bulkrax::MetsXmlParser' field mapping whose value is a Hash
# with a truthy `:source_identifier` entry.
# @return [Array, nil] the `[field_name, mapping]` pair, or nil when the parser has
#   no mappings or none is flagged
def self.source_identifier_config
  mappings = Bulkrax.field_mappings['Bulkrax::MetsXmlParser'] || {}
  # Non-Hash values (e.g. a bare string) are skipped rather than indexed.
  mappings.find { |_field_name, mapping| mapping.is_a?(Hash) && mapping[:source_identifier] }
end

# modified from XmlEntry for source_id value sourcing
# @param [Nokogiri::XML::Element] data
def self.data_for_entry(data)
# @param [Symbol] source_id
# @param [Bulkrax::MetsXmlParser] _parser
# @return [Hash]
def self.data_for_entry(data, source_id, _parser)
collections = []
children = []

source_identifier = data.attributes[source_identifier_field].text

return {
source_identifier: source_identifier,
source_id => data.attributes.with_indifferent_access[source_id].text,
delete: data.xpath(".//*[name()='delete']").first&.text,
data:
data.document.to_xml(
encoding: 'utf-8',
data.to_xml(
encoding: 'UTF-8',
save_with:
Nokogiri::XML::Node::SaveOptions::DEFAULT_XML
),
Nokogiri::XML::Node::SaveOptions::NO_DECLARATION | Nokogiri::XML::Node::SaveOptions::NO_EMPTY_TAGS
).delete("\n").delete("\t").squeeze(' '), # Remove newlines, tabs, and extra whitespace
collection: collections,
children: children
}
end

# Memoized read of raw_metadata['source_identifier'].
def source_identifier
  return @source_identifier if @source_identifier

  @source_identifier = raw_metadata['source_identifier']
end

# modified from XmlEntry
# uses mets instead of xml for record
# @return [IuMetadata::METSRecord]
def record
@record ||= IuMetadata::METSRecord.new(source_identifier, raw_metadata['data'])
@record ||= IuMetadata::METSRecord.new(source_identifier_value, raw_metadata['data'])
end

def files
@files ||= record.files
# modified from XmlEntry
# sources model from importer form, rather than imported record
# Stores the model name in parsed_metadata['model']: the parser's 'work_type'
# field when set, otherwise 'PagedResource'.
def establish_factory_class
  metadata = parsed_metadata
  metadata['model'] = parser.parser_fields['work_type'] || 'PagedResource'
end

def add_work_type
self.parsed_metadata ||= {}
self.parsed_metadata['work_type'] = [parser.parser_fields['work_type'] || 'PagedResource']
# modified from XmlEntry
# uses IuMetadata::METSRecord for sourcing,
# elements defaults to all available attributes
# @param [Array] elements
# Yields each (name, content) pair of record.attributes whose name is in +elements+.
# Returns the filtered attributes.
def each_candidate_metadata_node_name_and_content(elements: record.attributes.keys)
  candidates = record.attributes.select { |name, _content| name.in?(elements) }
  candidates.each { |name, content| yield(name, content) }
end

def build_metadata
raise StandardError, 'Record not found' if record.nil?
raise StandardError, 'Missing source identifier' if source_identifier.blank?
self.parsed_metadata = {}
self.parsed_metadata['admin_set_id'] = self.importerexporter.admin_set_id
self.parsed_metadata[self.class.source_identifier_export] = [source_identifier]
add_work_type
record.attributes.each do |k,v|
add_metadata(k, v) unless v.blank?
end
add_title
add_visibility
add_rights_statement
# MetsXmlEntry overrides of inherited methods
#

# modified from HasLocalProcessing module
# adds mets-specific metadata handling
def add_local
add_local_files
add_remote_files
add_title
add_logical_structure
add_collections
add_local
raise StandardError, "title is required" if self.parsed_metadata['title'].join.blank?
self.parsed_metadata
add_parents
end

def override_title
%w[true 1].include?(parser.parser_fields['override_title'].to_s)
# modified from ImportBehavior
# gets single collection_id, if any, from importer form
# Sets collection_ids from the parser's 'collection_id' field (wrapped in an
# Array), or to an empty Array when that field is not present, and returns it.
def find_collection_ids
  chosen = parser.parser_fields['collection_id']
  self.collection_ids = chosen.present? ? Array.wrap(chosen) : []
  collection_ids
end

def add_title
self.parsed_metadata['title'] = [parser.parser_fields['title']] if override_title || self.parsed_metadata['title'].blank?
# MetsXmlEntry new methods
#

# Memoized read of raw_metadata, keyed by whatever source_identifier returns.
def source_identifier_value
  return @source_identifier_value if @source_identifier_value

  @source_identifier_value = raw_metadata[source_identifier]
end

# memoize to handle url redirections just once
# @return the result of record.files, computed on the first call and reused afterwards
def files
  return @files if @files

  @files = record.files
end

def add_local_files
Expand All @@ -109,15 +109,16 @@ def add_logical_structure
self.parsed_metadata['structure'] = record.structure
end

# the form only allows selecting an existing collection
def find_or_create_collection_ids
if parser.parser_fields['collection_id'].present?
self.collection_ids = Array.wrap(parser.parser_fields['collection_id'])
else
self.collection_ids = []
end
self.collection_ids
# Copies collection_ids into parsed_metadata['parents'].
def add_parents
self.parsed_metadata['parents'] = collection_ids
end

# Whether the parser's 'override_title' field is switched on.
# @return [Boolean] true when the field's string form is exactly "true" or "1"
def override_title?
  flag = parser.parser_fields['override_title'].to_s
  flag == 'true' || flag == '1'
end

# Replaces parsed_metadata['title'] with the parser's 'title' field (as a
# one-element Array) when override_title? is true or the current title is blank.
def add_title
  return unless override_title? || parsed_metadata['title'].blank?

  parsed_metadata['title'] = [parser.parser_fields['title']]
end
end
end

97 changes: 6 additions & 91 deletions app/parsers/bulkrax/mets_xml_parser.rb
Original file line number Diff line number Diff line change
@@ -1,102 +1,17 @@
# frozen_string_literal: true

# @todo update for bulkrax 5+, where XML import isn't fully supported; currently broken
module Bulkrax
class MetsXmlParser < ApplicationParser
class MetsXmlParser < XmlParser
# modified from XmlParser
# @return [Class] Bulkrax::MetsXmlEntry
def entry_class
Bulkrax::MetsXmlEntry
end

# @todo not yet supported
# Placeholder: returns nil.
def collection_entry_class
  nil
end

# @todo not yet supported
# Placeholder: does nothing and returns nil.
def create_collections
  nil
end

# @todo not yet supported
# Placeholder: returns nil.
def import_fields
  nil
end

# @todo not yet supported
# Placeholder: returns nil.
def file_set_entry_class
  nil
end

# Checks that there is something to import.
# Any StandardError raised while checking (including the two raised here) is
# passed to status_info and reported as an invalid import.
# @return [Boolean] true when both metadata_paths and records are non-blank
def valid_import?
  begin
    raise StandardError, 'No metadata files found' if metadata_paths.blank?
    raise StandardError, 'No records found' if records.blank?
  rescue StandardError => error
    status_info(error)
    return false
  end

  true
end

# For multiple, we expect to find metadata for multiple works in the given metadata file(s)
# For single, we expect to find metadata for a single work in the given metadata file(s)
# if the file contains more than one record, we take only the first
# In either case there may be multiple metadata files returned by metadata_paths
# Builds (and memoizes) the entry data for every metadata file.
# - import_type 'multiple': every `record_element` node in every file.
# - import_type 'single': only the first `record_element` node of each file.
# - any other import_type: nil.
# @param [Hash] _opts unused
# @return [Array, nil] results of entry_class.data_for_entry, nils removed
def records(_opts = {})
  @records ||=
    case parser_fields['import_type']
    when 'multiple'
      # flat_map gathers the per-file results into one list; the trailing
      # flatten is kept because a result may itself be an array.
      metadata_paths.flat_map do |path|
        nodes = entry_class.read_data(path).xpath("//#{record_element}")
        nodes.map { |node| entry_class.data_for_entry(node) }
      end.compact.flatten
    when 'single'
      # No need to flatten because only the first record of each file is taken.
      metadata_paths.map do |path|
        first_node = entry_class.read_data(path).xpath("//#{record_element}").first
        entry_class.data_for_entry(first_node)
      end.compact
    end
end

# Returns the 'record_element' value from parser_fields.
def record_element
parser_fields['record_element']
end

# Return all files in the import directory and sub-directories
# Memoized list of every regular file under the import location, searched recursively.
# @return [Array<String>] paths as returned by Dir.glob, directories excluded
def file_paths
  @file_paths ||= begin
    # When a single file was supplied, search relative to that file;
    # otherwise search in the supplied directory.
    root = file? ? File.dirname(import_file_path) : import_file_path
    Dir.glob("#{root}/**/*").select { |path| File.file?(path) }
  end
end

# If the import_file_path is an xml file, return that
# Otherwise return all xml files in the given folder
# Memoized list of XML metadata files to import.
# When the import path is itself an XML file, that file alone is returned;
# otherwise every entry of file_paths that is XML and whose path contains
# "import_<importer id>".
def metadata_paths
  xml = ->(path) { MIME::Types.type_for(path).include?('application/xml') }

  @metadata_paths ||=
    if file? && xml.call(import_file_path)
      [import_file_path]
    else
      file_paths.select { |path| xml.call(path) && path.include?("import_#{importerexporter.id}") }
    end
end

def create_works
records.each_with_index do |record, index|
next if record[:source_identifier].blank?
break if !limit.nil? && index >= limit

seen[record[:source_identifier]] = true
new_entry = find_or_create_entry(entry_class, record[:source_identifier], 'Bulkrax::Importer', record)
ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
increment_counters(index)
# modified from ApplicationParser
def create_relationships
if parser_fields['collection_id'].present?
ScheduleRelationshipsJob.set(wait: 5.minutes).perform_later(importer_id: importerexporter.id)
end
rescue StandardError => e
status_info(e)
end

# Returns the size of the collection returned by records.
def total
records.size
end
end
end
9 changes: 4 additions & 5 deletions config/initializers/bulkrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

# Add local parsers
config.parsers += [
# @todo drop or fix -- currently broken, retaining at PO request
{ name: 'METS XML (currently broken)', class_name: 'Bulkrax::MetsXmlParser', partial: 'mets_xml_fields' },
{ name: 'METS XML', class_name: 'Bulkrax::MetsXmlParser', partial: 'mets_xml_fields' },
{ name: 'XML (currently broken)', class_name: 'Bulkrax::XmlParser', partial: 'xml_fields' },
]

Expand Down Expand Up @@ -67,10 +66,10 @@
'parents' => { from: ['parents'], related_parents_field_mapping: true },
'children' => { from: ['children'], related_children_field_mapping: true }
},
# Mets import primarily uses METSRecord attributes
"Bulkrax::MetsXmlParser" => {
'source' => { from: ['OBJID'], source_identifier: true, split: false },
"source_identifier" => { from: ["identifier"] },
"work_type" => 'PagedResource'
'source_identifier' => { from: ['OBJID'], source_identifier: true, split: false },
'model' => { from: ['work_type'] }
}
}

Expand Down
2 changes: 1 addition & 1 deletion lib/iu_metadata/mets_record.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def initialize(id, source)
].freeze

def attributes
ATTRIBUTES.map { |att| [att, send(att)] }.to_h.compact
@attributes ||= ATTRIBUTES.map { |att| [att, send(att)] }.to_h.compact.with_indifferent_access
end

def identifier
Expand Down
58 changes: 22 additions & 36 deletions spec/parsers/bulkrax/mets_xml_parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,46 +4,32 @@

module Bulkrax
RSpec.describe MetsXmlParser do
describe '#create_works' do
subject(:xml_parser) { described_class.new(importer) }
let(:importer) { create(:bulkrax_importer_mets_xml) }
let(:entry) { create(:bulkrax_entry, importerexporter: importer) }
let(:importer) { create(:bulkrax_importer_mets_xml) }
subject(:xml_parser) { described_class.new(importer) }

before do
Bulkrax.default_work_type = 'Work'
Bulkrax.field_mappings['Bulkrax::MetsXmlParser'] = {
"source" => { from: ["OBJID"], source_identifier: true },
"source_identifier" => { from: ["identifier"] },
"work_type" => 'PagedResource'
}

allow(Bulkrax::MetsXmlEntry).to receive_message_chain(:where, :first_or_create!).and_return(entry)
allow(entry).to receive(:id)
allow(Bulkrax::ImportWorkJob).to receive(:perform_later)
describe '#entry_class' do
it 'returns Bulkrax::MetsXmlEntry' do
expect(subject.entry_class).to eq Bulkrax::MetsXmlEntry
end
end

context 'with good data' do
before do
importer.parser_fields = {
'import_file_path' => './spec/fixtures/xml/mets.xml',
'record_element' => 'mets:mets'
}
describe '#create_relationships' do
before do
allow(subject).to receive(:parser_fields).and_return(parser_fields.with_indifferent_access)
allow(ScheduleRelationshipsJob).to receive(:set).and_return(double('PerformLater', perform_later: true))
end
context 'with no collection_id' do
let(:parser_fields) { {} }
it 'does not call ScheduleRelationshipsJob' do
expect(ScheduleRelationshipsJob).not_to receive(:set)
subject.create_relationships
end

context 'and import_type set to single' do
before do
importer.parser_fields.merge!('import_type' => 'single')
end

it 'processes the line' do
expect(xml_parser).to receive(:increment_counters).once
xml_parser.create_works
end

it 'counts the correct number of works and collections' do
expect(xml_parser.total).to eq(1)
expect(xml_parser.collections_total).to eq(0)
end
end
context 'with a collection_id' do
let(:parser_fields) { { collection_id: 'cid' } }
it 'calls ScheduleRelationshipJob' do
expect(ScheduleRelationshipsJob).to receive(:set)
subject.create_relationships
end
end
end
Expand Down