-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiafc.py
executable file
·95 lines (75 loc) · 2.5 KB
/
iafc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
import argparse
import os
import sys
import requests
import time
from internetarchive import get_item
from pyfc4.models import *
from bs4 import BeautifulSoup
from werkzeug.urls import url_fix
# The Fedora repository root comes from the FEDORA environment variable;
# without it there is nothing to connect to, so exit with status 1.
fedora_root = os.environ.get("FEDORA")
if fedora_root is None:
    print("Please set the environment variable FEDORA")
    sys.exit(1)
# Command line: a single positional argument, the Internet Archive item id.
parser = argparse.ArgumentParser()
parser.add_argument("item", help="IA item id")
args = parser.parse_args()
item_name = args.item

# Look the item up through internetarchive.get_item. An item whose metadata
# is empty/falsy cannot be crosswalked, so stop here with status 1.
item = get_item(item_name)
if item.metadata:
    pass
else:
    print("ERROR: item with no metadata")
    sys.exit(1)
# Open the Fedora repository rooted at FEDORA. The two None positional
# arguments are passed through unchanged (presumably username/password in
# pyfc4's Repository signature -- confirm). The extra context entry registers
# the PCDM namespace prefix.
repo = Repository(
    fedora_root,
    None,
    None,
    context={'pcdm': 'http://pcdm.org/models#'},
)

# Create a container named after the IA item. When creation fails (presumably
# because the container already exists -- the exact exception type is not
# visible from this script), fall back to loading the existing resource.
try:
    local_item = BasicContainer(repo, item_name)
    local_item.create(specify_uri=True)
except Exception:
    # `except Exception` instead of a bare `except` so that Ctrl-C and
    # sys.exit() are not mistaken for "container already exists".
    local_item = repo.get_resource(item_name)

# NOTE(review): get_resource appears to return a non-resource sentinel when
# the URI is not found -- confirm against pyfc4. Guard against it so the
# failure is reported clearly instead of as an AttributeError on `.uri`.
if local_item is None or local_item is False:
    print(f'ERROR: could not create or retrieve container {item_name}')
    sys.exit(1)

print(f'+ Container:\t {local_item.uri}')
# Crosswalk from Internet Archive metadata keys to Dublin Core terms.
# Each value is "<prefix>.<term>" and is resolved as an attribute path on
# local_item.rdf.prefixes (e.g. 'dc.title' -> local_item.rdf.prefixes.dc.title).
metadata_crosswalk = {
    'title': 'dc.title',
    'creator': 'dc.creator',
    'uploader': 'dc.contributor',
    'identifier-access': 'dc.identifier',
    'identifier-ark': 'dc.identifier',
    'language': 'dc.language',
    'date': 'dc.date',
    'subject': 'dc.subject',
    'mediatype': 'dc.type',
    'collection': 'dc.relation',
}

for ia_key, dc_path in metadata_crosswalk.items():
    # Fields the IA item does not carry are skipped silently.
    if ia_key not in item.metadata:
        continue

    # IA metadata values are either a single value or a list of values;
    # normalise to a list so both cases share one code path.
    raw_value = item.metadata[ia_key]
    values = raw_value if isinstance(raw_value, list) else [raw_value]

    try:
        # Plain attribute lookup replaces the former eval() of a built string:
        # same result, no code evaluation.
        prefix, term = dc_path.split('.', 1)
        predicate = getattr(getattr(local_item.rdf.prefixes, prefix), term)
        for value in values:
            local_item.add_triple(predicate, value)
    except Exception as err:
        # Still best effort per field (one bad field must not abort the
        # import), but report the problem instead of hiding it.
        print(f'! Metadata:\t could not map {ia_key} -> {dc_path}: {err}')

# NOTE: typing the container as a pcdm:Object was left disabled in the
# original script:
# local_item.add_triple(local_item.rdf.prefixes.rdf.type,
#                       local_item.rdf.prefixes.pcdm.Object)
# The IA description may contain HTML, so it is parsed and flattened to plain
# text (text fragments joined with newlines) before being added as
# dc:description.
if 'description' in item.metadata:
    try:
        description = BeautifulSoup(str(item.metadata['description']), "lxml")
        description_text = description.get_text('\n')
        local_item.add_triple(local_item.rdf.prefixes.dc.description,
                              description_text)
    except Exception as err:
        # Best effort: a description that cannot be parsed or stored must not
        # abort the import, but the reason (e.g. a missing lxml parser) is
        # reported rather than swallowed.
        print(f'! Description:\t skipped ({err})')

# Push the triples added above to the repository (presumably what pyfc4's
# Resource.update does -- confirm).
local_item.update()
# Register each file of the IA item as a Binary under "<item>/files/". The
# Binary is created with a Content-Location header holding the IA URL and no
# payload is uploaded by this script (presumably Fedora then treats it as
# external content -- confirm).
for item_file in item.files:
    file_name = item_file["name"]
    url = f'http://{item.d1}{item.dir}/{file_name}'

    # HEAD only: the media type is needed, not the payload.
    # - requests.head() does not follow redirects by default, which would
    #   yield the content type of the redirect response instead of the file.
    # - A timeout keeps one unresponsive server from hanging the import.
    head_response = requests.head(url, allow_redirects=True, timeout=30)
    # Fall back to the generic binary type so None never ends up in a header.
    content_type = (head_response.headers.get('content-type')
                    or 'application/octet-stream')
    print(f'- Binary:\t {url} ({content_type})')

    # Spaces in the file name are replaced with underscores in the local URI.
    local_file = Binary(
        repo, f'{item_name}/files/{file_name.replace(" ", "_")}')
    local_file.headers['Content-Location'] = url_fix(url)
    local_file.headers['Content-Type'] = content_type
    local_file.create(specify_uri=True)

    # Pause between files (presumably to throttle requests -- confirm).
    time.sleep(5)