# uploadBAR.py
###
# GLBT Historical Society
# BAR Digitization Project -- Upload to Internet Archive
# This script creates a temporary zipfile of TIFFs for an issue of the BAR,
# gathers metadata from our project Google Sheet to a dictionary, and uploads
# the package to the Internet Archive.
# by Bill Levay
###
import zipfile, os, datetime, gspread, subprocess, sys
from internetarchive import upload
from oauth2client.service_account import ServiceAccountCredentials
###
# Get issue metadata from Google Sheet
###
def get_metadata(issue):
    """Look up one issue in the project Google Sheet and build its metadata.

    Args:
        issue: Issue date as a 'YYYYMMDD' string. It is also the text that is
            searched for in the 'itemList' worksheet; the first matching cell
            decides which row is read.

    Returns:
        dict: On success, holds the keys 'vol', 'issue_no', 'page_ct',
        'publisher', 'date' (YYYY-MM-DD), 'datetext' (e.g. '4 July 1985'),
        'ia_upload', 'ia_id' and 'ia_title'. If anything goes wrong the error
        is printed and the dict is returned EMPTY, so callers must check for
        the keys they need before indexing.
    """
    issue_meta = {}

    # Google Sheet setup
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        'BAR Digitization-fb1d45aa1d32.json', scope)
    gc = gspread.authorize(credentials)

    print('Getting metadata from Google Sheet...')
    try:
        # Derive the date-based fields first: a malformed issue name fails
        # here, before any request is sent to the spreadsheet.
        parsed = datetime.datetime.strptime(issue, '%Y%m%d')
        date = issue[0:4] + '-' + issue[4:6] + '-' + issue[6:8]
        # lstrip('0') turns '04 July 1985' into '4 July 1985'
        datetext = parsed.strftime('%d %B %Y').lstrip('0')
        ia_id = 'BAR_' + issue

        # Open spreadsheet and worksheet (inside the try so a failed open is
        # reported per issue, the same way update_sheet handles it)
        sh = gc.open_by_key('1tZjpKZfkGsuUD1iEx_blclJiNQBcfiGhkdXPn9voYGo')
        wks = sh.worksheet('itemList')

        # Find the issue's row by searching for the issue date in the Sheet
        cell_list = wks.findall(issue)
        if not cell_list:
            # Clearer than the bare "list index out of range" from [0]
            raise LookupError('no cell in the itemList worksheet matches this issue')
        row = str(cell_list[0].row)

        # Get some values in that row
        vol = wks.acell('C' + row).value
        issue_no = wks.acell('D' + row).value
        page_ct = wks.acell('F' + row).value
        publisher = wks.acell('S' + row).value
        ia_upload = wks.acell('W' + row).value

        ia_title = 'Bay Area Reporter, Volume {}, Number {}, {}'.format(vol, issue_no, datetext)

        # Add issue metadata to the issue_meta dict only once every lookup
        # has succeeded, so the result is either complete or empty.
        issue_meta.update({
            'vol': vol,
            'issue_no': issue_no,
            'page_ct': page_ct,
            'publisher': publisher,
            'date': date,
            'datetext': datetext,
            'ia_upload': ia_upload,
            'ia_id': ia_id,
            'ia_title': ia_title,
        })
    except Exception as e:
        # Best effort: report the problem and hand back the empty dict
        print('Error with metadata for {}: {}'.format(issue, e))

    print(issue_meta)
    return issue_meta
###
# Update Google Sheet after processing
###
def update_sheet(issue):
    """Write the issue's upload status back to the project Google Sheet.

    Searches the 'itemList' worksheet for `issue`, takes the row of the first
    matching cell, and sets column W of that row to issue_meta['ia_upload'].
    Note that `issue_meta` is not a parameter: it is read from module scope,
    so it must have been assigned before this function is called.

    Args:
        issue: Issue date string used to locate the row in the worksheet.

    Returns:
        None. Any failure while opening or updating the sheet is printed and
        otherwise ignored.
    """
    # Google Sheet setup
    feeds_scope = ['https://spreadsheets.google.com/feeds']
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        'BAR Digitization-fb1d45aa1d32.json', feeds_scope)
    client = gspread.authorize(creds)

    try:
        # Open spreadsheet and worksheet
        worksheet = client.open_by_key(
            '1tZjpKZfkGsuUD1iEx_blclJiNQBcfiGhkdXPn9voYGo').worksheet('itemList')
        # First cell whose text matches the issue decides the row
        first_match = worksheet.findall(issue)[0]
        status_cell = 'W{}'.format(first_match.row)
        # Update the upload-status cell in that row
        worksheet.update_acell(status_cell, issue_meta['ia_upload'])
    except Exception as err:
        print('Could not update Google Sheet for {}: {}'.format(issue, err))
###
# Create a list of issues to process
###
def process(path=None, label=None):
    """Build the list of issue folders to process for one year.

    An issue folder is a sub-directory whose name is exactly eight digits
    (YYYYMMDD). Files and other directory names are ignored, because every
    later step treats the name as a date and the entry as a folder of TIFFs.

    Args:
        path: Directory to scan. Defaults to the module-level `source_path`.
        label: Text used in the "cannot find" message. Defaults to the
            module-level `year`.

    Returns:
        list of str: Issue folder names in ascending (chronological) order,
        possibly empty; or None when `path` does not exist.
    """
    if path is None:
        path = source_path

    if not os.path.exists(path):
        if label is None:
            label = year
        print('Sorry, we cannot find any issues from {}'.format(label))
        return None

    # sorted() gives a stable, chronological order; os.listdir order is arbitrary
    process_list = sorted(
        name for name in os.listdir(path)
        if len(name) == 8
        and name.isdigit()
        and os.path.isdir(os.path.join(path, name))
    )
    print(process_list)
    return process_list
###
# Create temporary zipfile of TIFFs
###
def zip(issue, source_dir=None, archive_path=None):
    """Create the temporary zipfile of TIFFs for one issue.

    NOTE: this function's name shadows the built-in zip() for the rest of the
    module; the name is kept because callers use it.

    Args:
        issue: Issue identifier, used only in the progress message.
        source_dir: Folder holding the issue's page images. Defaults to the
            module-level `issue_path`.
        archive_path: Where to write the zipfile. Defaults to the module-level
            `zip_path`.

    Returns:
        int: Number of image files successfully written to the archive, so a
        caller can compare it with the page count from the spreadsheet.
    """
    if source_dir is None:
        source_dir = issue_path
    if archive_path is None:
        archive_path = zip_path

    print('creating archive')
    written = 0
    # allowZip64 enables the ZIP64 extensions so a large set of TIFFs is not
    # limited by the classic ZIP size limits. The with-block closes the file.
    with zipfile.ZipFile(archive_path, mode='w', allowZip64=True) as zf:
        # Sorted so pages are added in a predictable order
        for name in sorted(os.listdir(source_dir)):
            # Match on the real extension, case-insensitively: a substring
            # test would accept 'x.tif.bak' and skip 'PAGE01.TIF'.
            if not name.lower().endswith(('.tif', '.tiff')):
                continue
            file_path = os.path.join(source_dir, name)
            try:
                print('writing {}'.format(name))
                # No arcname is given, so the entry keeps its source path
                zf.write(file_path)
                written += 1
            except Exception as e:
                # Best effort: report the file and the cause, keep going
                print('An error occurred with {}: {}'.format(name, e))

    print('Created zipfile for {}'.format(issue))
    # TO DO: confirm zipfile has same number of files as pages in spreadsheet
    return written
###
# Upload to IA
###
def upload(issue):
    """Upload the issue's zipfile to the Internet Archive with the `ia` CLI.

    Reads the module-level `issue_meta` (keys 'ia_id', 'ia_title', 'date',
    'publisher') and `zip_path`. On success it sets
    issue_meta['ia_upload'] = 'TRUE'. The temporary zipfile is removed
    afterwards whether or not the upload succeeded.

    NOTE: this function's name shadows the `upload` imported from the
    `internetarchive` package at the top of the file; the name is kept
    because callers use it.

    Args:
        issue: Issue identifier (not used by the body; kept for the caller's
            call signature).

    Returns:
        bool: True when the `ia` command exited successfully, else False.
    """
    metadata = [
        'title:{}'.format(issue_meta['ia_title']),
        'date:{}'.format(issue_meta['date']),
        'publisher:{}'.format(issue_meta['publisher']),
        'rights:Copyright BAR Media, Inc.',
        'contributor:GLBT Historical Society',
        'coverage:San Francisco (Calif.)',
        'mediatype:texts',
        'collection:bayareareporter',
        'language:English',
    ]

    # Pass the command as an argument list rather than one formatted string:
    # values come from the spreadsheet, and a quote inside a title would
    # otherwise break (or alter) the command line.
    command = ['ia', '--config-file', 'config\\ia.ini', 'upload',
               issue_meta['ia_id'], zip_path]
    for entry in metadata:
        command.extend(['-m', entry])

    uploaded = False
    try:
        print('Uploading...')
        r = subprocess.check_output(command, stderr=subprocess.STDOUT)
        print(r)
        issue_meta['ia_upload'] = 'TRUE'
        uploaded = True
    except subprocess.CalledProcessError as e:
        # The exception text only has the exit status; what `ia` actually
        # printed is carried in e.output.
        print(e)
        print(e.output)
    except Exception as e:
        print(e)

    # TO DO: confirm upload before deleting zipfile
    # delete zip
    try:
        os.remove(zip_path)
        print('Removed {}'.format(zip_path))
    except Exception as e:
        print(e)

    return uploaded
###
# Yes or No?
###
def query_yes_no(question, default="yes"):
    """Ask a yes/no question on the console and return the answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning
        an answer is required of the user).

    The answer is read case-insensitively and surrounding whitespace is
    ignored. The question is repeated until a valid answer is given.

    The return value is True for "yes" or False for "no".
    Raises ValueError if "default" is not "yes", "no" or None.
    """
    valid = {"yes": True, "y": True, "ye": True,
             "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    # raw_input only exists on Python 2; Python 3 renamed it to input
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        sys.stdout.write(question + prompt)
        # Make sure the prompt is visible before blocking on input
        sys.stdout.flush()
        choice = read_line().strip().lower()
        if default is not None and choice == '':
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' "
                             "(or 'y' or 'n').\n")
###
# Start processing
###
if __name__ == "__main__":
    # NOTE: year, source_path, issue_path, issue_meta and zip_path are
    # deliberately assigned at module level here: the functions above read
    # them as globals, so this block must not be moved into a function.

    # raw_input only exists on Python 2; Python 3 renamed it to input
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # strip() so a stray space does not end up inside the folder path
    year = read_line('Enter the year you want to upload to Internet Archive: ').strip()
    source_path = 'G:\\Dropbox (GLBTHS)\\Archive\\BAR\\{}\\'.format(year)
    process_list = process()

    # Covers both "folder not found" (None) and "no issues in it" (empty list)
    if process_list:
        if query_yes_no('Upload ' + str(len(process_list)) + ' issues from ' + year + '?', None):
            for issue in process_list:
                issue_path = source_path + issue
                issue_meta = get_metadata(issue)

                # get_metadata returns an empty dict when the lookup failed;
                # skip that issue instead of aborting the batch on a KeyError
                if 'ia_upload' not in issue_meta:
                    print('{} has no usable metadata. Moving to next issue.'.format(issue))
                    continue

                # check to make sure we didn't already upload this one
                if issue_meta['ia_upload'] == '':
                    zip_path = 'G:\\{}_images.zip'.format(issue_meta['ia_id'])
                    zip(issue)
                    upload(issue)
                    update_sheet(issue)
                    print('Finished with {} - moving on to next issue.'.format(issue))
                else:
                    print('{} was already uploaded to IA. Moving to next issue.'.format(issue))
        else:
            print('No issues uploaded.')
    else:
        print('No issues to upload.')
    print('ALL DONE')