
Commit 26fc8e1

Merge pull request #532 from dimitri-yatsenko/attachments
Attachments and configurable blobs
2 parents 2d21899 + afeadb1

21 files changed: +549 -185 lines

datajoint/__init__.py

Lines changed: 11 additions & 14 deletions
@@ -14,30 +14,27 @@
 http://dx.doi.org/10.1101/031658
 """
 
-__author__ = "Dimitri Yatsenko, Edgar Y. Walker, and Fabian Sinz at Baylor College of Medicine"
-__date__ = "Nov 15, 2018"
+__author__ = "DataJoint Contributors"
+__date__ = "February 7, 2019"
 __all__ = ['__author__', '__version__',
-           'config', 'conn', 'kill', 'Table',
-           'Connection', 'Heading', 'FreeTable', 'Not', 'schema',
+           'config', 'conn', 'Connection',
+           'schema', 'create_virtual_module', 'get_schema_names',
+           'Table', 'FreeTable',
            'Manual', 'Lookup', 'Imported', 'Computed', 'Part',
-           'AndList', 'ERD', 'U', 'key',
-           'DataJointError', 'DuplicateError',
-           'set_password', 'create_virtual_module']
+           'Not', 'AndList', 'U', 'ERD',
+           'set_password', 'kill',
+           'DataJointError', 'DuplicateError', 'key']
 
 
-# ------------- flatten import hierarchy -------------------------
 from .version import __version__
 from .settings import config
 from .connection import conn, Connection
-from .table import FreeTable, Table
+from .schema import Schema as schema
+from .schema import create_virtual_module, get_schema_names
+from .table import Table, FreeTable
 from .user_tables import Manual, Lookup, Imported, Computed, Part
 from .expression import Not, AndList, U
-from .heading import Heading
-from .schema import Schema as schema
-from .schema import create_virtual_module
 from .erd import ERD
 from .admin import set_password, kill
 from .errors import DataJointError, DuplicateError
 from .fetch import key
-
-
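
Note: with the import hierarchy flattened as above, every name in __all__ is reachable directly from the package root. A minimal usage sketch (the host and the schema name 'test' are made-up examples, not part of this commit):

    import datajoint as dj

    dj.config['database.host'] = 'localhost'          # settings object, re-exported at the root
    schema = dj.schema('test')                        # dj.schema is the Schema class
    vmod = dj.create_virtual_module('vmod', 'test')   # exported alongside the new get_schema_names
    print(dj.get_schema_names())                      # names of schemas visible to this connection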

datajoint/attach.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+"""
+functionality for attaching files
+"""
+from os import path
+from itertools import count, chain
+
+
+def load(local_path):
+    """ make an attachment from a local file """
+    with open(local_path, mode='rb') as f:  # b is important -> binary
+        contents = f.read()
+    return str.encode(path.basename(local_path)) + b'\0' + contents
+
+
+def save(buffer, save_path='.'):
+    """ save attachment from memory buffer into the save_path """
+    p = buffer.find(b'\0')
+    file_path = path.abspath(path.join(save_path, buffer[:p].decode()))
+
+    if path.isfile(file_path):
+        # generate a new filename
+        file, ext = path.splitext(file_path)
+        file_path = next(f for f in ('%s_%04x%s' % (file, n, ext) for n in count())
+                         if not path.isfile(f))
+
+    with open(file_path, mode='wb') as f:
+        f.write(buffer[p+1:])
+    return file_path
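
Note: the attachment format introduced here is simply the file's base name, a null byte, and the raw file contents; save() appends a hex counter to the name when the target already exists. A quick round-trip sketch (paths are made up):

    from datajoint import attach

    buffer = attach.load('/tmp/report.pdf')      # b'report.pdf\x00<file bytes>'
    restored = attach.save(buffer, '/tmp/out')   # writes /tmp/out/report.pdf, or report_0000.pdf on collision
    assert restored.startswith('/tmp/out')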

datajoint/declare.py

Lines changed: 31 additions & 24 deletions
@@ -197,7 +197,6 @@ def declare(full_table_name, definition, context):
     :param definition: DataJoint table definition
     :param context: dictionary of objects that might be referred to in the table.
     """
-
     table_name = full_table_name.strip('`').split('.')[1]
     if len(table_name) > MAX_TABLE_NAME_LENGTH:
         raise DataJointError(
@@ -271,12 +270,13 @@ def compile_attribute(line, in_key, foreign_key_sql):
         match['default'] = ''
     match = {k: v.strip() for k, v in match.items()}
     match['nullable'] = match['default'].lower() == 'null'
-    accepted_datatype = r'time|date|year|enum|(var)?char|float|real|double|decimal|numeric|' \
-                        r'(tiny|small|medium|big)?int|bool|' \
-                        r'(tiny|small|medium|long)?blob|external|attach'
+    blob_datatype = r'(tiny|small|medium|long)?blob'
+    accepted_datatype = (
+        r'time|date|year|enum|(var)?char|float|real|double|decimal|numeric|'
+        r'(tiny|small|medium|big)?int|bool|external|attach|' + blob_datatype)
     if re.match(accepted_datatype, match['type'], re.I) is None:
         raise DataJointError('DataJoint does not support datatype "{type}"'.format(**match))
-
+    is_blob = bool(re.match(blob_datatype, match['type'], re.I))
     literals = ['CURRENT_TIMESTAMP']  # not to be enclosed in quotes
     if match['nullable']:
         if in_key:
@@ -285,38 +285,45 @@ def compile_attribute(line, in_key, foreign_key_sql):
     else:
         if match['default']:
             quote = match['default'].upper() not in literals and match['default'][0] not in '"\''
-            match['default'] = ('NOT NULL DEFAULT ' +
-                                ('"%s"' if quote else "%s") % match['default'])
+            match['default'] = 'NOT NULL DEFAULT ' + ('"%s"' if quote else "%s") % match['default']
         else:
             match['default'] = 'NOT NULL'
     match['comment'] = match['comment'].replace('"', '\\"')  # escape double quotes in comment
-
-    is_external = match['type'].startswith('external')
-    is_attachment = match['type'].startswith('attachment')
-    if not is_external:
-        sql = ('`{name}` {type} {default}' + (' COMMENT "{comment}"' if match['comment'] else '')).format(**match)
-    else:
-        # process externally stored attribute
+    is_configurable = match['type'].startswith(('external', 'blob-', 'attach'))
+    is_external = False
+    if is_configurable:
         if in_key:
-            raise DataJointError('External attributes cannot be primary in:\n%s' % line)
+            raise DataJointError('Configurable attributes cannot be primary in:\n%s' % line)
+        match['comment'] = ':{type}:{comment}'.format(**match)  # insert configurable type into comment
         store_name = match['type'].split('-')
-        if store_name[0] != 'external':
-            raise DataJointError('External store types must be specified as "external" or "external-<name>"')
+        if store_name[0] not in ('external', 'blob', 'attach'):
+            raise DataJointError('Configurable types must be in the form blob-<store> or attach-<store> in:\n%s' % line)
         store_name = '-'.join(store_name[1:])
-        if store_name != '' and not store_name.isidentifier():
+        if store_name and not store_name.isidentifier():
             raise DataJointError(
                 'The external store name `{type}` is invalid. Make like a python identifier.'.format(**match))
         if len(store_name) > STORE_NAME_LENGTH:
             raise DataJointError(
                 'The external store name `{type}` is too long. Must be <={max_len} characters.'.format(
                     max_len=STORE_NAME_LENGTH, **match))
-        if not match['default'] in ('DEFAULT NULL', 'NOT NULL'):
-            raise DataJointError('The only acceptable default value for an external field is null in:\n%s' % line)
-        if match['type'] not in config:
-            raise DataJointError('The external store `{type}` is not configured.'.format(**match))
+        spec = config.get_store_spec(store_name)
+        is_external = spec['protocol'] in {'s3', 'file'}
+        if not is_external:
+            is_blob = re.match(blob_datatype, spec['protocol'], re.I)
+            if not is_blob:
+                raise DataJointError('Invalid protocol {protocol} in external store in:\n{line}'.format(
+                    line=line, **spec))
+            match['type'] = spec['protocol']
+
+    if (is_external or is_blob) and match['default'] not in ('DEFAULT NULL', 'NOT NULL'):
+        raise DataJointError(
+            'The default value for a blob or attachment can only be NULL in:\n%s' % line)
 
-        # append external configuration name to the end of the comment
-        sql = '`{name}` {hash_type} {default} COMMENT ":{type}:{comment}"'.format(
+    if not is_external:
+        sql = ('`{name}` {type} {default}' + (' COMMENT "{comment}"' if match['comment'] else '')).format(**match)
+    else:
+        # add hash field with a dependency on the ~external table
+        sql = '`{name}` {hash_type} {default} COMMENT "{comment}"'.format(
             hash_type=HASH_DATA_TYPE, **match)
         foreign_key_sql.append(
             "FOREIGN KEY (`{name}`) REFERENCES {{external_table}} (`hash`) "

datajoint/erd.py

Lines changed: 2 additions & 3 deletions
@@ -50,8 +50,8 @@ class ERD:
         """
 
         def __init__(self, *args, **kwargs):
-            warnings.warn('ERD functionality depends on matplotlib and pygraphviz. Please install both of these '
-                          'libraries to enable the ERD feature.')
+            warnings.warn('ERD functionality depends on matplotlib and pygraphviz. '
+                          'Please install both of these libraries to enable the ERD feature.')
 else:
     class ERD(nx.DiGraph):
         """
@@ -228,7 +228,6 @@ def _make_graph(self):
         return graph
 
     def make_dot(self):
-        import networkx as nx
 
         graph = self._make_graph()
         graph.nodes()

datajoint/external.py

Lines changed: 50 additions & 40 deletions
@@ -1,15 +1,21 @@
 import os
-from tqdm import tqdm
+import itertools
 from .settings import config
 from .errors import DataJointError
 from .hash import long_hash
-from .blob import pack, unpack
 from .table import Table
 from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE
-from .s3 import Folder as S3Folder
+from . import s3
 from .utils import safe_write
 
 
+def subfold(name, folds):
+    """
+    subfolding for external storage: e.g. subfold('abcdefg', (2, 3)) --> ['ab','cde']
+    """
+    return (name[:folds[0]].lower(),) + subfold(name[folds[0]:], folds[1:]) if folds else ()
+
+
 class ExternalTable(Table):
     """
     The table tracking externally stored objects.
@@ -42,15 +48,15 @@ def definition(self):
     def table_name(self):
         return '~external'
 
-    def put(self, store, obj):
+    def put(self, store, blob):
         """
         put an object in external store
         """
-        spec = self._get_store_spec(store)
-        blob = pack(obj)
-        blob_hash = long_hash(blob) + store[len('external-'):]
+        store = ''.join(store.split('-')[1:])
+        spec = config.get_store_spec(store)
+        blob_hash = long_hash(blob) + store
         if spec['protocol'] == 'file':
-            folder = os.path.join(spec['location'], self.database)
+            folder = os.path.join(spec['location'], self.database, *subfold(blob_hash, spec['subfolding']))
             full_path = os.path.join(folder, blob_hash)
             if not os.path.isfile(full_path):
                 try:
@@ -59,9 +65,10 @@ def put(self, store, obj):
                     os.makedirs(folder)
                 safe_write(full_path, blob)
         elif spec['protocol'] == 's3':
-            S3Folder(database=self.database, **spec).put(blob_hash, blob)
+            folder = '/'.join(subfold(blob_hash, spec['subfolding']))
+            s3.Folder(database=self.database, **spec).put('/'.join((folder, blob_hash)), blob)
         else:
-            raise DataJointError('Unknown external storage protocol {protocol} for {store}'.format(
+            raise DataJointError('Unknown external storage protocol {protocol} in store "-{store}"'.format(
                 store=store, protocol=spec['protocol']))
 
         # insert tracking info
@@ -80,31 +87,33 @@ def get(self, blob_hash):
         """
         if blob_hash is None:
             return None
-        store = blob_hash[STORE_HASH_LENGTH:]
-        store = 'external' + ('-' if store else '') + store
-
-        cache_folder = config.get('cache', None)
 
+        # attempt to get object from cache
         blob = None
+        cache_folder = config.get('cache', None)
         if cache_folder:
             try:
                 with open(os.path.join(cache_folder, blob_hash), 'rb') as f:
                     blob = f.read()
             except FileNotFoundError:
                 pass
 
+        # attempt to get object from store
         if blob is None:
-            spec = self._get_store_spec(store)
+            store = blob_hash[STORE_HASH_LENGTH:]
+            spec = config.get_store_spec(store)
             if spec['protocol'] == 'file':
-                full_path = os.path.join(spec['location'], self.database, blob_hash)
+                subfolders = os.path.join(*subfold(blob_hash, spec['subfolding']))
+                full_path = os.path.join(spec['location'], self.database, subfolders, blob_hash)
                 try:
                     with open(full_path, 'rb') as f:
                         blob = f.read()
                 except FileNotFoundError:
                     raise DataJointError('Lost access to external blob %s.' % full_path) from None
             elif spec['protocol'] == 's3':
                 try:
-                    blob = S3Folder(database=self.database, **spec).get(blob_hash)
+                    subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
+                    blob = s3.Folder(database=self.database, **spec).get('/'.join((subfolder, blob_hash)))
                 except TypeError:
                     raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
             else:
@@ -115,7 +124,7 @@ def get(self, blob_hash):
                 os.makedirs(cache_folder)
             safe_write(os.path.join(cache_folder, blob_hash), blob)
 
-        return unpack(blob)
+        return blob
 
     @property
     def references(self):
@@ -156,34 +165,35 @@ def delete_garbage(self):
             for ref in self.references) or "TRUE")
         print('Deleted %d items' % self.connection.query("SELECT ROW_COUNT()").fetchone()[0])
 
-    def clean_store(self, store, display_progress=True):
+    def clean_store(self, store, verbose=True):
         """
         Clean unused data in an external storage repository from unused blobs.
         This must be performed after delete_garbage during low-usage periods to reduce risks of data loss.
         """
-        spec = self._get_store_spec(store)
-        progress = tqdm if display_progress else lambda x: x
+        spec = config.get_store_spec(store)
+        in_use = set(x for x in (self & '`hash` LIKE "%%{store}"'.format(store=store)).fetch('hash'))
         if spec['protocol'] == 'file':
-            folder = os.path.join(spec['location'], self.database)
-            delete_list = set(os.listdir(folder)).difference(self.fetch('hash'))
-            print('Deleting %d unused items from %s' % (len(delete_list), folder), flush=True)
-            for f in progress(delete_list):
-                os.remove(os.path.join(folder, f))
+            count = itertools.count()
+            print('Deleting...')
+            deleted_folders = set()
+            for folder, dirs, files in os.walk(os.path.join(spec['location'], self.database), topdown=False):
+                if dirs and files:
+                    raise DataJointError('Invalid repository with files in non-terminal folder %s' % folder)
+                dirs = set(d for d in dirs if os.path.join(folder, d) not in deleted_folders)
+                if not dirs:
+                    files_not_in_use = [f for f in files if f not in in_use]
+                    for f in files_not_in_use:
+                        filename = os.path.join(folder, f)
+                        next(count)
+                        if verbose:
+                            print(filename)
+                        os.remove(filename)
+                    if len(files_not_in_use) == len(files):
+                        os.rmdir(folder)
+                        deleted_folders.add(folder)
+            print('Deleted %d objects' % next(count))
         elif spec['protocol'] == 's3':
             try:
-                S3Folder(database=self.database, **spec).clean(self.fetch('hash'))
+                failed_deletes = s3.Folder(database=self.database, **spec).clean(in_use, verbose=verbose)
             except TypeError:
                 raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
-
-    @staticmethod
-    def _get_store_spec(store):
-        try:
-            spec = config[store]
-        except KeyError:
-            raise DataJointError('Storage {store} is requested but not configured'.format(store=store)) from None
-        if 'protocol' not in spec:
-            raise DataJointError('Storage {store} config is missing the protocol field'.format(store=store))
-        if spec['protocol'] not in {'file', 's3'}:
-            raise DataJointError(
-                'Unknown external storage protocol "{protocol}" in "{store}"'.format(store=store, **spec))
-        return spec
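
Note: subfold() splits a hash into nested subfolders so that no single store directory accumulates too many objects; an empty 'subfolding' spec disables nesting. Its behavior follows directly from the one-liner above:

    from datajoint.external import subfold

    subfold('abcdefg', (2, 3))   # -> ('ab', 'cde')
    subfold('ABCDEF', (2, 2))    # -> ('ab', 'cd')   (names are lower-cased)
    subfold('abcdef', ())        # -> ()             (no subfolding)

With subfolding (2, 2), a file-protocol blob whose hash starts with 'f9d2' is written under <location>/<database>/f9/d2/ instead of one flat folder, and clean_store() walks and prunes these subfolders bottom-up.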
