@@ -1,15 +1,21 @@
 import os
-from tqdm import tqdm
+import itertools
 from .settings import config
 from .errors import DataJointError
 from .hash import long_hash
-from .blob import pack, unpack
 from .table import Table
 from .declare import STORE_HASH_LENGTH, HASH_DATA_TYPE
-from .s3 import Folder as S3Folder
+from . import s3
 from .utils import safe_write


+def subfold(name, folds):
+    """
+    subfolding for external storage: e.g. subfold('abcdefg', (2, 3)) --> ['ab','cde']
+    """
+    return (name[:folds[0]].lower(),) + subfold(name[folds[0]:], folds[1:]) if folds else ()
+
+
 class ExternalTable(Table):
     """
     The table tracking externally stored objects.
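Note: the new subfold() helper splits the leading characters of a name into nested folder names, so blobs fan out across many small directories instead of piling up in one. A minimal sketch of how it composes with os.path.join (the hash and root path here are made up for illustration):

    import os

    parts = subfold('1a2b3c4d', (2, 2))            # ('1a', '2b')
    path = os.path.join('/data/external', *parts)  # '/data/external/1a/2b'

An empty folds tuple returns (), so stores configured without subfolding keep a flat layout.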
@@ -42,15 +48,15 @@ def definition(self):
     def table_name(self):
         return '~external'

-    def put(self, store, obj):
+    def put(self, store, blob):
         """
         put an object in external store
         """
-        spec = self._get_store_spec(store)
-        blob = pack(obj)
-        blob_hash = long_hash(blob) + store[len('external-'):]
+        store = ''.join(store.split('-')[1:])
+        spec = config.get_store_spec(store)
+        blob_hash = long_hash(blob) + store
         if spec['protocol'] == 'file':
-            folder = os.path.join(spec['location'], self.database)
+            folder = os.path.join(spec['location'], self.database, *subfold(blob_hash, spec['subfolding']))
             full_path = os.path.join(folder, blob_hash)
             if not os.path.isfile(full_path):
                 try:
@@ -59,9 +65,10 @@ def put(self, store, obj):
                     os.makedirs(folder)
                 safe_write(full_path, blob)
         elif spec['protocol'] == 's3':
-            S3Folder(database=self.database, **spec).put(blob_hash, blob)
+            folder = '/'.join(subfold(blob_hash, spec['subfolding']))
+            s3.Folder(database=self.database, **spec).put('/'.join((folder, blob_hash)), blob)
         else:
-            raise DataJointError('Unknown external storage protocol {protocol} for {store}'.format(
+            raise DataJointError('Unknown external storage protocol {protocol} in store "-{store}"'.format(
                 store=store, protocol=spec['protocol']))

         # insert tracking info
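Worth spelling out the naming convention put() now relies on: the store argument arrives as the full config key (e.g. 'external' or 'external-raw'), and the part after the first dash becomes both the config lookup key and the tail appended to the blob hash. A rough trace with a hypothetical store name and truncated hash:

    store = ''.join('external-raw'.split('-')[1:])  # 'raw'
    blob_hash = long_hash(blob) + store             # e.g. '1a2b...raw'
    # with spec['subfolding'] == (2, 2), the file protocol writes to:
    #   <location>/<database>/1a/2b/1a2b...raw

For the plain 'external' store the suffix is empty, so the bare hash is used; get() later recovers the store from blob_hash[STORE_HASH_LENGTH:].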
@@ -80,31 +87,33 @@ def get(self, blob_hash):
         """
         if blob_hash is None:
             return None
-        store = blob_hash[STORE_HASH_LENGTH:]
-        store = 'external' + ('-' if store else '') + store
-
-        cache_folder = config.get('cache', None)

+        # attempt to get object from cache
         blob = None
+        cache_folder = config.get('cache', None)
         if cache_folder:
             try:
                 with open(os.path.join(cache_folder, blob_hash), 'rb') as f:
                     blob = f.read()
             except FileNotFoundError:
                 pass

+        # attempt to get object from store
         if blob is None:
-            spec = self._get_store_spec(store)
+            store = blob_hash[STORE_HASH_LENGTH:]
+            spec = config.get_store_spec(store)
             if spec['protocol'] == 'file':
-                full_path = os.path.join(spec['location'], self.database, blob_hash)
+                subfolders = os.path.join(*subfold(blob_hash, spec['subfolding']))
+                full_path = os.path.join(spec['location'], self.database, subfolders, blob_hash)
                 try:
                     with open(full_path, 'rb') as f:
                         blob = f.read()
                 except FileNotFoundError:
                     raise DataJointError('Lost access to external blob %s.' % full_path) from None
             elif spec['protocol'] == 's3':
                 try:
-                    blob = S3Folder(database=self.database, **spec).get(blob_hash)
+                    subfolder = '/'.join(subfold(blob_hash, spec['subfolding']))
+                    blob = s3.Folder(database=self.database, **spec).get('/'.join((subfolder, blob_hash)))
                 except TypeError:
                     raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
             else:
@@ -115,7 +124,7 @@ def get(self, blob_hash):
                 os.makedirs(cache_folder)
             safe_write(os.path.join(cache_folder, blob_hash), blob)

-        return unpack(blob)
+        return blob

     @property
     def references(self):
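The reworked get() now checks an optional local cache before touching the store. A minimal sketch of enabling it, assuming the standard dj.config settings object (the cache path is hypothetical):

    import datajoint as dj

    dj.config['cache'] = '/tmp/dj-cache'  # any fast local directory
    # get() first tries /tmp/dj-cache/<blob_hash>; on a miss it reads from
    # the store and writes the blob back into the cache for next time.

Since the cache is keyed by the full blob_hash, which embeds the store suffix, one cache folder can safely serve multiple stores.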
@@ -156,34 +165,35 @@ def delete_garbage(self):
                 for ref in self.references) or "TRUE")
         print('Deleted %d items' % self.connection.query("SELECT ROW_COUNT()").fetchone()[0])

-    def clean_store(self, store, display_progress=True):
+    def clean_store(self, store, verbose=True):
         """
         Clean unused data in an external storage repository from unused blobs.
         This must be performed after delete_garbage during low-usage periods to reduce risks of data loss.
         """
-        spec = self._get_store_spec(store)
-        progress = tqdm if display_progress else lambda x: x
+        spec = config.get_store_spec(store)
+        in_use = set(x for x in (self & '`hash` LIKE "%%{store}"'.format(store=store)).fetch('hash'))
         if spec['protocol'] == 'file':
-            folder = os.path.join(spec['location'], self.database)
-            delete_list = set(os.listdir(folder)).difference(self.fetch('hash'))
-            print('Deleting %d unused items from %s' % (len(delete_list), folder), flush=True)
-            for f in progress(delete_list):
-                os.remove(os.path.join(folder, f))
+            count = itertools.count()
+            print('Deleting...')
+            deleted_folders = set()
+            for folder, dirs, files in os.walk(os.path.join(spec['location'], self.database), topdown=False):
+                if dirs and files:
+                    raise DataJointError('Invalid repository with files in non-terminal folder %s' % folder)
+                dirs = set(d for d in dirs if os.path.join(folder, d) not in deleted_folders)
+                if not dirs:
+                    files_not_in_use = [f for f in files if f not in in_use]
+                    for f in files_not_in_use:
+                        filename = os.path.join(folder, f)
+                        next(count)
+                        if verbose:
+                            print(filename)
+                        os.remove(filename)
+                    if len(files_not_in_use) == len(files):
+                        os.rmdir(folder)
+                        deleted_folders.add(folder)
+            print('Deleted %d objects' % next(count))
         elif spec['protocol'] == 's3':
             try:
-                S3Folder(database=self.database, **spec).clean(self.fetch('hash'))
+                failed_deletes = s3.Folder(database=self.database, **spec).clean(in_use, verbose=verbose)
             except TypeError:
                 raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
-
-    @staticmethod
-    def _get_store_spec(store):
-        try:
-            spec = config[store]
-        except KeyError:
-            raise DataJointError('Storage {store} is requested but not configured'.format(store=store)) from None
-        if 'protocol' not in spec:
-            raise DataJointError('Storage {store} config is missing the protocol field'.format(store=store))
-        if spec['protocol'] not in {'file', 's3'}:
-            raise DataJointError(
-                'Unknown external storage protocol "{protocol}" in "{store}"'.format(store=store, **spec))
-        return spec
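The removed _get_store_spec() validator is superseded by config.get_store_spec(), which also supplies the new 'subfolding' setting. A hedged sketch of the store configuration this code appears to expect — the key names come from the diff, but the values and any defaults applied in settings.py are assumptions here:

    dj.config['external-raw'] = {
        'protocol': 'file',             # or 's3'
        'location': '/data/external',   # repository root (illustrative path)
        'subfolding': (2, 2),           # fan-out used by subfold()
    }

For 's3' stores the spec is expanded into s3.Folder(**spec), so it must also carry whatever connection fields that class requires; an incomplete spec surfaces as the TypeError handled above.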