Skip to content

Commit 43cacb1

Browse files
committed
upload resume works. Inventory entries are now their own class.
1 parent 744081c commit 43cacb1

File tree

5 files changed

+244
-77
lines changed

5 files changed

+244
-77
lines changed

backup.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from fileupload import FileUpload
22
from inventory import Inventory
3+
from inventory import FileState
34
import boto3
45
import cli
56
import json
@@ -10,9 +11,9 @@
1011
client = boto3.client('glacier')
1112

1213

13-
def upload_file(filePath):
14-
file_upload = FileUpload(config['vaultName'], filePath)
15-
cli.pp(file_upload.upload(client))
14+
def upload_file(inventory_entry):
15+
file_upload = FileUpload(config['vaultName'], inventory_entry)
16+
file_upload.upload(client)
1617

1718

1819
def list_jobs(vaultName):
@@ -41,18 +42,26 @@ def fetch_inventory(vaultName, jobId):
4142

4243
def sync(vaultName):
4344

44-
# glacier = boto3.resource('glacier')
45-
# vault = glacier.Vault(config['accountId'], vaultName)
4645
inventory = Inventory('/home/iolsen/test_pyback')
47-
cli.pp(inventory._entries)
46+
# cli.pp(inventory._entries)
4847
inventory.save()
49-
48+
for entry in inventory.get_by_state(FileState.IN_PROGRESS):
49+
upload_file(entry)
50+
for entry in inventory.get_by_state(FileState.NEW):
51+
upload_file(entry)
52+
entry = inventory.get_inventory_file_entry()
53+
if entry.get_state() != FileState.UPLOADED:
54+
upload_file(entry)
5055

5156
# perform_inventory(config['vaultName'])
5257
# list_jobs(config['vaultName'])
5358
# fetch_inventory(config['vaultName'],
5459
# '-YDD4AVvtcn6rn7zEYz8SF2HzNdLqqIhRnduONtSTz40jOBfAvIvycrfGJNijSefJHDS8D8A9tOCNxv6akFckF81Z493')
5560

56-
upload_file(config['filePath'])
61+
# upload_file(config['filePath'])
62+
63+
sync(config['vaultName'])
5764

58-
# sync(config['vaultName'])
65+
# inventory = Inventory('/home/iolsen/test_pyback')
66+
# for entry in inventory._entries.itervalues():
67+
# cli.pp(entry.__dict__)

cli.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,10 @@ def cli_progress(filename,
6565
rate,
6666
remaining)
6767

68-
sys.stdout.write(output.ljust(int(columns)))
68+
sys.stderr.write(output.ljust(int(columns)))
6969
if current_val == end_val:
70-
sys.stdout.write('\n')
71-
sys.stdout.flush()
70+
sys.stderr.write('\n')
71+
sys.stderr.flush()
7272

7373

7474
def pp(data):

fileupload.py

+101-37
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from inventory import FileState
12
from treehash import TreeHash
23
import botocore
34
import cli
@@ -7,13 +8,47 @@
78

89
class FileUpload:
910

10-
def __init__(self, vaultName, filePath):
11-
self._filePath = filePath
11+
def __init__(self, vaultName, inventory_entry):
12+
self._startTime = 0
1213
self._vaultName = vaultName
13-
self._fileName = os.path.basename(filePath)
14-
self._fileSizeBytes = os.path.getsize(self._filePath)
15-
self._partSize = get_best_part_size(self._fileSizeBytes)
16-
self._partNumUploading = 0
14+
self._inventory_entry = inventory_entry
15+
self._fileSizeBytes = os.path.getsize(
16+
self._inventory_entry.get_filePath())
17+
18+
if inventory_entry.get_state() == FileState.IN_PROGRESS:
19+
self._upload_id = inventory_entry.get_upload_id()
20+
self._partSize = inventory_entry.get_part_size()
21+
self._partNumUploading = inventory_entry.get_parts_uploaded()
22+
else:
23+
self._partSize = self._get_best_part_size(self._fileSizeBytes)
24+
self._partNumUploading = 0
25+
26+
def get_state(self):
27+
return self._inventory_entry.get_state()
28+
29+
def get_parts_uploaded(self):
30+
return self._partNumUploading
31+
32+
def get_part_size(self):
33+
return self._partSize
34+
35+
def get_upload_id(self):
36+
return self._upload_id
37+
38+
def get_end_time(self):
39+
return self._endTime
40+
41+
def get_checksum(self):
42+
return self._checksum
43+
44+
def get_http_status(self):
45+
return self._http_status
46+
47+
def get_archive_id(self):
48+
return self._archive_id
49+
50+
def get_upload_location(self):
51+
return self._upload_location
1752

1853
def formattedFileSize(self):
1954
if not hasattr(self, '_formattedFileSize'):
@@ -27,15 +62,25 @@ def formattedPartSize(self):
2762

2863
def upload(self, client):
2964

30-
self._upload = client.initiate_multipart_upload(
31-
vaultName=self._vaultName,
32-
archiveDescription=self._fileName,
33-
partSize=str(self._partSize))
34-
35-
treehash = TreeHash()
36-
partBegin = 0
37-
self._partNumUploading = 0
38-
with open(self._filePath, "rb") as f:
65+
if (self._inventory_entry.get_state() == FileState.IN_PROGRESS):
66+
self._upload_id = self._inventory_entry.get_upload_id()
67+
else:
68+
tmp_upload = client.initiate_multipart_upload(
69+
vaultName=self._vaultName,
70+
archiveDescription=self._inventory_entry.get_fileName(),
71+
partSize=str(self._partSize))
72+
self._upload_id = tmp_upload['uploadId']
73+
74+
if self._partSize < self._fileSizeBytes:
75+
self._inventory_entry.set_state_from_upload(
76+
self, FileState.IN_PROGRESS)
77+
78+
partBegin = self._partNumUploading * self._partSize
79+
data = b""
80+
with open(self._inventory_entry.get_filePath(), "rb") as f:
81+
if partBegin:
82+
data = f.read(partBegin)
83+
treehash = TreeHash(data=data, block_size=self._partSize)
3984
while partBegin < self._fileSizeBytes:
4085
partEnd = partBegin + self._partSize - 1
4186
if partEnd > self._fileSizeBytes:
@@ -44,65 +89,84 @@ def upload(self, client):
4489
part = f.read(self._partSize)
4590
treehash.update(part)
4691

47-
if partBegin == 0:
92+
if not self._startTime:
4893
self._startTime = time.time()
94+
4995
self._upload_part(client, part, partBegin, partEnd)
5096
partBegin = partEnd + 1
5197
self._partNumUploading += 1
5298

99+
if partEnd < self._fileSizeBytes:
100+
self._inventory_entry.set_state_from_upload(
101+
self, FileState.IN_PROGRESS)
102+
103+
completed_treehash = treehash.hexdigest()
53104
response = client.complete_multipart_upload(
54105
vaultName=self._vaultName,
55-
uploadId=self._upload['uploadId'],
106+
uploadId=self._upload_id,
56107
archiveSize=str(self._fileSizeBytes),
57-
checksum=treehash.hexdigest())
108+
checksum=completed_treehash)
109+
110+
self._endTime = time.time()
58111

59-
cli.cli_progress(self._fileName,
112+
cli.cli_progress(self._inventory_entry.get_fileName(),
60113
self.formattedFileSize(),
61114
self.formattedPartSize(),
62115
self._startTime,
63116
self._fileSizeBytes-1,
64117
self._fileSizeBytes-1)
65118

66-
return response
119+
# Sanity check that's probably unnecessary.
120+
if treehash.hexdigest() != response['checksum']:
121+
raise Exception('checksum mismatch')
122+
123+
self._checksum = response['checksum']
124+
self._http_status = response['ResponseMetadata']['HTTPStatusCode']
125+
self._archive_id = response['archiveId']
126+
self._upload_location = response['location']
127+
# cli.pp(json.dumps(self, default=lambda o: o.__dict__))
128+
129+
self._inventory_entry.set_state_from_upload(self, FileState.UPLOADED)
67130

68131
def _upload_part(self,
69132
client,
70133
part,
71134
partBegin,
72135
partEnd):
73136

74-
cli.cli_progress(self._fileName,
137+
cli.cli_progress(self._inventory_entry.get_fileName(),
75138
self.formattedFileSize(),
76139
self.formattedPartSize(),
77140
self._startTime,
78141
partBegin,
79142
self._fileSizeBytes-1)
80143

81144
for upload_attempt in range(0, 2):
82-
# print 'Uploading bytes %d through %d (%d%%)...' % (
83-
# partBegin, partEnd,
84-
# float(partEnd)/(self._fileSizeBytes-1)*100)
145+
print '\nUploading bytes %d through %d (%d%%)...' % (
146+
partBegin, partEnd,
147+
float(partEnd)/(self._fileSizeBytes-1)*100)
85148
try:
86149
response = client.upload_multipart_part(
87150
vaultName=self._vaultName,
88-
uploadId=self._upload['uploadId'],
151+
uploadId=self._upload_id,
89152
range='bytes %d-%d/*' % (partBegin, partEnd),
90153
body=part)
91154
return response
155+
92156
except botocore.exceptions.ClientError, e:
93-
print "\n" + e
157+
print "\n"
158+
print e
94159
print "Retrying..."
95160

96161
print "\nFAILED"
97162

98-
99-
def get_best_part_size(fileSizeBytes):
100-
# We want the smallest possible part size. Maximum parts is 10,000.
101-
# So we find the first part size larger than file_len/10,000.
102-
targetSize = fileSizeBytes / 10000
103-
partSize = 1048576 # min size 1 MB
104-
while partSize < targetSize:
105-
partSize *= 2
106-
if partSize > targetSize or partSize == 4294967296: # max size 4GB
107-
break
108-
return partSize
163+
def _get_best_part_size(self, fileSizeBytes):
164+
# We want the smallest possible part size. Maximum parts is 10,000.
165+
# So we find the first part size larger than file_len/10,000.
166+
targetSize = fileSizeBytes / 10000
167+
partSize = 1048576 # min size 1 MB
168+
while partSize < targetSize:
169+
partSize *= 2
170+
if partSize > targetSize or partSize == 4294967296: # max size 4GB
171+
break
172+
return partSize

0 commit comments

Comments
 (0)