Commit 40762f9

Merge branch 'smmap'
2 parents: 17d9d13 + cb4059b

9 files changed: +76 −54 lines

.gitmodules

Lines changed: 3 additions & 1 deletion

@@ -1,4 +1,6 @@
 [submodule "async"]
 	path = gitdb/ext/async
 	url = git://github.com/gitpython-developers/async.git
-	branch = master
+[submodule "smmap"]
+	path = gitdb/ext/smmap
+	url = git://github.com/Byron/smmap.git
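Note: a checkout that pulls this merge must still initialize the new submodule locally, for instance with: git submodule update --init gitdb/ext/smmap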

doc/source/changes.rst

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,11 @@
 Changelog
 #########
 
+*****
+0.5.3
+*****
+* Added support for smmap. SmartMMap allows resources to be managed and controlled. This brings the implementation closer to the way git handles memory maps, such that unused cached memory maps will automatically be freed once a resource limit is hit. The memory limit on 32 bit systems remains though as a sliding mmap implementation is not used for performance reasons.
+
 *****
 0.5.2
 *****
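The changelog entry is dense, so here is a minimal sketch of the behaviour it describes. Only make_cursor(), use_region() and map() are taken from the diffs in this commit; treat anything else about smmap's API as an assumption:

    from smmap import StaticWindowMapManager

    # one shared manager caches memory-mapped regions across all files
    mman = StaticWindowMapManager()

    # map an (illustrative) index file and read its first four bytes
    cursor = mman.make_cursor('pack-1234.idx').use_region()
    signature = cursor.map()[:4]

    # once the manager's resource limit is hit, cached regions that are
    # no longer in use are unmapped automatically, as git itself does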

doc/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -38,7 +38,7 @@
 
 # General information about the project.
 project = u'GitDB'
-copyright = u'2010, Sebastian Thiel'
+copyright = u'2011, Sebastian Thiel'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -47,7 +47,7 @@
 # The short X.Y version.
 version = '0.5'
 # The full version, including alpha/beta/rc tags.
-release = '0.5.1'
+release = '0.5.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

gitdb/__init__.py

Lines changed: 9 additions & 7 deletions

@@ -10,13 +10,15 @@
 #{ Initialization
 def _init_externals():
 	"""Initialize external projects by putting them into the path"""
-	sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', 'async'))
-	
-	try:
-		import async
-	except ImportError:
-		raise ImportError("'async' could not be imported, assure it is located in your PYTHONPATH")
-	#END verify import
+	for module in ('async', 'smmap'):
+		sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module))
+		
+		try:
+			__import__(module)
+		except ImportError:
+			raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module)
+		#END verify import
+	#END handel imports
 
 #} END initialization
 

gitdb/ext/smmap

Submodule smmap added at 84eedc5

gitdb/pack.py

Lines changed: 36 additions & 40 deletions

@@ -10,10 +10,10 @@
 	)
 from util import (
 		zlib,
+		mman,
 		LazyMixin,
 		unpack_from,
 		bin_to_hex,
-		file_contents_ro_filepath,
 	)
 
 from fun import (
@@ -247,7 +247,7 @@ class PackIndexFile(LazyMixin):
 
 	# Dont use slots as we dynamically bind functions for each version, need a dict for this
 	# The slots you see here are just to keep track of our instance variables
-	# __slots__ = ('_indexpath', '_fanout_table', '_data', '_version',
+	# __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version',
 	#				'_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset')
 
 	# used in v2 indices
@@ -261,22 +261,23 @@ def __init__(self, indexpath):
 
 	def _set_cache_(self, attr):
 		if attr == "_packfile_checksum":
-			self._packfile_checksum = self._data[-40:-20]
+			self._packfile_checksum = self._cursor.map()[-40:-20]
 		elif attr == "_packfile_checksum":
-			self._packfile_checksum = self._data[-20:]
-		elif attr == "_data":
+			self._packfile_checksum = self._cursor.map()[-20:]
+		elif attr == "_cursor":
 			# Note: We don't lock the file when reading as we cannot be sure
 			# that we can actually write to the location - it could be a read-only
 			# alternate for instance
-			self._data = file_contents_ro_filepath(self._indexpath)
+			self._cursor = mman.make_cursor(self._indexpath).use_region()
 		else:
 			# now its time to initialize everything - if we are here, someone wants
 			# to access the fanout table or related properties
 			
 			# CHECK VERSION
-			self._version = (self._data[:4] == self.index_v2_signature and 2) or 1
+			mmap = self._cursor.map()
+			self._version = (mmap[:4] == self.index_v2_signature and 2) or 1
 			if self._version == 2:
-				version_id = unpack_from(">L", self._data, 4)[0]
+				version_id = unpack_from(">L", mmap, 4)[0]
 				assert version_id == self._version, "Unsupported index version: %i" % version_id
 			# END assert version
 
@@ -297,16 +298,16 @@ def _set_cache_(self, attr):
 
 	def _entry_v1(self, i):
 		""":return: tuple(offset, binsha, 0)"""
-		return unpack_from(">L20s", self._data, 1024 + i*24) + (0, )
+		return unpack_from(">L20s", self._cursor.map(), 1024 + i*24) + (0, )
 
 	def _offset_v1(self, i):
 		"""see ``_offset_v2``"""
-		return unpack_from(">L", self._data, 1024 + i*24)[0]
+		return unpack_from(">L", self._cursor.map(), 1024 + i*24)[0]
 
 	def _sha_v1(self, i):
 		"""see ``_sha_v2``"""
 		base = 1024 + (i*24)+4
-		return self._data[base:base+20]
+		return self._cursor.map()[base:base+20]
 
 	def _crc_v1(self, i):
 		"""unsupported"""
@@ -322,25 +323,25 @@ def _entry_v2(self, i):
 	def _offset_v2(self, i):
 		""":return: 32 or 64 byte offset into pack files. 64 byte offsets will only
 			be returned if the pack is larger than 4 GiB, or 2^32"""
-		offset = unpack_from(">L", self._data, self._pack_offset + i * 4)[0]
+		offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0]
 		
 		# if the high-bit is set, this indicates that we have to lookup the offset
 		# in the 64 bit region of the file. The current offset ( lower 31 bits )
 		# are the index into it
 		if offset & 0x80000000:
-			offset = unpack_from(">Q", self._data, self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
+			offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
 		# END handle 64 bit offset
 		
 		return offset
 
 	def _sha_v2(self, i):
 		""":return: sha at the given index of this file index instance"""
 		base = self._sha_list_offset + i * 20
-		return self._data[base:base+20]
+		return self._cursor.map()[base:base+20]
 
 	def _crc_v2(self, i):
 		""":return: 4 bytes crc for the object at index i"""
-		return unpack_from(">L", self._data, self._crc_list_offset + i * 4)[0]
+		return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0]
 
 	#} END access V2
 
@@ -358,7 +359,7 @@ def _initialize(self):
 
 	def _read_fanout(self, byte_offset):
 		"""Generate a fanout table from our data"""
-		d = self._data
+		d = self._cursor.map()
 		out = list()
 		append = out.append
 		for i in range(256):
@@ -382,19 +383,19 @@ def path(self):
 
 	def packfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of the pack file"""
-		return self._data[-40:-20]
+		return self._cursor.map()[-40:-20]
 
 	def indexfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of this index file"""
-		return self._data[-20:]
+		return self._cursor.map()[-20:]
 
 	def offsets(self):
 		""":return: sequence of all offsets in the order in which they were written
 		:note: return value can be random accessed, but may be immmutable"""
 		if self._version == 2:
 			# read stream to array, convert to tuple
 			a = array.array('I')	# 4 byte unsigned int, long are 8 byte on 64 bit it appears
-			a.fromstring(buffer(self._data, self._pack_offset, self._pack_64_offset - self._pack_offset))
+			a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset))
 			
 			# networkbyteorder to something array likes more
 			if sys.byteorder == 'little':
@@ -501,7 +502,7 @@ class PackFile(LazyMixin):
 		for some reason - one clearly doesn't want to read 10GB at once in that
 		case"""
 	
-	__slots__ = ('_packpath', '_data', '_size', '_version')
+	__slots__ = ('_packpath', '_cursor', '_size', '_version')
 	pack_signature = 0x5041434b		# 'PACK'
 	pack_version_default = 2
 
@@ -513,26 +514,20 @@ def __init__(self, packpath):
 		self._packpath = packpath
 	
 	def _set_cache_(self, attr):
-		if attr == '_data':
-			self._data = file_contents_ro_filepath(self._packpath)
-			
-			# read the header information
-			type_id, self._version, self._size = unpack_from(">LLL", self._data, 0)
-			
-			# TODO: figure out whether we should better keep the lock, or maybe
-			# add a .keep file instead ?
-		else: # must be '_size' or '_version'
-			# read header info - we do that just with a file stream
-			type_id, self._version, self._size = unpack(">LLL", open(self._packpath).read(12))
-		# END handle header
+		# we fill the whole cache, whichever attribute gets queried first
+		self._cursor = mman.make_cursor(self._packpath).use_region()
 
+		# read the header information
+		type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0)
+		
+		# TODO: figure out whether we should better keep the lock, or maybe
+		# add a .keep file instead ?
 		if type_id != self.pack_signature:
 			raise ParseError("Invalid pack signature: %i" % type_id)
-		#END assert type id
 	
	def _iter_objects(self, start_offset, as_stream=True):
 		"""Handle the actual iteration of objects within this pack"""
-		data = self._data
+		data = self._cursor.map()
 		content_size = len(data) - self.footer_size
 		cur_offset = start_offset or self.first_object_offset
 
@@ -568,11 +563,11 @@ def data(self):
 		"""
 		:return: read-only data of this pack. It provides random access and usually
 			is a memory map"""
-		return self._data
+		return self._cursor.map()
 
 	def checksum(self):
 		""":return: 20 byte sha1 hash on all object sha's contained in this file"""
-		return self._data[-20:]
+		return self._cursor.map()[-20:]
 
 	def path(self):
 		""":return: path to the packfile"""
@@ -591,8 +586,9 @@ def collect_streams(self, offset):
 			If the object at offset is no delta, the size of the list is 1.
 		:param offset: specifies the first byte of the object within this pack"""
 		out = list()
+		data = self._cursor.map()
 		while True:
-			ostream = pack_object_at(self._data, offset, True)[1]
+			ostream = pack_object_at(data, offset, True)[1]
 			out.append(ostream)
 			if ostream.type_id == OFS_DELTA:
 				offset = ostream.pack_offset - ostream.delta_info
@@ -614,14 +610,14 @@ def info(self, offset):
 		
 		:param offset: byte offset
 		:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
-		return pack_object_at(self._data, offset or self.first_object_offset, False)[1]
+		return pack_object_at(self._cursor.map(), offset or self.first_object_offset, False)[1]
 
 	def stream(self, offset):
 		"""Retrieve an object at the given file-relative offset as stream along with its information
 		
 		:param offset: byte offset
 		:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
-		return pack_object_at(self._data, offset or self.first_object_offset, True)[1]
+		return pack_object_at(self._cursor.map(), offset or self.first_object_offset, True)[1]
 
 	def stream_iter(self, start_offset=0):
 		"""
@@ -704,7 +700,7 @@ def _object(self, sha, as_stream, index=-1):
 			sha = self._index.sha(index)
 		# END assure sha is present ( in output )
 		offset = self._index.offset(index)
-		type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._data, offset))
+		type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._cursor.map(), offset))
 		if as_stream:
 			if type_id not in delta_types:
 				packstream = self._pack.stream(offset)
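The changes in this file are mechanical: every read that used to slice the whole-file buffer in self._data now asks a cached smmap cursor for its mapped window. A condensed sketch of the resulting pattern (illustrative only, not code from this commit; MappedFile is a made-up name):

    from gitdb.util import mman

    class MappedFile(object):
        """Caches an smmap cursor instead of a whole-file read-only buffer."""
        def __init__(self, path):
            # map the file once; the shared manager may reclaim idle regions
            self._cursor = mman.make_cursor(path).use_region()

        def checksum(self):
            # map() yields a buffer-compatible view, sliced like the old mmap
            return self._cursor.map()[-20:]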

gitdb/test/performance/test_pack.py

Lines changed: 4 additions & 1 deletion

@@ -15,9 +15,11 @@
 from time import time
 import random
 
+from nose import SkipTest
+
 class TestPackedDBPerformance(TestBigRepoR):
 	
-	def _test_pack_random_access(self):
+	def test_pack_random_access(self):
 		pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
 		
 		# sha lookup
@@ -66,6 +68,7 @@ def _test_pack_random_access(self):
 		print >> sys.stderr, "PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib/elapsed , elapsed, max_items / elapsed)
 
 	def test_correctness(self):
+		raise SkipTest("Takes too long, enable it if you change the algorithm and want to be sure you decode packs correctly")
 		pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
 		# disabled for now as it used to work perfectly, checking big repositories takes a long time
 		print >> sys.stderr, "Endurance run: verify streaming of objects (crc and sha)"

gitdb/util.py

Lines changed: 13 additions & 0 deletions

@@ -23,6 +23,14 @@
 # END try async zlib
 
 from async import ThreadPool
+from smmap import (
+					StaticWindowMapManager,
+					SlidingWindowMapBuffer
+				)
+
+# initialize our global memory manager instance
+# Use it to free cached (and unused) resources.
+mman = StaticWindowMapManager()
 
 try:
 	import hashlib
@@ -180,6 +188,11 @@ def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0):
 		close(fd)
 	# END assure file is closed
 
+def sliding_ro_buffer(filepath, flags=0):
+	""":return: a buffer compatible object which uses our mapped memory manager internally
+	ready to read the whole given filepath"""
+	return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags)
+
 def to_hex_sha(sha):
 	""":return: hexified version of sha"""
 	if len(sha) == 40:
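A usage sketch for the new helper. The path is hypothetical, and the slicing behaviour is an assumption about what SlidingWindowMapBuffer's buffer-compatible interface provides:

    from gitdb.util import sliding_ro_buffer

    # hypothetical pack file path
    buf = sliding_ro_buffer('objects/pack/pack-big.pack')
    header = buf[:12]    # presumably maps only the window covering these bytes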

setup.py

Lines changed: 3 additions & 3 deletions

@@ -69,7 +69,7 @@ def get_data_files(self):
 
 setup(cmdclass={'build_ext':build_ext_nofail},
 	  name = "gitdb",
-	  version = "0.5.2",
+	  version = "0.5.3",
 	  description = "Git Object Database",
 	  author = "Sebastian Thiel",
 	  author_email = "[email protected]",
@@ -80,7 +80,7 @@ def get_data_files(self):
 	  ext_modules=[Extension('gitdb._perf', ['gitdb/_fun.c', 'gitdb/_delta_apply.c'], include_dirs=['gitdb'])],
 	  license = "BSD License",
 	  zip_safe=False,
-	  requires=('async (>=0.6.1)',),
-	  install_requires='async >= 0.6.1',
+	  requires=('async (>=0.6.1)', 'smmap (>=0.8.0)'),
+	  install_requires=('async >= 0.6.1', 'smmap >= 0.8.0'),
 	  long_description = """GitDB is a pure-Python git object database"""
 	  )
