Commit 40762f9

Merge branch 'smmap'
2 parents: 17d9d13 + cb4059b

9 files changed: +76 −54 lines

.gitmodules

Lines changed: 3 additions & 1 deletion

@@ -1,4 +1,6 @@
 [submodule "async"]
 	path = gitdb/ext/async
 	url = git://github.com/gitpython-developers/async.git
-	branch = master
+[submodule "smmap"]
+	path = gitdb/ext/smmap
+	url = git://github.com/Byron/smmap.git
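Note: a checkout that pulls this merge must still initialize the new submodule locally, for instance with: git submodule update --init gitdb/ext/smmap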

doc/source/changes.rst

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,11 @@
 Changelog
 #########
 
+*****
+0.5.3
+*****
+* Added support for smmap. SmartMMap allows resources to be managed and controlled. This brings the implementation closer to the way git handles memory maps, such that unused cached memory maps will automatically be freed once a resource limit is hit. The memory limit on 32 bit systems remains though as a sliding mmap implementation is not used for performance reasons.
+
 *****
 0.5.2
 *****
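The changelog entry is dense, so here is a minimal sketch of the behaviour it describes. Only make_cursor(), use_region() and map() are taken from the diffs in this commit; treat anything else about smmap's API as an assumption:

    from smmap import StaticWindowMapManager

    # one shared manager caches memory-mapped regions across all files
    mman = StaticWindowMapManager()

    # map an (illustrative) index file and read its first four bytes
    cursor = mman.make_cursor('pack-1234.idx').use_region()
    signature = cursor.map()[:4]

    # once the manager's resource limit is hit, cached regions that are
    # no longer in use are unmapped automatically, as git itself does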

doc/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -38,7 +38,7 @@
 
 # General information about the project.
 project = u'GitDB'
-copyright = u'2010, Sebastian Thiel'
+copyright = u'2011, Sebastian Thiel'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -47,7 +47,7 @@
 # The short X.Y version.
 version = '0.5'
 # The full version, including alpha/beta/rc tags.
-release = '0.5.1'
+release = '0.5.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

gitdb/__init__.py

Lines changed: 9 additions & 7 deletions

@@ -10,13 +10,15 @@
 #{ Initialization
 def _init_externals():
 	"""Initialize external projects by putting them into the path"""
-	sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', 'async'))
-	
-	try:
-		import async
-	except ImportError:
-		raise ImportError("'async' could not be imported, assure it is located in your PYTHONPATH")
-	#END verify import
+	for module in ('async', 'smmap'):
+		sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module))
+		
+		try:
+			__import__(module)
+		except ImportError:
+			raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module)
+		#END verify import
+	#END handel imports
 
 #} END initialization
 

gitdb/ext/smmap

Submodule smmap added at 84eedc5

gitdb/pack.py

Lines changed: 36 additions & 40 deletions

@@ -10,10 +10,10 @@
 	)
 from util import (
 		zlib,
+		mman,
 		LazyMixin,
 		unpack_from,
 		bin_to_hex,
-		file_contents_ro_filepath,
 	)
 
 from fun import (
@@ -247,7 +247,7 @@ class PackIndexFile(LazyMixin):
 
 	# Dont use slots as we dynamically bind functions for each version, need a dict for this
 	# The slots you see here are just to keep track of our instance variables
-	# __slots__ = ('_indexpath', '_fanout_table', '_data', '_version',
+	# __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version',
 	#				'_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset')
 
 	# used in v2 indices
@@ -261,22 +261,23 @@ def __init__(self, indexpath):
 
 	def _set_cache_(self, attr):
 		if attr == "_packfile_checksum":
-			self._packfile_checksum = self._data[-40:-20]
+			self._packfile_checksum = self._cursor.map()[-40:-20]
 		elif attr == "_packfile_checksum":
-			self._packfile_checksum = self._data[-20:]
-		elif attr == "_data":
+			self._packfile_checksum = self._cursor.map()[-20:]
+		elif attr == "_cursor":
 			# Note: We don't lock the file when reading as we cannot be sure
 			# that we can actually write to the location - it could be a read-only
 			# alternate for instance
-			self._data = file_contents_ro_filepath(self._indexpath)
+			self._cursor = mman.make_cursor(self._indexpath).use_region()
 		else:
 			# now its time to initialize everything - if we are here, someone wants
 			# to access the fanout table or related properties
 			
 			# CHECK VERSION
-			self._version = (self._data[:4] == self.index_v2_signature and 2) or 1
+			mmap = self._cursor.map()
+			self._version = (mmap[:4] == self.index_v2_signature and 2) or 1
 			if self._version == 2:
-				version_id = unpack_from(">L", self._data, 4)[0]
+				version_id = unpack_from(">L", mmap, 4)[0]
 				assert version_id == self._version, "Unsupported index version: %i" % version_id
 			# END assert version
 
@@ -297,16 +298,16 @@ def _set_cache_(self, attr):
 
 	def _entry_v1(self, i):
 		""":return: tuple(offset, binsha, 0)"""
-		return unpack_from(">L20s", self._data, 1024 + i*24) + (0, )
+		return unpack_from(">L20s", self._cursor.map(), 1024 + i*24) + (0, )
 
 	def _offset_v1(self, i):
 		"""see ``_offset_v2``"""
-		return unpack_from(">L", self._data, 1024 + i*24)[0]
+		return unpack_from(">L", self._cursor.map(), 1024 + i*24)[0]
 
 	def _sha_v1(self, i):
 		"""see ``_sha_v2``"""
 		base = 1024 + (i*24)+4
-		return self._data[base:base+20]
+		return self._cursor.map()[base:base+20]
 
 	def _crc_v1(self, i):
 		"""unsupported"""
@@ -322,25 +323,25 @@ def _entry_v2(self, i):
 	def _offset_v2(self, i):
 		""":return: 32 or 64 byte offset into pack files. 64 byte offsets will only
 			be returned if the pack is larger than 4 GiB, or 2^32"""
-		offset = unpack_from(">L", self._data, self._pack_offset + i * 4)[0]
+		offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0]
 		
 		# if the high-bit is set, this indicates that we have to lookup the offset
 		# in the 64 bit region of the file. The current offset ( lower 31 bits )
 		# are the index into it
 		if offset & 0x80000000:
-			offset = unpack_from(">Q", self._data, self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
+			offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
 		# END handle 64 bit offset
 		
 		return offset
 
 	def _sha_v2(self, i):
 		""":return: sha at the given index of this file index instance"""
 		base = self._sha_list_offset + i * 20
-		return self._data[base:base+20]
+		return self._cursor.map()[base:base+20]
 
 	def _crc_v2(self, i):
 		""":return: 4 bytes crc for the object at index i"""
-		return unpack_from(">L", self._data, self._crc_list_offset + i * 4)[0]
+		return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0]
 
 	#} END access V2
 
@@ -358,7 +359,7 @@ def _initialize(self):
 
 	def _read_fanout(self, byte_offset):
 		"""Generate a fanout table from our data"""
-		d = self._data
+		d = self._cursor.map()
 		out = list()
 		append = out.append
 		for i in range(256):
@@ -382,19 +383,19 @@ def path(self):
 
 	def packfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of the pack file"""
-		return self._data[-40:-20]
+		return self._cursor.map()[-40:-20]
 
 	def indexfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of this index file"""
-		return self._data[-20:]
+		return self._cursor.map()[-20:]
 
 	def offsets(self):
 		""":return: sequence of all offsets in the order in which they were written
 		:note: return value can be random accessed, but may be immmutable"""
 		if self._version == 2:
 			# read stream to array, convert to tuple
 			a = array.array('I')	# 4 byte unsigned int, long are 8 byte on 64 bit it appears
-			a.fromstring(buffer(self._data, self._pack_offset, self._pack_64_offset - self._pack_offset))
+			a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset))
 			
 			# networkbyteorder to something array likes more
 			if sys.byteorder == 'little':
@@ -501,7 +502,7 @@ class PackFile(LazyMixin):
 		for some reason - one clearly doesn't want to read 10GB at once in that
 		case"""
 	
-	__slots__ = ('_packpath', '_data', '_size', '_version')
+	__slots__ = ('_packpath', '_cursor', '_size', '_version')
 	pack_signature = 0x5041434b		# 'PACK'
 	pack_version_default = 2
 
@@ -513,26 +514,20 @@ def __init__(self, packpath):
 		self._packpath = packpath
 	
 	def _set_cache_(self, attr):
-		if attr == '_data':
-			self._data = file_contents_ro_filepath(self._packpath)
-			
-			# read the header information
-			type_id, self._version, self._size = unpack_from(">LLL", self._data, 0)
-			
-			# TODO: figure out whether we should better keep the lock, or maybe
-			# add a .keep file instead ?
-		else: # must be '_size' or '_version'
-			# read header info - we do that just with a file stream
-			type_id, self._version, self._size = unpack(">LLL", open(self._packpath).read(12))
-		# END handle header
+		# we fill the whole cache, whichever attribute gets queried first
+		self._cursor = mman.make_cursor(self._packpath).use_region()
 
+		# read the header information
+		type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0)
+		
+		# TODO: figure out whether we should better keep the lock, or maybe
+		# add a .keep file instead ?
 		if type_id != self.pack_signature:
 			raise ParseError("Invalid pack signature: %i" % type_id)
-		#END assert type id
 	
	def _iter_objects(self, start_offset, as_stream=True):
 		"""Handle the actual iteration of objects within this pack"""
-		data = self._data
+		data = self._cursor.map()
 		content_size = len(data) - self.footer_size
 		cur_offset = start_offset or self.first_object_offset
 
@@ -568,11 +563,11 @@ def data(self):
 		"""
 		:return: read-only data of this pack. It provides random access and usually
 			is a memory map"""
-		return self._data
+		return self._cursor.map()
 
 	def checksum(self):
 		""":return: 20 byte sha1 hash on all object sha's contained in this file"""
-		return self._data[-20:]
+		return self._cursor.map()[-20:]
 
 	def path(self):
 		""":return: path to the packfile"""
@@ -591,8 +586,9 @@ def collect_streams(self, offset):
 			If the object at offset is no delta, the size of the list is 1.
 		:param offset: specifies the first byte of the object within this pack"""
 		out = list()
+		data = self._cursor.map()
 		while True:
-			ostream = pack_object_at(self._data, offset, True)[1]
+			ostream = pack_object_at(data, offset, True)[1]
 			out.append(ostream)
 			if ostream.type_id == OFS_DELTA:
 				offset = ostream.pack_offset - ostream.delta_info
@@ -614,14 +610,14 @@ def info(self, offset):
 		
 		:param offset: byte offset
 		:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
-		return pack_object_at(self._data, offset or self.first_object_offset, False)[1]
+		return pack_object_at(self._cursor.map(), offset or self.first_object_offset, False)[1]
 
 	def stream(self, offset):
 		"""Retrieve an object at the given file-relative offset as stream along with its information
 		
 		:param offset: byte offset
 		:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
-		return pack_object_at(self._data, offset or self.first_object_offset, True)[1]
+		return pack_object_at(self._cursor.map(), offset or self.first_object_offset, True)[1]
 
 	def stream_iter(self, start_offset=0):
 		"""
@@ -704,7 +700,7 @@ def _object(self, sha, as_stream, index=-1):
 			sha = self._index.sha(index)
 		# END assure sha is present ( in output )
 		offset = self._index.offset(index)
-		type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._data, offset))
+		type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._cursor.map(), offset))
 		if as_stream:
 			if type_id not in delta_types:
 				packstream = self._pack.stream(offset)
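The changes in this file are mechanical: every read that used to slice the whole-file buffer in self._data now asks a cached smmap cursor for its mapped window. A condensed sketch of the resulting pattern (illustrative only, not code from this commit; MappedFile is a made-up name):

    from gitdb.util import mman

    class MappedFile(object):
        """Caches an smmap cursor instead of a whole-file read-only buffer."""
        def __init__(self, path):
            # map the file once; the shared manager may reclaim idle regions
            self._cursor = mman.make_cursor(path).use_region()

        def checksum(self):
            # map() yields a buffer-compatible view, sliced like the old mmap
            return self._cursor.map()[-20:]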

gitdb/test/performance/test_pack.py

Lines changed: 4 additions & 1 deletion

@@ -15,9 +15,11 @@
 from time import time
 import random
 
+from nose import SkipTest
+
 class TestPackedDBPerformance(TestBigRepoR):
 	
-	def _test_pack_random_access(self):
+	def test_pack_random_access(self):
 		pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
 		
 		# sha lookup
@@ -66,6 +68,7 @@ def _test_pack_random_access(self):
 		print >> sys.stderr, "PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib/elapsed , elapsed, max_items / elapsed)
 
 	def test_correctness(self):
+		raise SkipTest("Takes too long, enable it if you change the algorithm and want to be sure you decode packs correctly")
 		pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
 		# disabled for now as it used to work perfectly, checking big repositories takes a long time
 		print >> sys.stderr, "Endurance run: verify streaming of objects (crc and sha)"

gitdb/util.py

Lines changed: 13 additions & 0 deletions

@@ -23,6 +23,14 @@
 # END try async zlib
 
 from async import ThreadPool
+from smmap import (
+					StaticWindowMapManager,
+					SlidingWindowMapBuffer
+				)
+
+# initialize our global memory manager instance
+# Use it to free cached (and unused) resources.
+mman = StaticWindowMapManager()
 
 try:
 	import hashlib
@@ -180,6 +188,11 @@ def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0):
 		close(fd)
 	# END assure file is closed
 
+def sliding_ro_buffer(filepath, flags=0):
+	""":return: a buffer compatible object which uses our mapped memory manager internally
+	ready to read the whole given filepath"""
+	return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags)
+
 def to_hex_sha(sha):
 	""":return: hexified version of sha"""
 	if len(sha) == 40:
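A usage sketch for the new helper. The path is hypothetical, and the slicing behaviour is an assumption about what SlidingWindowMapBuffer's buffer-compatible interface provides:

    from gitdb.util import sliding_ro_buffer

    # hypothetical pack file path
    buf = sliding_ro_buffer('objects/pack/pack-big.pack')
    header = buf[:12]    # presumably maps only the window covering these bytes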

setup.py

Lines changed: 3 additions & 3 deletions

@@ -69,7 +69,7 @@ def get_data_files(self):
 
 setup(cmdclass={'build_ext':build_ext_nofail},
 	  name = "gitdb",
-	  version = "0.5.2",
+	  version = "0.5.3",
 	  description = "Git Object Database",
 	  author = "Sebastian Thiel",
 	  author_email = "[email protected]",
@@ -80,7 +80,7 @@ def get_data_files(self):
 	  ext_modules=[Extension('gitdb._perf', ['gitdb/_fun.c', 'gitdb/_delta_apply.c'], include_dirs=['gitdb'])],
 	  license = "BSD License",
 	  zip_safe=False,
-	  requires=('async (>=0.6.1)',),
-	  install_requires='async >= 0.6.1',
+	  requires=('async (>=0.6.1)', 'smmap (>=0.8.0)'),
+	  install_requires=('async >= 0.6.1', 'smmap >= 0.8.0'),
 	  long_description = """GitDB is a pure-Python git object database"""
 	  )
