Skip to content

Commit 22d8ed1

Browse files
committed
Basic manual sha calculation works
1 parent 723d8e3 commit 22d8ed1

File tree

8 files changed

+123
-27
lines changed

8 files changed

+123
-27
lines changed

generate-commits-fast.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ It is sometimes interesting to generate a ton of commits to test some edge case,
99
- `openssl dgst -sha1`: 22s
1010
- `git hash-object --stdin -w`: 21s
1111
- `git hash-object --stdin`: 20s
12-
- `sha1sum` Coreutils: 1.4s. TODO: why so much faster than `hash-object`? This is minimum bottleneck per CPU. We can reuse identical blobs.
13-
- touch: 0.9s (same on ramfs). This is the minimum IO bottleneck. Since the CPU bottleneck is not much above, parallelization is not the trouble.
12+
- `sha1sum` Coreutils: 1.4s.
13+
- touch: 0.9s (same on ramfs).
14+
- `time python3 <(printf 'import hashlib; import sys;\nfor i in range(1000): print(hashlib.sha1(str(i).encode("ascii")).hexdigest())')`: 0.14s. TODO: why is this so much faster than `hash-object`? This is the minimum per-CPU bottleneck.

other-test-repos/.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
/clone/
2-
/repo/
1+
*.pyc
2+
/*.tmp/
3+
__pycache__/

other-test-repos/commit-meta.bashrc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
date='2000-01-01T00:00:00+0000'
33
44
name='a'
5-
export GIT_COMMITTER_EMAIL="$email"
6-
export GIT_COMMITTER_NAME="$name"
7-
export GIT_COMMITTER_DATE="$date"
5+
export GIT_AUTHOR_DATE="$date"
86
export GIT_AUTHOR_EMAIL="$email"
97
export GIT_AUTHOR_NAME="$name"
10-
export GIT_AUTHOR_DATE="$date"
8+
export GIT_COMMITTER_DATE="$date"
9+
export GIT_COMMITTER_EMAIL="$email"
10+
export GIT_COMMITTER_NAME="$name"

other-test-repos/finish-bare.bashrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
git clone -q repo clone
1+
git clone -q repo.tmp clone.tmp

other-test-repos/init.bashrc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
rm -rf repo clone
2-
mkdir -p repo
3-
cd repo
1+
rm -rf repo.tmp clone.tmp
2+
mkdir -p repo.tmp
3+
cd repo.tmp
44
git init -q

other-test-repos/manual-sha-calculation.py

100644100755
Lines changed: 81 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,86 @@
11
#!/usr/bin/env python3
22

3+
"""
4+
Create objects very manually. Goals:
5+
6+
- learn the object file
7+
- be faster than Git to generate large repos
8+
"""
9+
310
import datetime
11+
import subprocess
12+
import hashlib
13+
import zlib
414
import os
515

6-
date0 = datetime.date(2000, 1, 1)
7-
datef = datetime.date(2100, 1, 1)
8-
date = date0
9-
while date < datef:
10-
s = date.strftime("%Y-%m-%dT01:00:00")
11-
cmd = 'GIT_COMMITTER_DATE="{0}" git commit --allow-empty --allow-empty-message -m "" --date="{0}"'.format(s)
12-
print cmd
13-
os.system(cmd)
14-
date += datetime.timedelta(days=1)
16+
import util
17+
18+
util.init()
19+
git_dir = b'.git'
20+
objects_dir = os.path.join(git_dir, b'objects')
21+
22+
# Directory parameters.
23+
blob_content = b'a'
24+
blob_basename = b'a'
25+
blob_mode = b'100644'
26+
27+
name = b'a'
28+
29+
# 2000-01-01T00:00:00+0000
30+
date = b'946684800 +0000'
31+
32+
author_date = date
33+
author_email = email
34+
author_name = name
35+
committer_date = date
36+
committer_email = email
37+
committer_name = name
38+
message = b'a'
39+
40+
def get_object_and_sha(obj_type, content):
    """
    Build the serialized form of an object and compute its SHA-1.

    The serialized object is `<type> <decimal length>\\0<content>`, and the
    SHA-1 is taken over that whole byte string.

    :param obj_type: object type as bytes, e.g. b'blob', b'tree', b'commit'
    :param content: object payload as bytes
    :return: tuple `(obj, sha_ascii, sha)`: the serialized object, its
        SHA-1 as 40 ASCII hex bytes, and its SHA-1 as 20 raw bytes
    """
    header = b'%s %d\0' % (obj_type, len(content))
    obj = header + content
    # Named `digest` rather than `hash` to avoid shadowing the builtin.
    digest = hashlib.sha1(obj)
    return (obj, digest.hexdigest().encode('ascii'), digest.digest())
44+
45+
def save_object(obj_type, content):
    """
    Write a loose object under the module-level `objects_dir`.

    The object is serialized and hashed with `get_object_and_sha`, then
    zlib-compressed into `<objects_dir>/<first 2 hex chars>/<remaining 38>`.

    :param obj_type: object type as bytes, e.g. b'blob', b'tree', b'commit'
    :param content: object payload as bytes
    :return: the `(obj, sha_ascii, sha)` tuple from `get_object_and_sha`,
        so callers do not need to hash the same content a second time
    """
    obj, sha_ascii, sha = get_object_and_sha(obj_type, content)
    obj_dir = os.path.join(objects_dir, sha_ascii[:2])
    obj_path = os.path.join(obj_dir, sha_ascii[2:])
    # Several objects can share the same two-character fan-out directory.
    os.makedirs(obj_dir, exist_ok=True)
    with open(obj_path, 'wb') as f:
        f.write(zlib.compress(obj))
    return (obj, sha_ascii, sha)
52+
53+
def get_git_hash_object(obj_type, input):
    """
    Ask `git hash-object` for the SHA of the given content.

    Used as a reference to check the manually computed hashes.

    :param obj_type: object type as bytes, passed to `-t`
    :param input: object payload as bytes, fed to the command on stdin
        (the name shadows the builtin but is kept for caller compatibility)
    :return: the command's stdout with trailing whitespace stripped, as bytes
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    cmd = [b'git', b'hash-object', b'--stdin', b'-t', obj_type]
    return subprocess.check_output(cmd, input=input).rstrip()
56+
57+
# Blob.
58+
save_object(b'blob', blob_content)
59+
obj, blob_sha_ascii, blob_sha = get_object_and_sha(b'blob', blob_content)
60+
# Check sha matches Git.
61+
blob_sha_git = get_git_hash_object(b'blob', blob_content)
62+
assert blob_sha_ascii == blob_sha_git
63+
64+
# Tree.
65+
tree_content = b'%s %s\0%s' % (blob_mode, blob_basename, blob_sha)
66+
save_object(b'tree', tree_content)
67+
# Check sha matches Git.
68+
obj, tree_sha_ascii, tree_sha = get_object_and_sha(b'tree', tree_content)
69+
tree_sha_git = get_git_hash_object(b'tree', tree_content)
70+
assert tree_sha_ascii == tree_sha_git
71+
72+
# Commit.
73+
commit_content = b'tree %s\nauthor %s <%s> %s\ncommitter %s <%s> %s\n\n%s\n' % (
74+
tree_sha_ascii,
75+
author_name, author_email, author_date,
76+
committer_name, committer_email, committer_date,
77+
message)
78+
save_object(b'commit', commit_content)
79+
# Check sha matches Git.
80+
obj, commit_sha_ascii, commit_sha = get_object_and_sha(b'commit', commit_content)
81+
commit_sha_git = get_git_hash_object(b'commit', commit_content)
82+
assert commit_sha_ascii == commit_sha_git
83+
84+
# Create master branch.
85+
subprocess.check_output(['git', 'branch', 'master', commit_sha_ascii])
86+
subprocess.check_output(['git', 'clone', '.', '../clone.tmp'])

other-test-repos/streak.py

100644100755
Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,28 @@
22

33
import datetime
44
import os
5+
import shutil
6+
import subprocess
57

6-
repo = 'repo'
7-
os.mkdir(repo)
8-
os.cwd(repo)
8+
import util
9+
10+
util.init()
11+
12+
# Identity used for both author and committer of every generated commit.
# NOTE(review): `email` was referenced below but never assigned in this
# script, which raised NameError. The value here is a placeholder; confirm
# it against the address used by the other test-repo scripts.
email = 'a@example.com'
name = 'a'

os.environ['GIT_AUTHOR_EMAIL'] = email
os.environ['GIT_AUTHOR_NAME'] = name
os.environ['GIT_COMMITTER_EMAIL'] = email
os.environ['GIT_COMMITTER_NAME'] = name
918

1019
date0 = datetime.date(2000, 1, 1)
1120
datef = datetime.date(2100, 1, 1)
1221
date = date0
1322
while date < datef:
1423
s = date.strftime('%Y-%m-%dT01:00:00')
15-
cmd = 'GIT_COMMITTER_DATE="{0}" git commit --allow-empty --allow-empty-message -m "" --date="{0}"'.format(s)
16-
print(cmd)
17-
os.system(cmd)
24+
print(s)
25+
os.environ['GIT_AUTHOR_DATE'] = s
26+
os.environ['GIT_COMMITTER_DATE'] = s
27+
cmd = ['git', 'commit', '-q', '--allow-empty', '--allow-empty-message', '-m', '']
28+
subprocess.check_output(cmd)
1829
date += datetime.timedelta(days=1)

other-test-repos/util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import os
2+
import shutil
3+
import subprocess
4+
5+
def init():
6+
repo = 'repo.tmp'
7+
for d in (repo, 'clone.tmp'):
8+
shutil.rmtree(d, ignore_errors=True)
9+
os.mkdir(repo)
10+
os.chdir(repo)
11+
subprocess.check_output(['git', 'init', '-q'])

0 commit comments

Comments
 (0)