Skip to content

Commit 42be0e1

Browse files
committed
1 parent c8d2186 commit 42be0e1

File tree

257 files changed

+20746
-6142
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

257 files changed

+20746
-6142
lines changed

.build/custom_pandoc_filter.py

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock
2+
import markdown
3+
import html
4+
5+
def to_markdown(item, skip_octicon=False):
    """Render one pandoc inline/block element as an HTML/markdown fragment.

    Handles Str, Space, Link, Code and CodeBlock elements; anything else
    renders as an empty string.  ``skip_octicon`` is threaded through
    recursive calls for link text.
    """
    kind = item['t']
    if kind == 'Str':
        return item['c']
    if kind == 'Space':
        return ' '
    if kind == 'Link':
        # item['c'] is [attrs, inline-children, [url, title]].
        children = item['c'][1]
        text = ''.join(to_markdown(child, skip_octicon) for child in children)
        return f'<a href="{item["c"][2][0]}">{text}</a>'
    if kind == 'Code':
        # Octicon icons don't render in .ipynb output, so drop them entirely.
        keyvals = item['c'][0][2]
        if any(v == 'octicon' for _, v in keyvals):
            return ''
        # Escape the code and wrap it in <code> tags.
        return f'<code>{html.escape(item["c"][1])}</code>'
    if kind == 'CodeBlock':
        # Escape the code block and wrap it in <pre><code> tags.
        return f'<pre><code>{html.escape(item["c"][1])}</code></pre>'
    return ''
27+
28+
29+
def process_admonitions(key, value, format, meta):
    """Replace admonition Divs (note/tip/warning) with styled raw HTML, and
    wrap embedded-video RawBlocks in a tagged Python code cell.

    Returns the replacement element(s) for handled inputs, or None to leave
    the element unchanged.
    """
    if key == 'Div':
        [[ident, classes, keyvals], contents] = value
        # Pick the banner color/label for the admonition type.
        if 'note' in classes:
            color = '#54c7ec'
            label = 'NOTE:'
        elif 'tip' in classes:
            color = '#6bcebb'
            label = 'TIP:'
        elif 'warning' in classes:
            color = '#e94f3b'
            label = 'WARNING:'
        else:
            return

        # Collect the admonition body as pandoc inline/block elements.
        note_content = []
        for block in contents:
            if block.get('t') == 'Para':
                for item in block['c']:
                    if item['t'] == 'Str':
                        note_content.append(Str(item['c']))
                    elif item['t'] == 'Space':
                        note_content.append(Space())
                    elif item['t'] == 'Link':
                        note_content.append(Link(*item['c']))
                    elif item['t'] == 'Code':
                        note_content.append(Code(*item['c']))
            elif block.get('t') == 'CodeBlock':
                note_content.append(CodeBlock(*block['c']))

        note_content_md = ''.join(to_markdown(item) for item in note_content)
        html_content = markdown.markdown(note_content_md)

        # Banner, body container, rendered content, closing tag.
        return [
            {'t': 'RawBlock', 'c': ['html', f'<div style="background-color: {color}; color: #fff; font-weight: 700; padding-left: 10px; padding-top: 5px; padding-bottom: 5px"><strong>{label}</strong></div>']},
            {'t': 'RawBlock', 'c': ['html', '<div style="background-color: #f3f4f7; padding-left: 10px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px">']},
            {'t': 'RawBlock', 'c': ['html', html_content]},
            {'t': 'RawBlock', 'c': ['html', '</div>']},
        ]
    elif key == 'RawBlock':
        # This is needed for the cells that have embedded video.
        # We add a special tag to those: ``` {python, .jupyter-code-cell}
        # The post-processing script then finds those and generates separate
        # code cells that can load video.
        # NOTE: unpacked as raw_format to avoid shadowing the `format` parameter.
        [raw_format, content] = value
        if raw_format == 'html' and 'iframe' in content:
            # Removed the unused `video_url = content.split('src="')[1]...`
            # extraction: its result was never used and it raised IndexError
            # for iframes without a src attribute.
            # Create the Python code to display the video.
            python_code = f"""
from IPython.display import display, HTML
html_code = \"""
{content}
\"""
display(HTML(html_code))
"""
            return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}
83+
84+
85+
def process_images(key, value, format, meta):
    """Rewrite relative image sources to absolute https://tutorials.pytorch.kr/
    URLs so images load correctly inside the generated notebooks."""
    if key != 'Image':
        return None
    attrs, caption, (src, title) = value
    if not src.startswith('http'):
        while src.startswith('../'):
            src = src[3:]  # strip leading "../" path components
        if src.startswith('/_static'):
            src = src[1:]  # drop the leading slash before prefixing
        src = 'https://tutorials.pytorch.kr/' + src
    return {'t': 'Image', 'c': [attrs, caption, [src, title]]}
99+
100+
101+
def process_grids(key, value, format, meta):
    """Render the two-card grid layout used by the tutorial template as raw
    HTML: cards are placed alternately in a left and a right floated column."""
    if key != 'Div':
        return
    [[ident, classes, keyvals], contents] = value
    if 'grid' not in classes:
        return
    columns = [
        '<div style="width: 45%; float: left; padding: 20px;">',
        '<div style="width: 45%; float: right; padding: 20px;">',
    ]
    column_num = 0
    for block in contents:
        is_card = ('t' in block and block['t'] == 'Div'
                   and 'grid-item-card' in block['c'][0][1])
        if not is_card:
            continue
        card_html = ''
        for item in block['c'][1]:
            if item['t'] == 'Para':
                heading = ''.join(to_markdown(i) for i in item['c'])
                card_html += f'<h2>{heading}</h2>'
            elif item['t'] == 'BulletList':
                card_html += '<ul>'
                for list_item in item['c']:
                    entry = ''.join(to_markdown(i) for i in list_item[0]['c'])
                    card_html += f'<li>{entry}</li>'
                card_html += '</ul>'
        columns[column_num] += card_html
        column_num = (column_num + 1) % 2  # alternate left/right
    columns = [column + '</div>' for column in columns]
    return {'t': 'RawBlock', 'c': ['html', ''.join(columns)]}
125+
126+
def is_code_block(item):
    """Return True if *item* is an inline Code element whose text contains
    the substring 'octicon'."""
    if item['t'] != 'Code':
        return False
    return 'octicon' in item['c'][1]
128+
129+
130+
def process_all(key, value, format, meta):
    """Run each transform in order and return the first non-None replacement
    (None means: leave the element unchanged)."""
    new_value = None
    for transform in (process_admonitions, process_images, process_grids):
        new_value = transform(key, value, format, meta)
        if new_value is not None:
            break
    return new_value
136+
137+
138+
if __name__ == "__main__":
    # Run the combined filter over the pandoc JSON AST on stdin/stdout.
    toJSONFilter(process_all)

.build/download_data.py

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
import hashlib
3+
import os
4+
5+
from typing import Optional
6+
from urllib.request import urlopen, Request
7+
from pathlib import Path
8+
from zipfile import ZipFile
9+
10+
REPO_BASE_DIR = Path(__file__).absolute().parent.parent
11+
DATA_DIR = REPO_BASE_DIR / "_data"
12+
BEGINNER_DATA_DIR = REPO_BASE_DIR / "beginner_source" / "data"
13+
INTERMEDIATE_DATA_DIR = REPO_BASE_DIR / "intermediate_source" / "data"
14+
ADVANCED_DATA_DIR = REPO_BASE_DIR / "advanced_source" / "data"
15+
PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "prototype_source" / "data"
16+
FILES_TO_RUN = os.getenv("FILES_TO_RUN")
17+
18+
19+
def size_fmt(nbytes: int) -> str:
    """Return *nbytes* formatted as a human-readable size string."""
    kib = 1024
    mib = 1024 * kib
    gib = 1024 * mib
    magnitude = abs(nbytes)
    if magnitude >= gib:
        return f"{nbytes / gib:.2f} Gb"
    if magnitude >= mib:
        return f"{nbytes / mib:.2f} Mb"
    if magnitude >= kib:
        return f"{nbytes / kib:.2f} Kb"
    return f"{nbytes} bytes"
31+
32+
33+
def download_url_to_file(url: str,
                         dst: Optional[str] = None,
                         prefix: Optional[Path] = None,
                         sha256: Optional[str] = None) -> Path:
    """Download *url* to *dst*, optionally verifying its sha256 checksum.

    Args:
        url: Source URL.
        dst: Target file name; defaults to the last path component of *url*.
        prefix: Directory to place *dst* in (joined with *dst* when given).
        sha256: Expected hex digest; on mismatch the file is deleted and
            RuntimeError is raised.

    Returns:
        Path to the downloaded (or already existing) file.

    Raises:
        RuntimeError: If *sha256* is given and does not match the download.
    """
    dst = dst if dst is not None else Path(url).name
    dst = dst if prefix is None else str(prefix / dst)
    if Path(dst).exists():
        print(f"Skip downloading {url} as {dst} already exists")
        return Path(dst)
    sha256_sum = hashlib.sha256()
    nbytes = 0
    # Some hosts reject requests without a User-Agent header.
    req = Request(url, headers={"User-Agent": "tutorials.downloader"})
    # Use context managers so the HTTP response and file are always closed
    # (the response was previously leaked).
    with urlopen(req) as u, open(dst, "wb") as f:
        while True:
            buffer = u.read(32768)
            if not buffer:
                break
            sha256_sum.update(buffer)
            f.write(buffer)
            nbytes += len(buffer)
    digest = sha256_sum.hexdigest()
    if sha256 is not None and sha256 != digest:
        # Remove the corrupt download so a retry starts clean.
        Path(dst).unlink()
        raise RuntimeError(f"Downloaded {url} has unexpected sha256sum {digest} should be {sha256}")
    # Report the byte count we actually wrote. The previous Content-Length
    # header parsing (including a dead Python-2 `getheaders` branch) left
    # file_size as None when the header was absent, crashing size_fmt(None).
    print(f"Downloaded {url} sha256sum={digest} size={size_fmt(nbytes)}")
    return Path(dst)
65+
66+
67+
def unzip(archive: Path, tgt_dir: Path) -> None:
    """Extract every member of *archive* into *tgt_dir*."""
    with ZipFile(str(archive), "r") as zf:
        zf.extractall(str(tgt_dir))
70+
71+
72+
def download_hymenoptera_data() -> None:
    """Fetch and unpack the transfer-learning tutorial dataset.

    Note: added the `-> None` return annotation for consistency with the
    other downloader functions in this file.
    """
    z = download_url_to_file(
        "https://download.pytorch.org/tutorial/hymenoptera_data.zip",
        prefix=DATA_DIR,
        sha256="fbc41b31d544714d18dd1230b1e2b455e1557766e13e67f9f5a7a23af7c02209",
    )
    unzip(z, BEGINNER_DATA_DIR)
79+
80+
81+
def download_nlp_data() -> None:
    """Fetch and unpack the NLP tutorials dataset."""
    archive = download_url_to_file(
        "https://download.pytorch.org/tutorial/data.zip",
        prefix=DATA_DIR,
        sha256="fb317e80248faeb62dc25ef3390ae24ca34b94e276bbc5141fd8862c2200bff5",
    )
    # Extracting into the parent unzips everything in data.zip to the
    # intermediate_source/data/ folder.
    unzip(archive, INTERMEDIATE_DATA_DIR.parent)
89+
90+
91+
def download_dcgan_data() -> None:
    """Fetch and unpack the dataset for beginner_source/dcgan_faces_tutorial.py."""
    archive = download_url_to_file(
        "https://s3.amazonaws.com/pytorch-tutorial-assets/img_align_celeba.zip",
        prefix=DATA_DIR,
        sha256="46fb89443c578308acf364d7d379fe1b9efb793042c0af734b6112e4fd3a8c74",
    )
    unzip(archive, BEGINNER_DATA_DIR / "celeba")
98+
99+
100+
def download_lenet_mnist() -> None:
    """Fetch the pretrained model used by beginner_source/fgsm_tutorial.py."""
    download_url_to_file(
        "https://docs.google.com/uc?export=download&id=1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl",
        prefix=BEGINNER_DATA_DIR,
        dst="lenet_mnist_model.pth",
        sha256="cb5f8e578aef96d5c1a2cc5695e1aa9bbf4d0fe00d25760eeebaaac6ebc2edcb",
    )
107+
108+
def download_gpu_quantization_torchao() -> None:
    """Fetch the SAM checkpoint for prototype_source/gpu_quantization_torchao_tutorial.py."""
    download_url_to_file(
        "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
        prefix=PROTOTYPE_DATA_DIR,
        dst="sam_vit_h_4b8939.pth",
        sha256="a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e",
    )
115+
116+
def main() -> None:
    """Create the data directories and download whatever the selected
    tutorials (FILES_TO_RUN) need; with no filter, download everything."""
    for directory in (DATA_DIR, BEGINNER_DATA_DIR, ADVANCED_DATA_DIR,
                      INTERMEDIATE_DATA_DIR, PROTOTYPE_DATA_DIR):
        directory.mkdir(exist_ok=True)

    def should_run(*tutorials: str) -> bool:
        # No FILES_TO_RUN filter means every dataset is fetched.
        return FILES_TO_RUN is None or any(t in FILES_TO_RUN for t in tutorials)

    if should_run("transfer_learning_tutorial"):
        download_hymenoptera_data()
    if should_run("seq2seq_translation_tutorial",
                  "char_rnn_classification_tutorial",
                  "char_rnn_generation_tutorial"):
        download_nlp_data()
    if should_run("dcgan_faces_tutorial"):
        download_dcgan_data()
    if should_run("fgsm_tutorial"):
        download_lenet_mnist()
    if should_run("gpu_quantization_torchao_tutorial"):
        download_gpu_quantization_torchao()
134+
135+
if __name__ == "__main__":
    # Script entry point: create data dirs and fetch the required datasets.
    main()

.build/get_files_to_run.py

+15-16
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
from pathlib import Path
5-
# from remove_runnable_code import remove_runnable_code
5+
from remove_runnable_code import remove_runnable_code
66

77

88
# Calculate repo base dir
@@ -11,7 +11,7 @@
1111

1212
def get_all_files() -> List[str]:
1313
sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts]
14-
return [str(x) for x in sources]
14+
return sorted([str(x) for x in sources])
1515

1616

1717
def read_metadata() -> Dict[str, Any]:
@@ -40,27 +40,26 @@ def add_to_shard(i, filename):
4040
)
4141

4242
all_other_files = all_files.copy()
43-
needs_gpu_nvidia_small_multi = list(
44-
filter(lambda x: get_needs_machine(x) == "gpu.nvidia.small.multi", all_files,)
43+
needs_multigpu = list(
44+
filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
4545
)
46-
needs_gpu_nvidia_medium = list(
47-
filter(lambda x: get_needs_machine(x) == "gpu.nvidia.large", all_files,)
46+
needs_a10g = list(
47+
filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
4848
)
49-
for filename in needs_gpu_nvidia_small_multi:
50-
# currently, the only job that uses gpu.nvidia.small.multi is the 0th worker,
49+
for filename in needs_multigpu:
50+
# currently, the only job that has multigpu is the 0th worker,
5151
# so we'll add all the jobs that need this machine to the 0th worker
5252
add_to_shard(0, filename)
5353
all_other_files.remove(filename)
54-
for filename in needs_gpu_nvidia_medium:
55-
# currently, the only job that uses gpu.nvidia.large is the 1st worker,
54+
for filename in needs_a10g:
55+
# currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
5656
# so we'll add all the jobs that need this machine to the 1st worker
5757
add_to_shard(1, filename)
5858
all_other_files.remove(filename)
59-
6059
sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
6160

6261
for filename in sorted_files:
63-
min_shard_index = sorted(range(num_shards), key=lambda i: sharded_files[i][0])[
62+
min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
6463
0
6564
]
6665
add_to_shard(min_shard_index, filename)
@@ -87,21 +86,21 @@ def parse_args() -> Any:
8786
from argparse import ArgumentParser
8887
parser = ArgumentParser("Select files to run")
8988
parser.add_argument("--dry-run", action="store_true")
90-
parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", 20)))
91-
parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", 0)))
89+
parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20")))
90+
parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1")))
9291
return parser.parse_args()
9392

9493

9594
def main() -> None:
9695
args = parse_args()
9796

9897
all_files = get_all_files()
99-
files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num]
98+
files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1]
10099
if not args.dry_run:
101100
remove_other_files(all_files, compute_files_to_keep(files_to_run))
102101
stripped_file_names = [Path(x).stem for x in files_to_run]
103102
print(" ".join(stripped_file_names))
104103

105104

106105
if __name__ == "__main__":
107-
main()
106+
main()

0 commit comments

Comments
 (0)