Skip to content

Commit 42be0e1

Browse files
committed
1 parent c8d2186 commit 42be0e1

File tree

257 files changed

+20746
-6142
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

257 files changed

+20746
-6142
lines changed

.build/custom_pandoc_filter.py

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock
2+
import markdown
3+
import html
4+
5+
def to_markdown(item, skip_octicon=False):
    """Render one pandoc inline/block element as an HTML/markdown fragment.

    Handles Str, Space, Link, Code and CodeBlock elements; anything else
    renders as an empty string.  ``skip_octicon`` is threaded through
    recursive calls for link text.
    """
    kind = item['t']
    if kind == 'Str':
        return item['c']
    if kind == 'Space':
        return ' '
    if kind == 'Link':
        # item['c'] is [attrs, inline-children, [url, title]].
        children = item['c'][1]
        text = ''.join(to_markdown(child, skip_octicon) for child in children)
        return f'<a href="{item["c"][2][0]}">{text}</a>'
    if kind == 'Code':
        # Octicon icons don't render in .ipynb output, so drop them entirely.
        keyvals = item['c'][0][2]
        if any(v == 'octicon' for _, v in keyvals):
            return ''
        # Escape the code and wrap it in <code> tags.
        return f'<code>{html.escape(item["c"][1])}</code>'
    if kind == 'CodeBlock':
        # Escape the code block and wrap it in <pre><code> tags.
        return f'<pre><code>{html.escape(item["c"][1])}</code></pre>'
    return ''
27+
28+
29+
def process_admonitions(key, value, format, meta):
    """Replace admonition Divs (note/tip/warning) with styled raw HTML, and
    wrap embedded-video RawBlocks in a tagged Python code cell.

    Returns the replacement element(s) for handled inputs, or None to leave
    the element unchanged.
    """
    if key == 'Div':
        [[ident, classes, keyvals], contents] = value
        # Pick the banner color/label for the admonition type.
        if 'note' in classes:
            color = '#54c7ec'
            label = 'NOTE:'
        elif 'tip' in classes:
            color = '#6bcebb'
            label = 'TIP:'
        elif 'warning' in classes:
            color = '#e94f3b'
            label = 'WARNING:'
        else:
            return

        # Collect the admonition body as pandoc inline/block elements.
        note_content = []
        for block in contents:
            if block.get('t') == 'Para':
                for item in block['c']:
                    if item['t'] == 'Str':
                        note_content.append(Str(item['c']))
                    elif item['t'] == 'Space':
                        note_content.append(Space())
                    elif item['t'] == 'Link':
                        note_content.append(Link(*item['c']))
                    elif item['t'] == 'Code':
                        note_content.append(Code(*item['c']))
            elif block.get('t') == 'CodeBlock':
                note_content.append(CodeBlock(*block['c']))

        note_content_md = ''.join(to_markdown(item) for item in note_content)
        html_content = markdown.markdown(note_content_md)

        # Banner, body container, rendered content, closing tag.
        return [
            {'t': 'RawBlock', 'c': ['html', f'<div style="background-color: {color}; color: #fff; font-weight: 700; padding-left: 10px; padding-top: 5px; padding-bottom: 5px"><strong>{label}</strong></div>']},
            {'t': 'RawBlock', 'c': ['html', '<div style="background-color: #f3f4f7; padding-left: 10px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px">']},
            {'t': 'RawBlock', 'c': ['html', html_content]},
            {'t': 'RawBlock', 'c': ['html', '</div>']},
        ]
    elif key == 'RawBlock':
        # This is needed for the cells that have embedded video.
        # We add a special tag to those: ``` {python, .jupyter-code-cell}
        # The post-processing script then finds those and generates separate
        # code cells that can load video.
        # NOTE: unpacked as raw_format to avoid shadowing the `format` parameter.
        [raw_format, content] = value
        if raw_format == 'html' and 'iframe' in content:
            # Removed the unused `video_url = content.split('src="')[1]...`
            # extraction: its result was never used and it raised IndexError
            # for iframes without a src attribute.
            # Create the Python code to display the video.
            python_code = f"""
from IPython.display import display, HTML
html_code = \"""
{content}
\"""
display(HTML(html_code))
"""
            return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}
83+
84+
85+
def process_images(key, value, format, meta):
    """Rewrite relative image sources to absolute https://tutorials.pytorch.kr/
    URLs so images load correctly inside the generated notebooks."""
    if key != 'Image':
        return None
    attrs, caption, (src, title) = value
    if not src.startswith('http'):
        while src.startswith('../'):
            src = src[3:]  # strip leading "../" path components
        if src.startswith('/_static'):
            src = src[1:]  # drop the leading slash before prefixing
        src = 'https://tutorials.pytorch.kr/' + src
    return {'t': 'Image', 'c': [attrs, caption, [src, title]]}
99+
100+
101+
def process_grids(key, value, format, meta):
    """Render the two-card grid layout used by the tutorial template as raw
    HTML: cards are placed alternately in a left and a right floated column."""
    if key != 'Div':
        return
    [[ident, classes, keyvals], contents] = value
    if 'grid' not in classes:
        return
    columns = [
        '<div style="width: 45%; float: left; padding: 20px;">',
        '<div style="width: 45%; float: right; padding: 20px;">',
    ]
    column_num = 0
    for block in contents:
        is_card = ('t' in block and block['t'] == 'Div'
                   and 'grid-item-card' in block['c'][0][1])
        if not is_card:
            continue
        card_html = ''
        for item in block['c'][1]:
            if item['t'] == 'Para':
                heading = ''.join(to_markdown(i) for i in item['c'])
                card_html += f'<h2>{heading}</h2>'
            elif item['t'] == 'BulletList':
                card_html += '<ul>'
                for list_item in item['c']:
                    entry = ''.join(to_markdown(i) for i in list_item[0]['c'])
                    card_html += f'<li>{entry}</li>'
                card_html += '</ul>'
        columns[column_num] += card_html
        column_num = (column_num + 1) % 2  # alternate left/right
    columns = [column + '</div>' for column in columns]
    return {'t': 'RawBlock', 'c': ['html', ''.join(columns)]}
125+
126+
def is_code_block(item):
    """Return True if *item* is an inline Code element whose text contains
    the substring 'octicon'."""
    if item['t'] != 'Code':
        return False
    return 'octicon' in item['c'][1]
128+
129+
130+
def process_all(key, value, format, meta):
    """Run each transform in order and return the first non-None replacement
    (None means: leave the element unchanged)."""
    new_value = None
    for transform in (process_admonitions, process_images, process_grids):
        new_value = transform(key, value, format, meta)
        if new_value is not None:
            break
    return new_value
136+
137+
138+
if __name__ == "__main__":
    # Run the combined filter over the pandoc JSON AST on stdin/stdout.
    toJSONFilter(process_all)

.build/download_data.py

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
import hashlib
3+
import os
4+
5+
from typing import Optional
6+
from urllib.request import urlopen, Request
7+
from pathlib import Path
8+
from zipfile import ZipFile
9+
10+
REPO_BASE_DIR = Path(__file__).absolute().parent.parent
11+
DATA_DIR = REPO_BASE_DIR / "_data"
12+
BEGINNER_DATA_DIR = REPO_BASE_DIR / "beginner_source" / "data"
13+
INTERMEDIATE_DATA_DIR = REPO_BASE_DIR / "intermediate_source" / "data"
14+
ADVANCED_DATA_DIR = REPO_BASE_DIR / "advanced_source" / "data"
15+
PROTOTYPE_DATA_DIR = REPO_BASE_DIR / "prototype_source" / "data"
16+
FILES_TO_RUN = os.getenv("FILES_TO_RUN")
17+
18+
19+
def size_fmt(nbytes: int) -> str:
    """Return *nbytes* formatted as a human-readable size string."""
    kib = 1024
    mib = 1024 * kib
    gib = 1024 * mib
    magnitude = abs(nbytes)
    if magnitude >= gib:
        return f"{nbytes / gib:.2f} Gb"
    if magnitude >= mib:
        return f"{nbytes / mib:.2f} Mb"
    if magnitude >= kib:
        return f"{nbytes / kib:.2f} Kb"
    return f"{nbytes} bytes"
31+
32+
33+
def download_url_to_file(url: str,
                         dst: Optional[str] = None,
                         prefix: Optional[Path] = None,
                         sha256: Optional[str] = None) -> Path:
    """Download *url* to *dst*, optionally verifying its sha256 checksum.

    Args:
        url: Source URL.
        dst: Target file name; defaults to the last path component of *url*.
        prefix: Directory to place *dst* in (joined with *dst* when given).
        sha256: Expected hex digest; on mismatch the file is deleted and
            RuntimeError is raised.

    Returns:
        Path to the downloaded (or already existing) file.

    Raises:
        RuntimeError: If *sha256* is given and does not match the download.
    """
    dst = dst if dst is not None else Path(url).name
    dst = dst if prefix is None else str(prefix / dst)
    if Path(dst).exists():
        print(f"Skip downloading {url} as {dst} already exists")
        return Path(dst)
    sha256_sum = hashlib.sha256()
    nbytes = 0
    # Some hosts reject requests without a User-Agent header.
    req = Request(url, headers={"User-Agent": "tutorials.downloader"})
    # Use context managers so the HTTP response and file are always closed
    # (the response was previously leaked).
    with urlopen(req) as u, open(dst, "wb") as f:
        while True:
            buffer = u.read(32768)
            if not buffer:
                break
            sha256_sum.update(buffer)
            f.write(buffer)
            nbytes += len(buffer)
    digest = sha256_sum.hexdigest()
    if sha256 is not None and sha256 != digest:
        # Remove the corrupt download so a retry starts clean.
        Path(dst).unlink()
        raise RuntimeError(f"Downloaded {url} has unexpected sha256sum {digest} should be {sha256}")
    # Report the byte count we actually wrote. The previous Content-Length
    # header parsing (including a dead Python-2 `getheaders` branch) left
    # file_size as None when the header was absent, crashing size_fmt(None).
    print(f"Downloaded {url} sha256sum={digest} size={size_fmt(nbytes)}")
    return Path(dst)
65+
66+
67+
def unzip(archive: Path, tgt_dir: Path) -> None:
    """Extract every member of *archive* into *tgt_dir*."""
    with ZipFile(str(archive), "r") as zf:
        zf.extractall(str(tgt_dir))
70+
71+
72+
def download_hymenoptera_data() -> None:
    """Fetch and unpack the transfer-learning tutorial dataset.

    Note: added the `-> None` return annotation for consistency with the
    other downloader functions in this file.
    """
    z = download_url_to_file(
        "https://download.pytorch.org/tutorial/hymenoptera_data.zip",
        prefix=DATA_DIR,
        sha256="fbc41b31d544714d18dd1230b1e2b455e1557766e13e67f9f5a7a23af7c02209",
    )
    unzip(z, BEGINNER_DATA_DIR)
79+
80+
81+
def download_nlp_data() -> None:
    """Fetch and unpack the NLP tutorials dataset."""
    archive = download_url_to_file(
        "https://download.pytorch.org/tutorial/data.zip",
        prefix=DATA_DIR,
        sha256="fb317e80248faeb62dc25ef3390ae24ca34b94e276bbc5141fd8862c2200bff5",
    )
    # Extracting into the parent unzips everything in data.zip to the
    # intermediate_source/data/ folder.
    unzip(archive, INTERMEDIATE_DATA_DIR.parent)
89+
90+
91+
def download_dcgan_data() -> None:
    """Fetch and unpack the dataset for beginner_source/dcgan_faces_tutorial.py."""
    archive = download_url_to_file(
        "https://s3.amazonaws.com/pytorch-tutorial-assets/img_align_celeba.zip",
        prefix=DATA_DIR,
        sha256="46fb89443c578308acf364d7d379fe1b9efb793042c0af734b6112e4fd3a8c74",
    )
    unzip(archive, BEGINNER_DATA_DIR / "celeba")
98+
99+
100+
def download_lenet_mnist() -> None:
    """Fetch the pretrained model used by beginner_source/fgsm_tutorial.py."""
    download_url_to_file(
        "https://docs.google.com/uc?export=download&id=1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl",
        prefix=BEGINNER_DATA_DIR,
        dst="lenet_mnist_model.pth",
        sha256="cb5f8e578aef96d5c1a2cc5695e1aa9bbf4d0fe00d25760eeebaaac6ebc2edcb",
    )
107+
108+
def download_gpu_quantization_torchao() -> None:
    """Fetch the SAM checkpoint for prototype_source/gpu_quantization_torchao_tutorial.py."""
    download_url_to_file(
        "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
        prefix=PROTOTYPE_DATA_DIR,
        dst="sam_vit_h_4b8939.pth",
        sha256="a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e",
    )
115+
116+
def main() -> None:
    """Create the data directories and download whatever the selected
    tutorials (FILES_TO_RUN) need; with no filter, download everything."""
    for directory in (DATA_DIR, BEGINNER_DATA_DIR, ADVANCED_DATA_DIR,
                      INTERMEDIATE_DATA_DIR, PROTOTYPE_DATA_DIR):
        directory.mkdir(exist_ok=True)

    def should_run(*tutorials: str) -> bool:
        # No FILES_TO_RUN filter means every dataset is fetched.
        return FILES_TO_RUN is None or any(t in FILES_TO_RUN for t in tutorials)

    if should_run("transfer_learning_tutorial"):
        download_hymenoptera_data()
    if should_run("seq2seq_translation_tutorial",
                  "char_rnn_classification_tutorial",
                  "char_rnn_generation_tutorial"):
        download_nlp_data()
    if should_run("dcgan_faces_tutorial"):
        download_dcgan_data()
    if should_run("fgsm_tutorial"):
        download_lenet_mnist()
    if should_run("gpu_quantization_torchao_tutorial"):
        download_gpu_quantization_torchao()
134+
135+
if __name__ == "__main__":
    # Script entry point: create data dirs and fetch the required datasets.
    main()

.build/get_files_to_run.py

+15-16
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
from pathlib import Path
5-
# from remove_runnable_code import remove_runnable_code
5+
from remove_runnable_code import remove_runnable_code
66

77

88
# Calculate repo base dir
@@ -11,7 +11,7 @@
1111

1212
def get_all_files() -> List[str]:
1313
sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts]
14-
return [str(x) for x in sources]
14+
return sorted([str(x) for x in sources])
1515

1616

1717
def read_metadata() -> Dict[str, Any]:
@@ -40,27 +40,26 @@ def add_to_shard(i, filename):
4040
)
4141

4242
all_other_files = all_files.copy()
43-
needs_gpu_nvidia_small_multi = list(
44-
filter(lambda x: get_needs_machine(x) == "gpu.nvidia.small.multi", all_files,)
43+
needs_multigpu = list(
44+
filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
4545
)
46-
needs_gpu_nvidia_medium = list(
47-
filter(lambda x: get_needs_machine(x) == "gpu.nvidia.large", all_files,)
46+
needs_a10g = list(
47+
filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
4848
)
49-
for filename in needs_gpu_nvidia_small_multi:
50-
# currently, the only job that uses gpu.nvidia.small.multi is the 0th worker,
49+
for filename in needs_multigpu:
50+
# currently, the only job that has multigpu is the 0th worker,
5151
# so we'll add all the jobs that need this machine to the 0th worker
5252
add_to_shard(0, filename)
5353
all_other_files.remove(filename)
54-
for filename in needs_gpu_nvidia_medium:
55-
# currently, the only job that uses gpu.nvidia.large is the 1st worker,
54+
for filename in needs_a10g:
55+
# currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
5656
# so we'll add all the jobs that need this machine to the 1st worker
5757
add_to_shard(1, filename)
5858
all_other_files.remove(filename)
59-
6059
sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
6160

6261
for filename in sorted_files:
63-
min_shard_index = sorted(range(num_shards), key=lambda i: sharded_files[i][0])[
62+
min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
6463
0
6564
]
6665
add_to_shard(min_shard_index, filename)
@@ -87,21 +86,21 @@ def parse_args() -> Any:
8786
from argparse import ArgumentParser
8887
parser = ArgumentParser("Select files to run")
8988
parser.add_argument("--dry-run", action="store_true")
90-
parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", 20)))
91-
parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", 0)))
89+
parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20")))
90+
parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1")))
9291
return parser.parse_args()
9392

9493

9594
def main() -> None:
9695
args = parse_args()
9796

9897
all_files = get_all_files()
99-
files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num]
98+
files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1]
10099
if not args.dry_run:
101100
remove_other_files(all_files, compute_files_to_keep(files_to_run))
102101
stripped_file_names = [Path(x).stem for x in files_to_run]
103102
print(" ".join(stripped_file_names))
104103

105104

106105
if __name__ == "__main__":
107-
main()
106+
main()

0 commit comments

Comments
 (0)