# > conda create -n myenv python=3.10
# > pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
# > pip install git+https://github.com/facebookresearch/segment-anything.git
- # > pip install git+https://github.com/pytorch-labs/ao.git
+ # > pip install git+https://github.com/pytorch/ao.git
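#
# To sanity-check the environment (a quick check, not part of the original
# instructions), print the installed versions:
#
# > python -c "import torch, torchao; print(torch.__version__, torchao.__version__)"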
#
# Segment Anything Model checkpoint setup:
#
#

import torch
- from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight
+ from torchao.quantization.quant_api import quantize_, Int8DynamicActivationInt8WeightConfig
from torchao.utils import unwrap_tensor_subclass, TORCH_VERSION_AT_LEAST_2_5
from segment_anything import sam_model_registry
from torch.utils.benchmark import Timer
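
# With these imports, the tutorial's overall recipe is: load the SAM image
# encoder, quantize it with ``quantize_``, compile it, and benchmark it with
# ``Timer``. A minimal sketch of that flow (the ``vit_h`` model type and the
# checkpoint filename are illustrative assumptions, not taken from this diff):
#
# .. code-block:: python
#
#    sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")  # assumed local checkpoint path
#    model = sam.image_encoder.eval().cuda().to(torch.bfloat16)
#    quantize_(model, Int8DynamicActivationInt8WeightConfig())
#    model_c = torch.compile(model, mode="max-autotune")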
@@ -143,7 +143,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
# for improvements.
#
# Next, let's apply quantization. Quantization for GPUs comes in three main forms
- # in `torchao <https://github.com/pytorch-labs/ao>`_ which is just native
+ # in `torchao <https://github.com/pytorch/ao>`_ which is just native
# pytorch+python code. This includes:
#
# * int8 dynamic quantization
@@ -157,9 +157,9 @@ def get_sam_model(only_one_block=False, batchsize=1):
# in memory bound situations where the benefit comes from loading less
# weight data, rather than doing less computation. The torchao APIs:
#
- # ``int8_dynamic_activation_int8_weight()``,
- # ``int8_weight_only()`` or
- # ``int4_weight_only()``
+ # ``Int8DynamicActivationInt8WeightConfig()``,
+ # ``Int8WeightOnlyConfig()`` or
+ # ``Int4WeightOnlyConfig()``
#
# can be used to easily apply the desired quantization technique and then
# once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is
@@ -171,7 +171,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
# ``apply_weight_only_int8_quant`` instead as a drop-in replacement for the two
# above (no replacement for int4).
#
- # The difference between the two APIs is that ``int8_dynamic_activation`` API
+ # The difference between the two APIs is that the ``Int8DynamicActivationInt8WeightConfig`` API
# alters the weight tensor of the linear module so instead of doing a
# normal linear, it does a quantized operation. This is helpful when you
# have non-standard linear ops that do more than one thing. The ``apply``
@@ -186,7 +186,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
model, image = get_sam_model(only_one_block, batchsize)
model = model.to(torch.bfloat16)
image = image.to(torch.bfloat16)
- quantize_(model, int8_dynamic_activation_int8_weight())
+ quantize_(model, Int8DynamicActivationInt8WeightConfig())
if not TORCH_VERSION_AT_LEAST_2_5:
    # needed for subclass + compile to work on older versions of pytorch
    unwrap_tensor_subclass(model)
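
# The tutorial goes on to compile and benchmark this model (those lines fall
# outside this hunk). A hedged sketch of such a measurement with the imported
# ``Timer`` (``model_c`` is a hypothetical name for the compiled model, not
# taken from this diff):
#
# .. code-block:: python
#
#    model_c = torch.compile(model, mode="max-autotune")
#    t = Timer(stmt="model_c(image)", globals={"model_c": model_c, "image": image})
#    print(t.timeit(50).median)  # seconds per call; Timer handles CUDA synchronization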
@@ -224,7 +224,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
model = model.to(torch.bfloat16)
image = image.to(torch.bfloat16)
torch._inductor.config.force_fuse_int_mm_with_mul = True
- quantize_(model, int8_dynamic_activation_int8_weight())
+ quantize_(model, Int8DynamicActivationInt8WeightConfig())
if not TORCH_VERSION_AT_LEAST_2_5:
    # needed for subclass + compile to work on older versions of pytorch
    unwrap_tensor_subclass(model)
@@ -258,7 +258,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.coordinate_descent_check_all_directions = True
torch._inductor.config.force_fuse_int_mm_with_mul = True
- quantize_(model, int8_dynamic_activation_int8_weight())
+ quantize_(model, Int8DynamicActivationInt8WeightConfig())
if not TORCH_VERSION_AT_LEAST_2_5:
    # needed for subclass + compile to work on older versions of pytorch
    unwrap_tensor_subclass(model)
@@ -290,7 +290,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
model, image = get_sam_model(False, batchsize)
model = model.to(torch.bfloat16)
image = image.to(torch.bfloat16)
- quantize_(model, int8_dynamic_activation_int8_weight())
+ quantize_(model, Int8DynamicActivationInt8WeightConfig())
if not TORCH_VERSION_AT_LEAST_2_5:
    # needed for subclass + compile to work on older versions of pytorch
    unwrap_tensor_subclass(model)
@@ -315,6 +315,6 @@ def get_sam_model(only_one_block=False, batchsize=1):
# the model. For example, this can be done with some form of flash attention.
#
# For more information visit
- # `torchao <https://github.com/pytorch-labs/ao>`_ and try it on your own
+ # `torchao <https://github.com/pytorch/ao>`_ and try it on your own
# models.
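#
# A minimal starting point for applying the same recipe to your own model
# (``MyModel`` is a placeholder, not part of the tutorial):
#
# .. code-block:: python
#
#    model = MyModel().eval().cuda().to(torch.bfloat16)
#    quantize_(model, Int8DynamicActivationInt8WeightConfig())
#    model = torch.compile(model, mode="max-autotune")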
#