``` py
input_shape = torch.ones((1, 3, 224, 224)).cuda()
self.classification_engine = torch2trt(resnet50, [input_shape],
                                       fp16_mode=self.fp16,
                                       max_batch_size=self.cls_trt_max_batchsize)
```
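
Once built, the torch2trt engine is called like the wrapped module. As a minimal usage sketch (variable names are illustrative; the input is assumed to already be resized and normalized):

``` py
# Hypothetical inference call: the torch2trt engine mirrors the module's API.
with torch.no_grad():
    batch = img_tensor.unsqueeze(0).cuda()      # (1, 3, 224, 224), normalized
    logits = self.classification_engine(batch)  # executes the TensorRT engine
    pred = logits.argmax(dim=1)                 # predicted class index
```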
``` py
config = torchpipe.parse_toml("resnet50.toml")
self.classification_engine = pipe(config)

self.classification_engine(bin_data)

if TASK_RESULT_KEY not in bin_data.keys():
    print("error decode")
    return results
```
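
For context, here is a minimal sketch of how `bin_data` could be assembled, assuming torchpipe's dictionary-based calling convention and its `TASK_DATA_KEY`/`TASK_RESULT_KEY` constants (the file path and entry node name are illustrative):

``` py
# A sketch: raw JPEG bytes go in under TASK_DATA_KEY; on success the pipeline
# fills TASK_RESULT_KEY in place, and leaves it absent on failure.
from torchpipe import TASK_DATA_KEY, TASK_RESULT_KEY

with open("test.jpg", "rb") as f:
    raw_jpg = f.read()

bin_data = {TASK_DATA_KEY: raw_jpg, "node_name": "cpu_decoder"}
self.classification_engine(bin_data)
result = bin_data.get(TASK_RESULT_KEY)  # None if decoding failed
```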
The contents of the toml file are as follows:

``` toml
# Scheduling parameters
batching_timeout = 5
instance_num = 8
precision = "fp16"

## Data decoding
#
# The original decoding output format was BGR, and the DecodeMat backend
# also defaults to BGR output. Since decoding is done on the CPU,
# DecodeMat is used.
# After each node completes, the name of the next node must be appended;
# otherwise the node is assumed to be the last one.
#
[cpu_decoder]
backend = "DecodeMat"
next = "cpu_posdecoder"

## Preprocessing: resize, cvtColorMat
# Note:
# The original preprocessing order was resize, cv2.COLOR_BGR2RGB, then
# Normalize. The normalization step is now folded into the model node
# ([resnet50]), so the output of this node matches the original
# preprocessing result without normalization.
# After each node completes, the name of the next node must be appended;
# otherwise the node is assumed to be the last one.
#
[cpu_posdecoder]
# (the resize/color-conversion parameters of this node are omitted here)
next = "resnet50"

#
# This corresponds to 3.1(3) TensorRT acceleration and 3.1(2) Normalize.
# Note:
# There is a slight difference from the original method of generating
# engines online: here, the model first needs to be converted to ONNX
# format.
#
# For the conversion method, see [Converting Torch to ONNX].
#
[resnet50]
backend = "SyncTensor[TensorrtTensor]"
min = 1
max = 4
instance_num = 4
model = "/your/model/path/resnet50.onnx"

mean = "123.675, 116.28, 103.53"  # 255 * "0.485, 0.456, 0.406"
std = "58.395, 57.120, 57.375"    # 255 * "0.229, 0.224, 0.225"

# TensorrtTensor parameter
"model::cache" = "/your/model/path/resnet50.trt"  # or resnet50.trt.encrypted
```
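
The `[resnet50]` node consumes an ONNX file rather than tracing the live model. Below is a minimal sketch of that export step, assuming a stock torchvision ResNet-50 and the illustrative path from the config above (the documented procedure is in [Converting Torch to ONNX]):

``` py
# A sketch: export ResNet-50 to ONNX with a dynamic batch axis so that
# TensorrtTensor can serve batch sizes between `min` and `max`.
import torch
import torchvision.models as models

model = models.resnet50(pretrained=True).eval()
dummy = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    model,
    dummy,
    "/your/model/path/resnet50.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=11,
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
)
```

On the first run, the serialized engine is cached at the `"model::cache"` path, so later startups can skip the TensorRT build.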
The specific test code can be found at [client_qps.py](https://github.com/torchpipe/torchpipe/blob/develop/examples/resnet50_thrift/client_qps.py).
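
In outline, such a test drives the service from several client threads at once and records per-request latency. Here is a minimal self-contained sketch of the concurrency-10 pattern, with `call_service` standing in for the real Thrift RPC (all names are illustrative, not the actual client code):

``` py
# A sketch: 10 concurrent workers, each timing its own requests.
import time
from concurrent.futures import ThreadPoolExecutor

def call_service(payload: bytes) -> None:
    time.sleep(0.009)  # stand-in for the real Thrift call

def timed_call(payload: bytes) -> float:
    start = time.perf_counter()
    call_service(payload)
    return (time.perf_counter() - start) * 1e3  # latency in ms

payloads = [b"..."] * 100  # raw JPEG bytes in the real test
with ThreadPoolExecutor(max_workers=10) as pool:
    latencies = list(pool.map(timed_call, payloads))
```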
With the same Thrift service interface, testing on a machine with an NVIDIA 3080 GPU and a 36-core CPU, at a concurrency of 10, we have the following results:

- throughput:
- response time (ms):

| Methods | TP50 | TP99 |
| :-: | :-: | :-: |
| Pure TensorRT | 26.74 | 35.24 |
| Using TorchPipe | 8.89 | 14.28 |
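
For reference, TP50/TP99 are the 50th and 99th percentiles of per-request latency. A minimal sketch of computing them from the latencies recorded by a client such as the one above:

``` py
# A sketch: percentile latency statistics from per-request timings (ms).
import numpy as np

def latency_percentiles(latencies_ms):
    arr = np.asarray(latencies_ms, dtype=float)
    return {"TP50": float(np.percentile(arr, 50)),
            "TP99": float(np.percentile(arr, 99))}

print(latency_percentiles([8.7, 9.1, 8.9, 14.0, 9.3]))
```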