tensorRT_Pro
The first version has been finalized.

Read this in other languages: English, 简体中文.

C++ interface: 3 lines of code to run YOLOX
```cpp
// create inference engine on gpu-0
// auto engine = Yolo::create_infer("yolov5m.fp32.trtmodel", Yolo::Type::V5, 0);
auto engine = Yolo::create_infer("yolox_m.fp32.trtmodel", Yolo::Type::X, 0);

// load image
auto image = cv::imread("1.jpg");

// do inference and get the result
auto box = engine->commit(image).get();  // return vector<Box>
```
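To visualize the result, a minimal sketch (assuming the `Box` fields `left`, `top`, `right`, `bottom`, and `class_label` that the Python demo below uses):

```cpp
// Draw each returned Box onto the image. The field names mirror the
// Python demo below; treat this as an illustrative sketch, not repo code.
for (auto& b : box) {
    cv::rectangle(image,
                  cv::Point((int)b.left, (int)b.top),
                  cv::Point((int)b.right, (int)b.bottom),
                  cv::Scalar(0, 255, 0), 2);
}
cv::imwrite("1.draw.jpg", image);
```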
Python interface:

```python
import pytrt as tp
from torchvision import models  # torchvision provides the example resnet18

model = models.resnet18(True).eval().to(device)  # device: your torch device
trt_model = tp.from_torch(model, input)          # input: an example input tensor
trt_out = trt_model(input)
```

```python
import os
import cv2
import numpy as np
import pytrt as tp
engine_file = "yolov5s.fp32.trtmodel"
if not os.path.exists(engine_file):
    tp.compile_onnx_to_file(1, tp.onnx_hub("yolov5s"), engine_file)

yolo = tp.Yolo(engine_file, type=tp.YoloType.V5)
image = cv2.imread("car.jpg")
bboxes = yolo.commit(image).get()
print(f"{len(bboxes)} objects")

for box in bboxes:
    left, top, right, bottom = map(int, [box.left, box.top, box.right, box.bottom])
    cv2.rectangle(image, (left, top), (right, bottom), tp.random_color(box.class_label), 5)

saveto = "yolov5.car.jpg"
print(f"Save to {saveto}")
cv2.imwrite(saveto, image)
cv2.imshow("result", image)
cv2.waitKey()
```

Performance (elapsed time is the average latency per image in ms; FPS = 1000 / elapsed time):

| Model | Resolution | Type | Precision | Elapsed Time (ms) | FPS |
|---|---|---|---|---|---|
| yolox_x | 640x640 | YOLOX | FP32 | 21.879 | 45.71 |
| yolox_l | 640x640 | YOLOX | FP32 | 12.308 | 81.25 |
| yolox_m | 640x640 | YOLOX | FP32 | 6.862 | 145.72 |
| yolox_s | 640x640 | YOLOX | FP32 | 3.088 | 323.81 |
| yolox_x | 640x640 | YOLOX | FP16 | 6.763 | 147.86 |
| yolox_l | 640x640 | YOLOX | FP16 | 3.933 | 254.25 |
| yolox_m | 640x640 | YOLOX | FP16 | 2.515 | 397.55 |
| yolox_s | 640x640 | YOLOX | FP16 | 1.362 | 734.48 |
| yolox_x | 640x640 | YOLOX | INT8 | 4.070 | 245.68 |
| yolox_l | 640x640 | YOLOX | INT8 | 2.444 | 409.21 |
| yolox_m | 640x640 | YOLOX | INT8 | 1.730 | 577.98 |
| yolox_s | 640x640 | YOLOX | INT8 | 1.060 | 943.15 |
| yolov5x6 | 1280x1280 | yolov5_p6 | FP32 | 68.022 | 14.70 |
| yolov5l6 | 1280x1280 | yolov5_p6 | FP32 | 37.931 | 26.36 |
| yolov5m6 | 1280x1280 | yolov5_p6 | FP32 | 20.127 | 49.69 |
| yolov5s6 | 1280x1280 | yolov5_p6 | FP32 | 8.715 | 114.75 |
| yolov5x | 640x640 | yolov5_p5 | FP32 | 18.480 | 54.11 |
| yolov5l | 640x640 | yolov5_p5 | FP32 | 10.110 | 98.91 |
| yolov5m | 640x640 | yolov5_p5 | FP32 | 5.639 | 177.33 |
| yolov5s | 640x640 | yolov5_p5 | FP32 | 2.578 | 387.92 |
| yolov5x6 | 1280x1280 | yolov5_p6 | FP16 | 20.877 | 47.90 |
| yolov5l6 | 1280x1280 | yolov5_p6 | FP16 | 10.960 | 91.24 |
| yolov5m6 | 1280x1280 | yolov5_p6 | FP16 | 7.236 | 138.20 |
| yolov5s6 | 1280x1280 | yolov5_p6 | FP16 | 3.851 | 259.68 |
| yolov5x | 640x640 | yolov5_p5 | FP16 | 5.933 | 168.55 |
| yolov5l | 640x640 | yolov5_p5 | FP16 | 3.450 | 289.86 |
| yolov5m | 640x640 | yolov5_p5 | FP16 | 2.184 | 457.90 |
| yolov5s | 640x640 | yolov5_p5 | FP16 | 1.307 | 765.10 |
| yolov5x6 | 1280x1280 | yolov5_p6 | INT8 | 12.207 | 81.92 |
| yolov5l6 | 1280x1280 | yolov5_p6 | INT8 | 7.221 | 138.49 |
| yolov5m6 | 1280x1280 | yolov5_p6 | INT8 | 5.248 | 190.55 |
| yolov5s6 | 1280x1280 | yolov5_p6 | INT8 | 3.149 | 317.54 |
| yolov5x | 640x640 | yolov5_p5 | INT8 | 3.704 | 269.97 |
| yolov5l | 640x640 | yolov5_p5 | INT8 | 2.255 | 443.53 |
| yolov5m | 640x640 | yolov5_p5 | INT8 | 1.674 | 597.40 |
| yolov5s | 640x640 | yolov5_p5 | INT8 | 1.143 | 874.91 |
| Model | Resolution | Type | Precision | Elapsed Time (ms) | FPS |
|---|---|---|---|---|---|
| yolox_x_fast | 640x640 | YOLOX | FP32 | 21.598 | 46.30 |
| yolox_l_fast | 640x640 | YOLOX | FP32 | 12.199 | 81.97 |
| yolox_m_fast | 640x640 | YOLOX | FP32 | 6.819 | 146.65 |
| yolox_s_fast | 640x640 | YOLOX | FP32 | 2.979 | 335.73 |
| yolox_x_fast | 640x640 | YOLOX | FP16 | 6.764 | 147.84 |
| yolox_l_fast | 640x640 | YOLOX | FP16 | 3.866 | 258.64 |
| yolox_m_fast | 640x640 | YOLOX | FP16 | 2.386 | 419.16 |
| yolox_s_fast | 640x640 | YOLOX | FP16 | 1.259 | 794.36 |
| yolox_x_fast | 640x640 | YOLOX | INT8 | 3.918 | 255.26 |
| yolox_l_fast | 640x640 | YOLOX | INT8 | 2.292 | 436.38 |
| yolox_m_fast | 640x640 | YOLOX | INT8 | 1.589 | 629.49 |
| yolox_s_fast | 640x640 | YOLOX | INT8 | 0.954 | 1048.47 |
| yolov5x6_fast | 1280x1280 | yolov5_p6 | FP32 | 67.075 | 14.91 |
| yolov5l6_fast | 1280x1280 | yolov5_p6 | FP32 | 37.491 | 26.67 |
| yolov5m6_fast | 1280x1280 | yolov5_p6 | FP32 | 19.422 | 51.49 |
| yolov5s6_fast | 1280x1280 | yolov5_p6 | FP32 | 7.900 | 126.57 |
| yolov5x_fast | 640x640 | yolov5_p5 | FP32 | 18.554 | 53.90 |
| yolov5l_fast | 640x640 | yolov5_p5 | FP32 | 10.060 | 99.41 |
| yolov5m_fast | 640x640 | yolov5_p5 | FP32 | 5.500 | 181.82 |
| yolov5s_fast | 640x640 | yolov5_p5 | FP32 | 2.342 | 427.07 |
| yolov5x6_fast | 1280x1280 | yolov5_p6 | FP16 | 20.538 | 48.69 |
| yolov5l6_fast | 1280x1280 | yolov5_p6 | FP16 | 10.404 | 96.12 |
| yolov5m6_fast | 1280x1280 | yolov5_p6 | FP16 | 6.577 | 152.06 |
| yolov5s6_fast | 1280x1280 | yolov5_p6 | FP16 | 3.087 | 323.99 |
| yolov5x_fast | 640x640 | yolov5_p5 | FP16 | 5.919 | 168.95 |
| yolov5l_fast | 640x640 | yolov5_p5 | FP16 | 3.348 | 298.69 |
| yolov5m_fast | 640x640 | yolov5_p5 | FP16 | 2.015 | 496.34 |
| yolov5s_fast | 640x640 | yolov5_p5 | FP16 | 1.087 | 919.63 |
| yolov5x6_fast | 1280x1280 | yolov5_p6 | INT8 | 11.236 | 89.00 |
| yolov5l6_fast | 1280x1280 | yolov5_p6 | INT8 | 6.235 | 160.38 |
| yolov5m6_fast | 1280x1280 | yolov5_p6 | INT8 | 4.311 | 231.97 |
| yolov5s6_fast | 1280x1280 | yolov5_p6 | INT8 | 2.139 | 467.45 |
| yolov5x_fast | 640x640 | yolov5_p5 | INT8 | 3.456 | 289.37 |
| yolov5l_fast | 640x640 | yolov5_p5 | INT8 | 2.019 | 495.41 |
| yolov5m_fast | 640x640 | yolov5_p5 | INT8 | 1.425 | 701.71 |
| yolov5s_fast | 640x640 | yolov5_p5 | INT8 | 0.844 | 1185.47 |
Build for Linux:

1. Set your GPU compute capability in the build configuration, e.g. `-gencode=arch=compute_75,code=sm_75`. If you are using a 3080Ti, it should be `-gencode=arch=compute_86,code=sm_86`.
2. Build with CMake:

```bash
mkdir build && cd build
cmake ..
make yolo -j8
```

3. Or build with the Makefile directly:

```bash
make yolo -j8
```

4. For the Python interface, set `use_python := true` in the Makefile (or `set(HAS_PYTHON ON)` for CMake), then run `make pyinstall -j8`. The compiled module is written to `python/pytrt/libpytrtc.so`. Check lean/readme.md for the detailed dependencies.
Build for Windows (Visual Studio):

1. In tensorrt.vcxproj, replace `<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.0.props" />` with your own CUDA path.
2. In tensorrt.vcxproj, replace `<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.0.targets" />` with your own CUDA path.
3. In tensorrt.vcxproj, replace `<CodeGeneration>compute_61,sm_61</CodeGeneration>` with your compute capability.
4. Configure your dependencies, or download them to the folder /lean, and configure the VC++ directories (Include Dir and Reference).
5. Configure your environment: Debug -> Environment.
6. Compile and run the example; 3 options are available.
On Windows, the compiled Python module is `python/pytrt/libpytrtc.pyd`.

Compile the protobuf pb files for the onnx parser:

```bash
protoc=/data/sxai/lean/protobuf3.11.4/bin/protoc

# cd the path in terminal to /onnx
cd onnx

# execute the command to make pb files
bash make_pb.sh
```

For CMake, set `set(PROTOBUF_DIR "/data/sxai/lean/protobuf3.11.4")` in CMakeLists.txt, using the same protoc path as above, then:

```bash
mkdir build && cd build
cmake ..
make yolo -j64
```

For the Makefile, set `lean_protobuf := /data/sxai/lean/protobuf3.11.4` with the same protoc path, then:

```bash
make yolo -j64
```

To switch the onnx parser to TensorRT 7.x:

```bash
bash onnx_parser/use_tensorrt_7.x.sh
make yolo -j64
```

To switch to TensorRT 8.x:

```bash
bash onnx_parser/use_tensorrt_8.x.sh
make yolo -j64
```

YOLOv5 support:

```bash
git clone git@github.com:ultralytics/yolov5.git
```

Modify the code for dynamic batch:

```python
# line 55 forward function in yolov5/models/yolo.py
# bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
# x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# modified into:
bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
bs = -1
ny = int(ny)
nx = int(nx)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

# line 70 in yolov5/models/yolo.py
# z.append(y.view(bs, -1, self.no))
# modified into:
z.append(y.view(bs, self.na * ny * nx, self.no))

############# for yolov5-6.0 #####################
# line 65 in yolov5/models/yolo.py
# if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
#     self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
# modified into:
if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

# disconnect for pytorch trace
anchor_grid = (self.anchors[i].clone() * self.stride[i]).view(1, -1, 1, 1, 2)

# line 70 in yolov5/models/yolo.py
# y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# modified into:
y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh

# line 73 in yolov5/models/yolo.py
# wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# modified into:
wh = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh
############# for yolov5-6.0 #####################

# line 52 in yolov5/export.py
# torch.onnx.export(dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # shape(1,3,640,640)
#                                 'output': {0: 'batch', 1: 'anchors'}  # shape(1,25200,85)
# modified into:
torch.onnx.export(dynamic_axes={'images': {0: 'batch'},  # shape(1,3,640,640)
                                'output': {0: 'batch'}  # shape(1,25200,85)
```

Export the onnx:

```bash
cd yolov5
python export.py --weights=yolov5s.pt --dynamic --include=onnx --opset=11
```

Copy it to the workspace and compile:

```bash
cp yolov5/yolov5s.onnx tensorRT_cpp/workspace/
cd tensorRT_cpp
make yolo -j32
```

YOLOv7 support. Download the model:

```bash
# from cdn
# or wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
wget https://cdn.githubjs.cf/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
```

Clone the repo:

```bash
git clone git@github.com:WongKinYiu/yolov7.git
```

Modify the code:

```python
# line 45 forward function in yolov7/models/yolo.py
# bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
# x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# modified into:
bs, _, ny, nx = map(int, x[i].shape)  # x(bs,255,20,20) to x(bs,3,20,20,85)
bs = -1
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

# line 52 in yolov7/models/yolo.py
# y = x[i].sigmoid()
# y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
# y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# z.append(y.view(bs, -1, self.no))
# modified into:
y = x[i].sigmoid()
xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, -1, 1, 1, 2)  # wh
classif = y[..., 4:]
y = torch.cat([xy, wh, classif], dim=-1)
z.append(y.view(bs, self.na * ny * nx, self.no))

# line 57 in yolov7/models/yolo.py
# return x if self.training else (torch.cat(z, 1), x)
# modified into:
return x if self.training else torch.cat(z, 1)

# line 52 in yolov7/models/export.py
# output_names=['classes', 'boxes'] if y is None else ['output'],
# dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # size(1,3,640,640)
#               'output': {0: 'batch', 2: 'y', 3: 'x'}} if opt.dynamic else None)
# modified into:
output_names=['classes', 'boxes'] if y is None else ['output'],
dynamic_axes={'images': {0: 'batch'},  # size(1,3,640,640)
              'output': {0: 'batch'}} if opt.dynamic else None)
```

Export the onnx:

```bash
cd yolov7
python models/export.py --dynamic --grid --weight=yolov7.pt
```

Copy it to the workspace and compile:

```bash
cp yolov7/yolov7.onnx tensorRT_cpp/workspace/
cd tensorRT_cpp
make yolo -j32
```

YOLOX support:

```bash
git clone git@github.com:Megvii-BaseDetection/YOLOX.git
cd YOLOX
```

Modify the code. This step is required for a successful INT8 compilation; otherwise the error `Missing scale and zero-point for tensor (Unnamed Layer* 686)` may be raised:

```python
# line 206 forward function in yolox/models/yolo_head.py. Replace the commented code with the uncommented code
# self.hw = [x.shape[-2:] for x in outputs]
self.hw = [list(map(int, x.shape[-2:])) for x in outputs]

# line 208 forward function in yolox/models/yolo_head.py. Replace the commented code with the uncommented code
# [batch, n_anchors_all, 85]
# outputs = torch.cat(
#     [x.flatten(start_dim=2) for x in outputs], dim=2
# ).permute(0, 2, 1)
proc_view = lambda x: x.view(-1, int(x.size(1)), int(x.size(2) * x.size(3)))
outputs = torch.cat(
    [proc_view(x) for x in outputs], dim=2
).permute(0, 2, 1)

# line 253 decode_output function in yolox/models/yolo_head.py. Replace the commented code with the uncommented code
# outputs[..., :2] = (outputs[..., :2] + grids) * strides
# outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
# return outputs
xy = (outputs[..., :2] + grids) * strides
wh = torch.exp(outputs[..., 2:4]) * strides
return torch.cat((xy, wh, outputs[..., 4:]), dim=-1)

# line 77 in tools/export_onnx.py
model.head.decode_in_inference = True
```

Download and export the model:

```bash
# download model
wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_m.pth

# export
export PYTHONPATH=$PYTHONPATH:.
python tools/export_onnx.py -c yolox_m.pth -f exps/default/yolox_m.py --output-name=yolox_m.onnx --dynamic --no-onnxsim
```

Copy the onnx to the workspace and compile:

```bash
cp YOLOX/yolox_m.onnx tensorRT_cpp/workspace/
cd tensorRT_cpp
make yolo -j32
```

YOLOv3 support:

```bash
git clone git@github.com:ultralytics/yolov3.git
```

Modify the code:

```python
# line 55 forward function in yolov3/models/yolo.py
# bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
# x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# modified into:
bs, _, ny, nx = map(int, x[i].shape)  # x(bs,255,20,20) to x(bs,3,20,20,85)
bs = -1
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

# line 70 in yolov3/models/yolo.py
# z.append(y.view(bs, -1, self.no))
# modified into:
z.append(y.view(bs, self.na * ny * nx, self.no))

# line 62 in yolov3/models/yolo.py
# if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
#     self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
# modified into:
if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

# disconnect for pytorch trace
anchor_grid = (self.anchors[i].clone() * self.stride[i]).view(1, -1, 1, 1, 2)

# line 70 in yolov3/models/yolo.py
# y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# modified into:
y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh

# line 73 in yolov3/models/yolo.py
# wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# modified into:
wh = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh

# line 52 in yolov3/export.py
# torch.onnx.export(dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # shape(1,3,640,640)
#                                 'output': {0: 'batch', 1: 'anchors'}  # shape(1,25200,85)
# modified into:
torch.onnx.export(dynamic_axes={'images': {0: 'batch'},  # shape(1,3,640,640)
                                'output': {0: 'batch'}  # shape(1,25200,85)
```

Export the onnx:

```bash
cd yolov3
python export.py --weights=yolov3.pt --dynamic --include=onnx --opset=11
```

Copy it to the workspace and compile:

```bash
cp yolov3/yolov3.onnx tensorRT_cpp/workspace/
cd tensorRT_cpp
# change src/application/app_yolo.cpp: main
# test(Yolo::Type::V3, TRT::Mode::FP32, "yolov3");
make yolo -j32
```

UNet support:

```bash
make dunet -j32
```
RetinaFace support:

```bash
git clone git@github.com:biubug6/Pytorch_Retinaface.git
cd Pytorch_Retinaface
```

Download the pretrained model as described in the Training section of https://github.com/biubug6/Pytorch_Retinaface#training. Here we use mobilenet0.25_final.pth.

Modify the code:

```python
# line 24 in models/retinaface.py
# return out.view(out.shape[0], -1, 2) is modified into
return out.view(-1, int(out.size(1) * out.size(2) * 2), 2)

# line 35 in models/retinaface.py
# return out.view(out.shape[0], -1, 4) is modified into
return out.view(-1, int(out.size(1) * out.size(2) * 2), 4)

# line 46 in models/retinaface.py
# return out.view(out.shape[0], -1, 10) is modified into
return out.view(-1, int(out.size(1) * out.size(2) * 2), 10)

# The following modification ensures that the output of the resize node is based on scale
# rather than shape, so that dynamic batch can be achieved.
# line 89 in models/net.py
# up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") is modified into
up3 = F.interpolate(output3, scale_factor=2, mode="nearest")

# line 93 in models/net.py
# up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") is modified into
up2 = F.interpolate(output2, scale_factor=2, mode="nearest")

# The following change removes the softmax (a bug sometimes happens with it) and
# concatenates the outputs to simplify decoding.
# line 123 in models/retinaface.py
# if self.phase == 'train':
#     output = (bbox_regressions, classifications, ldm_regressions)
# else:
#     output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions)
# return output
# the above is modified into:
output = (bbox_regressions, classifications, ldm_regressions)
return torch.cat(output, dim=-1)

# set 'opset_version=11' to ensure a successful export
# torch_out = torch.onnx._export(net, inputs, output_onnx, export_params=True, verbose=False,
#                                input_names=input_names, output_names=output_names)
# is modified into:
torch_out = torch.onnx._export(net, inputs, output_onnx, export_params=True, verbose=False, opset_version=11,
                               input_names=input_names, output_names=output_names)
```

Export and compile:

```bash
python convert_to_onnx.py
cp FaceDetector.onnx ../tensorRT_cpp/workspace/mb_retinaface.onnx
cd ../tensorRT_cpp
make retinaface -j64
```

DBFace support:

```bash
make dbface -j64
```

Arcface support:

```cpp
auto arcface = Arcface::create_infer("arcface_iresnet50.fp32.trtmodel", 0);
auto feature = arcface->commit(make_tuple(face, landmarks)).get();
cout << feature << endl;  // 1x512
```

workspace/face/library is the set of registered faces, workspace/face/recognize contains the faces to be recognized, and the results are saved to workspace/face/result and workspace/face/library_draw. Check tutorial/2.0 for the details.
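How the 1x512 features are matched is up to the application; a minimal sketch (not from the repo) of the usual cosine-similarity comparison between a registered feature and a query feature:

```cpp
#include <cmath>

// Cosine similarity between two 512-d face features.
// A higher score means the two faces more likely belong to the same identity.
float cosine_similarity(const float* a, const float* b, int n = 512) {
    float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
    for (int i = 0; i < n; ++i) {
        dot    += a[i] * b[i];
        norm_a += a[i] * a[i];
        norm_b += b[i] * b[i];
    }
    return dot / (std::sqrt(norm_a) * std::sqrt(norm_b) + 1e-6f);
}
```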
Bert support:

```bash
make bert -j6
```

Python interface: compile a torch model to trtmodel

```python
import pytrt
from torchvision import models  # torchvision provides the example resnet18

model = models.resnet18(True).eval()
# dummy_input: an example input tensor with the shape the model expects
pytrt.from_torch(
    model,
    dummy_input,
    max_batch_size=16,
    onnx_save_file="test.onnx",
    engine_save_file="engine.trtmodel"
)
```

Python interface: YOLO inference

```python
import cv2
import pytrt as tp

yolo = tp.Yolo(engine_file, type=tp.YoloType.X)  # engine_file is the trtmodel file
image = cv2.imread("inference/car.jpg")
bboxes = yolo.commit(image).get()
```

Python interface: torch model inference

```python
import pytrt as tp
from torchvision import models

model = models.resnet18(True).eval().to(device)  # pt model
trt_model = tp.from_torch(model, input)
trt_out = trt_model(input)
```

C++ interface: YOLO inference

```cpp
// create infer engine on gpu 0
auto engine = Yolo::create_infer("yolox_m.fp32.trtmodel", Yolo::Type::X, 0);

// load image
auto image = cv::imread("1.jpg");

// do inference and get the result
auto box = engine->commit(image).get();
```
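Since `commit()` returns a future-like handle (hence the `.get()` calls), several frames can be queued before any result is collected, which lets the engine batch the work internally. A sketch under that assumption:

```cpp
// Queue several frames first, then collect the results.
// Assumes commit() returns a future-like object, as the .get() usage above suggests.
std::vector<cv::Mat> frames{cv::imread("1.jpg"), cv::imread("2.jpg")};
std::vector<decltype(engine->commit(frames[0]))> futures;
for (auto& frame : frames)
    futures.emplace_back(engine->commit(frame));
for (auto& fut : futures) {
    auto boxes = fut.get();  // blocks until this frame's result is ready
    // ... consume boxes ...
}
```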
C++ interface: compile a model

```cpp
TRT::compile(
    TRT::Mode::FP32,          // compile model in fp32
    3,                        // max batch size
    "plugin.onnx",            // onnx file
    "plugin.fp32.trtmodel",   // save path
    {}                        // redefine the shape of input when needed
);
```

INT8 compilation:

```cpp
// define the int8 calibration function to read data and hand it to the tensor.
auto int8process = [](int current, int count, vector<string>& images, shared_ptr<TRT::Tensor>& tensor){
    for(int i = 0; i < images.size(); ++i){
        // int8 compilation requires calibration. We read the image data and call set_norm_mat.
        // Then the data will be transferred into the tensor.
        auto image = cv::imread(images[i]);
        cv::resize(image, image, cv::Size(640, 640));
        float mean[] = {0, 0, 0};
        float std[]  = {1, 1, 1};
        tensor->set_norm_mat(i, image, mean, std);
    }
};

// specify TRT::Mode as INT8
auto model_file = "yolov5m.int8.trtmodel";
TRT::compile(
    TRT::Mode::INT8,   // INT8
    3,                 // max batch size
    "yolov5m.onnx",    // onnx file
    model_file,        // saved filename
    {},                // redefine the input shape
    int8process,       // the callback function for calibration
    ".",               // the dir where the image data for calibration is stored
    ""                 // the dir where the data generated from calibration is saved (i.e. where to load the calibration data)
);
```

We provide the Tensor class for easier inference and easier data transfer between host and device, so that the details won't bother you as a user. The Engine class is another facilitator.

```cpp
// load model and get a shared_ptr. get nullptr if it fails to load.
auto engine = TRT::load_infer("yolov5m.fp32.trtmodel");

// print model info
engine->print();

// load image
auto image = imread("demo.jpg");

// get the model input and output nodes, which can be accessed by name or index
auto input = engine->input(0);   // or auto input = engine->input("images");
auto output = engine->output(0); // or auto output = engine->output("output");

// put the image into the input tensor by calling set_norm_mat()
float mean[] = {0, 0, 0};
float std[]  = {1, 1, 1};
input->set_norm_mat(i, image, mean, std);

// do the inference. Here sync(true) or async(false) is optional
engine->forward();  // engine->forward(true or false)

// get the output_ptr, which can be used to access the output
float* output_ptr = output->cpu<float>();
```

Custom plugin (HSwish):

```cpp
template<>
__global__ void HSwishKernel(float* input, float* output, int edge) {
    KernelPositionBlock;
    float x = input[position];
    float a = x + 3;
    a = a < 0 ? 0 : (a >= 6 ? 6 : a);
    output[position] = x * a / 6;
}

int HSwish::enqueue(const std::vector<GTensor>& inputs, std::vector<GTensor>& outputs, const std::vector<GTensor>& weights, void* workspace, cudaStream_t stream) {
    int count = inputs[0].count();
    auto grid = CUDATools::grid_dims(count);
    auto block = CUDATools::block_dims(count);
    HSwishKernel<<<grid, block, 0, stream>>>(inputs[0].ptr<float>(), outputs[0].ptr<float>(), count);
    return 0;
}

RegisterPlugin(HSwish);
```
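For reference, the kernel above computes the standard hard-swish, hswish(x) = x * clamp(x + 3, 0, 6) / 6. A plain host-side reference (not part of the repo) that can be used to sanity-check the plugin output:

```cpp
#include <algorithm>

// Host-side reference of the same formula the CUDA kernel computes:
// hswish(x) = x * clamp(x + 3, 0, 6) / 6
float hswish_ref(float x) {
    float a = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
    return x * a / 6.0f;
}
```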