计算机视觉部署:从训练到生产
## 1. 技术分析

### 1.1 部署流程概述

计算机视觉模型部署需要多个步骤:

部署流程: 训练 → 模型优化 → 转换 → 部署 → 监控

### 1.2 部署框架对比

| 框架 | 特点 | 支持格式 | 部署平台 |
| --- | --- | --- | --- |
| ONNX | 开放标准 | ONNX | 多平台 |
| TensorRT | NVIDIA优化 | TensorRT | GPU |
| TensorFlow Lite | 移动端 | TFLite | 移动端 |
| PyTorch Mobile | PyTorch原生 | PT | 移动端 |

### 1.3 模型优化技术

模型优化技术:

- 量化: INT8/FP16
- 剪枝: 去除冗余参数
- 蒸馏: 知识迁移
- 压缩: 权重压缩

## 2. 核心功能实现

### 2.1 模型转换

```python
import torch
import torchvision.models as models
import onnx
import onnxruntime as ort


class ModelConverter:
    def __init__(self):
        pass

    def pytorch_to_onnx(self, model, input_shape, output_path):
        dummy_input = torch.randn(*input_shape)
        torch.onnx.export(
            model,
            dummy_input,
            output_path,
            opset_version=11,
            input_names=["input"],
            output_names=["output"],
        )

    def onnx_to_tensorrt(self, onnx_path, output_path):
        import tensorrt as trt

        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30
            with open(onnx_path, "rb") as f:
                parser.parse(f.read())
            engine = builder.build_cuda_engine(network)
            with open(output_path, "wb") as f:
                f.write(engine.serialize())

    def pytorch_to_tflite(self, model, input_shape, output_path):
        converter = tf.lite.TFLiteConverter.from_pytorch(model, input_shape)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        tflite_model = converter.convert()
        with open(output_path, "wb") as f:
            f.write(tflite_model)
```

### 2.2 模型优化

```python
class ModelOptimizer:
    def __init__(self):
        pass

    def quantize(self, model):
        return torch.ao.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )

    def prune(self, model, amount=0.3):
        import torch.nn.utils.prune as prune

        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Conv2d):
                prune.l1_unstructured(module, name="weight", amount=amount)
                prune.remove(module, "weight")
        return model

    def distill(self, teacher_model, student_model, dataloader, epochs=10):
        teacher_model.eval()
        student_model.train()
        optimizer = torch.optim.Adam(student_model.parameters())
        loss_fn = torch.nn.MSELoss()
        for epoch in range(epochs):
            for inputs, _ in dataloader:
                optimizer.zero_grad()
                with torch.no_grad():
                    teacher_outputs = teacher_model(inputs)
                student_outputs = student_model(inputs)
                loss = loss_fn(student_outputs, teacher_outputs)
                loss.backward()
                optimizer.step()
        return student_model
```

### 2.3 模型部署

```python
class ModelDeployer:
    def __init__(self, model_path, model_type="onnx"):
        self.model_path = model_path
        self.model_type = model_type
        self.session = None

    def load(self):
        if self.model_type == "onnx":
            self.session = ort.InferenceSession(self.model_path)
        elif self.model_type == "tensorrt":
            self.load_tensorrt()

    def load_tensorrt(self):
        import tensorrt as trt
        import pycuda.driver as cuda

        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        with open(self.model_path, "rb") as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        self.context = engine.create_execution_context()

    def infer(self, input_data):
        if self.model_type == "onnx":
            inputs = {self.session.get_inputs()[0].name: input_data}
            outputs = self.session.run(None, inputs)
            return outputs[0]
        elif self.model_type == "tensorrt":
            return self.infer_tensorrt(input_data)

    def infer_tensorrt(self, input_data):
        import pycuda.driver as cuda

        d_input = cuda.mem_alloc(input_data.nbytes)
        d_output = cuda.mem_alloc(1000 * 4)
        cuda.memcpy_htod(d_input, input_data)
        bindings = [int(d_input), int(d_output)]
        self.context.execute_v2(bindings)
        output_data = np.empty(1000, dtype=np.float32)
        cuda.memcpy_dtoh(output_data, d_output)
        return output_data


class APIServer:
    def __init__(self, model_path, model_type="onnx"):
        self.deployer = ModelDeployer(model_path, model_type)
        self.deployer.load()

    def predict(self, image):
        input_data = self.preprocess(image)
        output = self.deployer.infer(input_data)
        return self.postprocess(output)

    def preprocess(self, image):
        image = cv2.resize(image, (224, 224))
        image = image / 255.0
        image = np.transpose(image, (2, 0, 1))
        return image.astype(np.float32)

    def postprocess(self, output):
        return np.argmax(output)
```

## 3. 性能对比

### 3.1 部署框架对比

| 框架 | 推理速度(ms) | 准确率损失 | 模型大小(MB) |
| --- | --- | --- | --- |
| PyTorch | 50 | 0% | 240 |
| ONNX | 40 | 0% | 240 |
| TensorRT | 10 | 1% | 120 |
| TFLite | 20 | 2% | 60 |

### 3.2 优化技术效果

| 技术 | 速度提升 | 准确率损失 | 模型压缩比 |
| --- | --- | --- | --- |
| FP16量化 | 2x | 1% | 2x |
| INT8量化 | 4x | 2-3% | 4x |
| 剪枝(30%) | 1.2x | 1% | 1.3x |
| 知识蒸馏 | 1.5x | 1% | 1.5x |

### 3.3 不同硬件表现

| 硬件 | PyTorch(ms) | TensorRT(ms) | TFLite(ms) |
| --- | --- | --- | --- |
| GPU (RTX 3090) | 50 | 10 | - |
| CPU (i9-12900K) | 200 | - | - |
| Mobile (Snapdragon) | - | - | 50 |

## 4. 最佳实践

### 4.1 部署方案选择

```python
def select_deployment(target_platform):
    if target_platform == "gpu_server":
        return {"framework": "tensorrt", "optimization": "fp16"}
    elif target_platform == "cpu_server":
        return {"framework": "onnx", "optimization": "int8"}
    elif target_platform == "mobile":
        return {"framework": "tflite", "optimization": "int8"}


class DeploymentFactory:
    @staticmethod
    def create(config):
        deployer = ModelDeployer(config["model_path"], config["framework"])
        return APIServer(config["model_path"], config["framework"])
```

### 4.2 部署流程

```python
class DeploymentPipeline:
    def __init__(self, model, config):
        self.model = model
        self.config = config

    def optimize(self):
        if self.config.get("quantize", False):
            self.model = ModelOptimizer().quantize(self.model)
        if self.config.get("prune", False):
            self.model = ModelOptimizer().prune(self.model)

    def convert(self):
        converter = ModelConverter()
        if self.config["framework"] == "onnx":
            converter.pytorch_to_onnx(self.model, (1, 3, 224, 224), "model.onnx")
        elif self.config["framework"] == "tensorrt":
            converter.pytorch_to_onnx(self.model, (1, 3, 224, 224), "model.onnx")
            converter.onnx_to_tensorrt("model.onnx", "model.engine")

    def deploy(self):
        return APIServer(self.config["model_path"], self.config["framework"])
```

## 5. 总结

计算机视觉部署需要综合考虑多个因素:

- 框架选择: 根据目标平台选择合适框架
- 模型优化: 量化、剪枝、蒸馏可提升性能
- 部署平台: GPU、CPU、移动端各有特点
- 监控维护: 部署后需要监控和维护

对比数据如下:

- TensorRT 在 GPU 上性能最好
- TFLite 是移动端最佳选择
- INT8 量化可获得 4x 加速
- 推荐使用 ONNX 作为中间格式