# A Practical Guide to PyTorch Quantization: QAT and PTQ

## 1. Technical Analysis

### 1.1 Quantization Types

| Type | Description | Training Requirement | Accuracy Loss |
|------|-------------|----------------------|---------------|
| PTQ (Post-Training Quantization) | Quantize an already-trained model | No retraining | Medium |
| QAT (Quantization-Aware Training) | Simulate quantization during training | Requires retraining | Low |
| Dynamic Quantization | Quantizes weights only | No data needed | Low |
| Static Quantization | Quantizes weights and activations | Needs calibration data | Low |

### 1.2 Bit-Width Comparison

| Bit Width | Memory Saving | Speedup | Accuracy Loss |
|-----------|---------------|---------|---------------|
| INT8 | 4x | 2-4x | Low |
| INT4 | 8x | 4-8x | Medium |
| FP16 | 2x | 1.5x | Minimal |

### 1.3 Quantization Workflow

1. Train the model (FP32)
2. Prepare for quantization (insert quantization nodes)
3. Calibrate (PTQ) or fine-tune (QAT)
4. Convert to a quantized model (INT8)
5. Deploy for inference

## 2. Core Implementation

### 2.1 Dynamic Quantization

Dynamic quantization converts weights to INT8 ahead of time and quantizes activations on the fly at inference time; it needs no calibration data and works well for `nn.Linear`-heavy models.

```python
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 200)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(200, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def dynamic_quantization():
    model = SimpleModel()
    model.eval()
    # Quantize all nn.Linear modules to INT8.
    quantized_model = torch.quantization.quantize_dynamic(
        model, {nn.Linear}, dtype=torch.qint8
    )
    x = torch.randn(1, 100)
    output = quantized_model(x)

    # Rough size estimate: FP32 stores 4 bytes per weight, INT8 stores 1.
    # (Quantized modules hold packed weights, so quantized_model.parameters()
    # reports almost nothing; estimate from the FP32 parameter count instead.)
    num_params = sum(p.numel() for p in model.parameters())
    print(f"FP32 model size: {num_params * 4 / 1024:.2f} KB")
    print(f"INT8 model size (approx.): {num_params * 1 / 1024:.2f} KB")

class DynamicQuantWrapper:
    """Lazily quantizes a model and dispatches inference to the
    quantized version once it exists."""

    def __init__(self, model):
        self.model = model
        self.quantized = None

    def quantize(self, qconfig=None):
        if qconfig is None:
            qconfig = torch.quantization.default_dynamic_qconfig
        self.quantized = torch.quantization.quantize_dynamic(
            self.model, qconfig_spec={nn.Linear: qconfig}
        )
        return self.quantized

    def inference(self, x):
        if self.quantized is None:
            return self.model(x)
        return self.quantized(x)
```

### 2.2 Static Quantization

Static quantization also quantizes activations, so the model needs `QuantStub`/`DeQuantStub` markers and a calibration pass over representative data to collect activation statistics.

```python
class StaticQuantModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()      # FP32 -> INT8 entry point
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3)
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(128 * 28 * 28, 10)
        self.dequant = torch.quantization.DeQuantStub()  # INT8 -> FP32 exit point

    def forward(self, x):
        x = self.quant(x)
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        x = self.dequant(x)
        return x

def static_quantization(model, calibration_data):
    model.eval()
    model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
    model = torch.quantization.prepare(model, inplace=False)
    # Calibration: run representative data through the model so the
    # inserted observers can record activation ranges.
    with torch.no_grad():
        for data in calibration_data:
            model(data)
    model = torch.quantization.convert(model, inplace=False)
    return model

class StaticQuantPipeline:
    def __init__(self, model):
        self.model = model
        self.calibration_data = None

    def set_calibration_data(self, data):
        self.calibration_data = data

    def run(self, backend="fbgemm"):
        self.model.qconfig = torch.quantization.get_default_qconfig(backend)
        self.model = torch.quantization.prepare(self.model)
        self._calibrate()
        self.model = torch.quantization.convert(self.model)
        return self.model

    def _calibrate(self):
        if self.calibration_data is None:
            raise ValueError("Calibration data not set")
        self.model.eval()
        with torch.no_grad():
            for batch in self.calibration_data:
                self.model(batch)
```
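In practice, static quantization works best when adjacent Conv + ReLU (or Conv + BN + ReLU) pairs are fused before preparation, since fused modules are quantized as a single unit with less accuracy loss. Below is a minimal usage sketch, assuming the `StaticQuantModel` and `StaticQuantPipeline` defined above; the 32x32 input size and the random calibration batches are illustrative placeholders, not part of the original pipeline.

```python
# Usage sketch: fuse, calibrate, convert. Replace the random batches
# with real representative data in practice.
model = StaticQuantModel()
model.eval()  # this style of fusion expects eval mode

# Fuse each Conv2d + ReLU pair into a single module before preparing.
model = torch.quantization.fuse_modules(
    model, [["conv1", "relu1"], ["conv2", "relu2"]]
)

pipeline = StaticQuantPipeline(model)
# 32x32 inputs: two unpadded 3x3 convs leave 28x28, matching the fc layer.
pipeline.set_calibration_data([torch.randn(1, 3, 32, 32) for _ in range(16)])
quantized = pipeline.run(backend="fbgemm")
```

`fuse_modules` rewrites the listed submodule pairs in place of the separate `conv`/`relu` attributes, so the fused model still works with the unchanged `forward`.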
### 2.3 Quantization-Aware Training (QAT)

QAT inserts fake-quantization nodes during training so the network learns to compensate for quantization error, which is why it loses the least accuracy.

```python
class QATModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()
        self.layers = nn.Sequential(
            nn.Linear(100, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 10),
        )
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.layers(x)
        x = self.dequant(x)
        return x

def train_qat():
    model = QATModel()
    model.train()  # prepare_qat expects a model in training mode
    model.qconfig = torch.quantization.get_default_qat_qconfig("fbgemm")
    model = torch.quantization.prepare_qat(model, inplace=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(10):
        # Random data stands in for a real training set here.
        inputs = torch.randn(32, 100)
        targets = torch.randint(0, 10, (32,))
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    model.eval()
    model = torch.quantization.convert(model, inplace=False)
    return model

class QATTraining:
    def __init__(self, model, optimizer, loss_fn):
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def prepare_qat(self, backend="fbgemm"):
        self.model.train()  # required before prepare_qat
        self.model.qconfig = torch.quantization.get_default_qat_qconfig(backend)
        self.model = torch.quantization.prepare_qat(self.model)

    def train_epoch(self, dataloader):
        self.model.train()
        for inputs, targets in dataloader:
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.loss_fn(outputs, targets)
            loss.backward()
            self.optimizer.step()

    def finalize(self):
        self.model.eval()
        self.model = torch.quantization.convert(self.model)
        return self.model
```

### 2.4 Exporting Quantized Models

```python
class QuantizationExporter:
    @staticmethod
    def export_onnx(model, input_shape, filepath):
        model.eval()
        dummy_input = torch.randn(*input_shape)
        torch.onnx.export(
            model, dummy_input, filepath,
            opset_version=13,
            do_constant_folding=True,
            input_names=["input"],
            output_names=["output"],
        )

    @staticmethod
    def export_torchscript(model, input_shape, filepath):
        model.eval()
        dummy_input = torch.randn(*input_shape)
        traced_model = torch.jit.trace(model, dummy_input)
        traced_model.save(filepath)

    @staticmethod
    def optimize_for_mobile(model):
        # Convert the prepared/QAT model, then script it so it can run
        # without the Python interpreter (e.g. on mobile runtimes).
        model.eval()
        optimized = torch.quantization.convert(model)
        return torch.jit.script(optimized)
```

## 3. Performance Comparison

### 3.1 Method Comparison

| Method | Accuracy Loss | Speedup | Memory Saving | Implementation Effort |
|--------|---------------|---------|---------------|-----------------------|
| Dynamic quantization | Low | 2x | 4x | Low |
| Static quantization | Low | 3x | 4x | Medium |
| QAT | Minimal | 3x | 4x | High |
| FP16 | Minimal | 1.5x | 2x | Low |

### 3.2 Inference Latency

| Model | FP32 Latency | INT8 Latency | Speedup |
|-------|--------------|--------------|---------|
| ResNet-18 | 10 ms | 3 ms | 3.3x |
| MobileNet | 8 ms | 2.5 ms | 3.2x |
| BERT-base | 50 ms | 15 ms | 3.3x |
| GPT-2 | 200 ms | 60 ms | 3.3x |

### 3.3 Accuracy Comparison

| Model | FP32 | PTQ | QAT | PTQ Loss |
|-------|------|-----|-----|----------|
| ResNet-18 | 71.8% | 71.5% | 71.7% | -0.3% |
| MobileNet | 70.0% | 69.5% | 69.9% | -0.5% |
| BERT-base | 82.5% | 81.8% | 82.3% | -0.7% |

## 4. Best Practices

### 4.1 Validating Quantized Models

```python
class QuantizationValidator:
    """Compares FP32 and quantized predictions batch by batch."""

    def __init__(self, fp32_model, quantized_model):
        self.fp32_model = fp32_model
        self.quantized_model = quantized_model

    def validate(self, test_data):
        self.fp32_model.eval()
        self.quantized_model.eval()
        fp32_predictions = []
        quantized_predictions = []
        with torch.no_grad():
            for data in test_data:
                fp32_pred = self.fp32_model(data).argmax(dim=1)
                quantized_pred = self.quantized_model(data).argmax(dim=1)
                fp32_predictions.append(fp32_pred)
                quantized_predictions.append(quantized_pred)
        # Note: this is the agreement rate between the two models, not
        # labeled accuracy; a batch counts as a match only if every
        # prediction in it agrees.
        agreement = sum(
            torch.equal(f, q)
            for f, q in zip(fp32_predictions, quantized_predictions)
        ) / len(test_data)
        return agreement
```

### 4.2 Choosing a Quantization Config

The `fbgemm` backend targets x86 servers, while `qnnpack` targets ARM/mobile CPUs.

```python
def select_qconfig(backend="fbgemm"):
    if backend == "fbgemm":
        return torch.quantization.get_default_qconfig("fbgemm")
    elif backend == "qnnpack":
        return torch.quantization.get_default_qconfig("qnnpack")
    else:
        raise ValueError(f"Unknown backend: {backend}")

class QuantizationConfig:
    @staticmethod
    def for_device(device_type):
        if device_type == "cpu":
            return torch.quantization.get_default_qconfig("fbgemm")
        elif device_type == "mobile":
            return torch.quantization.get_default_qconfig("qnnpack")
        else:
            return torch.quantization.get_default_qconfig("fbgemm")
```

## 5. Summary

Quantization is a key model-optimization technique:

- **PTQ**: fast to deploy, no retraining required.
- **QAT**: higher accuracy, requires retraining.
- **Dynamic quantization**: quantizes weights only; well suited to Transformers.
- **Static quantization**: quantizes weights and activations; well suited to CNNs.

Key numbers from the comparisons above: INT8 quantization yields roughly a 3-4x speedup and a 75% memory reduction; QAT typically loses under 0.3% accuracy, while PTQ loses about 0.3-0.7%.
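The latency figures in section 3.2 depend heavily on hardware, batch size, and backend, so it is worth measuring on your own target. Here is a minimal timing sketch, reusing `SimpleModel` and the imports from section 2.1; the warmup and iteration counts are arbitrary choices, not from the benchmarks above.

```python
import time

def benchmark(model, x, warmup=10, iters=100):
    """Return average CPU latency in milliseconds over `iters` runs."""
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):   # warm up before timing
            model(x)
        start = time.perf_counter()
        for _ in range(iters):
            model(x)
    return (time.perf_counter() - start) / iters * 1000

fp32_model = SimpleModel().eval()
int8_model = torch.quantization.quantize_dynamic(
    fp32_model, {nn.Linear}, dtype=torch.qint8
)

x = torch.randn(1, 100)
fp32_ms = benchmark(fp32_model, x)
int8_ms = benchmark(int8_model, x)
print(f"FP32: {fp32_ms:.3f} ms | INT8: {int8_ms:.3f} ms | "
      f"speedup: {fp32_ms / int8_ms:.2f}x")
```

On a model this small the measured speedup will be modest; the 3-4x figures quoted above come from much larger convolutional and Transformer workloads.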