#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ONNX Q/DQ Autotuning Command-Line Interface.

This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
measurement to find optimal Q/DQ insertion points that minimize inference latency.

**Usage Examples:**

    # Basic usage - automatic region discovery and optimization
    python -m modelopt.onnx.quantization.autotune --model model.onnx

    # Select INT8 or FP8 quantization
    python -m modelopt.onnx.quantization.autotune --model model.onnx --quant-type fp8

    # Warm-start from a pattern cache (transfer learning)
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --pattern-cache ./output/pattern_cache.yaml

    # Import patterns from a pre-quantized baseline model
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --qdq-baseline quantized_baseline.onnx

    # Full example with all optimization options
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --schemes-per-region 50 \\
        --pattern-cache pattern_cache.yaml \\
        --qdq-baseline baseline.onnx \\
        --output ./results \\
        --quant-type int8 \\
        --verbose

    # Use custom TensorRT plugins for model-specific operations
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --plugin-libraries /path/to/plugin1.so /path/to/plugin2.so
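
**Programmatic Usage:**

    The same workflow can be driven from Python rather than the shell. The
    snippet below is a minimal sketch: it relies only on the two entry points
    imported by this module (get_autotune_parser and run_autotune) and the
    flags shown in the examples above; flags omitted here take whatever
    defaults the parser defines.

        from modelopt.onnx.quantization.autotune.cli import (
            get_autotune_parser,
            run_autotune,
        )

        # Build the same argparse namespace a shell invocation would produce.
        parser = get_autotune_parser()
        args = parser.parse_args(
            ["--model", "model.onnx", "--quant-type", "int8", "--output", "./results"]
        )

        # Returns an exit code: 0 for success, non-zero for errors.
        exit_code = run_autotune(args)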

**Output Files:**

    output_dir/
    ├── autotuner_state.yaml      # Checkpoint for resume capability
    ├── baseline.onnx             # Unquantized baseline model
    ├── optimized_final.onnx      # Final optimized model with Q/DQ
    ├── logs/                     # TensorRT build logs per scheme
    │   ├── baseline.log
    │   ├── region_*_scheme_*.log
    │   └── final.log
    └── region_models/            # Best model per region
        └── region_*_level_*.onnx
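
**Inspecting the Result:**

    A minimal sketch of loading the final artifact for inspection. It assumes
    the ./results output directory from the full example above and uses only
    the standard onnx package; adapt the path to your --output setting.

        import onnx

        # Load the final Q/DQ-annotated model and run ONNX's structural checks.
        model = onnx.load("./results/optimized_final.onnx")
        onnx.checker.check_model(model)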
"""

import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune


def main():
    """Command-line entry point for ONNX Q/DQ autotuning.

    Parses command-line arguments and executes the autotuning workflow.

    Returns:
        Exit code from run_autotune (0 for success, non-zero for errors)
    """
    parser = get_autotune_parser()
    args = parser.parse_args()

    # Run autotuning
    return run_autotune(args)


if __name__ == "__main__":
    sys.exit(main())