Skip to content

Commit 3454bba

Browse files
committed
Integrate Automated QDQ placement tool - part 3
Signed-off-by: Will Guo <[email protected]>
1 parent 3f7ff31 commit 3454bba

File tree

8 files changed

+4602
-5
lines changed

8 files changed

+4602
-5
lines changed

modelopt/onnx/quantization/autotune/__init__.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@
9191
- RegionPattern: Pattern matching and signature computation
9292
"""
9393

94+
# Autotuner classes
95+
from .autotuner import QDQAutotuner, QDQAutotunerBase
96+
97+
# Benchmark classes
98+
from .benchmark import Benchmark, TensorRTPyBenchmark, TrtExecBenchmark
99+
94100
# Core data structures
95101
from .common import (
96102
AutotunerError,
@@ -124,21 +130,28 @@
124130
# Exceptions
125131
"AutotunerError",
126132
"AutotunerNotInitializedError",
127-
"ChildRegionInputInsertionPoint",
128-
"CombinedRegionSearch",
133+
# Benchmark classes
134+
"Benchmark",
135+
"TensorRTPyBenchmark",
136+
"TrtExecBenchmark",
129137
# Configuration and state
130138
"Config",
131139
# Q/DQ insertion
132140
"InsertionScheme",
133141
"InvalidSchemeError",
134142
"NodeInputInsertionPoint",
143+
"ChildRegionInputInsertionPoint",
144+
"RegionOutputInsertionPoint",
135145
"ResolvedInsertionPoint",
136-
"PatternCache",
137-
"PatternSchemes",
146+
# Main autotuner classes
147+
"QDQAutotuner",
148+
"QDQAutotunerBase",
138149
# Region classes
139150
"Region",
140151
"RegionError",
141-
"RegionOutputInsertionPoint",
142152
"RegionPattern",
143153
"RegionType",
154+
"PatternCache",
155+
"PatternSchemes",
156+
"CombinedRegionSearch",
144157
]
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ONNX Q/DQ Autotuning Command-Line Interface.

This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
measurement to find optimal Q/DQ insertion points that minimize inference latency.

**Usage Examples:**

    # Basic usage - automatic region discovery and optimization
    python -m modelopt.onnx.quantization.autotune --model model.onnx

    # INT8 vs FP8 quantization
    python -m modelopt.onnx.quantization.autotune --model model.onnx --quant-type fp8

    # Warm-start from pattern cache (transfer learning)
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --pattern-cache ./output/pattern_cache.yaml

    # Import patterns from pre-quantized baseline model
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --qdq-baseline quantized_baseline.onnx

    # Full example with all optimization options
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --schemes-per-region 50 \\
        --pattern-cache pattern_cache.yaml \\
        --qdq-baseline baseline.onnx \\
        --output ./results \\
        --quant-type int8 \\
        --verbose

    # Use custom TensorRT plugins for model-specific operations
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --plugin-libraries /path/to/plugin1.so /path/to/plugin2.so

**Output Files:**

    output_dir/
    ├── autotuner_state.yaml      # Checkpoint for resume capability
    ├── baseline.onnx             # Unquantized baseline model
    ├── optimized_final.onnx      # Final optimized model with Q/DQ
    ├── logs/                     # TensorRT build logs per scheme
    │   ├── baseline.log
    │   ├── region_*_scheme_*.log
    │   └── final.log
    └── region_models/            # Best model per region
        └── region_*_level_*.onnx
"""

import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune


def main():
    """Command-line entry point for ONNX Q/DQ autotuning.

    Builds the argument parser, parses ``sys.argv``, and hands the parsed
    namespace to the autotuning workflow.

    Returns:
        Exit code from ``run_autotune`` (0 for success, non-zero for errors).
    """
    # Parse CLI arguments and delegate the full workflow to run_autotune.
    args = get_autotune_parser().parse_args()
    return run_autotune(args)


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)