# Use regular division inside Scale Estimation (#3210)
### Changes

Always compute division inside the SE algorithm as `a/b` instead of
`a*(1/b)`, which was previously used in some cases.

### Reason for changes

During the implementation of #2727, some choices were made regarding how
the division operation is computed so that the changes stayed completely
aligned with the previous implementation. Namely, before #2727 some
divisions were computed as `a*(1/b)`, and this is currently still the
case.

Computing these divisions as `a*(1/b)` was never the original intent. Now
all divisions are aligned to the `a/b` form; the sketch below shows why
the two forms can differ.
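
As a minimal illustration in plain NumPy (not NNCF code; names and values
are arbitrary): `a*(1/b)` rounds the reciprocal `1/b` first and then
rounds the product, so in low-precision arithmetic it can disagree with
the single-rounding `a/b` in the last bit:

```python
import numpy as np

# Illustrative sketch only: compare direct division with
# reciprocal-then-multiply in float16. Rounding 1/b before the
# multiplication can flip the last bit of the result relative to a
# single correctly rounded division a/b.
rng = np.random.default_rng(0)
a = rng.random(100_000, dtype=np.float32).astype(np.float16)
b = (rng.random(100_000, dtype=np.float32) + 0.5).astype(np.float16)

direct = a / b                              # one rounding step
via_reciprocal = a * (np.float16(1.0) / b)  # two rounding steps

print(np.count_nonzero(direct != via_reciprocal), "of", a.size, "results differ")
```

Such last-bit differences accumulate through the SE iterations, which is
consistent with the small reference-metric shift (0.81389 -> 0.80873) in
the test data updated below.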

Compression time and memory are roughly the same.

| Model        | Compression | Compression Time Develop (sec.) | Compression Time Branch (sec.) | Peak Memory Develop (MiB) | Peak Memory Branch (MiB) |
|--------------|-------------|---------------------------------|--------------------------------|---------------------------|--------------------------|
| tiny-llama   | int4, SE    | 222*                            | 228*                           | 3030                      | 3032                     |
| phi4-mini    | int4, SE    | 789*                            | 790*                           | 10817                     | 10768                    |
| llama-3.1-8b | int4, SE    | 1776*                           | 1801*                          | 17756                     | 18224                    |

\*Time columns include PT -> OV conversion time.

### Related tickets

163286

### Tests

- https://github.com/openvinotoolkit/nncf/actions/runs/13368886294
- NNCF/job/manual/job/post_training_weight_compression/324/
- OVVP validation ✅
nikita-savelyevv authored Mar 10, 2025
1 parent 64d8468 commit 73590b0
Showing 5 changed files with 6 additions and 27 deletions.
12 changes: 2 additions & 10 deletions nncf/openvino/optimized_functions/functions.py
```diff
@@ -33,7 +33,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Quantizes the given weight tensor.
@@ -50,10 +49,7 @@ def do_int_quantization(
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
@@ -108,7 +104,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     Quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -133,10 +128,7 @@ def quantize_dequantize_weight(
     scale_shape = precomputed_scale.shape if precomputed_scale is not None else None
     zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None
 
-    ov_model_params = OVModelParameters(
-        dynamic_shapes=kwargs.get("dynamic_shapes") is True,
-        convertable_division=kwargs.get("convertable_division") is True,
-    )
+    ov_model_params = OVModelParameters()
     ov_model_params.input_dtypes["weight"] = weight.dtype
     if precomputed_scale is not None:
         ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
```
```diff
@@ -246,10 +246,6 @@ def calculate_quantization_params(
     zero_scale = 0.001
     zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-    # This is required for alignment with a previous OpenVINO models implementation
-    # TODO(Nikita Savelyev): remove this
-    opt_fns_kwargs = dict(dynamic_shapes=False, convertable_division=True)
-
     # iterative rectification of initial scale
     for i in range(initial_steps):
         near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
@@ -264,7 +260,6 @@
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
 
         q_weights_ = fns.zeros_like(original_weight) + out
@@ -299,7 +294,6 @@
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         compressed_weights = fns.zeros_like(original_weight) + out
         target, zero_mask = get_target_zero_mask(compressed_weights, zp)
@@ -318,7 +312,6 @@
             config,
             precomputed_scale=scaled_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         compressed_weights = fns.zeros_like(original_weight) + out
 
@@ -336,7 +329,6 @@
             config,
             precomputed_scale=near_to_ideal_scale,
             precomputed_zero_point=zp,
-            **opt_fns_kwargs,
         )
         q_weights_ = fns.zeros_like(original_weight) + out
```
```diff
@@ -443,7 +443,6 @@ def do_int_quantization(
     reduction_axes: Optional[ReductionAxes] = None,
     precomputed_scale: Tensor = None,
     precomputed_zero_point: Tensor = None,
-    **kwargs,
 ) -> Tuple[Tensor, Tensor, Tensor]:
     """
     Performs integer quantization on the given weight tensor.
@@ -475,9 +474,7 @@
     if _can_run_optimized(weight.backend):
         from nncf.openvino.optimized_functions import do_int_quantization as do_int_quantization_ov
 
-        return do_int_quantization_ov(
-            weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, **kwargs
-        )
+        return do_int_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
 
     # Reference implementation
     if weight.backend == TensorBackend.ov:
@@ -507,7 +504,6 @@ def quantize_dequantize_weight(
     precomputed_scale: Optional[Tensor] = None,
     precomputed_zero_point: Optional[Tensor] = None,
     return_compressed_weight: Optional[bool] = False,
-    **kwargs,
 ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
     """
     First quantizes the given weight tensor and then dequantizes it back to obtain float32 values.
@@ -533,7 +529,6 @@
         precomputed_scale,
         precomputed_zero_point,
         return_compressed_weight,
-        **kwargs,
     )
 
     # Reference implementation
```
4 changes: 2 additions & 2 deletions tests/post_training/data/wc_reference_data.yaml
```diff
@@ -36,11 +36,11 @@ tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV:
   num_int8: 124
   metrics_xfail_reason: "Issue-148819"
 tinyllama_scale_estimation_per_channel_backend_OV:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```
2 changes: 1 addition & 1 deletion tests/post_training/data/wc_reference_data_2025.0.yaml
```diff
@@ -14,7 +14,7 @@ tinyllama_data_aware_backend_TORCH:
   num_int8: 124
   num_compressed_xfail_reason: "Issue-160006"
 tinyllama_scale_estimation_per_channel_backend_TORCH:
-  metric_value: 0.81389
+  metric_value: 0.80873
   num_int4: 188
   num_int8: 124
   atol: 0.006 # difference across devices: 0.80873 vs 0.81389
```
